From d96dcf92723f4749c250de184e0a695b4ddcb12e Mon Sep 17 00:00:00 2001
From: Alexandru Gherghescu <gherghescu_alex1@yahoo.ro>
Date: Tue, 4 Jun 2024 00:32:22 +0300
Subject: [PATCH] Add logging throughout the framework code

This should make it much easier to control logging levels, as well as
when and where messages are logged.
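
Each module now obtains its own logger through
logging.getLogger(__name__), and the root logger is configured once in
training.py with logging.basicConfig(), which reads the LOG_LEVEL
environment variable. A rough usage sketch (the exact command line
below is only illustrative):

    # pick the log level per run; defaults to INFO
    LOG_LEVEL=DEBUG python training.py

    # individual module loggers can still be tuned from code, e.g.
    import logging
    logging.getLogger('optimus.trainer').setLevel(logging.WARNING)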
---
 inference.py                 | 22 ++++++-----
 optimus/trainer.py           | 71 +++++++-----------------------------
 optimus/utils/setup_utils.py | 24 +++++++-----
 training.py                  | 33 +++++++++--------
 4 files changed, 58 insertions(+), 92 deletions(-)

diff --git a/inference.py b/inference.py
index 3463d93..99426b7 100644
--- a/inference.py
+++ b/inference.py
@@ -1,6 +1,7 @@
 import os
 import time
 import warnings
+import logging
 
 import fire
 import torch
@@ -10,6 +11,9 @@ from optimus.models.optimus import OptimusTransformer, OptimusConfig
 from optimus.utils import setup_utils
 
 
+logger = logging.getLogger(__name__)
+
+
 def sample_top_p(probs: Tensor, top_p: float) -> Tensor:
     """
     Nucleus (top-p) sampling.
@@ -67,7 +71,7 @@ def main(model_path: str = 'model.pth',
 
     """
 
-    print(f"Running with:\n"
+    logger.info(f"Running with:\n"
         f"\t- model path: '{model_path}'\n"
         f"\t- huggingface tokenizer: '{hf_tokenizer_name}'\n"
         f"\t- prompt: '{prompt[:30] + ('' if len(prompt) <= 30 else '...') if prompt is not None else '(empty)'}'\n"
@@ -97,7 +101,7 @@ def main(model_path: str = 'model.pth',
             warnings.simplefilter('ignore')
             torch.set_default_tensor_type(torch.FloatTensor)
 
-    print('Loading model from disk...')
+    logger.info('Loading model from disk...')
 
     # load state from file
     assert os.path.exists(model_path)
@@ -128,18 +132,18 @@ def main(model_path: str = 'model.pth',
     model.load_state_dict(state, strict=True)
     model.eval()
 
-    print(f'Loaded model on device {device}!')
+    logger.info(f'Loaded model on device {device}!')
 
     _total_params = sum(p.numel() for p in model.parameters())
-    print(f'Number of model parameters: {_total_params}')
+    logger.info(f'Number of model parameters: {_total_params}')
 
     # inference loop
-    print('Starting inference...')
+    logger.info('Starting inference...')
 
     if prompt is not None:
         input_sentence = prompt
     else:
-        print('Waiting for user input... (prompt to complete)')
+        logger.info('Waiting for user input... (prompt to complete)')
         input_sentence = input('User: ')
 
     # tokenize input
@@ -181,10 +185,10 @@ def main(model_path: str = 'model.pth',
 
             seq_len = inp.shape[-1]
 
-    print(f"Model output: {' '.join(tokenizer.decode(inp.tolist()))}")
-    print(f'Tokens / second: {toks_generated / (time.time() - start_time):.2f}')
+    logger.info(f"Model output: {' '.join(tokenizer.decode(inp.tolist()))}")
+    logger.info(f'Tokens / second: {toks_generated / (time.time() - start_time):.2f}')
 
-    print('Finished inference!')
+    logger.info('Finished inference!')
 
 
 if __name__=='__main__':
diff --git a/optimus/trainer.py b/optimus/trainer.py
index 39db79a..2984f6b 100644
--- a/optimus/trainer.py
+++ b/optimus/trainer.py
@@ -1,4 +1,4 @@
-import json
+import logging
 from pathlib import Path
 from typing import Optional
 
@@ -9,6 +9,9 @@ from torch.utils.data import DataLoader
 from transformers.tokenization_utils_base import PreTrainedTokenizerBase
 
 
+logger = logging.getLogger(__name__)
+
+
 class TrainingArguments():
     """
     Training arguments class to hold important switches and knobs related to
@@ -71,11 +74,6 @@ class TrainingArguments():
         self.save_steps = save_steps
         self.save_limit = save_limit
 
-    @classmethod
-    def from_json_file(cls, file_path):
-        with open(file_path, 'r') as file:
-            return cls(**json.load(file))
-
 
 class Trainer():
     """
@@ -127,13 +125,13 @@ class Trainer():
         # scaler used for mixed precision fp16 training on GPU
         self.scaler = torch.cuda.amp.GradScaler(enabled=self.args.use_fp16)
 
-        print("***** Running training *****")
-        print(f"  Num examples = {num_examples:,}")
-        print(f"  Num epochs = {self.args.num_train_epochs:,}")
-        print(f"  Instantaneous batch size per device = {self.args.per_device_batch_size:,}")
-        print(f"  Gradient Accumulation steps = {self.args.gradient_accumulation_steps}")
-        print(f"  Global batch size (w. distributed & accumulation) = {global_batch_size:,}")
-        print(f"  Total optimization steps = {max_steps:,}")
+        logger.info("***** Running training *****")
+        logger.info(f"  Num examples = {num_examples:,}")
+        logger.info(f"  Num epochs = {self.args.num_train_epochs:,}")
+        logger.info(f"  Instantaneous batch size per device = {self.args.per_device_batch_size:,}")
+        logger.info(f"  Gradient Accumulation steps = {self.args.gradient_accumulation_steps}")
+        logger.info(f"  Global batch size (w. distributed & accumulation) = {global_batch_size:,}")
+        logger.info(f"  Total optimization steps = {max_steps:,}")
 
         self.model.train()
 
@@ -154,7 +152,7 @@ class Trainer():
                     logits = self.model(inputs)
 
                     labels = inputs[..., 1:].contiguous().view(-1)
-                    logits = logits[..., :-1, :].contiguous().view(-1, self.model.module.vocab_size)
+                    logits = logits[..., :-1, :].contiguous().view(-1, self.model.vocab_size)
 
                     loss = loss_fn(logits, labels)
 
@@ -182,53 +180,10 @@ class Trainer():
                     # lr = self.args.lr_scheduler.get_last_lr()[0]
 
                     if (step + 1) % self.args.log_steps * self.args.gradient_accumulation_steps == 0:
-                        print(f"Loss is {tr_loss:,}")
+                        logger.info(f"Loss is {tr_loss:,}")
 
                     self.progress.update(1)
 
-    # def _do_epoch_validate(self):
-    #     self.model.eval() # put model in eval mode
-
-    #     total_loss = 0.
-
-    #     # progress bar for batches
-    #     pb = progress_bar(range(len(self.dl.test)), parent=self.mb)
-
-    #     with torch.no_grad():
-    #         for i, (x, y) in enumerate(self.dl.test):
-
-    #             if self.progress_bar is True:
-    #                 pb.update(i)
-
-    #             with torch.cuda.amp.autocast(dtype=self.fp16_dtype,
-    #                                          enabled=self.use_fp16):
-    #                 output = self.model(x)
-    #                 loss = self.criterion(output.view(-1, len(self.dl.test.tok)),
-    #                                       y.reshape(-1))
-
-    #             total_loss += loss.item()
-
-    #             self.mb.child.comment = f" | valid loss: {loss.item():.4f}"
-
-    #         self.val_loss = total_loss / (len(self.dl.test) - 1)
-    #         self.val_ppl = math.exp(self.val_loss)
-
-    #         pb.on_iter_end()
-
-    # def _write_epoch_stats(self):
-    #     if self.progress_bar is True:
-    #         epoch_time = format_time(self.epoch_time)
-    #         self.mb.write(
-    #             f"* End of epoch {self.epoch:3d}:\n"
-    #             f"\tTotal time: {epoch_time:9s} | "
-    #             f"Est. ms/batch: {self.ms_per_batch:.2f}\n"
-    #             f"\tTotal train batches: {len(self.dl.train):10d} | "
-    #             f"Train loss: {self.train_loss: 7.2f} | "
-    #             f"Train perplexity: {self.train_ppl: 8.2f}\n"
-    #             f"\tTotal valid batches: {len(self.dl.test):10d} | "
-    #             f"Valid loss: {self.val_loss: 7.2f} | "
-    #             f"Valid perplexity: {self.val_ppl: 8.2f}")
-
     def save_model(self, save_dir: Path) -> None:
         """
         Save model and tokenizer to a directory.
diff --git a/optimus/utils/setup_utils.py b/optimus/utils/setup_utils.py
index 7802ea4..d50ad19 100644
--- a/optimus/utils/setup_utils.py
+++ b/optimus/utils/setup_utils.py
@@ -1,8 +1,12 @@
+import logging
+
 from transformers import AutoTokenizer
 from datasets import load_from_disk
 
-from optimus.models import OptimusTransformer
-from optimus.models.optimus import OptimusConfig
+from optimus.models.optimus import OptimusTransformer, OptimusConfig
+
+
+logger = logging.getLogger(__name__)
 
 
 def load_and_chunk_dataset(data_dir, seq_len):
@@ -36,15 +40,15 @@ def load_and_chunk_dataset(data_dir, seq_len):
     )
     tokenized_datasets = tokenized_datasets.select_columns(['input_ids'])
 
-    print("Result: ")
-    print(tokenized_datasets)
+    logger.info("Result: ")
+    logger.info(tokenized_datasets)
 
     ctx_len = len(tokenized_datasets['train'][0]['input_ids'])
     n_batches = len(tokenized_datasets['train'])
-    print(f"Dataset info:")
-    print(f"  - context length: {ctx_len}")
-    print(f"  - number of batches (train set): {n_batches}")
-    print(f"  - total number of tokens: {ctx_len * n_batches}")
+    logger.info(f"Dataset info:")
+    logger.info(f"  - context length: {ctx_len}")
+    logger.info(f"  - number of batches (train set): {n_batches}")
+    logger.info(f"  - total number of tokens: {ctx_len * n_batches}")
 
     return tokenized_datasets
 
@@ -67,9 +71,9 @@ def create_model(config_file, device):
     model = OptimusTransformer(config)
     model.to(device)
 
-    print(model)
+    logger.info(model)
 
     _total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
-    print(f"Number of trainable model parameters: {_total_params}")
+    logger.info(f"Number of trainable model parameters: {_total_params}")
 
     return model
diff --git a/training.py b/training.py
index 50db310..b822fb1 100644
--- a/training.py
+++ b/training.py
@@ -1,3 +1,5 @@
+import os
+import logging
 from pathlib import Path
 
 import fire
@@ -8,6 +10,9 @@ from optimus.trainer import Trainer, TrainingArguments
 from optimus.utils import setup_utils
 
 
+logger = logging.getLogger(__name__)
+
+
 def main(
     # training args
     batch_size: int = 2, # per GPU
@@ -89,6 +94,13 @@ def main(
     else:
         dataset_dir = data_dir / dataset_dir
 
+    # set up logging before anything else
+    logging.basicConfig(
+        level=os.environ.get('LOG_LEVEL', logging.INFO),
+        format='{asctime} - {filename} | {levelname} | {message}',
+        style='{',
+    )
+
     # create paths
     if isinstance(model_config_path, str):
         model_config_path = Path(model_config_path)
@@ -116,7 +128,7 @@ def main(
     else:
         save_dir = data_dir / save_dir
 
-    print(f"Running with:\n"
+    logger.info(f"Running with:\n"
         f"\t- batch size: {batch_size}\n"
         f"\t- gradient accumulation steps: {grad_acc_steps}\n"
         f"\t- context length: {seq_len}\n"
@@ -127,7 +139,6 @@ def main(
         f"\t- gradient clipping norm: {grad_clip_norm}\n"
         f"\t- 16-bit floating-point training (fp16): {use_fp16}\n"
         f"\t- seed: {seed}\n"
-        f"\t- only main rank logs: {log_on_main_rank_only}\n"
         f"\t- model config file: {model_config_path}\n"
         f"\t- huggingface tokenizer: {hf_tokenizer_name}\n"
         f"\t- training data directory: {str(data_dir)}\n"
@@ -141,7 +152,7 @@ def main(
     device = f'cuda'
 
     # load dataset and split into batches
-    dataset = setup_utils.load_and_chunk_dataset(dataset_dir, seq_len)
+    dataset = setup_utils.load_and_chunk_dataset(str(dataset_dir), seq_len)
     dataset.set_format('torch')
 
     # load tokenizer
@@ -150,7 +161,7 @@ def main(
     # create model and move to device
     model = setup_utils.create_model(model_config_path, device)
 
-    # create samplers + dataloader    train_dataloader = DataLoader(
+    # create dataloaders
     train_dataloader = DataLoader(
         dataset['train'],
         batch_size=batch_size, # per GPU
@@ -165,6 +176,7 @@ def main(
         batch_size=batch_size, # per GPU
         num_workers=6, # allow pre-fetching data through multi-process
         pin_memory=True, # fast CPU-GPU transfer
+        drop_last=True,
     )
 
     # create optimizer
@@ -215,20 +227,11 @@ def main(
         tokenizer=tokenizer,
     )
 
-    # create trainer
-    trainer = Trainer(
-        train_args=train_args,
-        model=model,
-        train_dataloader=train_dataloader,
-        eval_dataloader=eval_dataloader,
-        tokenizer=tokenizer,
-    )
-
-    print('Starting training...')
+    logger.info('Starting training...')
 
     trainer.train()
 
-    print(f"Finished training! Saving model weights to '{str(save_dir)}'")
+    logger.info(f"Finished training! Saving model weights to '{str(save_dir)}'")
 
     # save model + tokenizer
     trainer.save_model(save_dir)
-- 
GitLab