From d96dcf92723f4749c250de184e0a695b4ddcb12e Mon Sep 17 00:00:00 2001 From: Alexandru Gherghescu <gherghescu_alex1@yahoo.ro> Date: Tue, 4 Jun 2024 00:32:22 +0300 Subject: [PATCH] Add logging throughout the framework code This should make it much easier to control logging levels, when and where to log etc. --- inference.py | 22 ++++++----- optimus/trainer.py | 71 +++++++----------------------------- optimus/utils/setup_utils.py | 24 +++++++----- training.py | 33 +++++++++-------- 4 files changed, 58 insertions(+), 92 deletions(-) diff --git a/inference.py b/inference.py index 3463d93..99426b7 100644 --- a/inference.py +++ b/inference.py @@ -1,6 +1,7 @@ import os import time import warnings +import logging import fire import torch @@ -10,6 +11,9 @@ from optimus.models.optimus import OptimusTransformer, OptimusConfig from optimus.utils import setup_utils +logger = logging.getLogger(__name__) + + def sample_top_p(probs: Tensor, top_p: float) -> Tensor: """ Nucleus (top-p) sampling. @@ -67,7 +71,7 @@ def main(model_path: str = 'model.pth', """ - print(f"Running with:\n" + logger.info(f"Running with:\n" f"\t- model path: '{model_path}'\n" f"\t- huggingface tokenizer: '{hf_tokenizer_name}'\n" f"\t- prompt: '{prompt[:30] + ('' if len(prompt) <= 30 else '...') if prompt is not None else '(empty)'}'\n" @@ -97,7 +101,7 @@ def main(model_path: str = 'model.pth', warnings.simplefilter('ignore') torch.set_default_tensor_type(torch.FloatTensor) - print('Loading model from disk...') + logger.info('Loading model from disk...') # load state from file assert os.path.exists(model_path) @@ -128,18 +132,18 @@ def main(model_path: str = 'model.pth', model.load_state_dict(state, strict=True) model.eval() - print(f'Loaded model on device {device}!') + logger.info(f'Loaded model on device {device}!') _total_params = sum(p.numel() for p in model.parameters()) - print(f'Number of model parameters: {_total_params}') + logger.info(f'Number of model parameters: {_total_params}') # inference loop - print('Starting inference...') + logger.info('Starting inference...') if prompt is not None: input_sentence = prompt else: - print('Waiting for user input... (prompt to complete)') + logger.info('Waiting for user input... (prompt to complete)') input_sentence = input('User: ') # tokenize input @@ -181,10 +185,10 @@ def main(model_path: str = 'model.pth', seq_len = inp.shape[-1] - print(f"Model output: {' '.join(tokenizer.decode(inp.tolist()))}") - print(f'Tokens / second: {toks_generated / (time.time() - start_time):.2f}') + logger.info(f"Model output: {' '.join(tokenizer.decode(inp.tolist()))}") + logger.info(f'Tokens / second: {toks_generated / (time.time() - start_time):.2f}') - print('Finished inference!') + logger.info('Finished inference!') if __name__=='__main__': diff --git a/optimus/trainer.py b/optimus/trainer.py index 39db79a..2984f6b 100644 --- a/optimus/trainer.py +++ b/optimus/trainer.py @@ -1,4 +1,4 @@ -import json +import logging from pathlib import Path from typing import Optional @@ -9,6 +9,9 @@ from torch.utils.data import DataLoader from transformers.tokenization_utils_base import PreTrainedTokenizerBase +logger = logging.getLogger(__name__) + + class TrainingArguments(): """ Training arguments class to hold important switches and knobs related to @@ -71,11 +74,6 @@ class TrainingArguments(): self.save_steps = save_steps self.save_limit = save_limit - @classmethod - def from_json_file(cls, file_path): - with open(file_path, 'r') as file: - return cls(**json.load(file)) - class Trainer(): """ @@ -127,13 +125,13 @@ class Trainer(): # scaler used for mixed precision fp16 training on GPU self.scaler = torch.cuda.amp.GradScaler(enabled=self.args.use_fp16) - print("***** Running training *****") - print(f" Num examples = {num_examples:,}") - print(f" Num epochs = {self.args.num_train_epochs:,}") - print(f" Instantaneous batch size per device = {self.args.per_device_batch_size:,}") - print(f" Gradient Accumulation steps = {self.args.gradient_accumulation_steps}") - print(f" Global batch size (w. distributed & accumulation) = {global_batch_size:,}") - print(f" Total optimization steps = {max_steps:,}") + logger.info("***** Running training *****") + logger.info(f" Num examples = {num_examples:,}") + logger.info(f" Num epochs = {self.args.num_train_epochs:,}") + logger.info(f" Instantaneous batch size per device = {self.args.per_device_batch_size:,}") + logger.info(f" Gradient Accumulation steps = {self.args.gradient_accumulation_steps}") + logger.info(f" Global batch size (w. distributed & accumulation) = {global_batch_size:,}") + logger.info(f" Total optimization steps = {max_steps:,}") self.model.train() @@ -154,7 +152,7 @@ class Trainer(): logits = self.model(inputs) labels = inputs[..., 1:].contiguous().view(-1) - logits = logits[..., :-1, :].contiguous().view(-1, self.model.module.vocab_size) + logits = logits[..., :-1, :].contiguous().view(-1, self.model.vocab_size) loss = loss_fn(logits, labels) @@ -182,53 +180,10 @@ class Trainer(): # lr = self.args.lr_scheduler.get_last_lr()[0] if (step + 1) % self.args.log_steps * self.args.gradient_accumulation_steps == 0: - print(f"Loss is {tr_loss:,}") + logger.info(f"Loss is {tr_loss:,}") self.progress.update(1) - # def _do_epoch_validate(self): - # self.model.eval() # put model in eval mode - - # total_loss = 0. - - # # progress bar for batches - # pb = progress_bar(range(len(self.dl.test)), parent=self.mb) - - # with torch.no_grad(): - # for i, (x, y) in enumerate(self.dl.test): - - # if self.progress_bar is True: - # pb.update(i) - - # with torch.cuda.amp.autocast(dtype=self.fp16_dtype, - # enabled=self.use_fp16): - # output = self.model(x) - # loss = self.criterion(output.view(-1, len(self.dl.test.tok)), - # y.reshape(-1)) - - # total_loss += loss.item() - - # self.mb.child.comment = f" | valid loss: {loss.item():.4f}" - - # self.val_loss = total_loss / (len(self.dl.test) - 1) - # self.val_ppl = math.exp(self.val_loss) - - # pb.on_iter_end() - - # def _write_epoch_stats(self): - # if self.progress_bar is True: - # epoch_time = format_time(self.epoch_time) - # self.mb.write( - # f"* End of epoch {self.epoch:3d}:\n" - # f"\tTotal time: {epoch_time:9s} | " - # f"Est. ms/batch: {self.ms_per_batch:.2f}\n" - # f"\tTotal train batches: {len(self.dl.train):10d} | " - # f"Train loss: {self.train_loss: 7.2f} | " - # f"Train perplexity: {self.train_ppl: 8.2f}\n" - # f"\tTotal valid batches: {len(self.dl.test):10d} | " - # f"Valid loss: {self.val_loss: 7.2f} | " - # f"Valid perplexity: {self.val_ppl: 8.2f}") - def save_model(self, save_dir: Path) -> None: """ Save model and tokenizer to a directory. diff --git a/optimus/utils/setup_utils.py b/optimus/utils/setup_utils.py index 7802ea4..d50ad19 100644 --- a/optimus/utils/setup_utils.py +++ b/optimus/utils/setup_utils.py @@ -1,8 +1,12 @@ +import logging + from transformers import AutoTokenizer from datasets import load_from_disk -from optimus.models import OptimusTransformer -from optimus.models.optimus import OptimusConfig +from optimus.models.optimus import OptimusTransformer, OptimusConfig + + +logger = logging.getLogger(__name__) def load_and_chunk_dataset(data_dir, seq_len): @@ -36,15 +40,15 @@ def load_and_chunk_dataset(data_dir, seq_len): ) tokenized_datasets = tokenized_datasets.select_columns(['input_ids']) - print("Result: ") - print(tokenized_datasets) + logger.info("Result: ") + logger.info(tokenized_datasets) ctx_len = len(tokenized_datasets['train'][0]['input_ids']) n_batches = len(tokenized_datasets['train']) - print(f"Dataset info:") - print(f" - context length: {ctx_len}") - print(f" - number of batches (train set): {n_batches}") - print(f" - total number of tokens: {ctx_len * n_batches}") + logger.info(f"Dataset info:") + logger.info(f" - context length: {ctx_len}") + logger.info(f" - number of batches (train set): {n_batches}") + logger.info(f" - total number of tokens: {ctx_len * n_batches}") return tokenized_datasets @@ -67,9 +71,9 @@ def create_model(config_file, device): model = OptimusTransformer(config) model.to(device) - print(model) + logger.info(model) _total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) - print(f"Number of trainable model parameters: {_total_params}") + logger.info(f"Number of trainable model parameters: {_total_params}") return model diff --git a/training.py b/training.py index 50db310..b822fb1 100644 --- a/training.py +++ b/training.py @@ -1,3 +1,5 @@ +import os +import logging from pathlib import Path import fire @@ -8,6 +10,9 @@ from optimus.trainer import Trainer, TrainingArguments from optimus.utils import setup_utils +logger = logging.getLogger(__name__) + + def main( # training args batch_size: int = 2, # per GPU @@ -89,6 +94,13 @@ def main( else: dataset_dir = data_dir / dataset_dir + # set up logging before anything else + logging.basicConfig( + level=os.environ.get('LOG_LEVEL', logging.INFO), + format='{asctime} - {filename} | {levelname} | {message}', + style='{', + ) + # create paths if isinstance(model_config_path, str): model_config_path = Path(model_config_path) @@ -116,7 +128,7 @@ def main( else: save_dir = data_dir / save_dir - print(f"Running with:\n" + logger.info(f"Running with:\n" f"\t- batch size: {batch_size}\n" f"\t- gradient accumulation steps: {grad_acc_steps}\n" f"\t- context length: {seq_len}\n" @@ -127,7 +139,6 @@ def main( f"\t- gradient clipping norm: {grad_clip_norm}\n" f"\t- 16-bit floating-point training (fp16): {use_fp16}\n" f"\t- seed: {seed}\n" - f"\t- only main rank logs: {log_on_main_rank_only}\n" f"\t- model config file: {model_config_path}\n" f"\t- huggingface tokenizer: {hf_tokenizer_name}\n" f"\t- training data directory: {str(data_dir)}\n" @@ -141,7 +152,7 @@ def main( device = f'cuda' # load dataset and split into batches - dataset = setup_utils.load_and_chunk_dataset(dataset_dir, seq_len) + dataset = setup_utils.load_and_chunk_dataset(str(dataset_dir), seq_len) dataset.set_format('torch') # load tokenizer @@ -150,7 +161,7 @@ def main( # create model and move to device model = setup_utils.create_model(model_config_path, device) - # create samplers + dataloader train_dataloader = DataLoader( + # create dataloaders train_dataloader = DataLoader( dataset['train'], batch_size=batch_size, # per GPU @@ -165,6 +176,7 @@ def main( batch_size=batch_size, # per GPU num_workers=6, # allow pre-fetching data through multi-process pin_memory=True, # fast CPU-GPU transfer + drop_last=True, ) # create optimizer @@ -215,20 +227,11 @@ def main( tokenizer=tokenizer, ) - # create trainer - trainer = Trainer( - train_args=train_args, - model=model, - train_dataloader=train_dataloader, - eval_dataloader=eval_dataloader, - tokenizer=tokenizer, - ) - - print('Starting training...') + logger.info('Starting training...') trainer.train() - print(f"Finished training! Saving model weights to '{str(save_dir)}'") + logger.info(f"Finished training! Saving model weights to '{str(save_dir)}'") # save model + tokenizer trainer.save_model(save_dir) -- GitLab