From 643022654eb17f54e8fe9606a110cdfee815c1b4 Mon Sep 17 00:00:00 2001
From: Alexandru Gherghescu <gherghescu_alex1@yahoo.ro>
Date: Wed, 24 Jan 2024 18:57:44 +0200
Subject: [PATCH] Fix final training loss calculation, fix estimation interval

Visual change, correctly display final training loss. The final training
loss didn't account for gradient accumulation, and was therefore much
smaller than it should've been in reality.

Fix the estimation interval, which was also not properly calculated due
to gradient accumulation.
---
 optimus/trainer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/optimus/trainer.py b/optimus/trainer.py
index f55f73e..7ba8385 100644
--- a/optimus/trainer.py
+++ b/optimus/trainer.py
@@ -102,12 +102,12 @@ class Trainer():
     def _do_epoch_train(self):
         self.model.train() # put model in training mode
 
-        # compute average train loss, train ppl and ms/batch every ~200 batches
-        # (depending on gradient accumulation steps), or every 10% of training
-        # dataset (whichever is smaller)
+        # compute average train loss, train ppl and ms/batch every ~200 batches,
+        # or every 10% of training dataset (whichever is smaller), rounded to
+        # gradient accumulation steps
         self.ms_per_batch = 0.
         total_loss = 0.
-        est_interval = int(max(min(200 // self.grad_acc_steps, 0.1 * len(self.dl.train)), 1)) * self.grad_acc_steps
+        est_interval = int(max(min(200, 0.1 * len(self.dl.train)), 1)) // self.grad_acc_steps * self.grad_acc_steps
         start_time = time.time()
 
         # progress bar for batches
@@ -145,7 +145,7 @@ class Trainer():
             # update train loss, train ppl and estimated ms/batch
             if (i + 1) % est_interval == 0:
                 self.ms_per_batch = (time.time() - start_time) * 1000 / est_interval
-                self.train_loss = total_loss / est_interval
+                self.train_loss = (total_loss * self.grad_acc_steps) / est_interval
                 self.train_ppl = math.exp(self.train_loss)
 
                 total_loss = 0.
-- 
GitLab
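
A small standalone sketch of the reasoning behind both changes (not part of the
patch). It assumes the training loop divides each micro-batch loss by
grad_acc_steps before summing it into total_loss, which is typical for gradient
accumulation; the concrete numbers below are hypothetical, not taken from
optimus/trainer.py.

    # Hypothetical values for illustration only.
    grad_acc_steps = 4
    batches_per_epoch = 1000

    # New interval formula: rounded down to a multiple of grad_acc_steps,
    # so the estimation always lands on an optimizer-step boundary.
    est_interval = int(max(min(200, 0.1 * batches_per_epoch), 1)) \
        // grad_acc_steps * grad_acc_steps
    assert est_interval % grad_acc_steps == 0

    # If each micro-batch contributes loss / grad_acc_steps to total_loss,
    # the plain average total_loss / est_interval is grad_acc_steps times too
    # small; multiplying back recovers the true per-batch loss.
    per_batch_loss = 2.0  # hypothetical true loss per micro-batch
    total_loss = est_interval * (per_batch_loss / grad_acc_steps)
    train_loss = (total_loss * grad_acc_steps) / est_interval
    assert abs(train_loss - per_batch_loss) < 1e-9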