diff --git a/optimus/trainer.py b/optimus/trainer.py
index f55f73ebb1532ca67869bc0244a7723b371b9f8f..7ba83856d53089c70a9c4ccb1a3b03b3f20bbdd8 100644
--- a/optimus/trainer.py
+++ b/optimus/trainer.py
@@ -102,12 +102,12 @@ class Trainer():
     def _do_epoch_train(self):
         self.model.train() # put model in training mode
 
-        # compute average train loss, train ppl and ms/batch every ~200 batches
-        # (depending on gradient accumulation steps), or every 10% of training
-        # dataset (whichever is smaller)
+        # compute average train loss, train ppl and ms/batch every ~200 batches,
+        # or every 10% of training dataset (whichever is smaller), rounded to
+        # gradient accumulation steps
         self.ms_per_batch = 0.
         total_loss = 0.
-        est_interval = int(max(min(200 // self.grad_acc_steps, 0.1 * len(self.dl.train)), 1)) * self.grad_acc_steps
+        est_interval = int(max(min(200, 0.1 * len(self.dl.train)), 1)) // self.grad_acc_steps * self.grad_acc_steps
 
         start_time = time.time()
         # progress bar for batches
@@ -145,7 +145,7 @@ class Trainer():
             # update train loss, train ppl and estimated ms/batch
             if (i + 1) % est_interval == 0:
                 self.ms_per_batch = (time.time() - start_time) * 1000 / est_interval
-                self.train_loss = total_loss / est_interval
+                self.train_loss = (total_loss * self.grad_acc_steps) / est_interval
                 self.train_ppl = math.exp(self.train_loss)
 
                 total_loss = 0.
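
A minimal sketch (not part of the patch) of how the two changes fit together, assuming the training loop divides each micro-batch loss by `grad_acc_steps` before adding it to `total_loss`, which the rescaling in the second hunk implies but which is outside these hunks. All concrete values (`grad_acc_steps = 4`, 1000 training batches, a constant per-batch loss of 2.0) are made up for illustration.

```python
# Illustration only; values are hypothetical and not taken from the patch.
grad_acc_steps = 4
n_train_batches = 1000  # stands in for len(self.dl.train)

# New est_interval: min(~200 batches, 10% of the dataset), floored to a
# multiple of grad_acc_steps so the reporting boundary never falls in the
# middle of an accumulation window.
est_interval = int(max(min(200, 0.1 * n_train_batches), 1)) \
    // grad_acc_steps * grad_acc_steps
assert est_interval % grad_acc_steps == 0
print(est_interval)  # 100

# Assume every micro-batch has a true loss of 2.0 and the loop accumulates
# loss / grad_acc_steps (the usual gradient-accumulation pattern), so after
# est_interval micro-batches:
total_loss = sum(2.0 / grad_acc_steps for _ in range(est_interval))

old_report = total_loss / est_interval                     # 0.5, off by grad_acc_steps
new_report = (total_loss * grad_acc_steps) / est_interval  # 2.0, the true per-batch loss
print(old_report, new_report)
```

Under that assumption, multiplying `total_loss` by `grad_acc_steps` before averaging recovers the true per-batch loss, and keeping `est_interval` a multiple of `grad_acc_steps` keeps the reported window aligned with whole optimizer steps.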