NetSys / Optimus Prime · Merge requests · !11
Fix a number of issues with the infrastructure, no major rework
Merged · Alexandru-Mihai GHERGHESCU requested to merge fix/general_small_fixes into main · 1 year ago
Fix a number of issues present in the code:
- type issues
- README.md issues
- variable initializations
- model issues
- training loss / number of batches calculation issues
optimus/trainer.py (+5 −5)
@@ -102,12 +102,12 @@ class Trainer():
     def _do_epoch_train(self):
         self.model.train()  # put model in training mode

-        # compute average train loss, train ppl and ms/batch every ~200 batches
-        # (depending on gradient accumulation steps), or every 10% of training
-        # dataset (whichever is smaller)
+        # compute average train loss, train ppl and ms/batch every ~200 batches,
+        # or every 10% of training dataset (whichever is smaller), rounded to
+        # gradient accumulation steps
         self.ms_per_batch = 0.
         total_loss = 0.
-        est_interval = int(max(min(200 // self.grad_acc_steps, 0.1 * len(self.dl.train)), 1)) * self.grad_acc_steps
+        est_interval = int(max(min(200, 0.1 * len(self.dl.train)), 1)) // self.grad_acc_steps * self.grad_acc_steps

         start_time = time.time()

         # progress bar for batches
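The est_interval change affects how the reporting interval interacts with the 10%-of-dataset cap: the old expression applied the cap before multiplying back by grad_acc_steps, so the resulting interval could overshoot it, while the new expression caps first and then rounds the count down to a multiple of grad_acc_steps. A small standalone sketch with made-up values (grad_acc_steps = 8, a 1000-batch training set) illustrates the difference:

# Sketch comparing the old and new est_interval formulas from this hunk.
# grad_acc_steps and train_batches are illustrative values, not project settings.
grad_acc_steps = 8
train_batches = 1000            # stands in for len(self.dl.train)

# old: divide the ~200-batch target by the accumulation steps, cap, then
# multiply back -> the cap is applied to the divided value, so the final
# interval can exceed 10% of the dataset
old = int(max(min(200 // grad_acc_steps, 0.1 * train_batches), 1)) * grad_acc_steps

# new: cap at ~200 batches or 10% of the dataset first, then round that
# count down to a multiple of grad_acc_steps
new = int(max(min(200, 0.1 * train_batches), 1)) // grad_acc_steps * grad_acc_steps

print(old, new)   # 200 96 -> old reports every 20% of the dataset, new stays under 10%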
@@ -145,7 +145,7 @@ class Trainer():
             # update train loss, train ppl and estimated ms/batch
             if (i + 1) % est_interval == 0:
                 self.ms_per_batch = (time.time() - start_time) * 1000 / est_interval
-                self.train_loss = total_loss / est_interval
+                self.train_loss = (total_loss * self.grad_acc_steps) / est_interval
                 self.train_ppl = math.exp(self.train_loss)
                 total_loss = 0.
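The train_loss change makes sense if the training loop divides each micro-batch loss by grad_acc_steps before adding it to total_loss; that loop is not part of this diff, so treat it as an assumption here. Under that assumption, multiplying total_loss back by grad_acc_steps restores the per-batch loss before perplexity is computed, as in this sketch with made-up numbers:

import math

# Sketch of why total_loss is multiplied back by grad_acc_steps before averaging.
# Assumes each micro-batch loss is scaled by 1/grad_acc_steps when accumulated
# (a common gradient-accumulation pattern; the accumulation code is not shown
# in this diff). All numbers are illustrative.
grad_acc_steps = 4
est_interval = 8                      # batches per reporting interval
per_batch_loss = 2.0                  # pretend every micro-batch has this loss

# what total_loss would hold after one reporting interval under that assumption
total_loss = sum(per_batch_loss / grad_acc_steps for _ in range(est_interval))

old_train_loss = total_loss / est_interval                      # 0.5, understates the loss
new_train_loss = (total_loss * grad_acc_steps) / est_interval   # 2.0, matches per_batch_loss

print(old_train_loss, new_train_loss, math.exp(new_train_loss))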