diff --git a/optimus/trainer.py b/optimus/trainer.py
index f55f73ebb1532ca67869bc0244a7723b371b9f8f..7ba83856d53089c70a9c4ccb1a3b03b3f20bbdd8 100644
--- a/optimus/trainer.py
+++ b/optimus/trainer.py
@@ -102,12 +102,12 @@ class Trainer():
     def _do_epoch_train(self):
         self.model.train() # put model in training mode
 
-        # compute average train loss, train ppl and ms/batch every ~200 batches
-        # (depending on gradient accumulation steps), or every 10% of training
-        # dataset (whichever is smaller)
+        # compute average train loss, train ppl and ms/batch every ~200 batches,
+        # or every 10% of the training dataset (whichever is smaller), rounded
+        # down to a multiple of the gradient accumulation steps
         self.ms_per_batch = 0.
         total_loss = 0.
-        est_interval = int(max(min(200 // self.grad_acc_steps, 0.1 * len(self.dl.train)), 1)) * self.grad_acc_steps
+        est_interval = max(int(min(200, 0.1 * len(self.dl.train))) // self.grad_acc_steps, 1) * self.grad_acc_steps
         start_time = time.time()
 
         # progress bar for batches
@@ -145,7 +145,7 @@ class Trainer():
                 # update train loss, train ppl and estimated ms/batch
                 if (i + 1) % est_interval == 0:
                     self.ms_per_batch = (time.time() - start_time) * 1000 / est_interval
-                    self.train_loss = total_loss / est_interval
+                    self.train_loss = (total_loss * self.grad_acc_steps) / est_interval
                     self.train_ppl = math.exp(self.train_loss)
 
                     total_loss = 0.
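
A note on the new interval arithmetic: the intent, per the updated comment, is to take the smaller of ~200 batches and 10% of the training set and round it down to a whole number of gradient-accumulation windows, so the reporting point always lines up with an optimizer step. A minimal standalone sketch of that calculation follows; num_train_batches and grad_acc_steps are hypothetical example values, not taken from the patch.

# hedged sketch of the est_interval rounding, using made-up example values
num_train_batches = 1_000    # stands in for len(self.dl.train)
grad_acc_steps = 8           # stands in for self.grad_acc_steps

# smaller of ~200 batches or 10% of the dataset, floored to a multiple of
# grad_acc_steps, but never below one full accumulation window
raw = int(min(200, 0.1 * num_train_batches))                    # -> 100
est_interval = max(raw // grad_acc_steps, 1) * grad_acc_steps   # -> 96

assert est_interval % grad_acc_steps == 0

The max(..., 1) guard matters for very small datasets: if 10% of the training set is smaller than grad_acc_steps, the floor division alone would give 0 and the later (i + 1) % est_interval check would raise ZeroDivisionError.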
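
On the train_loss change: multiplying total_loss by grad_acc_steps before dividing by est_interval is only correct if each micro-batch loss was divided by grad_acc_steps before being added to total_loss, which is the usual gradient-accumulation pattern but is not visible in this hunk. The sketch below therefore shows an assumed accumulation loop; report_running_stats and its arguments are hypothetical names used only for illustration.

import math

def report_running_stats(losses, grad_acc_steps, est_interval):
    # losses: raw per-micro-batch loss values for one reporting window
    total_loss = 0.0
    for loss in losses:
        # assumed: the backward loss is divided by grad_acc_steps so the
        # accumulated gradient matches a single full-batch update
        total_loss += loss / grad_acc_steps
    # undo that scaling when averaging, so train_loss is the true mean
    # per-micro-batch loss over the reporting window
    train_loss = (total_loss * grad_acc_steps) / est_interval
    train_ppl = math.exp(train_loss)
    return train_loss, train_ppl

# e.g. 96 micro-batches each with loss 2.0 -> mean 2.0, ppl exp(2.0) ~ 7.39
print(report_running_stats([2.0] * 96, grad_acc_steps=8, est_interval=96))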