From 643022654eb17f54e8fe9606a110cdfee815c1b4 Mon Sep 17 00:00:00 2001
From: Alexandru Gherghescu <gherghescu_alex1@yahoo.ro>
Date: Wed, 24 Jan 2024 18:57:44 +0200
Subject: [PATCH] Fix final training loss calculation, fix estimation interval

Visual change, correctly display final training loss. The final training
loss didn't account for gradient accumulation, and was therefore much
smaller than it should've been in reality.

Fix the estimation interval, which was also not properly calculated due
to gradient accumulation.
---
 optimus/trainer.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/optimus/trainer.py b/optimus/trainer.py
index f55f73e..7ba8385 100644
--- a/optimus/trainer.py
+++ b/optimus/trainer.py
@@ -102,12 +102,12 @@ class Trainer():
     def _do_epoch_train(self):
         self.model.train() # put model in training mode
 
-        # compute average train loss, train ppl and ms/batch every ~200 batches
-        # (depending on gradient accumulation steps), or every 10% of training
-        # dataset (whichever is smaller)
+        # compute average train loss, train ppl and ms/batch every ~200 batches,
+        # or every 10% of training dataset (whichever is smaller), rounded to
+        # gradient accumulation steps
         self.ms_per_batch = 0.
         total_loss = 0.
-        est_interval = int(max(min(200 // self.grad_acc_steps, 0.1 * len(self.dl.train)), 1)) * self.grad_acc_steps
+        est_interval = int(max(min(200, 0.1 * len(self.dl.train)), 1)) // self.grad_acc_steps * self.grad_acc_steps
         start_time = time.time()
 
         # progress bar for batches
@@ -145,7 +145,7 @@ class Trainer():
             # update train loss, train ppl and estimated ms/batch
             if (i + 1) % est_interval == 0:
                 self.ms_per_batch = (time.time() - start_time) * 1000 / est_interval
-                self.train_loss = total_loss / est_interval
+                self.train_loss = (total_loss * self.grad_acc_steps) / est_interval
                 self.train_ppl = math.exp(self.train_loss)
 
                 total_loss = 0.
-- 
GitLab
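
A small standalone sketch of the reasoning behind both changes (not part of the
patch). It assumes the training loop divides each micro-batch loss by
grad_acc_steps before summing it into total_loss, which is typical for gradient
accumulation; the concrete numbers below are hypothetical, not taken from
optimus/trainer.py.

    # Hypothetical values for illustration only.
    grad_acc_steps = 4
    batches_per_epoch = 1000

    # New interval formula: rounded down to a multiple of grad_acc_steps,
    # so the estimation always lands on an optimizer-step boundary.
    est_interval = int(max(min(200, 0.1 * batches_per_epoch), 1)) \
        // grad_acc_steps * grad_acc_steps
    assert est_interval % grad_acc_steps == 0

    # If each micro-batch contributes loss / grad_acc_steps to total_loss,
    # the plain average total_loss / est_interval is grad_acc_steps times too
    # small; multiplying back recovers the true per-batch loss.
    per_batch_loss = 2.0  # hypothetical true loss per micro-batch
    total_loss = est_interval * (per_batch_loss / grad_acc_steps)
    train_loss = (total_loss * grad_acc_steps) / est_interval
    assert abs(train_loss - per_batch_loss) < 1e-9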