
Fix a number of issues with the infrastructure, no major rework

Merged: Alexandru-Mihai GHERGHESCU requested to merge fix/general_small_fixes into main
1 file changed, +5 −5
@@ -102,12 +102,12 @@ class Trainer():
     def _do_epoch_train(self):
         self.model.train() # put model in training mode
-        # compute average train loss, train ppl and ms/batch every ~200 batches
-        # (depending on gradient accumulation steps), or every 10% of training
-        # dataset (whichever is smaller)
+        # compute average train loss, train ppl and ms/batch every ~200 batches,
+        # or every 10% of training dataset (whichever is smaller), rounded to
+        # gradient accumulation steps
         self.ms_per_batch = 0.
         total_loss = 0.
-        est_interval = int(max(min(200 // self.grad_acc_steps, 0.1 * len(self.dl.train)), 1)) * self.grad_acc_steps
+        est_interval = int(max(min(200, 0.1 * len(self.dl.train)), 1)) // self.grad_acc_steps * self.grad_acc_steps
         start_time = time.time()
         # progress bar for batches
@@ -145,7 +145,7 @@ class Trainer():
             # update train loss, train ppl and estimated ms/batch
             if (i + 1) % est_interval == 0:
                 self.ms_per_batch = (time.time() - start_time) * 1000 / est_interval
-                self.train_loss = total_loss / est_interval
+                self.train_loss = (total_loss * self.grad_acc_steps) / est_interval
                 self.train_ppl = math.exp(self.train_loss)
                 total_loss = 0.
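To see why the first change matters, here is a minimal standalone sketch of the interval computation, using hypothetical numbers (train_len and grad_acc_steps stand in for len(self.dl.train) and self.grad_acc_steps):

    # corrected: take the min of the two caps first, then round down to a
    # multiple of grad_acc_steps
    train_len = 1000
    grad_acc_steps = 8

    est_interval = int(max(min(200, 0.1 * train_len), 1)) // grad_acc_steps * grad_acc_steps
    print(est_interval)  # 96 -- largest multiple of 8 not above the 100-batch cap

    # the old expression scaled the 200-batch cap down *before* taking the
    # minimum, then multiplied back up, so it could overshoot both caps:
    old_interval = int(max(min(200 // grad_acc_steps, 0.1 * train_len), 1)) * grad_acc_steps
    print(old_interval)  # 200 -- exceeds the intended 10%-of-dataset cap of 100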
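The second change rescales the logged loss. Assuming, as the fix implies, that the loop accumulates loss / grad_acc_steps per batch (the usual gradient-accumulation pattern of scaling each micro-batch loss before backward), a toy check with hypothetical numbers shows the old average was too small by that factor:

    grad_acc_steps = 4
    est_interval = 8        # a multiple of grad_acc_steps, as computed above
    per_batch_loss = 2.0    # pretend every batch has the same raw loss

    # each batch contributes its loss divided by grad_acc_steps
    total_loss = sum(per_batch_loss / grad_acc_steps for _ in range(est_interval))

    old_avg = total_loss / est_interval                     # 0.5 -- underestimates
    new_avg = (total_loss * grad_acc_steps) / est_interval  # 2.0 -- matches per_batch_loss
    print(old_avg, new_avg)

Since train_ppl is math.exp(self.train_loss), the underestimated loss also deflated the reported perplexity, so both logged metrics are corrected by this one-line fix.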