Compare revisions

Changes are shown as if the source revision was being merged into the target revision.

Target project: netsys/optimus-prime
Commits on Source (2)
  • Add fp16 mixed precision training · 6db26eb1
    Alexandru-Mihai GHERGHESCU authored
    This should give training a theoretical 2x speedup (though in
    practice the gains are usually smaller), with close to no loss in
    model performance.
    
    The interface lets the user choose between mixed precision training
    and normal float32 training.
    
    CPU training support has been dropped: with or without mixed
    precision, training on CPUs takes far longer than on GPUs, and it is
    not an alternative anyone seriously considers. With the addition of
    mixed precision, supporting both CPU and GPU would also complicate
    the code too much.
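
    In torch.cuda.amp terms, the recipe this commit adopts is: run the
    forward pass under autocast so most ops execute in fp16, and scale
    the loss with a GradScaler so small fp16 gradients don't underflow
    before the optimizer step. A minimal runnable sketch of that recipe
    (the tiny linear model, random batch, and eps value are illustrative
    stand-ins, not the Optimus model or its settings):

        import torch
        import torch.nn as nn

        device = 'cuda'
        model = nn.Linear(16, 4).to(device)
        criterion = nn.CrossEntropyLoss()
        optimizer = torch.optim.Adam(model.parameters(), eps=1e-7)

        use_fp16 = True
        scaler = torch.cuda.amp.GradScaler(enabled=use_fp16)

        x = torch.randn(8, 16, device=device)
        y = torch.randint(0, 4, (8,), device=device)

        # forward pass runs in fp16 where safe, fp32 where needed
        with torch.cuda.amp.autocast(dtype=torch.float16, enabled=use_fp16):
            output = model(x)
            loss = criterion(output, y)

        # scale the loss so small fp16 gradients don't flush to zero
        scaler.scale(loss).backward()
        scaler.step(optimizer)   # unscales gradients, skips step on inf/nan
        scaler.update()          # adjusts the scale for the next iteration
        optimizer.zero_grad()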
  • Adjust optimizer epsilon value for AMP · 8579fc15
    Alexandru-Mihai GHERGHESCU authored
    Pick a better default epsilon value. In mixed precision training this
    value never touches the fp16 gradients (the optimizer only ever works
    on the master fp32 copy of the model), so strictly speaking it didn't
    need to change. In pure fp16 training, however, any epsilon value
    below roughly 1e-7 simply underflows to 0 and becomes useless.
    
    Although the framework doesn't directly support the second case above,
    an epsilon value of 1e-7 seems like a better default for both AMP and
    normal training.
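
    The underflow is easy to verify: the smallest positive fp16 subnormal
    is 2**-24 (about 6e-8), so the old default of eps=1e-9 rounds to
    exactly zero in fp16, while 1e-7 survives. A quick check:

        import torch

        print(torch.tensor(1e-9, dtype=torch.float16))  # tensor(0., dtype=torch.float16)
        print(torch.tensor(1e-7, dtype=torch.float16))  # tensor(1.1921e-07, dtype=torch.float16)
        print(torch.finfo(torch.float16).tiny)          # 6.103515625e-05, smallest normal fp16 value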
@@ -22,6 +22,7 @@ class Trainer():
                  grad_acc_steps: int,
                  grad_clip_norm: float,
                  model_save_path: str,
+                 use_fp16: bool,
                  progress_bar: bool = True):
         """
         Trainer implementation for Optimus models.
@@ -40,6 +41,9 @@ class Trainer():
             grad_clip_norm (float): Gradient clipping norm value.
             model_save_path (str): The best model (based on validation loss) is
                 saved to the specified path.
+            use_fp16 (bool): Whether to train the model in 16-bit floating point
+                precision. If such hardware is not supported, a warning is
+                issued and normal 32-bit precision is used instead.
             progress_bar (bool): Whether to show a progress bar in console while
                 training. This is automatically disabled if output is a file,
                 however some stats are printed after finishing epochs. If False,
@@ -58,6 +62,10 @@ class Trainer():
         self.grad_clip_norm = grad_clip_norm
         self.model_save_path = model_save_path
+        self.use_fp16 = use_fp16
+        self.fp16_dtype = torch.float16
+
         self.progress_bar = progress_bar

     def fit(self, n_epochs: int) -> None:
@@ -76,6 +84,9 @@ class Trainer():
             epochs=n_epochs,
             steps_per_epoch=len(self.dl.train) // self.grad_acc_steps)

+        # scaler used for mixed precision fp16 training on GPU
+        self.scaler = torch.cuda.amp.GradScaler(enabled=self.use_fp16)
+
         best_val_loss = float('inf')

         # progress bar for epochs
@@ -120,11 +131,16 @@ class Trainer():
                 if self.progress_bar is True:
                     pb.update(i)

-                output = self.model(x)
-                loss = self.criterion(output.view(-1, len(self.dl.train.tok)),
-                                      y.reshape(-1))
-                loss = loss / self.grad_acc_steps # normalize to account for gradient accumulation
-                loss.backward()
+                # automatic mixed precision training
+                with torch.cuda.amp.autocast(dtype=self.fp16_dtype,
+                                             enabled=self.use_fp16):
+                    output = self.model(x)
+                    loss = self.criterion(output.view(-1, len(self.dl.train.tok)),
+                                          y.reshape(-1))
+                    loss = loss / self.grad_acc_steps # normalize to account for gradient accumulation
+
+                self.scaler.scale(loss).backward()

                 total_loss += loss.item()
@@ -135,10 +151,12 @@ class Trainer():
                     # number of batches doesn't cleanly divide by grad_acc_steps

                     # gradient clipping
+                    self.scaler.unscale_(self.optimizer)
                     nn.utils.clip_grad_norm_(self.model.parameters(),
                                              max_norm=self.grad_clip_norm)

-                    self.optimizer.step()
+                    self.scaler.step(self.optimizer)
+                    self.scaler.update()
                     self.optimizer.zero_grad()
                     self.scheduler.step()
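
Unscaling before clipping is what keeps grad_clip_norm meaningful here:
the gradients produced by scaler.scale(loss).backward() are multiplied by
the current loss scale, so scaler.unscale_(optimizer) divides them back
to their true values before nn.utils.clip_grad_norm_ runs, and
scaler.step() then skips the internal unscale it would otherwise perform.
A minimal runnable sketch of that ordering (tiny stand-in model and clip
value, not the trainer's real ones):

    import torch
    import torch.nn as nn

    model = nn.Linear(4, 2).cuda()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
    scaler = torch.cuda.amp.GradScaler()

    loss = model(torch.randn(3, 4, device='cuda')).sum()
    scaler.scale(loss).backward()   # grads are now loss-scale times too big
    scaler.unscale_(optimizer)      # divide grads back to their true values
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
    scaler.step(optimizer)          # sees the earlier unscale_, steps once
    scaler.update()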
@@ -173,9 +191,12 @@ class Trainer():
                 if self.progress_bar is True:
                     pb.update(i)

-                output = self.model(x)
-                loss = self.criterion(output.view(-1, len(self.dl.test.tok)),
-                                      y.reshape(-1))
+                with torch.cuda.amp.autocast(dtype=self.fp16_dtype,
+                                             enabled=self.use_fp16):
+                    output = self.model(x)
+                    loss = self.criterion(output.view(-1, len(self.dl.test.tok)),
+                                          y.reshape(-1))

                 total_loss += loss.item()
                 self.mb.child.comment = f" | valid loss: {loss.item():.4f}"
@@ -21,7 +21,7 @@ def main(batch_size: int = 8,
          n_layers: int = 6,
          n_heads: int = 8,
          dropout: float = 0.0,
-         device: str = 'cuda'):
+         use_fp16: bool = True):
     """
     Run the main training loop for the model.
@@ -40,8 +40,8 @@ def main(batch_size: int = 8,
         n_layers (int): Number of layers for the model.
         n_heads (int): Number of heads inside an attention layer for the model.
         dropout (float): Dropout to use for the model.
-        device (str): Device where to train the model. Viable options are 'cpu',
-            'cuda', 'cuda:2' etc.
+        use_fp16 (bool): Whether to train using 16-bit floating point
+            precision.
     """
@@ -58,7 +58,7 @@ def main(batch_size: int = 8,
           f"\t- model layers: {n_layers}\n"
           f"\t- model attention heads: {n_heads}\n"
           f"\t- model dropout: {dropout}\n"
-          f"\t- training on device: {device}\n"
+          f"\t- 16-bit floating-point training (fp16): {use_fp16}\n"
           f"Please see '--help' if you want to change these settings")

     # load tokenizer
@@ -75,7 +75,7 @@ def main(batch_size: int = 8,
     dl = OptimusDataLoader(train_ds, test_ds, tok,
                            bs=batch_size,
                            seq_len=seq_len,
-                           device=device)
+                           device='cuda')

     # create model and move to device
     model = OptimusTransformer(len(tok),
@@ -84,14 +84,19 @@ def main(batch_size: int = 8,
                                n_heads=n_heads,
                                p_drop=dropout,
                                weight_tying=False)
-    model = model.to(device)
+    model = model.to('cuda')

     _total_params = sum(p.numel() for p in model.parameters())
     print(f"Number of model parameters: {_total_params}")

-    # define loss metric and optimizer
+    # define loss metric
     criterion = nn.CrossEntropyLoss()
-    optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.999), eps=1e-9)
+
+    # define optimizer
+    # see [1] for a discussion on what the epsilon value should be for amp;
+    # 1e-7 is a good default for both amp and normal training
+    # [1]: https://github.com/pytorch/pytorch/issues/26218
+    optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.999), eps=1e-7)

     print("Starting training...")
@@ -104,6 +109,7 @@ def main(batch_size: int = 8,
                       grad_acc_steps=grad_acc_steps,
                       grad_clip_norm=grad_clip_norm,
                       model_save_path=checkpoints_path,
+                      use_fp16=use_fp16,
                       progress_bar=True)

     trainer.fit(epochs)