Unverified commit 6db26eb1, authored by Alexandru-Mihai GHERGHESCU

Add fp16 mixed precision training

This should give a theoretical 2x speedup in training time (though in
practice the gain is usually smaller), with close to no loss in model
quality.

The interface lets the user choose between mixed precision training and
regular training, the latter falling back to normal float32 precision.

CPU support for training has been dropped: with or without mixed
precision, training on the CPU takes far longer than on GPUs, and it's
not an alternative anyone seriously considers. With the addition of
mixed precision, supporting both CPU and GPU would complicate the code
too much.
parent fe76efab
1 merge request: !17 Add fp16 mixed precision training
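For context, the change follows the standard PyTorch automatic mixed precision recipe: torch.cuda.amp.autocast wraps the forward pass and loss computation, while torch.cuda.amp.GradScaler handles loss scaling around the backward pass and optimizer step. The sketch below is a minimal, self-contained illustration of that recipe, not the project's Trainer; the model, data, and hyperparameters are placeholders chosen only so the loop runs.

import torch
import torch.nn as nn

# Stand-in model and data; the real Trainer wires in OptimusTransformer,
# the OptimusDataLoader, gradient accumulation and an LR scheduler.
device = 'cuda'
model = nn.Linear(16, 4).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
scaler = torch.cuda.amp.GradScaler(enabled=True)

for _ in range(10):
    x = torch.randn(8, 16, device=device)
    y = torch.randint(0, 4, (8,), device=device)

    # forward pass and loss run in fp16 where it is numerically safe
    with torch.cuda.amp.autocast(dtype=torch.float16, enabled=True):
        output = model(x)
        loss = criterion(output, y)

    # scale the loss so small fp16 gradients don't underflow, then backprop
    scaler.scale(loss).backward()

    # unscale before clipping so the threshold applies to the true gradient norm
    scaler.unscale_(optimizer)
    nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

    # step() skips the optimizer update if inf/NaN gradients were produced;
    # update() then adjusts the scale factor for the next iteration
    scaler.step(optimizer)
    scaler.update()
    optimizer.zero_grad()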
@@ -22,6 +22,7 @@ class Trainer():
                  grad_acc_steps: int,
                  grad_clip_norm: float,
                  model_save_path: str,
+                 use_fp16: bool,
                  progress_bar: bool = True):
         """
         Trainer implementation for Optimus models.
@@ -40,6 +41,9 @@ class Trainer():
             grad_clip_norm (float): Gradient clipping norm value.
             model_save_path (str): The best model (based on validation loss) is
                 saved to the specified path.
+            use_fp16 (bool): Whether to train the model in 16-bit floating point
+                precision. If such hardware is not supported, a warning is
+                issued and normal 32-bit precision is used instead.
             progress_bar (bool): Whether to show a progress bar in console while
                 training. This is automatically disabled if output is a file,
                 however some stats are printed after finishing epochs. If False,
@@ -58,6 +62,10 @@ class Trainer():
         self.grad_clip_norm = grad_clip_norm
         self.model_save_path = model_save_path
+        self.use_fp16 = use_fp16
+        self.fp16_dtype = torch.float16
         self.progress_bar = progress_bar

     def fit(self, n_epochs: int) -> None:
@@ -76,6 +84,9 @@ class Trainer():
             epochs=n_epochs,
             steps_per_epoch=len(self.dl.train) // self.grad_acc_steps)

+        # scaler used for mixed precision fp16 training on GPU
+        self.scaler = torch.cuda.amp.GradScaler(enabled=self.use_fp16)

         best_val_loss = float('inf')

         # progress bar for epochs
@@ -120,11 +131,16 @@ class Trainer():
             if self.progress_bar is True:
                 pb.update(i)

-            output = self.model(x)
-            loss = self.criterion(output.view(-1, len(self.dl.train.tok)),
-                                  y.reshape(-1))
-            loss = loss / self.grad_acc_steps # normalize to account for gradient accumulation
-            loss.backward()
+            # automatic mixed precision training
+            with torch.cuda.amp.autocast(dtype=self.fp16_dtype,
+                                         enabled=self.use_fp16):
+                output = self.model(x)
+                loss = self.criterion(output.view(-1, len(self.dl.train.tok)),
+                                      y.reshape(-1))
+
+            loss = loss / self.grad_acc_steps # normalize to account for gradient accumulation
+            self.scaler.scale(loss).backward()

             total_loss += loss.item()
@@ -135,10 +151,12 @@ class Trainer():
                 # number of batches doesn't cleanly divide by grad_acc_steps

                 # gradient clipping
+                self.scaler.unscale_(self.optimizer)
                 nn.utils.clip_grad_norm_(self.model.parameters(),
                                          max_norm=self.grad_clip_norm)

-                self.optimizer.step()
+                self.scaler.step(self.optimizer)
+                self.scaler.update()
                 self.optimizer.zero_grad()
                 self.scheduler.step()
@@ -173,9 +191,12 @@ class Trainer():
             if self.progress_bar is True:
                 pb.update(i)

-            output = self.model(x)
-            loss = self.criterion(output.view(-1, len(self.dl.test.tok)),
-                                  y.reshape(-1))
+            with torch.cuda.amp.autocast(dtype=self.fp16_dtype,
+                                         enabled=self.use_fp16):
+                output = self.model(x)
+                loss = self.criterion(output.view(-1, len(self.dl.test.tok)),
+                                      y.reshape(-1))

             total_loss += loss.item()
             self.mb.child.comment = f" | valid loss: {loss.item():.4f}"
...
@@ -21,7 +21,7 @@ def main(batch_size: int = 8,
          n_layers: int = 6,
          n_heads: int = 8,
          dropout: float = 0.0,
-         device: str = 'cuda'):
+         use_fp16: bool = True):
     """
     Run the main training loop for the model.
@@ -40,8 +40,8 @@ def main(batch_size: int = 8,
         n_layers (int): Number of layers for the model.
         n_heads (int): Number of heads inside an attention layer for the model.
         dropout (float): Dropout to use for the model.
-        device (str): Device where to train the model. Viable options are 'cpu',
-            'cuda', 'cuda:2' etc.
+        use_fp16 (bool): Whether to train using floating-point 16-bits
+            precision.
     """
@@ -58,7 +58,7 @@ def main(batch_size: int = 8,
           f"\t- model layers: {n_layers}\n"
           f"\t- model attention heads: {n_heads}\n"
           f"\t- model dropout: {dropout}\n"
-          f"\t- training on device: {device}\n"
+          f"\t- 16-bit floating-point training (fp16): {use_fp16}\n"
           f"Please see '--help' if you want to change these settings")

     # load tokenizer
@@ -75,7 +75,7 @@ def main(batch_size: int = 8,
     dl = OptimusDataLoader(train_ds, test_ds, tok,
                            bs=batch_size,
                            seq_len=seq_len,
-                           device=device)
+                           device='cuda')

     # create model and move to device
     model = OptimusTransformer(len(tok),
@@ -84,7 +84,7 @@ def main(batch_size: int = 8,
                                n_heads=n_heads,
                                p_drop=dropout,
                                weight_tying=False)
-    model = model.to(device)
+    model = model.to('cuda')

     _total_params = sum(p.numel() for p in model.parameters())
     print(f"Number of model parameters: {_total_params}")
@@ -104,6 +104,7 @@ def main(batch_size: int = 8,
                       grad_acc_steps=grad_acc_steps,
                       grad_clip_norm=grad_clip_norm,
                       model_save_path=checkpoints_path,
+                      use_fp16=use_fp16,
                       progress_bar=True)

     trainer.fit(epochs)
...
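A note on the fp32 fallback: both autocast and GradScaler accept enabled=False and then become no-ops, which is why passing use_fp16=False to main (or directly to the Trainer) reuses the same code path in plain float32 instead of needing a separate training loop. A minimal illustration, not taken from the commit, using placeholder tensors:

import torch

# enabled=False turns both pieces into no-ops, so the training loop
# degrades gracefully to ordinary float32 training
scaler = torch.cuda.amp.GradScaler(enabled=False)  # scale() returns the loss unchanged
with torch.cuda.amp.autocast(dtype=torch.float16, enabled=False):
    x = torch.ones(2, 2, device='cuda')
    y = x @ x  # runs in float32, since autocast is disabled
assert y.dtype == torch.float32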