diff --git a/inference.py b/inference.py
index 0f69c01c5473d1d80856debebbd57577ee732c19..2c0bd02d89a7e1228d5bdc9d0da678f7ca5d6fd8 100644
--- a/inference.py
+++ b/inference.py
@@ -88,7 +88,7 @@ def main(model_path: str = 'model.pth',
             warnings.simplefilter('ignore')
             torch.set_default_tensor_type(torch.cuda.HalfTensor)
         elif device == 'cpu':
-            assert 0 == 1, "Cannot run 16-bit inference on CPU!"
+            assert 0 == 1, 'Cannot run 16-bit inference on CPU!'
     else:
         if device == 'cuda':
             warnings.simplefilter('ignore')
@@ -97,7 +97,7 @@ def main(model_path: str = 'model.pth',
             warnings.simplefilter('ignore')
             torch.set_default_tensor_type(torch.FloatTensor)
 
-    print("Loading model from disk...")
+    print('Loading model from disk...')
 
     # load state from file
     assert os.path.exists(model_path)
@@ -113,9 +113,9 @@ def main(model_path: str = 'model.pth',
     p_drop = float(state['p_drop'])
     weight_tying = bool(state['weight_tying'])
 
-    assert vocab_sz == len(tok), ("The tokenizer passed for inference is "
-        "different from the tokenizer used for training! This will result in "
-        "erroneous generation!")
+    assert vocab_sz == len(tok), ('The tokenizer passed for inference is '
+        'different from the tokenizer used for training! This will result in '
+        'erroneous generation!')
 
     # create model, load weights
     config = OptimusConfig(vocab_size=vocab_sz,
@@ -128,19 +128,19 @@ def main(model_path: str = 'model.pth',
     model.load_state_dict(state, strict=True)
     model.eval()
 
-    print(f"Loaded model on device {device}!")
+    print(f'Loaded model on device {device}!')
 
     _total_params = sum(p.numel() for p in model.parameters())
-    print(f"Number of model parameters: {_total_params}")
+    print(f'Number of model parameters: {_total_params}')
 
     # inference loop
-    print("Starting inference...")
+    print('Starting inference...')
 
     if prompt is not None:
         input_sentence = prompt
     else:
-        print("Waiting for user input... (prompt to complete)")
-        input_sentence = input("User: ")
+        print('Waiting for user input... (prompt to complete)')
+        input_sentence = input('User: ')
 
     # tokenize input
     inp = torch.tensor(tok.encode(input_sentence, bos=True, eos=False),
@@ -183,9 +183,9 @@ def main(model_path: str = 'model.pth',
     seq_len = inp.shape[-1]
 
     print(f"Model output: {' '.join(tok.decode(inp.tolist()))}")
-    print(f"Tokens / second: {toks_generated / (time.time() - start_time):.2f}")
+    print(f'Tokens / second: {toks_generated / (time.time() - start_time):.2f}')
 
-    print("Finished inference!")
+    print('Finished inference!')
 
 
 if __name__=='__main__':
diff --git a/optimus/models/optimus.py b/optimus/models/optimus.py
index fc11a93fafa6c8fb6e3a1f00281ca05b2b352983..d478e3f2b5dbdd72a35b03a4e9190c0d49323f2f 100644
--- a/optimus/models/optimus.py
+++ b/optimus/models/optimus.py
@@ -67,12 +67,12 @@ class OptimusConfig():
                  intermediate_size=4352,
                  num_hidden_layers=48,
                  num_attention_heads=40,
-                 hidden_act="silu",
+                 hidden_act='silu',
                  max_position_embeddings=2048,
                  initializer_range=0.02,
                  rms_norm_eps=1e-6,
                  tie_word_embeddings=False,
-                 attn_implementation="sdpa",
+                 attn_implementation='sdpa',
                  attention_bias=False,
                  attention_dropout=0.0,
                  gradient_checkpointing=True,
@@ -152,7 +152,7 @@ class OptimusFeedForward(nn.Module):
         if config.hidden_act == 'silu' or config.hidden_act == 'swish':
             self.act_fn = F.silu
         else:
-            raise KeyError(f"Currently only silu and swish are supported as activation functions, but got {config.hidden_act}")
+            raise KeyError(f'Currently only silu and swish are supported as activation functions, but got {config.hidden_act}')
 
     def forward(self, x):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
@@ -266,8 +266,8 @@
 
 
 OPTIMUS_ATTENTION_CLASSES = {
-    "eager": OptimusAttention,
-    "sdpa": OptimusSdpaAttention,
+    'eager': OptimusAttention,
+    'sdpa': OptimusSdpaAttention,
 }
 
diff --git a/training.py b/training.py
index 897a285d483bdcfa00a34a5bee12d7eb5789de05..dc56920998b163baef46b9e1d1b56ede14ab23f8 100644
--- a/training.py
+++ b/training.py
@@ -68,8 +68,8 @@ def main(batch_size: int = 8,
     train_ds = WikiText103Dataset(split='train')
     test_ds = WikiText103Dataset(split='test')
 
-    print(f"Number of examples in training set: {len(train_ds)}")
-    print(f"Number of examples in testing set: {len(test_ds)}")
+    print(f'Number of examples in training set: {len(train_ds)}')
+    print(f'Number of examples in testing set: {len(test_ds)}')
 
     # create dataloader object and move to device
     dl = OptimusDataLoader(train_ds, test_ds, tok,
@@ -88,7 +88,7 @@ def main(batch_size: int = 8,
     model = model.to('cuda')
 
     _total_params = sum(p.numel() for p in model.parameters())
-    print(f"Number of model parameters: {_total_params}")
+    print(f'Number of model parameters: {_total_params}')
 
     # define loss metric
     criterion = nn.CrossEntropyLoss()
@@ -99,7 +99,7 @@ def main(batch_size: int = 8,
     # [1]: https://github.com/pytorch/pytorch/issues/26218
     optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.999),
                                  eps=1e-7)
-    print("Starting training...")
+    print('Starting training...')
 
     # create trainer and start fitting
     trainer = Trainer(dl=dl,
@@ -114,8 +114,8 @@ def main(batch_size: int = 8,
         progress_bar=True)
     trainer.fit(epochs)
 
     print(f"Finished training! Best model weights saved at '{checkpoints_path}'")
 
 
-if __name__=="__main__":
+if __name__=='__main__':
     fire.Fire(main)