diff --git a/inference.py b/inference.py
index 0f69c01c5473d1d80856debebbd57577ee732c19..2c0bd02d89a7e1228d5bdc9d0da678f7ca5d6fd8 100644
--- a/inference.py
+++ b/inference.py
@@ -88,7 +88,7 @@ def main(model_path: str = 'model.pth',
             warnings.simplefilter('ignore')
             torch.set_default_tensor_type(torch.cuda.HalfTensor)
         elif device == 'cpu':
-            assert 0 == 1, "Cannot run 16-bit inference on CPU!"
+            assert 0 == 1, 'Cannot run 16-bit inference on CPU!'
     else:
         if device == 'cuda':
             warnings.simplefilter('ignore')
@@ -97,7 +97,7 @@ def main(model_path: str = 'model.pth',
             warnings.simplefilter('ignore')
             torch.set_default_tensor_type(torch.FloatTensor)
 
-    print("Loading model from disk...")
+    print('Loading model from disk...')
 
     # load state from file
     assert os.path.exists(model_path)
@@ -113,9 +113,9 @@ def main(model_path: str = 'model.pth',
     p_drop = float(state['p_drop'])
     weight_tying = bool(state['weight_tying'])
 
-    assert vocab_sz == len(tok), ("The tokenizer passed for inference is "
-        "different from the tokenizer used for training! This will result in "
-        "erroneous generation!")
+    assert vocab_sz == len(tok), ('The tokenizer passed for inference is '
+        'different from the tokenizer used for training! This will result in '
+        'erroneous generation!')
 
     # create model, load weights
     config = OptimusConfig(vocab_size=vocab_sz,
@@ -128,19 +128,19 @@ def main(model_path: str = 'model.pth',
     model.load_state_dict(state, strict=True)
     model.eval()
 
-    print(f"Loaded model on device {device}!")
+    print(f'Loaded model on device {device}!')
 
     _total_params = sum(p.numel() for p in model.parameters())
-    print(f"Number of model parameters: {_total_params}")
+    print(f'Number of model parameters: {_total_params}')
 
     # inference loop
-    print("Starting inference...")
+    print('Starting inference...')
 
     if prompt is not None:
         input_sentence = prompt
     else:
-        print("Waiting for user input... (prompt to complete)")
-        input_sentence = input("User: ")
+        print('Waiting for user input... (prompt to complete)')
+        input_sentence = input('User: ')
 
     # tokenize input
     inp = torch.tensor(tok.encode(input_sentence, bos=True, eos=False),
@@ -183,9 +183,9 @@ def main(model_path: str = 'model.pth',
     seq_len = inp.shape[-1]
 
     print(f"Model output: {' '.join(tok.decode(inp.tolist()))}")
-    print(f"Tokens / second: {toks_generated / (time.time() - start_time):.2f}")
+    print(f'Tokens / second: {toks_generated / (time.time() - start_time):.2f}')
 
-    print("Finished inference!")
+    print('Finished inference!')
 
 
 if __name__=='__main__':
diff --git a/optimus/models/optimus.py b/optimus/models/optimus.py
index fc11a93fafa6c8fb6e3a1f00281ca05b2b352983..d478e3f2b5dbdd72a35b03a4e9190c0d49323f2f 100644
--- a/optimus/models/optimus.py
+++ b/optimus/models/optimus.py
@@ -67,12 +67,12 @@ class OptimusConfig():
                  intermediate_size=4352,
                  num_hidden_layers=48,
                  num_attention_heads=40,
-                 hidden_act="silu",
+                 hidden_act='silu',
                  max_position_embeddings=2048,
                  initializer_range=0.02,
                  rms_norm_eps=1e-6,
                  tie_word_embeddings=False,
-                 attn_implementation="sdpa",
+                 attn_implementation='sdpa',
                  attention_bias=False,
                  attention_dropout=0.0,
                  gradient_checkpointing=True,
@@ -152,7 +152,7 @@ class OptimusFeedForward(nn.Module):
         if config.hidden_act == 'silu' or config.hidden_act == 'swish':
             self.act_fn = F.silu
         else:
-            raise KeyError(f"Currently only silu and swish are supported as activation functions, but got {config.hidden_act}")
+            raise KeyError(f'Currently only silu and swish are supported as activation functions, but got {config.hidden_act}')
 
     def forward(self, x):
         return self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
@@ -266,8 +266,8 @@
 
 
 OPTIMUS_ATTENTION_CLASSES = {
-    "eager": OptimusAttention,
-    "sdpa": OptimusSdpaAttention,
+    'eager': OptimusAttention,
+    'sdpa': OptimusSdpaAttention,
 }
 
diff --git a/training.py b/training.py
index 897a285d483bdcfa00a34a5bee12d7eb5789de05..dc56920998b163baef46b9e1d1b56ede14ab23f8 100644
--- a/training.py
+++ b/training.py
@@ -68,8 +68,8 @@ def main(batch_size: int = 8,
     train_ds = WikiText103Dataset(split='train')
     test_ds = WikiText103Dataset(split='test')
 
-    print(f"Number of examples in training set: {len(train_ds)}")
-    print(f"Number of examples in testing set: {len(test_ds)}")
+    print(f'Number of examples in training set: {len(train_ds)}')
+    print(f'Number of examples in testing set: {len(test_ds)}')
 
     # create dataloader object and move to device
     dl = OptimusDataLoader(train_ds, test_ds, tok,
@@ -88,7 +88,7 @@ def main(batch_size: int = 8,
     model = model.to('cuda')
 
     _total_params = sum(p.numel() for p in model.parameters())
-    print(f"Number of model parameters: {_total_params}")
+    print(f'Number of model parameters: {_total_params}')
 
     # define loss metric
     criterion = nn.CrossEntropyLoss()
@@ -99,7 +99,7 @@ def main(batch_size: int = 8,
     # [1]: https://github.com/pytorch/pytorch/issues/26218
     optimizer = torch.optim.Adam(model.parameters(), betas=(0.9, 0.999),
                                  eps=1e-7)
-    print("Starting training...")
+    print('Starting training...')
 
     # create trainer and start fitting
     trainer = Trainer(dl=dl,
@@ -114,8 +114,8 @@ def main(batch_size: int = 8,
         progress_bar=True)
     trainer.fit(epochs)
 
     print(f"Finished training! Best model weights saved at '{checkpoints_path}'")
 
 
-if __name__=="__main__":
+if __name__=='__main__':
     fire.Fire(main)