diff --git a/optimus/example_inference.py b/optimus/example_inference.py
deleted file mode 100644
index 2e55e36af378f93ed1e4655271135a354c9999d0..0000000000000000000000000000000000000000
--- a/optimus/example_inference.py
+++ /dev/null
@@ -1,120 +0,0 @@
-import sys
-import time
-import math
-import argparse
-
-import fire
-import torch
-from torch import nn
-from torch.utils.data import Dataset
-
-from datasets.wikitext103 import WikiText103Dataset
-from tokenizer import Tokenizer
-from dataloader import OptimusDataLoader
-from model import Transformer
-from trainer import Trainer
-
-
-def main(batch_size: int = 8,
-         grad_acc_steps: int = 1,
-         seq_len: int = 512,
-         lr_max: float = 1e-4,
-         grad_clip_norm: float = 1.0,
-         epochs: int = 1,
-         tokenizer_path: str = 'optimus.model',
-         checkpoints_path: str = 'best_model.pth',
-         n_layers: int = 6,
-         dim: int = 512,
-         n_heads: int = 8,
-         dropout: float = 0.0,
-         device: str = 'cuda',
-         prompt = "Once upon time"
-):
-    """
-    Run the main training loop for the model.
-
-    Args:
-        batch_size (int): Batch size for training.
-        grad_acc_steps (int): Number of batches to accumulate gradients for
-            before running backpropagation to update weights.
-        seq_len (int): Context length for training.
-        lr_max (float): Maximum learning rate, used for one-cycle scheduling.
-        grad_clip_norm (float): Gradient clipping value for gradient's norm.
-        epochs (int): Number of epochs to train for.
-        tokenizer_path (str): Path to the tokenizer model.
-        checkpoints_path (str): Where to save the trained model. Should be a .pt
-            or .pth file.
-        n_layers (int): Number of layers for the model.
-        dim (int): Dimension of the model.
-        n_heads (int): Number of heads inside an attention layer for the model.
-        dropout (float): Dropout to use for the model.
-        device (str): Device where to train the model. Viable options are 'cpu',
-            'cuda', 'cuda:2' etc.
-
-    """
-
-    print(f"Running with:\n"
-          f"\t- batch size: {batch_size}\n"
-          f"\t- gradient accumulation steps: {grad_acc_steps}\n"
-          f"\t- context length: {seq_len}\n"
-          f"\t- max learning rate: {lr_max}\n"
-          f"\t- gradient clipping norm: {grad_clip_norm}\n"
-          f"\t- epochs: {epochs}\n"
-          f"\t- tokenizer: {tokenizer_path}\n"
-          f"\t- checkpoints path: {checkpoints_path}\n"
-          f"\t- model layers: {n_layers}\n"
-          f"\t- model dimension: {dim}\n"
-          f"\t- model attention heads: {n_heads}\n"
-          f"\t- model dropout: {dropout}\n"
-          f"\t- training on device: {device}\n"
-          f"Please see '--help' if you want to change these settings")
-
-    # load tokenizer
-    tok = Tokenizer(model_path=tokenizer_path)
-
-    # create model
-    model = Transformer(len(tok),
-                        n_layers=n_layers,
-                        dim=dim,
-                        n_heads=n_heads,
-                        p_drop=dropout,
-                        weight_tying=False)
-
-    # load checkpoint
-    checkpoint = torch.load(checkpoints_path, map_location=device)
-    state_dict = checkpoint
-    model.load_state_dict(state_dict)
-
-    model.eval()
-    model = model.to(device)
-
-    _total_params = sum(p.numel() for p in model.parameters())
-    print(f"Number of model parameters: {_total_params}")
-
-    # create trainer and start fitting
-    trainer = Trainer(dl=None,
-                      model=model,
-                      criterion=None,
-                      optimizer=None,
-                      lr=lr_max,
-                      grad_acc_steps=grad_acc_steps,
-                      grad_clip_norm=grad_clip_norm,
-                      model_save_path=checkpoints_path,
-                      progress_bar=True)
-
-    # Run the generation for 128 tokens
-    dtype = 'bfloat16' if torch.cuda.is_available() and torch.cuda.is_bf16_supported() else 'float16' # 'float32' or 'bfloat16' or 'float16'
-    ptdtype = {'float32': torch.float32, 'bfloat16': torch.bfloat16, 'float16': torch.float16}[dtype]
-    ctx = torch.amp.autocast(device_type=device, dtype=ptdtype)
-
-
-    with torch.no_grad():
-        with ctx:
-            start_ids = tok.encode(prompt, bos=False, eos=False)
-            x = (torch.tensor(start_ids, dtype=torch.long, device=device)[None, ...])
-
-    print(f"Finished training! Best model weights saved at '{checkpoints_path}'")
-
-
-if __name__=="__main__":
-    fire.Fire(main)
diff --git a/optimus/trainer.py b/optimus/trainer.py
index abfbaafa2bdf7d0c59bde5c9d1848d8315c4affd..fd547ba2910fe18b749a85656333b840239ab447 100644
--- a/optimus/trainer.py
+++ b/optimus/trainer.py
@@ -197,28 +197,3 @@ class Trainer():
               f"\tTotal valid batches: {len(self.dl.test):10d} | "
               f"Valid loss: {self.val_loss: 7.2f} | "
               f"Valid perplexity: {self.val_ppl: 8.2f}")
-
-    @torch.no_grad()
-    def generate(self, idx, max_new_tokens, temperature=1.0, top_k=None):
-        """
-        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
-        the sequence max_new_tokens times, feeding the predictions back into the model each time.
-        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
-        """
-
-        for _ in range(max_new_tokens):
-            idx_cond = idx
-            # TODO: Once we have access to context_size change the line to
-            # if idx.size(1) <= self.context_size else idx[:, -self.context_size:]
-            logits = self(idx_cond)
-
-            logits = logits[:, -1, :] / temperature
-            if top_k is not None:
-                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                logits[logits < v[:, [-1]]] = -float('Inf')
-
-            probs = F.softmax(logits, dim=-1)
-            idx_next = torch.multinomial(probs, num_samples=1)
-            idx = torch.cat((idx, idx_next), dim=1)
-
-        return idx