diff --git a/Dockerfile-13b-chat b/Dockerfile-13b-chat
index 0ca12b1cc7dc54510934d505be7ab78d96bbb9b6..bea11affa75e36f4a46e26f5a494564e6f94579a 100644
--- a/Dockerfile-13b-chat
+++ b/Dockerfile-13b-chat
@@ -20,17 +20,14 @@ RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cu
 RUN pip install fairscale sentencepiece fire && \
     pip cache purge
 
-# add the llama repo
-RUN git clone https://github.com/facebookresearch/llama /llama
-
-# add the tokenizer
-ADD tokenizer.model /llama/tokenizer.model
+# add the llama repo (netsys edition)
+RUN git clone https://gitlab.cs.pub.ro/netsys/llama /llama
 
 # add the weights
 ADD llama-2-13b-chat/ /llama/llama-2-13b-chat/
 
-# add the dialog script
-ADD dialog.py /llama/dialog.py
+# avoid an annoying PyTorch (torch.distributed) warning
+ARG OMP_NUM_THREADS=1
 
 # run llama example program
 CMD ["torchrun", \
diff --git a/Dockerfile-70b-chat b/Dockerfile-70b-chat
index c114ad73baab5d86400c20fdfd6aaadecc3bf67b..9d6f6a2c8e34366418fcf5ccd1867c574c78b52c 100644
--- a/Dockerfile-70b-chat
+++ b/Dockerfile-70b-chat
@@ -7,7 +7,7 @@
 # current folder
 
 # build image with: `docker build -t gitlab.cs.pub.ro:5050/netsys/llama-images:llama-70b-chat -f Dockerfile-70b-chat .`
-# run image with: `docker run -it --gpus all gitlab.cs.pub.ro:5050/netsys/llama-images:llama-70b-chat`
+# run image with: `docker run -it --gpus all --shm-size 2gb gitlab.cs.pub.ro:5050/netsys/llama-images:llama-70b-chat`
 
 FROM condaforge/mambaforge
 
@@ -20,17 +20,14 @@ RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cu
 RUN pip install fairscale sentencepiece fire && \
     pip cache purge
 
-# add the llama repo
-RUN git clone https://github.com/facebookresearch/llama /llama
-
-# add the tokenizer
-ADD tokenizer.model /llama/tokenizer.model
+# add the llama repo (netsys edition)
+RUN git clone https://gitlab.cs.pub.ro/netsys/llama /llama
 
 # add the weights
 ADD llama-2-70b-chat/ /llama/llama-2-70b-chat/
 
-# add the dialog script
-ADD dialog.py /llama/dialog.py
+# avoid an annoying PyTorch (torch.distributed) warning
+ARG OMP_NUM_THREADS=1
 
 # run llama example program
 CMD ["torchrun", \
diff --git a/Dockerfile-7b-chat b/Dockerfile-7b-chat
index 0c15c21dd602f9f97526421a6de364939aab1bd8..92fe7dba4f6a43864339529ab498c7eca48d9a6c 100644
--- a/Dockerfile-7b-chat
+++ b/Dockerfile-7b-chat
@@ -20,17 +20,14 @@ RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cu
 RUN pip install fairscale sentencepiece fire && \
     pip cache purge
 
-# add the llama repo
-RUN git clone https://github.com/facebookresearch/llama /llama
-
-# add the tokenizer
-ADD tokenizer.model /llama/tokenizer.model
+# add the llama repo (netsys edition)
+RUN git clone https://gitlab.cs.pub.ro/netsys/llama /llama
 
 # add the weights
 ADD llama-2-7b-chat/ /llama/llama-2-7b-chat/
 
-# add the dialog script
-ADD dialog.py /llama/dialog.py
+# avoid an annoying PyTorch (torch.distributed) warning
+ARG OMP_NUM_THREADS=1
 
 # run llama example program
 CMD ["torchrun", \
diff --git a/dialog.py b/dialog.py
deleted file mode 100644
index a6fdf7dede479962924bbda569f795b1fdcfbce9..0000000000000000000000000000000000000000
--- a/dialog.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from typing import List, Optional
-
-import fire
-
-from llama import Llama, Dialog
-
-import torch
-from fairscale.nn.model_parallel.initialize import (
-    get_model_parallel_rank,
-    destroy_model_parallel
-)
-
-
-def main(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    max_seq_len: int = 4096,
-    max_batch_size: int = 8,
-    max_gen_len: Optional[int] = None,
-):
-    """
-    Entry point of the program for generating text using a pretrained model.
-
-    Args:
-        ckpt_dir (str): The directory containing checkpoint files for the pretrained model.
-        tokenizer_path (str): The path to the tokenizer model used for text encoding/decoding.
-        temperature (float, optional): The temperature value for controlling randomness in generation.
-            Defaults to 0.6.
-        top_p (float, optional): The top-p sampling parameter for controlling diversity in generation.
-            Defaults to 0.9.
-        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 4096.
-        max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 8.
-        max_gen_len (int, optional): The maximum length of generated sequences. If None, it will be
-            set to the model's max sequence length. Defaults to None.
-    """
-    # build the model
-    generator = Llama.build(
-        ckpt_dir=ckpt_dir,
-        tokenizer_path=tokenizer_path,
-        max_seq_len=max_seq_len,
-        max_batch_size=max_batch_size,
-    )
-
-    # at this point, we forked into multiple processes, one for each GPU
-
-    dialog: Dialog = []
-
-    # dialog loop
-    print("You can now start typing to chat!")
-    while True:
-
-        # only the first process (rank 0) will interact with the user
-        user_input = [None]
-        if get_model_parallel_rank() == 0:
-            user_input = [input("User: ")]
-
-        # broadcast user input to all ranks
-        torch.distributed.broadcast_object_list(user_input, src=0)
-
-        print('=====(processing query...)=====')
-        dialog.append({ 'role': 'user', 'content': user_input[0] })
-
-        # run inference
-        results = generator.chat_completion(
-            [dialog],
-            max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-        )
-
-        # get the response
-        result = results[0]
-        role = result['generation']['role']
-        content = result['generation']['content']
-
-        # print to user and append to dialog context
-        print(f'{role.capitalize()}: {content}')
-        dialog.append({ 'role': role, 'content': content })
-
-    # at the very end, cleanup
-    destroy_model_parallel()
-
-
-if __name__ == "__main__":
-    fire.Fire(main)