diff --git a/Dockerfile b/Dockerfile
index 215924a8f6d012ab825d58bbf9582cc42778b9db..69606ac805d4ccc4305bbe5db0decb988c86f665 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,22 +1,38 @@
-# there's an extra step needed to install the Nvidia Container Toolkit, which allows the docker containers to access the gpus outside
-# there's a guide for ubuntu about that here: https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows the docker containers to access the gpus outside; there's a guide for
+# ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
 
-# make sure the llama repo is in ./llama or edit the run command below as needed
-# make sure the weights are in ./llama
-# make sure the dialog.py file is placed inside the ./llama directory
+# before building, note that the weights and the script need to be in the
+# current folder
 
-# build image with: sudo docker build -t mamba-img .
-# run image with: sudo docker run -it -v ~/llama:/llama --gpus all mamba-img
+# build image with: docker build -t llama-7b-img .
+# run image with: docker run -it --gpus all llama-7b-img
 
 FROM condaforge/mambaforge
 
 # install stuff inside conda
 RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
-        mamba install -c fastai fastai -y && \
-        mamba clean -afy
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
 
 # llama dependencies
 RUN pip install fairscale sentencepiece fire
 
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the weights
+ADD llama-2-7b-chat/ /llama/llama-2-7b-chat/
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
 # run llama example program
-CMD ["torchrun", "--nproc_per_node", "1", "llama/dialog.py", "--ckpt_dir", "llama/llama-2-7b-chat/", "--tokenizer_path", "llama/tokenizer.model", "--max_seq_len", "512", "--max_batch_size", "6"]
+CMD ["torchrun", \
+    "--nproc_per_node", "1", \
+    "/llama/dialog.py", \
+    "--ckpt_dir", "/llama/llama-2-7b-chat/", \
+    "--tokenizer_path", "/llama/tokenizer.model", \
+    "--max_seq_len", "512", \
+    "--max_batch_size", "6"]
diff --git a/README.md b/README.md
index 93972a652a066a21c3dc276bca9e7406452f05da..40d11aafb26cab6223d0a4f4ca4a29efa041cd79 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,12 @@ weights onto the GPU, no actual computation is done on the CPU).
 
 There are a few requirements to get the model to run. Broadly speaking, these
 are the actual model (the code), the weights and the Python script to open a
-dialog.
+dialog, as well as some Python packages.
 
-The Python packages necessary to run the code are all packaged inside the
-Dockerfile that comes with this repo. The image is already built on Dockerhub.
+A Dockerfile is provided to build an image from scratch using the above. A
+Docker image is already built (see
+[here](https://gitlab.cs.pub.ro/netsys/llama-test/container_registry)), so you
+can use that instead (you need to be logged in).
 
 Other than that, an Nvidia Container Toolkit driver is necessary to run Nvidia
 code on the GPU inside a docker container.
@@ -24,50 +26,28 @@ Steps:
 1. Install [Nvidia Container Toolkit (steps for
    Ubuntu)](https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/).
    Necessary to let docker containers use the GPU.
-2. Clone [Meta AI's Llama2 repo](github.com/facebookresearch/llama) locally
-   (`git clone https://github.com/facebookresearch/llama`).
-3. Copy the `dialog.py` script to the root of the llama repo.
-4. Download the weights of the model from
-   [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/).
-   Move the weights to the root of the repo. Make sure to not change the paths
-   of the weights (they should be in llama-2-7b-chat/ etc.). Currently, the
-   docker container only uses the 7B Chat weights, so feel free to only download
-   those.
-5. Download the Docker container image (`docker image pull
-   alexghergh/llama-test:latest`). This container holds all the necessary python
-   packages, and will run the dialog script.
-6. Run the docker image with `docker run -it -v <path/to/llama>:/llama
-   --gpus all alexghergh/llama-test`. Change the path to the llama repo
-   accordingly.
+2. Download the Docker container image (`docker image pull
+   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
+3. Run the docker image with `docker run -it --gpus all
+   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. This should take a while to
+   load, but then a prompt to interact with Llama is displayed.
 
 ### On the UPB cluster (fep)
 
 Steps:
 
-1. Log in to fep (ssh <username>@fep.grid.pub.ro).
+1. Log in to fep (`ssh <username>@fep.grid.pub.ro`).
 2. Get a bash shell into a partition with a GPU (`srun -p xl --gres gpu:tesla_p100:1 --mem=40G --pty bash`).
-3. Clone [Meta AI's Llama2 repo](github.com/facebookresearch/llama) here
-   (`git clone https://github.com/facebookresearch/llama`).
-3. Copy the `dialog.py` script in the current repo to the `llama/` directory
-   of the repo downloaded above.
-4. Download the weights of the model from
-   [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/).
-   Move the weights in the llama repo. Make sure to not change the paths of the
-   weights (they should be in llama/llama-2-7b-chat/ etc.). Currently, the
-   docker container only uses the 7B Chat weights, so feel free to only download
-   those.
-5. Pull and build the docker image into an apptainer container on the grid
-   (`apptainer pull docker://alexghergh/llama-test:latest`). This will take a
-   while (probably around 15-20 minutes). If it fails because of a timeout,
-   simply run the same command again.
-6. Run the apptainer image with `apptainer run --nv
-   docker://alexghergh/llama-test:latest`. The first time it should take about 7
-   minutes for it to start (I think because the weights are located on a
-   different storage server, so they have to get copied through scp on the
-   machine with the GPU), but subsequent runs will take a few seconds
-   (subsequent run = don't log out).
-7. ???
-8. Profit
+3. Pull and build the docker image into an apptainer image on the grid
+   (`apptainer pull docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
+   This will take a while (probably around 40 minutes). If it fails because of
+   a timeout, simply run the same command again.
+4. Run the apptainer image with `apptainer run --nv
+   docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. The first time it
+   should take about 3 minutes for it to start, but subsequent runs will take a
+   few seconds (subsequent run = don't log out).
+5. ???
+6. Profit
 
 *Note*: The script will sometimes still error out because of Out-of-Memory
 errors or because the context length was reached. If that happens, reissue the
@@ -82,7 +62,7 @@ will (almost surely) result in an Out-of-Memory CUDA error.
 
 ## TODOs
 
-- [ ] Choose Python package versions to use inside the Dockerfile, rather than
+[ ] Choose Python package versions to use inside the Dockerfile, rather than
 have them dangling, to prevent compatibility problems.
-- [ ] Look into quantization (the current model is 8-bit quantized already).
-- [ ] Better dialog script file.
+[ ] Look into quantization (the current model is 8-bit quantized already).
+[ ] Better dialog script file.
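
The `dialog.py` script that the new Dockerfile copies in and runs via `torchrun` is not part of this diff. For orientation only, here is a minimal sketch of what such a script might look like, assuming the `Llama.build` / `chat_completion` interface from Meta's llama repository examples and the `fire` package installed in the image; the actual script in this repo may differ (see also the "Better dialog script file" TODO).

```python
# dialog.py -- illustrative sketch, not the script shipped in this repo.
# Assumes the Llama.build / chat_completion interface from Meta's
# example_chat_completion.py and the fire CLI package installed above.

from typing import Optional

import fire

from llama import Llama


def main(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.6,
    top_p: float = 0.9,
    max_seq_len: int = 512,
    max_batch_size: int = 6,
    max_gen_len: Optional[int] = None,
):
    # load the weights onto the GPU; this is the slow part of startup
    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )

    # keep the whole conversation so the model sees previous turns;
    # a long dialog will eventually exceed max_seq_len (see the note above)
    dialog = []
    while True:
        dialog.append({"role": "user", "content": input("You: ")})
        results = generator.chat_completion(
            [dialog],  # batch containing a single dialog
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
        )
        reply = results[0]["generation"]
        print(f"Llama: {reply['content']}")
        dialog.append(reply)


if __name__ == "__main__":
    fire.Fire(main)
```

The keyword arguments mirror the flags passed by the Dockerfile's CMD (`--ckpt_dir`, `--tokenizer_path`, `--max_seq_len`, `--max_batch_size`), so the same entry point works unchanged under `torchrun`.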