From fadb8a9c338a665a73059043d3b930cff7cf60de Mon Sep 17 00:00:00 2001
From: Alexandru Gherghescu <gherghescu_alex1@yahoo.ro>
Date: Fri, 29 Sep 2023 20:13:39 +0300
Subject: [PATCH] Update README and Dockerfile to include everything in the image

Include all the small pieces in the Dockerfile and build a
self-contained image: the llama repo, the weights, the tokenizer and
the dialog script are now all part of the Docker image.

The README steps are updated accordingly.
---
 Dockerfile | 39 +++++++++++++++++++++++++-------
 README.md  | 68 +++++++++++++++++++-----------------------------------
 2 files changed, 53 insertions(+), 54 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index 215924a..69606ac 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,22 +1,41 @@
-# there's an extra step needed to install the Nvidia Container Toolkit, which allows the docker containers to access the gpus outside
-# there's a guide for ubuntu about that here: https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows docker containers to access the host GPUs; there's a guide for
+# Ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
 
-# make sure the llama repo is in ./llama or edit the run command below as needed
-# make sure the weights are in ./llama
-# make sure the dialog.py file is placed inside the ./llama directory
+# before building, note that the weights (llama-2-7b-chat/ and tokenizer.model)
+# and the dialog.py script need to be in the current folder (the build context)
 
-# build image with: sudo docker build -t mamba-img .
-# run image with: sudo docker run -it -v ~/llama:/llama --gpus all mamba-img
+# build image with: docker build -t llama-7b-img .
+# run image with: docker run -it --gpus all llama-7b-img
 
 FROM condaforge/mambaforge
 
 # install stuff inside conda
 RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
-    mamba install -c fastai fastai -y && \
-    mamba clean -afy
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
 
 # llama dependencies
 RUN pip install fairscale sentencepiece fire
 
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the weights
+ADD llama-2-7b-chat/ /llama/llama-2-7b-chat/
+
+# add the tokenizer (downloaded next to the weights, not part of the git repo)
+ADD tokenizer.model /llama/tokenizer.model
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
 # run llama example program
-CMD ["torchrun", "--nproc_per_node", "1", "llama/dialog.py", "--ckpt_dir", "llama/llama-2-7b-chat/", "--tokenizer_path", "llama/tokenizer.model", "--max_seq_len", "512", "--max_batch_size", "6"]
+CMD ["torchrun", \
+     "--nproc_per_node", "1", \
+     "/llama/dialog.py", \
+     "--ckpt_dir", "/llama/llama-2-7b-chat/", \
+     "--tokenizer_path", "/llama/tokenizer.model", \
+     "--max_seq_len", "512", \
+     "--max_batch_size", "6"]
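The `dialog.py` script that the CMD above launches is not included in this
patch. Purely for orientation, here is a minimal sketch of what such a script
might look like, modeled on the example scripts in the facebookresearch/llama
repo (`Llama.build` + `chat_completion`); the actual script in this repo may
differ, and the argument names simply mirror the flags passed by the CMD:

```python
# dialog_sketch.py -- NOT the dialog.py shipped with this repo; a minimal
# approximation based on facebookresearch/llama's example chat script.
from typing import Optional

import fire

from llama import Llama


def main(ckpt_dir: str, tokenizer_path: str,
         temperature: float = 0.6, top_p: float = 0.9,
         max_seq_len: int = 512, max_batch_size: int = 6,
         max_gen_len: Optional[int] = None):
    # load the checkpoint and tokenizer onto the GPU (same flags as the CMD)
    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )

    dialog = []  # list of {"role": ..., "content": ...} messages
    while True:
        dialog.append({"role": "user", "content": input("You: ")})
        results = generator.chat_completion(
            [dialog],               # a batch with a single conversation
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
        )
        answer = results[0]["generation"]
        print("Llama:", answer["content"].strip())
        dialog.append(answer)       # keep the assistant turn in the history


if __name__ == "__main__":
    fire.Fire(main)
```

`fire` (installed by the Dockerfile) turns the `main` parameters into the
`--ckpt_dir`/`--tokenizer_path`/`--max_seq_len`/`--max_batch_size` command-line
flags that `torchrun` forwards to the script.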
diff --git a/README.md b/README.md
index 93972a6..40d11aa 100644
--- a/README.md
+++ b/README.md
@@ -10,10 +10,12 @@ weights onto the GPU, no actual computation is done on the CPU).
 
 There are a few requirements to get the model to run. Broadly speaking, these
 are the actual model (the code), the weights and the Python script to open a
-dialog.
+dialog, as well as some Python packages.
 
-The Python packages necessary to run the code are all packaged inside the
-Dockerfile that comes with this repo. The image is already built on Dockerhub.
+A Dockerfile is provided to build an image from scratch using the above. A
+Docker image is already built (see
+[here](https://gitlab.cs.pub.ro/netsys/llama-test/container_registry)), so you
+can use that instead (you need to be logged in to the registry).
 
 Other than that, an Nvidia Container Toolkit driver is necessary to run Nvidia
 code on the GPU inside a docker container.
@@ -24,50 +26,28 @@ Steps:
 
 1. Install [Nvidia Container Toolkit (steps for
    Ubuntu)](https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/).
    Necessary to let docker containers use the GPU.
-2. Clone [Meta AI's Llama2 repo](github.com/facebookresearch/llama) locally
-   (`git clone https://github.com/facebookresearch/llama`).
-3. Copy the `dialog.py` script to the root of the llama repo.
-4. Download the weights of the model from
-   [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/).
-   Move the weights to the root of the repo. Make sure to not change the paths
-   of the weights (they should be in llama-2-7b-chat/ etc.). Currently, the
-   docker container only uses the 7B Chat weights, so feel free to only download
-   those.
-5. Download the Docker container image (`docker image pull
-   alexghergh/llama-test:latest`). This container holds all the necessary python
-   packages, and will run the dialog script.
-6. Run the docker image with `docker run -it -v <path/to/llama>:/llama
-   --gpus all alexghergh/llama-test`. Change the path to the llama repo
-   accordingly.
+2. Download the Docker container image (`docker image pull
+   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
+3. Run the docker image with `docker run -it --gpus all
+   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. This should take a while to
+   load, but then a prompt to interact with Llama is displayed.
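If the prompt never appears and you suspect the container cannot actually see
the GPU (that is what the Nvidia Container Toolkit step above provides), a
quick sanity check is to ask PyTorch directly. This is a hypothetical
throwaway snippet, not something shipped in the image; copy or bind-mount it
into a container started from the same image and run it with `python`:

```python
# gpu_check.py -- hypothetical sanity check, not part of this repo.
# Example (assumed) invocation, with the file bind-mounted into the container:
#   docker run --rm --gpus all -v $PWD:/check \
#       gitlab.cs.pub.ro:5050/netsys/llama-test:latest python /check/gpu_check.py
import torch

if torch.cuda.is_available():
    print("CUDA is available:", torch.cuda.get_device_name(0))
else:
    print("No GPU visible inside the container; check the toolkit install")
```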
 
 ### On the UPB cluster (fep)
 
 Steps:
 
-1. Log in to fep (ssh <username>@fep.grid.pub.ro).
+1. Log in to fep (`ssh <username>@fep.grid.pub.ro`).
 2. Get a bash shell into a partition with a GPU (`srun -p xl --gres
    gpu:tesla_p100:1 --mem=40G --pty bash`).
-3. Clone [Meta AI's Llama2 repo](github.com/facebookresearch/llama) here
-   (`git clone https://github.com/facebookresearch/llama`).
-3. Copy the `dialog.py` script in the current repo to the `llama/` directory
-   of the repo downloaded above.
-4. Download the weights of the model from
-   [here](https://ai.meta.com/resources/models-and-libraries/llama-downloads/).
-   Move the weights in the llama repo. Make sure to not change the paths of the
-   weights (they should be in llama/llama-2-7b-chat/ etc.). Currently, the
-   docker container only uses the 7B Chat weights, so feel free to only download
-   those.
-5. Pull and build the docker image into an apptainer container on the grid
-   (`apptainer pull docker://alexghergh/llama-test:latest`). This will take a
-   while (probably around 15-20 minutes). If it fails because of a timeout,
-   simply run the same command again.
-6. Run the apptainer image with `apptainer run --nv
-   docker://alexghergh/llama-test:latest`. The first time it should take about 7
-   minutes for it to start (I think because the weights are located on a
-   different storage server, so they have to get copied through scp on the
-   machine with the GPU), but subsequent runs will take a few seconds
-   (subsequent run = don't log out).
-7. ???
-8. Profit
+3. Pull and build the docker image into an apptainer image on the grid
+   (`apptainer pull docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
+   This will take a while (probably around 40 minutes). If it fails because of
+   a timeout, simply run the same command again.
+4. Run the apptainer image with `apptainer run --nv
+   docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. The first time it
+   should take about 3 minutes to start, but subsequent runs will take a
+   few seconds (subsequent run = don't log out).
+5. ???
+6. Profit
 
 *Note*: The script will sometimes still error out because of Out-of-Memory
 errors or because the context length was reached. If that happens, reissue the
@@ -82,7 +62,7 @@ will (almost surely) result in an Out-of-Memory CUDA error.
 
 ## TODOs
 
-- [ ] Choose Python package versions to use inside the Dockerfile, rather than
+- [ ] Choose Python package versions to use inside the Dockerfile, rather than
   have them dangling, to prevent compatibility problems.
-- [ ] Look into quantization (the current model is 8-bit quantized already).
-- [ ] Better dialog script file.
+- [ ] Look into quantization (the current model is 8-bit quantized already).
+- [ ] Better dialog script file.
-- 
GitLab
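One last note, related to the *Note* about hitting the context length and to
the "Better dialog script file" TODO above: a dialog script can avoid many of
those errors by dropping the oldest turns once the encoded prompt gets close
to `max_seq_len`. The helper below is a hypothetical sketch (not part of this
repo or of this patch); `encode` stands for any tokenizer function that
returns token ids, for example the `Tokenizer.encode` from the llama repo:

```python
# trim_history.py -- hypothetical helper, not part of this repo.
from typing import Callable, Dict, List

Message = Dict[str, str]  # {"role": "system"/"user"/"assistant", "content": "..."}


def trim_dialog(dialog: List[Message],
                encode: Callable[[str], List[int]],
                max_seq_len: int = 512) -> List[Message]:
    """Drop the oldest non-system turns until the dialog fits the context."""
    def total_tokens(msgs: List[Message]) -> int:
        return sum(len(encode(m["content"])) for m in msgs)

    trimmed = list(dialog)
    # keep a leading system message (if any) and always keep the latest turn
    start = 1 if trimmed and trimmed[0]["role"] == "system" else 0
    while total_tokens(trimmed) > max_seq_len and len(trimmed) > start + 1:
        trimmed.pop(start)
    return trimmed
```

This is only an approximation (the real prompt also includes special tokens
and chat formatting, so some headroom below `max_seq_len` is needed), but it
keeps long conversations from dying with context-length errors.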