From 3eaa8cb430f1514464c0ed09ca789cb33271f685 Mon Sep 17 00:00:00 2001
From: Alexandru Gherghescu <gherghescu_alex1@yahoo.ro>
Date: Fri, 24 Nov 2023 15:46:20 +0200
Subject: [PATCH] Update old information, increase context length to 4096

Upload Dockerfiles for normal and chat Llama2 variants.
---
 Dockerfile-13b                   | 42 ++++++++++++++++++++++
 Dockerfile-13b-chat              | 42 ++++++++++++++++++++++
 Dockerfile-70b                   | 42 ++++++++++++++++++++++
 Dockerfile-70b-chat              | 42 ++++++++++++++++++++++
 Dockerfile-7b                    | 42 ++++++++++++++++++++++
 Dockerfile => Dockerfile-7b-chat |  6 ++--
 README.md                        | 61 ++++++++++++++++----------------
 dialog.py                        |  4 +--
 8 files changed, 245 insertions(+), 36 deletions(-)
 create mode 100644 Dockerfile-13b
 create mode 100644 Dockerfile-13b-chat
 create mode 100644 Dockerfile-70b
 create mode 100644 Dockerfile-70b-chat
 create mode 100644 Dockerfile-7b
 rename Dockerfile => Dockerfile-7b-chat (82%)

diff --git a/Dockerfile-13b b/Dockerfile-13b
new file mode 100644
index 0000000..5267946
--- /dev/null
+++ b/Dockerfile-13b
@@ -0,0 +1,42 @@
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows the docker containers to access the host's GPUs; there's a guide for
+# Ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+
+# before building, note that the weights and the script need to be in the
+# current folder
+
+# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-13b -f Dockerfile-13b .`
+# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-13b`
+
+FROM condaforge/mambaforge
+
+# install pytorch with CUDA support inside conda
+RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
+
+# llama dependencies
+RUN pip install fairscale sentencepiece fire && \
+    pip cache purge
+
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the tokenizer
+ADD tokenizer.model /llama/tokenizer.model
+
+# add the weights
+ADD llama-2-13b/ /llama/llama-2-13b/
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
+# run llama example program
+CMD ["torchrun", \
+    "--nproc_per_node", "2", \
+    "/llama/dialog.py", \
+    "--ckpt_dir", "/llama/llama-2-13b/", \
+    "--tokenizer_path", "/llama/tokenizer.model", \
+    "--max_seq_len", "4096", \
+    "--max_batch_size", "6"]
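A note on `--nproc_per_node` (not part of the patch): the official Llama 2 13B
checkpoint ships as two model-parallel shards (`consolidated.00.pth`,
`consolidated.01.pth`), and torchrun spawns one process, and thus needs one
GPU, per shard. A hypothetical helper sketch to confirm the shard count of a
downloaded checkpoint:

```python
# check_shards.py - hypothetical helper, not part of this patch.
# Official Llama 2 checkpoints ship one consolidated.NN.pth file per
# model-parallel shard; --nproc_per_node must equal that count
# (1 for 7B, 2 for 13B, 8 for 70B).
from pathlib import Path
import sys

ckpt_dir = Path(sys.argv[1] if len(sys.argv) > 1 else "llama-2-13b")
shards = sorted(ckpt_dir.glob("consolidated.*.pth"))
print(f"{len(shards)} shard(s) in {ckpt_dir} -> --nproc_per_node {len(shards)}")
```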
diff --git a/Dockerfile-13b-chat b/Dockerfile-13b-chat
new file mode 100644
index 0000000..f2d53ee
--- /dev/null
+++ b/Dockerfile-13b-chat
@@ -0,0 +1,42 @@
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows the docker containers to access the host's GPUs; there's a guide for
+# Ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+
+# before building, note that the weights and the script need to be in the
+# current folder
+
+# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-13b-chat -f Dockerfile-13b-chat .`
+# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-13b-chat`
+
+FROM condaforge/mambaforge
+
+# install pytorch with CUDA support inside conda
+RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
+
+# llama dependencies
+RUN pip install fairscale sentencepiece fire && \
+    pip cache purge
+
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the tokenizer
+ADD tokenizer.model /llama/tokenizer.model
+
+# add the weights
+ADD llama-2-13b-chat/ /llama/llama-2-13b-chat/
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
+# run llama example program
+CMD ["torchrun", \
+    "--nproc_per_node", "2", \
+    "/llama/dialog.py", \
+    "--ckpt_dir", "/llama/llama-2-13b-chat/", \
+    "--tokenizer_path", "/llama/tokenizer.model", \
+    "--max_seq_len", "4096", \
+    "--max_batch_size", "6"]
diff --git a/Dockerfile-70b b/Dockerfile-70b
new file mode 100644
index 0000000..f68e775
--- /dev/null
+++ b/Dockerfile-70b
@@ -0,0 +1,42 @@
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows the docker containers to access the host's GPUs; there's a guide for
+# Ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+
+# before building, note that the weights and the script need to be in the
+# current folder
+
+# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-70b -f Dockerfile-70b .`
+# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-70b`
+
+FROM condaforge/mambaforge
+
+# install pytorch with CUDA support inside conda
+RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
+
+# llama dependencies
+RUN pip install fairscale sentencepiece fire && \
+    pip cache purge
+
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the tokenizer
+ADD tokenizer.model /llama/tokenizer.model
+
+# add the weights
+ADD llama-2-70b/ /llama/llama-2-70b/
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
+# run llama example program
+CMD ["torchrun", \
+    "--nproc_per_node", "8", \
+    "/llama/dialog.py", \
+    "--ckpt_dir", "/llama/llama-2-70b/", \
+    "--tokenizer_path", "/llama/tokenizer.model", \
+    "--max_seq_len", "4096", \
+    "--max_batch_size", "6"]
diff --git a/Dockerfile-70b-chat b/Dockerfile-70b-chat
new file mode 100644
index 0000000..ab0c221
--- /dev/null
+++ b/Dockerfile-70b-chat
@@ -0,0 +1,42 @@
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows the docker containers to access the host's GPUs; there's a guide for
+# Ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+
+# before building, note that the weights and the script need to be in the
+# current folder
+
+# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-70b-chat -f Dockerfile-70b-chat .`
+# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-70b-chat`
+
+FROM condaforge/mambaforge
+
+# install pytorch with CUDA support inside conda
+RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
+
+# llama dependencies
+RUN pip install fairscale sentencepiece fire && \
+    pip cache purge
+
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the tokenizer
+ADD tokenizer.model /llama/tokenizer.model
+
+# add the weights
+ADD llama-2-70b-chat/ /llama/llama-2-70b-chat/
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
+# run llama example program
+CMD ["torchrun", \
+    "--nproc_per_node", "8", \
+    "/llama/dialog.py", \
+    "--ckpt_dir", "/llama/llama-2-70b-chat/", \
+    "--tokenizer_path", "/llama/tokenizer.model", \
+    "--max_seq_len", "4096", \
+    "--max_batch_size", "6"]
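The two 70B images above assume a host with at least eight GPUs, one per
checkpoint shard. A hypothetical pre-flight check, assuming only that PyTorch
is installed on the host:

```python
# check_gpus.py - hypothetical pre-flight check, not part of this patch.
# The 70B checkpoints are 8-way model-parallel, so torchrun
# --nproc_per_node 8 needs at least 8 visible GPUs.
import torch

required = 8
available = torch.cuda.device_count()
if available < required:
    raise SystemExit(f"need {required} GPUs for 70B, found only {available}")
print(f"OK: {available} GPU(s) visible")
```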
"/llama/tokenizer.model", \ + "--max_seq_len", "4096", \ + "--max_batch_size", "6"] diff --git a/Dockerfile-7b b/Dockerfile-7b new file mode 100644 index 0000000..2ab8ebc --- /dev/null +++ b/Dockerfile-7b @@ -0,0 +1,42 @@ +# there's an extra step needed to install the Nvidia Container Toolkit, which +# allows the docker containers to access the gpus outside; there's a guide for +# ubuntu about that here: +# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/ + +# before building, note that the weights and the script need to be in the +# current folder + +# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-7b -f Dockerfile-7b .` +# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-7b` + +FROM condaforge/mambaforge + +# install stuff inside conda +RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \ + mamba install -c fastai fastai -y && \ + mamba clean -afy + +# llama dependencies +RUN pip install fairscale sentencepiece fire && \ + pip cache purge + +# add the llama repo +RUN git clone https://github.com/facebookresearch/llama /llama + +# add the tokenizer +ADD tokenizer.model /llama/tokenizer.model + +# add the weights +ADD llama-2-7b/ /llama/llama-2-7b/ + +# add the dialog script +ADD dialog.py /llama/dialog.py + +# run llama example program +CMD ["torchrun", \ + "--nproc_per_node", "1", \ + "/llama/dialog.py", \ + "--ckpt_dir", "/llama/llama-2-7b/", \ + "--tokenizer_path", "/llama/tokenizer.model", \ + "--max_seq_len", "4096", \ + "--max_batch_size", "6"] diff --git a/Dockerfile b/Dockerfile-7b-chat similarity index 82% rename from Dockerfile rename to Dockerfile-7b-chat index 3c6ed60..4793e9a 100644 --- a/Dockerfile +++ b/Dockerfile-7b-chat @@ -6,8 +6,8 @@ # before building, note that the weights and the script need to be in the # current folder -# build image with: docker build -t llama-7b-img . -# run image with: docker run -it --gpus all llama-7b-img +# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-7b-chat -f Dockerfile-7b-chat .` +# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-7b-chat` FROM condaforge/mambaforge @@ -38,5 +38,5 @@ CMD ["torchrun", \ "/llama/dialog.py", \ "--ckpt_dir", "/llama/llama-2-7b-chat/", \ "--tokenizer_path", "/llama/tokenizer.model", \ - "--max_seq_len", "512", \ + "--max_seq_len", "4096", \ "--max_batch_size", "6"] diff --git a/README.md b/README.md index f395ab0..50400c3 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,20 @@ ## Minimum hardware requirements to run the model -The 7B Llama2 model (the smallest one), works on ~16GB of vRAM and RAM. If RAM -is too small, use a bigger swap (this should only be needed to transfer the -weights onto the GPU, no actual computation is done on the CPU). +The 7B Llama2 model demands around 16GB of vRAM for weights + KV cache (which +depends on the maximum context length). The 13B model demands around 28GB of +vRAM, and the bigger 70B model around 140GB of vRAM. Note that you need a big +enough RAM to transfer the weights from disk to vRAM. ## How to use -There are a few requirements to get the model to run. Broadly speaking, these -are the actual model (the code), the weights and the Python script to open a -dialog, as well as some Python packages. +There are a few requirements to get a model to run. 
 
 ## How to use
 
-There are a few requirements to get the model to run. Broadly speaking, these
-are the actual model (the code), the weights and the Python script to open a
-dialog, as well as some Python packages.
+There are a few requirements to get a model to run. Broadly speaking, these are
+the actual model (the code), the weights and the Python script to open a dialog,
+as well as some Python packages.
 
-A Dockerfile is provided to build an image from scratch using the above. A
-Docker image is already built (see
-[here](https://gitlab.cs.pub.ro/netsys/llama-test/container_registry)), so you
+Dockerfiles are provided to build model images from scratch using the above.
+Docker images are already built (see
+[here](https://gitlab.cs.pub.ro/netsys/llama-images/container_registry)), so you
 can use that instead (you need to be logged in).
 
 Other than that, an Nvidia Container Toolkit driver is necessary to run Nvidia
@@ -26,14 +27,17 @@ Steps:
 1. Install [Nvidia Container Toolkit (steps for
    Ubuntu)](https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/).
    Necessary to let docker containers use the GPU.
-2. Download the Docker container image (`docker image pull
-   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
+2. Download the Docker container image of choice (e.g. `docker image pull
+   gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`).
 3. Run the docker image with `docker run -it --gpus all
-   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. This should take a while to
-   load, but then a prompt to interact with Llama is displayed.
+   gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`. This should take a
+   while to load, but then a prompt to interact with Llama is displayed.
 
 ### On the UPB cluster (fep)
 
+<strike>
+
+```
 Steps:
 
 0. The Nvidia Container Toolkit is already installed on the cluster, so skip
    that.
@@ -41,30 +45,25 @@ Steps:
 2. Get a bash shell into a partition with a GPU (`srun -p xl --gres
    gpu:tesla_p100:1 --mem=40G --pty bash`).
 3. Pull and build the docker image into an apptainer image on the grid
-   (`apptainer pull docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
-   This will take a while (probably around ~40 minutes). If it fails because of
-   a timeout, simply run the same command again.
+   (`apptainer pull
+   docker://gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`). This will
+   take a while (probably around ~40 minutes). If it fails because of a timeout,
+   simply run the same command again.
 4. Run the apptainer image with `apptainer run --nv
-   docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. The first time it
-   should take about 3 minutes for it to start, but subsequent runs will take a
-   few seconds (subsequent run = don't log out).
+   docker://gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`. The first
+   time it should take about 3 minutes for it to start, but subsequent runs will
+   take a few seconds (subsequent run = don't log out).
 5. ???
 6. Profit
 
 *Note*: The script will sometimes still error out because of Out-of-Memory
 errors or because the context length was reached. If that happens, reissue the
 command to start a new dialog.
+```
 
-## Limitations
-
-Currently only tested with 7B Llama2, with a 16GB vRAM GPU (Nvidia P100). The
-conversation context length (`--max_seq_len` parameter of the script) is limited
-to 512 tokens (about 2-3 back-and-forth dialogs with the AI). Increasing this
-will (almost surely) result in an Out-of-Memory CUDA error.
-
-## TODOs
+</strike>
 
-- [ ] Choose Python package versions to use inside the Dockerfile, rather than
-have them dangling, to prevent compatibility problems.
-- [ ] Look into quantization (the current model is 8-bit quantized already).
-- [ ] Better dialog script file.
+
+Increased the context length of all images to 4096 tokens, so the cluster GPUs
+won't be able to run the images anymore. To make them work again on the fep
+cluster, manually lower the context length (to something like 512) inside a
+Dockerfile for one of the Llama 7B images, and build the image yourself.
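(For scale, not part of the diff: under the same assumptions as the sketch
above, "something like 512" is comfortably inside what a 16GB P100 leaves for
the 7B model's preallocated KV cache:)

```python
# max_ctx_sketch.py - same assumptions as above (fp16 weights, batch 6,
# zero activation/framework overhead), so the real limit is lower.
GB = 1024 ** 3
budget = 16 * GB - 6.7e9 * 2           # vRAM left after the 7B weights
kv_per_token = 2 * 32 * 4096 * 2 * 6   # (K+V) * layers * dim * fp16 * batch
print(f"~{int(budget // kv_per_token)} context tokens fit")  # ~1200
```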
diff --git a/dialog.py b/dialog.py
index edc1043..45c3885 100644
--- a/dialog.py
+++ b/dialog.py
@@ -10,7 +10,7 @@ def main(
     tokenizer_path: str,
     temperature: float = 0.6,
     top_p: float = 0.9,
-    max_seq_len: int = 512,
+    max_seq_len: int = 4096,
     max_batch_size: int = 8,
     max_gen_len: Optional[int] = None,
 ):
@@ -24,7 +24,7 @@ def main(
         Defaults to 0.6.
         top_p (float, optional): The top-p sampling parameter for controlling
             diversity in generation. Defaults to 0.9.
-        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 512.
+        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 4096.
         max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 8.
         max_gen_len (int, optional): The maximum length of generated sequences. If None, it will be set
             to the model's max sequence length. Defaults to None.
-- 
GitLab
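For context: the patch shows only the signature and docstring of `dialog.py`'s
`main()`. A minimal sketch of what such a script plausibly looks like, modeled
on the public `example_chat_completion.py` of the facebookresearch/llama repo
(`Llama.build` / `chat_completion`); the loop body here is illustrative, not
the repo's actual code:

```python
# dialog_sketch.py - illustrative only; not the actual dialog.py.
from typing import Optional

import fire
from llama import Llama  # reference implementation from facebookresearch/llama


def main(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.6,
    top_p: float = 0.9,
    max_seq_len: int = 4096,
    max_batch_size: int = 8,
    max_gen_len: Optional[int] = None,
):
    # preallocates the KV cache for max_batch_size * max_seq_len tokens
    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )
    dialog = []
    while True:  # one back-and-forth per iteration, until ctrl-c or OOM
        dialog.append({"role": "user", "content": input("> ")})
        result = generator.chat_completion(
            [dialog],
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
        )[0]
        reply = result["generation"]
        print(reply["content"])
        dialog.append(reply)


if __name__ == "__main__":
    fire.Fire(main)  # launched via torchrun, as in the Dockerfiles' CMD
```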