From 3eaa8cb430f1514464c0ed09ca789cb33271f685 Mon Sep 17 00:00:00 2001
From: Alexandru Gherghescu <gherghescu_alex1@yahoo.ro>
Date: Fri, 24 Nov 2023 15:46:20 +0200
Subject: [PATCH] Update old information, increase context length to 4096

Upload Dockerfiles for normal and chat Llama2 variants.
---
 Dockerfile-13b                   | 42 ++++++++++++++++++++++
 Dockerfile-13b-chat              | 42 ++++++++++++++++++++++
 Dockerfile-70b                   | 42 ++++++++++++++++++++++
 Dockerfile-70b-chat              | 42 ++++++++++++++++++++++
 Dockerfile-7b                    | 42 ++++++++++++++++++++++
 Dockerfile => Dockerfile-7b-chat |  6 ++--
 README.md                        | 61 ++++++++++++++++----------------
 dialog.py                        |  4 +--
 8 files changed, 245 insertions(+), 36 deletions(-)
 create mode 100644 Dockerfile-13b
 create mode 100644 Dockerfile-13b-chat
 create mode 100644 Dockerfile-70b
 create mode 100644 Dockerfile-70b-chat
 create mode 100644 Dockerfile-7b
 rename Dockerfile => Dockerfile-7b-chat (82%)

diff --git a/Dockerfile-13b b/Dockerfile-13b
new file mode 100644
index 0000000..5267946
--- /dev/null
+++ b/Dockerfile-13b
@@ -0,0 +1,42 @@
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows the docker containers to access the host's GPUs; there's a guide for
+# Ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+
+# before building, note that the weights and the script need to be in the
+# current folder
+
+# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-13b -f Dockerfile-13b .`
+# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-13b`
+
+FROM condaforge/mambaforge
+
+# install pytorch with CUDA support inside conda
+RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
+
+# llama dependencies
+RUN pip install fairscale sentencepiece fire && \
+    pip cache purge
+
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the tokenizer
+ADD tokenizer.model /llama/tokenizer.model
+
+# add the weights
+ADD llama-2-13b/ /llama/llama-2-13b/
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
+# run llama example program
+CMD ["torchrun", \
+    "--nproc_per_node", "2", \
+    "/llama/dialog.py", \
+    "--ckpt_dir", "/llama/llama-2-13b/", \
+    "--tokenizer_path", "/llama/tokenizer.model", \
+    "--max_seq_len", "4096", \
+    "--max_batch_size", "6"]
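A note on `--nproc_per_node` (not part of the patch): the official Llama 2 13B
checkpoint ships as two model-parallel shards (`consolidated.00.pth`,
`consolidated.01.pth`), and torchrun spawns one process, and thus needs one
GPU, per shard. A hypothetical helper sketch to confirm the shard count of a
downloaded checkpoint:

```python
# check_shards.py - hypothetical helper, not part of this patch.
# Official Llama 2 checkpoints ship one consolidated.NN.pth file per
# model-parallel shard; --nproc_per_node must equal that count
# (1 for 7B, 2 for 13B, 8 for 70B).
from pathlib import Path
import sys

ckpt_dir = Path(sys.argv[1] if len(sys.argv) > 1 else "llama-2-13b")
shards = sorted(ckpt_dir.glob("consolidated.*.pth"))
print(f"{len(shards)} shard(s) in {ckpt_dir} -> --nproc_per_node {len(shards)}")
```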
diff --git a/Dockerfile-13b-chat b/Dockerfile-13b-chat
new file mode 100644
index 0000000..f2d53ee
--- /dev/null
+++ b/Dockerfile-13b-chat
@@ -0,0 +1,42 @@
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows the docker containers to access the host's GPUs; there's a guide for
+# Ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+
+# before building, note that the weights and the script need to be in the
+# current folder
+
+# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-13b-chat -f Dockerfile-13b-chat .`
+# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-13b-chat`
+
+FROM condaforge/mambaforge
+
+# install pytorch with CUDA support inside conda
+RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
+
+# llama dependencies
+RUN pip install fairscale sentencepiece fire && \
+    pip cache purge
+
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the tokenizer
+ADD tokenizer.model /llama/tokenizer.model
+
+# add the weights
+ADD llama-2-13b-chat/ /llama/llama-2-13b-chat/
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
+# run llama example program
+CMD ["torchrun", \
+    "--nproc_per_node", "2", \
+    "/llama/dialog.py", \
+    "--ckpt_dir", "/llama/llama-2-13b-chat/", \
+    "--tokenizer_path", "/llama/tokenizer.model", \
+    "--max_seq_len", "4096", \
+    "--max_batch_size", "6"]
diff --git a/Dockerfile-70b b/Dockerfile-70b
new file mode 100644
index 0000000..f68e775
--- /dev/null
+++ b/Dockerfile-70b
@@ -0,0 +1,42 @@
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows the docker containers to access the host's GPUs; there's a guide for
+# Ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+
+# before building, note that the weights and the script need to be in the
+# current folder
+
+# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-70b -f Dockerfile-70b .`
+# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-70b`
+
+FROM condaforge/mambaforge
+
+# install pytorch with CUDA support inside conda
+RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
+
+# llama dependencies
+RUN pip install fairscale sentencepiece fire && \
+    pip cache purge
+
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the tokenizer
+ADD tokenizer.model /llama/tokenizer.model
+
+# add the weights
+ADD llama-2-70b/ /llama/llama-2-70b/
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
+# run llama example program
+CMD ["torchrun", \
+    "--nproc_per_node", "8", \
+    "/llama/dialog.py", \
+    "--ckpt_dir", "/llama/llama-2-70b/", \
+    "--tokenizer_path", "/llama/tokenizer.model", \
+    "--max_seq_len", "4096", \
+    "--max_batch_size", "6"]
diff --git a/Dockerfile-70b-chat b/Dockerfile-70b-chat
new file mode 100644
index 0000000..ab0c221
--- /dev/null
+++ b/Dockerfile-70b-chat
@@ -0,0 +1,42 @@
+# there's an extra step needed to install the Nvidia Container Toolkit, which
+# allows the docker containers to access the host's GPUs; there's a guide for
+# Ubuntu about that here:
+# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
+
+# before building, note that the weights and the script need to be in the
+# current folder
+
+# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-70b-chat -f Dockerfile-70b-chat .`
+# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-70b-chat`
+
+FROM condaforge/mambaforge
+
+# install pytorch with CUDA support inside conda
+RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
+    mamba install -c fastai fastai -y && \
+    mamba clean -afy
+
+# llama dependencies
+RUN pip install fairscale sentencepiece fire && \
+    pip cache purge
+
+# add the llama repo
+RUN git clone https://github.com/facebookresearch/llama /llama
+
+# add the tokenizer
+ADD tokenizer.model /llama/tokenizer.model
+
+# add the weights
+ADD llama-2-70b-chat/ /llama/llama-2-70b-chat/
+
+# add the dialog script
+ADD dialog.py /llama/dialog.py
+
+# run llama example program
+CMD ["torchrun", \
+    "--nproc_per_node", "8", \
+    "/llama/dialog.py", \
+    "--ckpt_dir", "/llama/llama-2-70b-chat/", \
+    "--tokenizer_path", "/llama/tokenizer.model", \
+    "--max_seq_len", "4096", \
+    "--max_batch_size", "6"]
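The two 70B images above assume a host with at least eight GPUs, one per
checkpoint shard. A hypothetical pre-flight check, assuming only that PyTorch
is installed on the host:

```python
# check_gpus.py - hypothetical pre-flight check, not part of this patch.
# The 70B checkpoints are 8-way model-parallel, so torchrun
# --nproc_per_node 8 needs at least 8 visible GPUs.
import torch

required = 8
available = torch.cuda.device_count()
if available < required:
    raise SystemExit(f"need {required} GPUs for 70B, found only {available}")
print(f"OK: {available} GPU(s) visible")
```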
"/llama/tokenizer.model", \ + "--max_seq_len", "4096", \ + "--max_batch_size", "6"] diff --git a/Dockerfile-7b b/Dockerfile-7b new file mode 100644 index 0000000..2ab8ebc --- /dev/null +++ b/Dockerfile-7b @@ -0,0 +1,42 @@ +# there's an extra step needed to install the Nvidia Container Toolkit, which +# allows the docker containers to access the gpus outside; there's a guide for +# ubuntu about that here: +# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/ + +# before building, note that the weights and the script need to be in the +# current folder + +# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-7b -f Dockerfile-7b .` +# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-7b` + +FROM condaforge/mambaforge + +# install stuff inside conda +RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \ + mamba install -c fastai fastai -y && \ + mamba clean -afy + +# llama dependencies +RUN pip install fairscale sentencepiece fire && \ + pip cache purge + +# add the llama repo +RUN git clone https://github.com/facebookresearch/llama /llama + +# add the tokenizer +ADD tokenizer.model /llama/tokenizer.model + +# add the weights +ADD llama-2-7b/ /llama/llama-2-7b/ + +# add the dialog script +ADD dialog.py /llama/dialog.py + +# run llama example program +CMD ["torchrun", \ + "--nproc_per_node", "1", \ + "/llama/dialog.py", \ + "--ckpt_dir", "/llama/llama-2-7b/", \ + "--tokenizer_path", "/llama/tokenizer.model", \ + "--max_seq_len", "4096", \ + "--max_batch_size", "6"] diff --git a/Dockerfile b/Dockerfile-7b-chat similarity index 82% rename from Dockerfile rename to Dockerfile-7b-chat index 3c6ed60..4793e9a 100644 --- a/Dockerfile +++ b/Dockerfile-7b-chat @@ -6,8 +6,8 @@ # before building, note that the weights and the script need to be in the # current folder -# build image with: docker build -t llama-7b-img . -# run image with: docker run -it --gpus all llama-7b-img +# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-7b-chat -f Dockerfile-7b-chat .` +# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-7b-chat` FROM condaforge/mambaforge @@ -38,5 +38,5 @@ CMD ["torchrun", \ "/llama/dialog.py", \ "--ckpt_dir", "/llama/llama-2-7b-chat/", \ "--tokenizer_path", "/llama/tokenizer.model", \ - "--max_seq_len", "512", \ + "--max_seq_len", "4096", \ "--max_batch_size", "6"] diff --git a/README.md b/README.md index f395ab0..50400c3 100644 --- a/README.md +++ b/README.md @@ -2,19 +2,20 @@ ## Minimum hardware requirements to run the model -The 7B Llama2 model (the smallest one), works on ~16GB of vRAM and RAM. If RAM -is too small, use a bigger swap (this should only be needed to transfer the -weights onto the GPU, no actual computation is done on the CPU). +The 7B Llama2 model demands around 16GB of vRAM for weights + KV cache (which +depends on the maximum context length). The 13B model demands around 28GB of +vRAM, and the bigger 70B model around 140GB of vRAM. Note that you need a big +enough RAM to transfer the weights from disk to vRAM. ## How to use -There are a few requirements to get the model to run. Broadly speaking, these -are the actual model (the code), the weights and the Python script to open a -dialog, as well as some Python packages. +There are a few requirements to get a model to run. 
 
 ## How to use
 
-There are a few requirements to get the model to run. Broadly speaking, these
-are the actual model (the code), the weights and the Python script to open a
-dialog, as well as some Python packages.
+There are a few requirements to get a model to run. Broadly speaking, these are
+the actual model (the code), the weights and the Python script to open a dialog,
+as well as some Python packages.
 
-A Dockerfile is provided to build an image from scratch using the above. A
-Docker image is already built (see
-[here](https://gitlab.cs.pub.ro/netsys/llama-test/container_registry)), so you
+Dockerfiles are provided to build model images from scratch using the above.
+Docker images are already built (see
+[here](https://gitlab.cs.pub.ro/netsys/llama-images/container_registry)), so you
 can use that instead (you need to be logged in).
 
 Other than that, an Nvidia Container Toolkit driver is necessary to run Nvidia
@@ -26,14 +27,17 @@ Steps:
 1. Install [Nvidia Container Toolkit (steps for
    Ubuntu)](https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/).
    Necessary to let docker containers use the GPU.
-2. Download the Docker container image (`docker image pull
-   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
+2. Download the Docker container image of choice (e.g. `docker image pull
+   gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`).
 3. Run the docker image with `docker run -it --gpus all
-   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. This should take a while to
-   load, but then a prompt to interact with Llama is displayed.
+   gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`. This should take a
+   while to load, but then a prompt to interact with Llama is displayed.
 
 ### On the UPB cluster (fep)
 
+<strike>
+
+```
 Steps:
 
 0. The Nvidia Container Toolkit is already installed on the cluster, so skip
    that.
@@ -41,30 +45,25 @@ Steps:
 2. Get a bash shell into a partition with a GPU (`srun -p xl --gres
    gpu:tesla_p100:1 --mem=40G --pty bash`).
 3. Pull and build the docker image into an apptainer image on the grid
-   (`apptainer pull docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
-   This will take a while (probably around ~40 minutes). If it fails because of
-   a timeout, simply run the same command again.
+   (`apptainer pull
+   docker://gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`). This will
+   take a while (probably around ~40 minutes). If it fails because of a timeout,
+   simply run the same command again.
 4. Run the apptainer image with `apptainer run --nv
-   docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. The first time it
-   should take about 3 minutes for it to start, but subsequent runs will take a
-   few seconds (subsequent run = don't log out).
+   docker://gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`. The first
+   time it should take about 3 minutes for it to start, but subsequent runs will
+   take a few seconds (subsequent run = don't log out).
 5. ???
 6. Profit
 
 *Note*: The script will sometimes still error out because of Out-of-Memory
 errors or because the context length was reached. If that happens, reissue the
 command to start a new dialog.
+```
 
-## Limitations
-
-Currently only tested with 7B Llama2, with a 16GB vRAM GPU (Nvidia P100). The
-conversation context length (`--max_seq_len` parameter of the script) is limited
-to 512 tokens (about 2-3 back-and-forth dialogs with the AI). Increasing this
-will (almost surely) result in an Out-of-Memory CUDA error.
-
-## TODOs
+</strike>
 
-- [ ] Choose Python package versions to use inside the Dockerfile, rather than
-have them dangling, to prevent compatibility problems.
-- [ ] Look into quantization (the current model is 8-bit quantized already).
-- [ ] Better dialog script file.
+
+Increased the context length of all images to 4096 tokens, so the cluster GPUs
+won't be able to run the images anymore. To make them work again on the fep
+cluster, manually lower the context length (to something like 512) inside a
+Dockerfile for one of the Llama 7B images, and build the image yourself.
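(For scale, not part of the diff: under the same assumptions as the sketch
above, "something like 512" is comfortably inside what a 16GB P100 leaves for
the 7B model's preallocated KV cache:)

```python
# max_ctx_sketch.py - same assumptions as above (fp16 weights, batch 6,
# zero activation/framework overhead), so the real limit is lower.
GB = 1024 ** 3
budget = 16 * GB - 6.7e9 * 2           # vRAM left after the 7B weights
kv_per_token = 2 * 32 * 4096 * 2 * 6   # (K+V) * layers * dim * fp16 * batch
print(f"~{int(budget // kv_per_token)} context tokens fit")  # ~1200
```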
diff --git a/dialog.py b/dialog.py
index edc1043..45c3885 100644
--- a/dialog.py
+++ b/dialog.py
@@ -10,7 +10,7 @@ def main(
     tokenizer_path: str,
     temperature: float = 0.6,
     top_p: float = 0.9,
-    max_seq_len: int = 512,
+    max_seq_len: int = 4096,
     max_batch_size: int = 8,
     max_gen_len: Optional[int] = None,
 ):
@@ -24,7 +24,7 @@ def main(
         Defaults to 0.6.
         top_p (float, optional): The top-p sampling parameter for controlling
             diversity in generation. Defaults to 0.9.
-        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 512.
+        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 4096.
         max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 8.
         max_gen_len (int, optional): The maximum length of generated sequences. If None, it will be set
             to the model's max sequence length. Defaults to None.
-- 
GitLab
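For context: the patch shows only the signature and docstring of `dialog.py`'s
`main()`. A minimal sketch of what such a script plausibly looks like, modeled
on the public `example_chat_completion.py` of the facebookresearch/llama repo
(`Llama.build` / `chat_completion`); the loop body here is illustrative, not
the repo's actual code:

```python
# dialog_sketch.py - illustrative only; not the actual dialog.py.
from typing import Optional

import fire
from llama import Llama  # reference implementation from facebookresearch/llama


def main(
    ckpt_dir: str,
    tokenizer_path: str,
    temperature: float = 0.6,
    top_p: float = 0.9,
    max_seq_len: int = 4096,
    max_batch_size: int = 8,
    max_gen_len: Optional[int] = None,
):
    # preallocates the KV cache for max_batch_size * max_seq_len tokens
    generator = Llama.build(
        ckpt_dir=ckpt_dir,
        tokenizer_path=tokenizer_path,
        max_seq_len=max_seq_len,
        max_batch_size=max_batch_size,
    )
    dialog = []
    while True:  # one back-and-forth per iteration, until ctrl-c or OOM
        dialog.append({"role": "user", "content": input("> ")})
        result = generator.chat_completion(
            [dialog],
            max_gen_len=max_gen_len,
            temperature=temperature,
            top_p=top_p,
        )[0]
        reply = result["generation"]
        print(reply["content"])
        dialog.append(reply)


if __name__ == "__main__":
    fire.Fire(main)  # launched via torchrun, as in the Dockerfiles' CMD
```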