Unverified Commit 3eaa8cb4 authored by Alexandru-Mihai GHERGHESCU

Update old information, increase context length to 4096

Upload Dockerfiles for normal and chat Llama2 variants.
parent fde316cc
Dockerfile-13b
# there's an extra step needed to install the Nvidia Container Toolkit, which
# allows docker containers to access the host's GPUs; there's a guide for
# Ubuntu about that here:
# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
# before building, note that the tokenizer, the weights and the dialog script
# need to be in the current folder (the docker build context)
# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-13b -f Dockerfile-13b .`
# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-13b`
FROM condaforge/mambaforge
# install pytorch (with CUDA 11.8) and fastai inside the conda environment
RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
mamba install -c fastai fastai -y && \
mamba clean -afy
# llama dependencies
RUN pip install fairscale sentencepiece fire && \
pip cache purge
# add the llama repo
RUN git clone https://github.com/facebookresearch/llama /llama
# add the tokenizer
ADD tokenizer.model /llama/tokenizer.model
# add the weights
ADD llama-2-13b/ /llama/llama-2-13b/
# add the dialog script
ADD dialog.py /llama/dialog.py
# run llama example program
CMD ["torchrun", \
"--nproc_per_node", "2", \
"/llama/dialog.py", \
"--ckpt_dir", "/llama/llama-2-13b/", \
"--tokenizer_path", "/llama/tokenizer.model", \
"--max_seq_len", "4096", \
"--max_batch_size", "6"]
Dockerfile-13b-chat
# there's an extra step needed to install the Nvidia Container Toolkit, which
# allows docker containers to access the host's GPUs; there's a guide for
# Ubuntu about that here:
# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
# before building, note that the tokenizer, the weights and the dialog script
# need to be in the current folder (the docker build context)
# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-13b-chat -f Dockerfile-13b-chat .`
# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-13b-chat`
FROM condaforge/mambaforge
# install pytorch (with CUDA 11.8) and fastai inside the conda environment
RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
mamba install -c fastai fastai -y && \
mamba clean -afy
# llama dependencies
RUN pip install fairscale sentencepiece fire && \
pip cache purge
# add the llama repo
RUN git clone https://github.com/facebookresearch/llama /llama
# add the tokenizer
ADD tokenizer.model /llama/tokenizer.model
# add the weights
ADD llama-2-13b-chat/ /llama/llama-2-13b-chat/
# add the dialog script
ADD dialog.py /llama/dialog.py
# run llama example program
CMD ["torchrun", \
"--nproc_per_node", "2", \
"/llama/dialog.py", \
"--ckpt_dir", "/llama/llama-2-13b-chat/", \
"--tokenizer_path", "/llama/tokenizer.model", \
"--max_seq_len", "4096", \
"--max_batch_size", "6"]
Dockerfile-70b
# there's an extra step needed to install the Nvidia Container Toolkit, which
# allows docker containers to access the host's GPUs; there's a guide for
# Ubuntu about that here:
# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
# before building, note that the tokenizer, the weights and the dialog script
# need to be in the current folder (the docker build context)
# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-70b -f Dockerfile-70b .`
# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-70b`
FROM condaforge/mambaforge
# install pytorch (with CUDA 11.8) and fastai inside the conda environment
RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
mamba install -c fastai fastai -y && \
mamba clean -afy
# llama dependencies
RUN pip install fairscale sentencepiece fire && \
pip cache purge
# add the llama repo
RUN git clone https://github.com/facebookresearch/llama /llama
# add the tokenizer
ADD tokenizer.model /llama/tokenizer.model
# add the weights
ADD llama-2-70b/ /llama/llama-2-70b/
# add the dialog script
ADD dialog.py /llama/dialog.py
# run llama example program
CMD ["torchrun", \
"--nproc_per_node", "8", \
"/llama/dialog.py", \
"--ckpt_dir", "/llama/llama-2-70b/", \
"--tokenizer_path", "/llama/tokenizer.model", \
"--max_seq_len", "4096", \
"--max_batch_size", "6"]
Dockerfile-70b-chat
# there's an extra step needed to install the Nvidia Container Toolkit, which
# allows docker containers to access the host's GPUs; there's a guide for
# Ubuntu about that here:
# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
# before building, note that the tokenizer, the weights and the dialog script
# need to be in the current folder (the docker build context)
# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-70b-chat -f Dockerfile-70b-chat .`
# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-70b-chat`
FROM condaforge/mambaforge
# install pytorch (with CUDA 11.8) and fastai inside the conda environment
RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
mamba install -c fastai fastai -y && \
mamba clean -afy
# llama dependencies
RUN pip install fairscale sentencepiece fire && \
pip cache purge
# add the llama repo
RUN git clone https://github.com/facebookresearch/llama /llama
# add the tokenizer
ADD tokenizer.model /llama/tokenizer.model
# add the weights
ADD llama-2-70b-chat/ /llama/llama-2-70b-chat/
# add the dialog script
ADD dialog.py /llama/dialog.py
# run llama example program
CMD ["torchrun", \
"--nproc_per_node", "8", \
"/llama/dialog.py", \
"--ckpt_dir", "/llama/llama-2-70b-chat/", \
"--tokenizer_path", "/llama/tokenizer.model", \
"--max_seq_len", "4096", \
"--max_batch_size", "6"]
Dockerfile-7b
# there's an extra step needed to install the Nvidia Container Toolkit, which
# allows docker containers to access the host's GPUs; there's a guide for
# Ubuntu about that here:
# https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/
# before building, note that the tokenizer, the weights and the dialog script
# need to be in the current folder (the docker build context)
# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-7b -f Dockerfile-7b .`
# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-7b`
FROM condaforge/mambaforge
# install pytorch (with CUDA 11.8) and fastai inside the conda environment
RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cuda=11.8 -y && \
mamba install -c fastai fastai -y && \
mamba clean -afy
# llama dependencies
RUN pip install fairscale sentencepiece fire && \
pip cache purge
# add the llama repo
RUN git clone https://github.com/facebookresearch/llama /llama
# add the tokenizer
ADD tokenizer.model /llama/tokenizer.model
# add the weights
ADD llama-2-7b/ /llama/llama-2-7b/
# add the dialog script
ADD dialog.py /llama/dialog.py
# run llama example program
CMD ["torchrun", \
"--nproc_per_node", "1", \
"/llama/dialog.py", \
"--ckpt_dir", "/llama/llama-2-7b/", \
"--tokenizer_path", "/llama/tokenizer.model", \
"--max_seq_len", "4096", \
"--max_batch_size", "6"]
Dockerfile-7b-chat
@@ -6,8 +6,8 @@
 # before building, note that the weights and the script need to be in the
 # current folder
-# build image with: docker build -t llama-7b-img .
-# run image with: docker run -it --gpus all llama-7b-img
+# build image with: `docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-7b-chat -f Dockerfile-7b-chat .`
+# run image with: `docker run -it --gpus all gitlab.cs.pub.ro/netsys/llama-images:llama-7b-chat`
 FROM condaforge/mambaforge
@@ -38,5 +38,5 @@ CMD ["torchrun", \
     "/llama/dialog.py", \
     "--ckpt_dir", "/llama/llama-2-7b-chat/", \
     "--tokenizer_path", "/llama/tokenizer.model", \
-    "--max_seq_len", "512", \
+    "--max_seq_len", "4096", \
     "--max_batch_size", "6"]
README.md
@@ -2,19 +2,20 @@
 ## Minimum hardware requirements to run the model
 
-The 7B Llama2 model (the smallest one), works on ~16GB of vRAM and RAM. If RAM
-is too small, use a bigger swap (this should only be needed to transfer the
-weights onto the GPU, no actual computation is done on the CPU).
+The 7B Llama2 model demands around 16GB of vRAM for weights + KV cache (which
+depends on the maximum context length). The 13B model demands around 28GB of
+vRAM, and the bigger 70B model around 140GB of vRAM. Note that you need enough
+RAM to transfer the weights from disk to vRAM.
 
 ## How to use
 
-There are a few requirements to get the model to run. Broadly speaking, these
-are the actual model (the code), the weights and the Python script to open a
-dialog, as well as some Python packages.
+There are a few requirements to get a model to run. Broadly speaking, these are
+the actual model (the code), the weights and the Python script to open a dialog,
+as well as some Python packages.
 
-A Dockerfile is provided to build an image from scratch using the above. A
-Docker image is already built (see
-[here](https://gitlab.cs.pub.ro/netsys/llama-test/container_registry)), so you
-can use that instead (you need to be logged in).
+Dockerfiles are provided to build model images from scratch using the above.
+Docker images are already built (see
+[here](https://gitlab.cs.pub.ro/netsys/llama-images/container_registry)), so you
+can use that instead (you need to be logged in).
 
 Other than that, an Nvidia Container Toolkit driver is necessary to run Nvidia
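The vRAM figures in the hunk above can be sanity-checked with some back-of-the-envelope arithmetic (a sketch, not part of the repo): fp16 weights cost 2 bytes per parameter, and the KV cache grows linearly with context length and batch size. The numbers below use the 7B model's real dimensions (32 transformer layers, hidden size 4096); the 70B model uses grouped-query attention, so this formula overestimates its cache.

```python
# rough fp16 vRAM estimate: weights + KV cache (ignores activations/overhead)
def vram_gib(params_billions, n_layers, dim, seq_len, batch):
    weights = params_billions * 1e9 * 2                   # 2 bytes per param
    kv_cache = 2 * n_layers * batch * seq_len * dim * 2   # K and V, fp16
    return (weights + kv_cache) / 2**30

print(vram_gib(7, 32, 4096, 512, 6))    # ~14.5 GiB -> fit the old 512 context
print(vram_gib(7, 32, 4096, 4096, 6))   # ~25 GiB -> why 16GB GPUs now OOM
```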
@@ -26,14 +27,17 @@ Steps:
 1. Install [Nvidia Container Toolkit (steps for
    Ubuntu)](https://saturncloud.io/blog/how-to-install-pytorch-on-the-gpu-with-docker/).
    Necessary to let docker containers use the GPU.
-2. Download the Docker container image (`docker image pull
-   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
+2. Download the Docker container image of choice (e.g. `docker image pull
+   gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`).
 3. Run the docker image with `docker run -it --gpus all
-   gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. This should take a while to
-   load, but then a prompt to interact with Llama is displayed.
+   gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`. This should take a
+   while to load, but then a prompt to interact with Llama is displayed.
 
 ### On the UPB cluster (fep)
 
+<strike>
+```
 Steps:
 0. The Nvidia Container Toolkit is already installed on the cluster, so skip
    that.
@@ -41,30 +45,25 @@ Steps:
 2. Get a bash shell into a partition with a GPU (`srun -p xl --gres
    gpu:tesla_p100:1 --mem=40G --pty bash`).
 3. Pull and build the docker image into an apptainer image on the grid
-   (`apptainer pull docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`).
-   This will take a while (probably around ~40 minutes). If it fails because of
-   a timeout, simply run the same command again.
+   (`apptainer pull
+   docker://gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`). This will
+   take a while (probably around ~40 minutes). If it fails because of a timeout,
+   simply run the same command again.
 4. Run the apptainer image with `apptainer run --nv
-   docker://gitlab.cs.pub.ro:5050/netsys/llama-test:latest`. The first time it
-   should take about 3 minutes for it to start, but subsequent runs will take a
-   few seconds (subsequent run = don't log out).
+   docker://gitlab.cs.pub.ro:5050/netsys/llama-images:llama-7b-chat`. The first
+   time it should take about 3 minutes for it to start, but subsequent runs will
+   take a few seconds (subsequent run = don't log out).
 5. ???
 6. Profit
 
 *Note*: The script will sometimes still error out because of Out-of-Memory
 errors or because the context length was reached. If that happens, reissue the
 command to start a new dialog.
+```
+</strike>
 
-## Limitations
-
-Currently only tested with 7B Llama2, with a 16GB vRAM GPU (Nvidia P100). The
-conversation context length (`--max_seq_len` parameter of the script) is limited
-to 512 tokens (about 2-3 back-and-forth dialogs with the AI). Increasing this
-will (almost surely) result in an Out-of-Memory CUDA error.
-
-## TODOs
-
-- [ ] Choose Python package versions to use inside the Dockerfile, rather than
-  have them dangling, to prevent compatibility problems.
-- [ ] Look into quantization (the current model is 8-bit quantized already).
-- [ ] Better dialog script file.
+Increased the context length of all images to 4096 tokens, therefore the cluster
+GPUs won't be able to run the images anymore. To make it work again on the fep
+cluster, you have to manually modify the context length inside a Dockerfile for
+one of the Llama 7B images (to something like 512), and build the image yourself.
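For the manual rebuild mentioned in the last hunk, one possible sequence (a sketch; the image tag is hypothetical, and `sed` simply rewrites the `--max_seq_len` value baked into the CMD):

```sh
# shrink the context length of the 7B chat image back to 512, then rebuild
sed -i 's/"--max_seq_len", "4096"/"--max_seq_len", "512"/' Dockerfile-7b-chat
docker build -t gitlab.cs.pub.ro/netsys/llama-images:llama-7b-chat-512 \
    -f Dockerfile-7b-chat .
```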
dialog.py
@@ -10,7 +10,7 @@ def main(
     tokenizer_path: str,
     temperature: float = 0.6,
     top_p: float = 0.9,
-    max_seq_len: int = 512,
+    max_seq_len: int = 4096,
     max_batch_size: int = 8,
     max_gen_len: Optional[int] = None,
 ):
@@ -24,7 +24,7 @@ def main(
         Defaults to 0.6.
         top_p (float, optional): The top-p sampling parameter for controlling diversity in generation.
             Defaults to 0.9.
-        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 512.
+        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 4096.
         max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 8.
         max_gen_len (int, optional): The maximum length of generated sequences. If None, it will be
             set to the model's max sequence length. Defaults to None.
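Given the `fire` dependency installed in the Dockerfiles, `main()`'s keyword arguments are presumably exposed as command-line flags (that is how the CMD lines pass `--ckpt_dir` and friends). So inside a running container the dialog can be restarted with different limits, e.g. (a sketch assuming the 7B chat layout from the Dockerfiles):

```sh
# restart the dialog with a smaller context after an out-of-memory error
torchrun --nproc_per_node 1 /llama/dialog.py \
    --ckpt_dir /llama/llama-2-7b-chat/ \
    --tokenizer_path /llama/tokenizer.model \
    --max_seq_len 512 --max_gen_len 256
```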