diff --git a/Dockerfile-13b-chat b/Dockerfile-13b-chat
index 0ca12b1cc7dc54510934d505be7ab78d96bbb9b6..bea11affa75e36f4a46e26f5a494564e6f94579a 100644
--- a/Dockerfile-13b-chat
+++ b/Dockerfile-13b-chat
@@ -20,17 +20,14 @@ RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cu
 RUN pip install fairscale sentencepiece fire && \
     pip cache purge
 
-# add the llama repo
-RUN git clone https://github.com/facebookresearch/llama /llama
-
-# add the tokenizer
-ADD tokenizer.model /llama/tokenizer.model
+# add the llama repo (netsys edition)
+RUN git clone https://gitlab.cs.pub.ro/netsys/llama /llama
 
 # add the weights
 ADD llama-2-13b-chat/ /llama/llama-2-13b-chat/
 
-# add the dialog script
-ADD dialog.py /llama/dialog.py
+# avoid an annoying PyTorch (torch.distributed) warning
+ARG OMP_NUM_THREADS=1
 
 # run llama example program
 CMD ["torchrun", \
diff --git a/Dockerfile-70b-chat b/Dockerfile-70b-chat
index c114ad73baab5d86400c20fdfd6aaadecc3bf67b..9d6f6a2c8e34366418fcf5ccd1867c574c78b52c 100644
--- a/Dockerfile-70b-chat
+++ b/Dockerfile-70b-chat
@@ -7,7 +7,7 @@
 # current folder
 
 # build image with: `docker build -t gitlab.cs.pub.ro:5050/netsys/llama-images:llama-70b-chat -f Dockerfile-70b-chat .`
-# run image with: `docker run -it --gpus all gitlab.cs.pub.ro:5050/netsys/llama-images:llama-70b-chat`
+# run image with: `docker run -it --gpus all --shm-size 2gb gitlab.cs.pub.ro:5050/netsys/llama-images:llama-70b-chat`
 
 FROM condaforge/mambaforge
 
@@ -20,17 +20,14 @@ RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cu
 RUN pip install fairscale sentencepiece fire && \
     pip cache purge
 
-# add the llama repo
-RUN git clone https://github.com/facebookresearch/llama /llama
-
-# add the tokenizer
-ADD tokenizer.model /llama/tokenizer.model
+# add the llama repo (netsys edition)
+RUN git clone https://gitlab.cs.pub.ro/netsys/llama /llama
 
 # add the weights
 ADD llama-2-70b-chat/ /llama/llama-2-70b-chat/
 
-# add the dialog script
-ADD dialog.py /llama/dialog.py
+# avoid an annoying PyTorch (torch.distributed) warning
+ARG OMP_NUM_THREADS=1
 
 # run llama example program
 CMD ["torchrun", \
diff --git a/Dockerfile-7b-chat b/Dockerfile-7b-chat
index 0c15c21dd602f9f97526421a6de364939aab1bd8..92fe7dba4f6a43864339529ab498c7eca48d9a6c 100644
--- a/Dockerfile-7b-chat
+++ b/Dockerfile-7b-chat
@@ -20,17 +20,14 @@ RUN mamba install -c pytorch -c nvidia pytorch torchvision torchaudio pytorch-cu
 RUN pip install fairscale sentencepiece fire && \
     pip cache purge
 
-# add the llama repo
-RUN git clone https://github.com/facebookresearch/llama /llama
-
-# add the tokenizer
-ADD tokenizer.model /llama/tokenizer.model
+# add the llama repo (netsys edition)
+RUN git clone https://gitlab.cs.pub.ro/netsys/llama /llama
 
 # add the weights
 ADD llama-2-7b-chat/ /llama/llama-2-7b-chat/
 
-# add the dialog script
-ADD dialog.py /llama/dialog.py
+# avoid an annoying PyTorch (torch.distributed) warning
+ARG OMP_NUM_THREADS=1
 
 # run llama example program
 CMD ["torchrun", \
diff --git a/dialog.py b/dialog.py
deleted file mode 100644
index a6fdf7dede479962924bbda569f795b1fdcfbce9..0000000000000000000000000000000000000000
--- a/dialog.py
+++ /dev/null
@@ -1,87 +0,0 @@
-from typing import List, Optional
-
-import fire
-
-from llama import Llama, Dialog
-
-import torch
-from fairscale.nn.model_parallel.initialize import (
-    get_model_parallel_rank,
-    destroy_model_parallel
-)
-
-
-def main(
-    ckpt_dir: str,
-    tokenizer_path: str,
-    temperature: float = 0.6,
-    top_p: float = 0.9,
-    max_seq_len: int = 4096,
-    max_batch_size: int = 8,
-    max_gen_len: Optional[int] = None,
-):
-    """
-    Entry point of the program for generating text using a pretrained model.
-
-    Args:
-        ckpt_dir (str): The directory containing checkpoint files for the pretrained model.
-        tokenizer_path (str): The path to the tokenizer model used for text encoding/decoding.
-        temperature (float, optional): The temperature value for controlling randomness in generation.
-            Defaults to 0.6.
-        top_p (float, optional): The top-p sampling parameter for controlling diversity in generation.
-            Defaults to 0.9.
-        max_seq_len (int, optional): The maximum sequence length for input prompts. Defaults to 4096.
-        max_batch_size (int, optional): The maximum batch size for generating sequences. Defaults to 8.
-        max_gen_len (int, optional): The maximum length of generated sequences. If None, it will be
-            set to the model's max sequence length. Defaults to None.
-    """
-    # build the model
-    generator = Llama.build(
-        ckpt_dir=ckpt_dir,
-        tokenizer_path=tokenizer_path,
-        max_seq_len=max_seq_len,
-        max_batch_size=max_batch_size,
-    )
-
-    # at this point, we forked into multiple processes, one for each GPU
-
-    dialog: Dialog = []
-
-    # dialog loop
-    print("You can now start typing to chat!")
-    while True:
-
-        # only the first process (rank 0) will interact with the user
-        user_input = [None]
-        if get_model_parallel_rank() == 0:
-            user_input = [input("User: ")]
-
-        # broadcast user input to all ranks
-        torch.distributed.broadcast_object_list(user_input, src=0)
-
-        print('=====(processing query...)=====')
-        dialog.append({ 'role': 'user', 'content': user_input[0] })
-
-        # run inference
-        results = generator.chat_completion(
-            [dialog],
-            max_gen_len=max_gen_len,
-            temperature=temperature,
-            top_p=top_p,
-        )
-
-        # get the response
-        result = results[0]
-        role = result['generation']['role']
-        content = result['generation']['content']
-
-        # print to user and append to dialog context
-        print(f'{role.capitalize()}: {content}')
-        dialog.append({ 'role': role, 'content': content })
-
-    # at the very end, cleanup
-    destroy_model_parallel()
-
-
-if __name__ == "__main__":
-    fire.Fire(main)