From 5f4899f0c0644559e0df6adaac339aa0af40853f Mon Sep 17 00:00:00 2001
From: Alexandru Gherghescu <gherghescu_alex1@yahoo.ro>
Date: Wed, 31 Jan 2024 19:38:39 +0200
Subject: [PATCH] Add compute requirements script and docs

---
 scripts/memory_compute_estimations/README.md  | 30 +++++++
 .../memory_compute_estimations/compute_req.py | 81 +++++++++++++++++++
 2 files changed, 111 insertions(+)
 create mode 100644 scripts/memory_compute_estimations/compute_req.py

diff --git a/scripts/memory_compute_estimations/README.md b/scripts/memory_compute_estimations/README.md
index 4410ff0..5db08d2 100644
--- a/scripts/memory_compute_estimations/README.md
+++ b/scripts/memory_compute_estimations/README.md
@@ -51,3 +51,33 @@ scaling models using
 Megatron](https://developer.nvidia.com/blog/scaling-language-model-training-to-a-trillion-parameters-using-megatron/),
 as well as [scaling experiments using Megatron and AMD on the LUMI
 cluster](https://lumi-supercomputer.eu/scaling-the-pre-training-of-large-language-models-of-100b-parameters-to-thousands-of-amd-mi250x-gpus-on-lumi/).
+
+## Compute requirements
+
+Compute requirements for training models can be estimated with the script
+`compute_req.py`. Change the values at the top (or use the predefined
+defaults), run it and inspect the output. A condensed worked example is
+included at the end of this file.
+
+Notice that total compute is not affected by either batch size or context
+length. Since the model needs to see the whole dataset anyway, it doesn't
+really matter how the dataset is partitioned (fewer big chunks or more small
+chunks). Batch size and context length will, however, affect memory usage.
+Context length also indirectly affects dataset size: the intuition is that a
+bigger context needs more dataset tokens to be fully trained. Increasing the
+context length should therefore generally come with an increase in dataset
+size, though the scaling is definitely not linear (it's a best-guess
+scenario).
+
+Be careful with the estimates at small scales (small dataset size, a model
+with a low number of parameters etc.), as communication/software overheads
+start to matter when the compute needed per step update is low. GPUs usually
+work best when fed big matrices, which keeps them occupied more fully.
+
+## Running the scripts together
+
+> You probably want to first run `memory_req.py`, which outputs the number of
+> GPUs needed for baseline model parallelism (tensor + pipeline). Don't worry
+> too much about adjusting the batch size, as gradient accumulation can be
+> used to increase that value without memory overhead. The total number of
+> GPUs should then be adapted in `compute_req.py`, multiplied by the desired
+> data-parallel factor (2x, 3x, 4x etc.), as described above.
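+
+As a quick sanity check, the core of the estimate can be reproduced in a few
+lines of Python. This is a condensed sketch of what `compute_req.py` computes;
+the 65B parameter count, 2T-token dataset, A100 peak throughput, 40%
+utilization and 2048 GPUs below are just the script's defaults, not
+requirements:
+
+```python
+model_params = 65e9                # ~65B-parameter model
+dataset_tokens = 2e12              # 2T training tokens
+gpus = 2048                        # available GPUs
+
+# 4 model passes (2x forward + 2x backward with gradient checkpointing),
+# times 2 for the all-reduce compute multiplier
+total_flops = 4 * 2 * model_params * dataset_tokens
+
+per_gpu_flops = 312e12 * 0.4       # A100 fp16 peak at 40% utilization
+gpu_hours = total_flops / (per_gpu_flops * 3600)
+days = total_flops / (gpus * per_gpu_flops) / 86400
+
+print(f"{gpu_hours:,.0f} GPU-hours, {days:.1f} days on {gpus} GPUs")
+```
+
+With these defaults this comes out to roughly 2.3 million GPU-hours, or about
+47 days on 2048 GPUs, in line with what `compute_req.py` prints for its 65B
+setup.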
diff --git a/scripts/memory_compute_estimations/compute_req.py b/scripts/memory_compute_estimations/compute_req.py
new file mode 100644
index 0000000..5f3188a
--- /dev/null
+++ b/scripts/memory_compute_estimations/compute_req.py
@@ -0,0 +1,81 @@
+setups = {
+    "70M": { "L": 10, "H": 10, "D": 640, },
+    "284M": { "L": 20, "H": 16, "D": 1024, },
+    "512M": { "L": 24, "H": 10, "D": 1280, },
+    "1B": { "L": 26, "H": 14, "D": 1792, },
+    "1.5B": { "L": 28, "H": 16, "D": 2048, },
+    "6.5B": { "L": 32, "H": 32, "D": 4096, },
+    "13B": { "L": 40, "H": 40, "D": 5120, },
+    "30B": { "L": 60, "H": 52, "D": 6656, },
+    "65B": { "L": 80, "H": 64, "D": 8192, },
+    "140B": { "L": 80, "H": 96, "D": 12288, },
+    "310B": { "L": 96, "H": 128, "D": 16384, },
+    "1T": { "L": 128, "H": 160, "D": 25600, }
+}
+
+CURRENT = setups["65B"]
+
+L = CURRENT["L"]  # number of layers
+H = CURRENT["H"]  # number of heads
+D = CURRENT["D"]  # embedding dimension
+
+TOKS = 32_000  # number of tokens in the vocabulary
+
+# expected peak TFLOPS of the GPU (for fp16, A100s have 312, MI250Xs have 383,
+# and H100s have ~1000)
+GPU_PEAK_TFLOPS = 312
+
+# expected GPU throughput (40% GPU utilization for large model training is
+# usually the case, although 50% has been achieved with different techniques,
+# at different scales of training etc.)
+EXPECTED_GPU_THROUGHPUT = 0.4
+
+# dataset size (in tokens)
+DATASET_SIZE = 2_000_000_000_000
+
+# expected available GPUs (to correctly assess this, you probably want to
+# increase it in multiples of the number of GPUs needed for tensor and
+# pipeline parallelism; e.g. training a 70B model requires at least 2 DGX
+# nodes, each with 8 GPUs; therefore, the base number of GPUs required to
+# hold the model is 16; data parallelism adds, for each data-parallel
+# replica, another 16 GPUs, so the number of available GPUs should be 16, 32,
+# 48, 64 etc. to get an accurate count; the base number of required GPUs is
+# the output of the `memory_req.py` script)
+EXPECTED_AVAILABLE_GPUS = 2048
+
+# -- END OF GLOBALS --
+
+# model parameters
+embedding_layer = TOKS * D
+multi_head_attention_layer = 4 * D * D
+feed_forward_layer = 3 * D * (8 * D // 3)
+norm_layer = D
+out_layer = TOKS * D
+model_params = embedding_layer + L * (multi_head_attention_layer + \
+    feed_forward_layer + 2 * norm_layer) + norm_layer + out_layer
+
+# per-GPU throughput in FLOPS
+per_gpu_throughput = GPU_PEAK_TFLOPS * 10**12 * EXPECTED_GPU_THROUGHPUT
+
+# 4 passes = 2x forward and 2x backward, if using gradient checkpointing;
+# otherwise change to 3 passes = 1x forward and 2x backward
+number_of_model_passes = 4
+
+# all-reduce compute multiplier
+all_reduce_compute = 2
+
+# estimated compute (FLOPs)
+total_compute = number_of_model_passes * all_reduce_compute * \
+    model_params * DATASET_SIZE
+
+# estimated GPU-hours
+gpu_hours = total_compute / (per_gpu_throughput * 3600)
+
+# estimated time needed given the number of GPUs available (seconds)
+time_needed = total_compute / (EXPECTED_AVAILABLE_GPUS * per_gpu_throughput)
+
+print(f"Model params: {model_params:,}")
+print(f"Dataset size (tokens): {DATASET_SIZE:,}")
+print(f"Estimated compute needed (PFLOPs): {total_compute / 10**15:,.2f}")
+print(f"Estimated GPU-hours needed: {gpu_hours:,.2f} with {EXPECTED_GPU_THROUGHPUT * 100:.0f}% GPU utilization")
+print(f"Days to train (with tensor/pipeline/data parallel): {time_needed / (60 * 60 * 24):.1f} with {EXPECTED_AVAILABLE_GPUS} GPUs available")
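+
+# Optional cross-check, not used in the estimate above: without gradient
+# checkpointing (3 model passes instead of 4), the formula reduces to the
+# familiar "6 * params * tokens" rule of thumb for training compute.
+compute_no_checkpointing = 3 * all_reduce_compute * model_params * DATASET_SIZE
+print(f"Compute without gradient checkpointing (PFLOPs): "
+      f"{compute_no_checkpointing / 10**15:,.2f}")
--
GitLab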