Skip to content
Snippets Groups Projects

Compute/memory requirements scripts

Open Alexandru-Mihai GHERGHESCU requested to merge feature/scripts into main
@@ -31,17 +31,17 @@ GPU_PEAK_TFLOPS = 312
EXPECTED_GPU_THROUGHPUT = 0.4
# dataset size (in tokens)
DATASET_SIZE = 10_000_000_000
DATASET_SIZE = 2_000_000_000_000
# expected available GPUs (to correctly assess, you probably want to increase
# this in multiples of the number of GPUs needed for tensor and pipeline
# parallelism; e.g. training a 70B requires at least 8x DGX clusters, each with
# 8 GPUs; therefore, the base number of required GPUs to hold the model is 32;
# data parallel adds, for each data parallel unit, another 32 GPUs, therefore
# the number of available GPUs should be 32, 64, 96 etc. to get an accurate
# parallelism; e.g. training a 70B requires at least 2x DGX clusters, each with
# 8 GPUs; therefore, the base number of required GPUs to hold the model is 16;
# data parallel adds, for each data parallel unit, another 16 GPUs, therefore
# the number of available GPUs should be 16, 32, 48, 64 etc. to get an accurate
# count; the base number of required GPUs is the output of the `memory_req.py`
# script)
EXPECTED_AVAILABLE_GPUS = 32
EXPECTED_AVAILABLE_GPUS = 2048
# -- END OF GLOBALS --
Loading