Skip to content
Snippets Groups Projects

Compute/memory requirements scripts

Open Alexandru-Mihai GHERGHESCU requested to merge feature/scripts into main
from setups import setups
# setup (please see setups.py for the available models and architectures)
setup = setups["Gemma"]
model = setup["MODELS"]["2B"]
# per training setup
BS = 1 # batch size
SEQ = 256 # sequence length
# fraction of activation memory saved by fp16 (used below as a multiplier
# `1 - FP16_SAVED_ACTIVATIONS`); realistic values range between 0.15 (15%)
# and 0.30 (30%); for full fp32 training, set to 0
FP16_SAVED_ACTIVATIONS = 0
# -- END OF GLOBALS --
# per model
L = model["L"] # number of layers
H = model["H"] # number of heads
D = model["D"] # embedding dimension
# per architecture
# ratio of the feed-forward hidden size to the embedding dimension
INTERMEDIATE_MULTIPLIER = setup["INTERMEDIATE_MULTIPLIER"]
# number of weight matrices in the FFN — presumably 2 for a plain MLP and 3
# for gated variants such as SwiGLU; confirm against setups.py
INTERMEDIATE_MATRICES = setup["INTERMEDIATE_MATRICES"]
VOCAB_SIZE = setup["VOCAB_SIZE"] # tokenizer vocabulary size
# feed forward layer size (hidden dimension of the FFN)
FFN_DIM = INTERMEDIATE_MULTIPLIER * D
# -- model parameter count --
# token embedding table: one D-dimensional vector per vocabulary entry
embedding_layer = VOCAB_SIZE * D
# attention: Q, K, V and output projections, four DxD weight matrices
multi_head_attention_layer = 4 * D * D
# FFN weight matrices (count set by the architecture, see INTERMEDIATE_MATRICES)
feed_forward_layer = INTERMEDIATE_MATRICES * D * FFN_DIM
# one scale vector per normalization layer
norm_layer = D
# final projection from embeddings back to vocabulary logits
out_layer = VOCAB_SIZE * D
# each of the L transformer blocks holds attention + FFN + two norms;
# the embedding table, a final norm and the output head sit outside the stack
params_per_block = (
    multi_head_attention_layer + feed_forward_layer + 2 * norm_layer
)
model_params = embedding_layer + L * params_per_block + norm_layer + out_layer
# -- activation element count (values stored for the backward pass) --
# Per-head dimension. Integer division keeps the whole count an int (the
# original `D / H` floated it, printing e.g. "1,234.0" and risking float
# rounding for very large counts); assumes D is divisible by H, which holds
# for standard transformer configs — TODO confirm for exotic setups.
HEAD_DIM = D // H
activations = (
    BS * SEQ +  # input token ids fed to the embedding
    # attention layer + norm, once per transformer block
    L * (
        3 * BS * SEQ * D +            # (BS, SEQ, D)
        2 * BS * SEQ * 1 +            # (BS, SEQ, 1)
        4 * BS * SEQ * D +            # (BS * SEQ, D)
        4 * D * D +                   # (D, D)
        1 * BS * H * SEQ * HEAD_DIM + # (BS * H, SEQ, D / H)
        1 * BS * H * HEAD_DIM * SEQ + # (BS * H, D / H, SEQ)
        1 * BS * H * SEQ * SEQ +      # (BS, H, SEQ, SEQ)
        1 * BS * H * SEQ * SEQ +      # (BS * H, SEQ, SEQ)
        1 * BS * H * SEQ * HEAD_DIM   # (BS * H, SEQ, D / H)
    ) +
    # ffn layer + norm, once per transformer block
    L * (
        3 * BS * SEQ * D +            # (BS, SEQ, D)
        2 * BS * SEQ * 1 +            # (BS, SEQ, 1)
        2 * BS * SEQ * D +            # (BS * SEQ, D)
        2 * D * FFN_DIM +             # (D, FFN_DIM)
        3 * BS * SEQ * FFN_DIM +      # (BS, SEQ, FFN_DIM)
        1 * BS * SEQ * FFN_DIM +      # (BS * SEQ, FFN_DIM)
        1 * FFN_DIM * D               # (FFN_DIM, D)
    ) +
    # output layer + final norm
    1 * BS * SEQ * D +                # (BS * SEQ, D)
    3 * BS * SEQ * D +                # (BS, SEQ, D)
    2 * BS * SEQ * 1 +                # (BS, SEQ, 1)
    1 * D * VOCAB_SIZE                # (D, VOCAB_SIZE)
)
# backpropagation gradients: one value stored per model parameter
gradients = 1 * model_params
# optimizer momentums saved per parameter (Adam holds 2, SGD holds 1)
OPTIMIZER_MOMENTUMS = 2
# optimizer state: momentum buffers, each the size of the full model
optimizer = OPTIMIZER_MOMENTUMS * model_params
bytes_to_gigs = 1_073_741_824 # bytes in a gibibyte (2**30) — the "GB" reported below
FP32_BYTES = 4 # size of one fp32 value

def _as_gigs(elements):
    # Convert an fp32 element count to the GB unit used in this report.
    return elements * FP32_BYTES / bytes_to_gigs

# 4 bytes (fp32) used; for 2 bytes activations (fp16), adjust the percent value;
gigabytes_used = FP32_BYTES * (
    model_params +
    optimizer +
    (1 - FP16_SAVED_ACTIVATIONS) * activations +
    gradients
) / bytes_to_gigs
print(f"Model params: {model_params:,} - {_as_gigs(model_params):,.2f}GB")
print(f"Activations: {activations:,} - {_as_gigs(activations):,.2f}GB")
print(f"Backprop gradients: {gradients:,} - {_as_gigs(gradients):,.2f}GB")
# optimizer state was counted in the total above but previously missing from
# the per-component breakdown; report it for consistency
print(f"Optimizer state: {optimizer:,} - {_as_gigs(optimizer):,.2f}GB")
print(f"GB total (no parallelism): {gigabytes_used:,.2f}GB")
print(f"GB total (+40% memory overhead for tensor and pipeline parallelism): {1.4 * gigabytes_used:,.2f}GB")
print(f"Minimum A100 (80GB) GPUs (with overhead): {1.4 * (gigabytes_used / 80):.1f}")
Loading