Skip to content
Snippets Groups Projects

Compute/memory requirements scripts

Open Alexandru-Mihai GHERGHESCU requested to merge feature/scripts into main
1 file
+ 10
7
Compare changes
  • Side-by-side
  • Inline
 
from setups import setups
 
 
 
# setup (please see setups.py)
setup = setups["Gemma"]
model = setup["MODELS"]["2B"]

# per training setup
BS = 1  # batch size
SEQ = 256  # sequence length

# Fraction of activation memory saved by keeping activations in fp16;
# realistic values range between 0.15 and 0.30 (i.e. 15%-30%) -- the value
# is used as `1 - FP16_SAVED_ACTIVATIONS`, so it must be a fraction, not a
# percent. For full fp32 training, set to 0.
FP16_SAVED_ACTIVATIONS = 0

# -- END OF GLOBALS --
 
# per model (dictionary layout comes from setups.py)
L = model["L"]  # number of layers
H = model["H"]  # number of heads
D = model["D"]  # embedding dimension

# per architecture
INTERMEDIATE_MULTIPLIER = setup["INTERMEDIATE_MULTIPLIER"]
INTERMEDIATE_MATRICES = setup["INTERMEDIATE_MATRICES"]
VOCAB_SIZE = setup["VOCAB_SIZE"]

# feed forward layer size
FFN_DIM = INTERMEDIATE_MULTIPLIER * D

# parameter counts of each sub-module
embedding_layer = VOCAB_SIZE * D  # input embedding table
multi_head_attention_layer = 4 * D * D  # four D x D projection matrices
feed_forward_layer = INTERMEDIATE_MATRICES * D * FFN_DIM
norm_layer = D  # D parameters per norm layer
out_layer = VOCAB_SIZE * D  # output projection to vocabulary

# each of the L transformer layers holds attention + ffn + two norms;
# the model adds the embedding table, a final norm and the output head
params_per_layer = (
    multi_head_attention_layer
    + feed_forward_layer
    + 2 * norm_layer
)
model_params = embedding_layer + L * params_per_layer + norm_layer + out_layer
 
 
# activation memory: number of tensor elements kept around for the backward
# pass; each commented shape is the tensor a term accounts for. The explicit
# `\` continuations of the original were redundant inside the parentheses
# (implicit line joining applies) and have been removed.
activations = (
    # input embedding
    # NOTE(review): counted as BS * SEQ elements (token ids); the embedding
    # *output* would be BS * SEQ * D -- confirm this is intentional
    BS * SEQ +

    # attention layer + norm, per transformer layer
    L * (
        3 * BS * SEQ * D +          # (BS, SEQ, D)
        2 * BS * SEQ * 1 +          # (BS, SEQ, 1)
        4 * BS * SEQ * D +          # (BS * SEQ, D)
        4 * D * D +                 # (D, D)
        1 * BS * H * SEQ * D / H +  # (BS * H, SEQ, D / H)
        1 * BS * H * D / H * SEQ +  # (BS * H, D / H, SEQ)
        1 * BS * H * SEQ * SEQ +    # (BS, H, SEQ, SEQ)
        1 * BS * H * SEQ * SEQ +    # (BS * H, SEQ, SEQ)
        1 * BS * H * SEQ * D / H    # (BS * H, SEQ, D / H)
    ) +

    # ffn layer + norm, per transformer layer
    L * (
        3 * BS * SEQ * D +          # (BS, SEQ, D)
        2 * BS * SEQ * 1 +          # (BS, SEQ, 1)
        2 * BS * SEQ * D +          # (BS * SEQ, D)
        2 * D * FFN_DIM +           # (D, FFN_DIM)
        3 * BS * SEQ * FFN_DIM +    # (BS, SEQ, FFN_DIM)
        1 * BS * SEQ * FFN_DIM +    # (BS * SEQ, FFN_DIM)
        1 * FFN_DIM * D             # (FFN_DIM, D)
    ) +

    # output layer + norm
    1 * BS * SEQ * D +              # (BS * SEQ, D)
    3 * BS * SEQ * D +              # (BS, SEQ, D)
    2 * BS * SEQ * 1 +              # (BS, SEQ, 1)
    1 * D * VOCAB_SIZE              # (D, VOCAB_SIZE)
)
 
 
# backpropagation gradients: one per model parameter
gradients = 1 * model_params

# optimizer momentums saved per parameter (Adam holds 2, SGD holds 1)
OPTIMIZER_MOMENTUMS = 2

# optimizer state
optimizer = OPTIMIZER_MOMENTUMS * model_params

# bytes per stored element: 4 bytes (fp32) used throughout; for fp16
# activations, adjust FP16_SAVED_ACTIVATIONS instead
BYTES_PER_ELEMENT = 4

bytes_to_gigs = 1_073_741_824  # bytes in a gibibyte (2**30)

# total training memory: params + optimizer state + gradients in fp32, plus
# activations discounted by the fraction assumed to be saved via fp16
gigabytes_used = BYTES_PER_ELEMENT * (
    model_params +
    optimizer +
    (1 - FP16_SAVED_ACTIVATIONS) * activations +
    gradients
) / bytes_to_gigs

print(f"Model params: {model_params:,} - {model_params * BYTES_PER_ELEMENT / bytes_to_gigs:,.2f}GB")
print(f"Activations: {activations:,} - {activations * BYTES_PER_ELEMENT / bytes_to_gigs:,.2f}GB")
print(f"Backprop gradients: {gradients:,} - {gradients * BYTES_PER_ELEMENT / bytes_to_gigs:,.2f}GB")
print(f"GB total (no parallelism): {gigabytes_used:,.2f}GB")
print(f"GB total (+40% memory overhead for tensor and pipeline parallelism): {1.4 * gigabytes_used:,.2f}GB")
print(f"Minimum A100 (80GB) GPUs (with overhead): {1.4 * (gigabytes_used / 80):.1f}")
Loading