Skip to content
Snippets Groups Projects

Compute/memory requirements scripts

Open Alexandru-Mihai GHERGHESCU requested to merge feature/scripts into main
1 file
+ 10
7
Compare changes
  • Side-by-side
  • Inline
 
from setups import setups
 
 
 
# setup (please see setups.py)
setup = setups["Gemma"]
model = setup["MODELS"]["2B"]

# per training setup
BS = 1  # batch size
SEQ = 256  # sequence length

# Fraction of activation memory saved by keeping activations in fp16;
# realistic values range between 0.15 and 0.30 (i.e. 15%-30%) -- the value
# is used as `1 - FP16_SAVED_ACTIVATIONS`, so it must be a fraction, not a
# percent. For full fp32 training, set to 0.
FP16_SAVED_ACTIVATIONS = 0

# -- END OF GLOBALS --
 
# per model (dictionary layout comes from setups.py)
L = model["L"]  # number of layers
H = model["H"]  # number of heads
D = model["D"]  # embedding dimension

# per architecture
INTERMEDIATE_MULTIPLIER = setup["INTERMEDIATE_MULTIPLIER"]
INTERMEDIATE_MATRICES = setup["INTERMEDIATE_MATRICES"]
VOCAB_SIZE = setup["VOCAB_SIZE"]

# feed forward layer size
FFN_DIM = INTERMEDIATE_MULTIPLIER * D

# parameter counts of each sub-module
embedding_layer = VOCAB_SIZE * D  # input embedding table
multi_head_attention_layer = 4 * D * D  # four D x D projection matrices
feed_forward_layer = INTERMEDIATE_MATRICES * D * FFN_DIM
norm_layer = D  # D parameters per norm layer
out_layer = VOCAB_SIZE * D  # output projection to vocabulary

# each of the L transformer layers holds attention + ffn + two norms;
# the model adds the embedding table, a final norm and the output head
params_per_layer = (
    multi_head_attention_layer
    + feed_forward_layer
    + 2 * norm_layer
)
model_params = embedding_layer + L * params_per_layer + norm_layer + out_layer
 
 
# activation memory: number of tensor elements kept around for the backward
# pass; each commented shape is the tensor a term accounts for. The explicit
# `\` continuations of the original were redundant inside the parentheses
# (implicit line joining applies) and have been removed.
activations = (
    # input embedding
    # NOTE(review): counted as BS * SEQ elements (token ids); the embedding
    # *output* would be BS * SEQ * D -- confirm this is intentional
    BS * SEQ +

    # attention layer + norm, per transformer layer
    L * (
        3 * BS * SEQ * D +          # (BS, SEQ, D)
        2 * BS * SEQ * 1 +          # (BS, SEQ, 1)
        4 * BS * SEQ * D +          # (BS * SEQ, D)
        4 * D * D +                 # (D, D)
        1 * BS * H * SEQ * D / H +  # (BS * H, SEQ, D / H)
        1 * BS * H * D / H * SEQ +  # (BS * H, D / H, SEQ)
        1 * BS * H * SEQ * SEQ +    # (BS, H, SEQ, SEQ)
        1 * BS * H * SEQ * SEQ +    # (BS * H, SEQ, SEQ)
        1 * BS * H * SEQ * D / H    # (BS * H, SEQ, D / H)
    ) +

    # ffn layer + norm, per transformer layer
    L * (
        3 * BS * SEQ * D +          # (BS, SEQ, D)
        2 * BS * SEQ * 1 +          # (BS, SEQ, 1)
        2 * BS * SEQ * D +          # (BS * SEQ, D)
        2 * D * FFN_DIM +           # (D, FFN_DIM)
        3 * BS * SEQ * FFN_DIM +    # (BS, SEQ, FFN_DIM)
        1 * BS * SEQ * FFN_DIM +    # (BS * SEQ, FFN_DIM)
        1 * FFN_DIM * D             # (FFN_DIM, D)
    ) +

    # output layer + norm
    1 * BS * SEQ * D +              # (BS * SEQ, D)
    3 * BS * SEQ * D +              # (BS, SEQ, D)
    2 * BS * SEQ * 1 +              # (BS, SEQ, 1)
    1 * D * VOCAB_SIZE              # (D, VOCAB_SIZE)
)
 
 
# backpropagation gradients: one per model parameter
gradients = 1 * model_params

# optimizer momentums saved per parameter (Adam holds 2, SGD holds 1)
OPTIMIZER_MOMENTUMS = 2

# optimizer state
optimizer = OPTIMIZER_MOMENTUMS * model_params

# bytes per stored element: 4 bytes (fp32) used throughout; for fp16
# activations, adjust FP16_SAVED_ACTIVATIONS instead
BYTES_PER_ELEMENT = 4

bytes_to_gigs = 1_073_741_824  # bytes in a gibibyte (2**30)

# total training memory: params + optimizer state + gradients in fp32, plus
# activations discounted by the fraction assumed to be saved via fp16
gigabytes_used = BYTES_PER_ELEMENT * (
    model_params +
    optimizer +
    (1 - FP16_SAVED_ACTIVATIONS) * activations +
    gradients
) / bytes_to_gigs

print(f"Model params: {model_params:,} - {model_params * BYTES_PER_ELEMENT / bytes_to_gigs:,.2f}GB")
print(f"Activations: {activations:,} - {activations * BYTES_PER_ELEMENT / bytes_to_gigs:,.2f}GB")
print(f"Backprop gradients: {gradients:,} - {gradients * BYTES_PER_ELEMENT / bytes_to_gigs:,.2f}GB")
print(f"GB total (no parallelism): {gigabytes_used:,.2f}GB")
print(f"GB total (+40% memory overhead for tensor and pipeline parallelism): {1.4 * gigabytes_used:,.2f}GB")
print(f"Minimum A100 (80GB) GPUs (with overhead): {1.4 * (gigabytes_used / 80):.1f}")
Loading