Alexandru-Mihai GHERGHESCU
--- a/scripts/memory_compute_estimations/memory_req.py

+ 11

− 6
+++ b/scripts/memory_compute_estimations/memory_req.py

+ 11

− 6
 @@ -13,15 +13,15 @@ setups = {
    "1T": { "L": 128, "H": 160, "D": 25600, }
 }

-CURRENT = setups["65B"]
+CURRENT = setups["284M"]

 L = CURRENT["L"] # number of layers
 H = CURRENT["H"] # number of heads
 D = CURRENT["D"] # embedding dimension

-BS = 1 # batch size
-SEQ = 4096 # sequence length
-TOKS = 32_000 # number of tokens in the vocab
+BS = 32 # batch size
+SEQ = 512 # sequence length
+TOKS = 16_000 # number of tokens in the vocab

 # -- END OF GLOBALS --

 @@ -81,10 +81,15 @@ gradients = 1 * model_params
 moms = 2
 optimizer = moms * model_params

-gigabytes_used = (
+# Every value is counted at 4 bytes (fp32). To model activations stored in
+# 2 bytes (fp16), set the fraction of activation memory saved below; ideally
+# it would be 0.5, but realistic values are only between 0.2 and 0.3.
+activations_saved_percent = 0.25
+gigabytes_used = 4 * (
    model_params +
    optimizer +
-    max(activations, gradients)
+    (1 - activations_saved_percent) * activations +
+    gradients
 ) / bytes_to_gigs

 print(f"Model params: {model_params:,} - {model_params * 4 / bytes_to_gigs:,.2f}GB")