Skip to content
Snippets Groups Projects

Compute/memory requirements scripts

Open Alexandru-Mihai GHERGHESCU requested to merge feature/scripts into main
1 file
+ 10
7
Compare changes
  • Side-by-side
  • Inline
@@ -23,6 +23,13 @@ BS = 1 # batch size
@@ -23,6 +23,13 @@ BS = 1 # batch size
SEQ = 4096 # sequence length
SEQ = 4096 # sequence length
TOKS = 32_000 # number of tokens in the vocab
TOKS = 32_000 # number of tokens in the vocab
 
# optimizer momentums saved per parameter (Adam holds 2, SGD holds 1)
 
OPTIMIZER_MOMENTUMS = 2
 
 
# fp16 percent memory saved for activations; realistic values range between 15%
 
# and 30%; for full fp32 training, set to 0
 
FP16_SAVED_ACTIVATIONS = 0
 
# -- END OF GLOBALS --
# -- END OF GLOBALS --
bytes_to_gigs = 1_073_741_824 # bytes in a gigabyte
bytes_to_gigs = 1_073_741_824 # bytes in a gigabyte
@@ -77,18 +84,14 @@ activations = (BS * SEQ + # input embedding
@@ -77,18 +84,14 @@ activations = (BS * SEQ + # input embedding
# backpropagation gradients
# backpropagation gradients
gradients = 1 * model_params
gradients = 1 * model_params
# optimizer state (adam holds 2 momentums for each param, sgd 1)
# optimizer state
moms = 2
optimizer = OPTIMIZER_MOMENTUMS * model_params
optimizer = moms * model_params
# 4 bytes (fp32) used; for 2 bytes activations (fp16), adjust the percent value;
# 4 bytes (fp32) used; for 2 bytes activations (fp16), adjust the percent value;
# ideally, it should be 0.5, however that value can realistically not be
# reached; real values could be between 0.2 and 0.3
activations_saved_percent = 0.25
gigabytes_used = 4 * (
gigabytes_used = 4 * (
model_params +
model_params +
optimizer +
optimizer +
(1 - activations_saved_percent) * activations +
(1 - FP16_SAVED_ACTIVATIONS) * activations +
gradients
gradients
) / bytes_to_gigs
) / bytes_to_gigs
Loading