Skip to content
Snippets Groups Projects

Compute/memory requirements scripts

Open Alexandru-Mihai GHERGHESCU requested to merge feature/scripts into main
1 file
+ 10
7
Compare changes
  • Side-by-side
  • Inline
@@ -23,6 +23,13 @@ BS = 1 # batch size
@@ -23,6 +23,13 @@ BS = 1 # batch size
SEQ = 4096 # sequence length
SEQ = 4096 # sequence length
TOKS = 32_000 # number of tokens in the vocab
TOKS = 32_000 # number of tokens in the vocab
 
# optimizer momentums saved per parameter (Adam holds 2, SGD holds 1)
 
OPTIMIZER_MOMENTUMS = 2
 
 
# fp16 percent memory saved for activations; realistic values range between 15%
 
# and 30%; for full fp32 training, set to 0
 
FP16_SAVED_ACTIVATIONS = 0
 
# -- END OF GLOBALS --
# -- END OF GLOBALS --
bytes_to_gigs = 1_073_741_824 # bytes in a gigabyte
bytes_to_gigs = 1_073_741_824 # bytes in a gigabyte
@@ -77,18 +84,14 @@ activations = (BS * SEQ + # input embedding
@@ -77,18 +84,14 @@ activations = (BS * SEQ + # input embedding
# backpropagation gradients
# backpropagation gradients
gradients = 1 * model_params
gradients = 1 * model_params
# optimizer state (adam holds 2 momentums for each param, sgd 1)
# optimizer state
moms = 2
optimizer = OPTIMIZER_MOMENTUMS * model_params
optimizer = moms * model_params
# 4 bytes (fp32) used; for 2 bytes activations (fp16), adjust the percent value;
# 4 bytes (fp32) used; for 2 bytes activations (fp16), adjust the percent value;
# ideally, it should be 0.5, however that value can realistically not be
# reached; real values could be between 0.2 and 0.3
activations_saved_percent = 0.25
gigabytes_used = 4 * (
gigabytes_used = 4 * (
model_params +
model_params +
optimizer +
optimizer +
(1 - activations_saved_percent) * activations +
(1 - FP16_SAVED_ACTIVATIONS) * activations +
gradients
gradients
) / bytes_to_gigs
) / bytes_to_gigs
Loading