Skip to content
Snippets Groups Projects

Compute/memory requirements scripts

Open Alexandru-Mihai GHERGHESCU requested to merge feature/scripts into main
1 file
+ 11
- 6
Compare changes
  • Side-by-side
  • Inline
@@ -13,15 +13,15 @@ setups = {
"1T": { "L": 128, "H": 160, "D": 25600, }
}
CURRENT = setups["65B"]
CURRENT = setups["284M"]
L = CURRENT["L"] # number of layers
H = CURRENT["H"] # number of heads
D = CURRENT["D"] # embedding dimension
BS = 1 # batch size
SEQ = 4096 # sequence length
TOKS = 32_000 # number of tokens in the vocab
BS = 32 # batch size
SEQ = 512 # sequence length
TOKS = 16_000 # number of tokens in the vocab
# -- END OF GLOBALS --
@@ -81,10 +81,15 @@ gradients = 1 * model_params
moms = 2
optimizer = moms * model_params
gigabytes_used = (
# 4 bytes (fp32) are assumed per value; for 2-byte (fp16) activations, adjust
# the percent value instead; ideally it would be 0.5, but that value cannot
# realistically be reached; real values could be between 0.2 and 0.3
activations_saved_percent = 0.25
gigabytes_used = 4 * (
model_params +
optimizer +
max(activations, gradients)
(1 - activations_saved_percent) * activations +
gradients
) / bytes_to_gigs
print(f"Model params: {model_params:,} - {model_params * 4 / bytes_to_gigs:,.2f}GB")
Loading