Commit c30547e0 authored by Vlad-Andrei BĂDOIU (78692)

Introduce automatic sanity check for training

parent 0faac554
Merge request !20 (Draft): Introduce automatic sanity check for training
Pipeline #55061 passed
@@ -5,5 +5,7 @@ useGPU:
   image: pytorch/pytorch:latest
   stage: test_gpu
   script:
-    - echo "Check whether we have enabled our GPU or not."
+    - echo "Check whether GPU is enabled."
+    - pip install -r requirements.txt
     - nvidia-smi
+    - python3 training.py --tokenizer_path optimus16K-wikitext103.model --dataset TEST_DS
import os
from enum import Enum
from functools import partial

from .tinystories import TinyStoriesDataset
from .wikitext103 import WikiText103Dataset
from .line_ds import LineDataset


class Datasets(Enum):
    WIKITEXT103 = WikiText103Dataset
    TINYSTORIES = TinyStoriesDataset
    TEST_DS = partial(LineDataset, path=os.getenv("TEST_DATASET_PATH", default='tests/fixtures/sample.txt'))

    def __call__(self, **kwargs):
        return self.value(**kwargs)
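For context, a minimal sketch (not part of the commit) of how this enum resolves a dataset by name. Note that TEST_DATASET_PATH is read once, when the enum is defined, so it has to be set before the module is first imported:

import os

# Optional: point TEST_DS at a custom fixture. The default path is read
# at import time, so set the variable before importing the module.
os.environ["TEST_DATASET_PATH"] = "tests/fixtures/sample.txt"

from optimus.datasets import Datasets

# Lookup by name, as training.py does below; calling the member forwards
# the kwargs to the underlying dataset class through __call__.
train_ds = Datasets["TEST_DS"](split="train")
print(f"{len(train_ds)} lines loaded")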
import os

from torch.utils.data import Dataset


class LineDataset(Dataset):

    def __init__(self, path: str, root: str | None = None, split: str = 'train'):
        """
        Line-based dataset. Note that this dataset is expected to be used
        only for testing.

        Args:
            path (str): Path to a text file to build the dataset from.
            root (str | None): Not used.
            split (str): Split to be returned. Can be 'train', 'test' or
                'valid'.
        """
        super().__init__()

        assert split in ('train', 'test', 'valid'), \
            "Split must be 'train', 'test' or 'valid'! Aborting..."

        if not os.path.isfile(path):
            raise ValueError(f"{path} not found")

        with open(path, encoding="utf-8") as f:
            self.lines = [line for line in f.read().splitlines()
                          if len(line) > 0 and not line.isspace()]

    def __len__(self) -> int:
        """
        Return the length of the dataset, i.e. the total number of non-empty
        lines in the text file passed as an argument.
        """
        return len(self.lines)

    def __getitem__(self, idx: int) -> str:
        """
        Return the line indexed by idx.

        Args:
            idx (int): The index of the line in the dataset.
        """
        return self.lines[idx]
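A quick, self-contained check of LineDataset (the module path optimus.datasets.line_ds is inferred from the relative import above; the fixture contents are made up):

import tempfile

from optimus.datasets.line_ds import LineDataset

# Build a throwaway fixture; blank and whitespace-only lines should be
# filtered out when the dataset is constructed.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("first line\n\nsecond line\n   \nthird line\n")
    fixture = f.name

ds = LineDataset(path=fixture, split="train")
assert len(ds) == 3           # empty/whitespace lines were skipped
assert ds[0] == "first line"  # items are returned as raw strings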
@@ -2,13 +2,13 @@ import fire
 import torch
 from torch import nn

-from optimus.datasets import WikiText103Dataset
+from optimus.datasets import Datasets
 from optimus.tokenizers import SentencePieceTokenizer
 from optimus.dataloader import OptimusDataLoader
 from optimus.models import OptimusTransformer
 from optimus.trainer import Trainer


 def main(batch_size: int = 8,
          grad_acc_steps: int = 1,
          seq_len: int = 512,
@@ -21,7 +21,8 @@ def main(batch_size: int = 8,
          n_layers: int = 6,
          n_heads: int = 8,
          dropout: float = 0.0,
-         use_fp16: bool = True):
+         use_fp16: bool = True,
+         dataset: str = 'WIKITEXT103'):
     """
     Run the main training loop for the model.
@@ -42,6 +43,8 @@ def main(batch_size: int = 8,
         dropout (float): Dropout to use for the model.
         use_fp16 (bool): Whether to train using 16-bit floating-point
             precision.
+        dataset (str): Name of the dataset to use from the available options.
+            Options can be seen in optimus/datasets/__init__.py.
     """
@@ -59,14 +62,15 @@ def main(batch_size: int = 8,
           f"\t- model attention heads: {n_heads}\n"
           f"\t- model dropout: {dropout}\n"
           f"\t- 16-bit floating-point training (fp16): {use_fp16}\n"
+          f"\t- training on dataset: {dataset}\n"
           f"Please see '--help' if you want to change these settings")

     # load tokenizer
     tok = SentencePieceTokenizer(model_path=tokenizer_path)

     # load dataset splits
-    train_ds = WikiText103Dataset(split='train')
-    test_ds = WikiText103Dataset(split='test')
+    train_ds = Datasets[dataset](split='train')
+    test_ds = Datasets[dataset](split='test')

     print(f"Number of examples in training set: {len(train_ds)}")
     print(f"Number of examples in testing set: {len(test_ds)}")
......
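Locally, the same sanity check the CI job runs can be kicked off either with the command line above or directly from Python. A sketch, assuming training.py is importable from the working directory, that the tokenizer model from the CI job exists there, and that the remaining parameters keep their defaults:

from training import main

# Mirrors the CI invocation:
#   python3 training.py --tokenizer_path optimus16K-wikitext103.model --dataset TEST_DS
main(tokenizer_path="optimus16K-wikitext103.model",  # parameter name taken from the CI flag
     dataset="TEST_DS")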