Commit c30547e0 authored by Vlad-Andrei BĂDOIU (78692)

Introduce automatic sanity check for training

parent 0faac554
Merge request !20 (Draft): Introduce automatic sanity check for training
Pipeline #55061 passed
@@ -5,5 +5,7 @@ useGPU:
   image: pytorch/pytorch:latest
   stage: test_gpu
   script:
-    - echo "Check whether we have enabled our GPU or not."
+    - echo "Check whether GPU is enabled."
+    - pip install -r requirements.txt
     - nvidia-smi
+    - python3 training.py --tokenizer_path optimus16K-wikitext103.model --dataset TEST_DS
import os
from enum import Enum
from functools import partial

from .tinystories import TinyStoriesDataset
from .wikitext103 import WikiText103Dataset
from .line_ds import LineDataset


class Datasets(Enum):
    WIKITEXT103 = WikiText103Dataset
    TINYSTORIES = TinyStoriesDataset
    TEST_DS = partial(LineDataset, path=os.getenv("TEST_DATASET_PATH", default='tests/fixtures/sample.txt'))

    def __call__(self, **kwargs):
        return self.value(**kwargs)
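For context, a minimal sketch (not part of the commit) of how this enum resolves a dataset by name. Note that TEST_DATASET_PATH is read once, when the enum is defined, so it has to be set before the module is first imported:

import os

# Optional: point TEST_DS at a custom fixture. The default path is read
# at import time, so set the variable before importing the module.
os.environ["TEST_DATASET_PATH"] = "tests/fixtures/sample.txt"

from optimus.datasets import Datasets

# Lookup by name, as training.py does below; calling the member forwards
# the kwargs to the underlying dataset class through __call__.
train_ds = Datasets["TEST_DS"](split="train")
print(f"{len(train_ds)} lines loaded")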
import os

from torch.utils.data import Dataset


class LineDataset(Dataset):

    def __init__(self, path: str, root: str | None = None, split: str = 'train'):
        """
        Line-based dataset. Note that this dataset is expected to be used
        only for testing.

        Args:
            path (str): Path to a text file to build the dataset from.
            root (str | None): Not used.
            split (str): Split to be returned. Can be 'train', 'test' or
                'valid'.
        """
        super().__init__()

        assert split in ('train', 'test', 'valid'), \
            "Split must be 'train', 'test' or 'valid'! Aborting..."

        if not os.path.isfile(path):
            raise ValueError(f"{path} not found")

        with open(path, encoding="utf-8") as f:
            self.lines = [line for line in f.read().splitlines()
                          if len(line) > 0 and not line.isspace()]

    def __len__(self) -> int:
        """
        Return the length of the dataset, i.e. the total number of non-empty
        lines in the text file passed as an argument.
        """
        return len(self.lines)

    def __getitem__(self, idx: int) -> str:
        """
        Return the line indexed by idx.

        Args:
            idx (int): The index of the line in the dataset.
        """
        return self.lines[idx]
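A quick, self-contained check of LineDataset (the module path optimus.datasets.line_ds is inferred from the relative import above; the fixture contents are made up):

import tempfile

from optimus.datasets.line_ds import LineDataset

# Build a throwaway fixture; blank and whitespace-only lines should be
# filtered out when the dataset is constructed.
with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
    f.write("first line\n\nsecond line\n   \nthird line\n")
    fixture = f.name

ds = LineDataset(path=fixture, split="train")
assert len(ds) == 3           # empty/whitespace lines were skipped
assert ds[0] == "first line"  # items are returned as raw strings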
@@ -2,13 +2,13 @@ import fire
 import torch
 from torch import nn

-from optimus.datasets import WikiText103Dataset
+from optimus.datasets import Datasets
 from optimus.tokenizers import SentencePieceTokenizer
 from optimus.dataloader import OptimusDataLoader
 from optimus.models import OptimusTransformer
 from optimus.trainer import Trainer


 def main(batch_size: int = 8,
          grad_acc_steps: int = 1,
          seq_len: int = 512,
@@ -21,7 +21,8 @@ def main(batch_size: int = 8,
          n_layers: int = 6,
          n_heads: int = 8,
          dropout: float = 0.0,
-         use_fp16: bool = True):
+         use_fp16: bool = True,
+         dataset: str = 'WIKITEXT103'):
     """
     Run the main training loop for the model.
@@ -42,6 +43,8 @@ def main(batch_size: int = 8,
         dropout (float): Dropout to use for the model.
         use_fp16 (bool): Whether to train using 16-bit floating-point
             precision.
+        dataset (str): Name of the dataset to use from the available options.
+            Options can be seen in optimus/datasets/__init__.py.
     """
@@ -59,14 +62,15 @@ def main(batch_size: int = 8,
           f"\t- model attention heads: {n_heads}\n"
           f"\t- model dropout: {dropout}\n"
           f"\t- 16-bit floating-point training (fp16): {use_fp16}\n"
+          f"\t- training on dataset: {dataset}\n"
           f"Please see '--help' if you want to change these settings")

     # load tokenizer
     tok = SentencePieceTokenizer(model_path=tokenizer_path)

     # load dataset splits
-    train_ds = WikiText103Dataset(split='train')
-    test_ds = WikiText103Dataset(split='test')
+    train_ds = Datasets[dataset](split='train')
+    test_ds = Datasets[dataset](split='test')

     print(f"Number of examples in training set: {len(train_ds)}")
     print(f"Number of examples in testing set: {len(test_ds)}")
......
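Locally, the same sanity check the CI job runs can be kicked off either with the command line above or directly from Python. A sketch, assuming training.py is importable from the working directory, that the tokenizer model from the CI job exists there, and that the remaining parameters keep their defaults:

from training import main

# Mirrors the CI invocation:
#   python3 training.py --tokenizer_path optimus16K-wikitext103.model --dataset TEST_DS
main(tokenizer_path="optimus16K-wikitext103.model",  # parameter name taken from the CI flag
     dataset="TEST_DS")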