Skip to content
Snippets Groups Projects

Switch to PyTorch Dataloader and HF datasets

Closed Vlad-Andrei BĂDOIU (78692) requested to merge vladb/py_dataloader into main
Files
6
@@ -64,6 +64,7 @@ def download_data(url: str, path: str) -> None:
pass
def tokenize_dataset(data, tokenizer):
"""Tokenizes a list of strings"""
start = time.time()
print("Tokenizing dataset...")
@@ -74,4 +75,9 @@ def tokenize_dataset(data, tokenizer):
print(f"Done. Took {time.time() - start:.2f}s.")
return data_tok
\ No newline at end of file
return data_tok
def flatten_tokenized(data, seq_len):
    """Concatenate a sequence of tensors along their last dimension.

    Args:
        data: Sequence of tensors, passed straight to ``torch.cat``.
        seq_len: Accepted for interface compatibility; this function does
            not read it.

    Returns:
        The single tensor produced by ``torch.cat(data, dim=-1)``.
    """
    # NOTE(review): seq_len is unused here -- confirm whether callers expect
    # the flattened result to be chunked or trimmed to this length.
    return torch.cat(data, dim=-1)
\ No newline at end of file
Loading