diff --git a/optimus/optimus16K-wikitext103.model b/optimus/optimus16K-wikitext103.model new file mode 100644 index 0000000000000000000000000000000000000000..dfb4afc9fa857a1c2b860bb4418e943e99098148 Binary files /dev/null and b/optimus/optimus16K-wikitext103.model differ diff --git a/optimus/optimus32K-wikitext103.model b/optimus/optimus32K-wikitext103.model deleted file mode 100644 index a13ec4baf18de12b03b1c3ce39b696ab465482a1..0000000000000000000000000000000000000000 Binary files a/optimus/optimus32K-wikitext103.model and /dev/null differ diff --git a/optimus/optimus60K-wikitext103.model b/optimus/optimus60K-wikitext103.model index 16d7047c07944c152634fbfe8920c001b5621253..bde1bf13e7a6742e70cf5ac5dc1458141c19168e 100644 Binary files a/optimus/optimus60K-wikitext103.model and b/optimus/optimus60K-wikitext103.model differ diff --git a/optimus/tokenizer.py b/optimus/tokenizer.py index 2df2ad1afd2302605587f9bda46b39d38be39a3f..170d4a5caf946894809fbf4a27a9e2f8b5dfe8ba 100644 --- a/optimus/tokenizer.py +++ b/optimus/tokenizer.py @@ -11,7 +11,7 @@ class Tokenizer(): SentencePiece tokenizer. Args: - model (str): Path of the tokenizer model. Defaults to + model_path (str): Path of the tokenizer model. Defaults to 'optimus.model'. """ @@ -58,7 +58,10 @@ class Tokenizer(): vocab_size=vocab_size, max_sentence_length=4096, input_sentence_size=1000000, - shuffle_input_sentence=True) + shuffle_input_sentence=True, + remove_extra_whitespaces=False, + normalization_rule_name='identity', + model_type='unigram') def encode(self, input: str, bos: bool, eos: bool) -> List[int]: """ @@ -118,15 +121,15 @@ if __name__=='__main__': train = False - if train: + if train is True: filename = './wikitext-103/wiki.train.tokens' with open(filename, 'r') as f: lines = f.readlines() - print(type(iter(lines))) Tokenizer.train(iter(lines), vocab_size=16000) else: - tok = Tokenizer(root='..') + tok = Tokenizer(model_path='./optimus.model') + print(len(tok)) print(tok.encode("this is some sunny day", False, True)) print(tok.encode_as_pieces("this is some sunny day")) print(tok.decode([1, 77, 34, 122, 9, 5, 10181, 206, 2]))