Source code for recwizard.modules.kbrd.tokenizer_nltk

from typing import List

import nltk
from tokenizers import Tokenizer, NormalizedString, PreTokenizedString
from tokenizers.models import WordLevel
from tokenizers.normalizers import Lowercase
from tokenizers.pre_tokenizers import PreTokenizer
from transformers import PreTrainedTokenizerFast


class NLTKTokenizer:
    def __init__(self, language="english"):
        """Initialize the NLTK tokenizer.

        Args:
            language (str): language to use for the sentence tokenizer.
        """
        # nltk.download('punkt')
        st_path = f"tokenizers/punkt/{language}.pickle"
        try:
            self.tokenizer = nltk.data.load(st_path)
        except LookupError:
            nltk.download("punkt")
            self.tokenizer = nltk.data.load(st_path)
        self.word_tokenizer = nltk.tokenize.treebank.TreebankWordTokenizer()

    def word_tokenize(
        self, i: int, normalized_string: NormalizedString
    ) -> List[NormalizedString]:
        """Tokenize a string into words."""
        return [
            normalized_string[s:t]
            for s, t in self.word_tokenizer.span_tokenize(str(normalized_string))
        ]

    def nltk_split(
        self, i: int, normalized_string: NormalizedString
    ) -> List[NormalizedString]:
        """Split a string into sentences, then into words, using NLTK."""
        sentences = [
            normalized_string[s:t]
            for s, t in self.tokenizer.span_tokenize(str(normalized_string))
        ]
        tokenized = []
        for sentence in sentences:
            tokenized += self.word_tokenize(i, sentence)
        return tokenized

    def pre_tokenize(self, pretok: PreTokenizedString):
        """Pre-tokenize a string into word-level pieces using NLTK."""
        # Split the PreTokenizedString in place using `self.nltk_split`
        pretok.split(self.nltk_split)
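For reference, the custom pre-tokenizer can be exercised on its own by wrapping an instance with `PreTokenizer.custom`; a minimal sketch (the example sentence and sample output in the comment are illustrative only):

pre_tok = PreTokenizer.custom(NLTKTokenizer())
# Yields (token, (start, end)) pairs over the raw text, e.g.
# [("Hello", (0, 5)), ("there", (6, 11)), (".", (11, 12)), ...]
print(pre_tok.pre_tokenize_str("Hello there. How are you?"))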
# Module-level cache of constructed tokenizers, keyed by name.
tokenizers = {}
def get_tokenizer(name="kbrd"):
    """Return a previously built tokenizer from the cache."""
    return tokenizers[name]
def KBRDWordTokenizer(vocab, name="kbrd"):
    """Build a word-level tokenizer for language models from the given vocabulary.

    Args:
        vocab (List[str]): list of words.
        name (str): name of the tokenizer, used to cache it.

    Returns:
        PreTrainedTokenizerFast
    """
    if tokenizers.get(name):
        return tokenizers[name]
    word2id = {word: i for i, word in enumerate(vocab)}
    tokenizer = Tokenizer(WordLevel(unk_token="__unk__", vocab=word2id))
    tokenizer.normalizer = Lowercase()
    wrapped_tokenizer = PreTrainedTokenizerFast(
        tokenizer_object=tokenizer,
        unk_token="__unk__",
        pad_token="__null__",
        bos_token="__start__",
        eos_token="__end__",
    )
    wrapped_tokenizer.backend_tokenizer.pre_tokenizer = PreTokenizer.custom(
        NLTKTokenizer()
    )
    tokenizers[name] = wrapped_tokenizer
    return wrapped_tokenizer
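A minimal usage sketch, assuming a toy vocabulary (the words and special-token order below are illustrative only; the real KBRD vocabulary is loaded elsewhere):

vocab = ["__null__", "__start__", "__end__", "__unk__", "hello", "there", "."]
tok = KBRDWordTokenizer(vocab)
# Input is lowercased, split into words with NLTK, and mapped to vocabulary ids;
# out-of-vocabulary words fall back to the "__unk__" id.
print(tok("Hello there.")["input_ids"])
# Subsequent lookups reuse the cached instance:
assert get_tokenizer("kbrd") is tok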