In the Kaggle notebook "Getting started with NLP for absolute beginners", just below the 17th line of code there is a note: "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained." I got stuck there. Could someone please help me understand how to make sure the word embeddings are fine-tuned?
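To check my understanding of that warning, here is a minimal sketch of what I *think* it is asking for: grow the embedding matrix to match the tokenizer after special tokens are added, and leave the embedding layer unfrozen so those rows get updated during fine-tuning. The `resize_token_embeddings` call, the `num_labels=1` head, and the unfrozen-embedding check are my assumptions, not something from the notebook itself; please correct me if this is the wrong idea.

```python
from transformers import AutoTokenizer, AutoModelForSequenceClassification

model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)
model = AutoModelForSequenceClassification.from_pretrained(model_nm, num_labels=1)

# If special tokens were added on top of the pretrained vocabulary, grow the
# embedding matrix to match the tokenizer; the new rows start out randomly
# initialised, which (I assume) is what the warning refers to.
model.resize_token_embeddings(len(tokz))

# As long as the embedding layer is left unfrozen (requires_grad=True), those
# new rows are updated during ordinary fine-tuning, e.g. with Trainer.
for p in model.get_input_embeddings().parameters():
    assert p.requires_grad
```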
I have assigned the model name 'microsoft/deberta-v3-small' to a variable model_nm, but I have not done any installation or training beyond that. Do I need to install or download this model first before using it?
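For reference, this is essentially everything I have run so far (reconstructed as a sketch; the venv only has transformers installed via pip):

```python
from transformers import AutoTokenizer

model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)  # this is the line that raises the ValueError below
```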
Below is the error stack trace
ValueError Traceback (most recent call last)
Cell In[41], line 1
----> 1 tokz = AutoTokenizer.from_pretrained(model_nm)
File ~/Desktop/code/ml/venv/lib/python3.10/site-packages/transformers/models/auto/tokenization_auto.py:676, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
674 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
675 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
--> 676 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
677 else:
678 if tokenizer_class_py is not None:
File ~/Desktop/code/ml/venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1804, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, *init_inputs, **kwargs)
1801 else:
1802 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 1804 return cls._from_pretrained(
1805 resolved_vocab_files,
1806 pretrained_model_name_or_path,
1807 init_configuration,
1808 *init_inputs,
1809 use_auth_token=use_auth_token,
1810 cache_dir=cache_dir,
1811 local_files_only=local_files_only,
1812 _commit_hash=commit_hash,
1813 **kwargs,
1814 )
File ~/Desktop/code/ml/venv/lib/python3.10/site-packages/transformers/tokenization_utils_base.py:1959, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, use_auth_token, cache_dir, local_files_only, _commit_hash, *init_inputs, **kwargs)
1957 # Instantiate tokenizer.
1958 try:
-> 1959 tokenizer = cls(*init_inputs, **init_kwargs)
1960 except OSError:
1961 raise OSError(
1962 "Unable to load vocabulary from file. "
1963 "Please check that the provided vocabulary is accessible and not corrupted."
1964 )
File ~/Desktop/code/ml/venv/lib/python3.10/site-packages/transformers/models/deberta_v2/tokenization_deberta_v2_fast.py:133, in DebertaV2TokenizerFast.__init__(self, vocab_file, tokenizer_file, do_lower_case, split_by_punct, bos_token, eos_token, unk_token, sep_token, pad_token, cls_token, mask_token, **kwargs)
118 def __init__(
119 self,
120 vocab_file=None,
(...)
131 **kwargs
132 ) -> None:
--> 133 super().__init__(
134 vocab_file,
135 tokenizer_file=tokenizer_file,
136 do_lower_case=do_lower_case,
137 bos_token=bos_token,
138 eos_token=eos_token,
139 unk_token=unk_token,
140 sep_token=sep_token,
141 pad_token=pad_token,
142 cls_token=cls_token,
143 mask_token=mask_token,
144 split_by_punct=split_by_punct,
145 **kwargs,
146 )
148 self.do_lower_case = do_lower_case
149 self.split_by_punct = split_by_punct
File ~/Desktop/code/ml/venv/lib/python3.10/site-packages/transformers/tokenization_utils_fast.py:120, in PreTrainedTokenizerFast.__init__(self, *args, **kwargs)
118 fast_tokenizer = convert_slow_tokenizer(slow_tokenizer)
119 else:
--> 120 raise ValueError(
121 "Couldn't instantiate the backend tokenizer from one of: \n"
122 "(1) a `tokenizers` library serialization file, \n"
123 "(2) a slow tokenizer instance to convert or \n"
124 "(3) an equivalent slow tokenizer class to instantiate and convert. \n"
125 "You need to have sentencepiece installed to convert a slow tokenizer to a fast one."
126 )
128 self._tokenizer = fast_tokenizer
130 if slow_tokenizer is not None:
ValueError: Couldn't instantiate the backend tokenizer from one of:
(1) a `tokenizers` library serialization file,
(2) a slow tokenizer instance to convert or
(3) an equivalent slow tokenizer class to instantiate and convert.
You need to have sentencepiece installed to convert a slow tokenizer to a fast one.
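Reading the last line of the traceback, my best guess is that the immediate fix is simply to install sentencepiece into the same venv and restart the kernel, roughly as sketched below, but I would appreciate confirmation that this is right and whether it relates to the embedding warning above.

```python
# My guess at the fix (please confirm): install the missing dependency into the
# same environment, e.g. from a notebook cell, then restart the kernel.
# !pip install sentencepiece

from transformers import AutoTokenizer

model_nm = 'microsoft/deberta-v3-small'
tokz = AutoTokenizer.from_pretrained(model_nm)  # expected to succeed once sentencepiece is importable
```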