Error
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
Cell In[3], line 1
----> 1 llm = LLM(model="/workspace/llama-7b-orca-math-10k-bnb-qdora-vllm-test")
File /usr/local/lib/python3.11/dist-packages/vllm/entrypoints/llm.py:334, in LLM.__init__(self, model, runner, convert, tokenizer, tokenizer_mode, skip_tokenizer_init, trust_remote_code, allowed_local_media_path, allowed_media_domains, tensor_parallel_size, dtype, quantization, revision, tokenizer_revision, seed, gpu_memory_utilization, swap_space, cpu_offload_gb, enforce_eager, disable_custom_all_reduce, hf_token, hf_overrides, mm_processor_kwargs, pooler_config, structured_outputs_config, kv_cache_memory_bytes, compilation_config, logits_processors, **kwargs)
300 engine_args = EngineArgs(
301 model=model,
302 runner=runner,
(...)
329 **kwargs,
330 )
332 log_non_default_args(engine_args)
--> 334 self.llm_engine = LLMEngine.from_engine_args(
335 engine_args=engine_args, usage_context=UsageContext.LLM_CLASS
336 )
337 self.engine_class = type(self.llm_engine)
339 self.request_counter = Counter()
File /usr/local/lib/python3.11/dist-packages/vllm/v1/engine/llm_engine.py:183, in LLMEngine.from_engine_args(cls, engine_args, usage_context, stat_loggers, enable_multiprocessing)
180 enable_multiprocessing = True
182 # Create the LLMEngine.
--> 183 return cls(
184 vllm_config=vllm_config,
185 executor_class=executor_class,
186 log_stats=not engine_args.disable_log_stats,
187 usage_context=usage_context,
188 stat_loggers=stat_loggers,
189 multiprocess_mode=enable_multiprocessing,
190 )
File /usr/local/lib/python3.11/dist-packages/vllm/v1/engine/llm_engine.py:89, in LLMEngine.__init__(self, vllm_config, executor_class, log_stats, aggregate_engine_logging, usage_context, stat_loggers, mm_registry, use_cached_outputs, multiprocess_mode)
87 tokenizer = None
88 else:
---> 89 tokenizer = init_tokenizer_from_config(self.model_config)
91 self.input_processor = InputProcessor(self.vllm_config, tokenizer)
92 self.io_processor = get_io_processor(
93 self.vllm_config,
94 self.model_config.io_processor_plugin,
95 )
File /usr/local/lib/python3.11/dist-packages/vllm/tokenizers/registry.py:227, in init_tokenizer_from_config(model_config)
224 else:
225 assert_never(runner_type)
--> 227 return get_tokenizer(
228 model_config.tokenizer,
229 tokenizer_mode=model_config.tokenizer_mode,
230 trust_remote_code=model_config.trust_remote_code,
231 revision=model_config.tokenizer_revision,
232 truncation_side=truncation_side,
233 )
File /usr/local/lib/python3.11/dist-packages/vllm/tokenizers/registry.py:191, in get_tokenizer(tokenizer_name, tokenizer_mode, trust_remote_code, revision, download_dir, *args, **kwargs)
177 logger.warning_once(
178 "TokenizerRegistry now uses `tokenizer_mode` as the registry key "
179 "instead of `tokenizer_name`. "
(...)
186 tokenizer_mode,
187 )
189 tokenizer_mode = str(tokenizer_name)
--> 191 tokenizer = TokenizerRegistry.get_tokenizer(
192 tokenizer_mode,
193 *tokenizer_args,
194 **tokenizer_kwargs,
195 )
196 if not tokenizer.is_fast:
197 logger.warning(
198 "Using a slow tokenizer. This might cause a significant "
199 "slowdown. Consider using a fast tokenizer instead."
200 )
File /usr/local/lib/python3.11/dist-packages/vllm/tokenizers/registry.py:86, in TokenizerRegistry.get_tokenizer(tokenizer_mode, *args, **kwargs)
84 item = TokenizerRegistry.REGISTRY[tokenizer_mode]
85 if isinstance(item, type):
---> 86 return item.from_pretrained(*args, **kwargs)
88 module, class_name = item
89 logger.debug_once(f"Loading {class_name} for {tokenizer_mode=!r}")
File /usr/local/lib/python3.11/dist-packages/vllm/tokenizers/hf.py:84, in HfTokenizer.from_pretrained(cls, path_or_repo_id, trust_remote_code, revision, download_dir, *args, **kwargs)
73 @classmethod
74 def from_pretrained(
75 cls,
(...)
81 **kwargs,
82 ) -> "TokenizerLike":
83 try:
---> 84 tokenizer = AutoTokenizer.from_pretrained(
85 path_or_repo_id,
86 *args,
87 trust_remote_code=trust_remote_code,
88 revision=revision,
89 cache_dir=download_dir,
90 **kwargs,
91 )
92 except ValueError as e:
93 # If the error pertains to the tokenizer class not existing or not
94 # currently being imported,
95 # suggest using the --trust-remote-code flag.
96 if not trust_remote_code and (
97 "does not exist or is not currently imported." in str(e)
98 or "requires you to execute the tokenizer file" in str(e)
99 ):
File /usr/local/lib/python3.11/dist-packages/transformers/models/auto/tokenization_auto.py:1175, in AutoTokenizer.from_pretrained(cls, pretrained_model_name_or_path, *inputs, **kwargs)
1172 tokenizer_class_py, tokenizer_class_fast = TOKENIZER_MAPPING[type(config)]
1174 if tokenizer_class_fast and (use_fast or tokenizer_class_py is None):
-> 1175 return tokenizer_class_fast.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
1176 else:
1177 if tokenizer_class_py is not None:
File /usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2113, in PreTrainedTokenizerBase.from_pretrained(cls, pretrained_model_name_or_path, cache_dir, force_download, local_files_only, token, revision, trust_remote_code, *init_inputs, **kwargs)
2110 else:
2111 logger.info(f"loading file {file_path} from cache at {resolved_vocab_files[file_id]}")
-> 2113 return cls._from_pretrained(
2114 resolved_vocab_files,
2115 pretrained_model_name_or_path,
2116 init_configuration,
2117 *init_inputs,
2118 token=token,
2119 cache_dir=cache_dir,
2120 local_files_only=local_files_only,
2121 _commit_hash=commit_hash,
2122 _is_local=is_local,
2123 trust_remote_code=trust_remote_code,
2124 **kwargs,
2125 )
File /usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2151, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2148 # If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be
2149 # loaded directly from the GGUF file.
2150 if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None and not gguf_file:
-> 2151 slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
2152 copy.deepcopy(resolved_vocab_files),
2153 pretrained_model_name_or_path,
2154 copy.deepcopy(init_configuration),
2155 *init_inputs,
2156 token=token,
2157 cache_dir=cache_dir,
2158 local_files_only=local_files_only,
2159 _commit_hash=_commit_hash,
2160 **(copy.deepcopy(kwargs)),
2161 )
2162 else:
2163 slow_tokenizer = None
File /usr/local/lib/python3.11/dist-packages/transformers/tokenization_utils_base.py:2359, in PreTrainedTokenizerBase._from_pretrained(cls, resolved_vocab_files, pretrained_model_name_or_path, init_configuration, token, cache_dir, local_files_only, _commit_hash, _is_local, trust_remote_code, *init_inputs, **kwargs)
2357 # Instantiate the tokenizer.
2358 try:
-> 2359 tokenizer = cls(*init_inputs, **init_kwargs)
2360 except import_protobuf_decode_error():
2361 logger.info(
2362 "Unable to load tokenizer model from SPM, loading from TikToken will be attempted instead."
2363 "(Google protobuf error: Tried to load SPM model with non-SPM vocab file).",
2364 )
File /usr/local/lib/python3.11/dist-packages/transformers/models/llama/tokenization_llama.py:171, in LlamaTokenizer.__init__(self, vocab_file, unk_token, bos_token, eos_token, pad_token, sp_model_kwargs, add_bos_token, add_eos_token, clean_up_tokenization_spaces, use_default_system_prompt, spaces_between_special_tokens, legacy, add_prefix_space, **kwargs)
169 self.add_eos_token = add_eos_token
170 self.use_default_system_prompt = use_default_system_prompt
--> 171 self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
172 self.add_prefix_space = add_prefix_space
174 super().__init__(
175 bos_token=bos_token,
176 eos_token=eos_token,
(...)
187 **kwargs,
188 )
File /usr/local/lib/python3.11/dist-packages/transformers/models/llama/tokenization_llama.py:198, in LlamaTokenizer.get_spm_processor(self, from_slow)
196 tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
197 if self.legacy or from_slow: # no dependency on protobuf
--> 198 tokenizer.Load(self.vocab_file)
199 return tokenizer
201 with open(self.vocab_file, "rb") as f:
File /usr/local/lib/python3.11/dist-packages/sentencepiece/__init__.py:961, in SentencePieceProcessor.Load(self, model_file, model_proto)
959 if model_proto:
960 return self.LoadFromSerializedProto(model_proto)
--> 961 return self.LoadFromFile(model_file)
File /usr/local/lib/python3.11/dist-packages/sentencepiece/__init__.py:316, in SentencePieceProcessor.LoadFromFile(self, arg)
315 def LoadFromFile(self, arg):
--> 316 return _sentencepiece.SentencePieceProcessor_LoadFromFile(self, arg)
TypeError: not a string
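
The failure is at the very bottom of the stack: `SentencePieceProcessor.LoadFromFile` raises `TypeError: not a string` because `LlamaTokenizer` was constructed with `vocab_file=None`, which in practice means the SentencePiece file (`tokenizer.model`) is missing from the exported checkpoint directory. Since `LLM.__init__` accepts a separate `tokenizer` argument (visible in the signature above), one workaround is to borrow the tokenizer from the base checkpoint. A minimal sketch, assuming the adapter was trained from `meta-llama/Llama-2-7b-hf` (a hypothetical choice; substitute your actual base model):

```python
from vllm import LLM

# Workaround sketch: load the weights from the exported directory but take
# the tokenizer from the base checkpoint, since tokenizer.model is missing
# from the export. The base-model ID below is an assumption; use whichever
# checkpoint the QDoRA adapter was actually trained on.
llm = LLM(
    model="/workspace/llama-7b-orca-math-10k-bnb-qdora-vllm-test",
    tokenizer="meta-llama/Llama-2-7b-hf",  # hypothetical base model
)
```

Alternatively, copying `tokenizer.model`, `tokenizer_config.json`, and `special_tokens_map.json` from the base model into the export directory makes the checkpoint self-contained, so `LLM(model=...)` works without the extra argument.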
This notebook contains the working code to run vLLM inference with the resulting model.
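
For reference, a minimal sketch of what that inference call looks like once the tokenizer issue above is resolved; the prompt and sampling settings here are illustrative only:

```python
from vllm import LLM, SamplingParams

llm = LLM(
    model="/workspace/llama-7b-orca-math-10k-bnb-qdora-vllm-test",
    tokenizer="meta-llama/Llama-2-7b-hf",  # hypothetical base model, see above
)

# Greedy decoding keeps math answers deterministic; tune as needed.
sampling_params = SamplingParams(temperature=0.0, max_tokens=256)

outputs = llm.generate(["What is 15 * 24?"], sampling_params)
print(outputs[0].outputs[0].text)
```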