Monitoring
The `model-settings.json` below configures the local runtime to serve `microsoft/Phi-3.5-mini-instruct` as a `chat.completions` model on the vLLM backend:

```bash
!cat models/local-chat-completions/model-settings.json
```

```json
{
  "name": "local-chat-completions",
  "implementation": "mlserver_llm_local.runtime.Local",
  "parameters": {
    "extra": {
      "backend": "vllm",
      "config": {
        "model_type": "chat.completions",
        "model_settings": {
          "model": "microsoft/Phi-3.5-mini-instruct",
          "tensor_parallel_size": 4,
          "dtype": "float16",
          "gpu_memory_utilization": 0.8,
          "max_model_len": 4096,
          "default_generate_kwargs": {
            "max_tokens": 1024
          }
        }
      }
    }
  }
}
```
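Several of these settings map directly onto vLLM engine arguments: `tensor_parallel_size: 4` shards the model across four GPUs, `gpu_memory_utilization: 0.8` lets the engine pre-allocate roughly 80% of each GPU's memory, and `max_model_len: 4096` caps the context window. `default_generate_kwargs` supplies generation defaults (here, at most 1024 output tokens) that individual requests can override.

Once the server is running (for example via `mlserver start models/local-chat-completions`), the model can be queried over the Open Inference (V2) protocol. The sketch below is illustrative only: it assumes MLServer's default HTTP port (8080) and a `role`/`content` input layout for `chat.completions` requests, so check the runtime's request schema before relying on either.

```python
# Minimal sketch: send one chat turn to the deployed model over the
# V2 inference protocol. The input names ("role", "content") and the
# port are assumptions, not confirmed by this page.
import requests

inference_request = {
    "inputs": [
        {
            "name": "role",
            "shape": [1],
            "datatype": "BYTES",
            "data": ["user"],
        },
        {
            "name": "content",
            "shape": [1],
            "datatype": "BYTES",
            "data": ["Summarise the V2 inference protocol in one sentence."],
        },
    ]
}

response = requests.post(
    "http://localhost:8080/v2/models/local-chat-completions/infer",
    json=inference_request,
    timeout=60,
)
response.raise_for_status()

# The generated text comes back in the response's "outputs" field.
print(response.json()["outputs"])
```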