Multimodel serving
!cat models/local-gpt2-transformers/model-settings.json

{
  "name": "local-gpt2-transformers",
  "implementation": "mlserver_llm_local.runtime.Local",
  "parameters": {
    "extra": {
      "backend": "transformers",
      "config": {
        "model_type": "completions",
        "model_settings": {
          "model": "gpt2",
          "device": "cuda",
          "gpu_memory_utilization": 0.3
        }
      }
    }
  }
}
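The settings above configure the Local runtime's transformers backend to serve GPT-2 as a completions model on CUDA, reserving roughly 30% of GPU memory. To serve more than one model from the same repository, each model gets its own directory under models/ with its own model-settings.json. As a minimal sketch of a second configuration, the file below describes a hypothetical companion model (the name local-distilgpt2-transformers and the distilgpt2 checkpoint are illustrative, not taken from the original example):

{
  "name": "local-distilgpt2-transformers",
  "implementation": "mlserver_llm_local.runtime.Local",
  "parameters": {
    "extra": {
      "backend": "transformers",
      "config": {
        "model_type": "completions",
        "model_settings": {
          "model": "distilgpt2",
          "device": "cuda",
          "gpu_memory_utilization": 0.3
        }
      }
    }
  }
}

With both files in place, MLServer can load every model it finds under the models/ repository directory, and requests are routed to each one by its model name through the usual inference endpoints. Keeping the gpu_memory_utilization fractions well below 1 in total is intended to let both models share the same CUDA device; treat the exact partitioning behaviour as an assumption to verify against the backend you use.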