Local
Transformers
The example below configures the Local runtime to serve a chat-completions model through the transformers backend:

```json
{
  "name": "Llama-2-7B-chat-AWQ",
  "implementation": "mlserver_llm_local.runtime.Local",
  "parameters": {
    "extra": {
      "backend": "transformers",
      "config": {
        "model_type": "chat.completions",
        "model_settings": {
          "model": "TheBloke/Llama-2-7B-chat-AWQ",
          "device": "cuda",
          "max_tokens": -1
        }
      }
    }
  }
}
```
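Saved as model-settings.json inside a model folder (the standard MLServer layout), this configuration should then be servable with MLServer's CLI, e.g. `mlserver start <model-folder>`.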
Model Settings

model: str
load_kwargs: dict
device: Literal["cpu", "cuda", "auto"]
dtype: str = "auto"
tensor_parallel_size: int
pipeline_parallel_size: int
enable_profile: bool
profile_kwargs: Optional["ProfilerSettings"]
enable_optimisation: bool
optimisation_kwargs: Optional["OptimisationSettings"]
config: Optional["PretrainedConfig"]
max_model_len: Optional[int]
max_tokens: Optional[int]
max_num_seqs: int
max_paddings: int
gpu_memory_utilization: float
default_generate_kwargs: Dict[str, Any]
  stream: bool
  skip_special_tokens: bool
  max_tokens: int
  ignore_eos: bool
  temperature: float
  repetition_penalty: float
  top_p: float
  top_k: int
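For illustration, here is a sketch of how several of these settings might be combined under model_settings, following the structure of the example above. All values beyond the model name are hypothetical, not recommended defaults:

```json
{
  "name": "Llama-2-7B-chat-AWQ",
  "implementation": "mlserver_llm_local.runtime.Local",
  "parameters": {
    "extra": {
      "backend": "transformers",
      "config": {
        "model_type": "chat.completions",
        "model_settings": {
          "model": "TheBloke/Llama-2-7B-chat-AWQ",
          "device": "cuda",
          "dtype": "auto",
          "max_tokens": 1024,
          "default_generate_kwargs": {
            "temperature": 0.7,
            "top_p": 0.9,
            "repetition_penalty": 1.1
          }
        }
      }
    }
  }
}
```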
vLLM
default_generate_kwargs: Dict[str, Any]
  stream: bool
  skip_special_tokens: bool
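A minimal sketch of a vLLM-backed configuration. The backend string "vllm" is an assumption made by analogy with the transformers example above (this section does not show it), and the field values are hypothetical:

```json
{
  "name": "Llama-2-7B-chat-AWQ",
  "implementation": "mlserver_llm_local.runtime.Local",
  "parameters": {
    "extra": {
      "backend": "vllm",
      "config": {
        "model_type": "chat.completions",
        "model_settings": {
          "model": "TheBloke/Llama-2-7B-chat-AWQ",
          "default_generate_kwargs": {
            "stream": false,
            "skip_special_tokens": true
          }
        }
      }
    }
  }
}
```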
DeepSpeed
Model Settings
model: str
tokenizer: Optional[str]
load_kwargs: dict
device: Literal["cuda"]
tensor_parallel_size: int
inference_engine_config: Dict[str, Any]
torch_dist_port: int
max_model_len: Optional[int]
worker_use_ray: bool
config: Optional["PretrainedConfig"]
quantization_mode: Optional[str]
default_generate_kwargs: Dict[str, Any]
  stream: bool
  skip_special_tokens: bool
  max_tokens: int
  ignore_eos: bool
  temperature: float
  repetition_penalty: float
  top_p: float
  top_k: int
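Likewise, a sketch of a DeepSpeed-backed configuration. The backend string "deepspeed" is assumed by analogy with the transformers example, and all values are illustrative only:

```json
{
  "name": "Llama-2-7B-chat-AWQ",
  "implementation": "mlserver_llm_local.runtime.Local",
  "parameters": {
    "extra": {
      "backend": "deepspeed",
      "config": {
        "model_type": "chat.completions",
        "model_settings": {
          "model": "TheBloke/Llama-2-7B-chat-AWQ",
          "device": "cuda",
          "tensor_parallel_size": 1,
          "default_generate_kwargs": {
            "max_tokens": 256,
            "temperature": 0.7
          }
        }
      }
    }
  }
}
```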