base_model | string | Yes | | The base model to deploy. Can be a short name from our models list or Hugging Face model path |
cooldown_time | integer | No | 3600 | Time in seconds before scaling down idle replicas. Minimum is 10 minutes (600 seconds) |
hf_token | string | No | | Hugging Face token for private models |
min_replicas | integer | No | 0 | Minimum number of replicas |
max_replicas | integer | No | 1 | Maximum number of replicas |
scale_up_threshold | integer | No | 1 | Number of queued requests before scaling up additional replicas |
quantization | string | No | | Quantization method (`none`, `fp8`, `bitsandbytes-nf4`). Default is based on the model and accelerator. |
uses_guaranteed_capacity | boolean | No | false | Whether to use guaranteed capacity |
max_total_tokens | integer | No | | Maximum number of tokens per request |
max_num_batched_tokens | integer | No | | Maximum number of tokens that can be batched together. Higher values increase throughput but may cause request preemption |
lorax_image_tag | string | No | | Tag for the LoRAX image |
request_logging_enabled | boolean | No | false | Whether to enable request logging |
direct_ingress | boolean | No | false | Creates a direct endpoint to the LLM, bypassing the Predibase control plane |
preloaded_adapters | array[string] | No | | List of adapter IDs to preload on deployment initialization |
speculator | string | No | | Speculator to use for the deployment (`auto`, `disabled`, or the adapter ID of a Turbo or Turbo LoRA) |
prefix_caching | boolean | No | false | Whether to enable prefix caching |
merge_adapter | boolean | No | false | Whether to merge a preloaded adapter with the base model |
cache_model | boolean | No | false | If true, caches the HF weights of the model in a private S3 bucket (see details) |
custom_args | array[string] | No | | Custom arguments to pass to the LoRAX launcher |