单模型
llama-server.exe -m "G:\AI-AI\LLM\stablediffusionv2.gguf" --port 8081
多模型
llama-server.exe --config_file <config_file>  （注：下方的多模型 JSON 配置格式（models / model_alias / chat_format 等字段）是 llama-cpp-python 自带 server 的配置格式，其启动命令通常为 python -m llama_cpp.server --config_file <config_file>；请确认所用服务端是否支持 --config_file 参数）
{
  "host": "0.0.0.0",
  "port": 8080,
  "models": [
    {
      "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
      "model_alias": "gpt-3.5-turbo",
      "chat_format": "chatml",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 2048
    },
    {
      "model": "models/OpenHermes-2.5-Mistral-7B-GGUF/openhermes-2.5-mistral-7b.Q4_K_M.gguf",
      "model_alias": "gpt-4",
      "chat_format": "chatml",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 2048
    },
    {
      "model": "models/ggml_llava-v1.5-7b/ggml-model-q4_k.gguf",
      "model_alias": "gpt-4-vision-preview",
      "chat_format": "llava-1-5",
      "clip_model_path": "models/ggml_llava-v1.5-7b/mmproj-model-f16.gguf",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 2048
    },
    {
      "model": "models/mistral-7b-v0.1-GGUF/ggml-model-Q4_K.gguf",
      "model_alias": "text-davinci-003",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 512,
      "n_ctx": 2048
    },
    {
      "model": "models/replit-code-v1_5-3b-GGUF/replit-code-v1_5-3b.Q4_0.gguf",
      "model_alias": "copilot-codex",
      "n_gpu_layers": -1,
      "offload_kqv": true,
      "n_threads": 12,
      "n_batch": 1024,
      "n_ctx": 9216
    }
  ]
}