# project refactored
# (diff-page residue: "This commit is contained in:", hunk header "@@ -1,27 +1,20 @@")
|
||||
services:
|
||||
vllm-qwen:
|
||||
build:
|
||||
context: .
|
||||
dockerfile: Dockerfile
|
||||
container_name: vllm_qwen_08b
|
||||
qwen-08b:
|
||||
build: .
|
||||
container_name: qwen_08b_server
|
||||
ports:
|
||||
- "8000:8000"
|
||||
ipc: host
|
||||
devices:
|
||||
- "/dev/kfd:/dev/kfd"
|
||||
- "/dev/dri:/dev/dri"
|
||||
group_add:
|
||||
- video
|
||||
- render
|
||||
- /dev/kfd:/dev/kfd
|
||||
- /dev/dri:/dev/dri
|
||||
volumes:
|
||||
# Montaggio in sola lettura del modello
|
||||
- /opt/models:/app/models:ro
|
||||
environment:
|
||||
- HSA_OVERRIDE_GFX_VERSION=11.0.0
|
||||
- HIP_VISIBLE_DEVICES=0
|
||||
- /opt/models:/models
|
||||
# -ngl 99 offloada tutti i layer sulla GPU AMD
|
||||
# --host 0.0.0.0 lo rende accessibile fuori dal container
|
||||
command: >
|
||||
--model /app/models/Qwen3.5-0.8B-UD-Q8_K_XL.gguf
|
||||
--quantization gguf
|
||||
--gpu-memory-utilization 0.06
|
||||
--max-model-len 32768
|
||||
-m /models/qwen1_5-0_8b-chat-q8_0.gguf
|
||||
--host 0.0.0.0
|
||||
--port 8000
|
||||
-ngl 99
|
||||
-c 4096
|
||||
restart: unless-stopped
|
||||
|
||||
# (web-UI footer residue: "Reference in New Issue" / "Block a user")