first commit

2026-03-27 13:05:47 +00:00
commit 549f70b97a
4 changed files with 82 additions and 0 deletions
@@ -0,0 +1,8 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+## [1.0.0] - 2026-03-27
+### Added
+- Inizializzazione ambiente vLLM per Qwen3.5-0.8B GGUF.
+- Dockerfile con build da sorgente per architettura gfx1100.
+- docker-compose.yml con HSA_OVERRIDE_GFX_VERSION e IPC host.
@@ -0,0 +1,22 @@
+# Builds vLLM v0.6.3 from source on top of the ROCm PyTorch image for the gfx1100
+# architecture and starts vllm.entrypoints.openai.api_server (port 8000 is exposed).
+
+# Base image tag. Defaults to "latest" as before; pass
+# --build-arg ROCM_PYTORCH_TAG=<tag> to pin a specific release for reproducible builds.
+ARG ROCM_PYTORCH_TAG=latest
+FROM rocm/pytorch:${ROCM_PYTORCH_TAG}
+
+# Build-time only: keeps apt non-interactive during RUN steps without persisting
+# the setting in the runtime environment of the container.
+ARG DEBIAN_FRONTEND=noninteractive
+
+# PYTORCH_ROCM_ARCH selects the target architecture when compiling the C++/HIP kernels.
+ENV PYTORCH_ROCM_ARCH="gfx1100" \
+    HSA_OVERRIDE_GFX_VERSION="11.0.0" \
+    HIP_VISIBLE_DEVICES="0"
+
+# ca-certificates is listed explicitly because --no-install-recommends would
+# otherwise not pull it in, and the https git clone below needs it.
+RUN apt-get update -y \
+    && apt-get install -y --no-install-recommends \
+        build-essential \
+        ca-certificates \
+        git \
+        python3-dev \
+    && rm -rf /var/lib/apt/lists/*
+
+WORKDIR /workspace
+
+# Clone vLLM and check out a recent stable tag with GGUF support (0.6.0+).
+RUN git clone https://github.com/vllm-project/vllm.git . \
+    && git checkout v0.6.3
+
+RUN pip install --no-cache-dir -U pip \
+    && pip install --no-cache-dir -r requirements-rocm.txt
+
+# Compile and install vLLM (this takes a while).
+RUN python3 setup.py install
+
+# Run from outside the source checkout: `python3 -m` puts the current directory
+# first on sys.path, so starting in /workspace would import the un-built ./vllm
+# source tree instead of the installed package with its compiled extensions.
+WORKDIR /app
+
+EXPOSE 8000
+ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -0,0 +1,27 @@
+# Compose service that builds the local Dockerfile and runs it against a GGUF model file.
+services:
+  vllm-qwen:
+    container_name: vllm_qwen_08b
+    build:
+      context: .
+      dockerfile: Dockerfile
+    restart: unless-stopped
+    ipc: host
+    ports:
+      - "8000:8000"
+    # Host GPU device nodes passed through to the container, plus supplementary
+    # groups (presumably required for access to those nodes; verify on the host).
+    devices:
+      - "/dev/kfd:/dev/kfd"
+      - "/dev/dri:/dev/dri"
+    group_add:
+      - video
+      - render
+    environment:
+      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
+      HIP_VISIBLE_DEVICES: "0"
+    volumes:
+      # Read-only mount of the model directory.
+      - /opt/models:/app/models:ro
+    # Command-line arguments for the container, one token per list item.
+    command:
+      - "--model"
+      - "/app/models/Qwen3.5-0.8B-UD-Q8_K_XL.gguf"
+      - "--quantization"
+      - "gguf"
+      - "--gpu-memory-utilization"
+      - "0.50"
+      - "--max-model-len"
+      - "4096"
@@ -0,0 +1,25 @@
+import urllib.request
+import json
+
+url = "http://localhost:8000/v1/chat/completions"
+headers = {"Content-Type": "application/json"}
+data = {
+    "model": "/app/models/Qwen3.5-0.8B-UD-Q8_K_XL.gguf",
+    "messages": [
+        {"role": "system", "content": "Sei un assistente utile e conciso."},
+        {"role": "user", "content": "Quali sono i vantaggi principali del sistema operativo Linux?"}
+    ],
+    "max_tokens": 100,
+    "temperature": 0.2
+}
+
+req = urllib.request.Request(url, data=json.dumps(data).encode('utf-8'), headers=headers)
+
+print("Inviando richiesta a vLLM...")
+try:
+    with urllib.request.urlopen(req) as response:
+        result = json.loads(response.read().decode('utf-8'))
+        print("\n✅ TEST SUPERATO. Risposta dal modello:\n")
+        print(result['choices'][0]['message']['content'])
+except Exception as e:
+    print(f"\n❌ ERRORE DURANTE IL TEST: {e}")