first commit

2026-03-27 13:05:47 +00:00
commit 549f70b97a
4 changed files with 82 additions and 0 deletions
@@ -0,0 +1,8 @@
 # Changelog
 All notable changes to this project will be documented in this file.
 ## [1.0.0] - 2026-03-27
 ### Added
 - Inizializzazione ambiente vLLM per Qwen3.5-0.8B GGUF.
 - Dockerfile con build da sorgente per architettura gfx1100.
 - docker-compose.yml con HSA_OVERRIDE_GFX_VERSION e IPC host.
@@ -0,0 +1,22 @@
 # Image that builds vLLM from source on top of the ROCm PyTorch base image.
 FROM rocm/pytorch:latest
 # Build-time and runtime environment, declared in a single instruction.
 # PYTORCH_ROCM_ARCH selects the target architecture for compiling the
 # C++/HIP kernels.
 ENV DEBIAN_FRONTEND=noninteractive \
     PYTORCH_ROCM_ARCH="gfx1100" \
     HSA_OVERRIDE_GFX_VERSION="11.0.0" \
     HIP_VISIBLE_DEVICES="0"
 # Toolchain needed for the source build; the apt lists are removed in the
 # same layer.
 RUN apt-get update -y \
     && apt-get install -y \
         git \
         build-essential \
         python3-dev \
     && rm -rf /var/lib/apt/lists/*
 WORKDIR /workspace
 # Clone vLLM and check out a recent stable tag that supports GGUF (0.6.0+).
 RUN git clone https://github.com/vllm-project/vllm.git . \
     && git checkout v0.6.3
 # Upgrade pip, then install the ROCm requirements shipped with the checkout.
 RUN pip install -U pip \
     && pip install -r requirements-rocm.txt
 # Compile and install vLLM (this step takes a while).
 RUN python3 setup.py install
 # Port published for the OpenAI-compatible API server started below.
 EXPOSE 8000
 ENTRYPOINT ["python3", "-m", "vllm.entrypoints.openai.api_server"]
@@ -0,0 +1,27 @@
 services:
  vllm-qwen:
    # The image is built from the Dockerfile in this directory.
    build:
      context: .
      dockerfile: Dockerfile
    container_name: vllm_qwen_08b
    restart: unless-stopped
    # Share the host IPC namespace with the container.
    ipc: host
    # GPU device nodes passed through from the host.
    devices:
      - "/dev/kfd:/dev/kfd"
      - "/dev/dri:/dev/dri"
    # Supplementary groups for the container process.
    group_add:
      - video
      - render
    # Values are quoted so they stay strings.
    environment:
      HSA_OVERRIDE_GFX_VERSION: "11.0.0"
      HIP_VISIBLE_DEVICES: "0"
    volumes:
      # Read-only mount of the model directory
      - /opt/models:/app/models:ro
    ports:
      - "8000:8000"
    # Command-line arguments for the container, one list item per argument.
    command:
      - "--model"
      - "/app/models/Qwen3.5-0.8B-UD-Q8_K_XL.gguf"
      - "--quantization"
      - "gguf"
      - "--gpu-memory-utilization"
      - "0.50"
      - "--max-model-len"
      - "4096"
@@ -0,0 +1,25 @@
 import json
 import sys
 import urllib.error
 import urllib.request
 # Smoke test: sends one chat-completion request to the local vLLM server,
 # prints the model's answer, and exits with a non-zero status on any failure
 # so the result can be checked by scripts as well as by eye.
 url = "http://localhost:8000/v1/chat/completions"
 headers = {"Content-Type": "application/json"}
 data = {
     "model": "/app/models/Qwen3.5-0.8B-UD-Q8_K_XL.gguf",
     "messages": [
         {"role": "system", "content": "Sei un assistente utile e conciso."},
         {"role": "user", "content": "Quali sono i vantaggi principali del sistema operativo Linux?"}
     ],
     "max_tokens": 100,
     "temperature": 0.2
 }
 req = urllib.request.Request(url, data=json.dumps(data).encode('utf-8'), headers=headers)
 print("Inviando richiesta a vLLM...")
 try:
     # A timeout keeps the script from hanging forever if the server accepts
     # the connection but never answers.
     with urllib.request.urlopen(req, timeout=60) as response:
         result = json.loads(response.read().decode('utf-8'))
     # Extract the answer before announcing success, so a malformed response
     # is reported as a failure rather than after a success banner.
     answer = result['choices'][0]['message']['content']
 except urllib.error.HTTPError as e:
     # The response body of an HTTP error usually carries the server's own
     # explanation; show it alongside the status line.
     print(f"\n❌ ERRORE DURANTE IL TEST: {e}")
     error_body = e.read().decode('utf-8', errors='replace')
     if error_body:
         print(error_body)
     sys.exit(1)
 except Exception as e:
     print(f"\n❌ ERRORE DURANTE IL TEST: {e}")
     sys.exit(1)
 print("\n✅ TEST SUPERATO. Risposta dal modello:\n")
 print(answer)