Files
TurboQuant_ROCm_Tutorial/index.html

728 lines
44 KiB
HTML
Raw Blame History

This file contains invisible Unicode characters
This file contains invisible Unicode characters that are indistinguishable to humans but may be processed differently by a computer. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
<!DOCTYPE html>
<html lang="it" data-theme="dark">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>TurboQuant + llama.cpp su ROCm — Tutorial</title>
<!-- Search/share description (was missing; page content in Italian, so description is too) -->
<meta name="description" content="Guida passo-passo per compilare llama.cpp con la patch TurboQuant su GPU AMD con ROCm/HIP: prerequisiti, build, test KV cache e troubleshooting.">
<link rel="preconnect" href="https://fonts.googleapis.com">
<link rel="preconnect" href="https://fonts.gstatic.com" crossorigin>
<link href="https://fonts.googleapis.com/css2?family=JetBrains+Mono:wght@400;600&family=Inter:wght@300..700&display=swap" rel="stylesheet">
<style>
:root,[data-theme="dark"]{
--color-bg:#0d1117;--color-surface:#161b22;--color-surface-2:#21262d;
--color-surface-offset:#30363d;--color-border:#30363d;--color-divider:#21262d;
--color-text:#e6edf3;--color-text-muted:#8b949e;--color-text-faint:#484f58;
--color-primary:#39d353;--color-primary-hover:#2ea043;--color-primary-active:#1a7f37;
--color-primary-highlight:rgba(57,211,83,0.12);
--color-warning:#d29922;--color-error:#f85149;--color-blue:#58a6ff;
--color-purple:#bc8cff;--color-orange:#ffa657;
--font-body:'Inter',system-ui,sans-serif;--font-mono:'JetBrains Mono',monospace;
--text-xs:clamp(0.75rem,0.7rem + 0.25vw,0.875rem);
--text-sm:clamp(0.875rem,0.8rem + 0.35vw,1rem);
--text-base:clamp(1rem,0.95rem + 0.25vw,1.125rem);
--text-lg:clamp(1.125rem,1rem + 0.75vw,1.5rem);
--text-xl:clamp(1.5rem,1.2rem + 1.25vw,2.25rem);
--text-2xl:clamp(2rem,1.2rem + 2.5vw,3.5rem);
--space-1:.25rem;--space-2:.5rem;--space-3:.75rem;--space-4:1rem;
--space-5:1.25rem;--space-6:1.5rem;--space-8:2rem;--space-10:2.5rem;
--space-12:3rem;--space-16:4rem;
--radius-sm:.375rem;--radius-md:.5rem;--radius-lg:.75rem;--radius-xl:1rem;--radius-full:9999px;
--shadow-sm:0 1px 2px rgba(0,0,0,.4);--shadow-md:0 4px 12px rgba(0,0,0,.5);
--transition:180ms cubic-bezier(0.16,1,0.3,1);
--content-default:900px;--content-narrow:640px;
}
[data-theme="light"]{
--color-bg:#f6f8fa;--color-surface:#ffffff;--color-surface-2:#f6f8fa;
--color-surface-offset:#eaeef2;--color-border:#d0d7de;--color-divider:#eaeef2;
--color-text:#1f2328;--color-text-muted:#636c76;--color-text-faint:#aeb6c0;
--color-primary:#1a7f37;--color-primary-hover:#0d5e25;--color-primary-active:#08401a;
--color-primary-highlight:rgba(26,127,55,0.10);
--color-warning:#9a6700;--color-error:#d1242f;--color-blue:#0969da;
--color-purple:#8250df;--color-orange:#bc4c00;
--shadow-sm:0 1px 2px rgba(0,0,0,.07);--shadow-md:0 4px 12px rgba(0,0,0,.1);
}
*,*::before,*::after{box-sizing:border-box;margin:0;padding:0}
html{scroll-behavior:smooth;scroll-padding-top:4rem}
body{font-family:var(--font-body);font-size:var(--text-base);color:var(--color-text);background:var(--color-bg);min-height:100dvh;-webkit-font-smoothing:antialiased}
img,svg{display:block;max-width:100%}
a,button,[role="button"]{transition:color var(--transition),background var(--transition),border-color var(--transition),box-shadow var(--transition)}
/* NAV */
nav{position:sticky;top:0;z-index:100;background:oklch(from var(--color-bg) l c h / 0.92);backdrop-filter:blur(12px);border-bottom:1px solid var(--color-border);padding:var(--space-3) var(--space-6)}
.nav-inner{max-width:var(--content-default);margin:auto;display:flex;align-items:center;justify-content:space-between;gap:var(--space-4)}
.nav-logo{display:flex;align-items:center;gap:var(--space-2);font-weight:700;font-size:var(--text-sm);color:var(--color-text);text-decoration:none}
.nav-logo svg{color:var(--color-primary)}
.nav-links{display:flex;gap:var(--space-1);align-items:center}
.nav-links a{font-size:var(--text-xs);color:var(--color-text-muted);text-decoration:none;padding:var(--space-2) var(--space-3);border-radius:var(--radius-md)}
.nav-links a:hover{color:var(--color-text);background:var(--color-surface-offset)}
.theme-toggle{background:none;border:1px solid var(--color-border);border-radius:var(--radius-md);padding:var(--space-2);color:var(--color-text-muted);cursor:pointer;display:flex;align-items:center}
.theme-toggle:hover{color:var(--color-text);background:var(--color-surface-offset)}
/* HERO */
.hero{padding:var(--space-16) var(--space-6) var(--space-12);text-align:center;background:radial-gradient(ellipse 80% 50% at 50% 0%,var(--color-primary-highlight),transparent)}
.hero-badge{display:inline-flex;align-items:center;gap:var(--space-2);font-size:var(--text-xs);font-family:var(--font-mono);color:var(--color-primary);background:var(--color-primary-highlight);border:1px solid oklch(from var(--color-primary) l c h / 0.3);border-radius:var(--radius-full);padding:var(--space-1) var(--space-3);margin-bottom:var(--space-6)}
.hero h1{font-size:var(--text-2xl);font-weight:800;line-height:1.1;margin-bottom:var(--space-4);letter-spacing:-.02em}
.hero h1 span{color:var(--color-primary)}
.hero p{max-width:55ch;margin:0 auto var(--space-8);color:var(--color-text-muted);font-size:var(--text-base);line-height:1.7}
.hero-meta{display:flex;gap:var(--space-4);justify-content:center;flex-wrap:wrap}
.badge{display:inline-flex;align-items:center;gap:var(--space-1);font-size:var(--text-xs);font-family:var(--font-mono);padding:var(--space-1) var(--space-2);border-radius:var(--radius-full)}
.badge-green{background:rgba(57,211,83,.1);color:#39d353;border:1px solid rgba(57,211,83,.2)}
.badge-blue{background:rgba(88,166,255,.1);color:var(--color-blue);border:1px solid rgba(88,166,255,.2)}
.badge-orange{background:rgba(255,166,87,.1);color:var(--color-orange);border:1px solid rgba(255,166,87,.2)}
.badge-purple{background:rgba(188,140,255,.1);color:var(--color-purple);border:1px solid rgba(188,140,255,.2)}
/* LAYOUT */
.container{max-width:var(--content-default);margin:0 auto;padding:0 var(--space-6)}
/* ALERT BOXES */
.alert{border-radius:var(--radius-lg);padding:var(--space-4) var(--space-5);margin:var(--space-6) 0;display:flex;gap:var(--space-3);align-items:flex-start;font-size:var(--text-sm)}
.alert-warning{background:rgba(210,153,34,.08);border:1px solid rgba(210,153,34,.25);color:var(--color-text)}
.alert-info{background:rgba(88,166,255,.08);border:1px solid rgba(88,166,255,.2);color:var(--color-text)}
.alert-success{background:var(--color-primary-highlight);border:1px solid oklch(from var(--color-primary) l c h / 0.3);color:var(--color-text)}
.alert-error{background:rgba(248,81,73,.08);border:1px solid rgba(248,81,73,.2);color:var(--color-text)}
.alert-icon{flex-shrink:0;font-size:1.1em;margin-top:.1em}
/* STEPS */
.steps{counter-reset:step;display:flex;flex-direction:column;gap:0}
.step{display:grid;grid-template-columns:40px 1fr;gap:var(--space-4);padding:var(--space-8) 0;border-bottom:1px solid var(--color-divider)}
.step:last-child{border-bottom:none}
.step-num{counter-increment:step;width:40px;height:40px;border-radius:var(--radius-full);background:var(--color-surface-2);border:2px solid var(--color-border);display:flex;align-items:center;justify-content:center;font-family:var(--font-mono);font-weight:700;font-size:var(--text-sm);color:var(--color-primary);flex-shrink:0;position:relative;top:2px}
.step-content h2{font-size:var(--text-lg);font-weight:700;margin-bottom:var(--space-2);line-height:1.25}
.step-content p{color:var(--color-text-muted);font-size:var(--text-sm);margin-bottom:var(--space-4);max-width:70ch;line-height:1.7}
.step-content p+p{margin-top:var(--space-3)}
/* CODE BLOCKS */
.code-block{position:relative;margin:var(--space-4) 0;border-radius:var(--radius-lg);overflow:hidden;border:1px solid var(--color-border)}
.code-header{background:var(--color-surface);padding:var(--space-2) var(--space-4);display:flex;align-items:center;justify-content:space-between;border-bottom:1px solid var(--color-border)}
.code-lang{font-family:var(--font-mono);font-size:var(--text-xs);color:var(--color-text-muted);font-weight:600;letter-spacing:.05em;text-transform:uppercase}
.copy-btn{background:none;border:1px solid var(--color-border);border-radius:var(--radius-sm);padding:var(--space-1) var(--space-2);font-size:var(--text-xs);color:var(--color-text-muted);cursor:pointer;font-family:var(--font-body);display:flex;align-items:center;gap:var(--space-1)}
.copy-btn:hover{background:var(--color-surface-offset);color:var(--color-text)}
.copy-btn.copied{color:var(--color-primary);border-color:oklch(from var(--color-primary) l c h / 0.3)}
pre{background:var(--color-surface);padding:var(--space-5);overflow-x:auto;line-height:1.65;margin:0}
code{font-family:var(--font-mono);font-size:.875em;color:var(--color-text)}
pre code{color:var(--color-text);display:block}
/* Syntax highlight pseudo-classes via JS coloring */
.kw{color:var(--color-blue)}
.cm{color:var(--color-text-faint);font-style:italic}
.str{color:var(--color-orange)}
.num{color:var(--color-purple)}
.flag{color:var(--color-primary)}
.cmd{color:var(--color-warning)}
/* INLINE CODE */
:not(pre)>code{font-family:var(--font-mono);font-size:.85em;background:var(--color-surface-offset);padding:.1em .35em;border-radius:var(--radius-sm);border:1px solid var(--color-border);color:var(--color-orange)}
/* TABS */
.tabs{margin:var(--space-4) 0}
.tab-list{display:flex;gap:var(--space-1);border-bottom:1px solid var(--color-border);margin-bottom:var(--space-4)}
.tab-btn{background:none;border:none;border-bottom:2px solid transparent;padding:var(--space-2) var(--space-4);font-size:var(--text-sm);color:var(--color-text-muted);cursor:pointer;font-family:var(--font-body);margin-bottom:-1px;border-radius:var(--radius-sm) var(--radius-sm) 0 0;transition:color var(--transition),border-color var(--transition)}
.tab-btn:hover{color:var(--color-text);background:var(--color-surface-2)}
.tab-btn.active{color:var(--color-primary);border-bottom-color:var(--color-primary);font-weight:600}
.tab-panel{display:none}.tab-panel.active{display:block}
/* TABLE */
.table-wrap{overflow-x:auto;border-radius:var(--radius-lg);border:1px solid var(--color-border);margin:var(--space-4) 0}
table{border-collapse:collapse;width:100%;font-size:var(--text-sm)}
th{background:var(--color-surface);padding:var(--space-3) var(--space-4);text-align:left;font-weight:600;color:var(--color-text-muted);font-size:var(--text-xs);text-transform:uppercase;letter-spacing:.05em;border-bottom:1px solid var(--color-border)}
td{padding:var(--space-3) var(--space-4);border-bottom:1px solid var(--color-divider);vertical-align:top}
tr:last-child td{border-bottom:none}
tr:hover td{background:var(--color-surface-2)}
/* SECTION HEADERS */
.section{padding:var(--space-12) 0}
.section-label{font-family:var(--font-mono);font-size:var(--text-xs);color:var(--color-primary);text-transform:uppercase;letter-spacing:.1em;margin-bottom:var(--space-3);display:block}
.section h2{font-size:var(--text-xl);font-weight:800;margin-bottom:var(--space-3);line-height:1.2}
.section-desc{color:var(--color-text-muted);max-width:65ch;line-height:1.7;margin-bottom:var(--space-8)}
/* PROGRESS BAR */
.progress-steps{display:flex;gap:0;margin-bottom:var(--space-8);border:1px solid var(--color-border);border-radius:var(--radius-lg);overflow:hidden}
.progress-item{flex:1;padding:var(--space-3) var(--space-2);text-align:center;font-size:var(--text-xs);font-family:var(--font-mono);color:var(--color-text-muted);background:var(--color-surface);border-right:1px solid var(--color-border);cursor:pointer;transition:background var(--transition)}
.progress-item:last-child{border-right:none}
.progress-item.done{background:var(--color-primary-highlight);color:var(--color-primary)}
.progress-item.active{background:var(--color-surface-2);color:var(--color-text);font-weight:600}
.progress-item:hover{background:var(--color-surface-offset)}
/* PILL LABELS */
.pill{display:inline-block;font-size:var(--text-xs);font-family:var(--font-mono);padding:.15em .5em;border-radius:var(--radius-full);font-weight:600}
.pill-ok{background:rgba(57,211,83,.12);color:#39d353}
.pill-warn{background:rgba(210,153,34,.12);color:var(--color-warning)}
.pill-err{background:rgba(248,81,73,.12);color:var(--color-error)}
.pill-info{background:rgba(88,166,255,.1);color:var(--color-blue)}
/* PREREQ GRID */
.prereq-grid{display:grid;grid-template-columns:repeat(auto-fill,minmax(200px,1fr));gap:var(--space-3);margin:var(--space-4) 0}
.prereq-card{background:var(--color-surface);border:1px solid var(--color-border);border-radius:var(--radius-lg);padding:var(--space-4);display:flex;flex-direction:column;gap:var(--space-2)}
.prereq-card .title{font-size:var(--text-sm);font-weight:600;display:flex;align-items:center;gap:var(--space-2)}
.prereq-card .desc{font-size:var(--text-xs);color:var(--color-text-muted);line-height:1.5}
.prereq-card .version{font-family:var(--font-mono);font-size:var(--text-xs);color:var(--color-primary)}
/* COLLAPSIBLE */
details{border:1px solid var(--color-border);border-radius:var(--radius-lg);margin:var(--space-4) 0;overflow:hidden}
summary{padding:var(--space-3) var(--space-4);cursor:pointer;font-weight:600;font-size:var(--text-sm);background:var(--color-surface);list-style:none;display:flex;justify-content:space-between;align-items:center;user-select:none}
summary::-webkit-details-marker{display:none}
/* FIX: disclosure indicator glyphs were empty (stripped as "invisible Unicode" per the
   original file warnings), leaving no visual open/close cue after the native marker was
   hidden above. Restore explicit chevrons. */
summary::after{content:"▸";color:var(--color-text-muted);font-size:var(--text-sm)}
details[open]>summary::after{content:"▾"}
details[open]>summary{border-bottom:1px solid var(--color-border)}
.details-body{padding:var(--space-4);background:var(--color-surface)}
/* FOOTER */
footer{border-top:1px solid var(--color-divider);padding:var(--space-8) var(--space-6);text-align:center;color:var(--color-text-faint);font-size:var(--text-xs);font-family:var(--font-mono)}
@media(max-width:640px){
.step{grid-template-columns:32px 1fr;gap:var(--space-3)}
.progress-steps{flex-direction:column}
.progress-item{border-right:none;border-bottom:1px solid var(--color-border)}
.progress-item:last-child{border-bottom:none}
.hero h1{font-size:clamp(1.8rem,8vw,3rem)}
.nav-links a:not(:last-child){display:none}
}
</style>
</head>
<body>
<!-- Sticky primary navigation. Decorative icons are hidden from assistive tech;
     the theme toggle keeps its aria-label as its accessible name. -->
<nav aria-label="Principale">
<div class="nav-inner">
<a href="#" class="nav-logo">
<svg width="20" height="20" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true"><polygon points="13 2 3 14 12 14 11 22 21 10 12 10 13 2"/></svg>
TurboQuant × ROCm
</a>
<div class="nav-links">
<a href="#prereq">Prerequisiti</a>
<a href="#build">Build</a>
<a href="#test">Test</a>
<a href="#troubleshoot">Troubleshooting</a>
<button class="theme-toggle" data-theme-toggle aria-label="Switch theme">
<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2" aria-hidden="true"><path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z"/></svg>
</button>
</div>
</div>
</nav>
<section class="hero">
<div class="hero-badge">
<svg width="12" height="12" viewBox="0 0 24 24" fill="currentColor"><circle cx="12" cy="12" r="10"/></svg>
Aprile 2026 — Sperimentale
</div>
<h1>llama.cpp + <span>TurboQuant</span><br>su AMD ROCm</h1>
<p>Guida passo-passo per compilare llama.cpp con la patch TurboQuant (jagsan-cyber fork) su hardware AMD con ROCm/HIP. Comprende prerequisiti, build, test KV cache e troubleshooting.</p>
<div class="hero-meta">
<span class="badge badge-green">ROCm 6.x</span>
<span class="badge badge-blue">HIP / hipBLAS</span>
<span class="badge badge-orange">TurboQuant KV</span>
<span class="badge badge-purple">RDNA2/3/4</span>
</div>
</section>
<div class="container">
<div class="alert alert-warning">
<span class="alert-icon">⚠️</span>
<div><strong>Stato:</strong> Il supporto TurboQuant su ROCm è <strong>sperimentale</strong> (fork <code>jagsan-cyber/turboquant-rocm-llamacpp</code>, aprile 2026). Non è ancora nel main di llama.cpp. Aspettati possibili crash e performance variabile. Testa sempre su un sistema non di produzione.</div>
</div>
<!-- PREREQUISITI -->
<section class="section" id="prereq">
<span class="section-label">Step 0</span>
<h2>Prerequisiti hardware &amp; software</h2>
<p class="section-desc">Prima di iniziare, verifica che il tuo sistema soddisfi i requisiti minimi. Le GPU AMD supportate da ROCm partono da RDNA2 (gfx1030+).</p>
<div class="prereq-grid">
<div class="prereq-card">
<div class="title">🖥️ GPU AMD</div>
<div class="desc">RDNA2, RDNA3 o RDNA4. Architetture gfx1030, gfx110x, gfx120x, Strix Halo gfx1151.</div>
<div class="version">≥ RX 6000 series</div>
</div>
<div class="prereq-card">
<div class="title">🐧 Linux Distro</div>
<div class="desc">Ubuntu 22.04 o 24.04 LTS, Fedora 39+. Kernel 6.x raccomandato.</div>
<div class="version">Ubuntu 22.04 / 24.04</div>
</div>
<div class="prereq-card">
<div class="title">⚙️ ROCm</div>
<div class="desc">AMD ROCm runtime e HIP SDK installati. Versione 6.0 o superiore.</div>
<div class="version">ROCm ≥ 6.0</div>
</div>
<div class="prereq-card">
<div class="title">📦 CMake</div>
<div class="desc">CMake 3.21 o superiore, con supporto HIP targets.</div>
<div class="version">cmake ≥ 3.21</div>
</div>
<div class="prereq-card">
<div class="title">🔧 Build tools</div>
<div class="desc">gcc/g++ 12+, clang (dal pacchetto ROCm), make, git, python3.</div>
<div class="version">gcc ≥ 12</div>
</div>
<div class="prereq-card">
<div class="title">💾 VRAM</div>
<div class="desc">Minimo 8 GB VRAM per modelli 7B. 16 GB+ per modelli 1334B.</div>
<div class="version">≥ 8 GB VRAM</div>
</div>
</div>
<div class="alert alert-info">
<span class="alert-icon"></span>
<div>Verifica la tua architettura AMD con: <code>rocminfo | grep gfx</code>. Nota il valore (es. <code>gfx1100</code> per RX 7900) — ti servirà nella fase di build come <code>AMDGPU_TARGETS</code>.</div>
</div>
</section>
<!-- STEPS -->
<section class="section" id="build">
<span class="section-label">Build Guide</span>
<h2>Compilazione passo per passo</h2>
<p class="section-desc">Segui gli step nell'ordine. Ogni blocco di codice è copiabile con un click.</p>
<div class="progress-steps">
<div class="progress-item done">1 · Dipendenze</div>
<div class="progress-item done">2 · ROCm Check</div>
<div class="progress-item active">3 · Clone Fork</div>
<div class="progress-item">4 · CMake Build</div>
<div class="progress-item">5 · Quantize</div>
<div class="progress-item">6 · Run Test</div>
</div>
<div class="steps">
<!-- STEP 1 -->
<!-- System dependencies: one copyable snippet for Ubuntu (apt), one for Fedora/RHEL (dnf).
     The copy buttons call copyCode(this) — presumably defined in a script later in the
     file, outside this chunk. -->
<div class="step">
<div class="step-num">1</div>
<div class="step-content">
<h2>Installa le dipendenze di sistema</h2>
<p>Installa i pacchetti necessari. Su Ubuntu 22.04/24.04:</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Aggiorna e installa build tools</span>
<span class="cmd">sudo apt update && sudo apt upgrade -y</span>
<span class="cmd">sudo apt install -y</span> build-essential gcc g++ clang cmake cmake-extras git wget curl python3 python3-pip libopenblas-dev pkg-config</code></pre>
</div>
<p>Su <strong>Fedora/RHEL</strong>:</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cmd">sudo dnf install -y</span> gcc gcc-c++ clang cmake git wget curl python3 python3-pip openblas-devel</code></pre>
</div>
</div>
</div>
<!-- STEP 2 -->
<!-- ROCm install/verify, split into two tabs (switchTab is defined outside this chunk).
     FIX: the success-alert icon span was empty (glyph stripped as invisible Unicode) —
     restored ✅ to match the other alerts' emoji icons. -->
<div class="step">
<div class="step-num">2</div>
<div class="step-content">
<h2>Installa e verifica ROCm</h2>
<p>Se ROCm non è ancora installato, usa lo script ufficiale AMD. Se già installato, salta alla verifica.</p>
<div class="tabs">
<div class="tab-list">
<button class="tab-btn active" onclick="switchTab(this,'tab-rocm-install')">Installazione ROCm</button>
<button class="tab-btn" onclick="switchTab(this,'tab-rocm-verify')">Verifica installazione</button>
</div>
<div id="tab-rocm-install" class="tab-panel active">
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Scarica e installa ROCm 6.x (Ubuntu 22.04)</span>
<span class="cmd">wget</span> <span class="str">https://repo.radeon.com/amdgpu-install/6.3/ubuntu/jammy/amdgpu-install_6.3.60300-1_all.deb</span>
<span class="cmd">sudo dpkg -i</span> amdgpu-install_6.3.60300-1_all.deb
<span class="cmd">sudo amdgpu-install</span> <span class="flag">--usecase=rocm,hip</span> <span class="flag">--no-dkms</span>
<span class="cm"># Aggiungi il tuo utente al gruppo render e video</span>
<span class="cmd">sudo usermod -aG render,video</span> $USER
<span class="cmd">newgrp render</span>
<span class="cm"># Riavvia per applicare i cambiamenti</span>
<span class="cmd">sudo reboot</span></code></pre>
</div>
</div>
<div id="tab-rocm-verify" class="tab-panel">
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Verifica che ROCm rilevi la GPU</span>
<span class="cmd">rocminfo</span>
<span class="cm"># Trova la tua architettura target (es. gfx1100)</span>
<span class="cmd">rocminfo</span> | <span class="cmd">grep</span> <span class="flag">-E</span> <span class="str">"gfx|Name"</span>
<span class="cm"># Verifica HIP</span>
<span class="cmd">hipconfig</span> <span class="flag">-v</span>
<span class="cmd">hipconfig</span> <span class="flag">-l</span> <span class="cm"># mostra path del compilatore</span>
<span class="cm"># Test rapido: elenca device</span>
<span class="cmd">rocm-smi</span></code></pre>
</div>
<div class="alert alert-success" style="margin-top:var(--space-3)">
<span class="alert-icon">✅</span>
<div>Se <code>rocminfo</code> mostra la tua GPU con architettura <code>gfx****</code>, ROCm è correttamente installato. Annota il valore — es. <code>gfx1100</code> per RX 7900 XTX.</div>
</div>
</div>
</div>
</div>
</div>
<!-- STEP 3 -->
<!-- Clone of the TurboQuant ROCm fork; the info alert points at an alternative
     CUDA/CPU fork whose patches can be applied manually (details in the
     Troubleshooting section, outside this chunk). -->
<div class="step">
<div class="step-num">3</div>
<div class="step-content">
<h2>Clona il fork TurboQuant ROCm</h2>
<p>Usa il fork <code>jagsan-cyber/turboquant-rocm-llamacpp</code> che include la patch TurboQuant con backend ROCm/HIP. Contiene anche la Heavy-Hitter Oracle (H2O) per KV cache eviction.</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Crea una directory di lavoro</span>
<span class="cmd">mkdir</span> <span class="flag">-p</span> ~/llm-inference && <span class="cmd">cd</span> ~/llm-inference
<span class="cm"># Clona il fork TurboQuant con supporto ROCm</span>
<span class="cmd">git clone</span> https://github.com/jagsan-cyber/turboquant-rocm-llamacpp.git
<span class="cmd">cd</span> turboquant-rocm-llamacpp
<span class="cm"># Mostra il branch corrente e i log recenti</span>
<span class="cmd">git log</span> <span class="flag">--oneline -10</span>
<span class="cmd">git branch -a</span></code></pre>
</div>
<div class="alert alert-info">
<span class="alert-icon">💡</span>
<div>In alternativa puoi usare il fork <code>TheTom/llama-cpp-turboquant</code> (CUDA + CPU) e applicare manualmente i patch ROCm dal repo jagsan-cyber. Vedi la sezione Troubleshooting.</div>
</div>
</div>
</div>
<!-- STEP 4 -->
<!-- CMake/HIP configure + build. FIX in the "advanced flags" snippet: the original put
     a "# comment" AFTER the trailing backslash on continued lines; in bash a backslash
     followed by anything but a newline does not continue the line, so the copied command
     was broken. Comments now live on their own lines and the command is one valid line. -->
<div class="step">
<div class="step-num">4</div>
<div class="step-content">
<h2>Configura e compila con CMake + HIP</h2>
<p>Questa è la fase critica. Sostituisci <code>gfx1100</code> con l'architettura della tua GPU rilevata al passo 2. Se hai più GPU, separa con punto e virgola: <code>gfx1100;gfx1030</code>.</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Imposta la tua architettura GPU (modifica questo valore!)</span>
<span class="cm"># Esempi: gfx1030 (RX 6800), gfx1100 (RX 7900 XTX), gfx1101 (RX 7600)</span>
<span class="cm"># gfx1151 (Strix Halo / Ryzen AI MAX)</span>
<span class="cmd">export</span> <span class="kw">AMDGPU_TARGETS</span>=<span class="str">"gfx1100"</span> <span class="cm"># ← modifica qui</span>
<span class="cm"># Imposta variabili HIP dal runtime ROCm</span>
<span class="cmd">export</span> <span class="kw">HIPCXX</span>=<span class="str">"$(hipconfig -l)/clang"</span>
<span class="cmd">export</span> <span class="kw">HIP_PATH</span>=<span class="str">"$(hipconfig -R)"</span>
<span class="cm"># Crea la directory di build e configura</span>
<span class="cmd">cmake</span> -S . -B build <span class="flag">-DGGML_HIP=ON</span> <span class="flag">-DGGML_HIP_TURBOQUANT=ON</span> <span class="flag">-DAMDGPU_TARGETS=$AMDGPU_TARGETS</span> <span class="flag">-DCMAKE_BUILD_TYPE=Release</span> <span class="flag">-DGGML_BLAS=ON</span> <span class="flag">-DGGML_BLAS_VENDOR=hipBLAS</span>
<span class="cm"># Compila (usa tutti i core disponibili)</span>
<span class="cmd">cmake</span> <span class="flag">--build</span> build <span class="flag">-j$(nproc)</span>
<span class="cm"># Verifica i binari prodotti</span>
<span class="cmd">ls</span> <span class="flag">-la</span> build/bin/</code></pre>
</div>
<div class="alert alert-warning">
<span class="alert-icon">⚠️</span>
<div>Se il flag <code>-DGGML_HIP_TURBOQUANT=ON</code> non viene riconosciuto (CMake error), consulta il README del fork: alcune versioni usano <code>-DLLAMA_TURBOQUANT=ON</code> o richiede un branch specifico. Controlla con <code>git branch -a</code>.</div>
</div>
<details>
<summary>Flag CMake opzionali avanzati</summary>
<div class="details-body">
<div class="code-block">
<div class="code-header"><span class="code-lang">bash (opzionale)</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Flag aggiuntivi per ottimizzazioni specifiche:</span>
<span class="cm"># -DGGML_NATIVE=ON  → ottimizza per la CPU locale</span>
<span class="cm"># -DGGML_F16C=ON    → abilita FP16 se supportato</span>
<span class="cm"># -DGGML_AVX2=ON    → AVX2 per CPU Ryzen</span>
<span class="cm"># -DLLAMA_CURL=ON   → abilita download diretto modelli</span>
<span class="cmd">cmake</span> -S . -B build <span class="flag">-DGGML_HIP=ON</span> <span class="flag">-DGGML_HIP_TURBOQUANT=ON</span> <span class="flag">-DAMDGPU_TARGETS=$AMDGPU_TARGETS</span> <span class="flag">-DCMAKE_BUILD_TYPE=Release</span> <span class="flag">-DGGML_BLAS=ON</span> <span class="flag">-DGGML_BLAS_VENDOR=hipBLAS</span> <span class="flag">-DGGML_NATIVE=ON</span> <span class="flag">-DGGML_F16C=ON</span> <span class="flag">-DGGML_AVX2=ON</span> <span class="flag">-DLLAMA_CURL=ON</span></code></pre>
</div>
</div>
</details>
</div>
</div>
<!-- STEP 5 -->
<!-- Model acquisition: download a prebuilt GGUF (option A) or convert a local
     HF model and optionally re-quantize it with the freshly built llama-quantize
     binary (option B). -->
<div class="step">
<div class="step-num">5</div>
<div class="step-content">
<h2>Scarica e quantizza un modello</h2>
<p>Per testare TurboQuant hai bisogno di un modello in formato GGUF. Puoi scaricarne uno pre-quantizzato da HuggingFace o usarne uno esistente.</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Crea directory per i modelli</span>
<span class="cmd">mkdir</span> <span class="flag">-p</span> ~/llm-inference/models
<span class="cm"># Opzione A: scarica un GGUF già pronto (es. Qwen2.5-7B Q4_K_M)</span>
<span class="cmd">cd</span> ~/llm-inference/models
<span class="cmd">wget</span> <span class="flag">-c</span> <span class="str">"https://huggingface.co/Qwen/Qwen2.5-7B-Instruct-GGUF/resolve/main/qwen2.5-7b-instruct-q4_k_m.gguf"</span>
<span class="cm"># Opzione B: converti da safetensors (se hai il modello HF locale)</span>
<span class="cmd">cd</span> ~/llm-inference/turboquant-rocm-llamacpp
<span class="cmd">python3</span> convert_hf_to_gguf.py <span class="str">/path/to/hf/model</span> <span class="flag">--outfile</span> ~/llm-inference/models/mio-modello-f16.gguf <span class="flag">--outtype</span> f16
<span class="cm"># Quantizzazione classica Q4_K_M (opzionale, se hai il modello F16)</span>
./build/bin/llama-quantize ~/llm-inference/models/mio-modello-f16.gguf ~/llm-inference/models/mio-modello-q4km.gguf Q4_K_M</code></pre>
</div>
</div>
</div>
<!-- STEP 6 -->
<div class="step">
<div class="step-num">6</div>
<div class="step-content">
<h2>Esegui il test TurboQuant KV cache</h2>
<p>Avvia <code>llama-cli</code> o <code>llama-bench</code> con i flag TurboQuant attivi. Il parametro chiave è <code>--cache-type-k</code> e <code>--cache-type-v</code> impostati a <code>tq</code> (TurboQuant).</p>
<div class="tabs">
<div class="tab-list">
<button class="tab-btn active" onclick="switchTab(this,'tab-basic')">Run base</button>
<button class="tab-btn" onclick="switchTab(this,'tab-bench')">Benchmark</button>
<button class="tab-btn" onclick="switchTab(this,'tab-server')">Server mode</button>
</div>
<div id="tab-basic" class="tab-panel active">
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Inferenza con TurboQuant KV cache attivo su GPU AMD</span>
./build/bin/llama-cli <span class="flag">-m</span> ~/llm-inference/models/qwen2.5-7b-instruct-q4_k_m.gguf <span class="flag">-ngl</span> 99 \ <span class="cm"># offload tutti i layer su GPU</span>
<span class="flag">--cache-type-k</span> tq1_0 \ <span class="cm"># TurboQuant ~1-bit keys</span>
<span class="flag">--cache-type-v</span> tq4_0 \ <span class="cm"># TurboQuant 4-bit values</span>
<span class="flag">-c</span> 16384 \ <span class="cm"># context window 16K token</span>
<span class="flag">-n</span> 256 \ <span class="cm"># genera 256 token</span>
<span class="flag">-p</span> <span class="str">"Ciao! Spiegami cos'è il KV cache in un LLM."</span></code></pre>
</div>
<div class="alert alert-info" style="margin-top:var(--space-3)">
<span class="alert-icon">💡</span>
<div><strong>Tipi KV cache disponibili:</strong> <code>f16</code> (standard), <code>q8_0</code>, <code>q4_0</code>, <code>tq1_0</code> (TurboQuant ~1-bit), <code>tq4_0</code> (TurboQuant 4-bit). Combinare <code>tq1_0</code> per K e <code>tq4_0</code> per V dà il miglior rapporto qualità/memoria.</div>
</div>
</div>
<div id="tab-bench" class="tab-panel">
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Benchmark comparativo: standard f16 vs TurboQuant</span>
<span class="cm"># Test 1: KV cache standard F16 (baseline)</span>
./build/bin/llama-bench <span class="flag">-m</span> ~/llm-inference/models/qwen2.5-7b-instruct-q4_k_m.gguf <span class="flag">-ngl</span> 99 <span class="flag">--cache-type-k</span> f16 <span class="flag">--cache-type-v</span> f16 <span class="flag">-c</span> 8192 <span class="flag">-n</span> 128 <span class="flag">-r</span> 3 <span class="cm"># ripeti 3 volte per media</span>
<span class="cm"># Test 2: TurboQuant attivo</span>
./build/bin/llama-bench <span class="flag">-m</span> ~/llm-inference/models/qwen2.5-7b-instruct-q4_k_m.gguf <span class="flag">-ngl</span> 99 <span class="flag">--cache-type-k</span> tq1_0 <span class="flag">--cache-type-v</span> tq4_0 <span class="flag">-c</span> 8192 <span class="flag">-n</span> 128 <span class="flag">-r</span> 3
<span class="cm"># Monitor VRAM in tempo reale (apri in un secondo terminale)</span>
<span class="cmd">watch</span> <span class="flag">-n 0.5</span> rocm-smi <span class="flag">--showmeminfo</span> vram</code></pre>
</div>
</div>
<div id="tab-server" class="tab-panel">
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Server OpenAI-compatibile con TurboQuant abilitato</span>
<span class="cm"># context 32K — qui TurboQuant fa la differenza!</span>
./build/bin/llama-server <span class="flag">-m</span> ~/llm-inference/models/qwen2.5-7b-instruct-q4_k_m.gguf <span class="flag">-ngl</span> 99 <span class="flag">--cache-type-k</span> tq1_0 <span class="flag">--cache-type-v</span> tq4_0 <span class="flag">-c</span> 32768 \
<span class="flag">--host</span> 0.0.0.0 <span class="flag">--port</span> 8080
<span class="cm"># Test rapido con curl (in un altro terminale)</span>
<span class="cmd">curl</span> <span class="flag">-s</span> http://localhost:8080/v1/chat/completions <span class="flag">-H</span> <span class="str">"Content-Type: application/json"</span> <span class="flag">-d</span> <span class="str">'{"model":"local","messages":[{"role":"user","content":"Ciao!"}],"max_tokens":100}'</span> | python3 <span class="flag">-m json.tool</span></code></pre>
</div>
</div>
</div>
</div>
</div>
</div>
</section>
<!-- RISULTATI ATTESI -->
<section class="section" id="test">
<span class="section-label">Risultati attesi</span>
<h2>Cosa aspettarsi su ROCm</h2>
<p class="section-desc">Benchmark indicativi basati su test della community (aprile 2026). I valori ROCm sono stimati a partire dai dati CUDA applicando il gap storico di ~15-25%.</p>
<div class="table-wrap">
<table>
<thead>
<tr>
<th>Config KV cache</th>
<th>VRAM (7B, 8K ctx)</th>
<th>TPS decode</th>
<th>Qualità output</th>
<th>Stato ROCm</th>
</tr>
</thead>
<tbody>
<tr>
<td><code>f16 / f16</code> (baseline)</td>
<td>~6.5 GB</td>
<td>100% (ref)</td>
<td><span class="pill pill-ok">Perfetta</span></td>
<td><span class="pill pill-ok">Stabile</span></td>
</tr>
<tr>
<td><code>q8_0 / q8_0</code></td>
<td>~5.2 GB (−20%)</td>
<td>~105%</td>
<td><span class="pill pill-ok">Ottima</span></td>
<td><span class="pill pill-ok">Stabile</span></td>
</tr>
<tr>
<td><code>q4_0 / q4_0</code></td>
<td>~3.8 GB (−42%)</td>
<td>~108%</td>
<td><span class="pill pill-warn">Buona</span></td>
<td><span class="pill pill-ok">Stabile</span></td>
</tr>
<tr>
<td><code>tq4_0 / tq4_0</code></td>
<td>~2.4 GB (−63%)</td>
<td>~96%</td>
<td><span class="pill pill-warn">Buona</span></td>
<td><span class="pill pill-warn">Sperimentale</span></td>
</tr>
<tr>
<td><code>tq1_0 / tq4_0</code> (max compression)</td>
<td>~1.1 GB (−83%)</td>
<td>~85-92%</td>
<td><span class="pill pill-warn">Discreta</span></td>
<td><span class="pill pill-warn">Sperimentale</span></td>
</tr>
</tbody>
</table>
</div>
<p style="color:var(--color-text-muted);font-size:var(--text-xs);margin-top:var(--space-2)">* I dati ROCm sono stime. Su Metal (Apple Silicon) si registra un calo TPS maggiore (~50%) per un bug noto. CUDA mostra +22.8% decode a 32K context.</p>
</section>
<!-- TROUBLESHOOTING -->
<section class="section" id="troubleshoot">
<span class="section-label">Problemi comuni</span>
<h2>Troubleshooting ROCm + TurboQuant</h2>
<p class="section-desc">Errori frequenti e come risolverli.</p>
<details>
<summary>❌ Errore: <code>HIPCC not found</code> o <code>hipconfig: command not found</code></summary>
<div class="details-body">
<p style="color:var(--color-text-muted);font-size:var(--text-sm);margin-bottom:var(--space-3)">ROCm non è nel PATH o non è installato correttamente.</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Aggiungi ROCm al PATH</span>
<span class="cmd">echo</span> <span class="str">'export PATH=$PATH:/opt/rocm/bin:/opt/rocm/hip/bin'</span> >> ~/.bashrc
<span class="cmd">echo</span> <span class="str">'export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/opt/rocm/lib'</span> >> ~/.bashrc
<span class="cmd">source</span> ~/.bashrc
<span class="cm"># Verifica</span>
<span class="cmd">which</span> hipconfig && hipconfig <span class="flag">-v</span></code></pre>
</div>
</div>
</details>
<details>
<summary>❌ Errore: <code>GPU not found</code> o <code>Device 0: gfx000</code></summary>
<div class="details-body">
<p style="color:var(--color-text-muted);font-size:var(--text-sm);margin-bottom:var(--space-3)">L'utente non ha i permessi sul device o il driver non carica correttamente.</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Controlla che l'utente sia nel gruppo render/video</span>
<span class="cmd">groups</span> $USER
<span class="cm"># Se mancano, aggiungili e rilogga</span>
<span class="cmd">sudo usermod -aG render,video</span> $USER
<span class="cmd">logout</span> <span class="cm"># poi rilogga</span>
<span class="cm"># Verifica device files</span>
<span class="cmd">ls -la</span> /dev/kfd /dev/dri/renderD*</code></pre>
</div>
</div>
</details>
<details>
<summary>❌ Errore CMake: <code>DGGML_HIP_TURBOQUANT not recognized</code></summary>
<div class="details-body">
<p style="color:var(--color-text-muted);font-size:var(--text-sm);margin-bottom:var(--space-3)">Alcune versioni del fork usano nomi di flag diversi. Prova queste alternative:</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Alternativa 1</span>
<span class="cmd">cmake</span> -S . -B build <span class="flag">-DGGML_HIP=ON -DLLAMA_TURBOQUANT=ON</span> ...
<span class="cm"># Alternativa 2: cerca il flag corretto nel CMakeLists</span>
<span class="cmd">grep</span> <span class="flag">-i</span> <span class="str">"turboquant"</span> CMakeLists.txt
<span class="cmd">grep</span> <span class="flag">-i</span> <span class="str">"turboquant"</span> ggml/CMakeLists.txt</code></pre>
</div>
</div>
</details>
<details>
<summary>⚠️ Performance peggiori del previsto (TPS molto bassi)</summary>
<div class="details-body">
<p style="color:var(--color-text-muted);font-size:var(--text-sm);margin-bottom:var(--space-3)">Su ROCm il backend TurboQuant non è ancora ottimizzato quanto CUDA. Prova queste mitigazioni:</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Usa tq4_0 invece di tq1_0 per K — meno compressione, più veloce</span>
<span class="flag">--cache-type-k</span> tq4_0 <span class="flag">--cache-type-v</span> tq4_0
<span class="cm"># Riduci il context se la VRAM non è un problema</span>
<span class="flag">-c</span> 4096
<span class="cm"># Forza un numero specifico di thread CPU per il dequantize</span>
<span class="flag">-t</span> 8
<span class="cm"># Monitora l'utilizzo GPU in tempo reale</span>
<span class="cmd">rocm-smi</span> <span class="flag">--showuse --showmeminfo vram -d 0</span></code></pre>
</div>
</div>
</details>
<details>
<summary>🔄 Alternativa: applicare patch manualmente su llama.cpp ufficiale</summary>
<div class="details-body">
<p style="color:var(--color-text-muted);font-size:var(--text-sm);margin-bottom:var(--space-3)">Se preferisci partire dal repo ufficiale llama.cpp e applicare la patch TurboQuant manualmente:</p>
<div class="code-block">
<div class="code-header"><span class="code-lang">bash</span><button class="copy-btn" onclick="copyCode(this)">📋 Copia</button></div>
<pre><code><span class="cm"># Clona llama.cpp ufficiale</span>
<span class="cmd">git clone</span> https://github.com/ggml-org/llama.cpp.git
<span class="cmd">cd</span> llama.cpp
<span class="cm"># Aggiungi il fork come remote</span>
<span class="cmd">git remote add</span> thetom https://github.com/TheTom/llama-cpp-turboquant.git
<span class="cmd">git fetch</span> thetom
<span class="cm"># Cherry-pick solo i commit TurboQuant (vedi git log thetom/main)</span>
<span class="cmd">git log</span> thetom/main <span class="flag">--oneline</span> | <span class="cmd">grep</span> <span class="flag">-i</span> turboquant
<span class="cm"># poi: git cherry-pick &lt;commit-hash&gt;</span>
<span class="cm"># Build con ROCm standard</span>
<span class="cmd">cmake</span> -S . -B build <span class="flag">-DGGML_HIP=ON</span> <span class="flag">-DAMDGPU_TARGETS=$AMDGPU_TARGETS</span> <span class="flag">-DCMAKE_BUILD_TYPE=Release</span>
<span class="cmd">cmake</span> <span class="flag">--build</span> build <span class="flag">-j$(nproc)</span></code></pre>
</div>
</div>
</details>
</section>
</div>
<footer>
Tutorial generato ad aprile 2026 · Basato su fork jagsan-cyber/turboquant-rocm-llamacpp · Stato: sperimentale
</footer>
<script>
// Theme toggle
(function () {
  // Theme toggle: the page always starts in dark mode; clicking the
  // [data-theme-toggle] control flips the data-theme attribute on <html>
  // and swaps the button icon (moon for dark, sun for light).
  const root = document.documentElement;
  const toggle = document.querySelector('[data-theme-toggle]');
  const MOON_ICON = '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><path d="M21 12.79A9 9 0 1 1 11.21 3 7 7 0 0 0 21 12.79z"/></svg>';
  const SUN_ICON = '<svg width="16" height="16" viewBox="0 0 24 24" fill="none" stroke="currentColor" stroke-width="2"><circle cx="12" cy="12" r="5"/><path d="M12 1v2M12 21v2M4.22 4.22l1.42 1.42M18.36 18.36l1.42 1.42M1 12h2M21 12h2M4.22 19.78l1.42-1.42M18.36 5.64l1.42-1.42"/></svg>';
  let currentTheme = 'dark';
  root.setAttribute('data-theme', currentTheme);
  // The toggle control may be absent from the markup; the theme attribute
  // is still applied in that case.
  if (!toggle) return;
  toggle.addEventListener('click', () => {
    currentTheme = currentTheme === 'dark' ? 'light' : 'dark';
    root.setAttribute('data-theme', currentTheme);
    toggle.innerHTML = currentTheme === 'dark' ? MOON_ICON : SUN_ICON;
  });
})();
// Tab switcher
// Activate one tab within the nearest .tabs container: deactivate every
// tab button and panel in that container, then mark the clicked button
// and the panel identified by panelId as active.
function switchTab(btn, panelId) {
  const tabsRoot = btn.closest('.tabs');
  for (const button of tabsRoot.querySelectorAll('.tab-btn')) {
    button.classList.remove('active');
  }
  for (const panel of tabsRoot.querySelectorAll('.tab-panel')) {
    panel.classList.remove('active');
  }
  btn.classList.add('active');
  document.getElementById(panelId).classList.add('active');
}
// Copy code
// Copy the text of the enclosing .code-block's <pre> to the clipboard and
// show transient feedback on the triggering button.
// Robustness fix: navigator.clipboard only exists in secure contexts
// (HTTPS or localhost) and writeText() can reject (permission denied,
// document not focused). The original silently did nothing in both cases;
// now the button briefly shows an error state instead.
function copyCode(btn) {
  const pre = btn.closest('.code-block').querySelector('pre');
  const text = pre.innerText;
  // Restore the button to its idle label/state after the feedback window.
  const restore = () => {
    btn.textContent = '📋 Copia';
    btn.classList.remove('copied');
  };
  if (!navigator.clipboard) {
    // Non-secure context (e.g. plain http:// on a LAN): Clipboard API unavailable.
    btn.textContent = '❌ Errore';
    setTimeout(restore, 2000);
    return;
  }
  navigator.clipboard.writeText(text).then(() => {
    btn.textContent = '✅ Copiato';
    btn.classList.add('copied');
    setTimeout(restore, 2000);
  }).catch(() => {
    btn.textContent = '❌ Errore';
    setTimeout(restore, 2000);
  });
}
</script>
</body>
</html>