Single-rack production inference blueprint (2026)

Spec a single 19-inch rack for production LLM inference. Use HGX B200 in an OEM 4U chassis as the default, DGX B200 if you want the appliance, or GB200 NVL72 if your facility is liquid-ready.

Job-to-be-done · Spec a single rack for company-tier LLM inference. Decide air vs. liquid cooling, and whether to own or rent.

const{Fragment:e,jsx:r,jsxs:n}=arguments[0];function _createMdxContent(t){const i={h2:"h2",li:"li",p:"p",strong:"strong",table:"table",tbody:"tbody",td:"td",th:"th",thead:"thead",tr:"tr",ul:"ul",...t.components};return n(e,{children:[r(i.h2,{children:"The job"}),"\n",r(i.p,{children:"You are an infra lead. The CFO asked why the inference bill is what it is, and you have to answer with a spreadsheet. You serve a production LLM workload to internal users or paying customers. Latency targets are written down. Utilization is high enough that renting Hopper-class instances on demand stopped being clever about a quarter ago."}),"\n",r(i.p,{children:"This guide picks parts for one 19-inch rack. The question it answers: do you colocate your own Blackwell-class gear, or keep renting? The short version is that owning wins above roughly 70% utilization on a horizon longer than a year. Below that, rent."}),"\n",r(i.p,{children:"This guide is not for you if:"}),"\n",n(i.ul,{children:["\n",r(i.li,{children:"You need multi-rack scale-out. NVLink is rack-bounded; spanning racks is a different blueprint."}),"\n",r(i.li,{children:"You're pretraining a frontier model. This rack serves; it does not pretrain."}),"\n",r(i.li,{children:"Your facility can't deliver 35+ kW per cabinet with rear-door cooling. 
Different conversation."}),"\n"]}),"\n",r(i.h2,{children:"The build"}),"\n",n(i.table,{children:[r(i.thead,{children:n(i.tr,{children:[r(i.th,{children:"Item"}),r(i.th,{children:"Pick"}),r(i.th,{children:"Why"})]})}),n(i.tbody,{children:[n(i.tr,{children:[r(i.td,{children:"Compute"}),r(i.td,{children:"2-3x OEM 4U servers with NVIDIA HGX B200 baseboard"}),r(i.td,{children:"8x B200 per node, 144 PFLOPS FP4 sparse, air-cooled, OEM choice"})]}),n(i.tr,{children:[r(i.td,{children:"Memory"}),r(i.td,{children:"1,400 GB HBM3e per baseboard; 2 TB system DRAM"}),r(i.td,{children:"Fits trillion-parameter weights resident; enough host RAM for KV offload"})]}),n(i.tr,{children:[r(i.td,{children:"Networking"}),r(i.td,{children:"ConnectX-7 NICs at 400 Gbps; BlueField-3 DPUs"}),r(i.td,{children:"East-west fabric for tensor parallel; DPU offloads storage and security"})]}),n(i.tr,{children:[r(i.td,{children:"Storage"}),r(i.td,{children:"NVMe all-flash tier, separate from compute nodes"}),r(i.td,{children:"Weights and logs survive a node swap; keeps the GPU chassis simple"})]}),n(i.tr,{children:[r(i.td,{children:"Power"}),r(i.td,{children:"2x 30A 208V three-phase PDUs, A+B feeds"}),r(i.td,{children:"One HGX node draws ~10 kW; three nodes plus fabric clears 35 kW"})]}),n(i.tr,{children:[r(i.td,{children:"Cooling"}),r(i.td,{children:"Rear-door heat exchanger or hot-aisle containment"}),r(i.td,{children:"Air-cooled HGX/DGX B200 lets you skip facility water plumbing"})]}),n(i.tr,{children:[r(i.td,{children:"Software"}),r(i.td,{children:"Triton, vLLM or TensorRT-LLM, Kubernetes, MIG"}),r(i.td,{children:"Open serving stack; MIG partitions a B200 for smaller models"})]})]})]}),"\n",r(i.h2,{children:"Numbers"}),"\n",n(i.ul,{children:["\n",n(i.li,{children:[r(i.strong,{children:"HGX B200 baseboard"})," — 8 GPUs, 1,400 GB total HBM3e, 14.4 TB/s NVLink, 144 PFLOPS FP4 sparse."]}),"\n",n(i.li,{children:[r(i.strong,{children:"DGX B200 appliance"})," — 10U, 14.3 kW max, 2 TB system memory, 4 OSFP 
ports at 400 Gbps each, 2 BlueField-3 DPUs."]}),"\n",n(i.li,{children:[r(i.strong,{children:"GB200 NVL72"})," — 72 Blackwell GPUs and 36 Grace CPUs in one rack, 13.4 TB HBM3e, ~120 kW per rack, liquid-cooled."]}),"\n",n(i.li,{children:[r(i.strong,{children:"DGX H200"})," — 8 GPUs, 1,128 GB HBM3e, 10.2 kW max. The non-liquid Hopper fallback."]}),"\n"]}),"\n",r(i.h2,{children:"Tradeoffs"}),"\n",n(i.ul,{children:["\n",n(i.li,{children:[r(i.strong,{children:"GB200 NVL72 instead"})," — Treat the whole rack as one platform. Top throughput per rack-U and the right call if you train as well as serve. The catch is liquid: ~120 kW and facility water are non-negotiable. If your colo cannot deliver chilled water to the cabinet, this is a non-starter regardless of price."]}),"\n",n(i.li,{children:[r(i.strong,{children:"Hopper (DGX H200) instead"})," — 10.2 kW per node fits a standard air-cooled cabinet without rear-door heat exchangers. Memory per system is 1,128 GB HBM3e, which is enough for most production serving workloads today. You give up the FP4 throughput Blackwell brings; if your workload is INT8 or FP8, the gap is smaller than the spec sheet suggests."]}),"\n",n(i.li,{children:[r(i.strong,{children:"Cloud rental instead"})," — Reserved Blackwell capacity from a hyperscaler is the right call below ~70% utilization or under a one-year horizon. Above that, the rack pays itself off and the next year is gross margin. Run the math with your actual contract pricing, your actual colo quote, and a three-year depreciation. If the answer is close, rent — owning is only worth it when the answer is obvious."]}),"\n"]}),"\n",r(i.h2,{children:"What this doesn't get you"}),"\n",n(i.ul,{children:["\n",r(i.li,{children:"Multi-rack scale-out. NVLink is rack-bounded; spanning racks means InfiniBand or Ethernet and a different blueprint."}),"\n",r(i.li,{children:"Training a frontier model from scratch. 
This rack serves; it does not pretrain."}),"\n",r(i.li,{children:"A solved facility problem. Power density and heat rejection are the gating decisions, not the GPU SKU."}),"\n"]})]})}return{default:function(e={}){const{wrapper:n}=e.components||{};return n?r(n,{...e,children:r(_createMdxContent,{...e})}):_createMdxContent(e)}};