diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 8abbde44c..7fabdc193 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -141,6 +141,14 @@ jobs:
     env:
       GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
     steps:
+      # MLX presets only: grow the pagefile so brief RAM spikes from NVCC compiles fit
+      - name: Increase pagefile to 200 GB
+        if: startsWith(matrix.preset, 'MLX ')
+        uses: al-cheb/configure-pagefile-action@v1.5
+        with:
+          disk-root: 'D:'
+          minimum-size: 16GB
+          maximum-size: 200GB
       - name: Install system dependencies
         run: |
           choco install -y --no-progress ccache ninja
@@ -237,7 +245,7 @@ jobs:
           Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
           cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} --install-prefix "$((pwd).Path)\dist\${{ matrix.os }}-${{ matrix.arch }}"
-          cmake --build --parallel ([Environment]::ProcessorCount) --preset "${{ matrix.preset }}"
+          cmake --build --preset "${{ matrix.preset }}" -- -l $([Environment]::ProcessorCount)
           cmake --install build --component "${{ startsWith(matrix.preset, 'MLX ') && 'MLX' || startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || startsWith(matrix.preset, 'Vulkan') && 'Vulkan' || 'CPU' }}" --strip
           Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue
         env:
@@ -380,20 +388,36 @@ jobs:
             dist/*.ps1
             dist/OllamaSetup.exe
 
-  linux-build:
+  # Pre-build each Dockerfile stage on its own runner in parallel and push the
+  # resulting layers to a per-stage registry cache.  The downstream
+  # docker-build-push job then assembles the image from those cache hits.
+  linux-depends:
     strategy:
       matrix:
         include:
-          - os: linux
-            arch: amd64
-            target: archive
-          - os: linux
-            arch: amd64
-            target: rocm
-          - os: linux
-            arch: arm64
-            target: archive
-    runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
+          - arch: amd64
+            target: cpu
+          - arch: amd64
+            target: cuda-12
+          - arch: amd64
+            target: cuda-13
+          - arch: amd64
+            target: mlx
+          - arch: amd64
+            target: rocm-7
+          - arch: amd64
+            target: vulkan
+          - arch: arm64
+            target: cpu
+          - arch: arm64
+            target: cuda-12
+          - arch: arm64
+            target: cuda-13
+          - arch: arm64
+            target: jetpack-5
+          - arch: arm64
+            target: jetpack-6
+    runs-on: ${{ matrix.arch == 'arm64' && 'linux-arm64' || 'linux' }}
     environment: release
     needs: setup-environment
     env:
@@ -401,53 +425,53 @@ jobs:
     steps:
       - uses: actions/checkout@v4
       - uses: docker/setup-buildx-action@v3
+      - uses: docker/login-action@v3
+        with:
+          username: ${{ vars.DOCKER_USER }}
+          password: ${{ secrets.DOCKER_ACCESS_TOKEN }}
+      # mlx target only: add swap so momentary RAM spikes from NVCC compiles fit
+      - name: Increase Linux swap to 200 GB
+        if: matrix.target == 'mlx'
+        shell: bash
+        run: |
+          set -e
+          swap_gb=200
+          swap_file=/swapfile-mlx
+          if [ -f "$swap_file" ]; then  # drop any swapfile left by a previous run
+              sudo swapoff "$swap_file" 2>/dev/null || true
+              sudo rm -f "$swap_file"
+          fi
+          if ! sudo fallocate -l ${swap_gb}G "$swap_file" 2>/dev/null; then
+              echo "fallocate unsupported, falling back to dd"
+              sudo dd if=/dev/zero of="$swap_file" bs=1M count=$((swap_gb * 1024))
+          fi
+          sudo chmod 600 "$swap_file"
+          sudo mkswap "$swap_file"
+          sudo swapon "$swap_file"
+          swapon --show  # log the resulting swap devices
+          free -h        # log the resulting memory totals
       - uses: docker/build-push-action@v6
         with:
           context: .
-          platforms: ${{ matrix.os }}/${{ matrix.arch }}
+          platforms: linux/${{ matrix.arch }}
           target: ${{ matrix.target }}
+          provenance: false
+          sbom: false
           build-args: |
-            GOFLAGS=${{ env.GOFLAGS }}
             CGO_CFLAGS=${{ env.CGO_CFLAGS }}
             CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }}
-          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
-          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
-          cache-to: type=inline
-      - name: Deduplicate CUDA libraries
-        run: |
-          ./scripts/deduplicate_cuda_libs.sh dist/${{ matrix.os }}-${{ matrix.arch }}
-      - run: |
-          for COMPONENT in bin/* lib/ollama/*; do
-            case "$COMPONENT" in
-              bin/ollama*)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/vulkan*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
-              lib/ollama/mlx*)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-mlx.tar.in ;;
-              lib/ollama/include*)       echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-mlx.tar.in ;;
-              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
-              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
-              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
-            esac
-          done
-        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
-      - run: |
-          echo "Manifests"
-          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do
-            echo $ARCHIVE
-            cat $ARCHIVE
-          done
-      - run: |
-          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
-            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | zstd --ultra -22 -T0 >$(basename ${ARCHIVE//.*/}.tar.zst);
-          done
-      - uses: actions/upload-artifact@v4
-        with:
-          name: bundles-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.target }}
-          path: |
-            *.tar.zst
+            GOFLAGS=${{ env.GOFLAGS }}
+            APT_MIRROR=http://azure.archive.ubuntu.com/ubuntu
+            OLLAMA_MLX_BUILD_JOBS=16
+            OLLAMA_MLX_NVCC_THREADS=6
+          cache-from: |
+            type=registry,ref=ollama/release:cache-${{ matrix.arch }}-${{ matrix.target }}
+            type=registry,ref=${{ vars.DOCKER_REPO }}:latest
+          cache-to: type=registry,ref=ollama/release:cache-${{ matrix.arch }}-${{ matrix.target }},mode=max
 
   # Build each Docker variant (OS, arch, and flavor) separately. Using QEMU is unreliable and slower.
+  # Heavy stages were pre-built by linux-depends; this job is cache-hit-only for those layers
+  # and just assembles, runs the Go build, and pushes the final image.
   docker-build-push:
     strategy:
       matrix:
@@ -459,6 +483,15 @@ jobs:
               CGO_CXXFLAGS
               GOFLAGS
               APT_MIRROR=http://azure.archive.ubuntu.com/ubuntu
+              OLLAMA_MLX_BUILD_JOBS=16
+              OLLAMA_MLX_NVCC_THREADS=6
+            cache-from: |
+              type=registry,ref=${{ vars.DOCKER_REPO }}:latest
+              type=registry,ref=ollama/release:cache-arm64-cpu
+              type=registry,ref=ollama/release:cache-arm64-cuda-12
+              type=registry,ref=ollama/release:cache-arm64-cuda-13
+              type=registry,ref=ollama/release:cache-arm64-jetpack-5
+              type=registry,ref=ollama/release:cache-arm64-jetpack-6
           - os: linux
             arch: amd64
             build-args: |
@@ -466,6 +499,15 @@ jobs:
               CGO_CXXFLAGS
               GOFLAGS
               APT_MIRROR=http://azure.archive.ubuntu.com/ubuntu
+              OLLAMA_MLX_BUILD_JOBS=16
+              OLLAMA_MLX_NVCC_THREADS=6
+            cache-from: |
+              type=registry,ref=${{ vars.DOCKER_REPO }}:latest
+              type=registry,ref=ollama/release:cache-amd64-cpu
+              type=registry,ref=ollama/release:cache-amd64-cuda-12
+              type=registry,ref=ollama/release:cache-amd64-cuda-13
+              type=registry,ref=ollama/release:cache-amd64-mlx
+              type=registry,ref=ollama/release:cache-amd64-vulkan
           - os: linux
             arch: amd64
             suffix: '-rocm'
@@ -475,9 +517,15 @@ jobs:
               GOFLAGS
               FLAVOR=rocm
               APT_MIRROR=http://azure.archive.ubuntu.com/ubuntu
+              OLLAMA_MLX_BUILD_JOBS=16
+              OLLAMA_MLX_NVCC_THREADS=6
+            cache-from: |
+              type=registry,ref=${{ vars.DOCKER_REPO }}:latest
+              type=registry,ref=ollama/release:cache-amd64-cpu
+              type=registry,ref=ollama/release:cache-amd64-rocm-7
     runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }}
     environment: release
-    needs: setup-environment
+    needs: [setup-environment, linux-depends]
     env:
       GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }}
     steps:
@@ -492,9 +540,11 @@ jobs:
         with:
           context: .
           platforms: ${{ matrix.os }}/${{ matrix.arch }}
+          provenance: false
+          sbom: false
           build-args: ${{ matrix.build-args }}
           outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true
-          cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest
+          cache-from: ${{ matrix.cache-from }}
           cache-to: type=inline
       - run: |
           mkdir -p ${{ matrix.os }}-${{ matrix.arch }}
@@ -505,6 +555,53 @@ jobs:
           name: digest-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }}
           path: |
             ${{ runner.temp }}/${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }}.txt
+      # Re-run buildx with --target archive against buildkit's local cache to
+      # extract the release directory layout.  All upstream stages were just
+      # built above, so this is a cache-hit-only pass that just writes files.
+      - uses: docker/build-push-action@v6
+        with:
+          context: .
+          platforms: ${{ matrix.os }}/${{ matrix.arch }}
+          target: archive
+          provenance: false
+          sbom: false
+          build-args: ${{ matrix.build-args }}
+          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
+          cache-from: ${{ matrix.cache-from }}
+      - name: Deduplicate CUDA libraries
+        run: |
+          ./scripts/deduplicate_cuda_libs.sh dist/${{ matrix.os }}-${{ matrix.arch }}
+      - run: |
+          for COMPONENT in bin/* lib/ollama/*; do
+            case "$COMPONENT" in
+              bin/ollama*)               echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/*.so*)          echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_v*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/vulkan*)        echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/mlx*)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-mlx.tar.in ;;
+              lib/ollama/include*)       echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_jetpack5)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
+              lib/ollama/cuda_jetpack6)  echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
+              lib/ollama/rocm)           echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
+            esac
+          done
+        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
+      - run: |
+          echo "Manifests"
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do
+            echo $ARCHIVE
+            cat $ARCHIVE
+          done
+      - run: |
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do  # compress every bundle concurrently
+            ( set -o pipefail; tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | zstd -19 -T0 >$(basename ${ARCHIVE//.*/}.tar.zst) ) & PIDS="$PIDS $!"
+          done
+          for PID in $PIDS; do wait "$PID"; done  # bare `wait` always returns 0; waiting per PID lets a failed tar/zstd fail the step
+      - uses: actions/upload-artifact@v4
+        with:
+          name: bundles-${{ matrix.os }}-${{ matrix.arch }}${{ matrix.suffix }}
+          path: |
+            *.tar.zst
 
   # Merge Docker images for the same flavor into a single multi-arch manifest
   docker-merge-push:
@@ -544,7 +641,7 @@ jobs:
   release:
     runs-on: ubuntu-latest
     environment: release
-    needs: [darwin-build, windows-app, linux-build]
+    needs: [darwin-build, windows-app, docker-build-push]
     permissions:
       contents: write
     env:
diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml
index 56fd22880..4ae685642 100644
--- a/.github/workflows/test.yaml
+++ b/.github/workflows/test.yaml
@@ -65,7 +65,7 @@ jobs:
           - preset: 'MLX CUDA 13'
             container: nvidia/cuda:13.0.0-devel-ubuntu22.04
             extra-packages: libcudnn9-dev-cuda-13 libopenblas-dev liblapack-dev liblapacke-dev git curl
-            flags: '-DCMAKE_CUDA_ARCHITECTURES=87 -DBLAS_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu -DLAPACK_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu'
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=87 -DMLX_CUDA_ARCHITECTURES=80-virtual -DBLAS_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu -DLAPACK_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu'
             install-go: true
     runs-on: linux
     container: ${{ matrix.container }}
@@ -105,7 +105,7 @@ jobs:
           key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }}
       - run: |
           cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
-          cmake --build --preset "${{ matrix.preset }}" --parallel
+          cmake --build --preset "${{ matrix.preset }}" -- -l $(nproc)
 
   windows:
     needs: [changes]
@@ -134,7 +134,7 @@ jobs:
           - preset: 'MLX CUDA 13'
             install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe
             cudnn-install: https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.18.1.3_cuda13-archive.zip
-            flags: '-DCMAKE_CUDA_ARCHITECTURES=80'
+            flags: '-DCMAKE_CUDA_ARCHITECTURES=80 -DMLX_CUDA_ARCHITECTURES=80-virtual'
             cuda-components:
               - '"cudart"'
               - '"nvcc"'
@@ -240,7 +240,7 @@ jobs:
           Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll'
           Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation  -DevCmdArguments '-arch=x64 -no_logo'
           cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }}
-          cmake --build --parallel --preset "${{ matrix.preset }}"
+          cmake --build --preset "${{ matrix.preset }}" -- -l $([Environment]::ProcessorCount)
         env:
           CMAKE_GENERATOR: Ninja
 
diff --git a/CMakeLists.txt b/CMakeLists.txt
index e2af188d3..f402d2382 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -228,15 +228,20 @@ if(MLX_ENGINE)
         list(APPEND MLX_INCLUDE_REGEXES "^dl\\.dll$")
     endif()
 
+    # Split mlx/mlxc libraries from runtime deps to avoid stripping deps
     install(TARGETS mlx mlxc
-        RUNTIME_DEPENDENCIES
-            DIRECTORIES ${MLX_RUNTIME_DIRS}
-            PRE_INCLUDE_REGEXES ${MLX_INCLUDE_REGEXES}
-            PRE_EXCLUDE_REGEXES ".*"
+        RUNTIME_DEPENDENCY_SET mlx_runtime_deps
         RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
         LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
         FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX
     )
+    install(RUNTIME_DEPENDENCY_SET mlx_runtime_deps
+        DIRECTORIES ${MLX_RUNTIME_DIRS}
+        PRE_INCLUDE_REGEXES ${MLX_INCLUDE_REGEXES}
+        PRE_EXCLUDE_REGEXES ".*"
+        RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX_VENDOR
+        LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX_VENDOR
+    )
 
     if(TARGET jaccl)
         install(TARGETS jaccl
@@ -366,7 +371,7 @@ if(MLX_ENGINE)
         if(MLX_CUDA_LIBS)
             install(FILES ${MLX_CUDA_LIBS}
                 DESTINATION ${OLLAMA_INSTALL_DIR}
-                COMPONENT MLX)
+                COMPONENT MLX_VENDOR)
         endif()
     endif()
 endif()
diff --git a/CMakePresets.json b/CMakePresets.json
index 0fdbc1442..fd647e4b7 100644
--- a/CMakePresets.json
+++ b/CMakePresets.json
@@ -112,7 +112,7 @@
       "name": "MLX CUDA 13",
       "inherits": [ "MLX", "CUDA 13" ],
       "cacheVariables": {
-        "MLX_CUDA_ARCHITECTURES": "86;89;90;90a;100;103;75-virtual;80-virtual;110-virtual;120-virtual;121-virtual",
+        "MLX_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual",
         "OLLAMA_RUNNER_DIR": "mlx_cuda_v13"
       }
     }
diff --git a/Dockerfile b/Dockerfile
index 0485b09c4..461c8585e 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -144,6 +144,9 @@ RUN --mount=type=cache,target=/root/.ccache \
 
 FROM base AS mlx
 ARG CUDA13VERSION=13.0
+# OLLAMA_MLX_BUILD_JOBS: passed to ninja as -j; empty -> ninja is gated only by load average (-l $(nproc))
+ARG OLLAMA_MLX_BUILD_JOBS=
+ARG OLLAMA_MLX_NVCC_THREADS=2
 RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-} \
     && dnf install -y openblas-devel lapack-devel \
     && dnf install -y libcudnn9-cuda-13 libcudnn9-devel-cuda-13 \
@@ -170,9 +173,10 @@ RUN --mount=type=cache,target=/root/.ccache \
     && if [ -f /tmp/local-mlx-c/CMakeLists.txt ]; then \
         export OLLAMA_MLX_C_SOURCE=/tmp/local-mlx-c; \
     fi \
-    && cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \
-        && cmake --build --preset 'MLX CUDA 13' -- -l $(nproc) \
-        && cmake --install build --component MLX --strip
+    && cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas -DCMAKE_CUDA_FLAGS="-t ${OLLAMA_MLX_NVCC_THREADS}" \
+        && cmake --build --preset 'MLX CUDA 13' -- -l $(nproc) ${OLLAMA_MLX_BUILD_JOBS:+-j ${OLLAMA_MLX_BUILD_JOBS}} \
+        && cmake --install build --component MLX --strip \
+        && cmake --install build --component MLX_VENDOR
 
 FROM base AS build
 WORKDIR /go/src/github.com/ollama/ollama
diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh
index e17525083..3cd712167 100755
--- a/scripts/build_darwin.sh
+++ b/scripts/build_darwin.sh
@@ -76,6 +76,7 @@ _build_darwin() {
             cmake --build $BUILD_DIR --target mlx mlxc -j
             cmake --install $BUILD_DIR --component CPU
             cmake --install $BUILD_DIR --component MLX
+            cmake --install $BUILD_DIR --component MLX_VENDOR
             # Override CGO flags to point to the amd64 build directory
             MLX_CGO_CFLAGS="-O3 -mmacosx-version-min=14.0"
             MLX_CGO_LDFLAGS="-ldl -lc++ -framework Accelerate -mmacosx-version-min=14.0"
@@ -103,6 +104,7 @@ _build_darwin() {
             cmake --build $BUILD_DIR --target mlx mlxc --parallel
             _relink_mlx_metallib $BUILD_DIR
             cmake --install $BUILD_DIR --component MLX
+            cmake --install $BUILD_DIR --component MLX_VENDOR
 
             # Metal 4.x build (NAX-enabled, macOS 26+)
             # Only possible with Xcode 26+ SDK; skip on older toolchains.
@@ -124,6 +126,7 @@ _build_darwin() {
                     -DFETCHCONTENT_SOURCE_DIR_METAL_CPP=$V3_DEPS/metal_cpp-src
                 cmake --build $BUILD_DIR_V4 --target mlx mlxc --parallel
                 cmake --install $BUILD_DIR_V4 --component MLX
+                cmake --install $BUILD_DIR_V4 --component MLX_VENDOR
             else
                 status "Skipping MLX Metal v4 (SDK $SDK_MAJOR < 26, need Xcode 26+)"
             fi
diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh
index 5421d70c7..79d852672 100755
--- a/scripts/build_linux.sh
+++ b/scripts/build_linux.sh
@@ -1,9 +1,14 @@
 #!/bin/sh
 #
-# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder
+# Mac ARM users: Rosetta can be flaky, so use a remote x86 builder.
+# Use the docker-container driver with the bundled BuildKit GC config
+# (scripts/buildkitd.toml.example) for improved cache behavior.
 #
 # docker context create amd64 --docker host=ssh://mybuildhost
-# docker buildx create --name mybuilder amd64 --platform linux/amd64
+# docker buildx create --name mybuilder \
+#     --driver docker-container \
+#     --config ./scripts/buildkitd.toml.example \
+#     --bootstrap amd64 --platform linux/amd64
 # docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64
 # docker buildx use mybuilder
 
@@ -59,18 +64,28 @@ fi
 # buildx behavior changes for single vs. multiplatform
 echo "Compressing linux tar bundles..."
 if echo $PLATFORM | grep "," > /dev/null ; then
-        tar c -C ./dist/linux_arm64 --exclude cuda_jetpack5 --exclude cuda_jetpack6 . | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64.tar.zst
-        tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack5  | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64-jetpack5.tar.zst
-        tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack6  | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64-jetpack6.tar.zst
-        tar c -C ./dist/linux_amd64 --exclude rocm --exclude 'mlx*' --exclude include . | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64.tar.zst
-        tar c -C ./dist/linux_amd64 ./lib/ollama/rocm  | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64-rocm.tar.zst
-        tar c -C ./dist/linux_amd64 ./lib/ollama/mlx_cuda_v13 ./lib/ollama/include  | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64-mlx.tar.zst
+        tar c -C ./dist/linux_arm64 --exclude cuda_jetpack5 --exclude cuda_jetpack6 . | zstd -9 -T0 >./dist/ollama-linux-arm64.tar.zst
+        tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack5  | zstd -9 -T0 >./dist/ollama-linux-arm64-jetpack5.tar.zst
+        tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack6  | zstd -9 -T0 >./dist/ollama-linux-arm64-jetpack6.tar.zst
+        tar c -C ./dist/linux_amd64 --exclude rocm --exclude 'mlx*' . | zstd -9 -T0 >./dist/ollama-linux-amd64.tar.zst
+        tar c -C ./dist/linux_amd64 ./lib/ollama/rocm  | zstd -9 -T0 >./dist/ollama-linux-amd64-rocm.tar.zst
+        ( cd ./dist/linux_amd64 && tar c lib/ollama/mlx* ) | zstd -9 -T0 >./dist/ollama-linux-amd64-mlx.tar.zst
 elif echo $PLATFORM | grep "arm64" > /dev/null ; then
-        tar c -C ./dist/ --exclude cuda_jetpack5 --exclude cuda_jetpack6 bin lib | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64.tar.zst
-        tar c -C ./dist/ ./lib/ollama/cuda_jetpack5  | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64-jetpack5.tar.zst
-        tar c -C ./dist/ ./lib/ollama/cuda_jetpack6  | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64-jetpack6.tar.zst
+        tar c -C ./dist/ --exclude cuda_jetpack5 --exclude cuda_jetpack6 bin lib | zstd -9 -T0 >./dist/ollama-linux-arm64.tar.zst
+        tar c -C ./dist/ ./lib/ollama/cuda_jetpack5  | zstd -9 -T0 >./dist/ollama-linux-arm64-jetpack5.tar.zst
+        tar c -C ./dist/ ./lib/ollama/cuda_jetpack6  | zstd -9 -T0 >./dist/ollama-linux-arm64-jetpack6.tar.zst
 elif echo $PLATFORM | grep "amd64" > /dev/null ; then
-        tar c -C ./dist/ --exclude rocm --exclude 'mlx*' --exclude include bin lib | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64.tar.zst
-        tar c -C ./dist/ ./lib/ollama/rocm  | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64-rocm.tar.zst
-        tar c -C ./dist/ ./lib/ollama/mlx_cuda_v13 ./lib/ollama/include  | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64-mlx.tar.zst
+        tar c -C ./dist/ --exclude rocm --exclude 'mlx*' bin lib | zstd -9 -T0 >./dist/ollama-linux-amd64.tar.zst
+        tar c -C ./dist/ ./lib/ollama/rocm  | zstd -9 -T0 >./dist/ollama-linux-amd64-rocm.tar.zst
+        ( cd ./dist/ && tar c lib/ollama/mlx* ) | zstd -9 -T0 >./dist/ollama-linux-amd64-mlx.tar.zst
 fi
+
+# Flag every compressed bundle that is larger than GitHub's 2 GiB release-asset limit
+MAX_ASSET_BYTES=2147483648
+for bundle in ./dist/ollama-linux-*.tar.zst; do
+    if [ ! -f "$bundle" ]; then continue; fi
+    bytes=$(stat -f%z "$bundle" 2>/dev/null || stat -c%s "$bundle")
+    if [ "$bytes" -gt "$MAX_ASSET_BYTES" ]; then
+        echo "WARNING: $bundle is $bytes bytes ($((bytes - MAX_ASSET_BYTES)) over the 2 GiB GitHub release-asset limit)" >&2
+    fi
+done
diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1
index 5e797e3d2..09e66bf46 100644
--- a/scripts/build_windows.ps1
+++ b/scripts/build_windows.ps1
@@ -308,6 +308,8 @@ function mlxCuda13 {
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
             & cmake --install build\mlx_cuda_v$cudaMajorVer --component "MLX" --strip
             if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
+            & cmake --install build\mlx_cuda_v$cudaMajorVer --component "MLX_VENDOR"
+            if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)}
         } else {
             Write-Output "CUDA v$cudaMajorVer not detected, skipping MLX build"
         }
@@ -430,7 +432,7 @@ function newZipJob($sourceDir, $destZip) {
     Start-Job -ScriptBlock {
         param($src, $dst, $use7z)
         if ($use7z) {
-            & 7z a -tzip -mx=9 -mmt=on $dst "${src}\*"
+            & 7z a -tzip -mx=7 -mmt=on $dst "${src}\*"
             if ($LASTEXITCODE -ne 0) { throw "7z failed with exit code $LASTEXITCODE" }
         } else {
             Compress-Archive -CompressionLevel Optimal -Path "${src}\*" -DestinationPath $dst -Force
diff --git a/scripts/buildkitd.toml.example b/scripts/buildkitd.toml.example
new file mode 100644
index 000000000..ae81800fe
--- /dev/null
+++ b/scripts/buildkitd.toml.example
@@ -0,0 +1,21 @@
+# Suggested BuildKit GC config for ollama local development.
+#
+[worker.oci]
+  gc = true
+  gckeepstorage = "150GB"
+
+[[worker.oci.gcpolicy]]
+  filters = ["type==source.local", "type==source.git.checkout"]
+  keepDuration = "48h"
+  maxUsedSpace = "5GB"
+
+[[worker.oci.gcpolicy]]
+  filters = ["type==exec.cachemount"]
+  keepDuration = "168h"     # 7 days
+  maxUsedSpace = "20GB"
+
+[[worker.oci.gcpolicy]]
+  keepDuration = "720h"     # 30 days
+  reservedSpace = "20GB"
+  maxUsedSpace = "150GB"
+  minFreeSpace = "50GB"
diff --git a/x/imagegen/mlx/CMakeLists.txt b/x/imagegen/mlx/CMakeLists.txt
index 7b86a2cf1..47dfdc83c 100644
--- a/x/imagegen/mlx/CMakeLists.txt
+++ b/x/imagegen/mlx/CMakeLists.txt
@@ -94,6 +94,47 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(mlx-c)
 
+# To avoid a "long tail" when building MLX with a large set of GPU
+# architectures, utilize a higher --threads (-t) setting. At high -t
+# every .cu spawns concurrent cicc instances; each cicc can consume several GB
+# compiling MLX's CUTLASS-using kernels.  This in turn can cause OOMs.
+#
+# We use a pool to cover all MLX CUDA sources. Pool size is derived from total
+# host RAM via a per-file memory budget.
+#
+# This was calibrated with `-t 6`.  Higher -t may require overriding
+# MLX_CUDA_RAM_MB.
+if(CMAKE_GENERATOR STREQUAL "Ninja")
+    file(GLOB_RECURSE _mlx_cu
+        "${mlx_SOURCE_DIR}/mlx/backend/cuda/*.cu"
+        "${mlx_BINARY_DIR}/mlx/backend/cuda/*.cu"
+    )
+    if(_mlx_cu)
+        set(MLX_CUDA_RAM_MB 22000 CACHE STRING
+            "Per-file memory budget (MB) for the cuda_compile JOB_POOL.  Override for higher -t.")
+        cmake_host_system_information(RESULT _ram_mb QUERY TOTAL_PHYSICAL_MEMORY)
+        math(EXPR _cuda_pool "${_ram_mb} / ${MLX_CUDA_RAM_MB}")
+        if(_cuda_pool LESS 2)
+            set(_cuda_pool 2)
+        endif()
+        set_property(GLOBAL APPEND PROPERTY JOB_POOLS cuda_compile=${_cuda_pool})
+        list(LENGTH _mlx_cu _cu_count)
+        # SOURCE properties default to directory-scoped, which means a plain
+        # set_property(SOURCE ...) here would NOT affect the build rules
+        # generated for the mlx target (defined in mlx_SOURCE_DIR after
+        # FetchContent).  TARGET_DIRECTORY mlx puts the property in the
+        # directory where mlx was defined, so it actually applies.
+        foreach(f ${_mlx_cu})
+            set_property(SOURCE "${f}"
+                TARGET_DIRECTORY mlx
+                PROPERTY JOB_POOL_COMPILE cuda_compile)
+        endforeach()
+        message(STATUS "MLX cuda_compile JOB_POOL: ${_cu_count} files, pool size ${_cuda_pool} (host RAM ${_ram_mb} MB / ${MLX_CUDA_RAM_MB} MB per file)")
+    else()
+        message(WARNING "MLX cuda_compile JOB_POOL: no .cu files found under mlx/backend/cuda/ - check MLX layout")
+    endif()
+endif()
+
 # Sync vendored headers with fetched version
 file(GLOB _mlx_c_hdrs "${mlx-c_SOURCE_DIR}/mlx/c/*.h")
 file(COPY ${_mlx_c_hdrs} DESTINATION "${CMAKE_SOURCE_DIR}/x/mlxrunner/mlx/include/mlx/c/")