diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 8abbde44c..7fabdc193 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -141,6 +141,14 @@ jobs: env: GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }} steps: + # Increase pagefile to handle momentary spikes in RAM from NVCC compiles + - if: startsWith(matrix.preset, 'MLX ') + name: Increase pagefile to 200 GB + uses: al-cheb/configure-pagefile-action@v1.5 + with: + minimum-size: 16GB + maximum-size: 200GB + disk-root: "D:" - name: Install system dependencies run: | choco install -y --no-progress ccache ninja @@ -237,7 +245,7 @@ jobs: Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} --install-prefix "$((pwd).Path)\dist\${{ matrix.os }}-${{ matrix.arch }}" - cmake --build --parallel ([Environment]::ProcessorCount) --preset "${{ matrix.preset }}" + cmake --build --preset "${{ matrix.preset }}" -- -l $([Environment]::ProcessorCount) cmake --install build --component "${{ startsWith(matrix.preset, 'MLX ') && 'MLX' || startsWith(matrix.preset, 'CUDA ') && 'CUDA' || startsWith(matrix.preset, 'ROCm ') && 'HIP' || startsWith(matrix.preset, 'Vulkan') && 'Vulkan' || 'CPU' }}" --strip Remove-Item -Path dist\lib\ollama\rocm\rocblas\library\*gfx906* -ErrorAction SilentlyContinue env: @@ -380,20 +388,36 @@ jobs: dist/*.ps1 dist/OllamaSetup.exe - linux-build: + # Pre-build each Dockerfile stage on its own runner in parallel and push the + # resulting layers to a per-stage registry cache. The downstream + # docker-build-push job then assembles cache-hit-only. 
+ linux-depends: strategy: matrix: include: - - os: linux - arch: amd64 - target: archive - - os: linux - arch: amd64 - target: rocm - - os: linux - arch: arm64 - target: archive - runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }} + - arch: amd64 + target: cpu + - arch: amd64 + target: cuda-12 + - arch: amd64 + target: cuda-13 + - arch: amd64 + target: mlx + - arch: amd64 + target: rocm-7 + - arch: amd64 + target: vulkan + - arch: arm64 + target: cpu + - arch: arm64 + target: cuda-12 + - arch: arm64 + target: cuda-13 + - arch: arm64 + target: jetpack-5 + - arch: arm64 + target: jetpack-6 + runs-on: ${{ matrix.arch == 'arm64' && 'linux-arm64' || 'linux' }} environment: release needs: setup-environment env: @@ -401,53 +425,53 @@ jobs: steps: - uses: actions/checkout@v4 - uses: docker/setup-buildx-action@v3 + - uses: docker/login-action@v3 + with: + username: ${{ vars.DOCKER_USER }} + password: ${{ secrets.DOCKER_ACCESS_TOKEN }} + # Increase swap to handle momentary spikes in RAM from NVCC compiles + - if: matrix.target == 'mlx' + name: Increase Linux swap to 200 GB + shell: bash + run: | + set -e + SWAP_PATH=/swapfile-mlx + SWAP_SIZE_GB=200 + if [ -f "$SWAP_PATH" ]; then + sudo swapoff "$SWAP_PATH" 2>/dev/null || true + sudo rm -f "$SWAP_PATH" + fi + if ! sudo fallocate -l ${SWAP_SIZE_GB}G "$SWAP_PATH" 2>/dev/null; then + echo "fallocate unsupported, falling back to dd" + sudo dd if=/dev/zero of="$SWAP_PATH" bs=1M count=$((SWAP_SIZE_GB * 1024)) + fi + sudo chmod 600 "$SWAP_PATH" + sudo mkswap "$SWAP_PATH" + sudo swapon "$SWAP_PATH" + swapon --show + free -h - uses: docker/build-push-action@v6 with: context: . 
- platforms: ${{ matrix.os }}/${{ matrix.arch }} + platforms: linux/${{ matrix.arch }} target: ${{ matrix.target }} + provenance: false + sbom: false build-args: | - GOFLAGS=${{ env.GOFLAGS }} CGO_CFLAGS=${{ env.CGO_CFLAGS }} CGO_CXXFLAGS=${{ env.CGO_CXXFLAGS }} - outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }} - cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest - cache-to: type=inline - - name: Deduplicate CUDA libraries - run: | - ./scripts/deduplicate_cuda_libs.sh dist/${{ matrix.os }}-${{ matrix.arch }} - - run: | - for COMPONENT in bin/* lib/ollama/*; do - case "$COMPONENT" in - bin/ollama*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/vulkan*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;; - lib/ollama/mlx*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-mlx.tar.in ;; - lib/ollama/include*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-mlx.tar.in ;; - lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;; - lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;; - lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;; - esac - done - working-directory: dist/${{ matrix.os }}-${{ matrix.arch }} - - run: | - echo "Manifests" - for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do - echo $ARCHIVE - cat $ARCHIVE - done - - run: | - for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do - tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | zstd --ultra -22 -T0 >$(basename ${ARCHIVE//.*/}.tar.zst); - done - - uses: actions/upload-artifact@v4 - with: - name: bundles-${{ 
matrix.os }}-${{ matrix.arch }}-${{ matrix.target }} - path: | - *.tar.zst + GOFLAGS=${{ env.GOFLAGS }} + APT_MIRROR=http://azure.archive.ubuntu.com/ubuntu + OLLAMA_MLX_BUILD_JOBS=16 + OLLAMA_MLX_NVCC_THREADS=6 + cache-from: | + type=registry,ref=ollama/release:cache-${{ matrix.arch }}-${{ matrix.target }} + type=registry,ref=${{ vars.DOCKER_REPO }}:latest + cache-to: type=registry,ref=ollama/release:cache-${{ matrix.arch }}-${{ matrix.target }},mode=max # Build each Docker variant (OS, arch, and flavor) separately. Using QEMU is unreliable and slower. + # Heavy stages were pre-built by linux-depends; this job is cache-hit-only for those layers + # and just assembles, runs the Go build, and pushes the final image. docker-build-push: strategy: matrix: @@ -459,6 +483,15 @@ jobs: CGO_CXXFLAGS GOFLAGS APT_MIRROR=http://azure.archive.ubuntu.com/ubuntu + OLLAMA_MLX_BUILD_JOBS=16 + OLLAMA_MLX_NVCC_THREADS=6 + cache-from: | + type=registry,ref=${{ vars.DOCKER_REPO }}:latest + type=registry,ref=ollama/release:cache-arm64-cpu + type=registry,ref=ollama/release:cache-arm64-cuda-12 + type=registry,ref=ollama/release:cache-arm64-cuda-13 + type=registry,ref=ollama/release:cache-arm64-jetpack-5 + type=registry,ref=ollama/release:cache-arm64-jetpack-6 - os: linux arch: amd64 build-args: | @@ -466,6 +499,15 @@ jobs: CGO_CXXFLAGS GOFLAGS APT_MIRROR=http://azure.archive.ubuntu.com/ubuntu + OLLAMA_MLX_BUILD_JOBS=16 + OLLAMA_MLX_NVCC_THREADS=6 + cache-from: | + type=registry,ref=${{ vars.DOCKER_REPO }}:latest + type=registry,ref=ollama/release:cache-amd64-cpu + type=registry,ref=ollama/release:cache-amd64-cuda-12 + type=registry,ref=ollama/release:cache-amd64-cuda-13 + type=registry,ref=ollama/release:cache-amd64-mlx + type=registry,ref=ollama/release:cache-amd64-vulkan - os: linux arch: amd64 suffix: '-rocm' @@ -475,9 +517,15 @@ jobs: GOFLAGS FLAVOR=rocm APT_MIRROR=http://azure.archive.ubuntu.com/ubuntu + OLLAMA_MLX_BUILD_JOBS=16 + OLLAMA_MLX_NVCC_THREADS=6 + cache-from: | + 
type=registry,ref=${{ vars.DOCKER_REPO }}:latest + type=registry,ref=ollama/release:cache-amd64-cpu + type=registry,ref=ollama/release:cache-amd64-rocm-7 runs-on: ${{ matrix.arch == 'arm64' && format('{0}-{1}', matrix.os, matrix.arch) || matrix.os }} environment: release - needs: setup-environment + needs: [setup-environment, linux-depends] env: GOFLAGS: ${{ needs.setup-environment.outputs.GOFLAGS }} steps: @@ -492,9 +540,11 @@ jobs: with: context: . platforms: ${{ matrix.os }}/${{ matrix.arch }} + provenance: false + sbom: false build-args: ${{ matrix.build-args }} outputs: type=image,name=${{ vars.DOCKER_REPO }},push-by-digest=true,name-canonical=true,push=true - cache-from: type=registry,ref=${{ vars.DOCKER_REPO }}:latest + cache-from: ${{ matrix.cache-from }} cache-to: type=inline - run: | mkdir -p ${{ matrix.os }}-${{ matrix.arch }} @@ -505,6 +555,53 @@ jobs: name: digest-${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }} path: | ${{ runner.temp }}/${{ matrix.os }}-${{ matrix.arch }}-${{ matrix.suffix }}.txt + # Re-run buildx with --target archive against buildkit's local cache to + # extract the release directory layout. All upstream stages were just + # built above, so this is a cache-hit-only pass that just writes files. + - uses: docker/build-push-action@v6 + with: + context: . 
+          platforms: ${{ matrix.os }}/${{ matrix.arch }}
+          target: archive
+          provenance: false
+          sbom: false
+          build-args: ${{ matrix.build-args }}
+          outputs: type=local,dest=dist/${{ matrix.os }}-${{ matrix.arch }}
+          cache-from: ${{ matrix.cache-from }}
+      - name: Deduplicate CUDA libraries
+        run: |
+          ./scripts/deduplicate_cuda_libs.sh dist/${{ matrix.os }}-${{ matrix.arch }}
+      - run: |
+          for COMPONENT in bin/* lib/ollama/*; do
+            case "$COMPONENT" in
+              bin/ollama*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/*.so*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_v*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/vulkan*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/mlx*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-mlx.tar.in ;;
+              lib/ollama/include*) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}.tar.in ;;
+              lib/ollama/cuda_jetpack5) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack5.tar.in ;;
+              lib/ollama/cuda_jetpack6) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-jetpack6.tar.in ;;
+              lib/ollama/rocm) echo $COMPONENT >>ollama-${{ matrix.os }}-${{ matrix.arch }}-rocm.tar.in ;;
+            esac
+          done
+        working-directory: dist/${{ matrix.os }}-${{ matrix.arch }}
+      - run: |
+          echo "Manifests"
+          for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in ; do
+            echo $ARCHIVE
+            cat $ARCHIVE
+          done
+      - run: |
+          pids=""; for ARCHIVE in dist/${{ matrix.os }}-${{ matrix.arch }}/*.tar.in; do
+            tar c -C dist/${{ matrix.os }}-${{ matrix.arch }} -T $ARCHIVE --owner 0 --group 0 | zstd -19 -T0 >$(basename ${ARCHIVE//.*/}.tar.zst) & pids="$pids $!"
+          done
+          for pid in $pids; do wait "$pid"; done # a bare 'wait' always returns 0 and would hide a failed compression job
+      - uses: actions/upload-artifact@v4
+        with:
+          name: bundles-${{ matrix.os }}-${{ matrix.arch }}${{ matrix.suffix }}
+          path: |
+            *.tar.zst
 
   # Merge Docker images for the same flavor into a single multi-arch manifest
   docker-merge-push:
@@ 
-544,7 +641,7 @@ jobs: release: runs-on: ubuntu-latest environment: release - needs: [darwin-build, windows-app, linux-build] + needs: [darwin-build, windows-app, docker-build-push] permissions: contents: write env: diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 56fd22880..4ae685642 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -65,7 +65,7 @@ jobs: - preset: 'MLX CUDA 13' container: nvidia/cuda:13.0.0-devel-ubuntu22.04 extra-packages: libcudnn9-dev-cuda-13 libopenblas-dev liblapack-dev liblapacke-dev git curl - flags: '-DCMAKE_CUDA_ARCHITECTURES=87 -DBLAS_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu -DLAPACK_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu' + flags: '-DCMAKE_CUDA_ARCHITECTURES=87 -DMLX_CUDA_ARCHITECTURES=80-virtual -DBLAS_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu -DLAPACK_INCLUDE_DIRS=/usr/include/x86_64-linux-gnu' install-go: true runs-on: linux container: ${{ matrix.container }} @@ -105,7 +105,7 @@ jobs: key: ccache-${{ runner.os }}-${{ runner.arch }}-${{ matrix.preset }}-${{ needs.changes.outputs.vendorsha }} - run: | cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} - cmake --build --preset "${{ matrix.preset }}" --parallel + cmake --build --preset "${{ matrix.preset }}" -- -l $(nproc) windows: needs: [changes] @@ -134,7 +134,7 @@ jobs: - preset: 'MLX CUDA 13' install: https://developer.download.nvidia.com/compute/cuda/13.0.0/local_installers/cuda_13.0.0_windows.exe cudnn-install: https://developer.download.nvidia.com/compute/cudnn/redist/cudnn/windows-x86_64/cudnn-windows-x86_64-9.18.1.3_cuda13-archive.zip - flags: '-DCMAKE_CUDA_ARCHITECTURES=80' + flags: '-DCMAKE_CUDA_ARCHITECTURES=80 -DMLX_CUDA_ARCHITECTURES=80-virtual' cuda-components: - '"cudart"' - '"nvcc"' @@ -240,7 +240,7 @@ jobs: Import-Module 'C:\Program Files\Microsoft Visual Studio\2022\Enterprise\Common7\Tools\Microsoft.VisualStudio.DevShell.dll' Enter-VsDevShell -VsInstallPath 'C:\Program Files\Microsoft Visual 
Studio\2022\Enterprise' -SkipAutomaticLocation -DevCmdArguments '-arch=x64 -no_logo' cmake --preset "${{ matrix.preset }}" ${{ matrix.flags }} - cmake --build --parallel --preset "${{ matrix.preset }}" + cmake --build --preset "${{ matrix.preset }}" -- -l $([Environment]::ProcessorCount) env: CMAKE_GENERATOR: Ninja diff --git a/CMakeLists.txt b/CMakeLists.txt index e2af188d3..f402d2382 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -228,15 +228,20 @@ if(MLX_ENGINE) list(APPEND MLX_INCLUDE_REGEXES "^dl\\.dll$") endif() + # Split mlx/mlxc libraries from runtime deps to avoid stripping deps install(TARGETS mlx mlxc - RUNTIME_DEPENDENCIES - DIRECTORIES ${MLX_RUNTIME_DIRS} - PRE_INCLUDE_REGEXES ${MLX_INCLUDE_REGEXES} - PRE_EXCLUDE_REGEXES ".*" + RUNTIME_DEPENDENCY_SET mlx_runtime_deps RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX FRAMEWORK DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX ) + install(RUNTIME_DEPENDENCY_SET mlx_runtime_deps + DIRECTORIES ${MLX_RUNTIME_DIRS} + PRE_INCLUDE_REGEXES ${MLX_INCLUDE_REGEXES} + PRE_EXCLUDE_REGEXES ".*" + RUNTIME DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX_VENDOR + LIBRARY DESTINATION ${OLLAMA_INSTALL_DIR} COMPONENT MLX_VENDOR + ) if(TARGET jaccl) install(TARGETS jaccl @@ -366,7 +371,7 @@ if(MLX_ENGINE) if(MLX_CUDA_LIBS) install(FILES ${MLX_CUDA_LIBS} DESTINATION ${OLLAMA_INSTALL_DIR} - COMPONENT MLX) + COMPONENT MLX_VENDOR) endif() endif() endif() diff --git a/CMakePresets.json b/CMakePresets.json index 0fdbc1442..fd647e4b7 100644 --- a/CMakePresets.json +++ b/CMakePresets.json @@ -112,7 +112,7 @@ "name": "MLX CUDA 13", "inherits": [ "MLX", "CUDA 13" ], "cacheVariables": { - "MLX_CUDA_ARCHITECTURES": "86;89;90;90a;100;103;75-virtual;80-virtual;110-virtual;120-virtual;121-virtual", + "MLX_CUDA_ARCHITECTURES": "75-virtual;80-virtual;86-virtual;89-virtual;90-virtual;90a-virtual;100-virtual;103-virtual;110-virtual;120-virtual;121-virtual", "OLLAMA_RUNNER_DIR": 
"mlx_cuda_v13" } } diff --git a/Dockerfile b/Dockerfile index 0485b09c4..461c8585e 100644 --- a/Dockerfile +++ b/Dockerfile @@ -144,6 +144,9 @@ RUN --mount=type=cache,target=/root/.ccache \ FROM base AS mlx ARG CUDA13VERSION=13.0 +# OLLAMA_MLX_BUILD_JOBS empty -> ninja gates by load average (-l $(nproc)) +ARG OLLAMA_MLX_BUILD_JOBS= +ARG OLLAMA_MLX_NVCC_THREADS=2 RUN dnf install -y cuda-toolkit-${CUDA13VERSION//./-} \ && dnf install -y openblas-devel lapack-devel \ && dnf install -y libcudnn9-cuda-13 libcudnn9-devel-cuda-13 \ @@ -170,9 +173,10 @@ RUN --mount=type=cache,target=/root/.ccache \ && if [ -f /tmp/local-mlx-c/CMakeLists.txt ]; then \ export OLLAMA_MLX_C_SOURCE=/tmp/local-mlx-c; \ fi \ - && cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas \ - && cmake --build --preset 'MLX CUDA 13' -- -l $(nproc) \ - && cmake --install build --component MLX --strip + && cmake --preset 'MLX CUDA 13' -DBLAS_INCLUDE_DIRS=/usr/include/openblas -DLAPACK_INCLUDE_DIRS=/usr/include/openblas -DCMAKE_CUDA_FLAGS="-t ${OLLAMA_MLX_NVCC_THREADS}" \ + && cmake --build --preset 'MLX CUDA 13' -- -l $(nproc) ${OLLAMA_MLX_BUILD_JOBS:+-j ${OLLAMA_MLX_BUILD_JOBS}} \ + && cmake --install build --component MLX --strip \ + && cmake --install build --component MLX_VENDOR FROM base AS build WORKDIR /go/src/github.com/ollama/ollama diff --git a/scripts/build_darwin.sh b/scripts/build_darwin.sh index e17525083..3cd712167 100755 --- a/scripts/build_darwin.sh +++ b/scripts/build_darwin.sh @@ -76,6 +76,7 @@ _build_darwin() { cmake --build $BUILD_DIR --target mlx mlxc -j cmake --install $BUILD_DIR --component CPU cmake --install $BUILD_DIR --component MLX + cmake --install $BUILD_DIR --component MLX_VENDOR # Override CGO flags to point to the amd64 build directory MLX_CGO_CFLAGS="-O3 -mmacosx-version-min=14.0" MLX_CGO_LDFLAGS="-ldl -lc++ -framework Accelerate -mmacosx-version-min=14.0" @@ -103,6 +104,7 @@ _build_darwin() { cmake --build 
$BUILD_DIR --target mlx mlxc --parallel _relink_mlx_metallib $BUILD_DIR cmake --install $BUILD_DIR --component MLX + cmake --install $BUILD_DIR --component MLX_VENDOR # Metal 4.x build (NAX-enabled, macOS 26+) # Only possible with Xcode 26+ SDK; skip on older toolchains. @@ -124,6 +126,7 @@ _build_darwin() { -DFETCHCONTENT_SOURCE_DIR_METAL_CPP=$V3_DEPS/metal_cpp-src cmake --build $BUILD_DIR_V4 --target mlx mlxc --parallel cmake --install $BUILD_DIR_V4 --component MLX + cmake --install $BUILD_DIR_V4 --component MLX_VENDOR else status "Skipping MLX Metal v4 (SDK $SDK_MAJOR < 26, need Xcode 26+)" fi diff --git a/scripts/build_linux.sh b/scripts/build_linux.sh index 5421d70c7..79d852672 100755 --- a/scripts/build_linux.sh +++ b/scripts/build_linux.sh @@ -1,9 +1,14 @@ #!/bin/sh # -# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder +# Mac ARM users, rosetta can be flaky, so to use a remote x86 builder. +# Use the docker-container driver with the bundled buildkit GC config +# for improved cache behavior # # docker context create amd64 --docker host=ssh://mybuildhost -# docker buildx create --name mybuilder amd64 --platform linux/amd64 +# docker buildx create --name mybuilder \ +# --driver docker-container \ +# --config ./buildkitd.toml.example \ +# --bootstrap amd64 --platform linux/amd64 # docker buildx create --name mybuilder --append desktop-linux --platform linux/arm64 # docker buildx use mybuilder @@ -59,18 +64,28 @@ fi # buildx behavior changes for single vs. multiplatform echo "Compressing linux tar bundles..." if echo $PLATFORM | grep "," > /dev/null ; then - tar c -C ./dist/linux_arm64 --exclude cuda_jetpack5 --exclude cuda_jetpack6 . 
| zstd --ultra -22 -T0 >./dist/ollama-linux-arm64.tar.zst - tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack5 | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64-jetpack5.tar.zst - tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack6 | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64-jetpack6.tar.zst - tar c -C ./dist/linux_amd64 --exclude rocm --exclude 'mlx*' --exclude include . | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64.tar.zst - tar c -C ./dist/linux_amd64 ./lib/ollama/rocm | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64-rocm.tar.zst - tar c -C ./dist/linux_amd64 ./lib/ollama/mlx_cuda_v13 ./lib/ollama/include | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64-mlx.tar.zst + tar c -C ./dist/linux_arm64 --exclude cuda_jetpack5 --exclude cuda_jetpack6 . | zstd -9 -T0 >./dist/ollama-linux-arm64.tar.zst + tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack5 | zstd -9 -T0 >./dist/ollama-linux-arm64-jetpack5.tar.zst + tar c -C ./dist/linux_arm64 ./lib/ollama/cuda_jetpack6 | zstd -9 -T0 >./dist/ollama-linux-arm64-jetpack6.tar.zst + tar c -C ./dist/linux_amd64 --exclude rocm --exclude 'mlx*' . 
| zstd -9 -T0 >./dist/ollama-linux-amd64.tar.zst + tar c -C ./dist/linux_amd64 ./lib/ollama/rocm | zstd -9 -T0 >./dist/ollama-linux-amd64-rocm.tar.zst + ( cd ./dist/linux_amd64 && tar c lib/ollama/mlx* ) | zstd -9 -T0 >./dist/ollama-linux-amd64-mlx.tar.zst elif echo $PLATFORM | grep "arm64" > /dev/null ; then - tar c -C ./dist/ --exclude cuda_jetpack5 --exclude cuda_jetpack6 bin lib | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64.tar.zst - tar c -C ./dist/ ./lib/ollama/cuda_jetpack5 | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64-jetpack5.tar.zst - tar c -C ./dist/ ./lib/ollama/cuda_jetpack6 | zstd --ultra -22 -T0 >./dist/ollama-linux-arm64-jetpack6.tar.zst + tar c -C ./dist/ --exclude cuda_jetpack5 --exclude cuda_jetpack6 bin lib | zstd -9 -T0 >./dist/ollama-linux-arm64.tar.zst + tar c -C ./dist/ ./lib/ollama/cuda_jetpack5 | zstd -9 -T0 >./dist/ollama-linux-arm64-jetpack5.tar.zst + tar c -C ./dist/ ./lib/ollama/cuda_jetpack6 | zstd -9 -T0 >./dist/ollama-linux-arm64-jetpack6.tar.zst elif echo $PLATFORM | grep "amd64" > /dev/null ; then - tar c -C ./dist/ --exclude rocm --exclude 'mlx*' --exclude include bin lib | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64.tar.zst - tar c -C ./dist/ ./lib/ollama/rocm | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64-rocm.tar.zst - tar c -C ./dist/ ./lib/ollama/mlx_cuda_v13 ./lib/ollama/include | zstd --ultra -22 -T0 >./dist/ollama-linux-amd64-mlx.tar.zst + tar c -C ./dist/ --exclude rocm --exclude 'mlx*' bin lib | zstd -9 -T0 >./dist/ollama-linux-amd64.tar.zst + tar c -C ./dist/ ./lib/ollama/rocm | zstd -9 -T0 >./dist/ollama-linux-amd64-rocm.tar.zst + ( cd ./dist/ && tar c lib/ollama/mlx* ) | zstd -9 -T0 >./dist/ollama-linux-amd64-mlx.tar.zst fi + +# Warn if any compressed tarball exceeds GitHub's 2 GiB release-asset limit +LIMIT=2147483648 +for f in ./dist/ollama-linux-*.tar.zst; do + [ -f "$f" ] || continue + size=$(stat -f%z "$f" 2>/dev/null || stat -c%s "$f") + if [ "$size" -gt "$LIMIT" ]; then + echo "WARNING: $f is 
$size bytes ($((size - LIMIT)) over the 2 GiB GitHub release-asset limit)" >&2 + fi +done diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index 5e797e3d2..09e66bf46 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -308,6 +308,8 @@ function mlxCuda13 { if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} & cmake --install build\mlx_cuda_v$cudaMajorVer --component "MLX" --strip if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} + & cmake --install build\mlx_cuda_v$cudaMajorVer --component "MLX_VENDOR" + if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } else { Write-Output "CUDA v$cudaMajorVer not detected, skipping MLX build" } @@ -430,7 +432,7 @@ function newZipJob($sourceDir, $destZip) { Start-Job -ScriptBlock { param($src, $dst, $use7z) if ($use7z) { - & 7z a -tzip -mx=9 -mmt=on $dst "${src}\*" + & 7z a -tzip -mx=7 -mmt=on $dst "${src}\*" if ($LASTEXITCODE -ne 0) { throw "7z failed with exit code $LASTEXITCODE" } } else { Compress-Archive -CompressionLevel Optimal -Path "${src}\*" -DestinationPath $dst -Force diff --git a/scripts/buildkitd.toml.example b/scripts/buildkitd.toml.example new file mode 100644 index 000000000..ae81800fe --- /dev/null +++ b/scripts/buildkitd.toml.example @@ -0,0 +1,21 @@ +# Suggested BuildKit GC config for ollama local development. 
+#
+[worker.oci]
+  gc = true
+  gckeepstorage = "150GB"
+
+[[worker.oci.gcpolicy]]
+  filters = ["type==source.local", "type==source.git.checkout"]
+  keepDuration = "48h"
+  maxUsedSpace = "5GB"
+
+[[worker.oci.gcpolicy]]
+  filters = ["type==exec.cachemount"]
+  keepDuration = "168h" # 7 days
+  maxUsedSpace = "20GB"
+
+[[worker.oci.gcpolicy]]
+  keepDuration = "720h" # 30 days
+  reservedSpace = "20GB"
+  maxUsedSpace = "150GB"
+  minFreeSpace = "50GB"
diff --git a/x/imagegen/mlx/CMakeLists.txt b/x/imagegen/mlx/CMakeLists.txt
index 7b86a2cf1..47dfdc83c 100644
--- a/x/imagegen/mlx/CMakeLists.txt
+++ b/x/imagegen/mlx/CMakeLists.txt
@@ -94,6 +94,55 @@ FetchContent_Declare(
 )
 FetchContent_MakeAvailable(mlx-c)
 
+# To avoid a "long tail" when building MLX with a large set of GPU
+# architectures, utilize a higher --threads (-t) setting. At high -t
+# every .cu spawns concurrent cicc instances; each cicc can consume several GB
+# compiling MLX's CUTLASS-using kernels. This in turn can cause OOMs.
+#
+# We use a pool to cover all MLX CUDA sources. Pool size is derived from total
+# host RAM via a per-file memory budget.
+#
+# This was calibrated with `-t 6`. Higher -t may require overriding
+# MLX_CUDA_RAM_MB
+#
+# JOB_POOLS and JOB_POOL_COMPILE are Ninja-only features, so the block is
+# gated on the generator name; "^Ninja" also covers "Ninja Multi-Config".
+if(CMAKE_GENERATOR MATCHES "^Ninja")
+  file(GLOB_RECURSE _mlx_cu
+    "${mlx_SOURCE_DIR}/mlx/backend/cuda/*.cu"
+    "${mlx_BINARY_DIR}/mlx/backend/cuda/*.cu"
+  )
+  if(_mlx_cu)
+    set(MLX_CUDA_RAM_MB 22000 CACHE STRING
+      "Per-file memory budget (MB) for the cuda_compile JOB_POOL. Override for higher -t.")
+    # Reject zero / non-numeric overrides up front; otherwise math(EXPR)
+    # below fails with an opaque divide-by-zero or parse error.
+    if(NOT MLX_CUDA_RAM_MB MATCHES "^[1-9][0-9]*$")
+      message(FATAL_ERROR "MLX_CUDA_RAM_MB must be a positive integer (MB), got '${MLX_CUDA_RAM_MB}'")
+    endif()
+    # TOTAL_PHYSICAL_MEMORY is reported in MiB.
+    cmake_host_system_information(RESULT _ram_mb QUERY TOTAL_PHYSICAL_MEMORY)
+    math(EXPR _cuda_pool "${_ram_mb} / ${MLX_CUDA_RAM_MB}")
+    # Never drop below two concurrent CUDA compiles, even on small hosts.
+    if(_cuda_pool LESS 2)
+      set(_cuda_pool 2)
+    endif()
+    set_property(GLOBAL APPEND PROPERTY JOB_POOLS cuda_compile=${_cuda_pool})
+    list(LENGTH _mlx_cu _cu_count)
+    # SOURCE properties default to directory-scoped, which means a plain
+    # set_property(SOURCE ...) here would NOT affect the build rules
+    # generated for the mlx target (defined in mlx_SOURCE_DIR after
+    # FetchContent). TARGET_DIRECTORY mlx puts the property in the
+    # directory where mlx was defined, so it actually applies.
+    set_property(SOURCE ${_mlx_cu}
+      TARGET_DIRECTORY mlx
+      PROPERTY JOB_POOL_COMPILE cuda_compile)
+    message(STATUS "MLX cuda_compile JOB_POOL: ${_cu_count} files, pool size ${_cuda_pool} (host RAM ${_ram_mb} MB / ${MLX_CUDA_RAM_MB} MB per file)")
+  else()
+    message(WARNING "MLX cuda_compile JOB_POOL: no .cu files found under mlx/backend/cuda/ - check MLX layout")
+  endif()
+endif()
+
 # Sync vendored headers with fetched version
 file(GLOB _mlx_c_hdrs "${mlx-c_SOURCE_DIR}/mlx/c/*.h")
 file(COPY ${_mlx_c_hdrs} DESTINATION "${CMAKE_SOURCE_DIR}/x/mlxrunner/mlx/include/mlx/c/")