mirror of
https://github.com/ollama/ollama.git
synced 2026-05-13 14:27:00 +00:00
scheduler improvements
This commit is contained in:
parent
fa7092b1d3
commit
88935c21b5
10 changed files with 393 additions and 393 deletions
|
|
@ -1 +1 @@
|
|||
b9048
|
||||
b9075
|
||||
|
|
|
|||
|
|
@ -61,6 +61,7 @@ func llamaServerDiscoverDevices(ctx context.Context, libDirs []string, extraEnvs
|
|||
"--host", "127.0.0.1",
|
||||
"--no-webui",
|
||||
"--offline",
|
||||
"--verbose",
|
||||
)
|
||||
cmd.WaitDelay = llamaServerDiscoveryWaitDelay
|
||||
cmd.Env = os.Environ()
|
||||
|
|
@ -137,7 +138,7 @@ func llamaServerDiscoverDevices(ctx context.Context, libDirs []string, extraEnvs
|
|||
|
||||
// Also run --list-devices to get the stdout device list with free memory
|
||||
// (the brief server startup doesn't print that)
|
||||
cmd2 := exec.CommandContext(ctx, llamaServer, "--list-devices", "--offline")
|
||||
cmd2 := exec.CommandContext(ctx, llamaServer, "--list-devices", "--offline", "--verbose")
|
||||
cmd2.WaitDelay = llamaServerDiscoveryWaitDelay
|
||||
cmd2.Env = cmd.Env // reuse same environment
|
||||
listOutput, err := cmd2.CombinedOutput()
|
||||
|
|
|
|||
|
|
@ -1,348 +1,261 @@
|
|||
package discover
|
||||
|
||||
import (
|
||||
"fmt"
|
||||
"io"
|
||||
"log/slog"
|
||||
"strconv"
|
||||
"strings"
|
||||
"testing"
|
||||
|
||||
"github.com/ollama/ollama/logutil"
|
||||
)
|
||||
|
||||
func TestLlamaServerDiscoveryOutputOnlyTrace(t *testing.T) {
|
||||
original := slog.Default()
|
||||
t.Cleanup(func() {
|
||||
slog.SetDefault(original)
|
||||
func TestLlamaServerDiscovery(t *testing.T) {
|
||||
t.Run("output only trace", func(t *testing.T) {
|
||||
original := slog.Default()
|
||||
t.Cleanup(func() {
|
||||
slog.SetDefault(original)
|
||||
})
|
||||
|
||||
slog.SetDefault(logutil.NewLogger(io.Discard, slog.LevelDebug))
|
||||
if got := llamaServerDiscoveryOutput(t.Context()); got != io.Discard {
|
||||
t.Fatal("debug logging should discard raw llama-server discovery output")
|
||||
}
|
||||
|
||||
slog.SetDefault(logutil.NewLogger(io.Discard, logutil.LevelTrace))
|
||||
if got := llamaServerDiscoveryOutput(t.Context()); got == io.Discard {
|
||||
t.Fatal("trace logging should emit raw llama-server discovery output")
|
||||
}
|
||||
})
|
||||
|
||||
slog.SetDefault(logutil.NewLogger(io.Discard, slog.LevelDebug))
|
||||
if got := llamaServerDiscoveryOutput(t.Context()); got != io.Discard {
|
||||
t.Fatal("debug logging should discard raw llama-server discovery output")
|
||||
}
|
||||
t.Run("parse devices", func(t *testing.T) {
|
||||
type wantDevice struct {
|
||||
name string
|
||||
library string
|
||||
totalMiB uint64
|
||||
compute string
|
||||
gfxTarget string
|
||||
checkIntegrated bool
|
||||
integrated bool
|
||||
}
|
||||
|
||||
slog.SetDefault(logutil.NewLogger(io.Discard, logutil.LevelTrace))
|
||||
if got := llamaServerDiscoveryOutput(t.Context()); got == io.Discard {
|
||||
t.Fatal("trace logging should emit raw llama-server discovery output")
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLlamaServerDevices(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
output string
|
||||
libDirs []string
|
||||
wantLen int
|
||||
wantName string
|
||||
wantLib string
|
||||
wantMiB uint64
|
||||
}{
|
||||
{
|
||||
name: "NVIDIA CUDA",
|
||||
output: `load_backend: loaded CUDA backend from /lib/ollama/cuda_v12/libggml-cuda.so
|
||||
tests := []struct {
|
||||
name string
|
||||
output string
|
||||
libDirs []string
|
||||
want []wantDevice
|
||||
}{
|
||||
{
|
||||
name: "NVIDIA CUDA",
|
||||
output: `load_backend: loaded CUDA backend from /lib/ollama/cuda_v12/libggml-cuda.so
|
||||
Available devices:
|
||||
NVIDIA GeForce RTX 4090: NVIDIA CUDA (24564 MiB, 23592 MiB free)
|
||||
`,
|
||||
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
|
||||
wantLen: 1,
|
||||
wantName: "NVIDIA GeForce RTX 4090",
|
||||
wantLib: "CUDA",
|
||||
wantMiB: 24564,
|
||||
},
|
||||
{
|
||||
name: "Metal",
|
||||
output: `Available devices:
|
||||
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
|
||||
want: []wantDevice{{
|
||||
name: "NVIDIA GeForce RTX 4090",
|
||||
library: "CUDA",
|
||||
totalMiB: 24564,
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "Metal",
|
||||
output: `Available devices:
|
||||
Metal: Apple M3 Max (98304 MiB, 98303 MiB free)
|
||||
`,
|
||||
libDirs: []string{"/lib/ollama"},
|
||||
wantLen: 1,
|
||||
wantName: "Metal",
|
||||
wantLib: "Metal",
|
||||
wantMiB: 98304,
|
||||
},
|
||||
{
|
||||
name: "ROCm with gfx target",
|
||||
output: ` Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
|
||||
want: []wantDevice{{
|
||||
name: "Metal",
|
||||
library: "Metal",
|
||||
totalMiB: 98304,
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "ROCm with gfx target",
|
||||
output: ` Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
|
||||
Available devices:
|
||||
ROCm0: AMD Radeon RX 6700 XT (12272 MiB, 12248 MiB free)
|
||||
`,
|
||||
libDirs: []string{"/lib/ollama", "/lib/ollama/rocm"},
|
||||
wantLen: 1,
|
||||
wantName: "ROCm0",
|
||||
wantLib: "ROCm",
|
||||
wantMiB: 12272,
|
||||
},
|
||||
{
|
||||
name: "multi GPU",
|
||||
output: `Available devices:
|
||||
libDirs: []string{"/lib/ollama", "/lib/ollama/rocm"},
|
||||
want: []wantDevice{{
|
||||
name: "ROCm0",
|
||||
library: "ROCm",
|
||||
totalMiB: 12272,
|
||||
compute: "gfx1031",
|
||||
gfxTarget: "gfx1031",
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "multi GPU",
|
||||
output: `Available devices:
|
||||
CUDA0: NVIDIA GeForce RTX 4090 (24564 MiB, 23592 MiB free)
|
||||
CUDA1: NVIDIA GeForce RTX 3060 (12288 MiB, 11500 MiB free)
|
||||
`,
|
||||
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
|
||||
wantLen: 2,
|
||||
},
|
||||
{
|
||||
name: "no devices",
|
||||
output: "Available devices:\n",
|
||||
libDirs: []string{"/lib/ollama"},
|
||||
wantLen: 0,
|
||||
},
|
||||
{
|
||||
name: "empty output",
|
||||
output: "",
|
||||
libDirs: []string{"/lib/ollama"},
|
||||
wantLen: 0,
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
devices := parseLlamaServerDevices(tt.output, tt.libDirs)
|
||||
if len(devices) != tt.wantLen {
|
||||
t.Fatalf("got %d devices, want %d", len(devices), tt.wantLen)
|
||||
}
|
||||
if tt.wantLen > 0 {
|
||||
if tt.wantName != "" && devices[0].Name != tt.wantName {
|
||||
t.Errorf("name = %q, want %q", devices[0].Name, tt.wantName)
|
||||
}
|
||||
if tt.wantLib != "" && devices[0].Library != tt.wantLib {
|
||||
t.Errorf("library = %q, want %q", devices[0].Library, tt.wantLib)
|
||||
}
|
||||
if tt.wantMiB > 0 {
|
||||
expectedBytes := tt.wantMiB * 1024 * 1024
|
||||
if devices[0].TotalMemory != expectedBytes {
|
||||
t.Errorf("total memory = %d, want %d", devices[0].TotalMemory, expectedBytes)
|
||||
}
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestParseLlamaServerDevicesMarksVulkanUMAGPUsIntegrated(t *testing.T) {
|
||||
output := `ggml_vulkan: 0 = Intel(R) Graphics (Intel open-source Mesa driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 65536 | int dot: 1 | matrix cores: none
|
||||
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
|
||||
want: []wantDevice{
|
||||
{name: "CUDA0", library: "CUDA", totalMiB: 24564},
|
||||
{name: "CUDA1", library: "CUDA", totalMiB: 12288},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "Vulkan UMA",
|
||||
output: `ggml_vulkan: 0 = Intel(R) Graphics (Intel open-source Mesa driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 65536 | int dot: 1 | matrix cores: none
|
||||
Available devices:
|
||||
Vulkan0: Intel(R) Graphics (16384 MiB, 12288 MiB free)
|
||||
`
|
||||
devices := parseLlamaServerDevices(output, []string{"/lib/ollama", "/lib/ollama/vulkan"})
|
||||
if len(devices) != 1 {
|
||||
t.Fatalf("expected 1 device, got %d", len(devices))
|
||||
}
|
||||
if !devices[0].Integrated {
|
||||
t.Fatal("expected Vulkan UMA device to be marked integrated")
|
||||
}
|
||||
}
|
||||
|
||||
func TestCUDADeviceFilteredByArchs(t *testing.T) {
|
||||
// GTX 1060 (CC 6.1 = 610) with v13 ARCHS that don't include 610
|
||||
output := `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
|
||||
`,
|
||||
libDirs: []string{"/lib/ollama", "/lib/ollama/vulkan"},
|
||||
want: []wantDevice{{
|
||||
name: "Vulkan0",
|
||||
library: "Vulkan",
|
||||
totalMiB: 16384,
|
||||
checkIntegrated: true,
|
||||
integrated: true,
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "Vulkan without UMA metadata",
|
||||
output: `Available devices:
|
||||
Vulkan0: AMD Radeon(TM) Graphics (32768 MiB, 31000 MiB free)
|
||||
`,
|
||||
libDirs: []string{"/lib/ollama", "/lib/ollama/vulkan"},
|
||||
want: []wantDevice{{
|
||||
name: "Vulkan0",
|
||||
library: "Vulkan",
|
||||
totalMiB: 32768,
|
||||
checkIntegrated: true,
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "CUDA device filtered by compiled archs",
|
||||
output: `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
|
||||
Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
|
||||
load_backend: loaded CUDA backend from /lib/ollama/cuda_v13/libggml-cuda.so
|
||||
system_info: n_threads = 4 | CUDA : ARCHS = 750,800,860,890,900,1000,1030,1100,1200,1210 |
|
||||
Available devices:
|
||||
CUDA0: NVIDIA GeForce GTX 1060 6GB (6063 MiB, 5900 MiB free)
|
||||
`
|
||||
devices := parseLlamaServerDevices(output, []string{"/lib/ollama", "/lib/ollama/cuda_v13"})
|
||||
if len(devices) != 0 {
|
||||
t.Fatalf("expected 0 devices (GTX 1060 CC 610 not in ARCHS), got %d", len(devices))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCUDADeviceKeptByArchs(t *testing.T) {
|
||||
// RTX 4060 Ti (CC 8.9 = 890) with v13 ARCHS that include 890
|
||||
output := `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 16379 MiB):
|
||||
`,
|
||||
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v13"},
|
||||
},
|
||||
{
|
||||
name: "CUDA device kept by compiled archs",
|
||||
output: `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 16379 MiB):
|
||||
Device 0: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB
|
||||
system_info: n_threads = 16 | CUDA : ARCHS = 750,800,860,890,900,1000,1030,1100,1200,1210 |
|
||||
Available devices:
|
||||
CUDA0: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
|
||||
`
|
||||
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
|
||||
if len(devices) != 1 {
|
||||
t.Fatalf("expected 1 device (CC 890 in ARCHS), got %d", len(devices))
|
||||
}
|
||||
if devices[0].ComputeMajor != 8 || devices[0].ComputeMinor != 9 {
|
||||
t.Fatalf("expected compute 8.9, got %s", devices[0].Compute())
|
||||
}
|
||||
}
|
||||
|
||||
func TestCUDANoArchsFailOpen(t *testing.T) {
|
||||
// No system_info line — should keep all devices (fail open)
|
||||
output := `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
|
||||
`,
|
||||
want: []wantDevice{{
|
||||
name: "CUDA0",
|
||||
library: "CUDA",
|
||||
totalMiB: 16379,
|
||||
compute: "8.9",
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "CUDA without compiled archs fails open",
|
||||
output: `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
|
||||
Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
|
||||
Available devices:
|
||||
CUDA0: NVIDIA GeForce GTX 1060 6GB (6063 MiB, 5900 MiB free)
|
||||
`
|
||||
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
|
||||
if len(devices) != 1 {
|
||||
t.Fatalf("expected 1 device (no ARCHS = fail open), got %d", len(devices))
|
||||
}
|
||||
if devices[0].ComputeMajor != 6 || devices[0].ComputeMinor != 1 {
|
||||
t.Fatalf("expected compute 6.1, got %s", devices[0].Compute())
|
||||
}
|
||||
}
|
||||
|
||||
func TestCUDANoCCFailOpen(t *testing.T) {
|
||||
// Device line without compute capability — should keep (fail open)
|
||||
output := `system_info: n_threads = 4 | CUDA : ARCHS = 750,800 |
|
||||
`,
|
||||
want: []wantDevice{{
|
||||
name: "CUDA0",
|
||||
library: "CUDA",
|
||||
totalMiB: 6063,
|
||||
compute: "6.1",
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "CUDA without compute capability fails open",
|
||||
output: `system_info: n_threads = 4 | CUDA : ARCHS = 750,800 |
|
||||
Available devices:
|
||||
CUDA0: Some Future GPU (8192 MiB, 8000 MiB free)
|
||||
`
|
||||
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
|
||||
if len(devices) != 1 {
|
||||
t.Fatalf("expected 1 device (no CC = fail open), got %d", len(devices))
|
||||
}
|
||||
}
|
||||
|
||||
func TestCUDAMultiDeviceMixedFilter(t *testing.T) {
|
||||
// Two devices: one supported (CC 890), one not (CC 610)
|
||||
output := `ggml_cuda_init: found 2 CUDA devices:
|
||||
`,
|
||||
want: []wantDevice{{
|
||||
name: "CUDA0",
|
||||
library: "CUDA",
|
||||
totalMiB: 8192,
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "CUDA mixed arch support",
|
||||
output: `ggml_cuda_init: found 2 CUDA devices:
|
||||
Device 0: NVIDIA GeForce GTX 1060, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
|
||||
Device 1: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB
|
||||
system_info: n_threads = 8 | CUDA : ARCHS = 750,800,860,890 |
|
||||
Available devices:
|
||||
CUDA0: NVIDIA GeForce GTX 1060 (6063 MiB, 5900 MiB free)
|
||||
CUDA1: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
|
||||
`
|
||||
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
|
||||
if len(devices) != 1 {
|
||||
t.Fatalf("expected 1 device (only RTX 4060 Ti), got %d", len(devices))
|
||||
}
|
||||
if devices[0].Name != "CUDA1" {
|
||||
t.Errorf("expected CUDA1, got %s", devices[0].Name)
|
||||
}
|
||||
}
|
||||
|
||||
func TestROCmDeviceGFXTarget(t *testing.T) {
|
||||
output := `ggml_cuda_init: found 1 ROCm devices (Total VRAM: 12272 MiB):
|
||||
Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
|
||||
Available devices:
|
||||
ROCm0: AMD Radeon RX 6700 XT (12272 MiB, 12248 MiB free)
|
||||
`
|
||||
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
|
||||
if len(devices) != 1 {
|
||||
t.Fatalf("expected 1 device, got %d", len(devices))
|
||||
}
|
||||
if devices[0].GFXTarget != "gfx1031" {
|
||||
t.Errorf("expected gfx1031, got %s", devices[0].GFXTarget)
|
||||
}
|
||||
if devices[0].Compute() != "gfx1031" {
|
||||
t.Errorf("expected compute gfx1031, got %s", devices[0].Compute())
|
||||
}
|
||||
}
|
||||
|
||||
func TestROCmDeviceGFXTargetWithXnack(t *testing.T) {
|
||||
// gfx906 with :sramecc+:xnack- suffix (e.g., Radeon Pro VII)
|
||||
output := `ggml_cuda_init: found 2 ROCm devices (Total VRAM: 32736 MiB):
|
||||
`,
|
||||
want: []wantDevice{{
|
||||
name: "CUDA1",
|
||||
library: "CUDA",
|
||||
totalMiB: 16379,
|
||||
compute: "8.9",
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "ROCm gfx target with xnack suffix",
|
||||
output: `ggml_cuda_init: found 2 ROCm devices (Total VRAM: 32736 MiB):
|
||||
Device 0: AMD Radeon RX 6800, gfx1030 (0x1030), VMM: no, Wave Size: 32, VRAM: 16368 MiB
|
||||
Device 1: AMD Radeon Pro VII, gfx906:sramecc+:xnack- (0x906), VMM: no, Wave Size: 64, VRAM: 16368 MiB
|
||||
Available devices:
|
||||
ROCm0: AMD Radeon RX 6800 (16368 MiB, 16342 MiB free)
|
||||
ROCm1: AMD Radeon Pro VII (16368 MiB, 16348 MiB free)
|
||||
`
|
||||
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
|
||||
if len(devices) != 2 {
|
||||
t.Fatalf("expected 2 devices, got %d", len(devices))
|
||||
}
|
||||
if devices[0].GFXTarget != "gfx1030" {
|
||||
t.Errorf("device 0: expected gfx1030, got %s", devices[0].GFXTarget)
|
||||
}
|
||||
if devices[1].GFXTarget != "gfx906" {
|
||||
t.Errorf("device 1: expected gfx906, got %s", devices[1].GFXTarget)
|
||||
}
|
||||
if devices[0].Compute() != "gfx1030" {
|
||||
t.Errorf("device 0: expected compute gfx1030, got %s", devices[0].Compute())
|
||||
}
|
||||
if devices[1].Compute() != "gfx906" {
|
||||
t.Errorf("device 1: expected compute gfx906, got %s", devices[1].Compute())
|
||||
}
|
||||
}
|
||||
`,
|
||||
want: []wantDevice{
|
||||
{name: "ROCm0", library: "ROCm", totalMiB: 16368, compute: "gfx1030", gfxTarget: "gfx1030"},
|
||||
{name: "ROCm1", library: "ROCm", totalMiB: 16368, compute: "gfx906", gfxTarget: "gfx906"},
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "unknown library",
|
||||
output: `Available devices:
|
||||
Future0: Mystery Accelerator (8192 MiB, 8000 MiB free)
|
||||
`,
|
||||
want: []wantDevice{{
|
||||
name: "Future0",
|
||||
library: "Mystery Accelerator",
|
||||
totalMiB: 8192,
|
||||
}},
|
||||
},
|
||||
{
|
||||
name: "no devices",
|
||||
output: "Available devices:\n",
|
||||
},
|
||||
{
|
||||
name: "empty output",
|
||||
},
|
||||
}
|
||||
|
||||
func TestInferLibrary(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
desc string
|
||||
want string
|
||||
}{
|
||||
{"NVIDIA CUDA", "NVIDIA GeForce RTX 4090", "CUDA"},
|
||||
{"CUDA0", "NVIDIA GeForce RTX 4090", "CUDA"},
|
||||
{"AMD ROCm", "AMD Radeon RX 6700 XT", "ROCm"},
|
||||
{"ROCm0", "AMD Radeon RX 6700 XT", "ROCm"},
|
||||
{"Metal", "Apple M3 Max", "Metal"},
|
||||
{"Vulkan0", "NVIDIA GeForce RTX 4090 (Vulkan)", "Vulkan"},
|
||||
{"Unknown", "Unknown Backend", "Unknown Backend"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
got := inferLibrary(tt.name, tt.desc)
|
||||
if got != tt.want {
|
||||
t.Errorf("inferLibrary(%q, %q) = %q, want %q", tt.name, tt.desc, got, tt.want)
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
if tt.libDirs == nil {
|
||||
tt.libDirs = []string{"/lib/ollama"}
|
||||
}
|
||||
devices := parseLlamaServerDevices(tt.output, tt.libDirs)
|
||||
if len(devices) != len(tt.want) {
|
||||
t.Fatalf("got %d devices, want %d", len(devices), len(tt.want))
|
||||
}
|
||||
for i, want := range tt.want {
|
||||
got := devices[i]
|
||||
if want.name != "" && got.Name != want.name {
|
||||
t.Errorf("device %d name = %q, want %q", i, got.Name, want.name)
|
||||
}
|
||||
if want.library != "" && got.Library != want.library {
|
||||
t.Errorf("device %d library = %q, want %q", i, got.Library, want.library)
|
||||
}
|
||||
if want.totalMiB > 0 && got.TotalMemory != want.totalMiB*1024*1024 {
|
||||
t.Errorf("device %d total memory = %d, want %d MiB", i, got.TotalMemory, want.totalMiB)
|
||||
}
|
||||
if want.compute != "" && got.Compute() != want.compute {
|
||||
t.Errorf("device %d compute = %q, want %q", i, got.Compute(), want.compute)
|
||||
}
|
||||
if want.gfxTarget != "" && got.GFXTarget != want.gfxTarget {
|
||||
t.Errorf("device %d gfx target = %q, want %q", i, got.GFXTarget, want.gfxTarget)
|
||||
}
|
||||
if want.checkIntegrated && got.Integrated != want.integrated {
|
||||
t.Errorf("device %d integrated = %v, want %v", i, got.Integrated, want.integrated)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCudaCCRegex(t *testing.T) {
|
||||
tests := []struct {
|
||||
line string
|
||||
wantIdx int
|
||||
wantCC string
|
||||
}{
|
||||
{" Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB", 0, "610"},
|
||||
{" Device 1: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB", 1, "890"},
|
||||
{" Device 0: NVIDIA RTX PRO 6000, compute capability 12.0, VMM: yes, VRAM: 97250 MiB", 0, "1200"},
|
||||
{" Device 0: Tesla V100-PCIE-16GB, compute capability 7.0, VMM: yes, VRAM: 16160 MiB", 0, "700"},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
matches := cudaCCRegex.FindStringSubmatch(tt.line)
|
||||
if matches == nil {
|
||||
t.Errorf("expected match for %q", tt.line)
|
||||
continue
|
||||
}
|
||||
idx, _ := strconv.Atoi(matches[1])
|
||||
major, _ := strconv.Atoi(matches[2])
|
||||
minor, _ := strconv.Atoi(matches[3])
|
||||
cc := fmt.Sprintf("%d%d0", major, minor)
|
||||
if idx != tt.wantIdx {
|
||||
t.Errorf("for %q: got idx %d, want %d", tt.line, idx, tt.wantIdx)
|
||||
}
|
||||
if cc != tt.wantCC {
|
||||
t.Errorf("for %q: got CC %s, want %s", tt.line, cc, tt.wantCC)
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
func TestCudaArchsRegex(t *testing.T) {
|
||||
tests := []struct {
|
||||
line string
|
||||
want []string
|
||||
}{
|
||||
{
|
||||
"system_info: n_threads = 16 | CUDA : ARCHS = 750,800,860,890 | USE_GRAPHS = 1 |",
|
||||
[]string{"750", "800", "860", "890"},
|
||||
},
|
||||
{
|
||||
"system_info: | CUDA : ARCHS = 500,520,600,610,700,750,800,860,890,900,1200 |",
|
||||
[]string{"500", "520", "600", "610", "700", "750", "800", "860", "890", "900", "1200"},
|
||||
},
|
||||
{
|
||||
"no archs here",
|
||||
nil,
|
||||
},
|
||||
}
|
||||
for _, tt := range tests {
|
||||
matches := cudaArchsRegex.FindStringSubmatch(tt.line)
|
||||
if tt.want == nil {
|
||||
if matches != nil {
|
||||
t.Errorf("expected no match for %q, got %v", tt.line, matches)
|
||||
}
|
||||
continue
|
||||
}
|
||||
if matches == nil {
|
||||
t.Errorf("expected match for %q, got nil", tt.line)
|
||||
continue
|
||||
}
|
||||
got := strings.Split(matches[1], ",")
|
||||
if len(got) != len(tt.want) {
|
||||
t.Errorf("for %q: got %v, want %v", tt.line, got, tt.want)
|
||||
}
|
||||
}
|
||||
})
|
||||
}
|
||||
|
|
|
|||
|
|
@ -139,7 +139,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
|||
wg.Add(1)
|
||||
go func(i int) {
|
||||
defer wg.Done()
|
||||
extraEnvs := ml.GetDevicesEnv(devices[i:i+1], true)
|
||||
extraEnvs := ml.GetDevicesEnv(devices[i : i+1])
|
||||
devices[i].AddInitValidation(extraEnvs)
|
||||
if len(bootstrapDevicesWithMetalRetry(ctx2ndPass, ctx, 30*time.Second, devices[i].LibraryPath, extraEnvs)) == 0 {
|
||||
slog.Debug("filtering device which didn't fully initialize",
|
||||
|
|
@ -324,9 +324,10 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
|
|||
rctx, cancel := context.WithTimeout(ctx, 3*time.Second)
|
||||
defer cancel()
|
||||
|
||||
// Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct
|
||||
// We avoid CUDA filters here to keep ROCm from failing to discover GPUs in a mixed environment
|
||||
devFilter := ml.GetDevicesEnv(devices, false)
|
||||
// Apply any device filters to avoid re-discovering unsupported devices,
|
||||
// and get IDs correct. GetDevicesEnv only filters visibility when this
|
||||
// pass is narrowed to a single device.
|
||||
devFilter := ml.GetDevicesEnv(devices)
|
||||
|
||||
for dir := range libDirs {
|
||||
updatedDevices := bootstrapDevicesWithMetalRetry(rctx, ctx, 3*time.Second, []string{ml.LibOllamaPath, dir}, devFilter)
|
||||
|
|
|
|||
77
llama/compat/llama-ollama-compat.cpp
vendored
77
llama/compat/llama-ollama-compat.cpp
vendored
|
|
@ -28,12 +28,6 @@ using namespace llama_ollama_compat::detail; // pull detail:: helpers into scope
|
|||
|
||||
namespace {
|
||||
|
||||
thread_local uint64_t g_compat_info_logs = 0;
|
||||
|
||||
void note_compat_info_log() {
|
||||
++g_compat_info_logs;
|
||||
}
|
||||
|
||||
#ifdef OLLAMA_COMPAT_MTMD_BUILD
|
||||
void ollama_compat_log(const char * format, ...) {
|
||||
std::va_list args;
|
||||
|
|
@ -42,10 +36,10 @@ void ollama_compat_log(const char * format, ...) {
|
|||
va_end(args);
|
||||
}
|
||||
|
||||
#define OLLAMA_COMPAT_LOG_INFO(...) do { note_compat_info_log(); ollama_compat_log(__VA_ARGS__); } while (0)
|
||||
#define OLLAMA_COMPAT_LOG_INFO(...) do { ollama_compat_log(__VA_ARGS__); } while (0)
|
||||
#define OLLAMA_COMPAT_LOG_ERROR(...) ollama_compat_log(__VA_ARGS__)
|
||||
#else
|
||||
#define OLLAMA_COMPAT_LOG_INFO(...) do { note_compat_info_log(); LLAMA_LOG_INFO(__VA_ARGS__); } while (0)
|
||||
#define OLLAMA_COMPAT_LOG_INFO(...) do { LLAMA_LOG_INFO(__VA_ARGS__); } while (0)
|
||||
#define OLLAMA_COMPAT_LOG_ERROR(...) LLAMA_LOG_ERROR(__VA_ARGS__)
|
||||
#endif
|
||||
|
||||
|
|
@ -53,47 +47,23 @@ double elapsed_ms(std::chrono::steady_clock::time_point start) {
|
|||
return std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start).count();
|
||||
}
|
||||
|
||||
class CompatScopeTimer {
|
||||
public:
|
||||
CompatScopeTimer(const char * phase, const std::string * arch = nullptr)
|
||||
: phase_(phase),
|
||||
arch_before_(arch ? *arch : ""),
|
||||
arch_(arch),
|
||||
log_start_(g_compat_info_logs),
|
||||
start_(std::chrono::steady_clock::now()) {}
|
||||
|
||||
void force_log() {
|
||||
forced_ = true;
|
||||
}
|
||||
|
||||
~CompatScopeTimer() {
|
||||
const bool arch_changed = arch_ && *arch_ != arch_before_;
|
||||
if (!forced_ && !arch_changed && g_compat_info_logs == log_start_) return;
|
||||
|
||||
if (arch_) {
|
||||
if (arch_changed) {
|
||||
OLLAMA_COMPAT_LOG_INFO("compat patch timing: phase=%s arch_before=%s arch_after=%s duration_ms=%.3f\n",
|
||||
phase_, arch_before_.c_str(), arch_->c_str(), elapsed_ms(start_));
|
||||
} else {
|
||||
OLLAMA_COMPAT_LOG_INFO("compat patch timing: phase=%s arch=%s duration_ms=%.3f\n",
|
||||
phase_, arch_->c_str(), elapsed_ms(start_));
|
||||
}
|
||||
return;
|
||||
}
|
||||
|
||||
OLLAMA_COMPAT_LOG_INFO("compat patch timing: phase=%s duration_ms=%.3f\n",
|
||||
phase_, elapsed_ms(start_));
|
||||
}
|
||||
|
||||
private:
|
||||
const char * phase_;
|
||||
std::string arch_before_;
|
||||
const std::string * arch_;
|
||||
uint64_t log_start_;
|
||||
std::chrono::steady_clock::time_point start_;
|
||||
bool forced_ = false;
|
||||
struct TransformTiming {
|
||||
uint64_t count;
|
||||
size_t bytes;
|
||||
double ms;
|
||||
};
|
||||
|
||||
std::mutex g_transform_timing_mutex;
|
||||
TransformTiming g_transform_timing = {};
|
||||
|
||||
TransformTiming record_transform_timing(size_t bytes, double ms) {
|
||||
std::lock_guard<std::mutex> lk(g_transform_timing_mutex);
|
||||
g_transform_timing.count++;
|
||||
g_transform_timing.bytes += bytes;
|
||||
g_transform_timing.ms += ms;
|
||||
return g_transform_timing;
|
||||
}
|
||||
|
||||
// Per-loader file path registry — set by translate_metadata, read by
|
||||
// maybe_load_text_tensor so it can pass the path to load ops without a
|
||||
// separate patch insertion in the model loader's load_all_data path.
|
||||
|
|
@ -2791,7 +2761,6 @@ bool translate_metadata(const llama_model_loader * ml,
|
|||
std::string & arch_name,
|
||||
const char * fname) {
|
||||
if (!meta) return false;
|
||||
CompatScopeTimer timing("metadata", &arch_name);
|
||||
{
|
||||
std::lock_guard<std::mutex> lk(g_loader_path_mutex);
|
||||
g_loader_paths[ml] = fname ? fname : "";
|
||||
|
|
@ -2826,13 +2795,14 @@ bool translate_metadata(const llama_model_loader * ml,
|
|||
// Dispatch. Add more arches as they are wired up.
|
||||
|
||||
const bool no_mmap = is_mmap_disabled_for(ml);
|
||||
if (no_mmap) timing.force_log();
|
||||
if (no_mmap) {
|
||||
OLLAMA_COMPAT_LOG_INFO("compat patch disabled mmap for transformed text tensors\n");
|
||||
}
|
||||
return no_mmap;
|
||||
}
|
||||
|
||||
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
|
||||
if (!meta) return;
|
||||
CompatScopeTimer timing("clip");
|
||||
|
||||
handle_legacy_llava_projector(meta);
|
||||
|
||||
|
|
@ -2921,8 +2891,11 @@ bool maybe_load_tensor(ggml_tensor * cur,
|
|||
ggml_backend_tensor_set(cur, dst.data(), 0, dst_size);
|
||||
}
|
||||
|
||||
OLLAMA_COMPAT_LOG_INFO("%s: %s for %s (%zu bytes) in %.3f ms\n",
|
||||
__func__, op.description, ggml_get_name(cur), dst_size, elapsed_ms(start));
|
||||
const double ms = elapsed_ms(start);
|
||||
const TransformTiming total = record_transform_timing(dst_size, ms);
|
||||
OLLAMA_COMPAT_LOG_INFO("compat tensor transform: op=%s tensor=%s bytes=%zu duration_ms=%.3f total_ops=%llu total_bytes=%zu total_ms=%.3f\n",
|
||||
op.description, ggml_get_name(cur), dst_size, ms,
|
||||
(unsigned long long) total.count, total.bytes, total.ms);
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -601,7 +601,7 @@ func NewLlamaServerRunner(
|
|||
memWriter := &memoryParsingWriter{inner: status}
|
||||
|
||||
mediaMarker := newLlamaServerMediaMarker()
|
||||
extraEnvs := ml.GetDevicesEnv(gpus, false)
|
||||
extraEnvs := ml.GetDevicesEnv(gpus)
|
||||
serverEnvs := make(map[string]string, len(extraEnvs)+1)
|
||||
for k, v := range extraEnvs {
|
||||
serverEnvs[k] = v
|
||||
|
|
|
|||
12
ml/device.go
12
ml/device.go
|
|
@ -532,12 +532,12 @@ func (f FlashAttentionType) String() string {
|
|||
|
||||
// Given the list of GPUs this instantiation is targeted for,
|
||||
// figure out the device environment variables and any recorded
|
||||
// per-device runner environment overrides. Set mustFilter true to enable
|
||||
// filtering of CUDA devices.
|
||||
func GetDevicesEnv(l []DeviceInfo, mustFilter bool) map[string]string {
|
||||
// per-device runner environment overrides.
|
||||
func GetDevicesEnv(l []DeviceInfo) map[string]string {
|
||||
if len(l) == 0 {
|
||||
return nil
|
||||
}
|
||||
mustFilter := len(l) == 1
|
||||
env := map[string]string{}
|
||||
for _, d := range l {
|
||||
d.updateVisibleDevicesEnv(env, mustFilter)
|
||||
|
|
@ -595,8 +595,12 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string, mustFilter bo
|
|||
return
|
||||
}
|
||||
envVar = "CUDA_VISIBLE_DEVICES"
|
||||
case "Vulkan":
|
||||
if !mustFilter {
|
||||
return
|
||||
}
|
||||
envVar = "GGML_VK_VISIBLE_DEVICES"
|
||||
default:
|
||||
// Vulkan is not filtered via env var, but via scheduling decisions
|
||||
return
|
||||
}
|
||||
v, existing := env[envVar]
|
||||
|
|
|
|||
|
|
@ -13,20 +13,13 @@ func TestMergeEnvWithRunnerEnvOverrides(t *testing.T) {
|
|||
DeviceID: DeviceID{Library: "Metal", ID: "0"},
|
||||
RunnerEnvOverrides: map[string]string{"GGML_METAL_TENSOR_DISABLE": "1"},
|
||||
},
|
||||
{
|
||||
DeviceID: DeviceID{Library: "CUDA", ID: "3"},
|
||||
},
|
||||
}
|
||||
|
||||
env := GetDevicesEnv(devices, true)
|
||||
env := GetDevicesEnv(devices)
|
||||
|
||||
if got, want := env["GGML_METAL_TENSOR_DISABLE"], "1"; got != want {
|
||||
t.Fatalf("GGML_METAL_TENSOR_DISABLE = %q, want %q", got, want)
|
||||
}
|
||||
|
||||
if got, want := env["CUDA_VISIBLE_DEVICES"], "3"; got != want {
|
||||
t.Fatalf("CUDA_VISIBLE_DEVICES = %q, want %q", got, want)
|
||||
}
|
||||
}
|
||||
|
||||
func TestGetDevicesEnvWarnsOnConflictingOverrides(t *testing.T) {
|
||||
|
|
@ -48,7 +41,7 @@ func TestGetDevicesEnvWarnsOnConflictingOverrides(t *testing.T) {
|
|||
},
|
||||
}
|
||||
|
||||
env := GetDevicesEnv(devices, false)
|
||||
env := GetDevicesEnv(devices)
|
||||
|
||||
if got, want := env["TEST_OVERRIDE"], "two"; got != want {
|
||||
t.Fatalf("TEST_OVERRIDE = %q, want %q", got, want)
|
||||
|
|
@ -59,6 +52,53 @@ func TestGetDevicesEnvWarnsOnConflictingOverrides(t *testing.T) {
|
|||
}
|
||||
}
|
||||
|
||||
func TestGetDevicesEnvFiltersSingleDevice(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
gpus []DeviceInfo
|
||||
key string
|
||||
want string
|
||||
}{
|
||||
{
|
||||
name: "single CUDA",
|
||||
gpus: []DeviceInfo{{DeviceID: DeviceID{Library: "CUDA", ID: "3"}}},
|
||||
key: "CUDA_VISIBLE_DEVICES",
|
||||
want: "3",
|
||||
},
|
||||
{
|
||||
name: "multiple CUDA",
|
||||
gpus: []DeviceInfo{
|
||||
{DeviceID: DeviceID{Library: "CUDA", ID: "3"}},
|
||||
{DeviceID: DeviceID{Library: "CUDA", ID: "4"}},
|
||||
},
|
||||
key: "CUDA_VISIBLE_DEVICES",
|
||||
},
|
||||
{
|
||||
name: "single Vulkan",
|
||||
gpus: []DeviceInfo{{DeviceID: DeviceID{Library: "Vulkan", ID: "0"}, FilterID: "1"}},
|
||||
key: "GGML_VK_VISIBLE_DEVICES",
|
||||
want: "1",
|
||||
},
|
||||
{
|
||||
name: "multiple Vulkan",
|
||||
gpus: []DeviceInfo{
|
||||
{DeviceID: DeviceID{Library: "Vulkan", ID: "0"}, FilterID: "1"},
|
||||
{DeviceID: DeviceID{Library: "Vulkan", ID: "1"}, FilterID: "0"},
|
||||
},
|
||||
key: "GGML_VK_VISIBLE_DEVICES",
|
||||
},
|
||||
}
|
||||
|
||||
for _, tt := range tests {
|
||||
t.Run(tt.name, func(t *testing.T) {
|
||||
env := GetDevicesEnv(tt.gpus)
|
||||
if got := env[tt.key]; got != tt.want {
|
||||
t.Fatalf("%s = %q, want %q", tt.key, got, tt.want)
|
||||
}
|
||||
})
|
||||
}
|
||||
}
|
||||
|
||||
func TestFlashAttentionSupported(t *testing.T) {
|
||||
tests := []struct {
|
||||
name string
|
||||
|
|
|
|||
|
|
@ -784,26 +784,43 @@ func selectLlamaServerPlacement(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo,
|
|||
}
|
||||
|
||||
if opts.MainGPU != nil {
|
||||
selected, ok := bestExplicitMainGPUGroup(systemInfo, groups, *opts.MainGPU)
|
||||
gpu, available, ok := bestExplicitMainGPU(systemInfo, groups, *opts.MainGPU)
|
||||
if !ok {
|
||||
selected = bestGPUGroupByAvailableMemory(systemInfo, groups)
|
||||
selected := bestGPUGroupByAvailableMemory(systemInfo, groups)
|
||||
slog.Warn("requested main_gpu is outside the selected GPU group; passing value through to llama-server",
|
||||
"main_gpu", *opts.MainGPU,
|
||||
"gpu_count", len(selected))
|
||||
logSelectedGPUGroup(gpus, selected)
|
||||
return selected, launchOpts
|
||||
}
|
||||
|
||||
selected, launchOpts := singleLlamaServerGPUPlacement(gpu, launchOpts)
|
||||
slog.Info("selecting requested single GPU for llama-server model",
|
||||
"requested_main_gpu", *opts.MainGPU,
|
||||
"main_gpu", *launchOpts.MainGPU,
|
||||
"id", gpu.ID,
|
||||
"filter_id", gpu.FilterID,
|
||||
"library", gpu.Library,
|
||||
"name", gpu.Name,
|
||||
"description", gpu.Description,
|
||||
"integrated", gpu.Integrated,
|
||||
"available", format.HumanBytes2(available))
|
||||
logSelectedGPUGroup(gpus, selected)
|
||||
return selected, launchOpts
|
||||
}
|
||||
|
||||
if !envconfig.SchedSpread() && predictedVRAM > 0 {
|
||||
selected, mainGPU, gpu, available, ok := bestSingleGPUFit(systemInfo, groups, predictedVRAM)
|
||||
gpu, available, ok := bestSingleGPUFit(systemInfo, groups, predictedVRAM)
|
||||
if ok {
|
||||
launchOpts.MainGPU = &mainGPU
|
||||
selected, launchOpts := singleLlamaServerGPUPlacement(gpu, launchOpts)
|
||||
slog.Info("selecting single GPU for llama-server model",
|
||||
"main_gpu", mainGPU,
|
||||
"main_gpu", *launchOpts.MainGPU,
|
||||
"id", gpu.ID,
|
||||
"filter_id", gpu.FilterID,
|
||||
"library", gpu.Library,
|
||||
"name", gpu.Name,
|
||||
"description", gpu.Description,
|
||||
"integrated", gpu.Integrated,
|
||||
"predicted", format.HumanBytes2(predictedVRAM),
|
||||
"available", format.HumanBytes2(available))
|
||||
logSelectedGPUGroup(gpus, selected)
|
||||
|
|
@ -816,37 +833,41 @@ func selectLlamaServerPlacement(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo,
|
|||
return selected, launchOpts
|
||||
}
|
||||
|
||||
func bestExplicitMainGPUGroup(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, mainGPU int) ([]ml.DeviceInfo, bool) {
|
||||
func singleLlamaServerGPUPlacement(gpu ml.DeviceInfo, opts api.Options) ([]ml.DeviceInfo, api.Options) {
|
||||
mainGPU := 0
|
||||
opts.MainGPU = &mainGPU
|
||||
return []ml.DeviceInfo{gpu}, opts
|
||||
}
|
||||
|
||||
func bestExplicitMainGPU(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, mainGPU int) (gpu ml.DeviceInfo, available uint64, ok bool) {
|
||||
if mainGPU < 0 {
|
||||
return nil, false
|
||||
return ml.DeviceInfo{}, 0, false
|
||||
}
|
||||
|
||||
var best []ml.DeviceInfo
|
||||
var bestAvailable uint64
|
||||
for _, group := range groups {
|
||||
if mainGPU >= len(group) {
|
||||
continue
|
||||
}
|
||||
available := availableMemoryForGPU(systemInfo, group[mainGPU])
|
||||
if best == nil || available > bestAvailable {
|
||||
best = group
|
||||
bestAvailable = available
|
||||
candidate := group[mainGPU]
|
||||
candidateAvailable := availableMemoryForGPU(systemInfo, candidate)
|
||||
if !ok || betterPlacementGPU(candidate, candidateAvailable, gpu, available) {
|
||||
gpu = candidate
|
||||
available = candidateAvailable
|
||||
ok = true
|
||||
}
|
||||
}
|
||||
|
||||
return best, best != nil
|
||||
return gpu, available, ok
|
||||
}
|
||||
|
||||
func bestSingleGPUFit(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, predictedVRAM uint64) (selected []ml.DeviceInfo, mainGPU int, gpu ml.DeviceInfo, available uint64, ok bool) {
|
||||
func bestSingleGPUFit(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, predictedVRAM uint64) (gpu ml.DeviceInfo, available uint64, ok bool) {
|
||||
for _, group := range groups {
|
||||
for i, candidate := range group {
|
||||
for _, candidate := range group {
|
||||
candidateAvailable := availableMemoryForGPU(systemInfo, candidate)
|
||||
if predictedVRAM > candidateAvailable*80/100 {
|
||||
continue
|
||||
}
|
||||
if !ok || candidateAvailable > available {
|
||||
selected = group
|
||||
mainGPU = i
|
||||
if !ok || betterPlacementGPU(candidate, candidateAvailable, gpu, available) {
|
||||
gpu = candidate
|
||||
available = candidateAvailable
|
||||
ok = true
|
||||
|
|
@ -854,7 +875,15 @@ func bestSingleGPUFit(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, predic
|
|||
}
|
||||
}
|
||||
|
||||
return selected, mainGPU, gpu, available, ok
|
||||
return gpu, available, ok
|
||||
}
|
||||
|
||||
func betterPlacementGPU(candidate ml.DeviceInfo, candidateAvailable uint64, current ml.DeviceInfo, currentAvailable uint64) bool {
|
||||
if candidate.Integrated != current.Integrated {
|
||||
return !candidate.Integrated
|
||||
}
|
||||
|
||||
return candidateAvailable > currentAvailable
|
||||
}
|
||||
|
||||
func bestGPUGroupByAvailableMemory(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo) []ml.DeviceInfo {
|
||||
|
|
@ -862,7 +891,7 @@ func bestGPUGroupByAvailableMemory(systemInfo ml.SystemInfo, groups [][]ml.Devic
|
|||
var bestAvailable uint64
|
||||
for _, group := range groups {
|
||||
available, _, _ := availableMemoryForLoad(systemInfo, group)
|
||||
if best == nil || available > bestAvailable {
|
||||
if best == nil || betterPlacementGroup(group, available, best, bestAvailable) {
|
||||
best = group
|
||||
bestAvailable = available
|
||||
}
|
||||
|
|
@ -871,6 +900,25 @@ func bestGPUGroupByAvailableMemory(systemInfo ml.SystemInfo, groups [][]ml.Devic
|
|||
return best
|
||||
}
|
||||
|
||||
func betterPlacementGroup(candidate []ml.DeviceInfo, candidateAvailable uint64, current []ml.DeviceInfo, currentAvailable uint64) bool {
|
||||
candidateDiscrete := hasDiscreteGPU(candidate)
|
||||
currentDiscrete := hasDiscreteGPU(current)
|
||||
if candidateDiscrete != currentDiscrete {
|
||||
return candidateDiscrete
|
||||
}
|
||||
|
||||
return candidateAvailable > currentAvailable
|
||||
}
|
||||
|
||||
func hasDiscreteGPU(gpus []ml.DeviceInfo) bool {
|
||||
for _, gpu := range gpus {
|
||||
if !gpu.Integrated {
|
||||
return true
|
||||
}
|
||||
}
|
||||
return false
|
||||
}
|
||||
|
||||
func availableMemoryForGPU(systemInfo ml.SystemInfo, gpu ml.DeviceInfo) uint64 {
|
||||
if gpu.Integrated && systemInfo.FreeMemory > 0 && systemInfo.FreeMemory < gpu.FreeMemory {
|
||||
return systemInfo.FreeMemory
|
||||
|
|
|
|||
|
|
@ -1176,6 +1176,7 @@ func TestSelectLlamaServerPlacement(t *testing.T) {
|
|||
wantLibrary string
|
||||
wantMainGPU *int
|
||||
wantSelectedGPUs int
|
||||
wantGPUID string
|
||||
}{
|
||||
{
|
||||
name: "compacts onto largest same-backend GPU",
|
||||
|
|
@ -1186,8 +1187,9 @@ func TestSelectLlamaServerPlacement(t *testing.T) {
|
|||
},
|
||||
opts: api.DefaultOptions(),
|
||||
wantLibrary: "CUDA",
|
||||
wantMainGPU: testIntPtr(1),
|
||||
wantSelectedGPUs: 2,
|
||||
wantMainGPU: testIntPtr(0),
|
||||
wantSelectedGPUs: 1,
|
||||
wantGPUID: "1",
|
||||
},
|
||||
{
|
||||
name: "explicit main gpu selects matching backend group",
|
||||
|
|
@ -1201,8 +1203,9 @@ func TestSelectLlamaServerPlacement(t *testing.T) {
|
|||
Runner: api.Runner{MainGPU: testIntPtr(1), NumGPU: -1},
|
||||
},
|
||||
wantLibrary: "ROCm",
|
||||
wantMainGPU: testIntPtr(1),
|
||||
wantSelectedGPUs: 2,
|
||||
wantMainGPU: testIntPtr(0),
|
||||
wantSelectedGPUs: 1,
|
||||
wantGPUID: "1",
|
||||
},
|
||||
{
|
||||
name: "integrated GPU is capped by system free memory",
|
||||
|
|
@ -1213,8 +1216,22 @@ func TestSelectLlamaServerPlacement(t *testing.T) {
|
|||
},
|
||||
opts: api.DefaultOptions(),
|
||||
wantLibrary: "Metal",
|
||||
wantMainGPU: testIntPtr(1),
|
||||
wantSelectedGPUs: 2,
|
||||
wantMainGPU: testIntPtr(0),
|
||||
wantSelectedGPUs: 1,
|
||||
wantGPUID: "1",
|
||||
},
|
||||
{
|
||||
name: "prefers discrete GPU over integrated GPU with more available memory",
|
||||
predictedVRAM: 8 * format.GigaByte,
|
||||
gpus: []ml.DeviceInfo{
|
||||
{DeviceID: ml.DeviceID{ID: "0", Library: "Vulkan"}, Name: "integrated", Integrated: true, FreeMemory: 32 * format.GigaByte},
|
||||
{DeviceID: ml.DeviceID{ID: "1", Library: "Vulkan"}, Name: "discrete", FreeMemory: 10 * format.GigaByte},
|
||||
},
|
||||
opts: api.DefaultOptions(),
|
||||
wantLibrary: "Vulkan",
|
||||
wantMainGPU: testIntPtr(0),
|
||||
wantSelectedGPUs: 1,
|
||||
wantGPUID: "1",
|
||||
},
|
||||
{
|
||||
name: "spread disables automatic compaction",
|
||||
|
|
@ -1249,6 +1266,9 @@ func TestSelectLlamaServerPlacement(t *testing.T) {
|
|||
selected, launchOpts := selectLlamaServerPlacement(systemInfo, tt.gpus, tt.predictedVRAM, tt.opts)
|
||||
require.Len(t, selected, tt.wantSelectedGPUs)
|
||||
require.Equal(t, tt.wantLibrary, selected[0].Library)
|
||||
if tt.wantGPUID != "" {
|
||||
require.Equal(t, tt.wantGPUID, selected[0].ID)
|
||||
}
|
||||
if tt.wantMainGPU == nil {
|
||||
require.Nil(t, launchOpts.MainGPU)
|
||||
} else {
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue