scheduler improvements

Daniel Hiltgen 2026-05-08 11:14:34 -07:00
parent fa7092b1d3
commit 88935c21b5
10 changed files with 393 additions and 393 deletions


@@ -1 +1 @@
b9048
b9075


@@ -61,6 +61,7 @@ func llamaServerDiscoverDevices(ctx context.Context, libDirs []string, extraEnvs
"--host", "127.0.0.1",
"--no-webui",
"--offline",
"--verbose",
)
cmd.WaitDelay = llamaServerDiscoveryWaitDelay
cmd.Env = os.Environ()
@@ -137,7 +138,7 @@ func llamaServerDiscoverDevices(ctx context.Context, libDirs []string, extraEnvs
// Also run --list-devices to get the stdout device list with free memory
// (the brief server startup doesn't print that)
cmd2 := exec.CommandContext(ctx, llamaServer, "--list-devices", "--offline")
cmd2 := exec.CommandContext(ctx, llamaServer, "--list-devices", "--offline", "--verbose")
cmd2.WaitDelay = llamaServerDiscoveryWaitDelay
cmd2.Env = cmd.Env // reuse same environment
listOutput, err := cmd2.CombinedOutput()
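
Both discovery invocations now pass --verbose because the parser keys off backend initialization lines (compute capability, compiled ARCHS, Vulkan UMA flags) that llama-server only prints at higher verbosity; the device list itself still comes from the "Available devices:" stdout block. A minimal sketch of parsing those stdout lines — the regex and function names here are illustrative, not the exact ones in the tree:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// Matches stdout lines such as:
//   CUDA0: NVIDIA GeForce RTX 4090 (24564 MiB, 23592 MiB free)
//   Metal: Apple M3 Max (98304 MiB, 98303 MiB free)
var deviceLineRe = regexp.MustCompile(`^\s*(\S.*?): (.+) \((\d+) MiB, (\d+) MiB free\)$`)

func parseDeviceLine(line string) (name, desc string, totalMiB, freeMiB uint64, ok bool) {
	m := deviceLineRe.FindStringSubmatch(line)
	if m == nil {
		return "", "", 0, 0, false
	}
	totalMiB, _ = strconv.ParseUint(m[3], 10, 64)
	freeMiB, _ = strconv.ParseUint(m[4], 10, 64)
	return m[1], m[2], totalMiB, freeMiB, true
}

func main() {
	name, desc, total, free, ok := parseDeviceLine("CUDA0: NVIDIA GeForce RTX 4090 (24564 MiB, 23592 MiB free)")
	fmt.Println(name, desc, total, free, ok) // CUDA0 NVIDIA GeForce RTX 4090 24564 23592 true
}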


@@ -1,348 +1,261 @@
package discover
import (
"fmt"
"io"
"log/slog"
"strconv"
"strings"
"testing"
"github.com/ollama/ollama/logutil"
)
func TestLlamaServerDiscoveryOutputOnlyTrace(t *testing.T) {
original := slog.Default()
t.Cleanup(func() {
slog.SetDefault(original)
func TestLlamaServerDiscovery(t *testing.T) {
t.Run("output only trace", func(t *testing.T) {
original := slog.Default()
t.Cleanup(func() {
slog.SetDefault(original)
})
slog.SetDefault(logutil.NewLogger(io.Discard, slog.LevelDebug))
if got := llamaServerDiscoveryOutput(t.Context()); got != io.Discard {
t.Fatal("debug logging should discard raw llama-server discovery output")
}
slog.SetDefault(logutil.NewLogger(io.Discard, logutil.LevelTrace))
if got := llamaServerDiscoveryOutput(t.Context()); got == io.Discard {
t.Fatal("trace logging should emit raw llama-server discovery output")
}
})
slog.SetDefault(logutil.NewLogger(io.Discard, slog.LevelDebug))
if got := llamaServerDiscoveryOutput(t.Context()); got != io.Discard {
t.Fatal("debug logging should discard raw llama-server discovery output")
}
t.Run("parse devices", func(t *testing.T) {
type wantDevice struct {
name string
library string
totalMiB uint64
compute string
gfxTarget string
checkIntegrated bool
integrated bool
}
slog.SetDefault(logutil.NewLogger(io.Discard, logutil.LevelTrace))
if got := llamaServerDiscoveryOutput(t.Context()); got == io.Discard {
t.Fatal("trace logging should emit raw llama-server discovery output")
}
}
func TestParseLlamaServerDevices(t *testing.T) {
tests := []struct {
name string
output string
libDirs []string
wantLen int
wantName string
wantLib string
wantMiB uint64
}{
{
name: "NVIDIA CUDA",
output: `load_backend: loaded CUDA backend from /lib/ollama/cuda_v12/libggml-cuda.so
tests := []struct {
name string
output string
libDirs []string
want []wantDevice
}{
{
name: "NVIDIA CUDA",
output: `load_backend: loaded CUDA backend from /lib/ollama/cuda_v12/libggml-cuda.so
Available devices:
NVIDIA GeForce RTX 4090: NVIDIA CUDA (24564 MiB, 23592 MiB free)
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
wantLen: 1,
wantName: "NVIDIA GeForce RTX 4090",
wantLib: "CUDA",
wantMiB: 24564,
},
{
name: "Metal",
output: `Available devices:
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
want: []wantDevice{{
name: "NVIDIA GeForce RTX 4090",
library: "CUDA",
totalMiB: 24564,
}},
},
{
name: "Metal",
output: `Available devices:
Metal: Apple M3 Max (98304 MiB, 98303 MiB free)
`,
libDirs: []string{"/lib/ollama"},
wantLen: 1,
wantName: "Metal",
wantLib: "Metal",
wantMiB: 98304,
},
{
name: "ROCm with gfx target",
output: ` Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
want: []wantDevice{{
name: "Metal",
library: "Metal",
totalMiB: 98304,
}},
},
{
name: "ROCm with gfx target",
output: ` Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
Available devices:
ROCm0: AMD Radeon RX 6700 XT (12272 MiB, 12248 MiB free)
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/rocm"},
wantLen: 1,
wantName: "ROCm0",
wantLib: "ROCm",
wantMiB: 12272,
},
{
name: "multi GPU",
output: `Available devices:
libDirs: []string{"/lib/ollama", "/lib/ollama/rocm"},
want: []wantDevice{{
name: "ROCm0",
library: "ROCm",
totalMiB: 12272,
compute: "gfx1031",
gfxTarget: "gfx1031",
}},
},
{
name: "multi GPU",
output: `Available devices:
CUDA0: NVIDIA GeForce RTX 4090 (24564 MiB, 23592 MiB free)
CUDA1: NVIDIA GeForce RTX 3060 (12288 MiB, 11500 MiB free)
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
wantLen: 2,
},
{
name: "no devices",
output: "Available devices:\n",
libDirs: []string{"/lib/ollama"},
wantLen: 0,
},
{
name: "empty output",
output: "",
libDirs: []string{"/lib/ollama"},
wantLen: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
devices := parseLlamaServerDevices(tt.output, tt.libDirs)
if len(devices) != tt.wantLen {
t.Fatalf("got %d devices, want %d", len(devices), tt.wantLen)
}
if tt.wantLen > 0 {
if tt.wantName != "" && devices[0].Name != tt.wantName {
t.Errorf("name = %q, want %q", devices[0].Name, tt.wantName)
}
if tt.wantLib != "" && devices[0].Library != tt.wantLib {
t.Errorf("library = %q, want %q", devices[0].Library, tt.wantLib)
}
if tt.wantMiB > 0 {
expectedBytes := tt.wantMiB * 1024 * 1024
if devices[0].TotalMemory != expectedBytes {
t.Errorf("total memory = %d, want %d", devices[0].TotalMemory, expectedBytes)
}
}
}
})
}
}
func TestParseLlamaServerDevicesMarksVulkanUMAGPUsIntegrated(t *testing.T) {
output := `ggml_vulkan: 0 = Intel(R) Graphics (Intel open-source Mesa driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 65536 | int dot: 1 | matrix cores: none
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
want: []wantDevice{
{name: "CUDA0", library: "CUDA", totalMiB: 24564},
{name: "CUDA1", library: "CUDA", totalMiB: 12288},
},
},
{
name: "Vulkan UMA",
output: `ggml_vulkan: 0 = Intel(R) Graphics (Intel open-source Mesa driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 65536 | int dot: 1 | matrix cores: none
Available devices:
Vulkan0: Intel(R) Graphics (16384 MiB, 12288 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama", "/lib/ollama/vulkan"})
if len(devices) != 1 {
t.Fatalf("expected 1 device, got %d", len(devices))
}
if !devices[0].Integrated {
t.Fatal("expected Vulkan UMA device to be marked integrated")
}
}
func TestCUDADeviceFilteredByArchs(t *testing.T) {
// GTX 1060 (CC 6.1 = 610) with v13 ARCHS that don't include 610
output := `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/vulkan"},
want: []wantDevice{{
name: "Vulkan0",
library: "Vulkan",
totalMiB: 16384,
checkIntegrated: true,
integrated: true,
}},
},
{
name: "Vulkan without UMA metadata",
output: `Available devices:
Vulkan0: AMD Radeon(TM) Graphics (32768 MiB, 31000 MiB free)
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/vulkan"},
want: []wantDevice{{
name: "Vulkan0",
library: "Vulkan",
totalMiB: 32768,
checkIntegrated: true,
}},
},
{
name: "CUDA device filtered by compiled archs",
output: `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
load_backend: loaded CUDA backend from /lib/ollama/cuda_v13/libggml-cuda.so
system_info: n_threads = 4 | CUDA : ARCHS = 750,800,860,890,900,1000,1030,1100,1200,1210 |
Available devices:
CUDA0: NVIDIA GeForce GTX 1060 6GB (6063 MiB, 5900 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama", "/lib/ollama/cuda_v13"})
if len(devices) != 0 {
t.Fatalf("expected 0 devices (GTX 1060 CC 610 not in ARCHS), got %d", len(devices))
}
}
func TestCUDADeviceKeptByArchs(t *testing.T) {
// RTX 4060 Ti (CC 8.9 = 890) with v13 ARCHS that include 890
output := `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 16379 MiB):
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v13"},
},
{
name: "CUDA device kept by compiled archs",
output: `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 16379 MiB):
Device 0: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB
system_info: n_threads = 16 | CUDA : ARCHS = 750,800,860,890,900,1000,1030,1100,1200,1210 |
Available devices:
CUDA0: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device (CC 890 in ARCHS), got %d", len(devices))
}
if devices[0].ComputeMajor != 8 || devices[0].ComputeMinor != 9 {
t.Fatalf("expected compute 8.9, got %s", devices[0].Compute())
}
}
func TestCUDANoArchsFailOpen(t *testing.T) {
// No system_info line — should keep all devices (fail open)
output := `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
`,
want: []wantDevice{{
name: "CUDA0",
library: "CUDA",
totalMiB: 16379,
compute: "8.9",
}},
},
{
name: "CUDA without compiled archs fails open",
output: `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
Available devices:
CUDA0: NVIDIA GeForce GTX 1060 6GB (6063 MiB, 5900 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device (no ARCHS = fail open), got %d", len(devices))
}
if devices[0].ComputeMajor != 6 || devices[0].ComputeMinor != 1 {
t.Fatalf("expected compute 6.1, got %s", devices[0].Compute())
}
}
func TestCUDANoCCFailOpen(t *testing.T) {
// Device line without compute capability — should keep (fail open)
output := `system_info: n_threads = 4 | CUDA : ARCHS = 750,800 |
`,
want: []wantDevice{{
name: "CUDA0",
library: "CUDA",
totalMiB: 6063,
compute: "6.1",
}},
},
{
name: "CUDA without compute capability fails open",
output: `system_info: n_threads = 4 | CUDA : ARCHS = 750,800 |
Available devices:
CUDA0: Some Future GPU (8192 MiB, 8000 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device (no CC = fail open), got %d", len(devices))
}
}
func TestCUDAMultiDeviceMixedFilter(t *testing.T) {
// Two devices: one supported (CC 890), one not (CC 610)
output := `ggml_cuda_init: found 2 CUDA devices:
`,
want: []wantDevice{{
name: "CUDA0",
library: "CUDA",
totalMiB: 8192,
}},
},
{
name: "CUDA mixed arch support",
output: `ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce GTX 1060, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
Device 1: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB
system_info: n_threads = 8 | CUDA : ARCHS = 750,800,860,890 |
Available devices:
CUDA0: NVIDIA GeForce GTX 1060 (6063 MiB, 5900 MiB free)
CUDA1: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device (only RTX 4060 Ti), got %d", len(devices))
}
if devices[0].Name != "CUDA1" {
t.Errorf("expected CUDA1, got %s", devices[0].Name)
}
}
func TestROCmDeviceGFXTarget(t *testing.T) {
output := `ggml_cuda_init: found 1 ROCm devices (Total VRAM: 12272 MiB):
Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
Available devices:
ROCm0: AMD Radeon RX 6700 XT (12272 MiB, 12248 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device, got %d", len(devices))
}
if devices[0].GFXTarget != "gfx1031" {
t.Errorf("expected gfx1031, got %s", devices[0].GFXTarget)
}
if devices[0].Compute() != "gfx1031" {
t.Errorf("expected compute gfx1031, got %s", devices[0].Compute())
}
}
func TestROCmDeviceGFXTargetWithXnack(t *testing.T) {
// gfx906 with :sramecc+:xnack- suffix (e.g., Radeon Pro VII)
output := `ggml_cuda_init: found 2 ROCm devices (Total VRAM: 32736 MiB):
`,
want: []wantDevice{{
name: "CUDA1",
library: "CUDA",
totalMiB: 16379,
compute: "8.9",
}},
},
{
name: "ROCm gfx target with xnack suffix",
output: `ggml_cuda_init: found 2 ROCm devices (Total VRAM: 32736 MiB):
Device 0: AMD Radeon RX 6800, gfx1030 (0x1030), VMM: no, Wave Size: 32, VRAM: 16368 MiB
Device 1: AMD Radeon Pro VII, gfx906:sramecc+:xnack- (0x906), VMM: no, Wave Size: 64, VRAM: 16368 MiB
Available devices:
ROCm0: AMD Radeon RX 6800 (16368 MiB, 16342 MiB free)
ROCm1: AMD Radeon Pro VII (16368 MiB, 16348 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 2 {
t.Fatalf("expected 2 devices, got %d", len(devices))
}
if devices[0].GFXTarget != "gfx1030" {
t.Errorf("device 0: expected gfx1030, got %s", devices[0].GFXTarget)
}
if devices[1].GFXTarget != "gfx906" {
t.Errorf("device 1: expected gfx906, got %s", devices[1].GFXTarget)
}
if devices[0].Compute() != "gfx1030" {
t.Errorf("device 0: expected compute gfx1030, got %s", devices[0].Compute())
}
if devices[1].Compute() != "gfx906" {
t.Errorf("device 1: expected compute gfx906, got %s", devices[1].Compute())
}
}
`,
want: []wantDevice{
{name: "ROCm0", library: "ROCm", totalMiB: 16368, compute: "gfx1030", gfxTarget: "gfx1030"},
{name: "ROCm1", library: "ROCm", totalMiB: 16368, compute: "gfx906", gfxTarget: "gfx906"},
},
},
{
name: "unknown library",
output: `Available devices:
Future0: Mystery Accelerator (8192 MiB, 8000 MiB free)
`,
want: []wantDevice{{
name: "Future0",
library: "Mystery Accelerator",
totalMiB: 8192,
}},
},
{
name: "no devices",
output: "Available devices:\n",
},
{
name: "empty output",
},
}
func TestInferLibrary(t *testing.T) {
tests := []struct {
name string
desc string
want string
}{
{"NVIDIA CUDA", "NVIDIA GeForce RTX 4090", "CUDA"},
{"CUDA0", "NVIDIA GeForce RTX 4090", "CUDA"},
{"AMD ROCm", "AMD Radeon RX 6700 XT", "ROCm"},
{"ROCm0", "AMD Radeon RX 6700 XT", "ROCm"},
{"Metal", "Apple M3 Max", "Metal"},
{"Vulkan0", "NVIDIA GeForce RTX 4090 (Vulkan)", "Vulkan"},
{"Unknown", "Unknown Backend", "Unknown Backend"},
}
for _, tt := range tests {
got := inferLibrary(tt.name, tt.desc)
if got != tt.want {
t.Errorf("inferLibrary(%q, %q) = %q, want %q", tt.name, tt.desc, got, tt.want)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.libDirs == nil {
tt.libDirs = []string{"/lib/ollama"}
}
devices := parseLlamaServerDevices(tt.output, tt.libDirs)
if len(devices) != len(tt.want) {
t.Fatalf("got %d devices, want %d", len(devices), len(tt.want))
}
for i, want := range tt.want {
got := devices[i]
if want.name != "" && got.Name != want.name {
t.Errorf("device %d name = %q, want %q", i, got.Name, want.name)
}
if want.library != "" && got.Library != want.library {
t.Errorf("device %d library = %q, want %q", i, got.Library, want.library)
}
if want.totalMiB > 0 && got.TotalMemory != want.totalMiB*1024*1024 {
t.Errorf("device %d total memory = %d, want %d MiB", i, got.TotalMemory, want.totalMiB)
}
if want.compute != "" && got.Compute() != want.compute {
t.Errorf("device %d compute = %q, want %q", i, got.Compute(), want.compute)
}
if want.gfxTarget != "" && got.GFXTarget != want.gfxTarget {
t.Errorf("device %d gfx target = %q, want %q", i, got.GFXTarget, want.gfxTarget)
}
if want.checkIntegrated && got.Integrated != want.integrated {
t.Errorf("device %d integrated = %v, want %v", i, got.Integrated, want.integrated)
}
}
})
}
}
}
func TestCudaCCRegex(t *testing.T) {
tests := []struct {
line string
wantIdx int
wantCC string
}{
{" Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB", 0, "610"},
{" Device 1: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB", 1, "890"},
{" Device 0: NVIDIA RTX PRO 6000, compute capability 12.0, VMM: yes, VRAM: 97250 MiB", 0, "1200"},
{" Device 0: Tesla V100-PCIE-16GB, compute capability 7.0, VMM: yes, VRAM: 16160 MiB", 0, "700"},
}
for _, tt := range tests {
matches := cudaCCRegex.FindStringSubmatch(tt.line)
if matches == nil {
t.Errorf("expected match for %q", tt.line)
continue
}
idx, _ := strconv.Atoi(matches[1])
major, _ := strconv.Atoi(matches[2])
minor, _ := strconv.Atoi(matches[3])
cc := fmt.Sprintf("%d%d0", major, minor)
if idx != tt.wantIdx {
t.Errorf("for %q: got idx %d, want %d", tt.line, idx, tt.wantIdx)
}
if cc != tt.wantCC {
t.Errorf("for %q: got CC %s, want %s", tt.line, cc, tt.wantCC)
}
}
}
func TestCudaArchsRegex(t *testing.T) {
tests := []struct {
line string
want []string
}{
{
"system_info: n_threads = 16 | CUDA : ARCHS = 750,800,860,890 | USE_GRAPHS = 1 |",
[]string{"750", "800", "860", "890"},
},
{
"system_info: | CUDA : ARCHS = 500,520,600,610,700,750,800,860,890,900,1200 |",
[]string{"500", "520", "600", "610", "700", "750", "800", "860", "890", "900", "1200"},
},
{
"no archs here",
nil,
},
}
for _, tt := range tests {
matches := cudaArchsRegex.FindStringSubmatch(tt.line)
if tt.want == nil {
if matches != nil {
t.Errorf("expected no match for %q, got %v", tt.line, matches)
}
continue
}
if matches == nil {
t.Errorf("expected match for %q, got nil", tt.line)
continue
}
got := strings.Split(matches[1], ",")
if len(got) != len(tt.want) {
t.Errorf("for %q: got %v, want %v", tt.line, got, tt.want)
}
}
})
}
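
The CUDA cases above pin down a fail-open filtering policy: a compute capability X.Y is flattened to the SM-style integer used in the compiled ARCHS list ("6.1" → "610", "12.0" → "1200"), and a device is dropped only when both its capability and the ARCHS list were parsed and the capability is missing from the list. A compact sketch of that policy, assuming regex patterns that match the log lines in these tests:

package main

import (
	"fmt"
	"regexp"
	"slices"
	"strconv"
	"strings"
)

// Illustrative patterns matching the log lines exercised above.
var (
	ccRe    = regexp.MustCompile(`Device (\d+): .*, compute capability (\d+)\.(\d+),`)
	archsRe = regexp.MustCompile(`CUDA : ARCHS = ([0-9,]+)`)
)

// keepCUDADevice applies the fail-open policy the tests describe: a device is
// dropped only when both its compute capability and the compiled ARCHS list
// are known and the capability is absent from the list.
func keepCUDADevice(deviceLine, systemInfoLine string) bool {
	cc := ccRe.FindStringSubmatch(deviceLine)
	archs := archsRe.FindStringSubmatch(systemInfoLine)
	if cc == nil || archs == nil {
		return true // fail open
	}
	major, _ := strconv.Atoi(cc[2])
	minor, _ := strconv.Atoi(cc[3])
	sm := fmt.Sprintf("%d%d0", major, minor) // 6.1 -> "610", 8.9 -> "890"
	return slices.Contains(strings.Split(archs[1], ","), sm)
}

func main() {
	fmt.Println(keepCUDADevice(
		"  Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB",
		"system_info: n_threads = 4 | CUDA : ARCHS = 750,800,860,890 |")) // false: 610 not compiled in
}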


@@ -139,7 +139,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
wg.Add(1)
go func(i int) {
defer wg.Done()
extraEnvs := ml.GetDevicesEnv(devices[i:i+1], true)
extraEnvs := ml.GetDevicesEnv(devices[i : i+1])
devices[i].AddInitValidation(extraEnvs)
if len(bootstrapDevicesWithMetalRetry(ctx2ndPass, ctx, 30*time.Second, devices[i].LibraryPath, extraEnvs)) == 0 {
slog.Debug("filtering device which didn't fully initialize",
@@ -324,9 +324,10 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
rctx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
// Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct
// We avoid CUDA filters here to keep ROCm from failing to discover GPUs in a mixed environment
devFilter := ml.GetDevicesEnv(devices, false)
// Apply any device filters to avoid re-discovering unsupported devices,
// and get IDs correct. GetDevicesEnv only filters visibility when this
// pass is narrowed to a single device.
devFilter := ml.GetDevicesEnv(devices)
for dir := range libDirs {
updatedDevices := bootstrapDevicesWithMetalRetry(rctx, ctx, 3*time.Second, []string{ml.LibOllamaPath, dir}, devFilter)


@@ -28,12 +28,6 @@ using namespace llama_ollama_compat::detail; // pull detail:: helpers into scope
namespace {
thread_local uint64_t g_compat_info_logs = 0;
void note_compat_info_log() {
++g_compat_info_logs;
}
#ifdef OLLAMA_COMPAT_MTMD_BUILD
void ollama_compat_log(const char * format, ...) {
std::va_list args;
@@ -42,10 +36,10 @@ void ollama_compat_log(const char * format, ...) {
va_end(args);
}
#define OLLAMA_COMPAT_LOG_INFO(...) do { note_compat_info_log(); ollama_compat_log(__VA_ARGS__); } while (0)
#define OLLAMA_COMPAT_LOG_INFO(...) do { ollama_compat_log(__VA_ARGS__); } while (0)
#define OLLAMA_COMPAT_LOG_ERROR(...) ollama_compat_log(__VA_ARGS__)
#else
#define OLLAMA_COMPAT_LOG_INFO(...) do { note_compat_info_log(); LLAMA_LOG_INFO(__VA_ARGS__); } while (0)
#define OLLAMA_COMPAT_LOG_INFO(...) do { LLAMA_LOG_INFO(__VA_ARGS__); } while (0)
#define OLLAMA_COMPAT_LOG_ERROR(...) LLAMA_LOG_ERROR(__VA_ARGS__)
#endif
@@ -53,47 +47,23 @@ double elapsed_ms(std::chrono::steady_clock::time_point start) {
return std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start).count();
}
class CompatScopeTimer {
public:
CompatScopeTimer(const char * phase, const std::string * arch = nullptr)
: phase_(phase),
arch_before_(arch ? *arch : ""),
arch_(arch),
log_start_(g_compat_info_logs),
start_(std::chrono::steady_clock::now()) {}
void force_log() {
forced_ = true;
}
~CompatScopeTimer() {
const bool arch_changed = arch_ && *arch_ != arch_before_;
if (!forced_ && !arch_changed && g_compat_info_logs == log_start_) return;
if (arch_) {
if (arch_changed) {
OLLAMA_COMPAT_LOG_INFO("compat patch timing: phase=%s arch_before=%s arch_after=%s duration_ms=%.3f\n",
phase_, arch_before_.c_str(), arch_->c_str(), elapsed_ms(start_));
} else {
OLLAMA_COMPAT_LOG_INFO("compat patch timing: phase=%s arch=%s duration_ms=%.3f\n",
phase_, arch_->c_str(), elapsed_ms(start_));
}
return;
}
OLLAMA_COMPAT_LOG_INFO("compat patch timing: phase=%s duration_ms=%.3f\n",
phase_, elapsed_ms(start_));
}
private:
const char * phase_;
std::string arch_before_;
const std::string * arch_;
uint64_t log_start_;
std::chrono::steady_clock::time_point start_;
bool forced_ = false;
struct TransformTiming {
uint64_t count;
size_t bytes;
double ms;
};
std::mutex g_transform_timing_mutex;
TransformTiming g_transform_timing = {};
TransformTiming record_transform_timing(size_t bytes, double ms) {
std::lock_guard<std::mutex> lk(g_transform_timing_mutex);
g_transform_timing.count++;
g_transform_timing.bytes += bytes;
g_transform_timing.ms += ms;
return g_transform_timing;
}
// Per-loader file path registry — set by translate_metadata, read by
// maybe_load_text_tensor so it can pass the path to load ops without a
// separate patch insertion in the model loader's load_all_data path.
@@ -2791,7 +2761,6 @@ bool translate_metadata(const llama_model_loader * ml,
std::string & arch_name,
const char * fname) {
if (!meta) return false;
CompatScopeTimer timing("metadata", &arch_name);
{
std::lock_guard<std::mutex> lk(g_loader_path_mutex);
g_loader_paths[ml] = fname ? fname : "";
@@ -2826,13 +2795,14 @@
// Dispatch. Add more arches as they are wired up.
const bool no_mmap = is_mmap_disabled_for(ml);
if (no_mmap) timing.force_log();
if (no_mmap) {
OLLAMA_COMPAT_LOG_INFO("compat patch disabled mmap for transformed text tensors\n");
}
return no_mmap;
}
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
if (!meta) return;
CompatScopeTimer timing("clip");
handle_legacy_llava_projector(meta);
@@ -2921,8 +2891,11 @@ bool maybe_load_tensor(ggml_tensor * cur,
ggml_backend_tensor_set(cur, dst.data(), 0, dst_size);
}
OLLAMA_COMPAT_LOG_INFO("%s: %s for %s (%zu bytes) in %.3f ms\n",
__func__, op.description, ggml_get_name(cur), dst_size, elapsed_ms(start));
const double ms = elapsed_ms(start);
const TransformTiming total = record_transform_timing(dst_size, ms);
OLLAMA_COMPAT_LOG_INFO("compat tensor transform: op=%s tensor=%s bytes=%zu duration_ms=%.3f total_ops=%llu total_bytes=%zu total_ms=%.3f\n",
op.description, ggml_get_name(cur), dst_size, ms,
(unsigned long long) total.count, total.bytes, total.ms);
return true;
}
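
This hunk trades the per-phase CompatScopeTimer (and its g_compat_info_logs bookkeeping) for a single mutex-guarded aggregate, so each tensor transform logs its own cost plus running totals. The same accumulation pattern rendered in Go, purely for illustration — the shipped code is the C++ above:

package main

import (
	"fmt"
	"sync"
	"time"
)

// transformTiming mirrors the aggregate the patch keeps under a mutex: each
// transform adds its size and duration, and the updated totals are returned
// by value so they can be logged without holding the lock.
type transformTiming struct {
	count uint64
	bytes uint64
	ms    float64
}

var (
	mu    sync.Mutex
	total transformTiming
)

func recordTransformTiming(bytes uint64, d time.Duration) transformTiming {
	mu.Lock()
	defer mu.Unlock()
	total.count++
	total.bytes += bytes
	total.ms += float64(d.Microseconds()) / 1000.0
	return total
}

func main() {
	t := recordTransformTiming(4096, 2500*time.Microsecond)
	fmt.Printf("total_ops=%d total_bytes=%d total_ms=%.3f\n", t.count, t.bytes, t.ms)
}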


@@ -601,7 +601,7 @@ func NewLlamaServerRunner(
memWriter := &memoryParsingWriter{inner: status}
mediaMarker := newLlamaServerMediaMarker()
extraEnvs := ml.GetDevicesEnv(gpus, false)
extraEnvs := ml.GetDevicesEnv(gpus)
serverEnvs := make(map[string]string, len(extraEnvs)+1)
for k, v := range extraEnvs {
serverEnvs[k] = v


@@ -532,12 +532,12 @@ func (f FlashAttentionType) String() string {
// Given the list of GPUs this instantiation is targeted for,
// figure out the device environment variables and any recorded
// per-device runner environment overrides. Set mustFilter true to enable
// filtering of CUDA devices.
func GetDevicesEnv(l []DeviceInfo, mustFilter bool) map[string]string {
// per-device runner environment overrides.
func GetDevicesEnv(l []DeviceInfo) map[string]string {
if len(l) == 0 {
return nil
}
mustFilter := len(l) == 1
env := map[string]string{}
for _, d := range l {
d.updateVisibleDevicesEnv(env, mustFilter)
@@ -595,8 +595,12 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string, mustFilter bo
return
}
envVar = "CUDA_VISIBLE_DEVICES"
case "Vulkan":
if !mustFilter {
return
}
envVar = "GGML_VK_VISIBLE_DEVICES"
default:
// Vulkan is not filtered via env var, but via scheduling decisions
return
}
v, existing := env[envVar]
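
GetDevicesEnv no longer takes a mustFilter flag; filtering is implied by being handed exactly one device, and Vulkan gains GGML_VK_VISIBLE_DEVICES as its filtering env var. A reduced sketch of the new rule — the real method also honors a Vulkan FilterID and appends to any pre-existing value, which this sketch omits:

package main

import "fmt"

// Illustrative reduction of the new behavior: visibility filtering is implied
// by the slice length instead of a caller-supplied mustFilter flag.
type deviceID struct{ Library, ID string }

func getDevicesEnv(l []deviceID) map[string]string {
	if len(l) == 0 {
		return nil
	}
	mustFilter := len(l) == 1 // single-device placement => pin visibility
	env := map[string]string{}
	for _, d := range l {
		if !mustFilter {
			continue
		}
		switch d.Library {
		case "CUDA":
			env["CUDA_VISIBLE_DEVICES"] = d.ID
		case "Vulkan":
			env["GGML_VK_VISIBLE_DEVICES"] = d.ID
		}
	}
	return env
}

func main() {
	fmt.Println(getDevicesEnv([]deviceID{{"CUDA", "3"}}))                // map[CUDA_VISIBLE_DEVICES:3]
	fmt.Println(getDevicesEnv([]deviceID{{"CUDA", "3"}, {"CUDA", "4"}})) // map[]: multi-GPU stays unfiltered
}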


@@ -13,20 +13,13 @@ func TestMergeEnvWithRunnerEnvOverrides(t *testing.T) {
DeviceID: DeviceID{Library: "Metal", ID: "0"},
RunnerEnvOverrides: map[string]string{"GGML_METAL_TENSOR_DISABLE": "1"},
},
{
DeviceID: DeviceID{Library: "CUDA", ID: "3"},
},
}
env := GetDevicesEnv(devices, true)
env := GetDevicesEnv(devices)
if got, want := env["GGML_METAL_TENSOR_DISABLE"], "1"; got != want {
t.Fatalf("GGML_METAL_TENSOR_DISABLE = %q, want %q", got, want)
}
if got, want := env["CUDA_VISIBLE_DEVICES"], "3"; got != want {
t.Fatalf("CUDA_VISIBLE_DEVICES = %q, want %q", got, want)
}
}
func TestGetDevicesEnvWarnsOnConflictingOverrides(t *testing.T) {
@@ -48,7 +41,7 @@ func TestGetDevicesEnvWarnsOnConflictingOverrides(t *testing.T) {
},
}
env := GetDevicesEnv(devices, false)
env := GetDevicesEnv(devices)
if got, want := env["TEST_OVERRIDE"], "two"; got != want {
t.Fatalf("TEST_OVERRIDE = %q, want %q", got, want)
@@ -59,6 +52,53 @@ func TestGetDevicesEnvWarnsOnConflictingOverrides(t *testing.T) {
}
}
func TestGetDevicesEnvFiltersSingleDevice(t *testing.T) {
tests := []struct {
name string
gpus []DeviceInfo
key string
want string
}{
{
name: "single CUDA",
gpus: []DeviceInfo{{DeviceID: DeviceID{Library: "CUDA", ID: "3"}}},
key: "CUDA_VISIBLE_DEVICES",
want: "3",
},
{
name: "multiple CUDA",
gpus: []DeviceInfo{
{DeviceID: DeviceID{Library: "CUDA", ID: "3"}},
{DeviceID: DeviceID{Library: "CUDA", ID: "4"}},
},
key: "CUDA_VISIBLE_DEVICES",
},
{
name: "single Vulkan",
gpus: []DeviceInfo{{DeviceID: DeviceID{Library: "Vulkan", ID: "0"}, FilterID: "1"}},
key: "GGML_VK_VISIBLE_DEVICES",
want: "1",
},
{
name: "multiple Vulkan",
gpus: []DeviceInfo{
{DeviceID: DeviceID{Library: "Vulkan", ID: "0"}, FilterID: "1"},
{DeviceID: DeviceID{Library: "Vulkan", ID: "1"}, FilterID: "0"},
},
key: "GGML_VK_VISIBLE_DEVICES",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
env := GetDevicesEnv(tt.gpus)
if got := env[tt.key]; got != tt.want {
t.Fatalf("%s = %q, want %q", tt.key, got, tt.want)
}
})
}
}
func TestFlashAttentionSupported(t *testing.T) {
tests := []struct {
name string


@@ -784,26 +784,43 @@ func selectLlamaServerPlacement(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo,
}
if opts.MainGPU != nil {
selected, ok := bestExplicitMainGPUGroup(systemInfo, groups, *opts.MainGPU)
gpu, available, ok := bestExplicitMainGPU(systemInfo, groups, *opts.MainGPU)
if !ok {
selected = bestGPUGroupByAvailableMemory(systemInfo, groups)
selected := bestGPUGroupByAvailableMemory(systemInfo, groups)
slog.Warn("requested main_gpu is outside the selected GPU group; passing value through to llama-server",
"main_gpu", *opts.MainGPU,
"gpu_count", len(selected))
logSelectedGPUGroup(gpus, selected)
return selected, launchOpts
}
selected, launchOpts := singleLlamaServerGPUPlacement(gpu, launchOpts)
slog.Info("selecting requested single GPU for llama-server model",
"requested_main_gpu", *opts.MainGPU,
"main_gpu", *launchOpts.MainGPU,
"id", gpu.ID,
"filter_id", gpu.FilterID,
"library", gpu.Library,
"name", gpu.Name,
"description", gpu.Description,
"integrated", gpu.Integrated,
"available", format.HumanBytes2(available))
logSelectedGPUGroup(gpus, selected)
return selected, launchOpts
}
if !envconfig.SchedSpread() && predictedVRAM > 0 {
selected, mainGPU, gpu, available, ok := bestSingleGPUFit(systemInfo, groups, predictedVRAM)
gpu, available, ok := bestSingleGPUFit(systemInfo, groups, predictedVRAM)
if ok {
launchOpts.MainGPU = &mainGPU
selected, launchOpts := singleLlamaServerGPUPlacement(gpu, launchOpts)
slog.Info("selecting single GPU for llama-server model",
"main_gpu", mainGPU,
"main_gpu", *launchOpts.MainGPU,
"id", gpu.ID,
"filter_id", gpu.FilterID,
"library", gpu.Library,
"name", gpu.Name,
"description", gpu.Description,
"integrated", gpu.Integrated,
"predicted", format.HumanBytes2(predictedVRAM),
"available", format.HumanBytes2(available))
logSelectedGPUGroup(gpus, selected)
@@ -816,37 +833,41 @@ func selectLlamaServerPlacement(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo,
return selected, launchOpts
}
func bestExplicitMainGPUGroup(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, mainGPU int) ([]ml.DeviceInfo, bool) {
func singleLlamaServerGPUPlacement(gpu ml.DeviceInfo, opts api.Options) ([]ml.DeviceInfo, api.Options) {
mainGPU := 0
opts.MainGPU = &mainGPU
return []ml.DeviceInfo{gpu}, opts
}
func bestExplicitMainGPU(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, mainGPU int) (gpu ml.DeviceInfo, available uint64, ok bool) {
if mainGPU < 0 {
return nil, false
return ml.DeviceInfo{}, 0, false
}
var best []ml.DeviceInfo
var bestAvailable uint64
for _, group := range groups {
if mainGPU >= len(group) {
continue
}
available := availableMemoryForGPU(systemInfo, group[mainGPU])
if best == nil || available > bestAvailable {
best = group
bestAvailable = available
candidate := group[mainGPU]
candidateAvailable := availableMemoryForGPU(systemInfo, candidate)
if !ok || betterPlacementGPU(candidate, candidateAvailable, gpu, available) {
gpu = candidate
available = candidateAvailable
ok = true
}
}
return best, best != nil
return gpu, available, ok
}
func bestSingleGPUFit(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, predictedVRAM uint64) (selected []ml.DeviceInfo, mainGPU int, gpu ml.DeviceInfo, available uint64, ok bool) {
func bestSingleGPUFit(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, predictedVRAM uint64) (gpu ml.DeviceInfo, available uint64, ok bool) {
for _, group := range groups {
for i, candidate := range group {
for _, candidate := range group {
candidateAvailable := availableMemoryForGPU(systemInfo, candidate)
if predictedVRAM > candidateAvailable*80/100 {
continue
}
if !ok || candidateAvailable > available {
selected = group
mainGPU = i
if !ok || betterPlacementGPU(candidate, candidateAvailable, gpu, available) {
gpu = candidate
available = candidateAvailable
ok = true
@@ -854,7 +875,15 @@ func bestSingleGPUFit(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, predic
}
}
return selected, mainGPU, gpu, available, ok
return gpu, available, ok
}
func betterPlacementGPU(candidate ml.DeviceInfo, candidateAvailable uint64, current ml.DeviceInfo, currentAvailable uint64) bool {
if candidate.Integrated != current.Integrated {
return !candidate.Integrated
}
return candidateAvailable > currentAvailable
}
func bestGPUGroupByAvailableMemory(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo) []ml.DeviceInfo {
@@ -862,7 +891,7 @@ func bestGPUGroupByAvailableMemory(systemInfo ml.SystemInfo, groups [][]ml.Devic
var bestAvailable uint64
for _, group := range groups {
available, _, _ := availableMemoryForLoad(systemInfo, group)
if best == nil || available > bestAvailable {
if best == nil || betterPlacementGroup(group, available, best, bestAvailable) {
best = group
bestAvailable = available
}
@@ -871,6 +900,25 @@ func bestGPUGroupByAvailableMemory(systemInfo ml.SystemInfo, groups [][]ml.Devic
return best
}
func betterPlacementGroup(candidate []ml.DeviceInfo, candidateAvailable uint64, current []ml.DeviceInfo, currentAvailable uint64) bool {
candidateDiscrete := hasDiscreteGPU(candidate)
currentDiscrete := hasDiscreteGPU(current)
if candidateDiscrete != currentDiscrete {
return candidateDiscrete
}
return candidateAvailable > currentAvailable
}
func hasDiscreteGPU(gpus []ml.DeviceInfo) bool {
for _, gpu := range gpus {
if !gpu.Integrated {
return true
}
}
return false
}
func availableMemoryForGPU(systemInfo ml.SystemInfo, gpu ml.DeviceInfo) uint64 {
if gpu.Integrated && systemInfo.FreeMemory > 0 && systemInfo.FreeMemory < gpu.FreeMemory {
return systemInfo.FreeMemory
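
The new comparators make discreteness the primary placement key and available memory the tie-breaker, both for single GPUs (betterPlacementGPU) and for groups (betterPlacementGroup via hasDiscreteGPU); and once placement narrows to one GPU, singleLlamaServerGPUPlacement rewrites MainGPU to 0, since the visibility filter renumbers the selected device inside the runner. A small runnable demonstration of the ordering:

package main

import "fmt"

type gpu struct {
	name       string
	integrated bool
	available  uint64
}

// Same comparison as betterPlacementGPU above: discreteness is the primary
// key, available memory breaks ties within the same class.
func better(candidate, current gpu) bool {
	if candidate.integrated != current.integrated {
		return !candidate.integrated
	}
	return candidate.available > current.available
}

func main() {
	igpu := gpu{"integrated", true, 32 << 30}
	dgpu := gpu{"discrete", false, 10 << 30}
	fmt.Println(better(dgpu, igpu)) // true: discrete wins despite less memory
}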


@@ -1176,6 +1176,7 @@ func TestSelectLlamaServerPlacement(t *testing.T) {
wantLibrary string
wantMainGPU *int
wantSelectedGPUs int
wantGPUID string
}{
{
name: "compacts onto largest same-backend GPU",
@@ -1186,8 +1187,9 @@
},
opts: api.DefaultOptions(),
wantLibrary: "CUDA",
wantMainGPU: testIntPtr(1),
wantSelectedGPUs: 2,
wantMainGPU: testIntPtr(0),
wantSelectedGPUs: 1,
wantGPUID: "1",
},
{
name: "explicit main gpu selects matching backend group",
@@ -1201,8 +1203,9 @@
Runner: api.Runner{MainGPU: testIntPtr(1), NumGPU: -1},
},
wantLibrary: "ROCm",
wantMainGPU: testIntPtr(1),
wantSelectedGPUs: 2,
wantMainGPU: testIntPtr(0),
wantSelectedGPUs: 1,
wantGPUID: "1",
},
{
name: "integrated GPU is capped by system free memory",
@@ -1213,8 +1216,22 @@
},
opts: api.DefaultOptions(),
wantLibrary: "Metal",
wantMainGPU: testIntPtr(1),
wantSelectedGPUs: 2,
wantMainGPU: testIntPtr(0),
wantSelectedGPUs: 1,
wantGPUID: "1",
},
{
name: "prefers discrete GPU over integrated GPU with more available memory",
predictedVRAM: 8 * format.GigaByte,
gpus: []ml.DeviceInfo{
{DeviceID: ml.DeviceID{ID: "0", Library: "Vulkan"}, Name: "integrated", Integrated: true, FreeMemory: 32 * format.GigaByte},
{DeviceID: ml.DeviceID{ID: "1", Library: "Vulkan"}, Name: "discrete", FreeMemory: 10 * format.GigaByte},
},
opts: api.DefaultOptions(),
wantLibrary: "Vulkan",
wantMainGPU: testIntPtr(0),
wantSelectedGPUs: 1,
wantGPUID: "1",
},
{
name: "spread disables automatic compaction",
@@ -1249,6 +1266,9 @@
selected, launchOpts := selectLlamaServerPlacement(systemInfo, tt.gpus, tt.predictedVRAM, tt.opts)
require.Len(t, selected, tt.wantSelectedGPUs)
require.Equal(t, tt.wantLibrary, selected[0].Library)
if tt.wantGPUID != "" {
require.Equal(t, tt.wantGPUID, selected[0].ID)
}
if tt.wantMainGPU == nil {
require.Nil(t, launchOpts.MainGPU)
} else {
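
One more mechanism this table exercises ("integrated GPU is capped by system free memory"): availableMemoryForGPU caps an integrated device's free memory at the host's free RAM, since unified-memory GPUs share it. A sketch of that cap, with hypothetical names:

package main

import "fmt"

// Mirrors the cap visible in availableMemoryForGPU above: an integrated
// (unified-memory) GPU cannot have more usable VRAM than the host has free RAM.
func availableMemory(integrated bool, gpuFree, systemFree uint64) uint64 {
	if integrated && systemFree > 0 && systemFree < gpuFree {
		return systemFree
	}
	return gpuFree
}

func main() {
	fmt.Println(availableMemory(true, 96<<30, 24<<30))  // 25769803776: capped at system free
	fmt.Println(availableMemory(false, 96<<30, 24<<30)) // 103079215104: discrete is not capped
}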