scheduler improvements

Daniel Hiltgen 2026-05-08 11:14:34 -07:00
parent fa7092b1d3
commit 88935c21b5
10 changed files with 393 additions and 393 deletions


@@ -1 +1 @@
b9048
b9075


@@ -61,6 +61,7 @@ func llamaServerDiscoverDevices(ctx context.Context, libDirs []string, extraEnvs
"--host", "127.0.0.1",
"--no-webui",
"--offline",
"--verbose",
)
cmd.WaitDelay = llamaServerDiscoveryWaitDelay
cmd.Env = os.Environ()
@@ -137,7 +138,7 @@ func llamaServerDiscoverDevices(ctx context.Context, libDirs []string, extraEnvs
// Also run --list-devices to get the stdout device list with free memory
// (the brief server startup doesn't print that)
cmd2 := exec.CommandContext(ctx, llamaServer, "--list-devices", "--offline")
cmd2 := exec.CommandContext(ctx, llamaServer, "--list-devices", "--offline", "--verbose")
cmd2.WaitDelay = llamaServerDiscoveryWaitDelay
cmd2.Env = cmd.Env // reuse same environment
listOutput, err := cmd2.CombinedOutput()
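
Both discovery invocations now pass --verbose because the parser keys off backend initialization lines (compute capability, compiled ARCHS, Vulkan UMA flags) that llama-server only prints at higher verbosity; the device list itself still comes from the "Available devices:" stdout block. A minimal sketch of parsing those stdout lines — the regex and function names here are illustrative, not the exact ones in the tree:

package main

import (
	"fmt"
	"regexp"
	"strconv"
)

// Matches stdout lines such as:
//   CUDA0: NVIDIA GeForce RTX 4090 (24564 MiB, 23592 MiB free)
//   Metal: Apple M3 Max (98304 MiB, 98303 MiB free)
var deviceLineRe = regexp.MustCompile(`^\s*(\S.*?): (.+) \((\d+) MiB, (\d+) MiB free\)$`)

func parseDeviceLine(line string) (name, desc string, totalMiB, freeMiB uint64, ok bool) {
	m := deviceLineRe.FindStringSubmatch(line)
	if m == nil {
		return "", "", 0, 0, false
	}
	totalMiB, _ = strconv.ParseUint(m[3], 10, 64)
	freeMiB, _ = strconv.ParseUint(m[4], 10, 64)
	return m[1], m[2], totalMiB, freeMiB, true
}

func main() {
	name, desc, total, free, ok := parseDeviceLine("CUDA0: NVIDIA GeForce RTX 4090 (24564 MiB, 23592 MiB free)")
	fmt.Println(name, desc, total, free, ok) // CUDA0 NVIDIA GeForce RTX 4090 24564 23592 true
}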


@@ -1,348 +1,261 @@
package discover
import (
"fmt"
"io"
"log/slog"
"strconv"
"strings"
"testing"
"github.com/ollama/ollama/logutil"
)
func TestLlamaServerDiscoveryOutputOnlyTrace(t *testing.T) {
original := slog.Default()
t.Cleanup(func() {
slog.SetDefault(original)
func TestLlamaServerDiscovery(t *testing.T) {
t.Run("output only trace", func(t *testing.T) {
original := slog.Default()
t.Cleanup(func() {
slog.SetDefault(original)
})
slog.SetDefault(logutil.NewLogger(io.Discard, slog.LevelDebug))
if got := llamaServerDiscoveryOutput(t.Context()); got != io.Discard {
t.Fatal("debug logging should discard raw llama-server discovery output")
}
slog.SetDefault(logutil.NewLogger(io.Discard, logutil.LevelTrace))
if got := llamaServerDiscoveryOutput(t.Context()); got == io.Discard {
t.Fatal("trace logging should emit raw llama-server discovery output")
}
})
slog.SetDefault(logutil.NewLogger(io.Discard, slog.LevelDebug))
if got := llamaServerDiscoveryOutput(t.Context()); got != io.Discard {
t.Fatal("debug logging should discard raw llama-server discovery output")
}
t.Run("parse devices", func(t *testing.T) {
type wantDevice struct {
name string
library string
totalMiB uint64
compute string
gfxTarget string
checkIntegrated bool
integrated bool
}
slog.SetDefault(logutil.NewLogger(io.Discard, logutil.LevelTrace))
if got := llamaServerDiscoveryOutput(t.Context()); got == io.Discard {
t.Fatal("trace logging should emit raw llama-server discovery output")
}
}
func TestParseLlamaServerDevices(t *testing.T) {
tests := []struct {
name string
output string
libDirs []string
wantLen int
wantName string
wantLib string
wantMiB uint64
}{
{
name: "NVIDIA CUDA",
output: `load_backend: loaded CUDA backend from /lib/ollama/cuda_v12/libggml-cuda.so
tests := []struct {
name string
output string
libDirs []string
want []wantDevice
}{
{
name: "NVIDIA CUDA",
output: `load_backend: loaded CUDA backend from /lib/ollama/cuda_v12/libggml-cuda.so
Available devices:
NVIDIA GeForce RTX 4090: NVIDIA CUDA (24564 MiB, 23592 MiB free)
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
wantLen: 1,
wantName: "NVIDIA GeForce RTX 4090",
wantLib: "CUDA",
wantMiB: 24564,
},
{
name: "Metal",
output: `Available devices:
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
want: []wantDevice{{
name: "NVIDIA GeForce RTX 4090",
library: "CUDA",
totalMiB: 24564,
}},
},
{
name: "Metal",
output: `Available devices:
Metal: Apple M3 Max (98304 MiB, 98303 MiB free)
`,
libDirs: []string{"/lib/ollama"},
wantLen: 1,
wantName: "Metal",
wantLib: "Metal",
wantMiB: 98304,
},
{
name: "ROCm with gfx target",
output: ` Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
want: []wantDevice{{
name: "Metal",
library: "Metal",
totalMiB: 98304,
}},
},
{
name: "ROCm with gfx target",
output: ` Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
Available devices:
ROCm0: AMD Radeon RX 6700 XT (12272 MiB, 12248 MiB free)
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/rocm"},
wantLen: 1,
wantName: "ROCm0",
wantLib: "ROCm",
wantMiB: 12272,
},
{
name: "multi GPU",
output: `Available devices:
libDirs: []string{"/lib/ollama", "/lib/ollama/rocm"},
want: []wantDevice{{
name: "ROCm0",
library: "ROCm",
totalMiB: 12272,
compute: "gfx1031",
gfxTarget: "gfx1031",
}},
},
{
name: "multi GPU",
output: `Available devices:
CUDA0: NVIDIA GeForce RTX 4090 (24564 MiB, 23592 MiB free)
CUDA1: NVIDIA GeForce RTX 3060 (12288 MiB, 11500 MiB free)
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
wantLen: 2,
},
{
name: "no devices",
output: "Available devices:\n",
libDirs: []string{"/lib/ollama"},
wantLen: 0,
},
{
name: "empty output",
output: "",
libDirs: []string{"/lib/ollama"},
wantLen: 0,
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
devices := parseLlamaServerDevices(tt.output, tt.libDirs)
if len(devices) != tt.wantLen {
t.Fatalf("got %d devices, want %d", len(devices), tt.wantLen)
}
if tt.wantLen > 0 {
if tt.wantName != "" && devices[0].Name != tt.wantName {
t.Errorf("name = %q, want %q", devices[0].Name, tt.wantName)
}
if tt.wantLib != "" && devices[0].Library != tt.wantLib {
t.Errorf("library = %q, want %q", devices[0].Library, tt.wantLib)
}
if tt.wantMiB > 0 {
expectedBytes := tt.wantMiB * 1024 * 1024
if devices[0].TotalMemory != expectedBytes {
t.Errorf("total memory = %d, want %d", devices[0].TotalMemory, expectedBytes)
}
}
}
})
}
}
func TestParseLlamaServerDevicesMarksVulkanUMAGPUsIntegrated(t *testing.T) {
output := `ggml_vulkan: 0 = Intel(R) Graphics (Intel open-source Mesa driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 65536 | int dot: 1 | matrix cores: none
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v12"},
want: []wantDevice{
{name: "CUDA0", library: "CUDA", totalMiB: 24564},
{name: "CUDA1", library: "CUDA", totalMiB: 12288},
},
},
{
name: "Vulkan UMA",
output: `ggml_vulkan: 0 = Intel(R) Graphics (Intel open-source Mesa driver) | uma: 1 | fp16: 1 | bf16: 0 | warp size: 32 | shared memory: 65536 | int dot: 1 | matrix cores: none
Available devices:
Vulkan0: Intel(R) Graphics (16384 MiB, 12288 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama", "/lib/ollama/vulkan"})
if len(devices) != 1 {
t.Fatalf("expected 1 device, got %d", len(devices))
}
if !devices[0].Integrated {
t.Fatal("expected Vulkan UMA device to be marked integrated")
}
}
func TestCUDADeviceFilteredByArchs(t *testing.T) {
// GTX 1060 (CC 6.1 = 610) with v13 ARCHS that don't include 610
output := `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/vulkan"},
want: []wantDevice{{
name: "Vulkan0",
library: "Vulkan",
totalMiB: 16384,
checkIntegrated: true,
integrated: true,
}},
},
{
name: "Vulkan without UMA metadata",
output: `Available devices:
Vulkan0: AMD Radeon(TM) Graphics (32768 MiB, 31000 MiB free)
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/vulkan"},
want: []wantDevice{{
name: "Vulkan0",
library: "Vulkan",
totalMiB: 32768,
checkIntegrated: true,
}},
},
{
name: "CUDA device filtered by compiled archs",
output: `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
load_backend: loaded CUDA backend from /lib/ollama/cuda_v13/libggml-cuda.so
system_info: n_threads = 4 | CUDA : ARCHS = 750,800,860,890,900,1000,1030,1100,1200,1210 |
Available devices:
CUDA0: NVIDIA GeForce GTX 1060 6GB (6063 MiB, 5900 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama", "/lib/ollama/cuda_v13"})
if len(devices) != 0 {
t.Fatalf("expected 0 devices (GTX 1060 CC 610 not in ARCHS), got %d", len(devices))
}
}
func TestCUDADeviceKeptByArchs(t *testing.T) {
// RTX 4060 Ti (CC 8.9 = 890) with v13 ARCHS that include 890
output := `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 16379 MiB):
`,
libDirs: []string{"/lib/ollama", "/lib/ollama/cuda_v13"},
},
{
name: "CUDA device kept by compiled archs",
output: `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 16379 MiB):
Device 0: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB
system_info: n_threads = 16 | CUDA : ARCHS = 750,800,860,890,900,1000,1030,1100,1200,1210 |
Available devices:
CUDA0: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device (CC 890 in ARCHS), got %d", len(devices))
}
if devices[0].ComputeMajor != 8 || devices[0].ComputeMinor != 9 {
t.Fatalf("expected compute 8.9, got %s", devices[0].Compute())
}
}
func TestCUDANoArchsFailOpen(t *testing.T) {
// No system_info line — should keep all devices (fail open)
output := `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
`,
want: []wantDevice{{
name: "CUDA0",
library: "CUDA",
totalMiB: 16379,
compute: "8.9",
}},
},
{
name: "CUDA without compiled archs fails open",
output: `ggml_cuda_init: found 1 CUDA devices (Total VRAM: 6063 MiB):
Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
Available devices:
CUDA0: NVIDIA GeForce GTX 1060 6GB (6063 MiB, 5900 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device (no ARCHS = fail open), got %d", len(devices))
}
if devices[0].ComputeMajor != 6 || devices[0].ComputeMinor != 1 {
t.Fatalf("expected compute 6.1, got %s", devices[0].Compute())
}
}
func TestCUDANoCCFailOpen(t *testing.T) {
// Device line without compute capability — should keep (fail open)
output := `system_info: n_threads = 4 | CUDA : ARCHS = 750,800 |
`,
want: []wantDevice{{
name: "CUDA0",
library: "CUDA",
totalMiB: 6063,
compute: "6.1",
}},
},
{
name: "CUDA without compute capability fails open",
output: `system_info: n_threads = 4 | CUDA : ARCHS = 750,800 |
Available devices:
CUDA0: Some Future GPU (8192 MiB, 8000 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device (no CC = fail open), got %d", len(devices))
}
}
func TestCUDAMultiDeviceMixedFilter(t *testing.T) {
// Two devices: one supported (CC 890), one not (CC 610)
output := `ggml_cuda_init: found 2 CUDA devices:
`,
want: []wantDevice{{
name: "CUDA0",
library: "CUDA",
totalMiB: 8192,
}},
},
{
name: "CUDA mixed arch support",
output: `ggml_cuda_init: found 2 CUDA devices:
Device 0: NVIDIA GeForce GTX 1060, compute capability 6.1, VMM: yes, VRAM: 6063 MiB
Device 1: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB
system_info: n_threads = 8 | CUDA : ARCHS = 750,800,860,890 |
Available devices:
CUDA0: NVIDIA GeForce GTX 1060 (6063 MiB, 5900 MiB free)
CUDA1: NVIDIA GeForce RTX 4060 Ti (16379 MiB, 14900 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device (only RTX 4060 Ti), got %d", len(devices))
}
if devices[0].Name != "CUDA1" {
t.Errorf("expected CUDA1, got %s", devices[0].Name)
}
}
func TestROCmDeviceGFXTarget(t *testing.T) {
output := `ggml_cuda_init: found 1 ROCm devices (Total VRAM: 12272 MiB):
Device 0: AMD Radeon RX 6700 XT, gfx1031 (0x1031), VMM: no, Wave Size: 32, VRAM: 12272 MiB
Available devices:
ROCm0: AMD Radeon RX 6700 XT (12272 MiB, 12248 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 1 {
t.Fatalf("expected 1 device, got %d", len(devices))
}
if devices[0].GFXTarget != "gfx1031" {
t.Errorf("expected gfx1031, got %s", devices[0].GFXTarget)
}
if devices[0].Compute() != "gfx1031" {
t.Errorf("expected compute gfx1031, got %s", devices[0].Compute())
}
}
func TestROCmDeviceGFXTargetWithXnack(t *testing.T) {
// gfx906 with :sramecc+:xnack- suffix (e.g., Radeon Pro VII)
output := `ggml_cuda_init: found 2 ROCm devices (Total VRAM: 32736 MiB):
`,
want: []wantDevice{{
name: "CUDA1",
library: "CUDA",
totalMiB: 16379,
compute: "8.9",
}},
},
{
name: "ROCm gfx target with xnack suffix",
output: `ggml_cuda_init: found 2 ROCm devices (Total VRAM: 32736 MiB):
Device 0: AMD Radeon RX 6800, gfx1030 (0x1030), VMM: no, Wave Size: 32, VRAM: 16368 MiB
Device 1: AMD Radeon Pro VII, gfx906:sramecc+:xnack- (0x906), VMM: no, Wave Size: 64, VRAM: 16368 MiB
Available devices:
ROCm0: AMD Radeon RX 6800 (16368 MiB, 16342 MiB free)
ROCm1: AMD Radeon Pro VII (16368 MiB, 16348 MiB free)
`
devices := parseLlamaServerDevices(output, []string{"/lib/ollama"})
if len(devices) != 2 {
t.Fatalf("expected 2 devices, got %d", len(devices))
}
if devices[0].GFXTarget != "gfx1030" {
t.Errorf("device 0: expected gfx1030, got %s", devices[0].GFXTarget)
}
if devices[1].GFXTarget != "gfx906" {
t.Errorf("device 1: expected gfx906, got %s", devices[1].GFXTarget)
}
if devices[0].Compute() != "gfx1030" {
t.Errorf("device 0: expected compute gfx1030, got %s", devices[0].Compute())
}
if devices[1].Compute() != "gfx906" {
t.Errorf("device 1: expected compute gfx906, got %s", devices[1].Compute())
}
}
`,
want: []wantDevice{
{name: "ROCm0", library: "ROCm", totalMiB: 16368, compute: "gfx1030", gfxTarget: "gfx1030"},
{name: "ROCm1", library: "ROCm", totalMiB: 16368, compute: "gfx906", gfxTarget: "gfx906"},
},
},
{
name: "unknown library",
output: `Available devices:
Future0: Mystery Accelerator (8192 MiB, 8000 MiB free)
`,
want: []wantDevice{{
name: "Future0",
library: "Mystery Accelerator",
totalMiB: 8192,
}},
},
{
name: "no devices",
output: "Available devices:\n",
},
{
name: "empty output",
},
}
func TestInferLibrary(t *testing.T) {
tests := []struct {
name string
desc string
want string
}{
{"NVIDIA CUDA", "NVIDIA GeForce RTX 4090", "CUDA"},
{"CUDA0", "NVIDIA GeForce RTX 4090", "CUDA"},
{"AMD ROCm", "AMD Radeon RX 6700 XT", "ROCm"},
{"ROCm0", "AMD Radeon RX 6700 XT", "ROCm"},
{"Metal", "Apple M3 Max", "Metal"},
{"Vulkan0", "NVIDIA GeForce RTX 4090 (Vulkan)", "Vulkan"},
{"Unknown", "Unknown Backend", "Unknown Backend"},
}
for _, tt := range tests {
got := inferLibrary(tt.name, tt.desc)
if got != tt.want {
t.Errorf("inferLibrary(%q, %q) = %q, want %q", tt.name, tt.desc, got, tt.want)
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
if tt.libDirs == nil {
tt.libDirs = []string{"/lib/ollama"}
}
devices := parseLlamaServerDevices(tt.output, tt.libDirs)
if len(devices) != len(tt.want) {
t.Fatalf("got %d devices, want %d", len(devices), len(tt.want))
}
for i, want := range tt.want {
got := devices[i]
if want.name != "" && got.Name != want.name {
t.Errorf("device %d name = %q, want %q", i, got.Name, want.name)
}
if want.library != "" && got.Library != want.library {
t.Errorf("device %d library = %q, want %q", i, got.Library, want.library)
}
if want.totalMiB > 0 && got.TotalMemory != want.totalMiB*1024*1024 {
t.Errorf("device %d total memory = %d, want %d MiB", i, got.TotalMemory, want.totalMiB)
}
if want.compute != "" && got.Compute() != want.compute {
t.Errorf("device %d compute = %q, want %q", i, got.Compute(), want.compute)
}
if want.gfxTarget != "" && got.GFXTarget != want.gfxTarget {
t.Errorf("device %d gfx target = %q, want %q", i, got.GFXTarget, want.gfxTarget)
}
if want.checkIntegrated && got.Integrated != want.integrated {
t.Errorf("device %d integrated = %v, want %v", i, got.Integrated, want.integrated)
}
}
})
}
}
}
func TestCudaCCRegex(t *testing.T) {
tests := []struct {
line string
wantIdx int
wantCC string
}{
{" Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB", 0, "610"},
{" Device 1: NVIDIA GeForce RTX 4060 Ti, compute capability 8.9, VMM: yes, VRAM: 16379 MiB", 1, "890"},
{" Device 0: NVIDIA RTX PRO 6000, compute capability 12.0, VMM: yes, VRAM: 97250 MiB", 0, "1200"},
{" Device 0: Tesla V100-PCIE-16GB, compute capability 7.0, VMM: yes, VRAM: 16160 MiB", 0, "700"},
}
for _, tt := range tests {
matches := cudaCCRegex.FindStringSubmatch(tt.line)
if matches == nil {
t.Errorf("expected match for %q", tt.line)
continue
}
idx, _ := strconv.Atoi(matches[1])
major, _ := strconv.Atoi(matches[2])
minor, _ := strconv.Atoi(matches[3])
cc := fmt.Sprintf("%d%d0", major, minor)
if idx != tt.wantIdx {
t.Errorf("for %q: got idx %d, want %d", tt.line, idx, tt.wantIdx)
}
if cc != tt.wantCC {
t.Errorf("for %q: got CC %s, want %s", tt.line, cc, tt.wantCC)
}
}
}
func TestCudaArchsRegex(t *testing.T) {
tests := []struct {
line string
want []string
}{
{
"system_info: n_threads = 16 | CUDA : ARCHS = 750,800,860,890 | USE_GRAPHS = 1 |",
[]string{"750", "800", "860", "890"},
},
{
"system_info: | CUDA : ARCHS = 500,520,600,610,700,750,800,860,890,900,1200 |",
[]string{"500", "520", "600", "610", "700", "750", "800", "860", "890", "900", "1200"},
},
{
"no archs here",
nil,
},
}
for _, tt := range tests {
matches := cudaArchsRegex.FindStringSubmatch(tt.line)
if tt.want == nil {
if matches != nil {
t.Errorf("expected no match for %q, got %v", tt.line, matches)
}
continue
}
if matches == nil {
t.Errorf("expected match for %q, got nil", tt.line)
continue
}
got := strings.Split(matches[1], ",")
if len(got) != len(tt.want) {
t.Errorf("for %q: got %v, want %v", tt.line, got, tt.want)
}
}
})
}
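
The CUDA cases above pin down a fail-open filtering policy: a compute capability X.Y is flattened to the SM-style integer used in the compiled ARCHS list ("6.1" → "610", "12.0" → "1200"), and a device is dropped only when both its capability and the ARCHS list were parsed and the capability is missing from the list. A compact sketch of that policy, assuming regex patterns that match the log lines in these tests:

package main

import (
	"fmt"
	"regexp"
	"slices"
	"strconv"
	"strings"
)

// Illustrative patterns matching the log lines exercised above.
var (
	ccRe    = regexp.MustCompile(`Device (\d+): .*, compute capability (\d+)\.(\d+),`)
	archsRe = regexp.MustCompile(`CUDA : ARCHS = ([0-9,]+)`)
)

// keepCUDADevice applies the fail-open policy the tests describe: a device is
// dropped only when both its compute capability and the compiled ARCHS list
// are known and the capability is absent from the list.
func keepCUDADevice(deviceLine, systemInfoLine string) bool {
	cc := ccRe.FindStringSubmatch(deviceLine)
	archs := archsRe.FindStringSubmatch(systemInfoLine)
	if cc == nil || archs == nil {
		return true // fail open
	}
	major, _ := strconv.Atoi(cc[2])
	minor, _ := strconv.Atoi(cc[3])
	sm := fmt.Sprintf("%d%d0", major, minor) // 6.1 -> "610", 8.9 -> "890"
	return slices.Contains(strings.Split(archs[1], ","), sm)
}

func main() {
	fmt.Println(keepCUDADevice(
		"  Device 0: NVIDIA GeForce GTX 1060 6GB, compute capability 6.1, VMM: yes, VRAM: 6063 MiB",
		"system_info: n_threads = 4 | CUDA : ARCHS = 750,800,860,890 |")) // false: 610 not compiled in
}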


@@ -139,7 +139,7 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
wg.Add(1)
go func(i int) {
defer wg.Done()
extraEnvs := ml.GetDevicesEnv(devices[i:i+1], true)
extraEnvs := ml.GetDevicesEnv(devices[i : i+1])
devices[i].AddInitValidation(extraEnvs)
if len(bootstrapDevicesWithMetalRetry(ctx2ndPass, ctx, 30*time.Second, devices[i].LibraryPath, extraEnvs)) == 0 {
slog.Debug("filtering device which didn't fully initialize",
@@ -324,9 +324,10 @@ func GPUDevices(ctx context.Context, runners []ml.FilteredRunnerDiscovery) []ml.
rctx, cancel := context.WithTimeout(ctx, 3*time.Second)
defer cancel()
// Apply any dev filters to avoid re-discovering unsupported devices, and get IDs correct
// We avoid CUDA filters here to keep ROCm from failing to discover GPUs in a mixed environment
devFilter := ml.GetDevicesEnv(devices, false)
// Apply any device filters to avoid re-discovering unsupported devices,
// and get IDs correct. GetDevicesEnv only filters visibility when this
// pass is narrowed to a single device.
devFilter := ml.GetDevicesEnv(devices)
for dir := range libDirs {
updatedDevices := bootstrapDevicesWithMetalRetry(rctx, ctx, 3*time.Second, []string{ml.LibOllamaPath, dir}, devFilter)


@@ -28,12 +28,6 @@ using namespace llama_ollama_compat::detail; // pull detail:: helpers into scope
namespace {
thread_local uint64_t g_compat_info_logs = 0;
void note_compat_info_log() {
++g_compat_info_logs;
}
#ifdef OLLAMA_COMPAT_MTMD_BUILD
void ollama_compat_log(const char * format, ...) {
std::va_list args;
@@ -42,10 +36,10 @@ void ollama_compat_log(const char * format, ...) {
va_end(args);
}
#define OLLAMA_COMPAT_LOG_INFO(...) do { note_compat_info_log(); ollama_compat_log(__VA_ARGS__); } while (0)
#define OLLAMA_COMPAT_LOG_INFO(...) do { ollama_compat_log(__VA_ARGS__); } while (0)
#define OLLAMA_COMPAT_LOG_ERROR(...) ollama_compat_log(__VA_ARGS__)
#else
#define OLLAMA_COMPAT_LOG_INFO(...) do { note_compat_info_log(); LLAMA_LOG_INFO(__VA_ARGS__); } while (0)
#define OLLAMA_COMPAT_LOG_INFO(...) do { LLAMA_LOG_INFO(__VA_ARGS__); } while (0)
#define OLLAMA_COMPAT_LOG_ERROR(...) LLAMA_LOG_ERROR(__VA_ARGS__)
#endif
@@ -53,47 +47,23 @@ double elapsed_ms(std::chrono::steady_clock::time_point start) {
return std::chrono::duration<double, std::milli>(std::chrono::steady_clock::now() - start).count();
}
class CompatScopeTimer {
public:
CompatScopeTimer(const char * phase, const std::string * arch = nullptr)
: phase_(phase),
arch_before_(arch ? *arch : ""),
arch_(arch),
log_start_(g_compat_info_logs),
start_(std::chrono::steady_clock::now()) {}
void force_log() {
forced_ = true;
}
~CompatScopeTimer() {
const bool arch_changed = arch_ && *arch_ != arch_before_;
if (!forced_ && !arch_changed && g_compat_info_logs == log_start_) return;
if (arch_) {
if (arch_changed) {
OLLAMA_COMPAT_LOG_INFO("compat patch timing: phase=%s arch_before=%s arch_after=%s duration_ms=%.3f\n",
phase_, arch_before_.c_str(), arch_->c_str(), elapsed_ms(start_));
} else {
OLLAMA_COMPAT_LOG_INFO("compat patch timing: phase=%s arch=%s duration_ms=%.3f\n",
phase_, arch_->c_str(), elapsed_ms(start_));
}
return;
}
OLLAMA_COMPAT_LOG_INFO("compat patch timing: phase=%s duration_ms=%.3f\n",
phase_, elapsed_ms(start_));
}
private:
const char * phase_;
std::string arch_before_;
const std::string * arch_;
uint64_t log_start_;
std::chrono::steady_clock::time_point start_;
bool forced_ = false;
struct TransformTiming {
uint64_t count;
size_t bytes;
double ms;
};
std::mutex g_transform_timing_mutex;
TransformTiming g_transform_timing = {};
TransformTiming record_transform_timing(size_t bytes, double ms) {
std::lock_guard<std::mutex> lk(g_transform_timing_mutex);
g_transform_timing.count++;
g_transform_timing.bytes += bytes;
g_transform_timing.ms += ms;
return g_transform_timing;
}
// Per-loader file path registry — set by translate_metadata, read by
// maybe_load_text_tensor so it can pass the path to load ops without a
// separate patch insertion in the model loader's load_all_data path.
@@ -2791,7 +2761,6 @@ bool translate_metadata(const llama_model_loader * ml,
std::string & arch_name,
const char * fname) {
if (!meta) return false;
CompatScopeTimer timing("metadata", &arch_name);
{
std::lock_guard<std::mutex> lk(g_loader_path_mutex);
g_loader_paths[ml] = fname ? fname : "";
@@ -2826,13 +2795,14 @@
// Dispatch. Add more arches as they are wired up.
const bool no_mmap = is_mmap_disabled_for(ml);
if (no_mmap) timing.force_log();
if (no_mmap) {
OLLAMA_COMPAT_LOG_INFO("compat patch disabled mmap for transformed text tensors\n");
}
return no_mmap;
}
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
if (!meta) return;
CompatScopeTimer timing("clip");
handle_legacy_llava_projector(meta);
@@ -2921,8 +2891,11 @@ bool maybe_load_tensor(ggml_tensor * cur,
ggml_backend_tensor_set(cur, dst.data(), 0, dst_size);
}
OLLAMA_COMPAT_LOG_INFO("%s: %s for %s (%zu bytes) in %.3f ms\n",
__func__, op.description, ggml_get_name(cur), dst_size, elapsed_ms(start));
const double ms = elapsed_ms(start);
const TransformTiming total = record_transform_timing(dst_size, ms);
OLLAMA_COMPAT_LOG_INFO("compat tensor transform: op=%s tensor=%s bytes=%zu duration_ms=%.3f total_ops=%llu total_bytes=%zu total_ms=%.3f\n",
op.description, ggml_get_name(cur), dst_size, ms,
(unsigned long long) total.count, total.bytes, total.ms);
return true;
}
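
This hunk trades the per-phase CompatScopeTimer (and its g_compat_info_logs bookkeeping) for a single mutex-guarded aggregate, so each tensor transform logs its own cost plus running totals. The same accumulation pattern rendered in Go, purely for illustration — the shipped code is the C++ above:

package main

import (
	"fmt"
	"sync"
	"time"
)

// transformTiming mirrors the aggregate the patch keeps under a mutex: each
// transform adds its size and duration, and the updated totals are returned
// by value so they can be logged without holding the lock.
type transformTiming struct {
	count uint64
	bytes uint64
	ms    float64
}

var (
	mu    sync.Mutex
	total transformTiming
)

func recordTransformTiming(bytes uint64, d time.Duration) transformTiming {
	mu.Lock()
	defer mu.Unlock()
	total.count++
	total.bytes += bytes
	total.ms += float64(d.Microseconds()) / 1000.0
	return total
}

func main() {
	t := recordTransformTiming(4096, 2500*time.Microsecond)
	fmt.Printf("total_ops=%d total_bytes=%d total_ms=%.3f\n", t.count, t.bytes, t.ms)
}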


@@ -601,7 +601,7 @@ func NewLlamaServerRunner(
memWriter := &memoryParsingWriter{inner: status}
mediaMarker := newLlamaServerMediaMarker()
extraEnvs := ml.GetDevicesEnv(gpus, false)
extraEnvs := ml.GetDevicesEnv(gpus)
serverEnvs := make(map[string]string, len(extraEnvs)+1)
for k, v := range extraEnvs {
serverEnvs[k] = v


@@ -532,12 +532,12 @@ func (f FlashAttentionType) String() string {
// Given the list of GPUs this instantiation is targeted for,
// figure out the device environment variables and any recorded
// per-device runner environment overrides. Set mustFilter true to enable
// filtering of CUDA devices.
func GetDevicesEnv(l []DeviceInfo, mustFilter bool) map[string]string {
// per-device runner environment overrides.
func GetDevicesEnv(l []DeviceInfo) map[string]string {
if len(l) == 0 {
return nil
}
mustFilter := len(l) == 1
env := map[string]string{}
for _, d := range l {
d.updateVisibleDevicesEnv(env, mustFilter)
@@ -595,8 +595,12 @@ func (d DeviceInfo) updateVisibleDevicesEnv(env map[string]string, mustFilter bo
return
}
envVar = "CUDA_VISIBLE_DEVICES"
case "Vulkan":
if !mustFilter {
return
}
envVar = "GGML_VK_VISIBLE_DEVICES"
default:
// Vulkan is not filtered via env var, but via scheduling decisions
return
}
v, existing := env[envVar]
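
GetDevicesEnv no longer takes a mustFilter flag; filtering is implied by being handed exactly one device, and Vulkan gains GGML_VK_VISIBLE_DEVICES as its filtering env var. A reduced sketch of the new rule — the real method also honors a Vulkan FilterID and appends to any pre-existing value, which this sketch omits:

package main

import "fmt"

// Illustrative reduction of the new behavior: visibility filtering is implied
// by the slice length instead of a caller-supplied mustFilter flag.
type deviceID struct{ Library, ID string }

func getDevicesEnv(l []deviceID) map[string]string {
	if len(l) == 0 {
		return nil
	}
	mustFilter := len(l) == 1 // single-device placement => pin visibility
	env := map[string]string{}
	for _, d := range l {
		if !mustFilter {
			continue
		}
		switch d.Library {
		case "CUDA":
			env["CUDA_VISIBLE_DEVICES"] = d.ID
		case "Vulkan":
			env["GGML_VK_VISIBLE_DEVICES"] = d.ID
		}
	}
	return env
}

func main() {
	fmt.Println(getDevicesEnv([]deviceID{{"CUDA", "3"}}))                // map[CUDA_VISIBLE_DEVICES:3]
	fmt.Println(getDevicesEnv([]deviceID{{"CUDA", "3"}, {"CUDA", "4"}})) // map[]: multi-GPU stays unfiltered
}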


@@ -13,20 +13,13 @@ func TestMergeEnvWithRunnerEnvOverrides(t *testing.T) {
DeviceID: DeviceID{Library: "Metal", ID: "0"},
RunnerEnvOverrides: map[string]string{"GGML_METAL_TENSOR_DISABLE": "1"},
},
{
DeviceID: DeviceID{Library: "CUDA", ID: "3"},
},
}
env := GetDevicesEnv(devices, true)
env := GetDevicesEnv(devices)
if got, want := env["GGML_METAL_TENSOR_DISABLE"], "1"; got != want {
t.Fatalf("GGML_METAL_TENSOR_DISABLE = %q, want %q", got, want)
}
if got, want := env["CUDA_VISIBLE_DEVICES"], "3"; got != want {
t.Fatalf("CUDA_VISIBLE_DEVICES = %q, want %q", got, want)
}
}
func TestGetDevicesEnvWarnsOnConflictingOverrides(t *testing.T) {
@@ -48,7 +41,7 @@ func TestGetDevicesEnvWarnsOnConflictingOverrides(t *testing.T) {
},
}
env := GetDevicesEnv(devices, false)
env := GetDevicesEnv(devices)
if got, want := env["TEST_OVERRIDE"], "two"; got != want {
t.Fatalf("TEST_OVERRIDE = %q, want %q", got, want)
@@ -59,6 +52,53 @@ func TestGetDevicesEnvWarnsOnConflictingOverrides(t *testing.T) {
}
}
func TestGetDevicesEnvFiltersSingleDevice(t *testing.T) {
tests := []struct {
name string
gpus []DeviceInfo
key string
want string
}{
{
name: "single CUDA",
gpus: []DeviceInfo{{DeviceID: DeviceID{Library: "CUDA", ID: "3"}}},
key: "CUDA_VISIBLE_DEVICES",
want: "3",
},
{
name: "multiple CUDA",
gpus: []DeviceInfo{
{DeviceID: DeviceID{Library: "CUDA", ID: "3"}},
{DeviceID: DeviceID{Library: "CUDA", ID: "4"}},
},
key: "CUDA_VISIBLE_DEVICES",
},
{
name: "single Vulkan",
gpus: []DeviceInfo{{DeviceID: DeviceID{Library: "Vulkan", ID: "0"}, FilterID: "1"}},
key: "GGML_VK_VISIBLE_DEVICES",
want: "1",
},
{
name: "multiple Vulkan",
gpus: []DeviceInfo{
{DeviceID: DeviceID{Library: "Vulkan", ID: "0"}, FilterID: "1"},
{DeviceID: DeviceID{Library: "Vulkan", ID: "1"}, FilterID: "0"},
},
key: "GGML_VK_VISIBLE_DEVICES",
},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
env := GetDevicesEnv(tt.gpus)
if got := env[tt.key]; got != tt.want {
t.Fatalf("%s = %q, want %q", tt.key, got, tt.want)
}
})
}
}
func TestFlashAttentionSupported(t *testing.T) {
tests := []struct {
name string


@@ -784,26 +784,43 @@ func selectLlamaServerPlacement(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo,
}
if opts.MainGPU != nil {
selected, ok := bestExplicitMainGPUGroup(systemInfo, groups, *opts.MainGPU)
gpu, available, ok := bestExplicitMainGPU(systemInfo, groups, *opts.MainGPU)
if !ok {
selected = bestGPUGroupByAvailableMemory(systemInfo, groups)
selected := bestGPUGroupByAvailableMemory(systemInfo, groups)
slog.Warn("requested main_gpu is outside the selected GPU group; passing value through to llama-server",
"main_gpu", *opts.MainGPU,
"gpu_count", len(selected))
logSelectedGPUGroup(gpus, selected)
return selected, launchOpts
}
selected, launchOpts := singleLlamaServerGPUPlacement(gpu, launchOpts)
slog.Info("selecting requested single GPU for llama-server model",
"requested_main_gpu", *opts.MainGPU,
"main_gpu", *launchOpts.MainGPU,
"id", gpu.ID,
"filter_id", gpu.FilterID,
"library", gpu.Library,
"name", gpu.Name,
"description", gpu.Description,
"integrated", gpu.Integrated,
"available", format.HumanBytes2(available))
logSelectedGPUGroup(gpus, selected)
return selected, launchOpts
}
if !envconfig.SchedSpread() && predictedVRAM > 0 {
selected, mainGPU, gpu, available, ok := bestSingleGPUFit(systemInfo, groups, predictedVRAM)
gpu, available, ok := bestSingleGPUFit(systemInfo, groups, predictedVRAM)
if ok {
launchOpts.MainGPU = &mainGPU
selected, launchOpts := singleLlamaServerGPUPlacement(gpu, launchOpts)
slog.Info("selecting single GPU for llama-server model",
"main_gpu", mainGPU,
"main_gpu", *launchOpts.MainGPU,
"id", gpu.ID,
"filter_id", gpu.FilterID,
"library", gpu.Library,
"name", gpu.Name,
"description", gpu.Description,
"integrated", gpu.Integrated,
"predicted", format.HumanBytes2(predictedVRAM),
"available", format.HumanBytes2(available))
logSelectedGPUGroup(gpus, selected)
@@ -816,37 +833,41 @@ func selectLlamaServerPlacement(systemInfo ml.SystemInfo, gpus []ml.DeviceInfo,
return selected, launchOpts
}
func bestExplicitMainGPUGroup(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, mainGPU int) ([]ml.DeviceInfo, bool) {
func singleLlamaServerGPUPlacement(gpu ml.DeviceInfo, opts api.Options) ([]ml.DeviceInfo, api.Options) {
mainGPU := 0
opts.MainGPU = &mainGPU
return []ml.DeviceInfo{gpu}, opts
}
func bestExplicitMainGPU(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, mainGPU int) (gpu ml.DeviceInfo, available uint64, ok bool) {
if mainGPU < 0 {
return nil, false
return ml.DeviceInfo{}, 0, false
}
var best []ml.DeviceInfo
var bestAvailable uint64
for _, group := range groups {
if mainGPU >= len(group) {
continue
}
available := availableMemoryForGPU(systemInfo, group[mainGPU])
if best == nil || available > bestAvailable {
best = group
bestAvailable = available
candidate := group[mainGPU]
candidateAvailable := availableMemoryForGPU(systemInfo, candidate)
if !ok || betterPlacementGPU(candidate, candidateAvailable, gpu, available) {
gpu = candidate
available = candidateAvailable
ok = true
}
}
return best, best != nil
return gpu, available, ok
}
func bestSingleGPUFit(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, predictedVRAM uint64) (selected []ml.DeviceInfo, mainGPU int, gpu ml.DeviceInfo, available uint64, ok bool) {
func bestSingleGPUFit(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, predictedVRAM uint64) (gpu ml.DeviceInfo, available uint64, ok bool) {
for _, group := range groups {
for i, candidate := range group {
for _, candidate := range group {
candidateAvailable := availableMemoryForGPU(systemInfo, candidate)
if predictedVRAM > candidateAvailable*80/100 {
continue
}
if !ok || candidateAvailable > available {
selected = group
mainGPU = i
if !ok || betterPlacementGPU(candidate, candidateAvailable, gpu, available) {
gpu = candidate
available = candidateAvailable
ok = true
@@ -854,7 +875,15 @@ func bestSingleGPUFit(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo, predic
}
}
return selected, mainGPU, gpu, available, ok
return gpu, available, ok
}
func betterPlacementGPU(candidate ml.DeviceInfo, candidateAvailable uint64, current ml.DeviceInfo, currentAvailable uint64) bool {
if candidate.Integrated != current.Integrated {
return !candidate.Integrated
}
return candidateAvailable > currentAvailable
}
func bestGPUGroupByAvailableMemory(systemInfo ml.SystemInfo, groups [][]ml.DeviceInfo) []ml.DeviceInfo {
@@ -862,7 +891,7 @@ func bestGPUGroupByAvailableMemory(systemInfo ml.SystemInfo, groups [][]ml.Devic
var bestAvailable uint64
for _, group := range groups {
available, _, _ := availableMemoryForLoad(systemInfo, group)
if best == nil || available > bestAvailable {
if best == nil || betterPlacementGroup(group, available, best, bestAvailable) {
best = group
bestAvailable = available
}
@@ -871,6 +900,25 @@ func bestGPUGroupByAvailableMemory(systemInfo ml.SystemInfo, groups [][]ml.Devic
return best
}
func betterPlacementGroup(candidate []ml.DeviceInfo, candidateAvailable uint64, current []ml.DeviceInfo, currentAvailable uint64) bool {
candidateDiscrete := hasDiscreteGPU(candidate)
currentDiscrete := hasDiscreteGPU(current)
if candidateDiscrete != currentDiscrete {
return candidateDiscrete
}
return candidateAvailable > currentAvailable
}
func hasDiscreteGPU(gpus []ml.DeviceInfo) bool {
for _, gpu := range gpus {
if !gpu.Integrated {
return true
}
}
return false
}
func availableMemoryForGPU(systemInfo ml.SystemInfo, gpu ml.DeviceInfo) uint64 {
if gpu.Integrated && systemInfo.FreeMemory > 0 && systemInfo.FreeMemory < gpu.FreeMemory {
return systemInfo.FreeMemory
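
The new comparators make discreteness the primary placement key and available memory the tie-breaker, both for single GPUs (betterPlacementGPU) and for groups (betterPlacementGroup via hasDiscreteGPU); and once placement narrows to one GPU, singleLlamaServerGPUPlacement rewrites MainGPU to 0, since the visibility filter renumbers the selected device inside the runner. A small runnable demonstration of the ordering:

package main

import "fmt"

type gpu struct {
	name       string
	integrated bool
	available  uint64
}

// Same comparison as betterPlacementGPU above: discreteness is the primary
// key, available memory breaks ties within the same class.
func better(candidate, current gpu) bool {
	if candidate.integrated != current.integrated {
		return !candidate.integrated
	}
	return candidate.available > current.available
}

func main() {
	igpu := gpu{"integrated", true, 32 << 30}
	dgpu := gpu{"discrete", false, 10 << 30}
	fmt.Println(better(dgpu, igpu)) // true: discrete wins despite less memory
}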


@@ -1176,6 +1176,7 @@ func TestSelectLlamaServerPlacement(t *testing.T) {
wantLibrary string
wantMainGPU *int
wantSelectedGPUs int
wantGPUID string
}{
{
name: "compacts onto largest same-backend GPU",
@@ -1186,8 +1187,9 @@
},
opts: api.DefaultOptions(),
wantLibrary: "CUDA",
wantMainGPU: testIntPtr(1),
wantSelectedGPUs: 2,
wantMainGPU: testIntPtr(0),
wantSelectedGPUs: 1,
wantGPUID: "1",
},
{
name: "explicit main gpu selects matching backend group",
@@ -1201,8 +1203,9 @@
Runner: api.Runner{MainGPU: testIntPtr(1), NumGPU: -1},
},
wantLibrary: "ROCm",
wantMainGPU: testIntPtr(1),
wantSelectedGPUs: 2,
wantMainGPU: testIntPtr(0),
wantSelectedGPUs: 1,
wantGPUID: "1",
},
{
name: "integrated GPU is capped by system free memory",
@@ -1213,8 +1216,22 @@
},
opts: api.DefaultOptions(),
wantLibrary: "Metal",
wantMainGPU: testIntPtr(1),
wantSelectedGPUs: 2,
wantMainGPU: testIntPtr(0),
wantSelectedGPUs: 1,
wantGPUID: "1",
},
{
name: "prefers discrete GPU over integrated GPU with more available memory",
predictedVRAM: 8 * format.GigaByte,
gpus: []ml.DeviceInfo{
{DeviceID: ml.DeviceID{ID: "0", Library: "Vulkan"}, Name: "integrated", Integrated: true, FreeMemory: 32 * format.GigaByte},
{DeviceID: ml.DeviceID{ID: "1", Library: "Vulkan"}, Name: "discrete", FreeMemory: 10 * format.GigaByte},
},
opts: api.DefaultOptions(),
wantLibrary: "Vulkan",
wantMainGPU: testIntPtr(0),
wantSelectedGPUs: 1,
wantGPUID: "1",
},
{
name: "spread disables automatic compaction",
@@ -1249,6 +1266,9 @@
selected, launchOpts := selectLlamaServerPlacement(systemInfo, tt.gpus, tt.predictedVRAM, tt.opts)
require.Len(t, selected, tt.wantSelectedGPUs)
require.Equal(t, tt.wantLibrary, selected[0].Library)
if tt.wantGPUID != "" {
require.Equal(t, tt.wantGPUID, selected[0].ID)
}
if tt.wantMainGPU == nil {
require.Nil(t, launchOpts.MainGPU)
} else {
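
One more mechanism this table exercises ("integrated GPU is capped by system free memory"): availableMemoryForGPU caps an integrated device's free memory at the host's free RAM, since unified-memory GPUs share it. A sketch of that cap, with hypothetical names:

package main

import "fmt"

// Mirrors the cap visible in availableMemoryForGPU above: an integrated
// (unified-memory) GPU cannot have more usable VRAM than the host has free RAM.
func availableMemory(integrated bool, gpuFree, systemFree uint64) uint64 {
	if integrated && systemFree > 0 && systemFree < gpuFree {
		return systemFree
	}
	return gpuFree
}

func main() {
	fmt.Println(availableMemory(true, 96<<30, 24<<30))  // 25769803776: capped at system free
	fmt.Println(availableMemory(false, 96<<30, 24<<30)) // 103079215104: discrete is not capped
}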