llama/compat: shrink clip.cpp injection from 18 lines to 1

The clip.cpp tensor-read loop was the fattest hook in the patch — it
duplicated the host-vs-device buffer dispatch around a call into the
compat layer. Move that dispatch into our code (maybe_load_tensor),
so the upstream patch is a single conditional call.

Net: upstream patch drops from 48 lines across 6 files to 34 lines.
Every remaining edit is either a 1-line include, a 1-line function call,
or the gguf_rename_tensor shim (which accesses gguf_context internals
and has to live in gguf.cpp).

Verified end-to-end: text + vision both still correct after rebuild.
This commit is contained in:
jmorganca 2026-04-19 10:50:34 -07:00
parent 8c2c9d4c89
commit 021389f7bb
3 changed files with 38 additions and 50 deletions

View file

@@ -393,50 +393,53 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
}
}
bool supply_promoted_tensor_data(const ggml_tensor * cur,
const char * source_file,
size_t file_offset,
std::vector<uint8_t> & out) {
bool maybe_load_tensor(ggml_tensor * cur,
const char * source_file,
size_t file_offset,
ggml_backend_buffer_type_t buft) {
// Check registry: is this tensor marked for F16→F32 promotion?
{
std::lock_guard<std::mutex> lk(g_promote_mutex);
if (g_promote_f16_to_f32.find(ggml_get_name(cur)) == g_promote_f16_to_f32.end()) {
return false;
}
}
// cur->type is F32 (after promotion). Source bytes are F16 at file_offset.
if (cur->type != GGML_TYPE_F32) {
return false;
}
// Destination was promoted to F32 by translate_clip_metadata. Source
// bytes on disk are still F16 at file_offset.
if (cur->type != GGML_TYPE_F32) return false;
const size_t n_elem = ggml_nelements(cur);
const size_t src_bytes = n_elem * sizeof(uint16_t);
const size_t dst_bytes = n_elem * sizeof(float);
const size_t n_elem = ggml_nelements(cur);
const size_t src_size = n_elem * sizeof(uint16_t);
const size_t dst_size = n_elem * sizeof(float);
std::vector<uint8_t> src(src_bytes);
std::vector<uint8_t> src(src_size);
FILE * f = std::fopen(source_file, "rb");
if (!f) {
LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, source_file);
return false;
}
if (std::fseek(f, (long) file_offset, SEEK_SET) != 0) {
std::fclose(f);
LLAMA_LOG_ERROR("%s: failed to seek in '%s'\n", __func__, source_file);
return false;
}
if (std::fread(src.data(), 1, src_bytes, f) != src_bytes) {
if (std::fseek(f, (long) file_offset, SEEK_SET) != 0 ||
std::fread(src.data(), 1, src_size, f) != src_size) {
std::fclose(f);
LLAMA_LOG_ERROR("%s: failed to read %zu bytes for '%s'\n",
__func__, src_bytes, ggml_get_name(cur));
__func__, src_size, ggml_get_name(cur));
return false;
}
std::fclose(f);
out.resize(dst_bytes);
std::vector<uint8_t> dst(dst_size);
convert_f16_to_f32(reinterpret_cast<const uint16_t *>(src.data()),
reinterpret_cast<float *>(out.data()),
reinterpret_cast<float *>(dst.data()),
n_elem);
// Deliver the converted bytes to the tensor's final backend buffer.
if (ggml_backend_buft_is_host(buft)) {
std::memcpy(cur->data, dst.data(), dst_size);
} else {
ggml_backend_tensor_set(cur, dst.data(), 0, dst_size);
}
LLAMA_LOG_INFO("%s: promoted F16->F32 for %s (%zu elems)\n",
__func__, ggml_get_name(cur), n_elem);
return true;

View file

@@ -18,6 +18,8 @@
#include <string>
#include <vector>
#include "ggml-backend.h" // for ggml_backend_buffer_type_t
struct gguf_context;
struct ggml_context;
struct ggml_tensor;
@@ -64,15 +66,17 @@ void apply_tensor_transforms(const llama_model_loader * ml, ggml_context * ctx);
void translate_clip_metadata(gguf_context * meta, ggml_context * ctx);

// Called from clip.cpp's tensor-loading loop, before reading bytes from the
// file. If this tensor was marked for type promotion by translate_clip_metadata
// (e.g. F16→F32), reads the source bytes, converts them, and writes the
// result directly into `cur` (choosing host copy vs. backend upload based
// on `buft`). Returns true if the tensor was handled — caller should skip
// its normal file-read path. Returns false otherwise; caller loads normally.
//
// `file_offset` is the absolute file offset of the original (pre-promotion)
// tensor data in the source GGUF.
bool maybe_load_tensor(ggml_tensor * cur,
                       const char * source_file,
                       size_t file_offset,
                       ggml_backend_buffer_type_t buft);
} // namespace llama_ollama_compat

View file

@ -108,7 +108,7 @@ index 4ded484dd..7d3509c23 100644
if (use_mmap_buffer) {
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
index f0e8786b6..ec2a7d320 100644
index f0e8786b6..1e6319ca0 100644
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -10,6 +10,8 @@
@@ -132,30 +132,11 @@ index f0e8786b6..ec2a7d320 100644
const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());
// print gguf info
@@ -2358,11 +2365,25 @@ struct clip_model_loader {
@@ -2358,6 +2365,7 @@ struct clip_model_loader {
auto it_off = tensor_offset.find(t->name);
GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
const size_t offset = it_off->second;
+ size_t num_bytes = ggml_nbytes(cur);
+
+ // Ollama-compat: let the compat layer supply promoted tensor
+ // data (e.g. F16→F32 for conv weights) instead of reading
+ // bytes directly from the file.
+ std::vector<uint8_t> compat_buf;
+ if (llama_ollama_compat::supply_promoted_tensor_data(cur, fname.c_str(), offset, compat_buf)) {
+ if (ggml_backend_buft_is_host(buft)) {
+ std::memcpy(cur->data, compat_buf.data(), num_bytes);
+ } else {
+ ggml_backend_tensor_set(cur, compat_buf.data(), 0, num_bytes);
+ }
+ continue;
+ }
+
+ if (llama_ollama_compat::maybe_load_tensor(cur, fname.c_str(), offset, buft)) continue;
fin.seekg(offset, std::ios::beg);
if (!fin) {
throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
}
- size_t num_bytes = ggml_nbytes(cur);
if (ggml_backend_buft_is_host(buft)) {
// for the CPU and Metal backend, we can read directly into the tensor
fin.read(reinterpret_cast<char *>(cur->data), num_bytes);