Mirror of https://github.com/ollama/ollama.git (synced 2026-05-13 14:27:00 +00:00)
llama/compat: shrink clip.cpp injection from 18 lines to 1
The clip.cpp tensor-read loop was the fattest hook in the patch: it duplicated the host-vs-device buffer dispatch around a call into the compat layer. Move that dispatch into our code (maybe_load_tensor), so the upstream patch is a single conditional call.

Net: the upstream patch drops from 48 lines across 6 files to 34 lines. Every remaining edit is either a 1-line include, a 1-line function call, or the gguf_rename_tensor shim (which accesses gguf_context internals and has to live in gguf.cpp).

Verified end-to-end: text + vision both still correct after rebuild.
parent 8c2c9d4c89
commit 021389f7bb
3 changed files with 38 additions and 50 deletions
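The "host-vs-device buffer dispatch" named in the message is the standard ggml-backend pattern. As a minimal sketch of what maybe_load_tensor now does after converting the bytes (the helper name deliver_promoted_bytes is hypothetical, not part of this commit):

    #include <cstdint>
    #include <cstring>
    #include <vector>

    #include "ggml-backend.h"

    // Sketch only: shows the dispatch that used to sit in clip.cpp and now
    // lives inside the compat layer. `converted` stands in for the already
    // promoted F32 bytes.
    static void deliver_promoted_bytes(ggml_tensor * cur,
                                       const std::vector<uint8_t> & converted,
                                       ggml_backend_buffer_type_t buft) {
        if (ggml_backend_buft_is_host(buft)) {
            // host buffer (e.g. CPU/Metal): tensor memory is directly addressable
            std::memcpy(cur->data, converted.data(), converted.size());
        } else {
            // device buffer: upload through the backend API
            ggml_backend_tensor_set(cur, converted.data(), 0, converted.size());
        }
    }

Host buffers expose tensor memory directly, so a memcpy suffices; anything else goes through ggml_backend_tensor_set, which handles the upload.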
llama/compat/llama-ollama-compat.cpp (vendored, 45 changed lines)
@@ -393,50 +393,53 @@ void translate_clip_metadata(gguf_context * meta, ggml_context * ctx) {
     }
 }

-bool supply_promoted_tensor_data(const ggml_tensor * cur,
-                                 const char * source_file,
-                                 size_t file_offset,
-                                 std::vector<uint8_t> & out) {
+bool maybe_load_tensor(ggml_tensor * cur,
+                       const char * source_file,
+                       size_t file_offset,
+                       ggml_backend_buffer_type_t buft) {
     // Check registry: is this tensor marked for F16→F32 promotion?
     {
         std::lock_guard<std::mutex> lk(g_promote_mutex);
         if (g_promote_f16_to_f32.find(ggml_get_name(cur)) == g_promote_f16_to_f32.end()) {
             return false;
         }
     }
-    // cur->type is F32 (after promotion). Source bytes are F16 at file_offset.
-    if (cur->type != GGML_TYPE_F32) {
-        return false;
-    }
+    // Destination was promoted to F32 by translate_clip_metadata. Source
+    // bytes on disk are still F16 at file_offset.
+    if (cur->type != GGML_TYPE_F32) return false;

-    const size_t n_elem = ggml_nelements(cur);
-    const size_t src_bytes = n_elem * sizeof(uint16_t);
-    const size_t dst_bytes = n_elem * sizeof(float);
+    const size_t n_elem = ggml_nelements(cur);
+    const size_t src_size = n_elem * sizeof(uint16_t);
+    const size_t dst_size = n_elem * sizeof(float);

-    std::vector<uint8_t> src(src_bytes);
+    std::vector<uint8_t> src(src_size);

     FILE * f = std::fopen(source_file, "rb");
     if (!f) {
         LLAMA_LOG_ERROR("%s: failed to open '%s'\n", __func__, source_file);
         return false;
     }
-    if (std::fseek(f, (long) file_offset, SEEK_SET) != 0) {
-        std::fclose(f);
-        LLAMA_LOG_ERROR("%s: failed to seek in '%s'\n", __func__, source_file);
-        return false;
-    }
-    if (std::fread(src.data(), 1, src_bytes, f) != src_bytes) {
+    if (std::fseek(f, (long) file_offset, SEEK_SET) != 0 ||
+        std::fread(src.data(), 1, src_size, f) != src_size) {
         std::fclose(f);
         LLAMA_LOG_ERROR("%s: failed to read %zu bytes for '%s'\n",
-                        __func__, src_bytes, ggml_get_name(cur));
+                        __func__, src_size, ggml_get_name(cur));
         return false;
     }
     std::fclose(f);

-    out.resize(dst_bytes);
+    std::vector<uint8_t> dst(dst_size);
     convert_f16_to_f32(reinterpret_cast<const uint16_t *>(src.data()),
-                       reinterpret_cast<float *>(out.data()),
+                       reinterpret_cast<float *>(dst.data()),
                        n_elem);
+
+    // Deliver the converted bytes to the tensor's final backend buffer.
+    if (ggml_backend_buft_is_host(buft)) {
+        std::memcpy(cur->data, dst.data(), dst_size);
+    } else {
+        ggml_backend_tensor_set(cur, dst.data(), 0, dst_size);
+    }
+
     LLAMA_LOG_INFO("%s: promoted F16->F32 for %s (%zu elems)\n",
                    __func__, ggml_get_name(cur), n_elem);
     return true;
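The function above relies on a promotion registry (g_promote_mutex, g_promote_f16_to_f32) and a convert_f16_to_f32 helper that are defined elsewhere in the compat layer and are not part of this diff. As a rough sketch of what they might look like, assuming the conversion simply wraps ggml's ggml_fp16_to_fp32 (the actual definitions may differ):

    #include <cstdint>
    #include <mutex>
    #include <set>
    #include <string>

    #include "ggml.h"

    // Registry of tensor names marked for F16→F32 promotion by
    // translate_clip_metadata; guarded by a mutex in case loading is
    // concurrent. (Sketch: the real compat layer may use another container.)
    static std::mutex g_promote_mutex;
    static std::set<std::string> g_promote_f16_to_f32;

    // Convert n half-precision values to float, element by element,
    // assuming the helper just wraps ggml_fp16_to_fp32.
    static void convert_f16_to_f32(const uint16_t * src, float * dst, size_t n) {
        for (size_t i = 0; i < n; ++i) {
            dst[i] = ggml_fp16_to_fp32((ggml_fp16_t) src[i]);
        }
    }

Presumably translate_clip_metadata inserts a tensor's name into the set when it rewrites that tensor's type in the GGUF metadata, which is what the lookup at the top of maybe_load_tensor checks.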
llama/compat/llama-ollama-compat.h (vendored, 18 changed lines)
@@ -18,6 +18,8 @@
 #include <string>
 #include <vector>

+#include "ggml-backend.h" // for ggml_backend_buffer_type_t
+
 struct gguf_context;
 struct ggml_context;
 struct ggml_tensor;
@@ -64,15 +66,17 @@ void apply_tensor_transforms(const llama_model_loader * ml, ggml_context * ctx);
 void translate_clip_metadata(gguf_context * meta, ggml_context * ctx);

 // Called from clip.cpp's tensor-loading loop, before reading bytes from the
-// file. If this tensor was marked for type promotion by translate_clip_metadata,
-// fills `out` with the promoted data (e.g. F16→F32) and returns true. The
-// caller should then use `out` instead of reading from the file.
+// file. If this tensor was marked for type promotion by translate_clip_metadata
+// (e.g. F16→F32), reads the source bytes, converts them, and writes the
+// result directly into `cur` (choosing host copy vs. backend upload based
+// on `buft`). Returns true if the tensor was handled — caller should skip
+// its normal file-read path. Returns false otherwise; caller loads normally.
 //
 // `file_offset` is the absolute file offset of the original (pre-promotion)
 // tensor data in the source GGUF.
-bool supply_promoted_tensor_data(const ggml_tensor * cur,
-                                 const char * source_file,
-                                 size_t file_offset,
-                                 std::vector<uint8_t> & out);
+bool maybe_load_tensor(ggml_tensor * cur,
+                       const char * source_file,
+                       size_t file_offset,
+                       ggml_backend_buffer_type_t buft);

 } // namespace llama_ollama_compat
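The contract in the header comment boils down to one guarded call ahead of the normal read path. A minimal caller-side sketch under assumed names (load_one_tensor and its parameters are illustrative only; the real call site is the clip.cpp hook in the patch diff below):

    #include <fstream>
    #include <string>
    #include <vector>

    #include "ggml-backend.h"
    #include "llama-ollama-compat.h" // declares maybe_load_tensor (this commit)

    static void load_one_tensor(std::ifstream & fin, ggml_tensor * cur,
                                const std::string & fname, size_t offset,
                                ggml_backend_buffer_type_t buft) {
        // If the tensor was promoted (F16→F32), the compat layer reads,
        // converts, and writes it into `cur` itself; nothing more to do.
        if (llama_ollama_compat::maybe_load_tensor(cur, fname.c_str(), offset, buft)) {
            return;
        }
        // Otherwise read the raw bytes from the file as usual.
        fin.seekg(offset, std::ios::beg);
        std::vector<char> buf(ggml_nbytes(cur));
        fin.read(buf.data(), static_cast<std::streamsize>(buf.size()));
        ggml_backend_tensor_set(cur, buf.data(), 0, buf.size());
    }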
The third changed file is the vendored upstream patch itself; its hunks update the recorded clip.cpp index hash and swap the old 18-line injection for the single maybe_load_tensor call:

@@ -108,7 +108,7 @@ index 4ded484dd..7d3509c23 100644

         if (use_mmap_buffer) {
 diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp
-index f0e8786b6..ec2a7d320 100644
+index f0e8786b6..1e6319ca0 100644
 --- a/tools/mtmd/clip.cpp
 +++ b/tools/mtmd/clip.cpp
 @@ -10,6 +10,8 @@
@@ -132,30 +132,11 @@ index f0e8786b6..ec2a7d320 100644
         const int n_tensors = gguf_get_n_tensors(ctx_gguf.get());

         // print gguf info
-@@ -2358,11 +2365,25 @@ struct clip_model_loader {
+@@ -2358,6 +2365,7 @@ struct clip_model_loader {
             auto it_off = tensor_offset.find(t->name);
             GGML_ASSERT(it_off != tensor_offset.end() && "no offset for tensor");
             const size_t offset = it_off->second;
-+            size_t num_bytes = ggml_nbytes(cur);
-+
-+            // Ollama-compat: let the compat layer supply promoted tensor
-+            // data (e.g. F16→F32 for conv weights) instead of reading
-+            // bytes directly from the file.
-+            std::vector<uint8_t> compat_buf;
-+            if (llama_ollama_compat::supply_promoted_tensor_data(cur, fname.c_str(), offset, compat_buf)) {
-+                if (ggml_backend_buft_is_host(buft)) {
-+                    std::memcpy(cur->data, compat_buf.data(), num_bytes);
-+                } else {
-+                    ggml_backend_tensor_set(cur, compat_buf.data(), 0, num_bytes);
-+                }
-+                continue;
-+            }
-+
++            if (llama_ollama_compat::maybe_load_tensor(cur, fname.c_str(), offset, buft)) continue;
             fin.seekg(offset, std::ios::beg);
             if (!fin) {
                 throw std::runtime_error(string_format("%s: failed to seek for tensor %s\n", __func__, t->name));
-            }
--            size_t num_bytes = ggml_nbytes(cur);
-            if (ggml_backend_buft_is_host(buft)) {
-                // for the CPU and Metal backend, we can read directly into the tensor
-                fin.read(reinterpret_cast<char *>(cur->data), num_bytes);