Mirror of https://github.com/likelovewant/ollama-for-amd.git

Commit: Merge branch 'ollama:main' into main
@@ -66,6 +66,7 @@ Examples:
 
 	llm/backend/mlx: support the llama architecture
 	CONTRIBUTING: provide clarity on good commit messages, and bad
 	docs: simplify manual installation with shorter curl commands
 
 Bad Examples:
@@ -769,8 +769,8 @@ func (t *ThinkValue) IsString() bool {
 	return ok
 }
 
-// AsBool returns the value as a bool (true if enabled in any way)
-func (t *ThinkValue) AsBool() bool {
+// Bool returns the value as a bool (true if enabled in any way)
+func (t *ThinkValue) Bool() bool {
 	if t == nil || t.Value == nil {
 		return false
 	}

@@ -786,8 +786,8 @@ func (t *ThinkValue) AsBool() bool {
 	}
 }
 
-// AsString returns the value as a string
-func (t *ThinkValue) AsString() string {
+// String returns the value as a string
+func (t *ThinkValue) String() string {
 	if t == nil || t.Value == nil {
 		return ""
 	}
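For readability, here is a self-contained Go sketch of the renamed accessors assembled from the two hunks above; the type switches in the method bodies are assumptions, since those lines fall outside the hunks.

```go
package api

// ThinkValue holds the "think" setting, which may be a bool or a string level.
// Sketch only: the real type lives in the api package.
type ThinkValue struct {
	Value any
}

// Bool reports whether thinking is enabled in any way (renamed from AsBool).
func (t *ThinkValue) Bool() bool {
	if t == nil || t.Value == nil {
		return false
	}
	// Assumption: the elided body treats a true bool or a non-empty string
	// level (e.g. "low", "medium", "high") as enabled.
	switch v := t.Value.(type) {
	case bool:
		return v
	case string:
		return v != ""
	default:
		return false
	}
}

// String returns the value as a string (renamed from AsString).
func (t *ThinkValue) String() string {
	if t == nil || t.Value == nil {
		return ""
	}
	// Assumption: non-string values render as an empty string here.
	if s, ok := t.Value.(string); ok {
		return s
	}
	return ""
}
```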
@@ -171,7 +171,8 @@ func (si SystemInfo) GetOptimalThreadCount() int {
 
 // For each GPU, check if it does NOT support flash attention
 func (l GpuInfoList) FlashAttentionSupported() bool {
 	for _, gpu := range l {
-		supportsFA := gpu.Library == "metal" ||
+		supportsFA := gpu.Library == "cpu" ||
+			gpu.Library == "metal" ||
 			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
 			gpu.Library == "rocm"
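A sketch of how the updated check plausibly reads as a whole; only the `supportsFA` expression appears in the hunk, so the stub types and the early return are assumptions.

```go
package discover

// GpuInfo is a trimmed stand-in for the real discover.GpuInfo; only the
// fields referenced in the hunk above are included.
type GpuInfo struct {
	Library     string
	DriverMajor int
}

type GpuInfoList []GpuInfo

// FlashAttentionSupported reports whether every device in the list supports
// flash attention. The loop body mirrors the new hunk (CPU now counts as
// supported); returning false on the first unsupported device is an
// assumption about the part of the function outside the hunk.
func (l GpuInfoList) FlashAttentionSupported() bool {
	for _, gpu := range l {
		supportsFA := gpu.Library == "cpu" ||
			gpu.Library == "metal" ||
			(gpu.Library == "cuda" && gpu.DriverMajor >= 7) ||
			gpu.Library == "rocm"
		if !supportsFA {
			return false
		}
	}
	return true
}
```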
@@ -1593,7 +1593,7 @@ Then there is a series of downloading responses. Until any of the download is co
 
 ```json
 {
-  "status": "downloading digestname",
+  "status": "pulling digestname",
   "digest": "digestname",
   "total": 2142590208,
   "completed": 241970

docs/faq.md | 18
@@ -20,9 +20,9 @@ Please refer to the [GPU docs](./gpu.md).
 
 ## How can I specify the context window size?
 
-By default, Ollama uses a context window size of 4096 tokens.
+By default, Ollama uses a context window size of 4096 tokens for most models. The `gpt-oss` model has a default context window size of 8192 tokens.
 
-This can be overridden with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
+This can be overridden in Settings in the Windows and macOS App, or with the `OLLAMA_CONTEXT_LENGTH` environment variable. For example, to set the default context window to 8K, use:
 
 ```shell
 OLLAMA_CONTEXT_LENGTH=8192 ollama serve

@@ -46,6 +46,8 @@ curl http://localhost:11434/api/generate -d '{
 }'
 ```
 
+Setting the context length higher may cause the model to not fit onto the GPU, which makes the model run more slowly.
+
 ## How can I tell if my model was loaded onto the GPU?
 
 Use the `ollama ps` command to see what models are currently loaded into memory.

@@ -57,8 +59,8 @@ ollama ps
 > **Output**:
 >
 > ```
-> NAME          ID              SIZE     PROCESSOR    UNTIL
-> llama3:70b    bcfb190ca3a7    42 GB    100% GPU     4 minutes from now
+> NAME           ID              SIZE     PROCESSOR    CONTEXT    UNTIL
+> gpt-oss:20b    05afbac4bad6    16 GB    100% GPU     8192       4 minutes from now
 > ```
 
 The `Processor` column will show which memory the model was loaded in to:

@@ -148,9 +150,11 @@ docker build -t ollama-with-ca .
 docker run -d -e HTTPS_PROXY=https://my.proxy.example.com -p 11434:11434 ollama-with-ca
 ```
 
-## Does Ollama send my prompts and answers back to ollama.com?
+## Does Ollama send my prompts and responses back to ollama.com?
 
-No. Ollama runs locally, and conversation data does not leave your machine.
+If you're running a model locally, your prompts and responses will always stay on your machine. Ollama Turbo in the App allows you to run your queries on Ollama's servers if you don't have a powerful enough GPU. Web search lets a model query the web, giving you more accurate and up-to-date information. Both Turbo and web search require sending your prompts and responses to Ollama.com. This data is neither logged nor stored.
+
+If you don't want to see the Turbo and web search options in the app, you can disable them in Settings by turning on Airplane mode. In Airplane mode, all models will run locally, and your prompts and responses will stay on your machine.
 
 ## How can I expose Ollama on my network?

@@ -345,4 +349,4 @@ Ollama for Windows and macOS register as a login item during installation. You
 - Open `Settings` -> `Users & Groups` -> `Login Items` and find the `Ollama` entry, then click the `-` (minus) to remove
 
 **MacOS Ventura (v13) and later**
 - Open `Settings` and search for "Login Items", find the `Ollama` entry under "Allow in the Background", then click the slider to disable.
@@ -28,7 +28,7 @@ import (
 	"github.com/stretchr/testify/require"
 )
 
-const (
+var (
 	smol = "llama3.2:1b"
 )
@@ -37,6 +37,7 @@ var (
 
 	// Note: add newer models at the top of the list to test them first
 	ollamaEngineChatModels = []string{
+		"gpt-oss:20b",
 		"gemma3n:e2b",
 		"mistral-small3.2:latest",
 		"deepseek-r1:1.5b",

@@ -126,6 +127,7 @@ var (
 		"gemma3n",
 		"glm4",
 		"goliath",
+		"gpt-oss:20b",
 		"granite-code",
 		"granite3-dense",
 		"granite3-guardian",

@@ -255,8 +257,13 @@ var (
 	}
 )
 
-func Init() {
+func init() {
 	lifecycle.InitLogging()
+	custom := os.Getenv("OLLAMA_TEST_SMOL_MODEL")
+	if custom != "" {
+		slog.Info("setting smol test model to " + custom)
+		smol = custom
+	}
 }
 
 func FindPort() string {
@@ -7,12 +7,12 @@ This enables matching up devices and information reported by the backend
|
||||
with tools (e.g. nvidia-smi) and system management libraries (e.g. nvml).
|
||||
---
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 39 ++++++++++++++++++++++++++++++++
|
||||
ggml/src/ggml-cuda/ggml-cuda.cu | 67 +++++++++++++++++++++++++++++---
|
||||
ggml/src/ggml-metal/ggml-metal.m | 1 +
|
||||
3 files changed, 41 insertions(+)
|
||||
3 files changed, 63 insertions(+), 6 deletions(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 74e46716..48839339 100644
|
||||
index 74e467163..48839339d 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -152,6 +152,7 @@ extern "C" {
|
||||
@@ -24,10 +24,93 @@ index 74e46716..48839339 100644
|
||||
size_t memory_total;
|
||||
enum ggml_backend_dev_type type;
|
||||
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
index cb0d8528..d6960174 100644
|
||||
index cb0d8528d..1492368de 100644
|
||||
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
|
||||
@@ -2884,6 +2884,7 @@ struct ggml_backend_cuda_device_context {
|
||||
@@ -173,6 +173,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
+static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) {
|
||||
+ char id[64];
|
||||
+
|
||||
+ #if !defined(GGML_USE_HIP)
|
||||
+ snprintf(id, sizeof(id),
|
||||
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
+ (unsigned char)prop.uuid.bytes[0],
|
||||
+ (unsigned char)prop.uuid.bytes[1],
|
||||
+ (unsigned char)prop.uuid.bytes[2],
|
||||
+ (unsigned char)prop.uuid.bytes[3],
|
||||
+ (unsigned char)prop.uuid.bytes[4],
|
||||
+ (unsigned char)prop.uuid.bytes[5],
|
||||
+ (unsigned char)prop.uuid.bytes[6],
|
||||
+ (unsigned char)prop.uuid.bytes[7],
|
||||
+ (unsigned char)prop.uuid.bytes[8],
|
||||
+ (unsigned char)prop.uuid.bytes[9],
|
||||
+ (unsigned char)prop.uuid.bytes[10],
|
||||
+ (unsigned char)prop.uuid.bytes[11],
|
||||
+ (unsigned char)prop.uuid.bytes[12],
|
||||
+ (unsigned char)prop.uuid.bytes[13],
|
||||
+ (unsigned char)prop.uuid.bytes[14],
|
||||
+ (unsigned char)prop.uuid.bytes[15]
|
||||
+ );
|
||||
+ #else
|
||||
+ #ifdef _WIN32
|
||||
+ snprintf(id, sizeof(id), "%d", device_num);
|
||||
+ #else
|
||||
+ try {
|
||||
+ std::string uuid = std::string(prop.uuid.bytes, 16);
|
||||
+
|
||||
+ size_t pos = 0;
|
||||
+ unsigned long long v = stoull(uuid, &pos, 16);
|
||||
+ if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-'))
|
||||
+ throw std::invalid_argument("invalid uuid");
|
||||
+
|
||||
+ snprintf(id, sizeof(id), "GPU-%016llx", v);
|
||||
+ } catch (const std::exception &e) {
|
||||
+ snprintf(id, sizeof(id), "%d", device_num);
|
||||
+ }
|
||||
+ #endif
|
||||
+ #endif
|
||||
+
|
||||
+ return id;
|
||||
+}
|
||||
+
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
@@ -261,22 +306,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
}
|
||||
}
|
||||
- GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
|
||||
+ GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n",
|
||||
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
|
||||
- device_vmm ? "yes" : "no", prop.warpSize);
|
||||
+ device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
#elif defined(GGML_USE_MUSA)
|
||||
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
|
||||
info.devices[id].warp_size = 32;
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
||||
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
||||
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
+ ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
#else
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
- GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
||||
- id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
||||
+ GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||
+ id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
+ ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
@@ -2884,6 +2931,7 @@ struct ggml_backend_cuda_device_context {
|
||||
int device;
|
||||
std::string name;
|
||||
std::string description;
|
||||
@@ -35,7 +118,7 @@ index cb0d8528..d6960174 100644
|
||||
};
|
||||
|
||||
static const char * ggml_backend_cuda_device_get_name(ggml_backend_dev_t dev) {
|
||||
@@ -2896,6 +2897,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||
@@ -2896,6 +2944,11 @@ static const char * ggml_backend_cuda_device_get_description(ggml_backend_dev_t
|
||||
return ctx->description.c_str();
|
||||
}
|
||||
|
||||
@@ -47,7 +130,7 @@ index cb0d8528..d6960174 100644
|
||||
static void ggml_backend_cuda_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
|
||||
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
|
||||
ggml_cuda_set_device(ctx->device);
|
||||
@@ -2910,6 +2916,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
@@ -2910,6 +2963,7 @@ static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend
|
||||
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
|
||||
props->name = ggml_backend_cuda_device_get_name(dev);
|
||||
props->description = ggml_backend_cuda_device_get_description(dev);
|
||||
@@ -55,47 +138,16 @@ index cb0d8528..d6960174 100644
|
||||
props->type = ggml_backend_cuda_device_get_type(dev);
|
||||
ggml_backend_cuda_device_get_memory(dev, &props->memory_free, &props->memory_total);
|
||||
|
||||
@@ -3458,6 +3465,38 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
@@ -3457,6 +3511,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
+ dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
|
||||
|
||||
+ #if !defined(GGML_USE_HIP)
|
||||
+ char id[64];
|
||||
+ snprintf(id, sizeof(id),
|
||||
+ "GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
+ (unsigned char)prop.uuid.bytes[0],
|
||||
+ (unsigned char)prop.uuid.bytes[1],
|
||||
+ (unsigned char)prop.uuid.bytes[2],
|
||||
+ (unsigned char)prop.uuid.bytes[3],
|
||||
+ (unsigned char)prop.uuid.bytes[4],
|
||||
+ (unsigned char)prop.uuid.bytes[5],
|
||||
+ (unsigned char)prop.uuid.bytes[6],
|
||||
+ (unsigned char)prop.uuid.bytes[7],
|
||||
+ (unsigned char)prop.uuid.bytes[8],
|
||||
+ (unsigned char)prop.uuid.bytes[9],
|
||||
+ (unsigned char)prop.uuid.bytes[10],
|
||||
+ (unsigned char)prop.uuid.bytes[11],
|
||||
+ (unsigned char)prop.uuid.bytes[12],
|
||||
+ (unsigned char)prop.uuid.bytes[13],
|
||||
+ (unsigned char)prop.uuid.bytes[14],
|
||||
+ (unsigned char)prop.uuid.bytes[15]
|
||||
+ );
|
||||
+ dev_ctx->id = id;
|
||||
+ #else
|
||||
+ #ifdef _WIN32
|
||||
+ char id[16];
|
||||
+ snprintf(id, sizeof(id), "%d", i);
|
||||
+ dev_ctx->id = id;
|
||||
+ #else
|
||||
+ dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
|
||||
+ #endif
|
||||
+ #endif
|
||||
+
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
/* .reg = */ ®,
|
||||
diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m
|
||||
index 1b56f858..a9eeebc6 100644
|
||||
index 1b56f858c..a9eeebc6a 100644
|
||||
--- a/ggml/src/ggml-metal/ggml-metal.m
|
||||
+++ b/ggml/src/ggml-metal/ggml-metal.m
|
||||
@@ -5703,6 +5703,7 @@ static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backen
|
||||
|
||||
llama/patches/0026-ggml-No-alloc-mode.patch | 99 (new file)
@@ -0,0 +1,99 @@
|
||||
From 0000000000000000000000000000000000000000 Mon Sep 17 00:00:00 2001
|
||||
From: Jesse Gross <jesse@ollama.com>
|
||||
Date: Wed, 23 Jul 2025 11:58:49 -0700
|
||||
Subject: [PATCH] ggml: No-alloc mode
|
||||
|
||||
Callers can set a backend buffer type to be no-alloc, meaning that
|
||||
it does not allocate memory for tensors or operations. This can
|
||||
be used for calculating memory requirements. Tensors and graphs
|
||||
must be recreated with no-alloc set to false before loading data.
|
||||
|
||||
Defaults to false for newly created backend buffer types.
|
||||
---
|
||||
ggml/include/ggml-backend.h | 1 +
|
||||
ggml/src/ggml-backend-impl.h | 2 ++
|
||||
ggml/src/ggml-backend.cpp | 19 ++++++++++++++++++-
|
||||
3 files changed, 21 insertions(+), 1 deletion(-)
|
||||
|
||||
diff --git a/ggml/include/ggml-backend.h b/ggml/include/ggml-backend.h
|
||||
index 48839339..3903c3cb 100644
|
||||
--- a/ggml/include/ggml-backend.h
|
||||
+++ b/ggml/include/ggml-backend.h
|
||||
@@ -35,6 +35,7 @@ extern "C" {
|
||||
//
|
||||
|
||||
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
||||
+ GGML_API void ggml_backend_buft_set_alloc (ggml_backend_buffer_type_t buft, bool alloc);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||
diff --git a/ggml/src/ggml-backend-impl.h b/ggml/src/ggml-backend-impl.h
|
||||
index c36c12d6..81749a5a 100644
|
||||
--- a/ggml/src/ggml-backend-impl.h
|
||||
+++ b/ggml/src/ggml-backend-impl.h
|
||||
@@ -32,6 +32,7 @@ extern "C" {
|
||||
struct ggml_backend_buffer_type_i iface;
|
||||
ggml_backend_dev_t device;
|
||||
void * context;
|
||||
+ bool no_alloc;
|
||||
};
|
||||
|
||||
//
|
||||
@@ -63,6 +64,7 @@ extern "C" {
|
||||
void * context;
|
||||
size_t size;
|
||||
enum ggml_backend_buffer_usage usage;
|
||||
+ bool no_alloc;
|
||||
};
|
||||
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
diff --git a/ggml/src/ggml-backend.cpp b/ggml/src/ggml-backend.cpp
|
||||
index be335e8c..84928bc3 100644
|
||||
--- a/ggml/src/ggml-backend.cpp
|
||||
+++ b/ggml/src/ggml-backend.cpp
|
||||
@@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name(buft);
|
||||
}
|
||||
|
||||
+void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) {
|
||||
+ buft->no_alloc = !alloc;
|
||||
+}
|
||||
+
|
||||
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
if (size == 0) {
|
||||
// return a dummy buffer for zero-sized allocations
|
||||
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
||||
}
|
||||
|
||||
+ if (buft->no_alloc) {
|
||||
+ ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size);
|
||||
+ buf->no_alloc = true;
|
||||
+ return buf;
|
||||
+ }
|
||||
+
|
||||
return buft->iface.alloc_buffer(buft, size);
|
||||
}
|
||||
|
||||
@@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
/* .buft = */ buft,
|
||||
/* .context = */ context,
|
||||
/* .size = */ size,
|
||||
- /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
|
||||
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY,
|
||||
+ /* .no_alloc = */ false
|
||||
};
|
||||
|
||||
return buffer;
|
||||
@@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
+ // If we aren't allocating memory, return a placeholder non-NULL pointer
|
||||
+ // that meets alignment requirements
|
||||
+ if (buffer->no_alloc) {
|
||||
+ return (void *)ggml_backend_buffer_get_alignment(buffer);
|
||||
+ }
|
||||
+
|
||||
void * base = buffer->iface.get_base(buffer);
|
||||
|
||||
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
||||
@@ -15,6 +15,9 @@ import (
|
||||
)
|
||||
|
||||
type Backend interface {
|
||||
// Close frees all memory associated with this backend
|
||||
Close()
|
||||
|
||||
Load(ctx context.Context, progress func(float32)) error
|
||||
|
||||
// BackendMemory returns the memory allocations that were made for this model
|
||||
|
||||
@@ -19,6 +19,7 @@ import (
|
||||
"slices"
|
||||
"strconv"
|
||||
"strings"
|
||||
"sync"
|
||||
"sync/atomic"
|
||||
"unicode"
|
||||
"unsafe"
|
||||
@@ -33,15 +34,33 @@ import (
|
||||
"golang.org/x/sync/errgroup"
|
||||
)
|
||||
|
||||
func devices() []*C.struct_ggml_backend_device {
|
||||
ggml.OnceLoad()
|
||||
ds := make([]*C.struct_ggml_backend_device, C.ggml_backend_dev_count())
|
||||
for i := range ds {
|
||||
ds[i] = C.ggml_backend_dev_get(C.size_t(i))
|
||||
}
|
||||
var (
|
||||
cpus, accels, gpus []C.ggml_backend_dev_t
|
||||
backends map[C.ggml_backend_dev_t]C.ggml_backend_t
|
||||
)
|
||||
|
||||
return ds
|
||||
}
|
||||
var initDevices = sync.OnceFunc(func() {
|
||||
ggml.OnceLoad()
|
||||
|
||||
backends = make(map[C.ggml_backend_dev_t]C.ggml_backend_t)
|
||||
for i := range C.ggml_backend_dev_count() {
|
||||
d := C.ggml_backend_dev_get(i)
|
||||
|
||||
switch C.ggml_backend_dev_type(d) {
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
if len(cpus) == 0 {
|
||||
// only the first cpu device should be used
|
||||
cpus = append(cpus, d)
|
||||
}
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
accels = append(accels, d)
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
gpus = append(gpus, d)
|
||||
}
|
||||
|
||||
backends[d] = C.ggml_backend_dev_init(d, nil)
|
||||
}
|
||||
})
|
||||
|
||||
type Backend struct {
|
||||
// modelPath is the location of the model data
|
||||
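The rewritten device discovery above wraps initialization in `sync.OnceFunc` so it runs at most once even under concurrent callers; a small standalone illustration of that standard-library helper:

```go
package main

import (
	"fmt"
	"sync"
)

func main() {
	// sync.OnceFunc (Go 1.21+) returns a function that invokes its argument
	// only on the first call; later calls are no-ops. initDevices in the diff
	// above uses the same pattern to enumerate devices and create backends once.
	initDevices := sync.OnceFunc(func() {
		fmt.Println("enumerating devices")
	})

	var wg sync.WaitGroup
	for i := 0; i < 3; i++ {
		wg.Add(1)
		go func() {
			defer wg.Done()
			initDevices() // prints only once across all goroutines
		}()
	}
	wg.Wait()
}
```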
@@ -53,28 +72,31 @@ type Backend struct {
|
||||
// to the name that is used by the model definition
|
||||
tensorLoadTargets map[string][]string
|
||||
|
||||
sched *C.struct_ggml_backend_sched
|
||||
schedBackends []*C.struct_ggml_backend
|
||||
schedBufts []*C.struct_ggml_backend_buffer_type
|
||||
sched C.ggml_backend_sched_t
|
||||
schedBackends []C.ggml_backend_t
|
||||
schedBufts []C.ggml_backend_buffer_type_t
|
||||
|
||||
tensors map[string]*C.struct_ggml_tensor
|
||||
|
||||
// input is the backend used for inputs
|
||||
input *C.struct_ggml_backend_buffer_type
|
||||
input C.ggml_backend_buffer_type_t
|
||||
|
||||
// layers is the backend used for repeating layers
|
||||
layers map[int]*C.struct_ggml_backend_buffer_type
|
||||
layers map[int]C.ggml_backend_buffer_type_t
|
||||
|
||||
// requiredMemory is the cumulative memory allocations needed by the backend
|
||||
requiredMemory *ml.BackendMemory
|
||||
|
||||
// btDeviceMemory maps from a buffer type to the memory allocations associated with that device
|
||||
btDeviceMemory map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory
|
||||
btDeviceMemory map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory
|
||||
|
||||
flashAttention bool
|
||||
|
||||
// maxGraphNodes is the maximum allowed number of graph nodes in this scheduler
|
||||
maxGraphNodes int
|
||||
|
||||
// weightBuffers are the GGML contexts and buffers for allocating weights
|
||||
weightBuffers map[*C.struct_ggml_context]C.ggml_backend_buffer_t
|
||||
}
|
||||
|
||||
func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
@@ -99,27 +121,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
"num_key_values", len(meta.KV()),
|
||||
)
|
||||
|
||||
initDevices()
|
||||
|
||||
var requiredMemory ml.BackendMemory
|
||||
btDeviceMemory := make(map[*C.struct_ggml_backend_buffer_type]*ml.DeviceMemory)
|
||||
btDeviceMemory := make(map[C.ggml_backend_buffer_type_t]*ml.DeviceMemory)
|
||||
|
||||
type deviceBufferType struct {
|
||||
d *C.struct_ggml_backend_device
|
||||
bts []*C.struct_ggml_backend_buffer_type
|
||||
}
|
||||
|
||||
var cpus, accels, gpus []*C.struct_ggml_backend_device
|
||||
for _, d := range devices() {
|
||||
switch C.ggml_backend_dev_type(d) {
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
if len(cpus) == 0 {
|
||||
// only the first cpu device should be used
|
||||
cpus = append(cpus, d)
|
||||
}
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_ACCEL:
|
||||
accels = append(accels, d)
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_GPU:
|
||||
gpus = append(gpus, d)
|
||||
}
|
||||
d C.ggml_backend_dev_t
|
||||
bts []C.ggml_backend_buffer_type_t
|
||||
}
|
||||
|
||||
blocks := int(meta.KV().BlockCount())
|
||||
@@ -149,7 +158,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
bt := C.ggml_backend_dev_buffer_type(d)
|
||||
gpuDeviceBufferTypes = append(gpuDeviceBufferTypes, deviceBufferType{
|
||||
d: d,
|
||||
bts: append([]*C.struct_ggml_backend_buffer_type{bt}, cpuDeviceBufferType.bts...),
|
||||
bts: append([]C.ggml_backend_buffer_type_t{bt}, cpuDeviceBufferType.bts...),
|
||||
})
|
||||
btDeviceMemory[bt] = &requiredMemory.GPUs[i]
|
||||
requiredMemory.GPUs[i].Name = C.GoString(C.ggml_backend_dev_name(d))
|
||||
@@ -235,16 +244,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
targets := make(map[string][]string)
|
||||
|
||||
// contexts are shared by tensors of the same buffer type
|
||||
ctxs := make(map[*C.struct_ggml_backend_buffer_type]*C.struct_ggml_context)
|
||||
createTensor := func(t tensor, bts []*C.struct_ggml_backend_buffer_type, layer int) *C.struct_ggml_tensor {
|
||||
ctxs := make(map[C.ggml_backend_buffer_type_t]*C.struct_ggml_context)
|
||||
createTensor := func(t tensor, bts []C.ggml_backend_buffer_type_t, layer int) *C.struct_ggml_tensor {
|
||||
for _, bt := range bts {
|
||||
if _, ok := ctxs[bt]; !ok {
|
||||
// slog.Info("XXX before ggml_init")
|
||||
ctxs[bt] = C.ggml_init(C.struct_ggml_init_params{
|
||||
mem_size: C.ggml_tensor_overhead() * C.size_t(maxTensors),
|
||||
no_alloc: true,
|
||||
})
|
||||
// slog.Info("XXX after ggml_init")
|
||||
}
|
||||
|
||||
targets[t.source.Name] = append(targets[t.source.Name], t.target)
|
||||
@@ -332,7 +339,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
}
|
||||
|
||||
// allocate buffers for each context
|
||||
bbs := make(map[*C.struct_ggml_context]*C.struct_ggml_backend_buffer, len(ctxs))
|
||||
bbs := make(map[*C.struct_ggml_context]C.ggml_backend_buffer_t, len(ctxs))
|
||||
for bt, c := range ctxs {
|
||||
if C.ggml_get_first_tensor(c) == nil {
|
||||
continue
|
||||
@@ -350,6 +357,14 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
}
|
||||
|
||||
if b == nil {
|
||||
for _, b := range bbs {
|
||||
C.ggml_backend_buffer_free(b)
|
||||
}
|
||||
|
||||
for _, ctx := range ctxs {
|
||||
C.ggml_free(ctx)
|
||||
}
|
||||
|
||||
panic(ml.ErrNoMem{BackendMemory: requiredMemory})
|
||||
}
|
||||
|
||||
@@ -390,13 +405,13 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
}
|
||||
|
||||
// map devices to backend buffer types so new tensors can be assigned to the correct device
|
||||
deviceBufferTypes := make(map[*C.struct_ggml_backend_device]*C.struct_ggml_backend_buffer_type)
|
||||
deviceBufferTypes := make(map[C.ggml_backend_dev_t]C.ggml_backend_buffer_type_t)
|
||||
|
||||
// create backends and buffer types used for the compute graph scheduler
|
||||
var schedBackends []*C.struct_ggml_backend
|
||||
var schedBufts []*C.struct_ggml_backend_buffer_type
|
||||
var schedBackends []C.ggml_backend_t
|
||||
var schedBufts []C.ggml_backend_buffer_type_t
|
||||
for _, d := range append(gpus, append(accels, cpus...)...) {
|
||||
b := C.ggml_backend_dev_init(d, nil)
|
||||
b := backends[d]
|
||||
bt := C.ggml_backend_get_default_buffer_type(b)
|
||||
|
||||
deviceBufferTypes[d] = bt
|
||||
@@ -428,8 +443,8 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
schedBackends: schedBackends,
|
||||
schedBufts: schedBufts,
|
||||
input: deviceBufferTypes[input.d],
|
||||
layers: func() map[int]*C.struct_ggml_backend_buffer_type {
|
||||
m := make(map[int]*C.struct_ggml_backend_buffer_type)
|
||||
layers: func() map[int]C.ggml_backend_buffer_type_t {
|
||||
m := make(map[int]C.ggml_backend_buffer_type_t)
|
||||
for i, layer := range layers {
|
||||
m[i] = deviceBufferTypes[layer.d]
|
||||
}
|
||||
@@ -438,6 +453,7 @@ func New(modelPath string, params ml.BackendParams) (ml.Backend, error) {
|
||||
requiredMemory: &requiredMemory,
|
||||
btDeviceMemory: btDeviceMemory,
|
||||
maxGraphNodes: maxGraphNodes,
|
||||
weightBuffers: bbs,
|
||||
}, nil
|
||||
}
|
||||
|
||||
@@ -445,6 +461,19 @@ func init() {
|
||||
ml.RegisterBackend("ggml", New)
|
||||
}
|
||||
|
||||
func (b *Backend) Close() {
|
||||
if b == nil {
|
||||
return
|
||||
}
|
||||
|
||||
for ctx, b := range b.weightBuffers {
|
||||
C.ggml_backend_buffer_free(b)
|
||||
C.ggml_free(ctx)
|
||||
}
|
||||
|
||||
C.ggml_backend_sched_free(b.sched)
|
||||
}
|
||||
|
||||
func (b *Backend) Load(ctx context.Context, progress func(float32)) error {
|
||||
var doneBytes atomic.Uint64
|
||||
totalBytes := uint64(b.meta.Length) - b.meta.Tensors().Offset
|
||||
@@ -541,10 +570,8 @@ func (b *Backend) NewContextSize(n int) ml.Context {
|
||||
panic(fmt.Errorf("requested number of graph nodes (%v) for new context exceeds maximum (%v)", n, b.maxGraphNodes))
|
||||
}
|
||||
|
||||
var allocatedBuffers []*C.struct_ggml_backend_buffer
|
||||
var allocatedBuffers []C.ggml_backend_buffer_t
|
||||
|
||||
// slog.Info("XXX before ggml_init")
|
||||
// defer slog.Info("XXX after ggml_init")
|
||||
return &Context{
|
||||
b: b,
|
||||
maxGraphNodes: n,
|
||||
@@ -572,11 +599,11 @@ type Context struct {
|
||||
graph *C.struct_ggml_cgraph
|
||||
|
||||
// buft is the buffer type used for new tensors
|
||||
buft *C.struct_ggml_backend_buffer_type
|
||||
buft C.ggml_backend_buffer_type_t
|
||||
|
||||
// allocatedBuffers are buffers for tensors that we have allocated in this context
|
||||
// so that we can free them when we close the context
|
||||
allocatedBuffers *[]*C.struct_ggml_backend_buffer
|
||||
allocatedBuffers *[]C.ggml_backend_buffer_t
|
||||
|
||||
// maxGraphNodes is the maximum allowed number of graph nodes in this context
|
||||
maxGraphNodes int
|
||||
@@ -1407,55 +1434,3 @@ func (c Context) FromBytes(dtype ml.DType, s []uint8, shape ...int) ml.Tensor {
|
||||
|
||||
return t
|
||||
}
|
||||
|
||||
// TODO - DRY this out with New if possible
|
||||
func newTestBackend(size int) *Backend {
|
||||
var cpus []*C.struct_ggml_backend_device
|
||||
for _, d := range devices() {
|
||||
switch C.ggml_backend_dev_type(d) {
|
||||
case C.GGML_BACKEND_DEVICE_TYPE_CPU:
|
||||
if len(cpus) == 0 {
|
||||
// only the first cpu device should be used
|
||||
cpus = append(cpus, d)
|
||||
break
|
||||
}
|
||||
}
|
||||
}
|
||||
var schedBackends []*C.struct_ggml_backend
|
||||
var schedBufts []*C.struct_ggml_backend_buffer_type
|
||||
b := C.ggml_backend_dev_init(cpus[0], nil)
|
||||
bt := C.ggml_backend_get_default_buffer_type(b)
|
||||
C.ggml_backend_cpu_set_n_threads(b, C.int(Threads(runtime.NumCPU())))
|
||||
// C.ggml_backend_cpu_set_n_threads(b, 1) // DEBUGGING
|
||||
schedBackends = append(schedBackends, b)
|
||||
schedBufts = append(schedBufts, bt)
|
||||
return &Backend{
|
||||
meta: nil,
|
||||
sched: C.ggml_backend_sched_new(
|
||||
(*C.ggml_backend_t)(unsafe.Pointer(&schedBackends[0])),
|
||||
(*C.ggml_backend_buffer_type_t)(unsafe.Pointer(&schedBufts[0])),
|
||||
C.int(len(schedBackends)),
|
||||
C.size_t(max(8192, size)),
|
||||
false,
|
||||
false,
|
||||
),
|
||||
input: bt,
|
||||
maxGraphNodes: max(8192, size),
|
||||
schedBackends: schedBackends,
|
||||
schedBufts: schedBufts,
|
||||
}
|
||||
}
|
||||
|
||||
func newTestContext(b *Backend, n int) *Context {
|
||||
n = max(8192, n)
|
||||
// slog.Info("XXX before ggml_init")
|
||||
// defer slog.Info("XXX after ggml_init")
|
||||
return &Context{
|
||||
b: b,
|
||||
maxGraphNodes: n,
|
||||
ctx: C.ggml_init(C.struct_ggml_init_params{
|
||||
mem_size: C.size_t(n)*C.ggml_tensor_overhead() + C.ggml_graph_overhead_custom(C.size_t(n), false),
|
||||
no_alloc: true,
|
||||
}),
|
||||
}
|
||||
}
|
||||
|
||||
ml/backend/ggml/ggml/include/ggml-backend.h | 1 (vendored)
@@ -35,6 +35,7 @@ extern "C" {
|
||||
//
|
||||
|
||||
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
|
||||
GGML_API void ggml_backend_buft_set_alloc (ggml_backend_buffer_type_t buft, bool alloc);
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
|
||||
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
|
||||
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
|
||||
|
||||
ml/backend/ggml/ggml/src/ggml-backend-impl.h | 2 (vendored)
@@ -32,6 +32,7 @@ extern "C" {
|
||||
struct ggml_backend_buffer_type_i iface;
|
||||
ggml_backend_dev_t device;
|
||||
void * context;
|
||||
bool no_alloc;
|
||||
};
|
||||
|
||||
//
|
||||
@@ -63,6 +64,7 @@ extern "C" {
|
||||
void * context;
|
||||
size_t size;
|
||||
enum ggml_backend_buffer_usage usage;
|
||||
bool no_alloc;
|
||||
};
|
||||
|
||||
GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
|
||||
ml/backend/ggml/ggml/src/ggml-backend.cpp | 19 (vendored)
@@ -35,12 +35,22 @@ const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
|
||||
return buft->iface.get_name(buft);
|
||||
}
|
||||
|
||||
void ggml_backend_buft_set_alloc(ggml_backend_buffer_type_t buft, bool alloc) {
|
||||
buft->no_alloc = !alloc;
|
||||
}
|
||||
|
||||
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
|
||||
if (size == 0) {
|
||||
// return a dummy buffer for zero-sized allocations
|
||||
return ggml_backend_buffer_init(buft, {}, NULL, 0);
|
||||
}
|
||||
|
||||
if (buft->no_alloc) {
|
||||
ggml_backend_buffer_t buf = ggml_backend_buffer_init(buft, {}, NULL, size);
|
||||
buf->no_alloc = true;
|
||||
return buf;
|
||||
}
|
||||
|
||||
return buft->iface.alloc_buffer(buft, size);
|
||||
}
|
||||
|
||||
@@ -89,7 +99,8 @@ ggml_backend_buffer_t ggml_backend_buffer_init(
|
||||
/* .buft = */ buft,
|
||||
/* .context = */ context,
|
||||
/* .size = */ size,
|
||||
/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
|
||||
/* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY,
|
||||
/* .no_alloc = */ false
|
||||
};
|
||||
|
||||
return buffer;
|
||||
@@ -119,6 +130,12 @@ void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// If we aren't allocating memory, return a placeholder non-NULL pointer
|
||||
// that meets alignment requirements
|
||||
if (buffer->no_alloc) {
|
||||
return (void *)ggml_backend_buffer_get_alignment(buffer);
|
||||
}
|
||||
|
||||
void * base = buffer->iface.get_base(buffer);
|
||||
|
||||
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
|
||||
|
||||
ml/backend/ggml/ggml/src/ggml-cuda/ggml-cuda.cu | 92 (vendored)
@@ -175,6 +175,51 @@ static int ggml_cuda_parse_id(char devName[]) {
|
||||
}
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
|
||||
static std::string ggml_cuda_parse_uuid(cudaDeviceProp prop, int device_num) {
|
||||
char id[64];
|
||||
|
||||
#if !defined(GGML_USE_HIP)
|
||||
snprintf(id, sizeof(id),
|
||||
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
(unsigned char)prop.uuid.bytes[0],
|
||||
(unsigned char)prop.uuid.bytes[1],
|
||||
(unsigned char)prop.uuid.bytes[2],
|
||||
(unsigned char)prop.uuid.bytes[3],
|
||||
(unsigned char)prop.uuid.bytes[4],
|
||||
(unsigned char)prop.uuid.bytes[5],
|
||||
(unsigned char)prop.uuid.bytes[6],
|
||||
(unsigned char)prop.uuid.bytes[7],
|
||||
(unsigned char)prop.uuid.bytes[8],
|
||||
(unsigned char)prop.uuid.bytes[9],
|
||||
(unsigned char)prop.uuid.bytes[10],
|
||||
(unsigned char)prop.uuid.bytes[11],
|
||||
(unsigned char)prop.uuid.bytes[12],
|
||||
(unsigned char)prop.uuid.bytes[13],
|
||||
(unsigned char)prop.uuid.bytes[14],
|
||||
(unsigned char)prop.uuid.bytes[15]
|
||||
);
|
||||
#else
|
||||
#ifdef _WIN32
|
||||
snprintf(id, sizeof(id), "%d", device_num);
|
||||
#else
|
||||
try {
|
||||
std::string uuid = std::string(prop.uuid.bytes, 16);
|
||||
|
||||
size_t pos = 0;
|
||||
unsigned long long v = stoull(uuid, &pos, 16);
|
||||
if (v == 0 || pos != uuid.size() || (!uuid.empty() && uuid[0] == '-'))
|
||||
throw std::invalid_argument("invalid uuid");
|
||||
|
||||
snprintf(id, sizeof(id), "GPU-%016llx", v);
|
||||
} catch (const std::exception &e) {
|
||||
snprintf(id, sizeof(id), "%d", device_num);
|
||||
}
|
||||
#endif
|
||||
#endif
|
||||
|
||||
return id;
|
||||
}
|
||||
|
||||
static ggml_cuda_device_info ggml_cuda_init() {
|
||||
#ifdef __HIP_PLATFORM_AMD__
|
||||
// Workaround for a rocBLAS bug when using multiple graphics cards:
|
||||
@@ -263,22 +308,24 @@ static ggml_cuda_device_info ggml_cuda_init() {
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
}
|
||||
}
|
||||
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d\n",
|
||||
GGML_LOG_INFO(" Device %d: %s, %s (0x%x), VMM: %s, Wave Size: %d, ID: %s\n",
|
||||
id, prop.name, prop.gcnArchName, info.devices[id].cc & 0xffff,
|
||||
device_vmm ? "yes" : "no", prop.warpSize);
|
||||
device_vmm ? "yes" : "no", prop.warpSize, ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
#elif defined(GGML_USE_MUSA)
|
||||
// FIXME: Ensure compatibility with varying warp sizes across different MUSA archs.
|
||||
info.devices[id].warp_size = 32;
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = GGML_CUDA_CC_OFFSET_MTHREADS + prop.major * 0x100;
|
||||
info.devices[id].cc += prop.minor * 0x10;
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
#else
|
||||
info.devices[id].smpbo = prop.sharedMemPerBlockOptin;
|
||||
info.devices[id].cc = 100*prop.major + 10*prop.minor;
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
|
||||
GGML_LOG_INFO(" Device %d: %s, compute capability %d.%d, VMM: %s, ID: %s\n",
|
||||
id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no",
|
||||
ggml_cuda_parse_uuid(prop, id).c_str());
|
||||
#endif // defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
|
||||
}
|
||||
|
||||
@@ -3475,38 +3522,7 @@ ggml_backend_reg_t ggml_backend_cuda_reg() {
|
||||
cudaDeviceProp prop;
|
||||
CUDA_CHECK(cudaGetDeviceProperties(&prop, i));
|
||||
dev_ctx->description = prop.name;
|
||||
|
||||
#if !defined(GGML_USE_HIP)
|
||||
char id[64];
|
||||
snprintf(id, sizeof(id),
|
||||
"GPU-%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-%02x%02x%02x%02x%02x%02x",
|
||||
(unsigned char)prop.uuid.bytes[0],
|
||||
(unsigned char)prop.uuid.bytes[1],
|
||||
(unsigned char)prop.uuid.bytes[2],
|
||||
(unsigned char)prop.uuid.bytes[3],
|
||||
(unsigned char)prop.uuid.bytes[4],
|
||||
(unsigned char)prop.uuid.bytes[5],
|
||||
(unsigned char)prop.uuid.bytes[6],
|
||||
(unsigned char)prop.uuid.bytes[7],
|
||||
(unsigned char)prop.uuid.bytes[8],
|
||||
(unsigned char)prop.uuid.bytes[9],
|
||||
(unsigned char)prop.uuid.bytes[10],
|
||||
(unsigned char)prop.uuid.bytes[11],
|
||||
(unsigned char)prop.uuid.bytes[12],
|
||||
(unsigned char)prop.uuid.bytes[13],
|
||||
(unsigned char)prop.uuid.bytes[14],
|
||||
(unsigned char)prop.uuid.bytes[15]
|
||||
);
|
||||
dev_ctx->id = id;
|
||||
#else
|
||||
#ifdef _WIN32
|
||||
char id[16];
|
||||
snprintf(id, sizeof(id), "%d", i);
|
||||
dev_ctx->id = id;
|
||||
#else
|
||||
dev_ctx->id = "GPU-" + std::string(prop.uuid.bytes, 16);
|
||||
#endif
|
||||
#endif
|
||||
dev_ctx->id = ggml_cuda_parse_uuid(prop, i);
|
||||
|
||||
ggml_backend_dev_t dev = new ggml_backend_device {
|
||||
/* .iface = */ ggml_backend_cuda_device_interface,
|
||||
|
||||
@@ -34,10 +34,12 @@ type ErrorResponse struct {
|
||||
}
|
||||
|
||||
type Message struct {
|
||||
Role string `json:"role"`
|
||||
Content any `json:"content"`
|
||||
Reasoning string `json:"reasoning,omitempty"`
|
||||
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
|
||||
Role string `json:"role"`
|
||||
Content any `json:"content"`
|
||||
Reasoning string `json:"reasoning,omitempty"`
|
||||
ToolCalls []ToolCall `json:"tool_calls,omitempty"`
|
||||
Name string `json:"name,omitempty"`
|
||||
ToolCallID string `json:"tool_call_id,omitempty"`
|
||||
}
|
||||
|
||||
type Choice struct {
|
||||
@@ -101,6 +103,7 @@ type ChatCompletionRequest struct {
|
||||
ResponseFormat *ResponseFormat `json:"response_format"`
|
||||
Tools []api.Tool `json:"tools"`
|
||||
Reasoning *Reasoning `json:"reasoning,omitempty"`
|
||||
ReasoningEffort *string `json:"reasoning_effort,omitempty"`
|
||||
}
|
||||
|
||||
type ChatCompletion struct {
|
||||
@@ -401,9 +404,20 @@ func toModel(r api.ShowResponse, m string) Model {
|
||||
func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||
var messages []api.Message
|
||||
for _, msg := range r.Messages {
|
||||
toolName := ""
|
||||
if strings.ToLower(msg.Role) == "tool" {
|
||||
toolName = msg.Name
|
||||
if toolName == "" && msg.ToolCallID != "" {
|
||||
toolName = nameFromToolCallID(r.Messages, msg.ToolCallID)
|
||||
}
|
||||
}
|
||||
switch content := msg.Content.(type) {
|
||||
case string:
|
||||
messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning})
|
||||
toolCalls, err := fromCompletionToolCall(msg.ToolCalls)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
messages = append(messages, api.Message{Role: msg.Role, Content: content, Thinking: msg.Reasoning, ToolCalls: toolCalls, ToolName: toolName})
|
||||
case []any:
|
||||
for _, c := range content {
|
||||
data, ok := c.(map[string]any)
|
||||
@@ -454,7 +468,21 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||
return nil, errors.New("invalid message format")
|
||||
}
|
||||
}
|
||||
// since we might have added multiple messages above, if we have tools
|
||||
// calls we'll add them to the last message
|
||||
if len(messages) > 0 && len(msg.ToolCalls) > 0 {
|
||||
toolCalls, err := fromCompletionToolCall(msg.ToolCalls)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
messages[len(messages)-1].ToolCalls = toolCalls
|
||||
if toolName != "" {
|
||||
messages[len(messages)-1].ToolName = toolName
|
||||
}
|
||||
messages[len(messages)-1].Thinking = msg.Reasoning
|
||||
}
|
||||
default:
|
||||
// content is only optional if tool calls are present
|
||||
if msg.ToolCalls == nil {
|
||||
return nil, fmt.Errorf("invalid message content type: %T", content)
|
||||
}
|
||||
@@ -467,7 +495,7 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||
return nil, errors.New("invalid tool call arguments")
|
||||
}
|
||||
}
|
||||
messages = append(messages, api.Message{Role: msg.Role, ToolCalls: toolCalls})
|
||||
messages = append(messages, api.Message{Role: msg.Role, Thinking: msg.Reasoning, ToolCalls: toolCalls})
|
||||
}
|
||||
}
|
||||
|
||||
@@ -514,10 +542,6 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||
options["top_p"] = 1.0
|
||||
}
|
||||
|
||||
if r.Reasoning != nil {
|
||||
options["reasoning"] = *r.Reasoning.Effort
|
||||
}
|
||||
|
||||
var format json.RawMessage
|
||||
if r.ResponseFormat != nil {
|
||||
switch strings.ToLower(strings.TrimSpace(r.ResponseFormat.Type)) {
|
||||
@@ -533,9 +557,15 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||
|
||||
var think *api.ThinkValue
|
||||
if r.Reasoning != nil {
|
||||
options["reasoning"] = *r.Reasoning.Effort
|
||||
think = &api.ThinkValue{
|
||||
Value: *r.Reasoning.Effort,
|
||||
}
|
||||
} else if r.ReasoningEffort != nil {
|
||||
options["reasoning"] = *r.ReasoningEffort
|
||||
think = &api.ThinkValue{
|
||||
Value: *r.ReasoningEffort,
|
||||
}
|
||||
}
|
||||
|
||||
return &api.ChatRequest{
|
||||
@@ -549,6 +579,33 @@ func fromChatRequest(r ChatCompletionRequest) (*api.ChatRequest, error) {
|
||||
}, nil
|
||||
}
|
||||
|
||||
func nameFromToolCallID(messages []Message, toolCallID string) string {
|
||||
// iterate backwards to be more resilient to duplicate tool call IDs (this
|
||||
// follows "last one wins")
|
||||
for i := len(messages) - 1; i >= 0; i-- {
|
||||
msg := messages[i]
|
||||
for _, tc := range msg.ToolCalls {
|
||||
if tc.ID == toolCallID {
|
||||
return tc.Function.Name
|
||||
}
|
||||
}
|
||||
}
|
||||
return ""
|
||||
}
|
||||
|
||||
func fromCompletionToolCall(toolCalls []ToolCall) ([]api.ToolCall, error) {
|
||||
apiToolCalls := make([]api.ToolCall, len(toolCalls))
|
||||
for i, tc := range toolCalls {
|
||||
apiToolCalls[i].Function.Name = tc.Function.Name
|
||||
err := json.Unmarshal([]byte(tc.Function.Arguments), &apiToolCalls[i].Function.Arguments)
|
||||
if err != nil {
|
||||
return nil, errors.New("invalid tool call arguments")
|
||||
}
|
||||
}
|
||||
|
||||
return apiToolCalls, nil
|
||||
}
|
||||
|
||||
func fromCompleteRequest(r CompletionRequest) (api.GenerateRequest, error) {
|
||||
options := make(map[string]any)
|
||||
|
||||
|
||||
@@ -235,6 +235,210 @@ func TestChatMiddleware(t *testing.T) {
|
||||
Stream: &False,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "chat handler with tools and content",
|
||||
body: `{
|
||||
"model": "test-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the weather like in Paris Today?"},
|
||||
{"role": "assistant", "content": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
|
||||
]
|
||||
}`,
|
||||
req: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "What's the weather like in Paris Today?",
|
||||
},
|
||||
{
|
||||
Role: "assistant",
|
||||
Content: "Let's see what the weather is like in Paris",
|
||||
ToolCalls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "get_current_weather",
|
||||
Arguments: map[string]any{
|
||||
"location": "Paris, France",
|
||||
"format": "celsius",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Options: map[string]any{
|
||||
"temperature": 1.0,
|
||||
"top_p": 1.0,
|
||||
},
|
||||
Stream: &False,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "chat handler with tools and empty content",
|
||||
body: `{
|
||||
"model": "test-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the weather like in Paris Today?"},
|
||||
{"role": "assistant", "content": "", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
|
||||
]
|
||||
}`,
|
||||
req: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "What's the weather like in Paris Today?",
|
||||
},
|
||||
{
|
||||
Role: "assistant",
|
||||
ToolCalls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "get_current_weather",
|
||||
Arguments: map[string]any{
|
||||
"location": "Paris, France",
|
||||
"format": "celsius",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Options: map[string]any{
|
||||
"temperature": 1.0,
|
||||
"top_p": 1.0,
|
||||
},
|
||||
Stream: &False,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "chat handler with tools and thinking content",
|
||||
body: `{
|
||||
"model": "test-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the weather like in Paris Today?"},
|
||||
{"role": "assistant", "reasoning": "Let's see what the weather is like in Paris", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]}
|
||||
]
|
||||
}`,
|
||||
req: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "What's the weather like in Paris Today?",
|
||||
},
|
||||
{
|
||||
Role: "assistant",
|
||||
Thinking: "Let's see what the weather is like in Paris",
|
||||
ToolCalls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "get_current_weather",
|
||||
Arguments: map[string]any{
|
||||
"location": "Paris, France",
|
||||
"format": "celsius",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
Options: map[string]any{
|
||||
"temperature": 1.0,
|
||||
"top_p": 1.0,
|
||||
},
|
||||
Stream: &False,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tool response with call ID",
|
||||
body: `{
|
||||
"model": "test-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the weather like in Paris Today?"},
|
||||
{"role": "assistant", "tool_calls": [{"id": "id_abc", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]},
|
||||
{"role": "tool", "tool_call_id": "id_abc", "content": "The weather in Paris is 20 degrees Celsius"}
|
||||
]
|
||||
}`,
|
||||
req: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "What's the weather like in Paris Today?",
|
||||
},
|
||||
{
|
||||
Role: "assistant",
|
||||
ToolCalls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "get_current_weather",
|
||||
Arguments: map[string]any{
|
||||
"location": "Paris, France",
|
||||
"format": "celsius",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Role: "tool",
|
||||
Content: "The weather in Paris is 20 degrees Celsius",
|
||||
ToolName: "get_current_weather",
|
||||
},
|
||||
},
|
||||
Options: map[string]any{
|
||||
"temperature": 1.0,
|
||||
"top_p": 1.0,
|
||||
},
|
||||
Stream: &False,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "tool response with name",
|
||||
body: `{
|
||||
"model": "test-model",
|
||||
"messages": [
|
||||
{"role": "user", "content": "What's the weather like in Paris Today?"},
|
||||
{"role": "assistant", "tool_calls": [{"id": "id", "type": "function", "function": {"name": "get_current_weather", "arguments": "{\"location\": \"Paris, France\", \"format\": \"celsius\"}"}}]},
|
||||
{"role": "tool", "name": "get_current_weather", "content": "The weather in Paris is 20 degrees Celsius"}
|
||||
]
|
||||
}`,
|
||||
req: api.ChatRequest{
|
||||
Model: "test-model",
|
||||
Messages: []api.Message{
|
||||
{
|
||||
Role: "user",
|
||||
Content: "What's the weather like in Paris Today?",
|
||||
},
|
||||
{
|
||||
Role: "assistant",
|
||||
ToolCalls: []api.ToolCall{
|
||||
{
|
||||
Function: api.ToolCallFunction{
|
||||
Name: "get_current_weather",
|
||||
Arguments: map[string]any{
|
||||
"location": "Paris, France",
|
||||
"format": "celsius",
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
},
|
||||
{
|
||||
Role: "tool",
|
||||
Content: "The weather in Paris is 20 degrees Celsius",
|
||||
ToolName: "get_current_weather",
|
||||
},
|
||||
},
|
||||
Options: map[string]any{
|
||||
"temperature": 1.0,
|
||||
"top_p": 1.0,
|
||||
},
|
||||
Stream: &False,
|
||||
},
|
||||
},
|
||||
{
|
||||
name: "chat handler with streaming tools",
|
||||
body: `{
|
||||
|
||||
@@ -70,6 +70,10 @@ func kvCacheTypeFromStr(s string) ml.DType {
|
||||
}
|
||||
|
||||
func (c *InputCache) Close() {
|
||||
if c == nil {
|
||||
return
|
||||
}
|
||||
|
||||
c.cache.Close()
|
||||
}
|
||||
|
||||
|
||||
@@ -877,6 +877,15 @@ func (s *Server) load(
|
||||
) {
|
||||
err := s.initModel(mpath, params, lpath, parallel, kvCacheType, kvSize, multiUserCache)
|
||||
if err != nil {
|
||||
var noMem ml.ErrNoMem
|
||||
if errors.As(err, &noMem) {
|
||||
// We can't yet handle this but in the future we will
|
||||
s.cache.Close()
|
||||
if s.model != nil {
|
||||
s.model.Backend().Close()
|
||||
}
|
||||
}
|
||||
|
||||
panic(err)
|
||||
}
|
||||
|
||||
|
||||
@@ -44,8 +44,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
thinkVal := false
|
||||
thinkLevel := ""
|
||||
if think != nil {
|
||||
thinkVal = think.AsBool()
|
||||
thinkLevel = think.AsString()
|
||||
thinkVal = think.Bool()
|
||||
thinkLevel = think.String()
|
||||
}
|
||||
var b bytes.Buffer
|
||||
if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[i:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
|
||||
@@ -105,8 +105,8 @@ func chatPrompt(ctx context.Context, m *Model, tokenize tokenizeFunc, opts *api.
|
||||
thinkVal := false
|
||||
thinkLevel := ""
|
||||
if think != nil {
|
||||
thinkVal = think.AsBool()
|
||||
thinkLevel = think.AsString()
|
||||
thinkVal = think.Bool()
|
||||
thinkLevel = think.String()
|
||||
}
|
||||
if err := m.Template.Execute(&b, template.Values{Messages: append(system, msgs[currMsgIdx:]...), Tools: tools, Think: thinkVal, ThinkLevel: thinkLevel, IsThinkSet: think != nil}); err != nil {
|
||||
return "", nil, err
|
||||
|
||||
@@ -30,6 +30,7 @@ import (
|
||||
"github.com/ollama/ollama/api"
|
||||
"github.com/ollama/ollama/discover"
|
||||
"github.com/ollama/ollama/envconfig"
|
||||
"github.com/ollama/ollama/format"
|
||||
"github.com/ollama/ollama/fs/ggml"
|
||||
"github.com/ollama/ollama/llm"
|
||||
"github.com/ollama/ollama/logutil"
|
||||
@@ -50,11 +51,16 @@ func experimentEnabled(name string) bool {
|
||||
|
||||
var useClient2 = experimentEnabled("client2")
|
||||
|
||||
// Low VRAM mode is based on the sum of total VRAM (not free) and triggers
|
||||
// reduced context length on some models
|
||||
var lowVRAMThreshold uint64 = 20 * format.GibiByte
|
||||
|
||||
var mode string = gin.DebugMode
|
||||
|
||||
type Server struct {
|
||||
addr net.Addr
|
||||
sched *Scheduler
|
||||
addr net.Addr
|
||||
sched *Scheduler
|
||||
lowVRAM bool
|
||||
}
|
||||
|
||||
func init() {
|
||||
@@ -112,8 +118,9 @@ func (s *Server) scheduleRunner(ctx context.Context, name string, caps []model.C
|
||||
return nil, nil, nil, err
|
||||
}
|
||||
|
||||
// This model requires a minimum context to function effectively
|
||||
if slices.Contains(model.Config.ModelFamilies, "gptoss") {
|
||||
// This model is much more capable with a larger context, so set that
|
||||
// unless it would penalize performance too much
|
||||
if !s.lowVRAM && slices.Contains(model.Config.ModelFamilies, "gptoss") {
|
||||
opts.NumCtx = max(opts.NumCtx, 8192)
|
||||
}
|
||||
|
||||
@@ -198,7 +205,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
|
||||
// Validate Think value: string values currently only allowed for gptoss models
|
||||
if req.Think != nil && req.Think.IsString() && !useHarmony {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -206,7 +213,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
if req.Suffix != "" {
|
||||
caps = append(caps, model.CapabilityInsert)
|
||||
}
|
||||
if req.Think != nil && req.Think.AsBool() {
|
||||
if req.Think != nil && req.Think.Bool() {
|
||||
caps = append(caps, model.CapabilityThinking)
|
||||
// TODO(drifkin): consider adding a warning if it's false and the model
|
||||
// doesn't support thinking. It's not strictly required, but it can be a
|
||||
@@ -281,10 +288,10 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
values.Messages = append(msgs, api.Message{Role: "user", Content: req.Prompt})
|
||||
}
|
||||
|
||||
values.Think = req.Think != nil && req.Think.AsBool()
|
||||
values.Think = req.Think != nil && req.Think.Bool()
|
||||
values.ThinkLevel = ""
|
||||
if req.Think != nil {
|
||||
values.ThinkLevel = req.Think.AsString()
|
||||
values.ThinkLevel = req.Think.String()
|
||||
}
|
||||
values.IsThinkSet = req.Think != nil
|
||||
|
||||
@@ -310,7 +317,7 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
var thinkingState *thinking.Parser
|
||||
if !useHarmony {
|
||||
openingTag, closingTag := thinking.InferTags(m.Template.Template)
|
||||
if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
|
||||
if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
|
||||
thinkingState = &thinking.Parser{
|
||||
OpeningTag: openingTag,
|
||||
ClosingTag: closingTag,
|
||||
@@ -364,7 +371,8 @@ func (s *Server) GenerateHandler(c *gin.Context) {
|
||||
*toolName = strings.TrimPrefix(*toolName, "functions.")
|
||||
var args api.ToolCallFunctionArguments
|
||||
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
|
||||
ch <- gin.H{"error parsing tool call": err.Error()}
|
||||
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
|
||||
ch <- gin.H{"error": errStr}
|
||||
return
|
||||
}
|
||||
|
||||
@@ -1382,6 +1390,15 @@ func Serve(ln net.Listener) error {
|
||||
gpus := discover.GetGPUInfo()
|
||||
gpus.LogDetails()
|
||||
|
||||
var totalVRAM uint64
|
||||
for _, gpu := range gpus {
|
||||
totalVRAM += gpu.TotalMemory - envconfig.GpuOverhead()
|
||||
}
|
||||
if totalVRAM < lowVRAMThreshold {
|
||||
s.lowVRAM = true
|
||||
slog.Info("entering low vram mode", "total vram", format.HumanBytes2(totalVRAM), "threshold", format.HumanBytes2(lowVRAMThreshold))
|
||||
}
|
||||
|
||||
err = srvr.Serve(ln)
|
||||
// If server is closed from the signal handler, wait for the ctx to be done
|
||||
// otherwise error out quickly
|
||||
@@ -1530,7 +1547,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
if len(req.Tools) > 0 {
|
||||
caps = append(caps, model.CapabilityTools)
|
||||
}
|
||||
if req.Think != nil && req.Think.AsBool() {
|
||||
if req.Think != nil && req.Think.Bool() {
|
||||
caps = append(caps, model.CapabilityThinking)
|
||||
}
|
||||
|
||||
@@ -1584,7 +1601,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
|
||||
// Validate Think value: string values currently only allowed for gptoss models
|
||||
if req.Think != nil && req.Think.IsString() && !useHarmony {
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.AsString())})
|
||||
c.JSON(http.StatusBadRequest, gin.H{"error": fmt.Sprintf("think value %q is not supported for this model", req.Think.String())})
|
||||
return
|
||||
}
|
||||
|
||||
@@ -1603,7 +1620,7 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
|
||||
var thinkingState *thinking.Parser
|
||||
openingTag, closingTag := thinking.InferTags(m.Template.Template)
|
||||
if req.Think != nil && req.Think.AsBool() && openingTag != "" && closingTag != "" {
|
||||
if req.Think != nil && req.Think.Bool() && openingTag != "" && closingTag != "" {
|
||||
thinkingState = &thinking.Parser{
|
||||
OpeningTag: openingTag,
|
||||
ClosingTag: closingTag,
|
||||
@@ -1655,7 +1672,8 @@ func (s *Server) ChatHandler(c *gin.Context) {
|
||||
*toolName = strings.TrimPrefix(*toolName, "functions.")
|
||||
var args api.ToolCallFunctionArguments
|
||||
if err := json.Unmarshal([]byte(toolContent), &args); err != nil {
|
||||
ch <- gin.H{"error parsing tool call": err.Error()}
|
||||
errStr := fmt.Sprintf("error parsing tool call: raw='%s', err=%s", toolContent, err.Error())
|
||||
ch <- gin.H{"error": errStr}
|
||||
return
|
||||
}
|
||||
res.Message.ToolCalls = []api.ToolCall{{Function: api.ToolCallFunction{Name: *toolName, Arguments: args}}}
|
||||
|
||||
@@ -758,8 +758,6 @@ func (a ByDurationAndName) Less(i, j int) bool {
|
||||
// If numParallel is <= 0, this will attempt try to optimize parallelism based on available VRAM, and adjust
|
||||
// opts.NumCtx accordingly
|
||||
func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuInfoList, numParallel *int) discover.GpuInfoList {
|
||||
var estimatedVRAM uint64
|
||||
|
||||
var numParallelToTry []int
|
||||
if *numParallel <= 0 {
|
||||
// If no specific parallel setting was provided, try larger then smaller, always end with 1
|
||||
@@ -769,39 +767,51 @@ func pickBestFullFitByLibrary(req *LlmRequest, f *ggml.GGML, gpus discover.GpuIn
|
||||
}
|
||||
|
||||
for _, gl := range gpus.ByLibrary() {
|
||||
var ok bool
|
||||
sgl := append(make(discover.GpuInfoList, 0, len(gl)), gl...)
|
||||
|
||||
// TODO - potentially sort by performance capability, existing models loaded, etc.
|
||||
// TODO - Eliminate any GPUs that already have envconfig.MaxRunners loaded on them
|
||||
// Note: at present, this will favor more VRAM over faster GPU speed in mixed setups
|
||||
// Note: at present, this will favor most current available VRAM descending and ignoring faster GPU speed in mixed setups
|
||||
sort.Sort(sort.Reverse(discover.ByFreeMemory(sgl)))
|
||||
|
||||
// First attempt to fit the model into a single GPU
|
||||
for _, p := range numParallelToTry {
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
if !envconfig.SchedSpread() {
|
||||
for _, g := range sgl {
|
||||
if ok, estimatedVRAM = llm.PredictServerFit([]discover.GpuInfo{g}, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
|
||||
slog.Info("new model will fit in available VRAM in single GPU, loading", "model", req.model.ModelPath, "gpu", g.ID, "parallel", p, "available", g.FreeMemory, "required", format.HumanBytes2(estimatedVRAM))
|
||||
if !envconfig.SchedSpread() {
|
||||
for _, p := range numParallelToTry {
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
// Try to pack into as few GPUs as possible, starting from 1 GPU
|
||||
for numGPUs := 1; numGPUs <= len(sgl); numGPUs++ {
|
||||
gpuSubset := sgl[:numGPUs]
|
||||
ok, estimatedVRAM := llm.PredictServerFit(gpuSubset, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p)
|
||||
|
||||
if ok {
|
||||
slog.Info("new model will fit in available VRAM across minimum required GPUs, loading",
|
||||
"model", req.model.ModelPath,
|
||||
"library", sgl[0].Library,
|
||||
"parallel", p,
|
||||
"required", format.HumanBytes2(estimatedVRAM),
|
||||
"gpus", numGPUs)
|
||||
*numParallel = p
|
||||
return []discover.GpuInfo{g}
|
||||
return gpuSubset
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// TODO future refinements
|
||||
// - if multiple Libraries, see if any single GPU in any Library will fit
|
||||
// - try subsets of GPUs instead of just falling back to 1 or all in a family
|
||||
|
||||
// TODO future refinements
|
||||
// - if multiple Libraries, see if any single GPU in any Library will fit
|
||||
// - try subsets of GPUs instead of just falling back to 1 or all in a family
|
||||
|
||||
// Now try all the GPUs
|
||||
for _, p := range numParallelToTry {
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
if ok, estimatedVRAM = llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
|
||||
slog.Info("new model will fit in available VRAM, loading", "model", req.model.ModelPath, "library", sgl[0].Library, "parallel", p, "required", format.HumanBytes2(estimatedVRAM))
|
||||
*numParallel = p
|
||||
return sgl
|
||||
// Now try all the GPUS (OLLAMA_SCHED_SPREAD is set)
|
||||
for _, p := range numParallelToTry {
|
||||
req.opts.NumCtx = req.origNumCtx * p
|
||||
if ok, estimatedVRAM := llm.PredictServerFit(sgl, f, req.model.AdapterPaths, req.model.ProjectorPaths, req.opts, p); ok {
|
||||
slog.Info("new model will fit in available VRAM, loading",
|
||||
"model", req.model.ModelPath,
|
||||
"library", sgl[0].Library,
|
||||
"parallel", p,
|
||||
"required", format.HumanBytes2(estimatedVRAM),
|
||||
"gpus", len(sgl))
|
||||
*numParallel = p
|
||||
return sgl
|
||||
}
|
||||
}
|
||||
}
|
||||
}