Mirror of https://github.com/likelovewant/ollama-for-amd.git (synced 2025-12-23 07:03:57 +00:00)

Commit: Merge branch 'ollama:main' into main
.github/workflows/release.yaml (5 changes, vendored)
@@ -304,6 +304,11 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
+      - name: remove unwanted mingw dll.a files
+        run: |
+          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force
+          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force
+          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
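Note on the added step: deleting MinGW's import libraries (*.dll.a) leaves only the static archives on the linker search path, so the pthread/winpthread/libstdc++ runtimes end up linked statically into the Windows binary. This lines up with the -lpthread flag added to the cgo LDFLAGS in the package llm hunk below and the -DBUILD_SHARED_LIBS=off defaults added to the generate scripts later in this diff.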
@@ -56,7 +56,7 @@ func GetCPUInfo() GpuInfoList {
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
-		FreeMemory:  0,
+		FreeMemory:  uint64(C.getFreeMemory()),
 	}, nil
 }
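For orientation, a minimal sketch of how the newly populated FreeMemory field might be consumed inside the same package. logSystemMemory is a hypothetical helper and the format import path is assumed; only GetCPUMem and format.HumanBytes2 come from the diff itself:

package gpu

import (
	"log/slog"

	"github.com/ollama/ollama/format" // import path assumed
)

// logSystemMemory is a hypothetical helper. Before this merge, FreeMemory
// was hardcoded to 0 on macOS, so the "free" value below always read 0 B.
func logSystemMemory() {
	mem, err := GetCPUMem()
	if err != nil {
		return
	}
	slog.Info("system memory",
		"total", format.HumanBytes2(mem.TotalMemory),
		"free", format.HumanBytes2(mem.FreeMemory))
}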
@@ -2,3 +2,4 @@
 #include <stdint.h>
 uint64_t getRecommendedMaxVRAM();
 uint64_t getPhysicalMemory();
+uint64_t getFreeMemory();
@@ -1,4 +1,5 @@
-// go:build darwin
+#import <Foundation/Foundation.h>
+#import <mach/mach.h>
 #include "gpu_info_darwin.h"
 
 uint64_t getRecommendedMaxVRAM() {
@@ -8,6 +9,27 @@ uint64_t getRecommendedMaxVRAM() {
   return result;
 }
 
+// getPhysicalMemory returns the total physical memory in bytes
 uint64_t getPhysicalMemory() {
-  return [[NSProcessInfo processInfo] physicalMemory];
+  return [NSProcessInfo processInfo].physicalMemory;
+}
+
+// getFreeMemory returns the total free memory in bytes, including inactive
+// memory that can be reclaimed by the system.
+uint64_t getFreeMemory() {
+  mach_port_t host_port = mach_host_self();
+  mach_msg_type_number_t host_size = sizeof(vm_statistics64_data_t) / sizeof(integer_t);
+  vm_size_t pagesize;
+  vm_statistics64_data_t vm_stat;
+
+  host_page_size(host_port, &pagesize);
+  if (host_statistics64(host_port, HOST_VM_INFO64, (host_info64_t)&vm_stat, &host_size) != KERN_SUCCESS) {
+    return 0;
+  }
+
+  uint64_t free_memory = (uint64_t)vm_stat.free_count * pagesize;
+  free_memory += (uint64_t)vm_stat.speculative_count * pagesize;
+  free_memory += (uint64_t)vm_stat.inactive_count * pagesize;
+
+  return free_memory;
 }
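The arithmetic in getFreeMemory is easy to check by hand: reclaimable memory is (free + speculative + inactive) pages multiplied by the VM page size. A self-contained Go sketch with made-up page counts (all numbers hypothetical):

package main

import "fmt"

// freeMemory mirrors the calculation above: three page counts as reported
// by host_statistics64, multiplied by the host page size.
func freeMemory(free, speculative, inactive, pageSize uint64) uint64 {
	return (free + speculative + inactive) * pageSize
}

func main() {
	// hypothetical counts: 100k free, 20k speculative, 300k inactive pages
	// of 16 KiB each (the page size on Apple Silicon)
	fmt.Println(freeMemory(100_000, 20_000, 300_000, 16_384)) // 6881280000 bytes, about 6.9 GB
}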
llm/ext_server/CMakeLists.txt (6 changes, vendored)
@@ -1,16 +1,12 @@
-
 set(TARGET ollama_llama_server)
 option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
 include_directories(${CMAKE_CURRENT_SOURCE_DIR})
 add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
+install(TARGETS ${TARGET} RUNTIME)
 target_compile_definitions(${TARGET} PRIVATE
     SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
 )
 target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
-install(TARGETS ollama_llama_server ggml llama
-    RUNTIME DESTINATION "${CMAKE_BINARY_DIR}/bin"
-    LIBRARY DESTINATION "${CMAKE_BINARY_DIR}/bin"
-    COMPONENT ollama_llama_server)
 if (WIN32)
     TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
 endif()
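Reading the two halves of this hunk together: the explicit install() that copied the server plus the ggml and llama libraries into ${CMAKE_BINARY_DIR}/bin is replaced by a plain RUNTIME install rule for the server binary alone, consistent with the move to static linking elsewhere in this merge; correspondingly, the `cmake --install ${BUILD_DIR} --component ollama_llama_server` step disappears from build() in the generate script below.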
llm/ext_server/server.cpp (31 changes, vendored)
@@ -1413,7 +1413,7 @@ struct llama_server_context
             return get_slot(-1);
         }
 
-        LOG_INFO("slot with common prefix found", {{
+        LOG_DEBUG("slot with common prefix found", {{
             "slot_id", slot->id,
             "characters", longest
         }});
@@ -1688,22 +1688,8 @@ struct llama_server_context
             }
             slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
-            char buf[256];
-            llama_model_meta_val_str(model, "general.architecture", buf, 256);
-            bool gemma2 = strcmp(buf, "gemma2") == 0;
-
-            int32_t truncate_at = slot.n_ctx;
-
-            // truncate at 2/3 of the context length for gemma2 models
-            // as they do not support context shifts (from the sliding window implementation).
-            // this way, prompts that almost fit the context length can still generate a full
-            // response without a sudden stop from hitting the context limit
-            if (gemma2) {
-                truncate_at = 2 * slot.n_ctx / 3;
-            }
-
             // if input prompt is too big, truncate it, if group attention self-extend is disabled
-            if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
+            if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
             {
                 const int n_left = slot.n_ctx - slot.params.n_keep;
                 const int n_shift = n_left / 2;
@@ -1731,19 +1717,6 @@ struct llama_server_context
                 GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
             }
 
-            // Models with sliding window attention do not work with context shifts, so
-            // limit their prediction to the context length
-            if (gemma2) {
-                int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
-                slot.n_predict = limit;
-                slot.params.n_predict = limit;
-                LOG_INFO("model does not support sliding window, limiting generation", {
-                    {"n_ctx", slot.n_ctx},
-                    {"n_prompt_tokens", slot.n_prompt_tokens},
-                    {"n_predict", slot.n_predict}
-                });
-            }
-
             if (!slot.params.cache_prompt)
             {
                 llama_sampling_reset(slot.ctx_sampling);
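These server.cpp deletions drop the gemma2-specific workaround (truncating prompts at 2/3 of the context window and capping n_predict, since sliding-window attention cannot handle context shifts); with the llama.cpp submodule bumped to a8db2a9ce6 in this same merge, the special-casing is evidently no longer needed. The log line for a matching prefix slot is also demoted from LOG_INFO to LOG_DEBUG.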
@@ -81,7 +81,6 @@ apply_patches() {
 build() {
     cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
-    cmake --install ${BUILD_DIR} --component ollama_llama_server
 }
 
 compress() {
@@ -18,7 +18,7 @@ sign() {
     fi
 }
 
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
 
 case "${GOARCH}" in
 "amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build
@@ -75,7 +75,7 @@ case "${GOARCH}" in
    # Static build for linking into the Go binary
    init_vars
    CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
    BUILD_DIR="../build/darwin/${ARCH}_static"
    echo "Building static library"
    build
@@ -60,7 +60,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup
@@ -86,7 +86,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
         init_vars
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
         BUILD_DIR="../build/linux/${ARCH}/cpu"
         echo "Building custom CPU"
         build
@@ -102,7 +102,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
     # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
 
-    COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+    COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
     if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
         #
         # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)
Submodule llm/llama.cpp updated: d7fd29fff1...a8db2a9ce6
@@ -1,7 +1,7 @@
 package llm
 
 // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
-// #cgo LDFLAGS: -lllama -lggml -lstdc++
+// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
 // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
 // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
 // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src
@@ -1,11 +1,11 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 73f52435..2b81b4bd 100644
+index 2b9ace28..172640e2 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5092,16 +5092,7 @@ static void llm_load_vocab(
+@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
  
-         // for now, only BPE models have pre-tokenizers
          if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+             vocab.tokenizer_add_space_prefix = false;
+             vocab.tokenizer_clean_spaces = true;
 -            if (tokenizer_pre.empty()) {
 -                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
 -                LLAMA_LOG_WARN("%s: \n", __func__);
@@ -20,7 +20,7 @@ index 73f52435..2b81b4bd 100644
              vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
          } else if (
              tokenizer_pre == "llama3" ||
-@@ -5164,7 +5155,8 @@ static void llm_load_vocab(
+@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
              tokenizer_pre == "jais") {
              vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
          } else {
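The vendored pre-tokenizer patch is only rebased here, with no change in intent: the blob hashes and inner hunk offsets shift with the submodule update (5092 → 5357 and 5164/5155 → 5439/5430), and two new context lines appear because upstream llama.cpp now initializes tokenizer_add_space_prefix and tokenizer_clean_spaces in that same block.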
@@ -699,10 +699,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	}
 	defer s.sem.Release(1)
 
-	// only allow maximum 10 "context shifts" to avoid infinite generation
+	// put an upper limit on num_predict to avoid the model running on forever
 	if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
 		req.Options.NumPredict = 10 * s.options.NumCtx
-		slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
 	}
 
 	request := map[string]any{
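The clamp is small enough to isolate. A sketch of the same rule as a pure function (clampNumPredict is hypothetical, not part of the change):

package main

import "fmt"

// clampNumPredict applies the rule above: a negative (i.e. unlimited) or
// oversized num_predict is capped at ten times the context length.
func clampNumPredict(numPredict, numCtx int) int {
	if numPredict < 0 || numPredict > 10*numCtx {
		return 10 * numCtx
	}
	return numPredict
}

func main() {
	fmt.Println(clampNumPredict(-1, 2048))    // 20480: "unlimited" becomes 10x ctx
	fmt.Println(clampNumPredict(64, 2048))    // 64: small requests pass through
	fmt.Println(clampNumPredict(50000, 2048)) // 20480: oversized requests are capped
}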
@@ -197,9 +197,12 @@ func (s *Scheduler) processPending(ctx context.Context) {
 				break
 			}
 
-			// Block attempting to load a model larger than system memory + GPU memory
 			estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts)
 			maxSize := systemMem.FreeMemory
+
+			// Add available GPU memory to the total pool
+			// macOS hardware has unified memory so don't double count
+			if runtime.GOOS != "darwin" {
 				for _, gpu := range gpus {
 					if gpu.Library == "cpu" {
 						continue
@@ -212,11 +215,19 @@ func (s *Scheduler) processPending(ctx context.Context) {
 					maxSize += gpu.TotalMemory
 				}
 			}
+			}
+
+			// Block attempting to load a model larger than system memory + GPU memory
 			if estimate.TotalSize > maxSize {
 				slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize))
+
+				// Linux will crash if over-allocating memory - return an error to the user.
+				// TODO (jmorganca): add reasonable upper limits for darwin and windows as well
+				if runtime.GOOS == "linux" {
 					pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize))
 					break
 				}
+			}
 
 			// Evaluate if the model will fit in the available system memory, or if we should unload a model first
 			if len(gpus) == 1 && gpus[0].Library == "cpu" {
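Taken together, the two scheduler hunks build a memory ceiling before the oversized-model check runs. A minimal standalone sketch of that pooling rule (maxLoadableSize and the trimmed GpuInfo struct are hypothetical stand-ins for the real types; the GPU values in main are made up):

package main

import "fmt"

type GpuInfo struct {
	Library     string
	TotalMemory uint64
}

// maxLoadableSize reproduces the rule above: start from free system memory,
// then add each GPU's memory, except on macOS, where memory is unified and
// would otherwise be double counted.
func maxLoadableSize(freeSystem uint64, gpus []GpuInfo, goos string) uint64 {
	maxSize := freeSystem
	if goos != "darwin" {
		for _, gpu := range gpus {
			if gpu.Library == "cpu" {
				continue // a CPU entry shares system memory, already counted
			}
			maxSize += gpu.TotalMemory
		}
	}
	return maxSize
}

func main() {
	gpus := []GpuInfo{{Library: "rocm", TotalMemory: 16 << 30}}
	fmt.Println(maxLoadableSize(32<<30, gpus, "linux"))  // 51539607552: 48 GiB pool
	fmt.Println(maxLoadableSize(32<<30, gpus, "darwin")) // 34359738368: 32 GiB, no double count
}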