diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 61ca3c43..0005c69d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -304,6 +304,11 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" + - name: remove unwanted mingw dll.a files + run: | + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force - uses: actions/setup-go@v5 with: go-version-file: go.mod diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index f26d23c1..39d8fcf8 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -56,7 +56,7 @@ func GetCPUInfo() GpuInfoList { func GetCPUMem() (memInfo, error) { return memInfo{ TotalMemory: uint64(C.getPhysicalMemory()), - FreeMemory: 0, + FreeMemory: uint64(C.getFreeMemory()), }, nil } diff --git a/gpu/gpu_info_darwin.h b/gpu/gpu_info_darwin.h index 3edca237..415e7922 100644 --- a/gpu/gpu_info_darwin.h +++ b/gpu/gpu_info_darwin.h @@ -2,3 +2,4 @@ #include uint64_t getRecommendedMaxVRAM(); uint64_t getPhysicalMemory(); +uint64_t getFreeMemory(); diff --git a/gpu/gpu_info_darwin.m b/gpu/gpu_info_darwin.m index a145ac07..5ca139e0 100644 --- a/gpu/gpu_info_darwin.m +++ b/gpu/gpu_info_darwin.m @@ -1,4 +1,5 @@ -// go:build darwin +#import +#import #include "gpu_info_darwin.h" uint64_t getRecommendedMaxVRAM() { @@ -8,6 +9,27 @@ uint64_t getRecommendedMaxVRAM() { return result; } +// getPhysicalMemory returns the total physical memory in bytes uint64_t getPhysicalMemory() { - return [[NSProcessInfo processInfo] physicalMemory]; + return [NSProcessInfo processInfo].physicalMemory; +} + +// getFreeMemory returns the total free memory in bytes, including inactive +// memory that can be reclaimed by the system. +uint64_t getFreeMemory() { + mach_port_t host_port = mach_host_self(); + mach_msg_type_number_t host_size = sizeof(vm_statistics64_data_t) / sizeof(integer_t); + vm_size_t pagesize; + vm_statistics64_data_t vm_stat; + + host_page_size(host_port, &pagesize); + if (host_statistics64(host_port, HOST_VM_INFO64, (host_info64_t)&vm_stat, &host_size) != KERN_SUCCESS) { + return 0; + } + + uint64_t free_memory = (uint64_t)vm_stat.free_count * pagesize; + free_memory += (uint64_t)vm_stat.speculative_count * pagesize; + free_memory += (uint64_t)vm_stat.inactive_count * pagesize; + + return free_memory; } diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt index c300244f..b63f3c0e 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -1,17 +1,13 @@ - -set(TARGET ollama_llama_server) -option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) -target_compile_definitions(${TARGET} PRIVATE - SERVER_VERBOSE=$ -) -target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT}) -install(TARGETS ollama_llama_server ggml llama - RUNTIME DESTINATION "${CMAKE_BINARY_DIR}/bin" - LIBRARY DESTINATION "${CMAKE_BINARY_DIR}/bin" - COMPONENT ollama_llama_server) -if (WIN32) - TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) -endif() +set(TARGET ollama_llama_server) +option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) +install(TARGETS ${TARGET} RUNTIME) +target_compile_definitions(${TARGET} PRIVATE + SERVER_VERBOSE=$ +) +target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT}) +if (WIN32) + TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) +endif() target_compile_features(${TARGET} PRIVATE cxx_std_11) \ No newline at end of file diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 00a15b4a..0ef3956e 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1413,7 +1413,7 @@ struct llama_server_context return get_slot(-1); } - LOG_INFO("slot with common prefix found", {{ + LOG_DEBUG("slot with common prefix found", {{ "slot_id", slot->id, "characters", longest }}); @@ -1688,22 +1688,8 @@ struct llama_server_context } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - char buf[256]; - llama_model_meta_val_str(model, "general.architecture", buf, 256); - bool gemma2 = strcmp(buf, "gemma2") == 0; - - int32_t truncate_at = slot.n_ctx; - - // truncate at 2/3 of the context length for gemma2 models - // as they do not support context shifts (from the sliding window implementation). - // this way, prompts that almost fit the context length can still generate a full - // response without a sudden stop from hitting the context limit - if (gemma2) { - truncate_at = 2 * slot.n_ctx / 3; - } - // if input prompt is too big, truncate it, if group attention self-extend is disabled - if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at) + if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_shift = n_left / 2; @@ -1731,19 +1717,6 @@ struct llama_server_context GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } - // Models with sliding window attention do not work with context shifts, so - // limit their prediction to the context length - if (gemma2) { - int32_t limit = slot.n_ctx - slot.n_prompt_tokens; - slot.n_predict = limit; - slot.params.n_predict = limit; - LOG_INFO("model does not support sliding window, limiting generation", { - {"n_ctx", slot.n_ctx}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"n_predict", slot.n_predict} - }); - } - if (!slot.params.cache_prompt) { llama_sampling_reset(slot.ctx_sampling); diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index 23feaf99..da1b0688 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -81,7 +81,6 @@ apply_patches() { build() { cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 - cmake --install ${BUILD_DIR} --component ollama_llama_server } compress() { diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 02577545..6c0b62cb 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -18,7 +18,7 @@ sign() { fi } -COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off" +COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off" case "${GOARCH}" in "amd64") @@ -27,7 +27,7 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build @@ -75,7 +75,7 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index da1bab1b..02f85235 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -60,7 +60,7 @@ if [ -z "${CUDACXX}" ]; then export CUDACXX=$(command -v nvcc) fi fi -COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" +COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" source $(dirname $0)/gen_common.sh init_vars git_module_setup @@ -86,7 +86,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" - CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" + CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building custom CPU" build @@ -102,7 +102,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" + COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) diff --git a/llm/llama.cpp b/llm/llama.cpp index d7fd29ff..a8db2a9c 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit d7fd29fff16456ce9c3a23fd2d09a66256b05aff +Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584 diff --git a/llm/llm.go b/llm/llm.go index 98fe7f09..88c0258d 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -1,7 +1,7 @@ package llm // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include -// #cgo LDFLAGS: -lllama -lggml -lstdc++ +// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index f4eaced7..341a6f59 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,11 +1,11 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 73f52435..2b81b4bd 100644 +index 2b9ace28..172640e2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5092,16 +5092,7 @@ static void llm_load_vocab( - - // for now, only BPE models have pre-tokenizers +@@ -5357,16 +5357,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { + vocab.tokenizer_add_space_prefix = false; + vocab.tokenizer_clean_spaces = true; - if (tokenizer_pre.empty()) { - LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__); - LLAMA_LOG_WARN("%s: \n", __func__); @@ -20,7 +20,7 @@ index 73f52435..2b81b4bd 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5164,7 +5155,8 @@ static void llm_load_vocab( +@@ -5439,7 +5430,8 @@ static void llm_load_vocab( tokenizer_pre == "jais") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; } else { diff --git a/llm/server.go b/llm/server.go index 206f9e39..54fad92c 100644 --- a/llm/server.go +++ b/llm/server.go @@ -699,10 +699,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu } defer s.sem.Release(1) - // only allow maximum 10 "context shifts" to avoid infinite generation + // put an upper limit on num_predict to avoid the model running on forever if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx { req.Options.NumPredict = 10 * s.options.NumCtx - slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict) } request := map[string]any{ diff --git a/server/sched.go b/server/sched.go index 8c054c6b..9dff2ae0 100644 --- a/server/sched.go +++ b/server/sched.go @@ -197,25 +197,36 @@ func (s *Scheduler) processPending(ctx context.Context) { break } - // Block attempting to load a model larger than system memory + GPU memory estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts) maxSize := systemMem.FreeMemory - for _, gpu := range gpus { - if gpu.Library == "cpu" { - continue - } - if loadedCount == 0 { - // If no other models are loaded, set the limit based on what's available - maxSize += gpu.FreeMemory - } else { - // Other models could be unloaded, favor total memory for limit - maxSize += gpu.TotalMemory + + // Add available GPU memory to the total pool + // macOS hardware has unified memory so don't double count + if runtime.GOOS != "darwin" { + for _, gpu := range gpus { + if gpu.Library == "cpu" { + continue + } + if loadedCount == 0 { + // If no other models are loaded, set the limit based on what's available + maxSize += gpu.FreeMemory + } else { + // Other models could be unloaded, favor total memory for limit + maxSize += gpu.TotalMemory + } } } + + // Block attempting to load a model larger than system memory + GPU memory if estimate.TotalSize > maxSize { slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize)) - pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize)) - break + + // Linux will crash if over-allocating memory - return an error to the user. + // TODO (jmorganca): add reasonable upper limits for darwin and windows as well + if runtime.GOOS == "linux" { + pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize)) + break + } } // Evaluate if the model will fit in the available system memory, or if we should unload a model first