From 2cc854f8cb5b9670fc53134f8104569c60d535be Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Fri, 5 Jul 2024 21:48:31 -0400 Subject: [PATCH 01/15] llm: fix missing dylibs by restoring old build behavior on Linux and macOS (#5511) * Revert "fix cmake build (#5505)" This reverts commit 4fd5f3526a116d05cd74cfcc7217d4e6326e1bea. * llm: fix missing dylibs by restoring old build behavior * crlf -> lf --- llm/ext_server/CMakeLists.txt | 28 ++++++++++++---------------- llm/generate/gen_common.sh | 1 - llm/generate/gen_darwin.sh | 6 +++--- llm/generate/gen_linux.sh | 2 +- 4 files changed, 16 insertions(+), 21 deletions(-) diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt index c300244f..b63f3c0e 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -1,17 +1,13 @@ - -set(TARGET ollama_llama_server) -option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) -include_directories(${CMAKE_CURRENT_SOURCE_DIR}) -add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) -target_compile_definitions(${TARGET} PRIVATE - SERVER_VERBOSE=$ -) -target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT}) -install(TARGETS ollama_llama_server ggml llama - RUNTIME DESTINATION "${CMAKE_BINARY_DIR}/bin" - LIBRARY DESTINATION "${CMAKE_BINARY_DIR}/bin" - COMPONENT ollama_llama_server) -if (WIN32) - TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) -endif() +set(TARGET ollama_llama_server) +option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON) +include_directories(${CMAKE_CURRENT_SOURCE_DIR}) +add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h) +install(TARGETS ${TARGET} RUNTIME) +target_compile_definitions(${TARGET} PRIVATE + SERVER_VERBOSE=$ +) +target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT}) +if (WIN32) + TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) +endif() target_compile_features(${TARGET} PRIVATE cxx_std_11) \ No newline at end of file diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh index 23feaf99..da1b0688 100644 --- a/llm/generate/gen_common.sh +++ b/llm/generate/gen_common.sh @@ -81,7 +81,6 @@ apply_patches() { build() { cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS} cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8 - cmake --install ${BUILD_DIR} --component ollama_llama_server } compress() { diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 02577545..8b4779f9 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -18,7 +18,7 @@ sign() { fi } -COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off" +COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off" case "${GOARCH}" in "amd64") @@ -27,7 +27,7 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build @@ -75,7 +75,7 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" + CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index c3686252..2bea1c4e 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then export CUDACXX=$(command -v nvcc) fi fi -COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" +COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off" source $(dirname $0)/gen_common.sh init_vars git_module_setup From e0348d3fe8042b7e378a7cbcee95d17d20a14017 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Fri, 5 Jul 2024 22:42:42 -0400 Subject: [PATCH 02/15] llm: add `COMMON_DARWIN_DEFS` to arm static build (#5513) --- llm/generate/gen_darwin.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh index 8b4779f9..6c0b62cb 100755 --- a/llm/generate/gen_darwin.sh +++ b/llm/generate/gen_darwin.sh @@ -75,7 +75,7 @@ case "${GOARCH}" in # Static build for linking into the Go binary init_vars CMAKE_TARGETS="--target llama --target ggml" - CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" + CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}" BUILD_DIR="../build/darwin/${ARCH}_static" echo "Building static library" build From 9ae146993e9ec834b95d038df1eecac68a744f18 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 6 Jul 2024 03:27:05 -0400 Subject: [PATCH 03/15] llm: add `GGML_STATIC` flag to windows static lib --- llm/generate/gen_windows.ps1 | 1 + 1 file changed, 1 insertion(+) diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 5c694350..123c44cc 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -204,6 +204,7 @@ function build_static() { "-DCMAKE_C_COMPILER=gcc.exe", "-DCMAKE_CXX_COMPILER=g++.exe", "-DBUILD_SHARED_LIBS=off", + "-DGGML_STATIC=on", "-DGGML_NATIVE=off", "-DGGML_AVX=off", "-DGGML_AVX2=off", From f1a379aa566f7a9fefb2a64ac35faf34d9c00812 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 6 Jul 2024 12:54:02 -0400 Subject: [PATCH 04/15] llm: statically link pthread and stdc++ dependencies in windows build --- llm/generate/gen_windows.ps1 | 1 - llm/llm.go | 3 ++- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 123c44cc..5c694350 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -204,7 +204,6 @@ function build_static() { "-DCMAKE_C_COMPILER=gcc.exe", "-DCMAKE_CXX_COMPILER=g++.exe", "-DBUILD_SHARED_LIBS=off", - "-DGGML_STATIC=on", "-DGGML_NATIVE=off", "-DGGML_AVX=off", "-DGGML_AVX2=off", diff --git a/llm/llm.go b/llm/llm.go index 98fe7f09..3cd162e0 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -1,7 +1,8 @@ package llm // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include -// #cgo LDFLAGS: -lllama -lggml -lstdc++ +// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread +// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src From 5796bfc4013f4ebe26cdbf13554332a25c405027 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 6 Jul 2024 14:06:20 -0400 Subject: [PATCH 05/15] llm: only statically link libstdc++ --- .github/workflows/release.yaml | 4 ++++ llm/llm.go | 2 +- 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 61ca3c43..1042c684 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -304,6 +304,10 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" + - name: remove unwanted mingw dll.a files + run: | + Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libpthread.dll.a" + Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libwinpthread.dll.a" - uses: actions/setup-go@v5 with: go-version-file: go.mod diff --git a/llm/llm.go b/llm/llm.go index 3cd162e0..ac6a5249 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -1,8 +1,8 @@ package llm // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include +// #cgo windows LDFLAGS: -static-libstdc++ // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread -// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src From 6cea0360276e5fc7e2fecbe0cadf89cc72615279 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 6 Jul 2024 15:10:48 -0400 Subject: [PATCH 06/15] Revert "llm: only statically link libstdc++" This reverts commit 5796bfc4013f4ebe26cdbf13554332a25c405027. --- .github/workflows/release.yaml | 4 ---- llm/llm.go | 2 +- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 1042c684..61ca3c43 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -304,10 +304,6 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" - - name: remove unwanted mingw dll.a files - run: | - Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libpthread.dll.a" - Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libwinpthread.dll.a" - uses: actions/setup-go@v5 with: go-version-file: go.mod diff --git a/llm/llm.go b/llm/llm.go index ac6a5249..3cd162e0 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -1,8 +1,8 @@ package llm // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include -// #cgo windows LDFLAGS: -static-libstdc++ // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread +// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src From a08f20d910194edff79d45315330a088fda3f136 Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 6 Jul 2024 15:21:15 -0400 Subject: [PATCH 07/15] release: remove unwanted mingw dll.a files --- .github/workflows/release.yaml | 5 +++++ llm/llm.go | 1 - 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 61ca3c43..d1faf9f5 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -85,6 +85,11 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" + - name: remove unwanted mingw dll.a files + run: | + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force - uses: actions/setup-go@v5 with: go-version-file: go.mod diff --git a/llm/llm.go b/llm/llm.go index 3cd162e0..88c0258d 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -2,7 +2,6 @@ package llm // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread -// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src From c12f1c5b99c9d9f9388f464aa77063987fdb8f0f Mon Sep 17 00:00:00 2001 From: jmorganca Date: Sat, 6 Jul 2024 16:12:29 -0400 Subject: [PATCH 08/15] release: move mingw library cleanup to correct job --- .github/workflows/release.yaml | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index d1faf9f5..0005c69d 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -85,11 +85,6 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" - - name: remove unwanted mingw dll.a files - run: | - Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force - Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force - Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force - uses: actions/setup-go@v5 with: go-version-file: go.mod @@ -309,6 +304,11 @@ jobs: write-host "Installing plugin" & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet write-host "plugin installed" + - name: remove unwanted mingw dll.a files + run: | + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force + Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force - uses: actions/setup-go@v5 with: go-version-file: go.mod From 4607c706413f1354d0e762d25a9a0a933edc14ec Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 6 Jul 2024 18:58:16 -0400 Subject: [PATCH 09/15] llm: add `-DBUILD_SHARED_LIBS=off` to common cpu cmake flags (#5520) --- llm/generate/gen_linux.sh | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh index 2bea1c4e..d3e2d13b 100755 --- a/llm/generate/gen_linux.sh +++ b/llm/generate/gen_linux.sh @@ -77,7 +77,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then init_vars echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\"" - CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" + CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}" BUILD_DIR="../build/linux/${ARCH}/cpu" echo "Building custom CPU" build @@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake - COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" + COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off" if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then # # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta) From f8241bfba384cf8c888847dc44b73d7f43a42d82 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 6 Jul 2024 19:35:04 -0400 Subject: [PATCH 10/15] gpu: report system free memory instead of 0 (#5521) --- gpu/gpu_darwin.go | 2 +- gpu/gpu_info_darwin.h | 1 + gpu/gpu_info_darwin.m | 26 ++++++++++++++++++++++++-- 3 files changed, 26 insertions(+), 3 deletions(-) diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go index f26d23c1..39d8fcf8 100644 --- a/gpu/gpu_darwin.go +++ b/gpu/gpu_darwin.go @@ -56,7 +56,7 @@ func GetCPUInfo() GpuInfoList { func GetCPUMem() (memInfo, error) { return memInfo{ TotalMemory: uint64(C.getPhysicalMemory()), - FreeMemory: 0, + FreeMemory: uint64(C.getFreeMemory()), }, nil } diff --git a/gpu/gpu_info_darwin.h b/gpu/gpu_info_darwin.h index 3edca237..415e7922 100644 --- a/gpu/gpu_info_darwin.h +++ b/gpu/gpu_info_darwin.h @@ -2,3 +2,4 @@ #include uint64_t getRecommendedMaxVRAM(); uint64_t getPhysicalMemory(); +uint64_t getFreeMemory(); diff --git a/gpu/gpu_info_darwin.m b/gpu/gpu_info_darwin.m index a145ac07..5ca139e0 100644 --- a/gpu/gpu_info_darwin.m +++ b/gpu/gpu_info_darwin.m @@ -1,4 +1,5 @@ -// go:build darwin +#import +#import #include "gpu_info_darwin.h" uint64_t getRecommendedMaxVRAM() { @@ -8,6 +9,27 @@ uint64_t getRecommendedMaxVRAM() { return result; } +// getPhysicalMemory returns the total physical memory in bytes uint64_t getPhysicalMemory() { - return [[NSProcessInfo processInfo] physicalMemory]; + return [NSProcessInfo processInfo].physicalMemory; +} + +// getFreeMemory returns the total free memory in bytes, including inactive +// memory that can be reclaimed by the system. +uint64_t getFreeMemory() { + mach_port_t host_port = mach_host_self(); + mach_msg_type_number_t host_size = sizeof(vm_statistics64_data_t) / sizeof(integer_t); + vm_size_t pagesize; + vm_statistics64_data_t vm_stat; + + host_page_size(host_port, &pagesize); + if (host_statistics64(host_port, HOST_VM_INFO64, (host_info64_t)&vm_stat, &host_size) != KERN_SUCCESS) { + return 0; + } + + uint64_t free_memory = (uint64_t)vm_stat.free_count * pagesize; + free_memory += (uint64_t)vm_stat.speculative_count * pagesize; + free_memory += (uint64_t)vm_stat.inactive_count * pagesize; + + return free_memory; } From 0ee87615c74c69d8fbc3cad8f3ea5a2364b1a876 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sat, 6 Jul 2024 22:01:52 -0400 Subject: [PATCH 11/15] sched: don't error if paging to disk on Windows and macOS (#5523) --- server/sched.go | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/server/sched.go b/server/sched.go index 8c054c6b..9dff2ae0 100644 --- a/server/sched.go +++ b/server/sched.go @@ -197,25 +197,36 @@ func (s *Scheduler) processPending(ctx context.Context) { break } - // Block attempting to load a model larger than system memory + GPU memory estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts) maxSize := systemMem.FreeMemory - for _, gpu := range gpus { - if gpu.Library == "cpu" { - continue - } - if loadedCount == 0 { - // If no other models are loaded, set the limit based on what's available - maxSize += gpu.FreeMemory - } else { - // Other models could be unloaded, favor total memory for limit - maxSize += gpu.TotalMemory + + // Add available GPU memory to the total pool + // macOS hardware has unified memory so don't double count + if runtime.GOOS != "darwin" { + for _, gpu := range gpus { + if gpu.Library == "cpu" { + continue + } + if loadedCount == 0 { + // If no other models are loaded, set the limit based on what's available + maxSize += gpu.FreeMemory + } else { + // Other models could be unloaded, favor total memory for limit + maxSize += gpu.TotalMemory + } } } + + // Block attempting to load a model larger than system memory + GPU memory if estimate.TotalSize > maxSize { slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize)) - pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize)) - break + + // Linux will crash if over-allocating memory - return an error to the user. + // TODO (jmorganca): add reasonable upper limits for darwin and windows as well + if runtime.GOOS == "linux" { + pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize)) + break + } } // Evaluate if the model will fit in the available system memory, or if we should unload a model first From 0e09c380fcae8b81db3c3447d70d721cfad00dbd Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 7 Jul 2024 12:38:04 -0400 Subject: [PATCH 12/15] llm: print caching notices in debug only (#5533) --- llm/ext_server/server.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 00a15b4a..7ae58e38 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1413,7 +1413,7 @@ struct llama_server_context return get_slot(-1); } - LOG_INFO("slot with common prefix found", {{ + LOG_DEBUG("slot with common prefix found", {{ "slot_id", slot->id, "characters", longest }}); From 571dc61955ced560a45e9d32b1cd2a52d9803c8c Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 7 Jul 2024 13:03:09 -0400 Subject: [PATCH 13/15] Update llama.cpp submodule to `a8db2a9c` (#5530) --- llm/llama.cpp | 2 +- llm/patches/05-default-pretokenizer.diff | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llm/llama.cpp b/llm/llama.cpp index d7fd29ff..a8db2a9c 160000 --- a/llm/llama.cpp +++ b/llm/llama.cpp @@ -1 +1 @@ -Subproject commit d7fd29fff16456ce9c3a23fd2d09a66256b05aff +Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584 diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff index f4eaced7..341a6f59 100644 --- a/llm/patches/05-default-pretokenizer.diff +++ b/llm/patches/05-default-pretokenizer.diff @@ -1,11 +1,11 @@ diff --git a/src/llama.cpp b/src/llama.cpp -index 73f52435..2b81b4bd 100644 +index 2b9ace28..172640e2 100644 --- a/src/llama.cpp +++ b/src/llama.cpp -@@ -5092,16 +5092,7 @@ static void llm_load_vocab( - - // for now, only BPE models have pre-tokenizers +@@ -5357,16 +5357,7 @@ static void llm_load_vocab( if (vocab.type == LLAMA_VOCAB_TYPE_BPE) { + vocab.tokenizer_add_space_prefix = false; + vocab.tokenizer_clean_spaces = true; - if (tokenizer_pre.empty()) { - LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__); - LLAMA_LOG_WARN("%s: \n", __func__); @@ -20,7 +20,7 @@ index 73f52435..2b81b4bd 100644 vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT; } else if ( tokenizer_pre == "llama3" || -@@ -5164,7 +5155,8 @@ static void llm_load_vocab( +@@ -5439,7 +5430,8 @@ static void llm_load_vocab( tokenizer_pre == "jais") { vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS; } else { From d8def1ff9432ef60d1067e5e6dde0d700dd95021 Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 7 Jul 2024 13:41:51 -0400 Subject: [PATCH 14/15] llm: allow gemma 2 to context shift (#5534) --- llm/ext_server/server.cpp | 29 +---------------------------- 1 file changed, 1 insertion(+), 28 deletions(-) diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp index 7ae58e38..0ef3956e 100644 --- a/llm/ext_server/server.cpp +++ b/llm/ext_server/server.cpp @@ -1688,22 +1688,8 @@ struct llama_server_context } slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep); - char buf[256]; - llama_model_meta_val_str(model, "general.architecture", buf, 256); - bool gemma2 = strcmp(buf, "gemma2") == 0; - - int32_t truncate_at = slot.n_ctx; - - // truncate at 2/3 of the context length for gemma2 models - // as they do not support context shifts (from the sliding window implementation). - // this way, prompts that almost fit the context length can still generate a full - // response without a sudden stop from hitting the context limit - if (gemma2) { - truncate_at = 2 * slot.n_ctx / 3; - } - // if input prompt is too big, truncate it, if group attention self-extend is disabled - if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at) + if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx) { const int n_left = slot.n_ctx - slot.params.n_keep; const int n_shift = n_left / 2; @@ -1731,19 +1717,6 @@ struct llama_server_context GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx); } - // Models with sliding window attention do not work with context shifts, so - // limit their prediction to the context length - if (gemma2) { - int32_t limit = slot.n_ctx - slot.n_prompt_tokens; - slot.n_predict = limit; - slot.params.n_predict = limit; - LOG_INFO("model does not support sliding window, limiting generation", { - {"n_ctx", slot.n_ctx}, - {"n_prompt_tokens", slot.n_prompt_tokens}, - {"n_predict", slot.n_predict} - }); - } - if (!slot.params.cache_prompt) { llama_sampling_reset(slot.ctx_sampling); From 53da2c69654769c0c086af695722e1d9b9ee6ecc Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Sun, 7 Jul 2024 14:32:05 -0400 Subject: [PATCH 15/15] llm: remove ambiguous comment when putting upper limit on predictions to avoid infinite generation (#5535) --- llm/server.go | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llm/server.go b/llm/server.go index 206f9e39..54fad92c 100644 --- a/llm/server.go +++ b/llm/server.go @@ -699,10 +699,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu } defer s.sem.Release(1) - // only allow maximum 10 "context shifts" to avoid infinite generation + // put an upper limit on num_predict to avoid the model running on forever if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx { req.Options.NumPredict = 10 * s.options.NumCtx - slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict) } request := map[string]any{