From 2cc854f8cb5b9670fc53134f8104569c60d535be Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Fri, 5 Jul 2024 21:48:31 -0400
Subject: [PATCH 01/15] llm: fix missing dylibs by restoring old build behavior
 on Linux and macOS (#5511)

* Revert "fix cmake build (#5505)"

This reverts commit 4fd5f3526a116d05cd74cfcc7217d4e6326e1bea.

* llm: fix missing dylibs by restoring old build behavior

* crlf -> lf
---
 llm/ext_server/CMakeLists.txt | 28 ++++++++++++----------------
 llm/generate/gen_common.sh    |  1 -
 llm/generate/gen_darwin.sh    |  6 +++---
 llm/generate/gen_linux.sh     |  2 +-
 4 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt
index c300244f..b63f3c0e 100644
--- a/llm/ext_server/CMakeLists.txt
+++ b/llm/ext_server/CMakeLists.txt
@@ -1,17 +1,13 @@
-
-set(TARGET ollama_llama_server)
-option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)
-include_directories(${CMAKE_CURRENT_SOURCE_DIR})
-add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
-target_compile_definitions(${TARGET} PRIVATE
-    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
-)
-target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
-install(TARGETS ollama_llama_server ggml llama
-        RUNTIME DESTINATION "${CMAKE_BINARY_DIR}/bin"
-        LIBRARY DESTINATION "${CMAKE_BINARY_DIR}/bin"
-        COMPONENT ollama_llama_server)
-if (WIN32)
-    TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32)
-endif()
+set(TARGET ollama_llama_server)
+option(LLAMA_SERVER_VERBOSE "Build verbose logging option for Server" ON)  # feeds SERVER_VERBOSE below
+add_executable(${TARGET} server.cpp utils.hpp json.hpp httplib.h)
+target_include_directories(${TARGET} PRIVATE ${CMAKE_CURRENT_SOURCE_DIR})  # scoped to this target only
+install(TARGETS ${TARGET} RUNTIME)
+target_compile_definitions(${TARGET} PRIVATE
+    SERVER_VERBOSE=$<BOOL:${LLAMA_SERVER_VERBOSE}>
+)
+target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT})
+if (WIN32)
+    target_link_libraries(${TARGET} PRIVATE ws2_32)  # Windows-only socket library
+endif()
 target_compile_features(${TARGET} PRIVATE cxx_std_11)
\ No newline at end of file
diff --git a/llm/generate/gen_common.sh b/llm/generate/gen_common.sh
index 23feaf99..da1b0688 100644
--- a/llm/generate/gen_common.sh
+++ b/llm/generate/gen_common.sh
@@ -81,7 +81,6 @@ apply_patches() {
 build() {
     cmake -S ${LLAMACPP_DIR} -B ${BUILD_DIR} ${CMAKE_DEFS}
     cmake --build ${BUILD_DIR} ${CMAKE_TARGETS} -j8
-    cmake --install ${BUILD_DIR} --component ollama_llama_server
 }
 
 compress() {
diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 02577545..8b4779f9 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -18,7 +18,7 @@ sign() {
     fi
 }
 
-COMMON_DARWIN_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
+COMMON_DARWIN_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DLLAMA_METAL_MACOSX_VERSION_MIN=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DGGML_METAL_EMBED_LIBRARY=on -DGGML_OPENMP=off"
 
 case "${GOARCH}" in
 "amd64")
@@ -27,7 +27,7 @@ case "${GOARCH}" in
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="${COMMON_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_CPU_DEFS} -DGGML_BLAS=off -DGGML_ACCELERATE=off -DGGML_AVX=off -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build
@@ -75,7 +75,7 @@ case "${GOARCH}" in
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DBUILD_SHARED_LIBS=off -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
+    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build
diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index c3686252..2bea1c4e 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -51,7 +51,7 @@ if [ -z "${CUDACXX}" ]; then
         export CUDACXX=$(command -v nvcc)
     fi
 fi
-COMMON_CMAKE_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
+COMMON_CMAKE_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_AVX=on -DGGML_AVX2=off -DGGML_AVX512=off -DGGML_FMA=off -DGGML_F16C=off -DGGML_OPENMP=off"
 source $(dirname $0)/gen_common.sh
 init_vars
 git_module_setup

From e0348d3fe8042b7e378a7cbcee95d17d20a14017 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Fri, 5 Jul 2024 22:42:42 -0400
Subject: [PATCH 02/15] llm: add `COMMON_DARWIN_DEFS` to arm static build
 (#5513)

---
 llm/generate/gen_darwin.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/generate/gen_darwin.sh b/llm/generate/gen_darwin.sh
index 8b4779f9..6c0b62cb 100755
--- a/llm/generate/gen_darwin.sh
+++ b/llm/generate/gen_darwin.sh
@@ -75,7 +75,7 @@ case "${GOARCH}" in
     # Static build for linking into the Go binary
     init_vars
     CMAKE_TARGETS="--target llama --target ggml"
-    CMAKE_DEFS="-DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
+    CMAKE_DEFS="${COMMON_DARWIN_DEFS} -DCMAKE_OSX_DEPLOYMENT_TARGET=11.3 -DCMAKE_SYSTEM_NAME=Darwin -DCMAKE_SYSTEM_PROCESSOR=${ARCH} -DCMAKE_OSX_ARCHITECTURES=${ARCH} ${CMAKE_DEFS}"
     BUILD_DIR="../build/darwin/${ARCH}_static"
     echo "Building static library"
     build

From 9ae146993e9ec834b95d038df1eecac68a744f18 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 6 Jul 2024 03:27:05 -0400
Subject: [PATCH 03/15] llm: add `GGML_STATIC` flag to windows static lib

---
 llm/generate/gen_windows.ps1 | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index 5c694350..123c44cc 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -204,6 +204,7 @@ function build_static() {
             "-DCMAKE_C_COMPILER=gcc.exe",
             "-DCMAKE_CXX_COMPILER=g++.exe",
             "-DBUILD_SHARED_LIBS=off",
+            "-DGGML_STATIC=on",
             "-DGGML_NATIVE=off",
             "-DGGML_AVX=off",
             "-DGGML_AVX2=off",

From f1a379aa566f7a9fefb2a64ac35faf34d9c00812 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 6 Jul 2024 12:54:02 -0400
Subject: [PATCH 04/15] llm: statically link pthread and stdc++ dependencies in
 windows build

---
 llm/generate/gen_windows.ps1 | 1 -
 llm/llm.go                   | 3 ++-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1
index 123c44cc..5c694350 100644
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -204,7 +204,6 @@ function build_static() {
             "-DCMAKE_C_COMPILER=gcc.exe",
             "-DCMAKE_CXX_COMPILER=g++.exe",
             "-DBUILD_SHARED_LIBS=off",
-            "-DGGML_STATIC=on",
             "-DGGML_NATIVE=off",
             "-DGGML_AVX=off",
             "-DGGML_AVX2=off",
diff --git a/llm/llm.go b/llm/llm.go
index 98fe7f09..3cd162e0 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -1,7 +1,8 @@
 package llm
 
 // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
-// #cgo LDFLAGS: -lllama -lggml -lstdc++
+// #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
+// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static
 // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
 // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
 // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src

From 5796bfc4013f4ebe26cdbf13554332a25c405027 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 6 Jul 2024 14:06:20 -0400
Subject: [PATCH 05/15] llm: only statically link libstdc++

---
 .github/workflows/release.yaml | 4 ++++
 llm/llm.go                     | 2 +-
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 61ca3c43..1042c684 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -304,6 +304,10 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
+      - name: remove unwanted mingw dll.a files
+        run: |
+          Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libpthread.dll.a"
+          Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libwinpthread.dll.a"
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
diff --git a/llm/llm.go b/llm/llm.go
index 3cd162e0..ac6a5249 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -1,8 +1,8 @@
 package llm
 
 // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
+// #cgo windows LDFLAGS: -static-libstdc++
 // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
-// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static
 // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
 // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
 // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src

From 6cea0360276e5fc7e2fecbe0cadf89cc72615279 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 6 Jul 2024 15:10:48 -0400
Subject: [PATCH 06/15] Revert "llm: only statically link libstdc++"

This reverts commit 5796bfc4013f4ebe26cdbf13554332a25c405027.
---
 .github/workflows/release.yaml | 4 ----
 llm/llm.go                     | 2 +-
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 1042c684..61ca3c43 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -304,10 +304,6 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
-      - name: remove unwanted mingw dll.a files
-        run: |
-          Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libpthread.dll.a"
-          Remove-Item "C:\mingw64\x86_64-w64-mingw32\lib\libwinpthread.dll.a"
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
diff --git a/llm/llm.go b/llm/llm.go
index ac6a5249..3cd162e0 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -1,8 +1,8 @@
 package llm
 
 // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
-// #cgo windows LDFLAGS: -static-libstdc++
 // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
+// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static
 // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
 // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
 // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src

From a08f20d910194edff79d45315330a088fda3f136 Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 6 Jul 2024 15:21:15 -0400
Subject: [PATCH 07/15] release: remove unwanted mingw dll.a files

---
 .github/workflows/release.yaml | 5 +++++
 llm/llm.go                     | 1 -
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index 61ca3c43..d1faf9f5 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -85,6 +85,11 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
+      - name: remove unwanted mingw dll.a files
+        run: |
+          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force
+          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force
+          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
diff --git a/llm/llm.go b/llm/llm.go
index 3cd162e0..88c0258d 100644
--- a/llm/llm.go
+++ b/llm/llm.go
@@ -2,7 +2,6 @@ package llm
 
 // #cgo CFLAGS: -Illama.cpp -Illama.cpp/include -Illama.cpp/ggml/include
 // #cgo LDFLAGS: -lllama -lggml -lstdc++ -lpthread
-// #cgo windows LDFLAGS: -static-libstdc++ -static-libgcc -static
 // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal
 // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src
 // #cgo windows,amd64 LDFLAGS: -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src

From c12f1c5b99c9d9f9388f464aa77063987fdb8f0f Mon Sep 17 00:00:00 2001
From: jmorganca <jmorganca@gmail.com>
Date: Sat, 6 Jul 2024 16:12:29 -0400
Subject: [PATCH 08/15] release: move mingw library cleanup to correct job

---
 .github/workflows/release.yaml | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml
index d1faf9f5..0005c69d 100644
--- a/.github/workflows/release.yaml
+++ b/.github/workflows/release.yaml
@@ -85,11 +85,6 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
-      - name: remove unwanted mingw dll.a files
-        run: |
-          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force
-          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force
-          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod
@@ -309,6 +304,11 @@ jobs:
           write-host "Installing plugin"
           & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet
           write-host "plugin installed"
+      - name: remove unwanted mingw dll.a files
+        run: |
+          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libpthread.dll.a" -File | Remove-Item -Force
+          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libwinpthread.dll.a" -File | Remove-Item -Force
+          Get-ChildItem -Path "C:\mingw64" -Recurse -Filter "libstdc++.dll.a" -File | Remove-Item -Force
       - uses: actions/setup-go@v5
         with:
           go-version-file: go.mod

From 4607c706413f1354d0e762d25a9a0a933edc14ec Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sat, 6 Jul 2024 18:58:16 -0400
Subject: [PATCH 09/15] llm: add `-DBUILD_SHARED_LIBS=off` to common cpu cmake
 flags (#5520)

---
 llm/generate/gen_linux.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llm/generate/gen_linux.sh b/llm/generate/gen_linux.sh
index 2bea1c4e..d3e2d13b 100755
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -77,7 +77,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
     if [ -n "${OLLAMA_CUSTOM_CPU_DEFS}" ]; then
         init_vars
         echo "OLLAMA_CUSTOM_CPU_DEFS=\"${OLLAMA_CUSTOM_CPU_DEFS}\""
-        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
+        CMAKE_DEFS="${OLLAMA_CUSTOM_CPU_DEFS} -DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on ${CMAKE_DEFS}"
         BUILD_DIR="../build/linux/${ARCH}/cpu"
         echo "Building custom CPU"
         build
@@ -93,7 +93,7 @@ if [ -z "${OLLAMA_SKIP_CPU_GENERATE}" ]; then
         # -DGGML_AVX512_VBMI -- 2018 Intel Cannon Lake
         # -DGGML_AVX512_VNNI -- 2021 Intel Alder Lake
 
-        COMMON_CPU_DEFS="-DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
+        COMMON_CPU_DEFS="-DBUILD_SHARED_LIBS=off -DCMAKE_POSITION_INDEPENDENT_CODE=on -DGGML_NATIVE=off -DGGML_OPENMP=off"
         if [ -z "${OLLAMA_CPU_TARGET}" -o "${OLLAMA_CPU_TARGET}" = "cpu" ]; then
             #
             # CPU first for the default library, set up as lowest common denominator for maximum compatibility (including Rosetta)

From f8241bfba384cf8c888847dc44b73d7f43a42d82 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sat, 6 Jul 2024 19:35:04 -0400
Subject: [PATCH 10/15] gpu: report system free memory instead of 0 (#5521)

---
 gpu/gpu_darwin.go     |  2 +-
 gpu/gpu_info_darwin.h |  1 +
 gpu/gpu_info_darwin.m | 26 ++++++++++++++++++++++++--
 3 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/gpu/gpu_darwin.go b/gpu/gpu_darwin.go
index f26d23c1..39d8fcf8 100644
--- a/gpu/gpu_darwin.go
+++ b/gpu/gpu_darwin.go
@@ -56,7 +56,7 @@ func GetCPUInfo() GpuInfoList {
 func GetCPUMem() (memInfo, error) {
 	return memInfo{
 		TotalMemory: uint64(C.getPhysicalMemory()),
-		FreeMemory:  0,
+		FreeMemory:  uint64(C.getFreeMemory()),
 	}, nil
 }
 
diff --git a/gpu/gpu_info_darwin.h b/gpu/gpu_info_darwin.h
index 3edca237..415e7922 100644
--- a/gpu/gpu_info_darwin.h
+++ b/gpu/gpu_info_darwin.h
@@ -2,3 +2,4 @@
 #include <stdint.h>
 uint64_t getRecommendedMaxVRAM();
 uint64_t getPhysicalMemory();
+uint64_t getFreeMemory();
diff --git a/gpu/gpu_info_darwin.m b/gpu/gpu_info_darwin.m
index a145ac07..5ca139e0 100644
--- a/gpu/gpu_info_darwin.m
+++ b/gpu/gpu_info_darwin.m
@@ -1,4 +1,5 @@
-// go:build darwin
+#import <Foundation/Foundation.h>
+#import <mach/mach.h>
 #include "gpu_info_darwin.h"
 
 uint64_t getRecommendedMaxVRAM() {
@@ -8,6 +9,27 @@ uint64_t getRecommendedMaxVRAM() {
   return result;
 }
 
+// getPhysicalMemory returns the total physical memory in bytes
 uint64_t getPhysicalMemory() {
-  return [[NSProcessInfo processInfo] physicalMemory];
+  return [NSProcessInfo processInfo].physicalMemory;
+}
+
+// getFreeMemory returns the free memory in bytes, counting free, speculative
+// and inactive pages (inactive pages can be reclaimed by the system).
+// Returns 0 if the page size or VM statistics cannot be queried.
+uint64_t getFreeMemory() {
+  mach_port_t host_port = mach_host_self();
+  mach_msg_type_number_t host_size = sizeof(vm_statistics64_data_t) / sizeof(integer_t);
+  vm_size_t pagesize = 0;
+  vm_statistics64_data_t vm_stat;
+
+  kern_return_t kr = host_page_size(host_port, &pagesize);
+  if (kr == KERN_SUCCESS) {
+    kr = host_statistics64(host_port, HOST_VM_INFO64, (host_info64_t)&vm_stat, &host_size);
+  }
+  // mach_host_self() returns a new send right on each call; release it so it is not leaked
+  mach_port_deallocate(mach_task_self(), host_port);
+  if (kr != KERN_SUCCESS) return 0;
+
+  return ((uint64_t)vm_stat.free_count + vm_stat.speculative_count + vm_stat.inactive_count) * pagesize;
 }

From 0ee87615c74c69d8fbc3cad8f3ea5a2364b1a876 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sat, 6 Jul 2024 22:01:52 -0400
Subject: [PATCH 11/15] sched: don't error if paging to disk on Windows and
 macOS (#5523)

---
 server/sched.go | 37 ++++++++++++++++++++++++-------------
 1 file changed, 24 insertions(+), 13 deletions(-)

diff --git a/server/sched.go b/server/sched.go
index 8c054c6b..9dff2ae0 100644
--- a/server/sched.go
+++ b/server/sched.go
@@ -197,25 +197,36 @@ func (s *Scheduler) processPending(ctx context.Context) {
 						break
 					}
 
-					// Block attempting to load a model larger than system memory + GPU memory
 					estimate := llm.EstimateGPULayers(gpus, ggml, pending.model.ProjectorPaths, pending.opts)
 					maxSize := systemMem.FreeMemory
-					for _, gpu := range gpus {
-						if gpu.Library == "cpu" {
-							continue
-						}
-						if loadedCount == 0 {
-							// If no other models are loaded, set the limit based on what's available
-							maxSize += gpu.FreeMemory
-						} else {
-							// Other models could be unloaded, favor total memory for limit
-							maxSize += gpu.TotalMemory
+
+					// Add available GPU memory to the total pool
+					// macOS hardware has unified memory so don't double count
+					if runtime.GOOS != "darwin" {
+						for _, gpu := range gpus {
+							if gpu.Library == "cpu" {
+								continue
+							}
+							if loadedCount == 0 {
+								// If no other models are loaded, set the limit based on what's available
+								maxSize += gpu.FreeMemory
+							} else {
+								// Other models could be unloaded, favor total memory for limit
+								maxSize += gpu.TotalMemory
+							}
 						}
 					}
+
+					// Block attempting to load a model larger than system memory + GPU memory
 					if estimate.TotalSize > maxSize {
 						slog.Warn("model request too large for system", "requested", format.HumanBytes2(estimate.TotalSize), "system", format.HumanBytes2(maxSize))
-						pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize))
-						break
+
+						// Linux will crash if over-allocating memory - return an error to the user.
+						// TODO (jmorganca): add reasonable upper limits for darwin and windows as well
+						if runtime.GOOS == "linux" {
+							pending.errCh <- fmt.Errorf("requested model (%s) is too large for this system (%s)", format.HumanBytes2(estimate.TotalSize), format.HumanBytes2(maxSize))
+							break
+						}
 					}
 
 					// Evaluate if the model will fit in the available system memory, or if we should unload a model first

From 0e09c380fcae8b81db3c3447d70d721cfad00dbd Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sun, 7 Jul 2024 12:38:04 -0400
Subject: [PATCH 12/15] llm: print caching notices in debug only (#5533)

---
 llm/ext_server/server.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 00a15b4a..7ae58e38 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1413,7 +1413,7 @@ struct llama_server_context
             return get_slot(-1);
         }
 
-        LOG_INFO("slot with common prefix found", {{
+        LOG_DEBUG("slot with common prefix found", {{
             "slot_id", slot->id,
             "characters", longest
         }});

From 571dc61955ced560a45e9d32b1cd2a52d9803c8c Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sun, 7 Jul 2024 13:03:09 -0400
Subject: [PATCH 13/15] Update llama.cpp submodule to `a8db2a9c` (#5530)

---
 llm/llama.cpp                            |  2 +-
 llm/patches/05-default-pretokenizer.diff | 10 +++++-----
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/llm/llama.cpp b/llm/llama.cpp
index d7fd29ff..a8db2a9c 160000
--- a/llm/llama.cpp
+++ b/llm/llama.cpp
@@ -1 +1 @@
-Subproject commit d7fd29fff16456ce9c3a23fd2d09a66256b05aff
+Subproject commit a8db2a9ce64cd4417f6a312ab61858f17f0f8584
diff --git a/llm/patches/05-default-pretokenizer.diff b/llm/patches/05-default-pretokenizer.diff
index f4eaced7..341a6f59 100644
--- a/llm/patches/05-default-pretokenizer.diff
+++ b/llm/patches/05-default-pretokenizer.diff
@@ -1,11 +1,11 @@
 diff --git a/src/llama.cpp b/src/llama.cpp
-index 73f52435..2b81b4bd 100644
+index 2b9ace28..172640e2 100644
 --- a/src/llama.cpp
 +++ b/src/llama.cpp
-@@ -5092,16 +5092,7 @@ static void llm_load_vocab(
- 
-         // for now, only BPE models have pre-tokenizers
+@@ -5357,16 +5357,7 @@ static void llm_load_vocab(
          if (vocab.type == LLAMA_VOCAB_TYPE_BPE) {
+             vocab.tokenizer_add_space_prefix = false;
+             vocab.tokenizer_clean_spaces = true;
 -            if (tokenizer_pre.empty()) {
 -                LLAMA_LOG_WARN("%s: missing pre-tokenizer type, using: 'default'\n", __func__);
 -                LLAMA_LOG_WARN("%s:                                             \n", __func__);
@@ -20,7 +20,7 @@ index 73f52435..2b81b4bd 100644
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_DEFAULT;
              } else if (
                      tokenizer_pre == "llama3"   ||
-@@ -5164,7 +5155,8 @@ static void llm_load_vocab(
+@@ -5439,7 +5430,8 @@ static void llm_load_vocab(
                  tokenizer_pre == "jais") {
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_JAIS;
              } else {

From d8def1ff9432ef60d1067e5e6dde0d700dd95021 Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sun, 7 Jul 2024 13:41:51 -0400
Subject: [PATCH 14/15] llm: allow gemma 2 to context shift (#5534)

---
 llm/ext_server/server.cpp | 29 +----------------------------
 1 file changed, 1 insertion(+), 28 deletions(-)

diff --git a/llm/ext_server/server.cpp b/llm/ext_server/server.cpp
index 7ae58e38..0ef3956e 100644
--- a/llm/ext_server/server.cpp
+++ b/llm/ext_server/server.cpp
@@ -1688,22 +1688,8 @@ struct llama_server_context
                     }
                     slot.params.n_keep = std::min(slot.n_ctx - 4, slot.params.n_keep);
 
-                    char buf[256];
-                    llama_model_meta_val_str(model, "general.architecture", buf, 256);
-                    bool gemma2 = strcmp(buf, "gemma2") == 0;
-
-                    int32_t truncate_at = slot.n_ctx;
-
-                    // truncate at 2/3 of the context length for gemma2 models
-                    // as they do not support context shifts (from the sliding window implementation).
-                    // this way, prompts that almost fit the context length can still generate a full
-                    // response without a sudden stop from hitting the context limit
-                    if (gemma2) {
-                        truncate_at = 2 * slot.n_ctx / 3;
-                    }
-
                     // if input prompt is too big, truncate it, if group attention self-extend is disabled
-                    if (slot.ga_n == 1 && slot.n_prompt_tokens >= truncate_at)
+                    if (slot.ga_n == 1 && slot.n_prompt_tokens >= slot.n_ctx)
                     {
                         const int n_left = slot.n_ctx - slot.params.n_keep;
                         const int n_shift = n_left / 2;
@@ -1731,19 +1717,6 @@ struct llama_server_context
                         GGML_ASSERT(slot.n_prompt_tokens < slot.n_ctx);
                     }
 
-                    // Models with sliding window attention do not work with context shifts, so
-                    // limit their prediction to the context length
-                    if (gemma2) {
-                        int32_t limit = slot.n_ctx - slot.n_prompt_tokens;
-                        slot.n_predict = limit;
-                        slot.params.n_predict = limit;
-                        LOG_INFO("model does not support sliding window, limiting generation", {
-                            {"n_ctx", slot.n_ctx},
-                            {"n_prompt_tokens", slot.n_prompt_tokens},
-                            {"n_predict", slot.n_predict}
-                        });
-                    }
-
                     if (!slot.params.cache_prompt)
                     {
                         llama_sampling_reset(slot.ctx_sampling);

From 53da2c69654769c0c086af695722e1d9b9ee6ecc Mon Sep 17 00:00:00 2001
From: Jeffrey Morgan <jmorganca@gmail.com>
Date: Sun, 7 Jul 2024 14:32:05 -0400
Subject: [PATCH 15/15] llm: remove ambiguous comment when putting upper limit
 on predictions to avoid infinite generation (#5535)

---
 llm/server.go | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llm/server.go b/llm/server.go
index 206f9e39..54fad92c 100644
--- a/llm/server.go
+++ b/llm/server.go
@@ -699,10 +699,9 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu
 	}
 	defer s.sem.Release(1)
 
-	// only allow maximum 10 "context shifts" to avoid infinite generation
+	// put an upper limit on num_predict to avoid the model running on forever
 	if req.Options.NumPredict < 0 || req.Options.NumPredict > 10*s.options.NumCtx {
 		req.Options.NumPredict = 10 * s.options.NumCtx
-		slog.Debug("setting token limit to 10x num_ctx", "num_ctx", s.options.NumCtx, "num_predict", req.Options.NumPredict)
 	}
 
 	request := map[string]any{