From d632e23fbae005ca3522c0e3b2cf31a2499e4c03 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 20 Sep 2024 13:09:38 -0700 Subject: [PATCH 01/13] Add Windows arm64 support to official builds (#5712) * Unified arm/x86 windows installer This adjusts the installer payloads to be architecture aware so we can cary both amd64 and arm64 binaries in the installer, and install only the applicable architecture at install time. * Include arm64 in official windows build * Harden schedule test for slow windows timers This test seems to be a bit flaky on windows, so give it more time to converge --- .github/workflows/release.yaml | 131 ++++++++++++++++++++++++++++++++- app/ollama.iss | 61 +++++++++++++-- docs/development.md | 19 +++++ llm/ext_server/CMakeLists.txt | 1 + llm/generate/gen_windows.ps1 | 66 +++++++++++++---- llm/llm.go | 2 +- scripts/build_windows.ps1 | 70 +++++++++++++----- server/sched_test.go | 4 +- 8 files changed, 310 insertions(+), 44 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 0748b7c1..e5ac2833 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -274,7 +274,132 @@ jobs: path: dist/deps/* - # Import the prior generation steps and build the final windows assets + # windows arm64 generate, go build, and zip file (no installer) + # Output of this build is aggregated into the final x86 build + # for a unified windows installer + windows-arm64: + runs-on: windows-arm64 + environment: release + env: + KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} + steps: + - name: Set Version + run: | + $ver=${env:GITHUB_REF_NAME}.trim("v") + write-host VERSION=$ver | Out-File -FilePath ${env:GITHUB_ENV} -Encoding utf8 -Append + - uses: 'google-github-actions/auth@v2' + with: + project_id: 'ollama' + credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' + - run: echo "${{ vars.OLLAMA_CERT }}" | Out-File -FilePath ollama_inc.crt -Encoding utf8 + - name: install Windows SDK 8.1 to get signtool + run: | + $ErrorActionPreference = "Stop" + write-host "downloading SDK" + Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" + Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait + write-host "Win SDK 8.1 installed" + gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' + - name: install signing plugin + run: | + $ErrorActionPreference = "Stop" + write-host "downloading plugin" + Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" + Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ + write-host "Installing plugin" + & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet + write-host "plugin installed" + # The current Windows arm64 beta image has effectively zero dev tools installed... 
+ - name: Install git and gzip + run: | + Set-ExecutionPolicy Bypass -Scope Process -Force + [System.Net.ServicePointManager]::SecurityProtocol = [System.Net.ServicePointManager]::SecurityProtocol -bor 3072 + iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) + choco install -y --no-progress git gzip + echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Install Visual Studio 2022 + run: | + $components = @( + "Microsoft.VisualStudio.Component.CoreEditor", + "Microsoft.VisualStudio.Workload.CoreEditor", + "Microsoft.VisualStudio.Component.Roslyn.Compiler", + "Microsoft.Component.MSBuild", + "Microsoft.VisualStudio.Component.TextTemplating", + "Microsoft.VisualStudio.Component.Debugger.JustInTime", + "Microsoft.VisualStudio.Component.VC.CoreIde", + "Microsoft.VisualStudio.Component.VC.Tools.x86.x64", + "Microsoft.VisualStudio.Component.Windows11SDK.22621", + "Microsoft.VisualStudio.Component.VC.Tools.ARM64EC", + "Microsoft.VisualStudio.Component.VC.Tools.ARM64", + "Microsoft.VisualStudio.Component.VC.ATL", + "Microsoft.VisualStudio.Component.VC.ATL.ARM64", + "Microsoft.VisualStudio.Component.Graphics", + "Microsoft.VisualStudio.Component.VC.Redist.14.Latest", + "Microsoft.VisualStudio.ComponentGroup.NativeDesktop.Core", + "Microsoft.VisualStudio.Component.Windows11Sdk.WindowsPerformanceToolkit", + "Microsoft.VisualStudio.Component.CppBuildInsights", + "Microsoft.VisualStudio.Component.VC.DiagnosticTools", + "Microsoft.VisualStudio.ComponentGroup.WebToolsExtensions.CMake", + "Microsoft.VisualStudio.Component.VC.CMake.Project", + "Microsoft.VisualStudio.Component.VC.ASAN", + "Microsoft.VisualStudio.Component.Vcpkg", + "Microsoft.VisualStudio.Workload.NativeDesktop" + ) + $config = @{ + "version" = "1.0" + "components" = $components + "extensions" = @() + } + $configPath = "${env:RUNNER_TEMP}\vsconfig" + $config | ConvertTo-Json | Out-File -FilePath $configPath + $bootstrapperFilePath = "${env:RUNNER_TEMP}\vs_community.exe" + write-host "Downloading Visual Studio 2022" + Invoke-WebRequest -Uri "https://aka.ms/vs/17/release/vs_community.exe" -outfile $bootstrapperFilePath + $bootstrapperArgumentList = ('/c', $bootstrapperFilePath, '--config', $configPath, '--quiet', '--wait' ) + write-host "Installing Visual Studio 2022" + $process = Start-Process -FilePath cmd.exe -ArgumentList $bootstrapperArgumentList -Wait -PassThru + $exitCode = $process.ExitCode + write-host $exitCode + # pacman in mingw/msys2 is ~broken on windows arm right now - hangs consistently during attempts to install + # so we'll use this alternative GCC binary + - name: Install llvm-mingw GCC + run: | + $gcc_url="https://github.com/mstorsjo/llvm-mingw/releases/download/20240619/llvm-mingw-20240619-ucrt-aarch64.zip" + write-host "Downloading llvm-mingw" + Invoke-WebRequest -Uri "${gcc_url}" -OutFile "${env:RUNNER_TEMP}\gcc.zip" + write-host "Unpacking llvm-mingw" + expand-archive -path "${env:RUNNER_TEMP}\gcc.zip" -destinationpath "c:\" + mv c:\llvm-mingw-* c:\llvm-mingw + echo "c:\llvm-mingw\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + - name: Verify GCC + run: | + echo $env:PATH + gcc --version + - uses: actions/checkout@v4 + - uses: actions/setup-go@v5 + with: + go-version-file: go.mod + cache: true + - run: go get ./... 
+ - run: | + $gopath=(get-command go).source | split-path -parent + $gccpath=(get-command gcc).source | split-path -parent + & "C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\Tools\Launch-VsDevShell.ps1" + cd $env:GITHUB_WORKSPACE + $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" + $env:PATH="$gopath;$gccpath;$env:PATH;C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin" + echo $env:PATH + .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip + name: 'Windows Build' + - uses: actions/upload-artifact@v4 + with: + name: windows-arm64 + path: | + dist/windows-arm64/** + dist/windows-arm64-app.exe + dist/ollama-windows-arm64.zip + + # Import the prior generation steps plus the full arm64 build, and build the final windows assets build-windows: environment: release runs-on: windows @@ -282,6 +407,7 @@ jobs: - generate-windows-cuda - generate-windows-rocm - generate-windows-cpu + - windows-arm64 env: KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} steps: @@ -339,6 +465,9 @@ jobs: - uses: actions/download-artifact@v4 with: name: generate-windows-rocm + - uses: actions/download-artifact@v4 + with: + name: windows-arm64 - run: dir build - run: | $gopath=(get-command go).source | split-path -parent diff --git a/app/ollama.iss b/app/ollama.iss index 34cc5c4c..63b5bdb0 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -28,8 +28,8 @@ AppPublisher={#MyAppPublisher} AppPublisherURL={#MyAppURL} AppSupportURL={#MyAppURL} AppUpdatesURL={#MyAppURL} -ArchitecturesAllowed=x64 arm64 -ArchitecturesInstallIn64BitMode=x64 arm64 +ArchitecturesAllowed=x64compatible arm64 +ArchitecturesInstallIn64BitMode=x64compatible arm64 DefaultDirName={localappdata}\Programs\{#MyAppName} DefaultGroupName={#MyAppName} DisableProgramGroupPage=yes @@ -48,6 +48,7 @@ OutputDir=..\dist\ SetupLogging=yes CloseApplications=yes RestartApplications=no +RestartIfNeededByRun=no ; https://jrsoftware.org/ishelp/index.php?topic=setup_wizardimagefile WizardSmallImageFile=.\assets\setup.bmp @@ -86,12 +87,21 @@ Name: "english"; MessagesFile: "compiler:Default.isl" DialogFontSize=12 [Files] -Source: ".\app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ; Flags: ignoreversion 64bit -Source: "..\ollama.exe"; DestDir: "{app}"; Flags: ignoreversion 64bit -Source: "..\dist\windows-{#ARCH}\lib\ollama\runners\*"; DestDir: "{app}\lib\ollama\runners"; Flags: ignoreversion 64bit recursesubdirs +#if DirExists("..\dist\windows-amd64") +Source: "..\dist\windows-amd64-app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ;Check: not IsArm64(); Flags: ignoreversion 64bit +Source: "..\dist\windows-amd64\ollama.exe"; DestDir: "{app}"; Check: not IsArm64(); Flags: ignoreversion 64bit +Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Check: not IsArm64(); Flags: ignoreversion 64bit recursesubdirs +#endif + +#if DirExists("..\dist\windows-arm64") +Source: "..\dist\windows-arm64\vc_redist.arm64.exe"; DestDir: "{tmp}"; Check: IsArm64() and vc_redist_needed(); Flags: deleteafterinstall +Source: "..\dist\windows-arm64-app.exe"; DestDir: "{app}"; DestName: "{#MyAppExeName}" ;Check: IsArm64(); Flags: ignoreversion 64bit +Source: "..\dist\windows-arm64\ollama.exe"; DestDir: "{app}"; Check: IsArm64(); Flags: ignoreversion 64bit +Source: "..\dist\windows-arm64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Check: IsArm64(); Flags: ignoreversion 64bit recursesubdirs +#endif + Source: "..\dist\ollama_welcome.ps1"; DestDir: "{app}"; Flags: ignoreversion Source: 
".\assets\app.ico"; DestDir: "{app}"; Flags: ignoreversion -Source: "..\dist\windows-amd64\lib\ollama\*"; DestDir: "{app}\lib\ollama\"; Flags: ignoreversion recursesubdirs [Icons] Name: "{group}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" @@ -99,6 +109,9 @@ Name: "{userstartup}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilen Name: "{userprograms}\{#MyAppName}"; Filename: "{app}\{#MyAppExeName}"; IconFilename: "{app}\app.ico" [Run] +#if DirExists("..\dist\windows-arm64") +Filename: "{tmp}\vc_redist.arm64.exe"; Parameters: "/install /passive /norestart"; Check: IsArm64() and vc_redist_needed(); StatusMsg: "Installing VC++ Redistributables..."; Flags: waituntilterminated +#endif Filename: "{cmd}"; Parameters: "/C set PATH={app};%PATH% & ""{app}\{#MyAppExeName}"""; Flags: postinstall nowait runhidden [UninstallRun] @@ -154,3 +167,39 @@ begin { Pos() returns 0 if not found } Result := Pos(';' + ExpandConstant(Param) + ';', ';' + OrigPath + ';') = 0; end; + +{ --- VC Runtime libraries discovery code - Only install vc_redist if it isn't already installed ----- } +const VCRTL_MIN_V1 = 14; +const VCRTL_MIN_V2 = 40; +const VCRTL_MIN_V3 = 33807; +const VCRTL_MIN_V4 = 0; + + // check if the minimum required vc redist is installed (by looking the registry) +function vc_redist_needed (): Boolean; +var + sRegKey: string; + v1: Cardinal; + v2: Cardinal; + v3: Cardinal; + v4: Cardinal; +begin + sRegKey := 'SOFTWARE\WOW6432Node\Microsoft\VisualStudio\14.0\VC\Runtimes\arm64'; + if (RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'Major', v1) and + RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'Minor', v2) and + RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'Bld', v3) and + RegQueryDWordValue (HKEY_LOCAL_MACHINE, sRegKey, 'RBld', v4)) then + begin + Log ('VC Redist version: ' + IntToStr (v1) + + '.' + IntToStr (v2) + '.' + IntToStr (v3) + + '.' + IntToStr (v4)); + { Version info was found. Return true if later or equal to our + minimal required version RTL_MIN_Vx } + Result := not ( + (v1 > VCRTL_MIN_V1) or ((v1 = VCRTL_MIN_V1) and + ((v2 > VCRTL_MIN_V2) or ((v2 = VCRTL_MIN_V2) and + ((v3 > VCRTL_MIN_V3) or ((v3 = VCRTL_MIN_V3) and + (v4 >= VCRTL_MIN_V4))))))); + end + else + Result := TRUE; +end; diff --git a/docs/development.md b/docs/development.md index cd6c41af..e67689ab 100644 --- a/docs/development.md +++ b/docs/development.md @@ -148,3 +148,22 @@ In addition to the common Windows development tools described above, install AMD - [Strawberry Perl](https://strawberryperl.com/) Lastly, add `ninja.exe` included with MSVC to the system path (e.g. `C:\Program Files (x86)\Microsoft Visual Studio\2019\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\Ninja`). + +#### Windows arm64 + +The default `Developer PowerShell for VS 2022` may default to x86 which is not what you want. To ensure you get an arm64 development environment, start a plain PowerShell terminal and run: + +```powershell +import-module 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community\\Common7\\Tools\\Microsoft.VisualStudio.DevShell.dll' +Enter-VsDevShell -Arch arm64 -vsinstallpath 'C:\\Program Files\\Microsoft Visual Studio\\2022\\Community' -skipautomaticlocation +``` + +You can confirm with `write-host $env:VSCMD_ARG_TGT_ARCH` + +Follow the instructions at https://www.msys2.org/wiki/arm64/ to set up an arm64 msys2 environment. 
Ollama requires gcc and mingw32-make to compile, which is not currently available on Windows arm64, but a gcc compatibility adapter is available via `mingw-w64-clang-aarch64-gcc-compat`. At a minimum you will need to install the following: + +``` +pacman -S mingw-w64-clang-aarch64-clang mingw-w64-clang-aarch64-gcc-compat mingw-w64-clang-aarch64-make make +``` + +You will need to ensure your PATH includes go, cmake, gcc and clang mingw32-make to build ollama from source. (typically `C:\msys64\clangarm64\bin\`) \ No newline at end of file diff --git a/llm/ext_server/CMakeLists.txt b/llm/ext_server/CMakeLists.txt index 88c8b03d..51730245 100644 --- a/llm/ext_server/CMakeLists.txt +++ b/llm/ext_server/CMakeLists.txt @@ -10,5 +10,6 @@ target_compile_definitions(${TARGET} PRIVATE target_link_libraries(${TARGET} PRIVATE ggml llama common llava ${CMAKE_THREAD_LIBS_INIT} ${LLAMA_SERVER_LDFLAGS}) if (WIN32) TARGET_LINK_LIBRARIES(${TARGET} PRIVATE ws2_32) + target_link_options(${TARGET} PRIVATE -municode -Wl,/subsystem:console) endif() target_compile_features(${TARGET} PRIVATE cxx_std_11) \ No newline at end of file diff --git a/llm/generate/gen_windows.ps1 b/llm/generate/gen_windows.ps1 index 22538851..29ff5ff6 100644 --- a/llm/generate/gen_windows.ps1 +++ b/llm/generate/gen_windows.ps1 @@ -19,6 +19,19 @@ function amdGPUs { function init_vars { + write-host "Checking for cmake..." + get-command cmake + write-host "Checking for ninja..." + $d=(get-command -ea 'silentlycontinue' ninja).path + if ($null -eq $d) { + $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation + $matches=(gci -path $MSVC_INSTALL -r -fi ninja.exe) + if ($matches.count -eq 0) { + throw "Unable to locate ninja" + } + $ninjaDir=($matches[0].FullName | split-path -parent) + $env:PATH="$env:PATH;$ninjaDir" + } if (!$script:SRC_DIR) { $script:SRC_DIR = $(resolve-path "..\..\") } @@ -145,7 +158,7 @@ function cleanup { } # Checkout each file - foreach ($file in $filePaths) { + foreach ($file in $filePaths) { git -C "${script:llamacppDir}" checkout $file } git -C "${script:llamacppDir}" checkout CMakeLists.txt @@ -162,12 +175,12 @@ function build_static() { if ((-not "${env:OLLAMA_SKIP_STATIC_GENERATE}") -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "static"))) { # GCC build for direct linking into the Go binary init_vars - # cmake will silently fallback to msvc compilers if mingw isn't in the path, so detect and fail fast - # as we need this to be compiled by gcc for golang to be able to link with itx - write-host "Checking for MinGW..." - # error action ensures we exit on failure - get-command gcc - get-command mingw32-make + + # cmake will silently fallback to msvc compilers if gcc isn't in the path, so detect and fail fast + # as we need this to be compiled by gcc for golang to be able to link with it + write-host "Checking for gcc..." 
+ get-command gcc + get-command mingw32-make $oldTargets = $script:cmakeTargets $script:cmakeTargets = @("llama", "ggml") $script:cmakeDefs = @( @@ -191,11 +204,10 @@ function build_static() { } } -function build_cpu($gen_arch) { +function build_cpu_x64 { if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) { - # remaining llama.cpp builds use MSVC init_vars - $script:cmakeDefs = $script:commonCpuDefs + @("-A", $gen_arch, "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs + $script:cmakeDefs = $script:commonCpuDefs + @("-A", "x64", "-DGGML_AVX=off", "-DGGML_AVX2=off", "-DGGML_AVX512=off", "-DGGML_FMA=off", "-DGGML_F16C=off") + $script:cmakeDefs $script:buildDir="../build/windows/${script:ARCH}/cpu" $script:distDir="$script:DIST_BASE\cpu" write-host "Building LCD CPU" @@ -207,6 +219,32 @@ function build_cpu($gen_arch) { } } +function build_cpu_arm64 { + if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu"))) { + init_vars + write-host "Checking for clang..." + get-command clang + $env:CFLAGS="-march=armv8.7-a -fvectorize -ffp-model=fast -fno-finite-math-only" + $env:CXXFLAGS="$env:CFLAGS" + $env:LDFLAGS="-static-libstdc++" + $script:cmakeDefs = $script:commonCpuDefs + @( + "-DCMAKE_VERBOSE_MAKEFILE=on", + "-DCMAKE_C_COMPILER=clang.exe", + "-DCMAKE_CXX_COMPILER=clang++.exe", + "-DMSVC_RUNTIME_LIBRARY=MultiThreaded" + ) + $script:cmakeDefs + $script:buildDir="../build/windows/${script:ARCH}/cpu" + $script:distDir="$script:DIST_BASE\cpu" + write-host "Building LCD CPU" + build + sign + install + } else { + write-host "Skipping CPU generation step as requested" + } +} + + function build_cpu_avx() { if ((-not "${env:OLLAMA_SKIP_CPU_GENERATE}" ) -and ((-not "${env:OLLAMA_CPU_TARGET}") -or ("${env:OLLAMA_CPU_TARGET}" -eq "cpu_avx"))) { init_vars @@ -331,7 +369,7 @@ function build_rocm() { $script:buildDir="../build/windows/${script:ARCH}/rocm$script:ROCM_VARIANT" $script:distDir="$script:DIST_BASE\rocm$script:ROCM_VARIANT" $script:cmakeDefs += @( - "-G", "Ninja", + "-G", "Ninja", "-DCMAKE_C_COMPILER=clang.exe", "-DCMAKE_CXX_COMPILER=clang++.exe", "-DGGML_HIPBLAS=on", @@ -380,9 +418,9 @@ if ($($args.count) -eq 0) { apply_patches build_static if ($script:ARCH -eq "arm64") { - build_cpu("ARM64") + build_cpu_arm64 } else { # amd64 - build_cpu("x64") + build_cpu_x64 build_cpu_avx build_cpu_avx2 build_cuda @@ -396,5 +434,5 @@ if ($($args.count) -eq 0) { for ( $i = 0; $i -lt $args.count; $i++ ) { write-host "performing $($args[$i])" & $($args[$i]) - } + } } \ No newline at end of file diff --git a/llm/llm.go b/llm/llm.go index 6bb6591d..6c695889 100644 --- a/llm/llm.go +++ b/llm/llm.go @@ -5,7 +5,7 @@ package llm // #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/build/darwin/arm64_static -L${SRCDIR}/build/darwin/arm64_static/src -L${SRCDIR}/build/darwin/arm64_static/ggml/src -framework Accelerate -framework Metal // #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/build/darwin/x86_64_static -L${SRCDIR}/build/darwin/x86_64_static/src -L${SRCDIR}/build/darwin/x86_64_static/ggml/src // #cgo windows,amd64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/amd64_static -L${SRCDIR}/build/windows/amd64_static/src -L${SRCDIR}/build/windows/amd64_static/ggml/src -// #cgo windows,arm64 LDFLAGS: -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static 
-L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src +// #cgo windows,arm64 LDFLAGS: -lllama -lggml -static-libstdc++ -static-libgcc -static -L${SRCDIR}/build/windows/arm64_static -L${SRCDIR}/build/windows/arm64_static/src -L${SRCDIR}/build/windows/arm64_static/ggml/src // #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/build/linux/x86_64_static -L${SRCDIR}/build/linux/x86_64_static/src -L${SRCDIR}/build/linux/x86_64_static/ggml/src // #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/build/linux/arm64_static -L${SRCDIR}/build/linux/arm64_static/src -L${SRCDIR}/build/linux/arm64_static/ggml/src // #include diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index eb8570c8..ff74a500 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -7,12 +7,12 @@ $ErrorActionPreference = "Stop" function checkEnv() { - $script:ARCH = $Env:PROCESSOR_ARCHITECTURE.ToLower() - $script:TARGET_ARCH=$Env:PROCESSOR_ARCHITECTURE.ToLower() + $script:ARCH = (([System.Runtime.InteropServices.RuntimeInformation]::OSArchitecture).ToString().ToLower()).Replace("x64", "amd64") + $script:TARGET_ARCH=$script:ARCH Write-host "Building for ${script:TARGET_ARCH}" write-host "Locating required tools and paths" $script:SRC_DIR=$PWD - if (!$env:VCToolsRedistDir) { + if ($null -eq $env:VCToolsRedistDir) { $MSVC_INSTALL=(Get-CimInstance MSFT_VSInstance -Namespace root/cimv2/vs)[0].InstallLocation $env:VCToolsRedistDir=(get-item "${MSVC_INSTALL}\VC\Redist\MSVC\*")[0] } @@ -28,9 +28,12 @@ function checkEnv() { $script:CUDA_DIRS=$cudaList } - $script:INNO_SETUP_DIR=(get-item "C:\Program Files*\Inno Setup*\")[0] + $inoSetup=(get-item "C:\Program Files*\Inno Setup*\") + if ($inoSetup.length -gt 0) { + $script:INNO_SETUP_DIR=$inoSetup[0] + } - $script:DEPS_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}" + $script:DIST_DIR="${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}" $env:CGO_ENABLED="1" Write-Output "Checking version" if (!$env:VERSION) { @@ -130,7 +133,7 @@ function buildApp() { write-host "Building Ollama App" cd "${script:SRC_DIR}\app" & windres -l 0 -o ollama.syso ollama.rc - & go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" . + & go build -trimpath -ldflags "-s -w -H windowsgui -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" -o "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe" . 
if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ("${env:KEY_CONTAINER}") { & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` @@ -140,24 +143,40 @@ function buildApp() { } function gatherDependencies() { - write-host "Gathering runtime dependencies" + if ($null -eq $env:VCToolsRedistDir) { + write-error "Unable to locate VC Install location - please use a Developer shell" + exit 1 + } + write-host "Gathering runtime dependencies from $env:VCToolsRedistDir" cd "${script:SRC_DIR}" - md "${script:DEPS_DIR}\lib\ollama" -ea 0 > $null + md "${script:DIST_DIR}\lib\ollama" -ea 0 > $null # TODO - this varies based on host build system and MSVC version - drive from dumpbin output # currently works for Win11 + MSVC 2019 + Cuda V11 - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DEPS_DIR}\lib\ollama\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DEPS_DIR}\lib\ollama\" - cp "${env:VCToolsRedistDir}\x64\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DEPS_DIR}\lib\ollama\" - foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { - cp "$env:VCToolsRedistDir\..\..\..\Tools\Llvm\x64\bin\api-ms-win-crt-${part}*.dll" "${script:DEPS_DIR}\lib\ollama\" + if ($script:TARGET_ARCH -eq "amd64") { + $depArch="x64" + } else { + $depArch=$script:TARGET_ARCH + } + if ($depArch -eq "amd64") { + cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\msvcp140*.dll" "${script:DIST_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140.dll" "${script:DIST_DIR}\lib\ollama\" + cp "${env:VCToolsRedistDir}\${depArch}\Microsoft.VC*.CRT\vcruntime140_1.dll" "${script:DIST_DIR}\lib\ollama\" + $llvmCrtDir="$env:VCToolsRedistDir\..\..\..\Tools\Llvm\${depArch}\bin" + foreach ($part in $("runtime", "stdio", "filesystem", "math", "convert", "heap", "string", "time", "locale", "environment")) { + write-host "cp ${llvmCrtDir}\api-ms-win-crt-${part}*.dll ${script:DIST_DIR}\lib\ollama\" + cp "${llvmCrtDir}\api-ms-win-crt-${part}*.dll" "${script:DIST_DIR}\lib\ollama\" + } + } else { + # Carying the dll's doesn't seem to work, so use the redist installer + copy-item -path "${env:VCToolsRedistDir}\vc_redist.arm64.exe" -destination "${script:DIST_DIR}" -verbose } cp "${script:SRC_DIR}\app\ollama_welcome.ps1" "${script:SRC_DIR}\dist\" if ("${env:KEY_CONTAINER}") { write-host "about to sign" - foreach ($file in (get-childitem "${script:DEPS_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ + foreach ($file in (get-childitem "${script:DIST_DIR}\lib\ollama\cu*.dll") + @("${script:SRC_DIR}\dist\ollama_welcome.ps1")){ write-host "signing $file" & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} $file @@ -167,6 +186,10 @@ function gatherDependencies() { } function buildInstaller() { + if ($null -eq ${script:INNO_SETUP_DIR}) { + write-host "Inno Setup not present, skipping installer build" + return + } write-host "Building Ollama Installer" cd "${script:SRC_DIR}\app" $env:PKG_VERSION=$script:PKG_VERSION @@ -183,13 +206,20 @@ function distZip() { Compress-Archive -Path "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}\*" -DestinationPath "${script:SRC_DIR}\dist\ollama-windows-${script:TARGET_ARCH}.zip" -Force } +checkEnv try { - checkEnv - buildOllama - buildApp - gatherDependencies - 
buildInstaller - distZip + if ($($args.count) -eq 0) { + buildOllama + buildApp + gatherDependencies + buildInstaller + distZip + } else { + for ( $i = 0; $i -lt $args.count; $i++ ) { + write-host "performing $($args[$i])" + & $($args[$i]) + } + } } catch { write-host "Build Failed" write-host $_ diff --git a/server/sched_test.go b/server/sched_test.go index be32065a..fe5647c5 100644 --- a/server/sched_test.go +++ b/server/sched_test.go @@ -354,7 +354,7 @@ func TestRequestsMultipleLoadedModels(t *testing.T) { } func TestGetRunner(t *testing.T) { - ctx, done := context.WithTimeout(context.Background(), 100*time.Millisecond) + ctx, done := context.WithTimeout(context.Background(), 200*time.Millisecond) defer done() a := newScenarioRequest(t, ctx, "ollama-model-1a", 10, &api.Duration{Duration: 2 * time.Millisecond}) @@ -395,7 +395,7 @@ func TestGetRunner(t *testing.T) { slog.Info("c") successCh1c, errCh1c := s.GetRunner(c.ctx, c.req.model, c.req.opts, c.req.sessionDuration) // Starts in pending channel, then should be quickly processsed to return an error - time.Sleep(20 * time.Millisecond) // Long enough for the "a" model to expire and unload + time.Sleep(50 * time.Millisecond) // Long enough for the "a" model to expire and unload require.Empty(t, successCh1c) s.loadedMu.Lock() require.Empty(t, s.loaded) From f5ff917b1dd24cddd45d19f6bb9ba335bf815e83 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 20 Sep 2024 14:20:57 -0700 Subject: [PATCH 02/13] CI: adjust step ordering for win arm to match x64 (#6895) --- .github/workflows/release.yaml | 52 +++++++++++++++++----------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index e5ac2833..8fbe475f 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -283,32 +283,6 @@ jobs: env: KEY_CONTAINER: ${{ vars.KEY_CONTAINER }} steps: - - name: Set Version - run: | - $ver=${env:GITHUB_REF_NAME}.trim("v") - write-host VERSION=$ver | Out-File -FilePath ${env:GITHUB_ENV} -Encoding utf8 -Append - - uses: 'google-github-actions/auth@v2' - with: - project_id: 'ollama' - credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' - - run: echo "${{ vars.OLLAMA_CERT }}" | Out-File -FilePath ollama_inc.crt -Encoding utf8 - - name: install Windows SDK 8.1 to get signtool - run: | - $ErrorActionPreference = "Stop" - write-host "downloading SDK" - Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" - Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait - write-host "Win SDK 8.1 installed" - gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' - - name: install signing plugin - run: | - $ErrorActionPreference = "Stop" - write-host "downloading plugin" - Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" - Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ - write-host "Installing plugin" - & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet - write-host "plugin installed" # The current Windows arm64 beta image has effectively zero dev tools installed... 
- name: Install git and gzip run: | @@ -376,6 +350,32 @@ jobs: echo $env:PATH gcc --version - uses: actions/checkout@v4 + - name: Set Version + run: | + $ver=${env:GITHUB_REF_NAME}.trim("v") + write-host VERSION=$ver | Out-File -FilePath ${env:GITHUB_ENV} -Encoding utf8 -Append + - uses: 'google-github-actions/auth@v2' + with: + project_id: 'ollama' + credentials_json: '${{ secrets.GOOGLE_SIGNING_CREDENTIALS }}' + - run: echo "${{ vars.OLLAMA_CERT }}" | Out-File -FilePath ollama_inc.crt -Encoding utf8 + - name: install Windows SDK 8.1 to get signtool + run: | + $ErrorActionPreference = "Stop" + write-host "downloading SDK" + Invoke-WebRequest -Uri "https://go.microsoft.com/fwlink/p/?LinkId=323507" -OutFile "${env:RUNNER_TEMP}\sdksetup.exe" + Start-Process "${env:RUNNER_TEMP}\sdksetup.exe" -ArgumentList @("/q") -NoNewWindow -Wait + write-host "Win SDK 8.1 installed" + gci -path 'C:\Program Files (x86)\Windows Kits\' -r -fi 'signtool.exe' + - name: install signing plugin + run: | + $ErrorActionPreference = "Stop" + write-host "downloading plugin" + Invoke-WebRequest -Uri "https://github.com/GoogleCloudPlatform/kms-integrations/releases/download/cng-v1.0/kmscng-1.0-windows-amd64.zip" -OutFile "${env:RUNNER_TEMP}\plugin.zip" + Expand-Archive -Path "${env:RUNNER_TEMP}\plugin.zip" -DestinationPath ${env:RUNNER_TEMP}\plugin\ + write-host "Installing plugin" + & "${env:RUNNER_TEMP}\plugin\*\kmscng.msi" /quiet + write-host "plugin installed" - uses: actions/setup-go@v5 with: go-version-file: go.mod From 616c5eafee3f9ddabe1d9c59248a0a6fdbe7796f Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 20 Sep 2024 16:58:56 -0700 Subject: [PATCH 03/13] CI: win arm adjustments (#6898) --- .github/workflows/release.yaml | 2 ++ scripts/build_windows.ps1 | 25 ++++++++++++++++++------- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 8fbe475f..82d4a28a 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -291,6 +291,7 @@ jobs: iex ((New-Object System.Net.WebClient).DownloadString('https://community.chocolatey.org/install.ps1')) choco install -y --no-progress git gzip echo "C:\Program Files\Git\cmd" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append + echo "C:\ProgramData\chocolatey\bin" | Out-File -FilePath $env:GITHUB_PATH -Encoding utf8 -Append - name: Install Visual Studio 2022 run: | $components = @( @@ -389,6 +390,7 @@ jobs: $env:CMAKE_SYSTEM_VERSION="10.0.22621.0" $env:PATH="$gopath;$gccpath;$env:PATH;C:\Program Files\Microsoft Visual Studio\2022\Community\Common7\IDE\CommonExtensions\Microsoft\CMake\CMake\bin" echo $env:PATH + $env:ARCH="arm64" .\scripts\build_windows.ps1 buildOllama buildApp gatherDependencies distZip name: 'Windows Build' - uses: actions/upload-artifact@v4 diff --git a/scripts/build_windows.ps1 b/scripts/build_windows.ps1 index ff74a500..b9508341 100644 --- a/scripts/build_windows.ps1 +++ b/scripts/build_windows.ps1 @@ -7,7 +7,17 @@ $ErrorActionPreference = "Stop" function checkEnv() { - $script:ARCH = (([System.Runtime.InteropServices.RuntimeInformation]::OSArchitecture).ToString().ToLower()).Replace("x64", "amd64") + if ($null -ne $env:ARCH ) { + $script:ARCH = $env:ARCH + } else { + $arch=([System.Runtime.InteropServices.RuntimeInformation]::OSArchitecture) + if ($null -ne $arch) { + $script:ARCH = ($arch.ToString().ToLower()).Replace("x64", "amd64") + } else { + write-host "WARNING: old powershell detected, assuming amd64 architecture - set `$env:ARCH to 
override" + $script:ARCH="amd64" + } + } $script:TARGET_ARCH=$script:ARCH Write-host "Building for ${script:TARGET_ARCH}" write-host "Locating required tools and paths" @@ -70,7 +80,6 @@ function checkEnv() { function buildOllama() { - write-host "Building ollama CLI" if ($null -eq ${env:OLLAMA_SKIP_GENERATE}) { Remove-Item -ea 0 -recurse -force -path "${script:SRC_DIR}\dist\windows-${script:ARCH}" @@ -78,15 +87,16 @@ function buildOllama() { # which targets to build # Start by skipping CUDA to build everything else - pwsh -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... } + write-host "Building ollama runners" + powershell -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="1"; & go generate ./... } if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} # Then skip everyhting else and build all the CUDA variants foreach ($env:CUDA_LIB_DIR in $script:CUDA_DIRS) { - write-host "Building CUDA ${env:CUDA_LIB_DIR}" + write-host "Building CUDA ${env:CUDA_LIB_DIR} runner" if ($env:CUDA_LIB_DIR.Contains("v12")) { - pwsh -Command { + powershell -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="" $env:OLLAMA_SKIP_STATIC_GENERATE="1" $env:OLLAMA_SKIP_CPU_GENERATE="1" @@ -99,7 +109,7 @@ function buildOllama() { & go generate ./... } } else { - pwsh -Command { + powershell -Command { $env:OLLAMA_SKIP_CUDA_GENERATE="" $env:OLLAMA_SKIP_STATIC_GENERATE="1" $env:OLLAMA_SKIP_CPU_GENERATE="1" @@ -118,6 +128,7 @@ function buildOllama() { } else { write-host "Skipping generate step with OLLAMA_SKIP_GENERATE set" } + write-host "Building ollama CLI" & go build -trimpath -ldflags "-s -w -X=github.com/ollama/ollama/version.Version=$script:VERSION -X=github.com/ollama/ollama/server.mode=release" . if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ("${env:KEY_CONTAINER}") { @@ -137,7 +148,7 @@ function buildApp() { if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} if ("${env:KEY_CONTAINER}") { & "${script:SignTool}" sign /v /fd sha256 /t http://timestamp.digicert.com /f "${script:OLLAMA_CERT}" ` - /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} app.exe + /csp "Google Cloud KMS Provider" /kc ${env:KEY_CONTAINER} "${script:SRC_DIR}\dist\windows-${script:TARGET_ARCH}-app.exe" if ($LASTEXITCODE -ne 0) { exit($LASTEXITCODE)} } } From 2a038c1d7e8351b386dbf6944e63f1053cf9b9b6 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Fri, 20 Sep 2024 19:16:18 -0700 Subject: [PATCH 04/13] CI: win arm artifact dist dir (#6900) The upload artifact is missing the dist prefix since all payloads are in the same directory, so restore the prefix on download. --- .github/workflows/release.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 82d4a28a..4090f206 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -470,6 +470,7 @@ jobs: - uses: actions/download-artifact@v4 with: name: windows-arm64 + path: dist - run: dir build - run: | $gopath=(get-command go).source | split-path -parent From 6c2eb73a70716ca48c1b58b6ca89086a15c40d03 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 21 Sep 2024 16:28:29 -0700 Subject: [PATCH 05/13] Fix missing dep path on windows CPU runners (#6884) GPUs handled the dependency path properly, but CPU runners didn't which results in missing vc redist libraries on systems where the user didn't already have it installed from some other app. 
--- gpu/gpu.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/gpu/gpu.go b/gpu/gpu.go index 1fa941dd..db0e247b 100644 --- a/gpu/gpu.go +++ b/gpu/gpu.go @@ -205,13 +205,16 @@ func GetGPUInfo() GpuInfoList { if err != nil { slog.Warn("error looking up system memory", "error", err) } + depPath := LibraryDir() + cpus = []CPUInfo{ { GpuInfo: GpuInfo{ - memInfo: mem, - Library: "cpu", - Variant: cpuCapability.String(), - ID: "0", + memInfo: mem, + Library: "cpu", + Variant: cpuCapability.String(), + ID: "0", + DependencyPath: depPath, }, }, } @@ -224,8 +227,6 @@ func GetGPUInfo() GpuInfoList { return GpuInfoList{cpus[0].GpuInfo} } - depPath := LibraryDir() - // Load ALL libraries cHandles = initCudaHandles() From dbba73469d98fc45d255dd526dedb201548d823d Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Sat, 21 Sep 2024 16:54:49 -0700 Subject: [PATCH 06/13] runner: Set windows above normal priority (#6905) When running the subprocess as a background service windows may throttle, which can lead to thrashing and very poor token rate. --- llm/llm_windows.go | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/llm/llm_windows.go b/llm/llm_windows.go index 74a735c2..915355a2 100644 --- a/llm/llm_windows.go +++ b/llm/llm_windows.go @@ -4,7 +4,10 @@ import ( "syscall" ) -const CREATE_DEFAULT_ERROR_MODE = 0x04000000 +const ( + CREATE_DEFAULT_ERROR_MODE = 0x04000000 + ABOVE_NORMAL_PRIORITY_CLASS = 0x00008000 +) var LlamaServerSysProcAttr = &syscall.SysProcAttr{ // Wire up the default error handling logic If for some reason a DLL is @@ -12,5 +15,8 @@ var LlamaServerSysProcAttr = &syscall.SysProcAttr{ // the user can either fix their PATH, or report a bug. Without this // setting, the process exits immediately with a generic exit status but no // way to (easily) figure out what the actual missing DLL was. - CreationFlags: CREATE_DEFAULT_ERROR_MODE, + // + // Setting Above Normal priority class ensures when running as a "background service" + // with "programs" given best priority, we aren't starved of cpu cycles + CreationFlags: CREATE_DEFAULT_ERROR_MODE | ABOVE_NORMAL_PRIORITY_CLASS, } From ad935f45ac19a8ba090db32580f3a6469e9858bb Mon Sep 17 00:00:00 2001 From: Mahesh Sathiamoorthy Date: Sun, 22 Sep 2024 07:25:28 +0530 Subject: [PATCH 07/13] examples: use punkt_tab instead of punkt (#6907) This was causing an error since we depend on punkt_tab. --- examples/python-grounded-factuality-rag-check/main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/python-grounded-factuality-rag-check/main.py b/examples/python-grounded-factuality-rag-check/main.py index 1da9eb1f..f4d562d5 100644 --- a/examples/python-grounded-factuality-rag-check/main.py +++ b/examples/python-grounded-factuality-rag-check/main.py @@ -9,7 +9,7 @@ import nltk warnings.filterwarnings( "ignore", category=FutureWarning, module="transformers.tokenization_utils_base" ) -nltk.download("punkt", quiet=True) +nltk.download("punkt_tab", quiet=True) def getArticleText(url): From 98701b58b3300587ae82118491d90a3a85c9b52a Mon Sep 17 00:00:00 2001 From: Deep Lakhani <100652109+deep93333@users.noreply.github.com> Date: Mon, 23 Sep 2024 20:49:46 -0400 Subject: [PATCH 08/13] readme: add LLMChat to community integrations (#6919) --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 466f315a..a2dd76f2 100644 --- a/README.md +++ b/README.md @@ -325,6 +325,7 @@ See the [API documentation](./docs/api.md) for all endpoints. 
- [ConfiChat](https://github.com/1runeberg/confichat) (Lightweight, standalone, multi-platform, and privacy focused LLM chat interface with optional encryption) - [Archyve](https://github.com/nickthecook/archyve) (RAG-enabling document library) - [crewAI with Mesop](https://github.com/rapidarchitect/ollama-crew-mesop) (Mesop Web Interface to run crewAI with Ollama) +- [LLMChat](https://github.com/trendy-design/llmchat) (Privacy focused, 100% local, intuitive all-in-one chat interface) ### Terminal From 35bb6d32b332e61966dd5ff69faa825a875d6ae3 Mon Sep 17 00:00:00 2001 From: Alex Yang Date: Tue, 24 Sep 2024 12:15:43 -0700 Subject: [PATCH 09/13] readme: update llamaindex links (#6939) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a2dd76f2..65d93db5 100644 --- a/README.md +++ b/README.md @@ -378,7 +378,7 @@ See the [API documentation](./docs/api.md) for all endpoints. - [LangChainGo](https://github.com/tmc/langchaingo/) with [example](https://github.com/tmc/langchaingo/tree/main/examples/ollama-completion-example) - [LangChain4j](https://github.com/langchain4j/langchain4j) with [example](https://github.com/langchain4j/langchain4j-examples/tree/main/ollama-examples/src/main/java) - [LangChainRust](https://github.com/Abraxas-365/langchain-rust) with [example](https://github.com/Abraxas-365/langchain-rust/blob/main/examples/llm_ollama.rs) -- [LlamaIndex](https://gpt-index.readthedocs.io/en/stable/examples/llm/ollama.html) +- [LlamaIndex](https://docs.llamaindex.ai/en/stable/examples/llm/ollama/) and [LlamaIndexTS](https://ts.llamaindex.ai/modules/llms/available_llms/ollama) - [LiteLLM](https://github.com/BerriAI/litellm) - [OllamaFarm for Go](https://github.com/presbrey/ollamafarm) - [OllamaSharp for .NET](https://github.com/awaescher/OllamaSharp) From e9e9bdb8d904f009e8b1e54af9f77624d481cfb2 Mon Sep 17 00:00:00 2001 From: Daniel Hiltgen Date: Tue, 24 Sep 2024 15:18:10 -0700 Subject: [PATCH 10/13] CI: Fix win arm version defect (#6940) write-host in powershell writes directly to the console and will not be picked up by a pipe. Echo, or write-output will. 
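A minimal sketch of the difference, assuming a throwaway output file under `$env:TEMP`:

```powershell
# Write-Host targets the host console (information stream), so the pipeline receives
# nothing and the redirected file is created empty.
Write-Host "VERSION=0.0.0" | Out-File -FilePath "$env:TEMP\version.txt"

# Write-Output (aliased as echo) emits to the success stream, so the value survives the
# pipe and lands in the file.
Write-Output "VERSION=0.0.0" | Out-File -FilePath "$env:TEMP\version.txt"
```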
--- .github/workflows/release.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/release.yaml b/.github/workflows/release.yaml index 4090f206..ac4c19b0 100644 --- a/.github/workflows/release.yaml +++ b/.github/workflows/release.yaml @@ -354,7 +354,7 @@ jobs: - name: Set Version run: | $ver=${env:GITHUB_REF_NAME}.trim("v") - write-host VERSION=$ver | Out-File -FilePath ${env:GITHUB_ENV} -Encoding utf8 -Append + echo VERSION=$ver | Out-File -FilePath ${env:GITHUB_ENV} -Encoding utf8 -Append - uses: 'google-github-actions/auth@v2' with: project_id: 'ollama' From 55ea963c9e9033d01c7c20a54c5ede5babb6878e Mon Sep 17 00:00:00 2001 From: Jeffrey Morgan Date: Wed, 25 Sep 2024 11:11:22 -0700 Subject: [PATCH 11/13] update default model to llama3.2 (#6959) --- README.md | 30 +++++---- app/ollama.iss | 2 +- app/ollama_welcome.ps1 | 2 +- docs/api.md | 64 +++++++++---------- docs/docker.md | 2 +- docs/faq.md | 10 +-- docs/modelfile.md | 8 +-- docs/openai.md | 22 +++---- docs/template.md | 2 +- docs/tutorials/langchainjs.md | 4 +- docs/windows.md | 2 +- examples/go-chat/main.go | 2 +- .../langchain-python-rag-document/README.md | 4 +- .../langchain-python-rag-document/main.py | 2 +- .../langchain-python-rag-websummary/README.md | 4 +- .../langchain-python-rag-websummary/main.py | 2 +- examples/langchain-python-simple/README.md | 4 +- examples/langchain-python-simple/main.py | 2 +- examples/modelfile-mario/Modelfile | 2 +- examples/modelfile-mario/readme.md | 6 +- .../README.md | 6 +- .../main.py | 2 +- .../predefinedschema.py | 2 +- .../randomaddresses.py | 2 +- examples/python-json-datagenerator/readme.md | 4 +- examples/python-simplechat/client.py | 2 +- examples/python-simplechat/readme.md | 4 +- examples/typescript-simplechat/client.ts | 2 +- macapp/src/app.tsx | 2 +- 29 files changed, 102 insertions(+), 100 deletions(-) diff --git a/README.md b/README.md index 65d93db5..d422b8cb 100644 --- a/README.md +++ b/README.md @@ -35,10 +35,10 @@ The official [Ollama Docker image](https://hub.docker.com/r/ollama/ollama) `olla ## Quickstart -To run and chat with [Llama 3.1](https://ollama.com/library/llama3.1): +To run and chat with [Llama 3.2](https://ollama.com/library/llama3.2): ``` -ollama run llama3.1 +ollama run llama3.2 ``` ## Model library @@ -49,6 +49,8 @@ Here are some example models that can be downloaded: | Model | Parameters | Size | Download | | ------------------ | ---------- | ----- | ------------------------------ | +| Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` | +| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.1:1b` | | Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` | | Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` | | Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` | @@ -99,16 +101,16 @@ See the [guide](docs/import.md) on importing models for more information. ### Customize a prompt -Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.1` model: +Models from the Ollama library can be customized with a prompt. For example, to customize the `llama3.2` model: ``` -ollama pull llama3.1 +ollama pull llama3.2 ``` Create a `Modelfile`: ``` -FROM llama3.1 +FROM llama3.2 # set the temperature to 1 [higher is more creative, lower is more coherent] PARAMETER temperature 1 @@ -143,7 +145,7 @@ ollama create mymodel -f ./Modelfile ### Pull a model ``` -ollama pull llama3.1 +ollama pull llama3.2 ``` > This command can also be used to update a local model. Only the diff will be pulled. 
@@ -151,13 +153,13 @@ ollama pull llama3.1 ### Remove a model ``` -ollama rm llama3.1 +ollama rm llama3.2 ``` ### Copy a model ``` -ollama cp llama3.1 my-model +ollama cp llama3.2 my-model ``` ### Multiline input @@ -181,14 +183,14 @@ The image features a yellow smiley face, which is likely the central focus of th ### Pass the prompt as an argument ``` -$ ollama run llama3.1 "Summarize this file: $(cat README.md)" +$ ollama run llama3.2 "Summarize this file: $(cat README.md)" Ollama is a lightweight, extensible framework for building and running language models on the local machine. It provides a simple API for creating, running, and managing models, as well as a library of pre-built models that can be easily used in a variety of applications. ``` ### Show model information ``` -ollama show llama3.1 +ollama show llama3.2 ``` ### List models on your computer @@ -206,7 +208,7 @@ ollama ps ### Stop a model which is currently running ``` -ollama stop llama3.1 +ollama stop llama3.2 ``` ### Start Ollama @@ -228,7 +230,7 @@ Next, start the server: Finally, in a separate shell, run a model: ``` -./ollama run llama3.1 +./ollama run llama3.2 ``` ## REST API @@ -239,7 +241,7 @@ Ollama has a REST API for running and managing models. ``` curl http://localhost:11434/api/generate -d '{ - "model": "llama3.1", + "model": "llama3.2", "prompt":"Why is the sky blue?" }' ``` @@ -248,7 +250,7 @@ curl http://localhost:11434/api/generate -d '{ ``` curl http://localhost:11434/api/chat -d '{ - "model": "llama3.1", + "model": "llama3.2", "messages": [ { "role": "user", "content": "why is the sky blue?" } ] diff --git a/app/ollama.iss b/app/ollama.iss index 63b5bdb0..4038815a 100644 --- a/app/ollama.iss +++ b/app/ollama.iss @@ -142,7 +142,7 @@ SetupAppRunningError=Another Ollama installer is running.%n%nPlease cancel or fi ;FinishedHeadingLabel=Run your first model -;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3.1 +;FinishedLabel=%nRun this command in a PowerShell or cmd terminal.%n%n%n ollama run llama3.2 ;ClickFinish=%n [Registry] diff --git a/app/ollama_welcome.ps1 b/app/ollama_welcome.ps1 index 46777a3a..e9695748 100644 --- a/app/ollama_welcome.ps1 +++ b/app/ollama_welcome.ps1 @@ -4,5 +4,5 @@ write-host "Welcome to Ollama!" write-host "" write-host "Run your first model:" write-host "" -write-host "`tollama run llama3.1" +write-host "`tollama run llama3.2" write-host "" \ No newline at end of file diff --git a/docs/api.md b/docs/api.md index 95e79e00..fe2eb82c 100644 --- a/docs/api.md +++ b/docs/api.md @@ -69,7 +69,7 @@ Enable JSON mode by setting the `format` parameter to `json`. This will structur ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3.1", + "model": "llama3.2", "prompt": "Why is the sky blue?" }' ``` @@ -80,7 +80,7 @@ A stream of JSON objects is returned: ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-08-04T08:52:19.385406455-07:00", "response": "The", "done": false @@ -102,7 +102,7 @@ To calculate how fast the response is generated in tokens per second (token/s), ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-08-04T19:22:45.499127Z", "response": "", "done": true, @@ -124,7 +124,7 @@ A response can be received in one reply when streaming is off. 
```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3.1", + "model": "llama3.2", "prompt": "Why is the sky blue?", "stream": false }' @@ -136,7 +136,7 @@ If `stream` is set to `false`, the response will be a single JSON object: ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-08-04T19:22:45.499127Z", "response": "The sky is blue because it is the color of the sky.", "done": true, @@ -194,7 +194,7 @@ curl http://localhost:11434/api/generate -d '{ ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3.1", + "model": "llama3.2", "prompt": "What color is the sky at different times of the day? Respond using JSON", "format": "json", "stream": false @@ -205,7 +205,7 @@ curl http://localhost:11434/api/generate -d '{ ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-11-09T21:07:55.186497Z", "response": "{\n\"morning\": {\n\"color\": \"blue\"\n},\n\"noon\": {\n\"color\": \"blue-gray\"\n},\n\"afternoon\": {\n\"color\": \"warm gray\"\n},\n\"evening\": {\n\"color\": \"orange\"\n}\n}\n", "done": true, @@ -327,7 +327,7 @@ If you want to set custom options for the model at runtime rather than in the Mo ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3.1", + "model": "llama3.2", "prompt": "Why is the sky blue?", "stream": false, "options": { @@ -368,7 +368,7 @@ curl http://localhost:11434/api/generate -d '{ ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-08-04T19:22:45.499127Z", "response": "The sky is blue because it is the color of the sky.", "done": true, @@ -390,7 +390,7 @@ If an empty prompt is provided, the model will be loaded into memory. ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3.1" + "model": "llama3.2" }' ``` @@ -400,7 +400,7 @@ A single JSON object is returned: ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-12-18T19:52:07.071755Z", "response": "", "done": true @@ -415,7 +415,7 @@ If an empty prompt is provided and the `keep_alive` parameter is set to `0`, a m ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3.1", + "model": "llama3.2", "keep_alive": 0 }' ``` @@ -426,7 +426,7 @@ A single JSON object is returned: ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2024-09-12T03:54:03.516566Z", "response": "", "done": true, @@ -472,7 +472,7 @@ Send a chat message with a streaming response. ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3.1", + "model": "llama3.2", "messages": [ { "role": "user", @@ -488,7 +488,7 @@ A stream of JSON objects is returned: ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-08-04T08:52:19.385406455-07:00", "message": { "role": "assistant", @@ -503,7 +503,7 @@ Final response: ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-08-04T19:22:45.499127Z", "done": true, "total_duration": 4883583458, @@ -521,7 +521,7 @@ Final response: ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3.1", + "model": "llama3.2", "messages": [ { "role": "user", @@ -536,7 +536,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-12-12T14:13:43.416799Z", "message": { "role": "assistant", @@ -560,7 +560,7 @@ Send a chat message with a conversation history. 
You can use this same approach ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3.1", + "model": "llama3.2", "messages": [ { "role": "user", @@ -584,7 +584,7 @@ A stream of JSON objects is returned: ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-08-04T08:52:19.385406455-07:00", "message": { "role": "assistant", @@ -598,7 +598,7 @@ Final response: ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-08-04T19:22:45.499127Z", "done": true, "total_duration": 8113331500, @@ -656,7 +656,7 @@ curl http://localhost:11434/api/chat -d '{ ```shell curl http://localhost:11434/api/chat -d '{ - "model": "llama3.1", + "model": "llama3.2", "messages": [ { "role": "user", @@ -674,7 +674,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2023-12-12T14:13:43.416799Z", "message": { "role": "assistant", @@ -696,7 +696,7 @@ curl http://localhost:11434/api/chat -d '{ ``` curl http://localhost:11434/api/chat -d '{ - "model": "llama3.1", + "model": "llama3.2", "messages": [ { "role": "user", @@ -735,7 +735,7 @@ curl http://localhost:11434/api/chat -d '{ ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at": "2024-07-22T20:33:28.123648Z", "message": { "role": "assistant", @@ -771,7 +771,7 @@ If the messages array is empty, the model will be loaded into memory. ``` curl http://localhost:11434/api/chat -d '{ - "model": "llama3.1", + "model": "llama3.2", "messages": [] }' ``` @@ -779,7 +779,7 @@ curl http://localhost:11434/api/chat -d '{ ##### Response ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at":"2024-09-12T21:17:29.110811Z", "message": { "role": "assistant", @@ -798,7 +798,7 @@ If the messages array is empty and the `keep_alive` parameter is set to `0`, a m ``` curl http://localhost:11434/api/chat -d '{ - "model": "llama3.1", + "model": "llama3.2", "messages": [], "keep_alive": 0 }' @@ -810,7 +810,7 @@ A single JSON object is returned: ```json { - "model": "llama3.1", + "model": "llama3.2", "created_at":"2024-09-12T21:33:17.547535Z", "message": { "role": "assistant", @@ -989,7 +989,7 @@ Show information about a model including details, modelfile, template, parameter ```shell curl http://localhost:11434/api/show -d '{ - "name": "llama3.1" + "name": "llama3.2" }' ``` @@ -1050,7 +1050,7 @@ Copy a model. Creates a model with another name from an existing model. ```shell curl http://localhost:11434/api/copy -d '{ - "source": "llama3.1", + "source": "llama3.2", "destination": "llama3-backup" }' ``` @@ -1105,7 +1105,7 @@ Download a model from the ollama library. 
Cancelled pulls are resumed from where ```shell curl http://localhost:11434/api/pull -d '{ - "name": "llama3.1" + "name": "llama3.2" }' ``` diff --git a/docs/docker.md b/docs/docker.md index 314666b2..9c758c38 100644 --- a/docs/docker.md +++ b/docs/docker.md @@ -63,7 +63,7 @@ docker run -d --device /dev/kfd --device /dev/dri -v ollama:/root/.ollama -p 114 Now you can run a model: ``` -docker exec -it ollama ollama run llama3.1 +docker exec -it ollama ollama run llama3.2 ``` ### Try different models diff --git a/docs/faq.md b/docs/faq.md index b2b1ca30..0dbbb3ff 100644 --- a/docs/faq.md +++ b/docs/faq.md @@ -32,7 +32,7 @@ When using the API, specify the `num_ctx` parameter: ```shell curl http://localhost:11434/api/generate -d '{ - "model": "llama3.1", + "model": "llama3.2", "prompt": "Why is the sky blue?", "options": { "num_ctx": 4096 @@ -232,7 +232,7 @@ curl http://localhost:11434/api/chat -d '{"model": "mistral"}' To preload a model using the CLI, use the command: ```shell -ollama run llama3.1 "" +ollama run llama3.2 "" ``` ## How do I keep a model loaded in memory or make it unload immediately? @@ -240,7 +240,7 @@ ollama run llama3.1 "" By default models are kept in memory for 5 minutes before being unloaded. This allows for quicker response times if you're making numerous requests to the LLM. If you want to immediately unload a model from memory, use the `ollama stop` command: ```shell -ollama stop llama3.1 +ollama stop llama3.2 ``` If you're using the API, use the `keep_alive` parameter with the `/api/generate` and `/api/chat` endpoints to set the amount of time that a model stays in memory. The `keep_alive` parameter can be set to: @@ -251,12 +251,12 @@ If you're using the API, use the `keep_alive` parameter with the `/api/generate` For example, to preload a model and leave it in memory use: ```shell -curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": -1}' +curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": -1}' ``` To unload the model and free up memory use: ```shell -curl http://localhost:11434/api/generate -d '{"model": "llama3.1", "keep_alive": 0}' +curl http://localhost:11434/api/generate -d '{"model": "llama3.2", "keep_alive": 0}' ``` Alternatively, you can change the amount of time all models are loaded into memory by setting the `OLLAMA_KEEP_ALIVE` environment variable when starting the Ollama server. The `OLLAMA_KEEP_ALIVE` variable uses the same parameter types as the `keep_alive` parameter types mentioned above. Refer to the section explaining [how to configure the Ollama server](#how-do-i-configure-ollama-server) to correctly set the environment variable. diff --git a/docs/modelfile.md b/docs/modelfile.md index a33f180b..aa2849e7 100644 --- a/docs/modelfile.md +++ b/docs/modelfile.md @@ -50,7 +50,7 @@ INSTRUCTION arguments An example of a `Modelfile` creating a mario blueprint: ```modelfile -FROM llama3.1 +FROM llama3.2 # sets the temperature to 1 [higher is more creative, lower is more coherent] PARAMETER temperature 1 # sets the context window size to 4096, this controls how many tokens the LLM can use as context to generate the next token @@ -72,10 +72,10 @@ More examples are available in the [examples directory](../examples). To view the Modelfile of a given model, use the `ollama show --modelfile` command. 
```bash - > ollama show --modelfile llama3.1 + > ollama show --modelfile llama3.2 # Modelfile generated by "ollama show" # To build a new Modelfile based on this one, replace the FROM line with: - # FROM llama3.1:latest + # FROM llama3.2:latest FROM /Users/pdevine/.ollama/models/blobs/sha256-00e1317cbf74d901080d7100f57580ba8dd8de57203072dc6f668324ba545f29 TEMPLATE """{{ if .System }}<|start_header_id|>system<|end_header_id|> @@ -103,7 +103,7 @@ FROM : #### Build from existing model ```modelfile -FROM llama3.1 +FROM llama3.2 ``` A list of available base models: diff --git a/docs/openai.md b/docs/openai.md index c6df0fec..e13842c0 100644 --- a/docs/openai.md +++ b/docs/openai.md @@ -25,7 +25,7 @@ chat_completion = client.chat.completions.create( 'content': 'Say this is a test', } ], - model='llama3.1', + model='llama3.2', ) response = client.chat.completions.create( @@ -46,13 +46,13 @@ response = client.chat.completions.create( ) completion = client.completions.create( - model="llama3.1", + model="llama3.2", prompt="Say this is a test", ) list_completion = client.models.list() -model = client.models.retrieve("llama3.1") +model = client.models.retrieve("llama3.2") embeddings = client.embeddings.create( model="all-minilm", @@ -74,7 +74,7 @@ const openai = new OpenAI({ const chatCompletion = await openai.chat.completions.create({ messages: [{ role: 'user', content: 'Say this is a test' }], - model: 'llama3.1', + model: 'llama3.2', }) const response = await openai.chat.completions.create({ @@ -94,13 +94,13 @@ const response = await openai.chat.completions.create({ }) const completion = await openai.completions.create({ - model: "llama3.1", + model: "llama3.2", prompt: "Say this is a test.", }) const listCompletion = await openai.models.list() -const model = await openai.models.retrieve("llama3.1") +const model = await openai.models.retrieve("llama3.2") const embedding = await openai.embeddings.create({ model: "all-minilm", @@ -114,7 +114,7 @@ const embedding = await openai.embeddings.create({ curl http://localhost:11434/v1/chat/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "llama3.1", + "model": "llama3.2", "messages": [ { "role": "system", @@ -154,13 +154,13 @@ curl http://localhost:11434/v1/chat/completions \ curl http://localhost:11434/v1/completions \ -H "Content-Type: application/json" \ -d '{ - "model": "llama3.1", + "model": "llama3.2", "prompt": "Say this is a test" }' curl http://localhost:11434/v1/models -curl http://localhost:11434/v1/models/llama3.1 +curl http://localhost:11434/v1/models/llama3.2 curl http://localhost:11434/v1/embeddings \ -H "Content-Type: application/json" \ @@ -274,7 +274,7 @@ curl http://localhost:11434/v1/embeddings \ Before using a model, pull it locally `ollama pull`: ```shell -ollama pull llama3.1 +ollama pull llama3.2 ``` ### Default model names @@ -282,7 +282,7 @@ ollama pull llama3.1 For tooling that relies on default OpenAI model names such as `gpt-3.5-turbo`, use `ollama cp` to copy an existing model name to a temporary name: ``` -ollama cp llama3.1 gpt-3.5-turbo +ollama cp llama3.2 gpt-3.5-turbo ``` Afterwards, this new model name can be specified the `model` field: diff --git a/docs/template.md b/docs/template.md index 192d878d..bd367e91 100644 --- a/docs/template.md +++ b/docs/template.md @@ -33,7 +33,7 @@ Omitting a template in these models puts the responsibility of correctly templat To add templates in your model, you'll need to add a `TEMPLATE` command to the Modelfile. Here's an example using Meta's Llama 3. 
```dockerfile -FROM llama3.1 +FROM llama3.2 TEMPLATE """{{- if .System }}<|start_header_id|>system<|end_header_id|> diff --git a/docs/tutorials/langchainjs.md b/docs/tutorials/langchainjs.md index f925869b..86f895ae 100644 --- a/docs/tutorials/langchainjs.md +++ b/docs/tutorials/langchainjs.md @@ -15,7 +15,7 @@ import { Ollama } from "@langchain/community/llms/ollama"; const ollama = new Ollama({ baseUrl: "http://localhost:11434", - model: "llama3.1", + model: "llama3.2", }); const answer = await ollama.invoke(`why is the sky blue?`); @@ -23,7 +23,7 @@ const answer = await ollama.invoke(`why is the sky blue?`); console.log(answer); ``` -That will get us the same thing as if we ran `ollama run llama3.1 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app. +That will get us the same thing as if we ran `ollama run llama3.2 "why is the sky blue"` in the terminal. But we want to load a document from the web to ask a question against. **Cheerio** is a great library for ingesting a webpage, and **LangChain** uses it in their **CheerioWebBaseLoader**. So let's install **Cheerio** and build that part of the app. ```bash npm install cheerio diff --git a/docs/windows.md b/docs/windows.md index 372a35aa..5f196756 100644 --- a/docs/windows.md +++ b/docs/windows.md @@ -29,7 +29,7 @@ Ollama uses unicode characters for progress indication, which may render as unkn Here's a quick example showing API access from `powershell` ```powershell -(Invoke-WebRequest -method POST -Body '{"model":"llama3.1", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json +(Invoke-WebRequest -method POST -Body '{"model":"llama3.2", "prompt":"Why is the sky blue?", "stream": false}' -uri http://localhost:11434/api/generate ).Content | ConvertFrom-json ``` ## Troubleshooting diff --git a/examples/go-chat/main.go b/examples/go-chat/main.go index 7663fb8f..07430305 100644 --- a/examples/go-chat/main.go +++ b/examples/go-chat/main.go @@ -35,7 +35,7 @@ func main() { ctx := context.Background() req := &api.ChatRequest{ - Model: "llama3.1", + Model: "llama3.2", Messages: messages, } diff --git a/examples/langchain-python-rag-document/README.md b/examples/langchain-python-rag-document/README.md index e2f3bc02..d37afc9d 100644 --- a/examples/langchain-python-rag-document/README.md +++ b/examples/langchain-python-rag-document/README.md @@ -4,10 +4,10 @@ This example provides an interface for asking questions to a PDF document. ## Setup -1. Ensure you have the `llama3.1` model installed: +1. Ensure you have the `llama3.2` model installed: ``` -ollama pull llama3.1 +ollama pull llama3.2 ``` 2. Install the Python Requirements. 
diff --git a/examples/langchain-python-rag-document/main.py b/examples/langchain-python-rag-document/main.py index 6f7cec9b..4871a042 100644 --- a/examples/langchain-python-rag-document/main.py +++ b/examples/langchain-python-rag-document/main.py @@ -51,7 +51,7 @@ while True: template=template, ) - llm = Ollama(model="llama3.1", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])) + llm = Ollama(model="llama3.2", callback_manager=CallbackManager([StreamingStdOutCallbackHandler()])) qa_chain = RetrievalQA.from_chain_type( llm, retriever=vectorstore.as_retriever(), diff --git a/examples/langchain-python-rag-websummary/README.md b/examples/langchain-python-rag-websummary/README.md index 29c706a3..746c47ab 100644 --- a/examples/langchain-python-rag-websummary/README.md +++ b/examples/langchain-python-rag-websummary/README.md @@ -4,10 +4,10 @@ This example summarizes the website, [https://ollama.com/blog/run-llama2-uncenso ## Running the Example -1. Ensure you have the `llama3.1` model installed: +1. Ensure you have the `llama3.2` model installed: ```bash - ollama pull llama3.1 + ollama pull llama3.2 ``` 2. Install the Python Requirements. diff --git a/examples/langchain-python-rag-websummary/main.py b/examples/langchain-python-rag-websummary/main.py index 77b09fbb..56f8bd24 100644 --- a/examples/langchain-python-rag-websummary/main.py +++ b/examples/langchain-python-rag-websummary/main.py @@ -5,7 +5,7 @@ from langchain.chains.summarize import load_summarize_chain loader = WebBaseLoader("https://ollama.com/blog/run-llama2-uncensored-locally") docs = loader.load() -llm = Ollama(model="llama3.1") +llm = Ollama(model="llama3.2") chain = load_summarize_chain(llm, chain_type="stuff") result = chain.invoke(docs) diff --git a/examples/langchain-python-simple/README.md b/examples/langchain-python-simple/README.md index 60db2c8c..680ab560 100644 --- a/examples/langchain-python-simple/README.md +++ b/examples/langchain-python-simple/README.md @@ -4,10 +4,10 @@ This example is a basic "hello world" of using LangChain with Ollama. ## Running the Example -1. Ensure you have the `llama3.1` model installed: +1. Ensure you have the `llama3.2` model installed: ```bash - ollama pull llama3.1 + ollama pull llama3.2 ``` 2. Install the Python Requirements. diff --git a/examples/langchain-python-simple/main.py b/examples/langchain-python-simple/main.py index a7ed81d6..8d6989c8 100644 --- a/examples/langchain-python-simple/main.py +++ b/examples/langchain-python-simple/main.py @@ -1,6 +1,6 @@ from langchain.llms import Ollama input = input("What is your question?") -llm = Ollama(model="llama3.1") +llm = Ollama(model="llama3.2") res = llm.predict(input) print (res) diff --git a/examples/modelfile-mario/Modelfile b/examples/modelfile-mario/Modelfile index a3747086..b8e49667 100644 --- a/examples/modelfile-mario/Modelfile +++ b/examples/modelfile-mario/Modelfile @@ -1,4 +1,4 @@ -FROM llama3.1 +FROM llama3.2 PARAMETER temperature 1 SYSTEM """ You are Mario from super mario bros, acting as an assistant. diff --git a/examples/modelfile-mario/readme.md b/examples/modelfile-mario/readme.md index c3f34197..882023ad 100644 --- a/examples/modelfile-mario/readme.md +++ b/examples/modelfile-mario/readme.md @@ -2,12 +2,12 @@ # Example character: Mario -This example shows how to create a basic character using Llama3.1 as the base model. +This example shows how to create a basic character using Llama 3.2 as the base model. To run this example: 1. Download the Modelfile -2. 
`ollama pull llama3.1` to get the base model used in the model file. +2. `ollama pull llama3.2` to get the base model used in the model file. 3. `ollama create NAME -f ./Modelfile` 4. `ollama run NAME` @@ -18,7 +18,7 @@ Ask it some questions like "Who are you?" or "Is Peach in trouble again?" What the model file looks like: ``` -FROM llama3.1 +FROM llama3.2 PARAMETER temperature 1 SYSTEM """ You are Mario from Super Mario Bros, acting as an assistant. diff --git a/examples/python-grounded-factuality-rag-check/README.md b/examples/python-grounded-factuality-rag-check/README.md index 5c981752..cd72071c 100644 --- a/examples/python-grounded-factuality-rag-check/README.md +++ b/examples/python-grounded-factuality-rag-check/README.md @@ -1,14 +1,14 @@ # RAG Hallucination Checker using Bespoke-Minicheck -This example allows the user to ask questions related to a document, which can be specified via an article url. Relevant chunks are retreived from the document and given to `llama3.1` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations. +This example allows the user to ask questions related to a document, which can be specified via an article url. Relevant chunks are retreived from the document and given to `llama3.2` as context to answer the question. Then each sentence in the answer is checked against the retrieved chunks using `bespoke-minicheck` to ensure that the answer does not contain hallucinations. ## Running the Example -1. Ensure `all-minilm` (embedding) `llama3.1` (chat) and `bespoke-minicheck` (check) models installed: +1. Ensure `all-minilm` (embedding) `llama3.2` (chat) and `bespoke-minicheck` (check) models installed: ```bash ollama pull all-minilm - ollama pull llama3.1 + ollama pull llama3.2 ollama pull bespoke-minicheck ``` diff --git a/examples/python-grounded-factuality-rag-check/main.py b/examples/python-grounded-factuality-rag-check/main.py index f4d562d5..eab0b670 100644 --- a/examples/python-grounded-factuality-rag-check/main.py +++ b/examples/python-grounded-factuality-rag-check/main.py @@ -119,7 +119,7 @@ if __name__ == "__main__": system_prompt = f"Only use the following information to answer the question. Do not use anything else: {sourcetext}" ollama_response = ollama.generate( - model="llama3.1", + model="llama3.2", prompt=question, system=system_prompt, options={"stream": False}, diff --git a/examples/python-json-datagenerator/predefinedschema.py b/examples/python-json-datagenerator/predefinedschema.py index 68090ad7..91463760 100644 --- a/examples/python-json-datagenerator/predefinedschema.py +++ b/examples/python-json-datagenerator/predefinedschema.py @@ -2,7 +2,7 @@ import requests import json import random -model = "llama3.1" +model = "llama3.2" template = { "firstName": "", "lastName": "", diff --git a/examples/python-json-datagenerator/randomaddresses.py b/examples/python-json-datagenerator/randomaddresses.py index 878c9803..3df59d32 100644 --- a/examples/python-json-datagenerator/randomaddresses.py +++ b/examples/python-json-datagenerator/randomaddresses.py @@ -12,7 +12,7 @@ countries = [ "France", ] country = random.choice(countries) -model = "llama3.1" +model = "llama3.2" prompt = f"generate one realistically believable sample data set of a persons first name, last name, address in {country}, and phone number. Do not use common names. Respond using JSON. 
Key names should have no backslashes, values should use plain ascii with no special characters." diff --git a/examples/python-json-datagenerator/readme.md b/examples/python-json-datagenerator/readme.md index 5b444dff..a551e1dd 100644 --- a/examples/python-json-datagenerator/readme.md +++ b/examples/python-json-datagenerator/readme.md @@ -6,10 +6,10 @@ There are two python scripts in this example. `randomaddresses.py` generates ran ## Running the Example -1. Ensure you have the `llama3.1` model installed: +1. Ensure you have the `llama3.2` model installed: ```bash - ollama pull llama3.1 + ollama pull llama3.2 ``` 2. Install the Python Requirements. diff --git a/examples/python-simplechat/client.py b/examples/python-simplechat/client.py index 85043d5f..6ef14ffc 100644 --- a/examples/python-simplechat/client.py +++ b/examples/python-simplechat/client.py @@ -2,7 +2,7 @@ import json import requests # NOTE: ollama must be running for this to work, start the ollama app or run `ollama serve` -model = "llama3.1" # TODO: update this for whatever model you wish to use +model = "llama3.2" # TODO: update this for whatever model you wish to use def chat(messages): diff --git a/examples/python-simplechat/readme.md b/examples/python-simplechat/readme.md index 4c2ded4d..a4a2dfc1 100644 --- a/examples/python-simplechat/readme.md +++ b/examples/python-simplechat/readme.md @@ -4,10 +4,10 @@ The **chat** endpoint is one of two ways to generate text from an LLM with Ollam ## Running the Example -1. Ensure you have the `llama3.1` model installed: +1. Ensure you have the `llama3.2` model installed: ```bash - ollama pull llama3.1 + ollama pull llama3.2 ``` 2. Install the Python Requirements. diff --git a/examples/typescript-simplechat/client.ts b/examples/typescript-simplechat/client.ts index 8ad113b1..d8faaa1b 100644 --- a/examples/typescript-simplechat/client.ts +++ b/examples/typescript-simplechat/client.ts @@ -1,6 +1,6 @@ import * as readline from "readline"; -const model = "llama3.1"; +const model = "llama3.2"; type Message = { role: "assistant" | "user" | "system"; content: string; diff --git a/macapp/src/app.tsx b/macapp/src/app.tsx index a627e63d..449fc851 100644 --- a/macapp/src/app.tsx +++ b/macapp/src/app.tsx @@ -19,7 +19,7 @@ export default function () { const [step, setStep] = useState(Step.WELCOME) const [commandCopied, setCommandCopied] = useState(false) - const command = 'ollama run llama3.1' + const command = 'ollama run llama3.2' return (
From 450acb71a6366ca22eada96ab08c0527a6914c5e Mon Sep 17 00:00:00 2001 From: Xe Iaso Date: Wed, 25 Sep 2024 11:53:47 -0700 Subject: [PATCH 12/13] readme: fix llama3.1 -> llama3.2 typo (#6962) --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d422b8cb..8400fd8f 100644 --- a/README.md +++ b/README.md @@ -50,7 +50,7 @@ Here are some example models that can be downloaded: | Model | Parameters | Size | Download | | ------------------ | ---------- | ----- | ------------------------------ | | Llama 3.2 | 3B | 2.0GB | `ollama run llama3.2` | -| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.1:1b` | +| Llama 3.2 | 1B | 1.3GB | `ollama run llama3.2:1b` | | Llama 3.1 | 8B | 4.7GB | `ollama run llama3.1` | | Llama 3.1 | 70B | 40GB | `ollama run llama3.1:70b` | | Llama 3.1 | 405B | 231GB | `ollama run llama3.1:405b` | From 03608cb46ecdccaf8c340c9390626a9d8fcc3c6b Mon Sep 17 00:00:00 2001 From: Blake Mizerany Date: Thu, 26 Sep 2024 12:00:31 -0700 Subject: [PATCH 13/13] server: close response body on error (#6986) This change closes the response body when an error occurs in makeRequestWithRetry. Previously, the first, non-200 response body was not closed before reattempting the request. This change ensures that the response body is closed in all cases where an error occurs, preventing leaks of file descriptors. Fixes #6974 --- server/images.go | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/server/images.go b/server/images.go index b5bf7ad6..c88edc69 100644 --- a/server/images.go +++ b/server/images.go @@ -1025,6 +1025,8 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR switch { case resp.StatusCode == http.StatusUnauthorized: + resp.Body.Close() + // Handle authentication error with one retry challenge := parseRegistryChallenge(resp.Header.Get("www-authenticate")) token, err := getAuthorizationToken(ctx, challenge) @@ -1040,8 +1042,10 @@ func makeRequestWithRetry(ctx context.Context, method string, requestURL *url.UR } } case resp.StatusCode == http.StatusNotFound: + resp.Body.Close() return nil, os.ErrNotExist case resp.StatusCode >= http.StatusBadRequest: + defer resp.Body.Close() responseBody, err := io.ReadAll(resp.Body) if err != nil { return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
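
As a standalone illustration of the pattern that the final commit describes, the sketch below is a minimal, hypothetical `doRequestWithRetry` helper. It is not the actual `makeRequestWithRetry` from `server/images.go` (the real function also refreshes registry auth tokens before retrying), but it shows the same idea: close the body of a 401 response before retrying, close the body and return `os.ErrNotExist` on 404, and read then close the error body for any other status of 400 or above, so no file descriptors leak.

```go
// Illustrative sketch only; not part of the patches above.
// It demonstrates closing a non-2xx response body before retrying
// or returning, so the underlying connection is not leaked.
package main

import (
	"context"
	"errors"
	"fmt"
	"io"
	"net/http"
	"os"
)

// doRequestWithRetry is a hypothetical, simplified stand-in for
// makeRequestWithRetry; the real function also handles registry auth.
func doRequestWithRetry(ctx context.Context, method, url string) (*http.Response, error) {
	for attempt := 0; attempt < 2; attempt++ {
		req, err := http.NewRequestWithContext(ctx, method, url, nil)
		if err != nil {
			return nil, err
		}

		resp, err := http.DefaultClient.Do(req)
		if err != nil {
			return nil, err
		}

		switch {
		case resp.StatusCode == http.StatusUnauthorized:
			// Close the first response before retrying; the retry
			// produces a fresh *http.Response with its own body.
			resp.Body.Close()
			continue
		case resp.StatusCode == http.StatusNotFound:
			resp.Body.Close()
			return nil, os.ErrNotExist
		case resp.StatusCode >= http.StatusBadRequest:
			// Read the error message, then close the body on return.
			defer resp.Body.Close()
			body, err := io.ReadAll(resp.Body)
			if err != nil {
				return nil, fmt.Errorf("%d: %s", resp.StatusCode, err)
			}
			return nil, fmt.Errorf("%d: %s", resp.StatusCode, body)
		default:
			// Success: the caller owns resp.Body and must close it.
			return resp, nil
		}
	}
	return nil, errors.New("request failed after retry")
}

func main() {
	resp, err := doRequestWithRetry(context.Background(), http.MethodGet, "http://localhost:11434/api/version")
	if err != nil {
		fmt.Println("request failed:", err)
		return
	}
	defer resp.Body.Close()
	fmt.Println("status:", resp.Status)
}
```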