Merge branch 'ollama:main' into main

2025-12-23 15:08:27 +00:00 · 2024-05-29 19:33:39 +08:00
parent 2a80d6f743 646371f56d
commit cafde1f8ce
13 changed files with 966 additions and 130 deletions
--- a/README.md
+++ b/README.md
@@ -301,6 +301,8 @@ See the [API documentation](./docs/api.md) for all endpoints.
 - [Ollama RAG Chatbot](https://github.com/datvodinh/rag-chatbot.git) (Local Chat with multiple PDFs using Ollama and RAG)
 - [BrainSoup](https://www.nurgo-software.com/products/brainsoup) (Flexible native client with RAG & multi-agent automation)
 - [macai](https://github.com/Renset/macai) (macOS client for Ollama, ChatGPT, and other compatible API back-ends)
+- [Olpaka](https://github.com/Otacon/olpaka) (User-friendly Flutter Web App for Ollama)
+- [OllamaSpring](https://github.com/CrazyNeil/OllamaSpring) (Ollama Client for macOS)

 ### Terminal

--- a/examples/python-simplechat/client.py
+++ b/examples/python-simplechat/client.py
@@ -9,6 +9,7 @@ def chat(messages):
    r = requests.post(
        "http://0.0.0.0:11434/api/chat",
        json={"model": model, "messages": messages, "stream": True},
+	stream=True
    )
    r.raise_for_status()
    output = ""
--- a/gpu/gpu.go
+++ b/gpu/gpu.go
@@ -16,6 +16,7 @@ import (
 	"os"
 	"path/filepath"
 	"runtime"
+	"strconv"
 	"strings"
 	"sync"
 	"unsafe"
@@ -28,6 +29,7 @@ type handles struct {
 	deviceCount int
 	cudart      *C.cudart_handle_t
 	nvcuda      *C.nvcuda_handle_t
+	oneapi      *C.oneapi_handle_t
 }

 const (
@@ -80,6 +82,15 @@ var NvcudaWindowsGlobs = []string{
 	"c:\\windows\\system*\\nvcuda.dll",
 }

+var OneapiWindowsGlobs = []string{ // glob patterns used on Windows to locate ze_intel_gpu64.dll (see initGPUHandles)
+	"c:\\Windows\\System32\\DriverStore\\FileRepository\\*\\ze_intel_gpu64.dll",
+}
+
+var OneapiLinuxGlobs = []string{ // glob patterns used on Linux to locate libze_intel_gpu.so (see initGPUHandles)
+	"/usr/lib/x86_64-linux-gnu/libze_intel_gpu.so*",
+	"/usr/lib*/libze_intel_gpu.so*",
+}
+
 // Jetson devices have JETSON_JETPACK="x.y.z" factory set to the Jetpack version installed.
 // Included to drive logic for reducing Ollama-allocated overhead on L4T/Jetson devices.
 var CudaTegra string = os.Getenv("JETSON_JETPACK")
@@ -94,6 +105,8 @@ func initGPUHandles() *handles {
 	var cudartMgmtPatterns []string
 	var nvcudaMgmtName string
 	var nvcudaMgmtPatterns []string
+	var oneapiMgmtName string
+	var oneapiMgmtPatterns []string

 	tmpDir, _ := PayloadsDir()
 	switch runtime.GOOS {
@@ -105,6 +118,8 @@ func initGPUHandles() *handles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "nvcuda.dll"
 		nvcudaMgmtPatterns = NvcudaWindowsGlobs
+		oneapiMgmtName = "ze_intel_gpu64.dll"
+		oneapiMgmtPatterns = OneapiWindowsGlobs
 	case "linux":
 		cudartMgmtName = "libcudart.so*"
 		if tmpDir != "" {
@@ -115,6 +130,8 @@ func initGPUHandles() *handles {
 		// Aligned with driver, we can't carry as payloads
 		nvcudaMgmtName = "libcuda.so*"
 		nvcudaMgmtPatterns = NvcudaLinuxGlobs
+		oneapiMgmtName = "libze_intel_gpu.so"
+		oneapiMgmtPatterns = OneapiLinuxGlobs
 	default:
 		return gpuHandles
 	}
@@ -141,6 +158,18 @@ func initGPUHandles() *handles {
 			return gpuHandles
 		}
 	}
+
+	oneapiLibPaths := FindGPULibs(oneapiMgmtName, oneapiMgmtPatterns)
+	if len(oneapiLibPaths) > 0 {
+		deviceCount, oneapi, libPath := LoadOneapiMgmt(oneapiLibPaths)
+		if oneapi != nil {
+			slog.Debug("detected Intel GPUs", "library", libPath, "count", deviceCount)
+			gpuHandles.oneapi = oneapi
+			gpuHandles.deviceCount = deviceCount
+			return gpuHandles
+		}
+	}
+
 	return gpuHandles
 }

@@ -181,39 +210,53 @@ func GetGPUInfo() GpuInfoList {
 		if cpuVariant == "" && runtime.GOARCH == "amd64" {
 			continue
 		}
-		gpuInfo := GpuInfo{
-			Library: "cuda",
-		}
-		var driverMajor int
-		var driverMinor int
-		if gpuHandles.cudart != nil {
-			C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
-		} else {
-			C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
-			driverMajor = int(gpuHandles.nvcuda.driver_major)
-			driverMinor = int(gpuHandles.nvcuda.driver_minor)
-		}
-		if memInfo.err != nil {
-			slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
-			C.free(unsafe.Pointer(memInfo.err))
-			continue
-		}
-		if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
-			slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
-			continue
-		}
-		gpuInfo.TotalMemory = uint64(memInfo.total)
-		gpuInfo.FreeMemory = uint64(memInfo.free)
-		gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
-		gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
-		gpuInfo.MinimumMemory = cudaMinimumMemory
-		gpuInfo.DependencyPath = depPath
-		gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
-		gpuInfo.DriverMajor = int(driverMajor)
-		gpuInfo.DriverMinor = int(driverMinor)
+		if gpuHandles.cudart != nil || gpuHandles.nvcuda != nil {
+			gpuInfo := GpuInfo{
+				Library: "cuda",
+			}
+			var driverMajor int
+			var driverMinor int
+			if gpuHandles.cudart != nil {
+				C.cudart_check_vram(*gpuHandles.cudart, C.int(i), &memInfo)
+			} else {
+				C.nvcuda_check_vram(*gpuHandles.nvcuda, C.int(i), &memInfo)
+				driverMajor = int(gpuHandles.nvcuda.driver_major)
+				driverMinor = int(gpuHandles.nvcuda.driver_minor)
+			}
+			if memInfo.err != nil {
+				slog.Info("error looking up nvidia GPU memory", "error", C.GoString(memInfo.err))
+				C.free(unsafe.Pointer(memInfo.err))
+				continue
+			}
+			if memInfo.major < CudaComputeMin[0] || (memInfo.major == CudaComputeMin[0] && memInfo.minor < CudaComputeMin[1]) {
+				slog.Info(fmt.Sprintf("[%d] CUDA GPU is too old. Compute Capability detected: %d.%d", i, memInfo.major, memInfo.minor))
+				continue
+			}
+			gpuInfo.TotalMemory = uint64(memInfo.total)
+			gpuInfo.FreeMemory = uint64(memInfo.free)
+			gpuInfo.ID = C.GoString(&memInfo.gpu_id[0])
+			gpuInfo.Compute = fmt.Sprintf("%d.%d", memInfo.major, memInfo.minor)
+			gpuInfo.MinimumMemory = cudaMinimumMemory
+			gpuInfo.DependencyPath = depPath
+			gpuInfo.Name = C.GoString(&memInfo.gpu_name[0])
+			gpuInfo.DriverMajor = int(driverMajor)
+			gpuInfo.DriverMinor = int(driverMinor)

-		// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
-		resp = append(resp, gpuInfo)
+			// TODO potentially sort on our own algorithm instead of what the underlying GPU library does...
+			resp = append(resp, gpuInfo)
+		}
+		if gpuHandles.oneapi != nil {
+			gpuInfo := GpuInfo{
+				Library: "oneapi",
+			}
+			C.oneapi_check_vram(*gpuHandles.oneapi, &memInfo)
+			var totalFreeMem float64 = float64(memInfo.free) * 0.95 // work-around: leave some reserve vram for mkl lib used in ggml-sycl backend.
+			memInfo.free = C.uint64_t(totalFreeMem)
+			gpuInfo.TotalMemory = uint64(memInfo.total)
+			gpuInfo.FreeMemory = uint64(memInfo.free)
+			gpuInfo.ID = strconv.Itoa(i)
+			resp = append(resp, gpuInfo)
+		}
 	}

 	// Then AMD
@@ -348,6 +391,23 @@ func LoadNVCUDAMgmt(nvcudaLibPaths []string) (int, *C.nvcuda_handle_t, string) {
 	return 0, nil, ""
 }

+// LoadOneapiMgmt returns resp.num_devices, the handle and the path of the first library in oneapiLibPaths that oneapi_init loads, or (0, nil, "").
+func LoadOneapiMgmt(oneapiLibPaths []string) (int, *C.oneapi_handle_t, string) {
+	var resp C.oneapi_init_resp_t
+	resp.oh.verbose = getVerboseState()
+	for _, libPath := range oneapiLibPaths {
+		lib := C.CString(libPath)
+		C.oneapi_init(lib, &resp)
+		C.free(unsafe.Pointer(lib)) // release per iteration; a defer in this loop would hold every copy until return
+		if resp.err == nil {
+			return int(resp.num_devices), &resp.oh, libPath
+		}
+		slog.Debug("Unable to load oneAPI management library", "library", libPath, "error", C.GoString(resp.err))
+		C.free(unsafe.Pointer(resp.err))
+	}
+	return 0, nil, ""
+}
+
 func getVerboseState() C.uint16_t {
 	if envconfig.Debug {
 		return C.uint16_t(1)
@@ -368,6 +428,8 @@ func (l GpuInfoList) GetVisibleDevicesEnv() (string, string) {
 		return cudaGetVisibleDevicesEnv(l)
 	case "rocm":
 		return rocmGetVisibleDevicesEnv(l)
+	case "oneapi":
+		return oneapiGetVisibleDevicesEnv(l)
 	default:
 		slog.Debug("no filter required for library " + l[0].Library)
 		return "", ""
--- a/gpu/gpu_info.h
+++ b/gpu/gpu_info.h
@@ -62,6 +62,7 @@ void cpu_check_ram(mem_info_t *resp);

 #include "gpu_info_cudart.h"
 #include "gpu_info_nvcuda.h"
+#include "gpu_info_oneapi.h"

 #endif  // __GPU_INFO_H__
 #endif  // __APPLE__
--- a/gpu/gpu_info_oneapi.c
+++ b/gpu/gpu_info_oneapi.c
@@ -0,0 +1,214 @@
+#ifndef __APPLE__
+
+#include "gpu_info_oneapi.h"
+
+#include <string.h>
+
+// Load the Level-Zero library at oneapi_lib_path, resolve the zes* entry points
+// into resp->oh and call zesInit. On failure resp->err is a strdup'd message.
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp)
+{
+  ze_result_t ret;
+  resp->err = NULL;
+  resp->num_devices = 0;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i;
+  struct lookup
+  {
+    char *s;
+    void **p;
+  } l[] = {
+      {"zesInit", (void *)&resp->oh.zesInit},
+      {"zesDriverGet", (void *)&resp->oh.zesDriverGet},
+      {"zesDeviceGet", (void *)&resp->oh.zesDeviceGet},
+      {"zesDeviceGetProperties", (void *)&resp->oh.zesDeviceGetProperties},
+      {"zesDeviceEnumMemoryModules",
+       (void *)&resp->oh.zesDeviceEnumMemoryModules},
+      {"zesMemoryGetProperties", (void *)&resp->oh.zesMemoryGetProperties},
+      {"zesMemoryGetState", (void *)&resp->oh.zesMemoryGetState},
+      {NULL, NULL},
+  };
+
+  resp->oh.handle = LOAD_LIBRARY(oneapi_lib_path, RTLD_LAZY);
+  if (!resp->oh.handle)
+  {
+    char *msg = LOAD_ERR();
+    snprintf(buf, buflen,
+             "Unable to load %s library to query for Intel GPUs: %s\n",
+             oneapi_lib_path, msg);
+    free(msg);
+    resp->err = strdup(buf);
+    return;
+  }
+
+  // TODO once we've squashed the remaining corner cases remove this log
+  LOG(resp->oh.verbose,
+      "wiring Level-Zero management library functions in %s\n",
+      oneapi_lib_path);
+  for (i = 0; l[i].s != NULL; i++)
+  {
+    // TODO once we've squashed the remaining corner cases remove this log
+    LOG(resp->oh.verbose, "dlsym: %s\n", l[i].s);
+    *l[i].p = LOAD_SYMBOL(resp->oh.handle, l[i].s);
+    if (!*l[i].p) // test the resolved symbol, not the address of its slot
+    {
+      char *msg = LOAD_ERR();
+      LOG(resp->oh.verbose, "dlerr: %s\n", msg);
+      UNLOAD_LIBRARY(resp->oh.handle); // unload before the handle is cleared
+      resp->oh.handle = NULL;
+      snprintf(buf, buflen, "symbol lookup for %s failed: %s", l[i].s, msg);
+      free(msg);
+      resp->err = strdup(buf);
+      return;
+    }
+  }
+
+  ret = (*resp->oh.zesInit)(0);
+  if (ret != ZE_RESULT_SUCCESS)
+  {
+    LOG(resp->oh.verbose, "zesInit err: %d\n", ret);
+    UNLOAD_LIBRARY(resp->oh.handle);
+    resp->oh.handle = NULL;
+    snprintf(buf, buflen, "oneapi vram init failure: %d", ret);
+    resp->err = strdup(buf);
+    return; // the library is unloaded; its function pointers must not be called
+  }
+
+  (*resp->oh.zesDriverGet)((uint32_t *)&resp->num_devices, NULL); // NOTE(review): this is the driver count
+}
+
+// Sum the size and free bytes of every memory module of every device exposed by
+// every Level-Zero driver into resp->total / resp->free; on failure set resp->err.
+void oneapi_check_vram(oneapi_handle_t h, mem_info_t *resp)
+{
+  ze_result_t ret;
+  resp->err = NULL;
+  resp->total = 0; // zero up front so an early error return never leaves stale totals
+  resp->free = 0;
+  const int buflen = 256;
+  char buf[buflen + 1];
+  int i, d, m;
+
+  if (h.handle == NULL)
+  {
+    resp->err = strdup("Level-Zero handle not initialized");
+    return;
+  }
+
+  uint32_t driversCount = 0;
+  ret = (*h.zesDriverGet)(&driversCount, NULL);
+  if (ret != ZE_RESULT_SUCCESS)
+  {
+    snprintf(buf, buflen, "unable to get driver count: %d", ret);
+    resp->err = strdup(buf);
+    return;
+  }
+  LOG(h.verbose, "discovered %d Level-Zero drivers\n", driversCount);
+
+  // NOTE(review): malloc and the second (fill) call are not checked here or below
+  zes_driver_handle_t *allDrivers =
+      malloc(driversCount * sizeof(zes_driver_handle_t));
+  (*h.zesDriverGet)(&driversCount, allDrivers);
+
+  for (d = 0; d < driversCount; d++)
+  {
+    uint32_t deviceCount = 0;
+    ret = (*h.zesDeviceGet)(allDrivers[d], &deviceCount, NULL);
+    if (ret != ZE_RESULT_SUCCESS)
+    {
+      snprintf(buf, buflen, "unable to get device count: %d", ret);
+      resp->err = strdup(buf);
+      free(allDrivers);
+      return;
+    }
+
+    LOG(h.verbose, "discovered %d Level-Zero devices\n", deviceCount);
+
+    zes_device_handle_t *devices =
+        malloc(deviceCount * sizeof(zes_device_handle_t));
+    (*h.zesDeviceGet)(allDrivers[d], &deviceCount, devices);
+
+    for (i = 0; i < deviceCount; i++)
+    {
+      zes_device_ext_properties_t ext_props;
+      ext_props.stype = ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES;
+      ext_props.pNext = NULL;
+
+      zes_device_properties_t props;
+      props.stype = ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES;
+      props.pNext = &ext_props;
+
+      ret = (*h.zesDeviceGetProperties)(devices[i], &props);
+      if (ret != ZE_RESULT_SUCCESS)
+      {
+        snprintf(buf, buflen, "unable to get device properties: %d", ret);
+        resp->err = strdup(buf);
+        free(allDrivers);
+        free(devices);
+        return;
+      }
+
+      if (h.verbose)
+      {
+        // When in verbose mode, report more information about
+        // the card we discover.
+        LOG(h.verbose, "[%d] oneAPI device name: %s\n", i,
+            props.modelName);
+        LOG(h.verbose, "[%d] oneAPI brand: %s\n", i,
+            props.brandName);
+        LOG(h.verbose, "[%d] oneAPI vendor: %s\n", i,
+            props.vendorName);
+        LOG(h.verbose, "[%d] oneAPI S/N: %s\n", i,
+            props.serialNumber);
+        LOG(h.verbose, "[%d] oneAPI board number: %s\n", i,
+            props.boardNumber);
+      }
+
+      uint32_t memCount = 0;
+      ret = (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, NULL);
+      if (ret != ZE_RESULT_SUCCESS)
+      {
+        snprintf(buf, buflen,
+                 "unable to enumerate Level-Zero memory modules: %d", ret);
+        resp->err = strdup(buf);
+        free(allDrivers);
+        free(devices);
+        return;
+      }
+
+      LOG(h.verbose, "discovered %d Level-Zero memory modules\n", memCount);
+
+      zes_mem_handle_t *mems = malloc(memCount * sizeof(zes_mem_handle_t));
+      (*h.zesDeviceEnumMemoryModules)(devices[i], &memCount, mems);
+
+      for (m = 0; m < memCount; m++)
+      {
+        zes_mem_state_t state;
+        state.stype = ZES_STRUCTURE_TYPE_MEM_STATE;
+        state.pNext = NULL;
+        ret = (*h.zesMemoryGetState)(mems[m], &state);
+        if (ret != ZE_RESULT_SUCCESS)
+        {
+          snprintf(buf, buflen, "unable to get memory state: %d", ret);
+          resp->err = strdup(buf);
+          free(allDrivers);
+          free(devices);
+          free(mems);
+          return;
+        }
+
+        resp->total += state.size;
+        resp->free += state.free;
+      }
+
+      free(mems);
+    }
+
+    free(devices);
+  }
+
+  free(allDrivers);
+}
+
+#endif // __APPLE__
--- a/gpu/gpu_info_oneapi.h
+++ b/gpu/gpu_info_oneapi.h
@@ -0,0 +1,211 @@
+#ifndef __APPLE__
+#ifndef __GPU_INFO_ONEAPI_H__
+#define __GPU_INFO_ONEAPI_H__
+#include "gpu_info.h"
+
+#define ZE_MAX_DEVICE_NAME 256
+#define ZE_MAX_DEVICE_UUID_SIZE 16
+#define ZES_STRING_PROPERTY_SIZE 64
+#define ZE_BIT(_i) (1 << (_i)) // argument parenthesized so expression arguments expand correctly
+
+// Just enough typedef's to dlopen/dlsym for memory information
+typedef enum ze_result_t
+{
+  ZE_RESULT_SUCCESS = 0,
+  // Other values omitted for now...
+} ze_result_t;
+
+typedef uint8_t ze_bool_t;
+typedef struct _zes_driver_handle_t *zes_driver_handle_t;
+typedef struct _zes_device_handle_t *zes_device_handle_t;
+typedef struct _zes_mem_handle_t *zes_mem_handle_t;
+
+typedef enum _ze_structure_type_t
+{
+  ZE_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
+} ze_structure_type_t;
+
+typedef enum _zes_structure_type_t
+{
+  ZES_STRUCTURE_TYPE_DEVICE_PROPERTIES = 0x1,
+  ZES_STRUCTURE_TYPE_MEM_PROPERTIES = 0xb,
+  ZES_STRUCTURE_TYPE_MEM_STATE = 0x1e,
+  ZES_STRUCTURE_TYPE_DEVICE_EXT_PROPERTIES = 0x2d,
+  ZES_STRUCTURE_TYPE_FORCE_UINT32 = 0x7fffffff
+} zes_structure_type_t;
+
+typedef enum _zes_mem_type_t
+{
+  ZES_MEM_TYPE_FORCE_UINT32 = 0x7fffffff
+} zes_mem_type_t;
+
+typedef enum _zes_mem_loc_t
+{
+  ZES_MEM_LOC_SYSTEM = 0,
+  ZES_MEM_LOC_DEVICE = 1,
+  ZES_MEM_LOC_FORCE_UINT32 = 0x7fffffff
+} zes_mem_loc_t;
+
+typedef enum _zes_mem_health_t
+{
+  ZES_MEM_HEALTH_FORCE_UINT32 = 0x7fffffff
+} zes_mem_health_t;
+
+typedef struct _ze_device_uuid_t
+{
+  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
+} ze_device_uuid_t;
+
+typedef struct _zes_uuid_t
+{
+  uint8_t id[ZE_MAX_DEVICE_UUID_SIZE];
+} zes_uuid_t;
+
+typedef enum _ze_device_type_t
+{
+  ZE_DEVICE_TYPE_GPU = 1,
+  ZE_DEVICE_TYPE_CPU = 2,
+  ZE_DEVICE_TYPE_FPGA = 3,
+  ZE_DEVICE_TYPE_MCA = 4,
+  ZE_DEVICE_TYPE_VPU = 5,
+  ZE_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
+} ze_device_type_t;
+
+typedef enum _zes_device_type_t
+{
+  ZES_DEVICE_TYPE_GPU = 1,
+  ZES_DEVICE_TYPE_CPU = 2,
+  ZES_DEVICE_TYPE_FPGA = 3,
+  ZES_DEVICE_TYPE_MCA = 4,
+  ZES_DEVICE_TYPE_VPU = 5,
+  ZES_DEVICE_TYPE_FORCE_UINT32 = 0x7fffffff
+} zes_device_type_t;
+
+typedef uint32_t ze_device_property_flags_t;
+typedef enum _ze_device_property_flag_t
+{
+  ZE_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
+  ZE_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
+  ZE_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
+  ZE_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
+  ZE_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
+} ze_device_property_flag_t;
+
+typedef uint32_t zes_device_property_flags_t;
+typedef enum _zes_device_property_flag_t
+{
+  ZES_DEVICE_PROPERTY_FLAG_INTEGRATED = ZE_BIT(0),
+  ZES_DEVICE_PROPERTY_FLAG_SUBDEVICE = ZE_BIT(1),
+  ZES_DEVICE_PROPERTY_FLAG_ECC = ZE_BIT(2),
+  ZES_DEVICE_PROPERTY_FLAG_ONDEMANDPAGING = ZE_BIT(3),
+  ZES_DEVICE_PROPERTY_FLAG_FORCE_UINT32 = 0x7fffffff
+} zes_device_property_flag_t;
+
+typedef struct _ze_device_properties_t
+{
+  ze_structure_type_t stype;
+  void *pNext;
+  ze_device_type_t type;
+  uint32_t vendorId;
+  uint32_t deviceId;
+  ze_device_property_flags_t flags;
+  uint32_t subdeviceId;
+  uint32_t coreClockRate;
+  uint64_t maxMemAllocSize;
+  uint32_t maxHardwareContexts;
+  uint32_t maxCommandQueuePriority;
+  uint32_t numThreadsPerEU;
+  uint32_t physicalEUSimdWidth;
+  uint32_t numEUsPerSubslice;
+  uint32_t numSubslicesPerSlice;
+  uint32_t numSlices;
+  uint64_t timerResolution;
+  uint32_t timestampValidBits;
+  uint32_t kernelTimestampValidBits;
+  ze_device_uuid_t uuid;
+  char name[ZE_MAX_DEVICE_NAME];
+} ze_device_properties_t;
+
+typedef struct _zes_device_properties_t
+{
+  zes_structure_type_t stype;
+  void *pNext;
+  ze_device_properties_t core;
+  uint32_t numSubdevices;
+  char serialNumber[ZES_STRING_PROPERTY_SIZE];
+  char boardNumber[ZES_STRING_PROPERTY_SIZE];
+  char brandName[ZES_STRING_PROPERTY_SIZE];
+  char modelName[ZES_STRING_PROPERTY_SIZE];
+  char vendorName[ZES_STRING_PROPERTY_SIZE];
+  char driverVersion[ZES_STRING_PROPERTY_SIZE];
+} zes_device_properties_t;
+
+typedef struct _zes_device_ext_properties_t
+{
+  zes_structure_type_t stype;
+  void *pNext;
+  zes_uuid_t uuid;
+  zes_device_type_t type;
+  zes_device_property_flags_t flags;
+} zes_device_ext_properties_t;
+
+typedef struct _zes_mem_properties_t
+{
+  zes_structure_type_t stype;
+  void *pNext;
+  zes_mem_type_t type;
+  ze_bool_t onSubdevice;
+  uint32_t subdeviceId;
+  zes_mem_loc_t location;
+  uint64_t physicalSize;
+  int32_t busWidth;
+  int32_t numChannels;
+} zes_mem_properties_t;
+
+typedef struct _zes_mem_state_t
+{
+  zes_structure_type_t stype;
+  const void *pNext;
+  zes_mem_health_t health;
+  uint64_t free;
+  uint64_t size;
+} zes_mem_state_t;
+
+typedef struct oneapi_handle
+{
+  void *handle;
+  uint16_t verbose;
+  ze_result_t (*zesInit)(int);
+  ze_result_t (*zesDriverGet)(uint32_t *pCount, zes_driver_handle_t *phDrivers);
+  ze_result_t (*zesDeviceGet)(zes_driver_handle_t hDriver, uint32_t *pCount,
+                              zes_device_handle_t *phDevices);
+  ze_result_t (*zesDeviceGetProperties)(zes_device_handle_t hDevice,
+                                        zes_device_properties_t *pProperties);
+  ze_result_t (*zesDeviceEnumMemoryModules)(zes_device_handle_t hDevice,
+                                            uint32_t *pCount,
+                                            zes_mem_handle_t *phMemory);
+  ze_result_t (*zesMemoryGetProperties)(zes_mem_handle_t hMemory,
+                                        zes_mem_properties_t *pProperties);
+  ze_result_t (*zesMemoryGetState)(zes_mem_handle_t hMemory,
+                                   zes_mem_state_t *pState);
+
+} oneapi_handle_t;
+
+typedef struct oneapi_init_resp
+{
+  char *err; // If err is non-null handle is invalid
+  int num_devices;
+  oneapi_handle_t oh;
+} oneapi_init_resp_t;
+
+typedef struct oneapi_version_resp
+{
+  ze_result_t status;
+  char *str; // Contains version or error string if status != 0
+} oneapi_version_resp_t;
+
+void oneapi_init(char *oneapi_lib_path, oneapi_init_resp_t *resp);
+void oneapi_check_vram(oneapi_handle_t rh, mem_info_t *resp);
+
+#endif // __GPU_INFO_ONEAPI_H__
+#endif // __APPLE__
--- a/gpu/gpu_oneapi.go
+++ b/gpu/gpu_oneapi.go
@@ -0,0 +1,21 @@
+//go:build linux || windows
+
+package gpu
+
+import (
+	"log/slog"
+	"strings"
+)
+
+// oneapiGetVisibleDevicesEnv returns ONEAPI_DEVICE_SELECTOR and a "level_zero:" list of the IDs of all oneapi entries in gpuInfo.
+func oneapiGetVisibleDevicesEnv(gpuInfo []GpuInfo) (string, string) {
+	selected := make([]string, 0, len(gpuInfo))
+	for i := range gpuInfo {
+		if lib := gpuInfo[i].Library; lib != "oneapi" {
+			slog.Debug("oneapiGetVisibleDevicesEnv skipping over non-sycl device", "library", lib)
+			continue
+		}
+		selected = append(selected, gpuInfo[i].ID)
+	}
+	return "ONEAPI_DEVICE_SELECTOR", "level_zero:" + strings.Join(selected, ",")
+}
--- a/llm/generate/gen_linux.sh
+++ b/llm/generate/gen_linux.sh
@@ -215,6 +215,36 @@ if [ -z "${OLLAMA_SKIP_CUDA_GENERATE}" -a -d "${CUDA_LIB_DIR}" ]; then

 fi

+if [ -z "${ONEAPI_ROOT}" ]; then
+    # Fall back to the default install location when ONEAPI_ROOT is not provided
+    ONEAPI_ROOT=/opt/intel/oneapi
+fi
+
+if [ -d "${ONEAPI_ROOT}" ]; then
+    echo "OneAPI libraries detected - building dynamic OneAPI library"
+    init_vars
+    source "${ONEAPI_ROOT}/setvars.sh" --force # set up environment variables for oneAPI (quoted: the root may contain spaces)
+    CC=icx
+    CMAKE_DEFS="${COMMON_CMAKE_DEFS} ${CMAKE_DEFS} -DCMAKE_C_COMPILER=icx -DCMAKE_CXX_COMPILER=icpx -DLLAMA_SYCL=ON -DLLAMA_SYCL_F16=OFF"
+    BUILD_DIR="../build/linux/${ARCH}/oneapi"
+    EXTRA_LIBS="-fsycl -Wl,-rpath,${ONEAPI_ROOT}/compiler/latest/lib,-rpath,${ONEAPI_ROOT}/mkl/latest/lib,-rpath,${ONEAPI_ROOT}/tbb/latest/lib,-rpath,${ONEAPI_ROOT}/compiler/latest/opt/oclfpga/linux64/lib -lOpenCL -lmkl_core -lmkl_sycl_blas -lmkl_intel_ilp64 -lmkl_tbb_thread -ltbb"
+    DEBUG_FLAGS="" # icx compiles with -O0 if we pass -g, so we must remove it
+    build
+
+    # copy the sycl/mkl/tbb libraries the server links against, then the fixed runtime libraries below
+    for dep in $(ldd "${BUILD_DIR}/bin/ollama_llama_server" | grep "=>" | cut -f2 -d= | cut -f2 -d' ' | grep -e sycl -e mkl -e tbb); do
+        cp "${dep}" "${BUILD_DIR}/bin/"
+    done
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libOpenCL.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libimf.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libintlc.so.5" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libirng.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libpi_level_zero.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libsvml.so" "${BUILD_DIR}/bin/"
+    cp "${ONEAPI_ROOT}/compiler/latest/lib/libur_loader.so.0" "${BUILD_DIR}/bin/"
+    compress
+fi
+
 if [ -z "${ROCM_PATH}" ]; then
    # Try the default location in case it exists
    ROCM_PATH=/opt/rocm
--- a/llm/generate/gen_windows.ps1
+++ b/llm/generate/gen_windows.ps1
@@ -300,6 +300,49 @@ function build_cuda() {
    }
 }

+function build_oneapi() { # Builds, signs and installs the SYCL (oneAPI) runner when ONEAPI_ROOT is set
+  if ((-not "${env:OLLAMA_SKIP_CUDA_GENERATE}") -and ("${env:ONEAPI_ROOT}"))  {
+    # Derive a "_v<version>" suffix from the compiler banner; NOTE(review): gated on the CUDA skip flag - confirm intent
+    $script:ONEAPI_VERSION = icpx --version
+    $script:ONEAPI_VERSION = [regex]::Match($script:ONEAPI_VERSION, '(?<=oneAPI DPC\+\+/C\+\+ Compiler )(?<version>\d+\.\d+\.\d+)').Value
+    if ($script:ONEAPI_VERSION) { # .Value is "" (never $null) when the regex does not match
+      $script:ONEAPI_VARIANT = "_v" + $script:ONEAPI_VERSION
+    }
+    init_vars
+    $script:buildDir = "../build/windows/${script:ARCH}/oneapi$script:ONEAPI_VARIANT"
+    $script:distDir ="$script:DIST_BASE\oneapi$script:ONEAPI_VARIANT"
+    $script:cmakeDefs += @(
+      "-G", "MinGW Makefiles",
+      "-DLLAMA_SYCL=ON",
+      "-DCMAKE_C_COMPILER=icx",
+      "-DCMAKE_CXX_COMPILER=icx",
+      "-DCMAKE_BUILD_TYPE=Release"
+    )
+
+    Write-Host "Building oneAPI"
+    build
+    # List the DLLs the built server depends on, when dumpbin is available
+    if ($null -ne $script:DUMPBIN) {
+      & "$script:DUMPBIN" /dependents "${script:buildDir}/bin/ollama_llama_server.exe" | Select-String ".dll"
+    }
+    sign
+    install
+
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libirngmd.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\libmmd.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_level_zero.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_unified_runtime.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\pi_win_proxy_loader.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\svml_dispmd.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\compiler\latest\bin\sycl7.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_core.2.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_sycl_blas.4.dll" "${script:distDir}"
+    cp "${env:ONEAPI_ROOT}\mkl\latest\bin\mkl_tbb_thread.2.dll" "${script:distDir}"
+  } else {
+    Write-Host "Skipping oneAPI generation step"
+  }
+}
+
 function build_rocm() {
    if ((-not "${env:OLLAMA_SKIP_ROCM_GENERATE}") -and ("${env:HIP_PATH}")) {
        $script:ROCM_VERSION=(get-item $env:HIP_PATH).Basename
@@ -367,6 +410,7 @@ if ($($args.count) -eq 0) {
        build_cpu_avx
        build_cpu_avx2
        build_cuda
+        build_oneapi
        build_rocm
    }

--- a/llm/server.go
+++ b/llm/server.go
@@ -24,9 +24,9 @@ import (
 	"golang.org/x/sync/semaphore"

 	"github.com/ollama/ollama/api"
+	"github.com/ollama/ollama/envconfig"
 	"github.com/ollama/ollama/format"
 	"github.com/ollama/ollama/gpu"
-	"github.com/ollama/ollama/envconfig"
 )

 type LlamaServer interface {
@@ -243,7 +243,7 @@ func NewLlamaServer(gpus gpu.GpuInfoList, model string, ggml *GGML, adapters, pr
 			gpuCount = 0
 		}

-		// Find an availableServers  port, retry on each iterration in case the failure was a port conflict race
+		// Find an available port, retry on each iteration in case the failure was a port conflict race
 		port := 0
 		if a, err := net.ResolveTCPAddr("tcp", "localhost:0"); err == nil {
 			var l *net.TCPListener
@@ -519,11 +519,13 @@ func (s *llmServer) Ping(ctx context.Context) error {

 func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 	start := time.Now()
-	stallDuration := 60 * time.Second
-	stallTimer := time.Now().Add(stallDuration) // give up if we stall for
+	stallDuration := 5 * time.Minute            // If no progress happens
+	finalLoadDuration := 5 * time.Minute        // After we hit 100%, give the runner more time to come online
+	stallTimer := time.Now().Add(stallDuration) // give up if we stall

 	slog.Info("waiting for llama runner to start responding")
 	var lastStatus ServerStatus = -1
+	fullyLoaded := false

 	for {
 		select {
@@ -572,6 +574,10 @@ func (s *llmServer) WaitUntilRunning(ctx context.Context) error {
 			if priorProgress != s.loadProgress {
 				slog.Debug(fmt.Sprintf("model load progress %0.2f", s.loadProgress))
 				stallTimer = time.Now().Add(stallDuration)
+			} else if !fullyLoaded && int(s.loadProgress*100.0) >= 100 {
+				slog.Debug("model load completed, waiting for server to become available", "status", status.ToString())
+				stallTimer = time.Now().Add(finalLoadDuration)
+				fullyLoaded = true
 			}
 			time.Sleep(time.Millisecond * 250)
 			continue
@@ -756,7 +762,7 @@ func (s *llmServer) Completion(ctx context.Context, req CompletionRequest, fn fu

 			var c completion
 			if err := json.Unmarshal(evt, &c); err != nil {
-				return fmt.Errorf("error unmarshaling llm prediction response: %v", err)
+				return fmt.Errorf("error unmarshalling llm prediction response: %v", err)
 			}

 			switch {
--- a/readline/buffer.go
+++ b/readline/buffer.go
@@ -5,16 +5,20 @@ import (
 	"os"

 	"github.com/emirpasic/gods/lists/arraylist"
+	"github.com/mattn/go-runewidth"
 	"golang.org/x/term"
 )

 type Buffer struct {
-	Pos       int
-	Buf       *arraylist.List
-	Prompt    *Prompt
-	LineWidth int
-	Width     int
-	Height    int
+	DisplayPos int
+	Pos        int
+	Buf        *arraylist.List
+	// LineHasSpace is an arraylist of bools that tracks whether each line has a space at the end
+	LineHasSpace *arraylist.List
+	Prompt       *Prompt
+	LineWidth    int
+	Width        int
+	Height       int
 }

 func NewBuffer(prompt *Prompt) (*Buffer, error) {
@@ -27,25 +31,57 @@ func NewBuffer(prompt *Prompt) (*Buffer, error) {
 	lwidth := width - len(prompt.prompt())

 	b := &Buffer{
-		Pos:       0,
-		Buf:       arraylist.New(),
-		Prompt:    prompt,
-		Width:     width,
-		Height:    height,
-		LineWidth: lwidth,
+		DisplayPos:   0,
+		Pos:          0,
+		Buf:          arraylist.New(),
+		LineHasSpace: arraylist.New(),
+		Prompt:       prompt,
+		Width:        width,
+		Height:       height,
+		LineWidth:    lwidth,
 	}

 	return b, nil
 }

+// GetLineSpacing reports whether display line `line` is recorded in LineHasSpace
+// as having a space at its end; a missing or non-bool entry yields false.
+func (b *Buffer) GetLineSpacing(line int) bool {
+	v, found := b.LineHasSpace.Get(line)
+	if !found {
+		return false
+	}
+	hasSpace, isBool := v.(bool) // comma-ok form: never panic on an unexpected element
+	return isBool && hasSpace
+}
+
 func (b *Buffer) MoveLeft() {
 	if b.Pos > 0 {
-		if b.Pos%b.LineWidth == 0 {
-			fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width))
-		} else {
-			fmt.Print(CursorLeft)
+		// assert that we retrieve a rune
+		if e, ok := b.Buf.Get(b.Pos - 1); ok {
+			if r, ok := e.(rune); ok {
+				rLength := runewidth.RuneWidth(r)
+
+				if b.DisplayPos%b.LineWidth == 0 {
+					fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width))
+					if rLength == 2 {
+						fmt.Print(CursorLeft)
+					}
+
+					line := b.DisplayPos/b.LineWidth - 1
+					hasSpace := b.GetLineSpacing(line)
+					if hasSpace {
+						b.DisplayPos -= 1
+						fmt.Print(CursorLeft)
+					}
+				} else {
+					fmt.Print(cursorLeftN(rLength))
+				}
+
+				b.Pos -= 1
+				b.DisplayPos -= rLength
+			}
 		}
-		b.Pos -= 1
 	}
 }

@@ -71,18 +107,35 @@ func (b *Buffer) MoveLeftWord() {
 }

 func (b *Buffer) MoveRight() {
-	if b.Pos < b.Size() {
-		b.Pos += 1
-		if b.Pos%b.LineWidth == 0 {
-			fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())))
-		} else {
-			fmt.Print(CursorRight)
+	if b.Pos < b.Buf.Size() {
+		if e, ok := b.Buf.Get(b.Pos); ok {
+			if r, ok := e.(rune); ok {
+				rLength := runewidth.RuneWidth(r)
+				b.Pos += 1
+				hasSpace := b.GetLineSpacing(b.DisplayPos / b.LineWidth)
+				b.DisplayPos += rLength
+
+				if b.DisplayPos%b.LineWidth == 0 {
+					fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())))
+
+				} else if (b.DisplayPos-rLength)%b.LineWidth == b.LineWidth-1 && hasSpace {
+					fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())+rLength))
+					b.DisplayPos += 1
+
+				} else if b.LineHasSpace.Size() > 0 && b.DisplayPos%b.LineWidth == b.LineWidth-1 && hasSpace {
+					fmt.Printf(CursorDown + CursorBOL + cursorRightN(len(b.Prompt.prompt())))
+					b.DisplayPos += 1
+
+				} else {
+					fmt.Print(cursorRightN(rLength))
+				}
+			}
 		}
 	}
 }

 func (b *Buffer) MoveRightWord() {
-	if b.Pos < b.Size() {
+	if b.Pos < b.Buf.Size() {
 		for {
 			b.MoveRight()
 			v, _ := b.Buf.Get(b.Pos)
@@ -90,7 +143,7 @@ func (b *Buffer) MoveRightWord() {
 				break
 			}

-			if b.Pos == b.Size() {
+			if b.Pos == b.Buf.Size() {
 				break
 			}
 		}
@@ -99,7 +152,7 @@ func (b *Buffer) MoveRightWord() {

 func (b *Buffer) MoveToStart() {
 	if b.Pos > 0 {
-		currLine := b.Pos / b.LineWidth
+		currLine := b.DisplayPos / b.LineWidth
 		if currLine > 0 {
 			for cnt := 0; cnt < currLine; cnt++ {
 				fmt.Print(CursorUp)
@@ -107,81 +160,195 @@ func (b *Buffer) MoveToStart() {
 		}
 		fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())))
 		b.Pos = 0
+		b.DisplayPos = 0
 	}
 }

 func (b *Buffer) MoveToEnd() {
-	if b.Pos < b.Size() {
-		currLine := b.Pos / b.LineWidth
-		totalLines := b.Size() / b.LineWidth
+	if b.Pos < b.Buf.Size() {
+		currLine := b.DisplayPos / b.LineWidth
+		totalLines := b.DisplaySize() / b.LineWidth
 		if currLine < totalLines {
 			for cnt := 0; cnt < totalLines-currLine; cnt++ {
 				fmt.Print(CursorDown)
 			}
-			remainder := b.Size() % b.LineWidth
+			remainder := b.DisplaySize() % b.LineWidth
 			fmt.Printf(CursorBOL + cursorRightN(len(b.Prompt.prompt())+remainder))
 		} else {
-			fmt.Print(cursorRightN(b.Size() - b.Pos))
+			fmt.Print(cursorRightN(b.DisplaySize() - b.DisplayPos))
 		}

-		b.Pos = b.Size()
+		b.Pos = b.Buf.Size()
+		b.DisplayPos = b.DisplaySize()
 	}
 }

-func (b *Buffer) Size() int {
-	return b.Buf.Size()
+// DisplaySize returns the summed runewidth.RuneWidth of every rune held in b.Buf.
+// Elements that cannot be fetched or that are not runes contribute nothing.
+func (b *Buffer) DisplaySize() int {
+	width := 0
+	for idx := 0; idx < b.Buf.Size(); idx++ {
+		elem, found := b.Buf.Get(idx)
+		if r, isRune := elem.(rune); found && isRune {
+			width += runewidth.RuneWidth(r)
+		}
+	}
+	return width
 }

 func (b *Buffer) Add(r rune) {
+
 	if b.Pos == b.Buf.Size() {
-		fmt.Printf("%c", r)
-		b.Buf.Add(r)
-		b.Pos += 1
-		if b.Pos > 0 && b.Pos%b.LineWidth == 0 {
+		b.AddChar(r, false)
+	} else {
+		b.AddChar(r, true)
+	}
+}
+
+func (b *Buffer) AddChar(r rune, insert bool) {
+	rLength := runewidth.RuneWidth(r)
+	b.DisplayPos += rLength
+
+	if b.Pos > 0 {
+
+		if b.DisplayPos%b.LineWidth == 0 {
+			fmt.Printf("%c", r)
 			fmt.Printf("\n%s", b.Prompt.AltPrompt)
+
+			if insert {
+				b.LineHasSpace.Set(b.DisplayPos/b.LineWidth-1, false)
+			} else {
+				b.LineHasSpace.Add(false)
+			}
+
+			// this case occurs when a double-width rune crosses the line boundary
+		} else if b.DisplayPos%b.LineWidth < (b.DisplayPos-rLength)%b.LineWidth {
+			if insert {
+				fmt.Print(ClearToEOL)
+			}
+			fmt.Printf("\n%s", b.Prompt.AltPrompt)
+			b.DisplayPos += 1
+			fmt.Printf("%c", r)
+
+			if insert {
+				b.LineHasSpace.Set(b.DisplayPos/b.LineWidth-1, true)
+			} else {
+				b.LineHasSpace.Add(true)
+			}
+
+		} else {
+			fmt.Printf("%c", r)
 		}
 	} else {
 		fmt.Printf("%c", r)
+	}
+
+	if insert {
 		b.Buf.Insert(b.Pos, r)
-		b.Pos += 1
-		if b.Pos > 0 && b.Pos%b.LineWidth == 0 {
-			fmt.Printf("\n%s", b.Prompt.AltPrompt)
-		}
+	} else {
+		b.Buf.Add(r)
+	}
+
+	b.Pos += 1
+
+	if insert {
 		b.drawRemaining()
 	}
 }

+// countRemainingLineWidth returns the byte length of the run of runes starting at
+// b.Pos that keeps the display column, counted up from place, <= b.LineWidth.
+func (b *Buffer) countRemainingLineWidth(place int) int {
+	total, pending := 0, 0
+	for offset := 0; place <= b.LineWidth; offset++ {
+		// commit the previous rune's bytes only now, so the rune that pushes
+		// place past b.LineWidth is never included in the total
+		total += pending
+		elem, found := b.Buf.Get(b.Pos + offset)
+		if !found {
+			break
+		}
+		if r, isRune := elem.(rune); isRune {
+			place += runewidth.RuneWidth(r)
+			pending = len(string(r))
+		}
+	}
+
+	return total
+}
+
 func (b *Buffer) drawRemaining() {
 	var place int
 	remainingText := b.StringN(b.Pos)
 	if b.Pos > 0 {
-		place = b.Pos % b.LineWidth
+		place = b.DisplayPos % b.LineWidth
 	}
 	fmt.Print(CursorHide)

 	// render the rest of the current line
-	currLine := remainingText[:min(b.LineWidth-place, len(remainingText))]
+	currLineLength := b.countRemainingLineWidth(place)
+
+	currLine := remainingText[:min(currLineLength, len(remainingText))]
+	currLineSpace := runewidth.StringWidth(currLine)
+	remLength := runewidth.StringWidth(remainingText)
+
 	if len(currLine) > 0 {
 		fmt.Printf(ClearToEOL + currLine)
-		fmt.Print(cursorLeftN(len(currLine)))
+		fmt.Print(cursorLeftN(currLineSpace))
 	} else {
 		fmt.Print(ClearToEOL)
 	}

+	if currLineSpace != b.LineWidth-place && currLineSpace != remLength {
+		b.LineHasSpace.Set(b.DisplayPos/b.LineWidth, true)
+	} else if currLineSpace != b.LineWidth-place {
+		b.LineHasSpace.Remove(b.DisplayPos / b.LineWidth)
+	} else {
+		b.LineHasSpace.Set(b.DisplayPos/b.LineWidth, false)
+	}
+
+	if (b.DisplayPos+currLineSpace)%b.LineWidth == 0 && currLine == remainingText {
+		fmt.Print(cursorRightN(currLineSpace))
+		fmt.Printf("\n%s", b.Prompt.AltPrompt)
+		fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width-currLineSpace))
+	}
+
 	// render the other lines
-	if len(remainingText) > len(currLine) {
-		remaining := []rune(remainingText[len(currLine):])
+	if remLength > currLineSpace {
+		remaining := (remainingText[len(currLine):])
 		var totalLines int
-		for i, c := range remaining {
-			if i%b.LineWidth == 0 {
+		var displayLength int
+		var lineLength int = currLineSpace
+
+		for _, c := range remaining {
+			if displayLength == 0 || (displayLength+runewidth.RuneWidth(c))%b.LineWidth < displayLength%b.LineWidth {
 				fmt.Printf("\n%s", b.Prompt.AltPrompt)
 				totalLines += 1
+
+				if displayLength != 0 {
+					if lineLength == b.LineWidth {
+						b.LineHasSpace.Set(b.DisplayPos/b.LineWidth+totalLines-1, false)
+					} else {
+						b.LineHasSpace.Set(b.DisplayPos/b.LineWidth+totalLines-1, true)
+					}
+				}
+
+				lineLength = 0
 			}
+
+			displayLength += runewidth.RuneWidth(c)
+			lineLength += runewidth.RuneWidth(c)
 			fmt.Printf("%c", c)
 		}
 		fmt.Print(ClearToEOL)
 		fmt.Print(cursorUpN(totalLines))
-		fmt.Printf(CursorBOL + cursorRightN(b.Width-len(currLine)))
+		fmt.Printf(CursorBOL + cursorRightN(b.Width-currLineSpace))
+
+		hasSpace := b.GetLineSpacing(b.DisplayPos / b.LineWidth)
+
+		if hasSpace && b.DisplayPos%b.LineWidth != b.LineWidth-1 {
+			fmt.Print(CursorLeft)
+		}
 	}

 	fmt.Print(CursorShow)
@@ -189,46 +356,84 @@ func (b *Buffer) drawRemaining() {

 func (b *Buffer) Remove() {
 	if b.Buf.Size() > 0 && b.Pos > 0 {
-		if b.Pos%b.LineWidth == 0 {
-			// if the user backspaces over the word boundary, do this magic to clear the line
-			// and move to the end of the previous line
-			fmt.Printf(CursorBOL + ClearToEOL)
-			fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width) + " " + CursorLeft)
-		} else {
-			fmt.Printf(CursorLeft + " " + CursorLeft)
-		}

-		var eraseExtraLine bool
-		if (b.Size()-1)%b.LineWidth == 0 {
-			eraseExtraLine = true
-		}
+		if e, ok := b.Buf.Get(b.Pos - 1); ok {
+			if r, ok := e.(rune); ok {
+				rLength := runewidth.RuneWidth(r)
+				hasSpace := b.GetLineSpacing(b.DisplayPos/b.LineWidth - 1)

-		b.Pos -= 1
-		b.Buf.Remove(b.Pos)
+				if b.DisplayPos%b.LineWidth == 0 {
+					// if the user backspaces over the word boundary, do this magic to clear the line
+					// and move to the end of the previous line
+					fmt.Printf(CursorBOL + ClearToEOL)
+					fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width))

-		if b.Pos < b.Size() {
-			b.drawRemaining()
-			// this erases a line which is left over when backspacing in the middle of a line and there
-			// are trailing characters which go over the line width boundary
-			if eraseExtraLine {
-				remainingLines := (b.Size() - b.Pos) / b.LineWidth
-				fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL)
-				place := b.Pos % b.LineWidth
-				fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.prompt())))
+					if b.DisplaySize()%b.LineWidth < (b.DisplaySize()-rLength)%b.LineWidth {
+						b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1)
+					}
+
+					if hasSpace {
+						b.DisplayPos -= 1
+						fmt.Print(CursorLeft)
+					}
+
+					if rLength == 2 {
+						fmt.Print(CursorLeft + "  " + cursorLeftN(2))
+					} else {
+						fmt.Print(" " + CursorLeft)
+					}
+
+				} else if (b.DisplayPos-rLength)%b.LineWidth == 0 && hasSpace {
+					fmt.Printf(CursorBOL + ClearToEOL)
+					fmt.Printf(CursorUp + CursorBOL + cursorRightN(b.Width))
+
+					if b.Pos == b.Buf.Size() {
+						b.LineHasSpace.Remove(b.DisplayPos/b.LineWidth - 1)
+					}
+					b.DisplayPos -= 1
+
+				} else {
+					fmt.Print(cursorLeftN(rLength))
+					for i := 0; i < rLength; i++ {
+						fmt.Print(" ")
+					}
+					fmt.Print(cursorLeftN(rLength))
+				}
+
+				var eraseExtraLine bool
+				if (b.DisplaySize()-1)%b.LineWidth == 0 || (rLength == 2 && ((b.DisplaySize()-2)%b.LineWidth == 0)) || b.DisplaySize()%b.LineWidth == 0 {
+					eraseExtraLine = true
+				}
+
+				b.Pos -= 1
+				b.DisplayPos -= rLength
+				b.Buf.Remove(b.Pos)
+
+				if b.Pos < b.Buf.Size() {
+					b.drawRemaining()
+					// this erases a line which is left over when backspacing in the middle of a line and there
+					// are trailing characters which go over the line width boundary
+					if eraseExtraLine {
+						remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth
+						fmt.Printf(cursorDownN(remainingLines+1) + CursorBOL + ClearToEOL)
+						place := b.DisplayPos % b.LineWidth
+						fmt.Printf(cursorUpN(remainingLines+1) + cursorRightN(place+len(b.Prompt.prompt())))
+					}
+				}
 			}
 		}
 	}
 }

 func (b *Buffer) Delete() {
-	if b.Size() > 0 && b.Pos < b.Size() {
+	if b.Buf.Size() > 0 && b.Pos < b.Buf.Size() {
 		b.Buf.Remove(b.Pos)
 		b.drawRemaining()
-		if b.Size()%b.LineWidth == 0 {
-			if b.Pos != b.Size() {
-				remainingLines := (b.Size() - b.Pos) / b.LineWidth
+		if b.DisplaySize()%b.LineWidth == 0 {
+			if b.DisplayPos != b.DisplaySize() {
+				remainingLines := (b.DisplaySize() - b.DisplayPos) / b.LineWidth
 				fmt.Printf(cursorDownN(remainingLines) + CursorBOL + ClearToEOL)
-				place := b.Pos % b.LineWidth
+				place := b.DisplayPos % b.LineWidth
 				fmt.Printf(cursorUpN(remainingLines) + cursorRightN(place+len(b.Prompt.prompt())))
 			}
 		}
@@ -244,8 +449,8 @@ func (b *Buffer) DeleteBefore() {
 }

 func (b *Buffer) DeleteRemaining() {
-	if b.Size() > 0 && b.Pos < b.Size() {
-		charsToDel := b.Size() - b.Pos
+	if b.Buf.Size() > 0 && b.Pos < b.Buf.Size() { // b.Pos indexes runes, so bound it by the rune count, not display columns
+		charsToDel := b.Buf.Size() - b.Pos // one Delete per rune from the cursor to the end of the buffer
 		for cnt := 0; cnt < charsToDel; cnt++ {
 			b.Delete()
 		}
@@ -281,8 +486,10 @@ func (b *Buffer) ClearScreen() {
 		ph := b.Prompt.placeholder()
 		fmt.Printf(ColorGrey + ph + cursorLeftN(len(ph)) + ColorDefault)
 	} else {
-		currPos := b.Pos
+		currPos := b.DisplayPos
+		currIndex := b.Pos
 		b.Pos = 0
+		b.DisplayPos = 0
 		b.drawRemaining()
 		fmt.Printf(CursorReset + cursorRightN(len(b.Prompt.prompt())))
 		if currPos > 0 {
@@ -300,7 +507,8 @@ func (b *Buffer) ClearScreen() {
 				fmt.Printf(CursorBOL + b.Prompt.AltPrompt)
 			}
 		}
-		b.Pos = currPos
+		b.Pos = currIndex
+		b.DisplayPos = currPos
 	}
 }

@@ -309,9 +517,20 @@ func (b *Buffer) IsEmpty() bool {
 }

 func (b *Buffer) Replace(r []rune) {
+	b.DisplayPos = 0
 	b.Pos = 0
+	lineNums := b.DisplaySize() / b.LineWidth
+
 	b.Buf.Clear()
-	fmt.Printf(ClearLine + CursorBOL + b.Prompt.prompt())
+
+	fmt.Printf(CursorBOL + ClearToEOL)
+
+	for i := 0; i < lineNums; i++ {
+		fmt.Print(CursorUp + CursorBOL + ClearToEOL)
+	}
+
+	fmt.Printf(CursorBOL + b.Prompt.prompt())
+
 	for _, c := range r {
 		b.Add(c)
 	}
@@ -328,7 +547,7 @@ func (b *Buffer) StringN(n int) string {
 func (b *Buffer) StringNM(n, m int) string {
 	var s string
 	if m == 0 {
-		m = b.Size()
+		m = b.Buf.Size()
 	}
 	for cnt := n; cnt < m; cnt++ {
 		c, _ := b.Buf.Get(cnt)
--- a/readline/readline.go
+++ b/readline/readline.go
@@ -150,7 +150,7 @@ func (i *Instance) Readline() (string, error) {
 					i.Pasting = false
 				}
 			case KeyDel:
-				if buf.Size() > 0 {
+				if buf.DisplaySize() > 0 {
 					buf.Delete()
 				}
 				metaDel = true
@@ -202,7 +202,7 @@ func (i *Instance) Readline() (string, error) {
 				buf.Add(' ')
 			}
 		case CharDelete:
-			if buf.Size() > 0 {
+			if buf.DisplaySize() > 0 {
 				buf.Delete()
 			} else {
 				return "", io.EOF
--- a/scripts/install.sh
+++ b/scripts/install.sh
@@ -33,9 +33,11 @@ case "$ARCH" in
    *) error "Unsupported architecture: $ARCH" ;;
 esac

+IS_WSL2=false
+
 KERN=$(uname -r)
 case "$KERN" in
-    *icrosoft*WSL2 | *icrosoft*wsl2) ;;
+    *icrosoft*WSL2 | *icrosoft*wsl2) IS_WSL2=true;;
    *icrosoft) error "Microsoft WSL1 is not currently supported. Please upgrade to WSL2 with 'wsl --set-version <distro> 2'" ;;
    *) ;;
 esac
@@ -72,7 +74,7 @@ status "Installing ollama to $BINDIR..."
 $SUDO install -o0 -g0 -m755 -d $BINDIR
 $SUDO install -o0 -g0 -m755 $TEMP_DIR/ollama $BINDIR/ollama

-install_success() { 
+install_success() {
    status 'The Ollama API is now available at 127.0.0.1:11434.'
    status 'Install complete. Run "ollama" from the command line.'
 }
@@ -131,6 +133,17 @@ if available systemctl; then
    configure_systemd
 fi

+# WSL2 only supports GPUs via nvidia passthrough
+# so check for nvidia-smi to determine if GPU is available
+if [ "$IS_WSL2" = true ]; then
+    if available nvidia-smi && [ -n "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\.[0-9]*")" ]; then
+        status "Nvidia GPU detected."
+    fi
+    install_success
+    exit 0
+fi
+
+# Install GPU dependencies on Linux
 if ! available lspci && ! available lshw; then
    warning "Unable to detect NVIDIA/AMD GPU. Install lspci or lshw to automatically detect and install GPU dependencies."
    exit 0
@@ -139,12 +152,12 @@ fi
 check_gpu() {
    # Look for devices based on vendor ID for NVIDIA and AMD
    case $1 in
-        lspci) 
+        lspci)
            case $2 in
                nvidia) available lspci && lspci -d '10de:' | grep -q 'NVIDIA' || return 1 ;;
                amdgpu) available lspci && lspci -d '1002:' | grep -q 'AMD' || return 1 ;;
            esac ;;
-        lshw) 
+        lshw)
            case $2 in
                nvidia) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[10DE\]' || return 1 ;;
                amdgpu) available lshw && $SUDO lshw -c display -numeric | grep -q 'vendor: .* \[1002\]' || return 1 ;;
@@ -181,7 +194,7 @@ if check_gpu lspci amdgpu || check_gpu lshw amdgpu; then
    curl --fail --show-error --location --progress-bar "https://ollama.com/download/ollama-linux-amd64-rocm.tgz${VER_PARAM}" \
        | $SUDO tar zx --owner ollama --group ollama -C /usr/share/ollama/lib/rocm .
    install_success
-    status "AMD GPU dependencies installed."
+    status "AMD GPU ready."
    exit 0
 fi

@@ -274,7 +287,7 @@ if ! check_gpu nvidia-smi || [ -z "$(nvidia-smi | grep -o "CUDA Version: [0-9]*\
    esac
 fi

-if ! lsmod | grep -q nvidia; then
+if ! lsmod | grep -q nvidia || ! lsmod | grep -q nvidia_uvm; then
    KERNEL_RELEASE="$(uname -r)"
    case $OS_NAME in
        rocky) $SUDO $PACKAGE_MANAGER -y install kernel-devel kernel-headers ;;
@@ -295,7 +308,19 @@ if ! lsmod | grep -q nvidia; then
    fi

    $SUDO modprobe nvidia
+    $SUDO modprobe nvidia_uvm
 fi

+# when nvidia-persistenced is present, list the NVIDIA modules in modules-load.d so they load on boot (privileged writes go through $SUDO like the rest of this script)
+if command -v nvidia-persistenced > /dev/null 2>&1; then
+    $SUDO touch /etc/modules-load.d/nvidia.conf
+    MODULES="nvidia nvidia-uvm"
+    for MODULE in $MODULES; do
+        if ! grep -qxF "$MODULE" /etc/modules-load.d/nvidia.conf; then
+            echo "$MODULE" | $SUDO tee -a /etc/modules-load.d/nvidia.conf > /dev/null
+        fi
+    done
+fi

-status "NVIDIA CUDA drivers installed."
+status "NVIDIA GPU ready."
+install_success