From cae6a847ada88e415b0beda09d70d79b51762618 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 7 Aug 2023 16:40:13 +0800 Subject: [PATCH] cuda free only for non mmq (+2 squashed commit) Squashed commit: [3aca763a] only cuda free for non mmq [e69a8c9f] revert to pool alloc to try again --- ggml-cuda.cu | 64 ++++++++++++++++++++++++---------------------------- koboldcpp.py | 2 +- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9d42efb0d0b03..96f51a04cb009 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4288,58 +4288,55 @@ struct cuda_buffer { static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS]; static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; +static bool g_mul_mat_q = false; static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); int id; CUDA_CHECK(cudaGetDevice(&id)); -#ifdef DEBUG_CUDA_MALLOC - int nnz = 0; - size_t max_size = 0, tot_size = 0; -#endif - size_t best_diff = 1ull << 36; - int ibest = -1; + + int best_i = -1; + size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs + int worst_i = -1; + size_t worst_size = 0; //largest unused buffer seen so far + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { cuda_buffer& b = g_cuda_buffer_pool[id][i]; - if (b.ptr != nullptr) { -#ifdef DEBUG_CUDA_MALLOC - ++nnz; - tot_size += b.size; - if (b.size > max_size) max_size = b.size; -#endif - if (b.size >= size) { - size_t diff = b.size - size; - if (diff < best_diff) { - best_diff = diff; - ibest = i; - if (!best_diff) { - void * ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; - return ptr; - } - } - } + if (b.size > 0 && b.size >= size && b.size < best_size) + { + best_i = i; + best_size = b.size; + } + if (b.size > 0 && b.size > worst_size) + { + worst_i = i; + worst_size = b.size; } } - if (ibest >= 0) { - cuda_buffer& b 
= g_cuda_buffer_pool[id][ibest]; + if(best_i!=-1) //found the smallest buffer that fits our needs + { + cuda_buffer& b = g_cuda_buffer_pool[id][best_i]; void * ptr = b.ptr; *actual_size = b.size; b.ptr = nullptr; b.size = 0; return ptr; } -#ifdef DEBUG_CUDA_MALLOC - fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz, - (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024)); -#endif + if(worst_i!=-1 && !g_mul_mat_q) //no buffer that fits our needs, resize largest one to save memory (non mmq only) + { + cuda_buffer& b = g_cuda_buffer_pool[id][worst_i]; + b.size = 0; + void * ptr = b.ptr; + cudaFree(ptr); + b.ptr = ptr = nullptr; + } void * ptr; - size_t look_ahead_size = (size_t) (1.05 * size); + + size_t look_ahead_size = (size_t) (1.02 * size); look_ahead_size = 256 * ((look_ahead_size + 255)/256); CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size)); *actual_size = look_ahead_size; + return ptr; } @@ -4369,7 +4366,6 @@ static int g_device_count = -1; static int g_main_device = 0; static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; -static bool g_mul_mat_q = false; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; diff --git a/koboldcpp.py b/koboldcpp.py index 153cfc54447e4..ed50f37089974 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -303,7 +303,7 @@ def utfprint(str): maxhordelen = 256 modelbusy = threading.Lock() defaultport = 5001 -KcppVersion = "1.39" +KcppVersion = "1.39.1" showdebug = True showsamplerwarning = True showmaxctxwarning = True