From cae6a847ada88e415b0beda09d70d79b51762618 Mon Sep 17 00:00:00 2001 From: Concedo <39025047+LostRuins@users.noreply.github.com> Date: Mon, 7 Aug 2023 16:40:13 +0800 Subject: [PATCH] cuda free only for non mmq (+2 squashed commit) Squashed commit: [3aca763a] only cuda free for non mmq [e69a8c9f] revert to pool alloc to try again --- ggml-cuda.cu | 64 ++++++++++++++++++++++++---------------------------- koboldcpp.py | 2 +- 2 files changed, 31 insertions(+), 35 deletions(-) diff --git a/ggml-cuda.cu b/ggml-cuda.cu index 9d42efb0d0b03..96f51a04cb009 100644 --- a/ggml-cuda.cu +++ b/ggml-cuda.cu @@ -4288,58 +4288,55 @@ struct cuda_buffer { static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS]; static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT; +static bool g_mul_mat_q = false; static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) { scoped_spin_lock lock(g_cuda_pool_lock); int id; CUDA_CHECK(cudaGetDevice(&id)); -#ifdef DEBUG_CUDA_MALLOC - int nnz = 0; - size_t max_size = 0, tot_size = 0; -#endif - size_t best_diff = 1ull << 36; - int ibest = -1; + + int best_i = -1; + size_t best_size = std::numeric_limits<size_t>::max(); //smallest unused buffer that fits our needs + int worst_i = -1; + size_t worst_size = 0; //largest unused buffer seen so far + for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) { cuda_buffer& b = g_cuda_buffer_pool[id][i]; - if (b.ptr != nullptr) { -#ifdef DEBUG_CUDA_MALLOC - ++nnz; - tot_size += b.size; - if (b.size > max_size) max_size = b.size; -#endif - if (b.size >= size) { - size_t diff = b.size - size; - if (diff < best_diff) { - best_diff = diff; - ibest = i; - if (!best_diff) { - void * ptr = b.ptr; - *actual_size = b.size; - b.ptr = nullptr; - b.size = 0; - return ptr; - } - } - } + if (b.size > 0 && b.size >= size && b.size < best_size) + { + best_i = i; + best_size = b.size; + } + if (b.size > 0 && b.size > worst_size) + { + worst_i = i; + worst_size = b.size; } } - if (ibest >= 0) { - cuda_buffer& b 
= g_cuda_buffer_pool[id][ibest]; + if(best_i!=-1) //found the smallest buffer that fits our needs + { + cuda_buffer& b = g_cuda_buffer_pool[id][best_i]; void * ptr = b.ptr; *actual_size = b.size; b.ptr = nullptr; b.size = 0; return ptr; } -#ifdef DEBUG_CUDA_MALLOC - fprintf(stderr, "%s: %d buffers, max_size = %u MB, tot_size = %u MB, requested %u MB\n", __func__, nnz, - (uint32_t)(max_size/1024/1024), (uint32_t)(tot_size/1024/1024), (uint32_t)(size/1024/1024)); -#endif + if(worst_i!=-1 && !g_mul_mat_q) //no buffer that fits our needs, resize largest one to save memory (non mmq only) + { + cuda_buffer& b = g_cuda_buffer_pool[id][worst_i]; + b.size = 0; + void * ptr = b.ptr; + cudaFree(ptr); + b.ptr = ptr = nullptr; + } void * ptr; - size_t look_ahead_size = (size_t) (1.05 * size); + + size_t look_ahead_size = (size_t) (1.02 * size); look_ahead_size = 256 * ((look_ahead_size + 255)/256); CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size)); *actual_size = look_ahead_size; + return ptr; } @@ -4369,7 +4366,6 @@ static int g_device_count = -1; static int g_main_device = 0; static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES]; static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0}; -static bool g_mul_mat_q = false; static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr}; diff --git a/koboldcpp.py b/koboldcpp.py index 153cfc54447e4..ed50f37089974 100755 --- a/koboldcpp.py +++ b/koboldcpp.py @@ -303,7 +303,7 @@ def utfprint(str): maxhordelen = 256 modelbusy = threading.Lock() defaultport = 5001 -KcppVersion = "1.39" +KcppVersion = "1.39.1" showdebug = True showsamplerwarning = True showmaxctxwarning = True