Skip to content

Commit

Permalink
cuda free only for non mmq (+2 squashed commits)
Browse files Browse the repository at this point in the history
Squashed commit:

[3aca763] only cuda free for non mmq

[e69a8c9] revert to pool alloc to try again
  • Loading branch information
LostRuins committed Aug 7, 2023
1 parent 9f16a4c commit cae6a84
Show file tree
Hide file tree
Showing 2 changed files with 31 additions and 35 deletions.
64 changes: 30 additions & 34 deletions ggml-cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -4288,58 +4288,55 @@ struct cuda_buffer {

static cuda_buffer g_cuda_buffer_pool[GGML_CUDA_MAX_DEVICES][MAX_CUDA_BUFFERS];
static std::atomic_flag g_cuda_pool_lock = ATOMIC_FLAG_INIT;
static bool g_mul_mat_q = false;

// Allocate a device buffer of at least `size` bytes, preferring reuse from the
// per-device buffer pool. On return, *actual_size holds the true capacity of the
// returned buffer (>= size). Thread-safe via g_cuda_pool_lock.
//
// Strategy:
//   1. Reuse the SMALLEST pooled buffer that is large enough (best fit).
//   2. If none fits and mul_mat_q is disabled, free the LARGEST pooled buffer
//      first so the fresh cudaMalloc below does not grow total memory use.
//   3. cudaMalloc a new buffer with a small (2%) look-ahead margin, rounded up
//      to a multiple of 256 bytes, so slightly larger future requests can reuse it.
static void * ggml_cuda_pool_malloc(size_t size, size_t * actual_size) {
    scoped_spin_lock lock(g_cuda_pool_lock);
    int id;
    CUDA_CHECK(cudaGetDevice(&id));

    int best_i = -1;
    size_t best_size = std::numeric_limits<size_t>::max(); // smallest unused buffer that fits our needs
    int worst_i = -1;
    size_t worst_size = 0; // largest unused buffer seen so far

    for (int i = 0; i < MAX_CUDA_BUFFERS; ++i) {
        cuda_buffer& b = g_cuda_buffer_pool[id][i];
        if (b.size > 0 && b.size >= size && b.size < best_size)
        {
            best_i = i;
            best_size = b.size;
        }
        if (b.size > 0 && b.size > worst_size)
        {
            worst_i = i;
            worst_size = b.size;
        }
    }
    if(best_i!=-1) //found the smallest buffer that fits our needs
    {
        cuda_buffer& b = g_cuda_buffer_pool[id][best_i];
        void * ptr = b.ptr;
        *actual_size = b.size;
        // detach the buffer from the pool; caller now owns it
        b.ptr = nullptr;
        b.size = 0;
        return ptr;
    }
    if(worst_i!=-1 && !g_mul_mat_q) //no buffer that fits our needs, resize largest one to save memory (non mmq only)
    {
        cuda_buffer& b = g_cuda_buffer_pool[id][worst_i];
        b.size = 0;
        void * ptr = b.ptr;
        // NOTE(review): cudaFree's return code is intentionally not checked here;
        // a CUDA_CHECK would abort on a prior sticky error — confirm this is deliberate.
        cudaFree(ptr);
        b.ptr = ptr = nullptr;
    }
    void * ptr;

    // over-allocate slightly and round up to a 256-byte multiple so near-size
    // future requests can be served from the pool without a new cudaMalloc
    size_t look_ahead_size = (size_t) (1.02 * size);
    look_ahead_size = 256 * ((look_ahead_size + 255)/256);
    CUDA_CHECK(cudaMalloc((void **) &ptr, look_ahead_size));
    *actual_size = look_ahead_size;

    return ptr;
}

Expand Down Expand Up @@ -4369,7 +4366,6 @@ static int g_device_count = -1;
static int g_main_device = 0;
static int g_compute_capabilities[GGML_CUDA_MAX_DEVICES];
static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
static bool g_mul_mat_q = false;

static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};

Expand Down
2 changes: 1 addition & 1 deletion koboldcpp.py
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,7 @@ def utfprint(str):
maxhordelen = 256
modelbusy = threading.Lock()
defaultport = 5001
# Version string reported by the server; bumped to 1.39.1 by this commit.
# (The diff scrape left both the old and new assignment lines in the text;
# only the post-commit value is kept.)
KcppVersion = "1.39.1"
showdebug = True
showsamplerwarning = True
showmaxctxwarning = True
Expand Down

0 comments on commit cae6a84

Please sign in to comment.