diff --git a/.gitignore b/.gitignore
index 54dcebc4ddb7f..565866fd4bcdc 100644
--- a/.gitignore
+++ b/.gitignore
@@ -28,7 +28,7 @@ models/*
 /result
 /perplexity
 /embedding
-/benchmark-q4_0-matmult
+/benchmark-matmult
 /vdot
 /Pipfile

diff --git a/CMakeLists.txt b/CMakeLists.txt
index bf064cf774d65..e4d37497ee199 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -367,7 +367,7 @@ endif()
 add_library(llama
             llama.cpp
             llama.h
-            llama_util.h)
+            llama-util.h)

 target_include_directories(llama PUBLIC .)
 target_compile_features(llama PUBLIC cxx_std_11) # don't bump

diff --git a/Makefile b/Makefile
index c847b180942c6..8c9721d743fcd 100644
--- a/Makefile
+++ b/Makefile
@@ -34,10 +34,15 @@ endif
 #

 # keep standard at C11 and C++11
-CFLAGS   = -I.              -O3 -DNDEBUG -std=c11   -fPIC
-CXXFLAGS = -I. -I./examples -O3 -DNDEBUG -std=c++11 -fPIC
+CFLAGS   = -I.              -O3 -std=c11   -fPIC
+CXXFLAGS = -I. -I./examples -O3 -std=c++11 -fPIC
 LDFLAGS  =

+ifndef LLAMA_DEBUG
+	CFLAGS   += -DNDEBUG
+	CXXFLAGS += -DNDEBUG
+endif
+
 # warnings
 CFLAGS   += -Wall -Wextra -Wpedantic -Wcast-qual -Wdouble-promotion -Wshadow -Wstrict-prototypes -Wpointer-arith
 CXXFLAGS += -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wno-multichar
@@ -181,14 +186,14 @@ $(info )
 ggml.o: ggml.c ggml.h ggml-cuda.h
 	$(CC) $(CFLAGS) -c $< -o $@

-llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama_util.h
+llama.o: llama.cpp ggml.h ggml-cuda.h llama.h llama-util.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 common.o: examples/common.cpp examples/common.h
 	$(CXX) $(CXXFLAGS) -c $< -o $@

 clean:
-	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-q4_0-matmult
+	rm -vf *.o main quantize quantize-stats perplexity embedding benchmark-matmult

 main: examples/main/main.cpp ggml.o llama.o common.o $(OBJS)
 	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
@@ -218,9 +223,9 @@ libllama.so: llama.o ggml.o $(OBJS)
 # Tests
 #

-benchmark: examples/benchmark/benchmark-q4_0-matmult.c ggml.o $(OBJS)
-	$(CXX) $(CXXFLAGS) $^ -o benchmark-q4_0-matmult $(LDFLAGS)
-	./benchmark-q4_0-matmult
+benchmark-matmult: examples/benchmark/benchmark-matmult.cpp ggml.o $(OBJS)
+	$(CXX) $(CXXFLAGS) $^ -o $@ $(LDFLAGS)
+	./$@

 .PHONY: tests
 tests:

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index be35363f58a03..0973a3fa1a8b6 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -35,4 +35,5 @@ else()
     add_subdirectory(perplexity)
     add_subdirectory(embedding)
     add_subdirectory(save-load-state)
+    add_subdirectory(benchmark)
 endif()

diff --git a/examples/benchmark/CMakeLists.txt b/examples/benchmark/CMakeLists.txt
new file mode 100644
index 0000000000000..05deebcd10c79
--- /dev/null
+++ b/examples/benchmark/CMakeLists.txt
@@ -0,0 +1,4 @@
+set(TARGET benchmark)
+add_executable(${TARGET} benchmark-matmult.cpp)
+target_link_libraries(${TARGET} PRIVATE common llama ${CMAKE_THREAD_LIBS_INIT})
+target_compile_features(${TARGET} PRIVATE cxx_std_11)

diff --git a/examples/benchmark/benchmark-q4_0-matmult.c b/examples/benchmark/benchmark-matmult.cpp
similarity index 92%
rename from examples/benchmark/benchmark-q4_0-matmult.c
rename to examples/benchmark/benchmark-matmult.cpp
index 84b06766c15dc..19cbab1c38825 100644
--- a/examples/benchmark/benchmark-q4_0-matmult.c
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -1,11 +1,3 @@
-/*
-    License: MIT License
-
-    Changelog:
-    - 2023-03-31 Initial version by Sebastian Apel (https://github.com/SebastianApel)
-
-*/
-
 #include
 #include "ggml.h"
 #include
@@ -45,7 +37,7 @@ float tensor_sum_elements(struct ggml_tensor * tensor) {
 #define TENSOR_TYPE_AS_STR(TYPE) TYPE == GGML_TYPE_F32 ? "FP32" : TYPE == GGML_TYPE_F16 ? "FP16" : TYPE == GGML_TYPE_Q4_0 ? "Q4_0" : TYPE == GGML_TYPE_Q4_1 ? "Q4_1" : "UNKNOWN"

-#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5d x %5d x %5d, nb = (%5li, %5li, %5li) - ", #TENSOR, \
+#define TENSOR_DUMP(TENSOR) printf("%15s: type = %i (%5s) ne = %5ld x %5ld x %5ld, nb = (%5li, %5li, %5li) - ", #TENSOR, \
     TENSOR->type,TENSOR_TYPE_AS_STR(TENSOR->type),\
     TENSOR->ne[0], TENSOR->ne[1], TENSOR->ne[2], TENSOR->nb[0], TENSOR->nb[1], TENSOR->nb[2]); \
     { float sum = tensor_sum_elements(TENSOR); printf("Sum of tensor %s is %6.2f\n",#TENSOR, sum); }

@@ -98,12 +90,9 @@ int main(int argc, char ** argv) {
         }
     }

-
     // create the ggml context
     printf("Starting Test\n");

-
-
     struct ggml_context * ctx;
     //const int sizex = 4096;
     //const int sizey = 11008;
@@ -125,16 +114,18 @@
 #endif

     //printf("Memsize required = %i\n", sizex*sizex);
-    ggml_type wtype = GGML_TYPE_F32;

     size_t ctx_size = 0;
-    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
-    ctx_size += sizex*sizey*ggml_type_sizef(wtype);
     ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
-    ctx_size += sizex*sizeof(float);
-    ctx_size += 1024*1024*100;
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizez*ggml_type_sizef(GGML_TYPE_F32);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_Q4_0);
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += sizex*sizey*ggml_type_sizef(GGML_TYPE_F32); // BLAS
+    ctx_size += 1024*1024*16;

-    printf("Allocating Memory of size %li byes, %li MB\n",ctx_size, (ctx_size/1024/1024));
+    printf("Allocating Memory of size %li bytes, %li MB\n",ctx_size, (ctx_size/1024/1024));

     struct ggml_init_params params = {
         /*.mem_size =*/ ctx_size,
@@ -217,7 +208,7 @@ int main(int argc, char ** argv) {
     const int dimz = sizez;
     long long int flops_per_dot_product = dimy + dimy;
     long long int flops_per_matrix = flops_per_dot_product * dimx * dimz; ;
-    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - aboout %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);
+    printf("Matrix Multiplication of (%i,%i,%i) x (%i,%i,%i) - about %6.2f gFLOPS\n\n", sizex, sizey, 1, sizex, sizez, 1, 1.0f*flops_per_matrix / 1000 / 1000 / 1000);


     // Let's use the F32 result from above as a reference for the q4_0 multiplication
@@ -234,7 +225,6 @@ int main(int argc, char ** argv) {
         ggml_graph_compute(ctx, &gf31);
         long long int stop = ggml_time_us();
         long long int usec = stop-start;
-        float sec = usec/1000000;
         float flops_per_usec = (1.0f*flops_per_matrix)/usec;
         printf("%9i;%8i;%6i;%6i;%6i;%15lli;%18lli;%19.2f\n",
             i,

diff --git a/examples/save-load-state/save-load-state.cpp b/examples/save-load-state/save-load-state.cpp
index 07dfa2c74ed07..f1531ba39eb5e 100644
--- a/examples/save-load-state/save-load-state.cpp
+++ b/examples/save-load-state/save-load-state.cpp
@@ -1,12 +1,9 @@
-#include
-#include
-#include
-
 #include "common.h"
 #include "llama.h"
-#include "llama.cpp"

-using namespace std;
+#include
+#include
+#include

 int main(int argc, char ** argv) {
     gpt_params params;
@@ -20,21 +17,25 @@ int main(int argc, char ** argv) {
         return 1;
     }

+    if (params.n_predict < 0) {
+        params.n_predict = 16;
+    }
+
     auto lparams = llama_context_default_params();

-    lparams.n_ctx = params.n_ctx;
-    lparams.n_parts = params.n_parts;
-    lparams.seed = params.seed;
-    lparams.f16_kv = params.memory_f16;
-    lparams.use_mmap = params.use_mmap;
-    lparams.use_mlock = params.use_mlock;
+    lparams.n_ctx     = params.n_ctx;
+    lparams.n_parts   = params.n_parts;
+    lparams.seed      = params.seed;
+    lparams.f16_kv    = params.memory_f16;
+    lparams.use_mmap  = params.use_mmap;
+    lparams.use_mlock = params.use_mlock;

     auto n_past = 0;
-    auto last_n_tokens_data = vector(params.repeat_last_n, 0);
+    auto last_n_tokens_data = std::vector(params.repeat_last_n, 0);

     // init
     auto ctx = llama_init_from_file(params.model.c_str(), lparams);
-    auto tokens = vector(params.n_ctx);
+    auto tokens = std::vector(params.n_ctx);
     auto n_prompt_tokens = llama_tokenize(ctx, params.prompt.c_str(), tokens.data(), tokens.size(), true);

     if (n_prompt_tokens < 1) {
@@ -43,26 +44,29 @@ int main(int argc, char ** argv) {
     }

     // evaluate prompt
-
     llama_eval(ctx, tokens.data(), n_prompt_tokens, n_past, params.n_threads);

     last_n_tokens_data.insert(last_n_tokens_data.end(), tokens.data(), tokens.data() + n_prompt_tokens);
     n_past += n_prompt_tokens;

+    const size_t state_size = llama_get_state_size(ctx);
+    uint8_t * state_mem = new uint8_t[state_size];
+
     // Save state (rng, logits, embedding and kv_cache) to file
-    FILE *fp_write = fopen("dump_state.bin", "wb");
-    auto state_size = llama_get_state_size(ctx);
-    auto state_mem = new uint8_t[state_size];
-    llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
-    fwrite(state_mem, 1, state_size, fp_write);
-    fclose(fp_write);
+    {
+        FILE *fp_write = fopen("dump_state.bin", "wb");
+        llama_copy_state_data(ctx, state_mem); // could also copy directly to memory mapped file
+        fwrite(state_mem, 1, state_size, fp_write);
+        fclose(fp_write);
+    }

     // save state (last tokens)
-    auto last_n_tokens_data_saved = vector(last_n_tokens_data);
-    auto n_past_saved = n_past;
+    const auto last_n_tokens_data_saved = std::vector(last_n_tokens_data);
+    const auto n_past_saved = n_past;

     // first run
     printf("\n%s", params.prompt.c_str());
+
     for (auto i = 0; i < params.n_predict; i++) {
         auto logits = llama_get_logits(ctx);
         auto n_vocab = llama_n_vocab(ctx);
@@ -75,6 +79,7 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sample_token(ctx, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx, next_token);
         last_n_tokens_data.push_back(next_token);
+
         printf("%s", next_token_str);
         if (llama_eval(ctx, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
@@ -82,24 +87,34 @@ int main(int argc, char ** argv) {
         }
         n_past += 1;
     }
+    printf("\n\n");


     // free old model
     llama_free(ctx);

     // load new model
-
     auto ctx2 = llama_init_from_file(params.model.c_str(), lparams);

     // Load state (rng, logits, embedding and kv_cache) from file
-    FILE *fp_read = fopen("dump_state.bin", "rb");
-    auto state_size2 = llama_get_state_size(ctx2);
-    if (state_size != state_size2) {
-        fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+    {
+        FILE *fp_read = fopen("dump_state.bin", "rb");
+        if (state_size != llama_get_state_size(ctx2)) {
+            fprintf(stderr, "\n%s : failed to validate state size\n", __func__);
+            return 1;
+        }
+
+        const size_t ret = fread(state_mem, 1, state_size, fp_read);
+        if (ret != state_size) {
+            fprintf(stderr, "\n%s : failed to read state\n", __func__);
+            return 1;
+        }
+
+        llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
+        fclose(fp_read);
     }
-    fread(state_mem, 1, state_size, fp_read);
-    llama_set_state_data(ctx2, state_mem); // could also read directly from memory mapped file
-    fclose(fp_read);
+
+    delete[] state_mem;

     // restore state (last tokens)
     last_n_tokens_data = last_n_tokens_data_saved;
@@ -118,6 +133,7 @@ int main(int argc, char ** argv) {
         auto next_token = llama_sample_token(ctx2, &candidates_p);
         auto next_token_str = llama_token_to_str(ctx2, next_token);
         last_n_tokens_data.push_back(next_token);
+
         printf("%s", next_token_str);
         if (llama_eval(ctx2, &next_token, 1, n_past, params.n_threads)) {
             fprintf(stderr, "\n%s : failed to evaluate\n", __func__);
@@ -125,6 +141,8 @@ int main(int argc, char ** argv) {
         }
         n_past += 1;
     }
+    printf("\n\n");
+

     return 0;
 }

diff --git a/ggml.c b/ggml.c
index 748ea8cd5e688..d8882d6446f85 100644
--- a/ggml.c
+++ b/ggml.c
@@ -668,6 +668,33 @@ uint8x8_t vzip2_u8(uint8x8_t a, uint8x8_t b) {
     return vget_high_u8(vcombine_u8(a, b));
 }

+int8x16_t vzip1q_s8(int8x16_t a, int8x16_t b) {
+    return vcombine_s8(vget_low_s8(a), vget_low_s8(b));
+}
+
+int8x16_t vzip2q_s8(int8x16_t a, int8x16_t b) {
+    return vcombine_s8(vget_high_s8(a), vget_high_s8(b));
+}
+
+uint8x16_t vzip1q_u8(uint8x16_t a, uint8x16_t b) {
+    return vcombine_u8(vget_low_u8(a), vget_low_u8(b));
+}
+
+uint8x16_t vzip2q_u8(uint8x16_t a, uint8x16_t b) {
+    return vcombine_u8(vget_high_u8(a), vget_high_u8(b));
+}
+
+int32x4_t vcvtnq_s32_f32(float32x4_t v) {
+    int32x4_t res;
+
+    res[0] = roundf(vgetq_lane_f32(v, 0));
+    res[1] = roundf(vgetq_lane_f32(v, 1));
+    res[2] = roundf(vgetq_lane_f32(v, 2));
+    res[3] = roundf(vgetq_lane_f32(v, 3));
+
+    return res;
+}
+
 #endif
 #endif

@@ -2658,35 +2685,35 @@ static void ggml_vec_dot_q4_0_q8_0(const int n, float * restrict s, const void *
         const int8x16_t v0_1ls = vsubq_s8(v0_1l, s8b);
         const int8x16_t v0_1hs = vsubq_s8(v0_1h, s8b);

+        // interleave
+        const int8x16_t v0_0lz = vzip1q_s8(v0_0ls, v0_0hs);
+        const int8x16_t v0_0hz = vzip2q_s8(v0_0ls, v0_0hs);
+        const int8x16_t v0_1lz = vzip1q_s8(v0_1ls, v0_1hs);
+        const int8x16_t v0_1hz = vzip2q_s8(v0_1ls, v0_1hs);
+
         // load y
         const int8x16_t v1_0l = vld1q_s8(y0->qs);
         const int8x16_t v1_0h = vld1q_s8(y0->qs + 16);
         const int8x16_t v1_1l = vld1q_s8(y1->qs);
         const int8x16_t v1_1h = vld1q_s8(y1->qs + 16);

-        // interleave
-        const int8x16_t v1_0ls = vuzp1q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_0hs = vuzp2q_s8(v1_0l, v1_0h);
-        const int8x16_t v1_1ls = vuzp1q_s8(v1_1l, v1_1h);
-        const int8x16_t v1_1hs = vuzp2q_s8(v1_1l, v1_1h);
-
 #if defined(__ARM_FEATURE_DOTPROD)
         // dot product into int32x4_t
-        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0ls, v1_0ls), v0_0hs, v1_0hs);
-        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1ls, v1_1ls), v0_1hs, v1_1hs);
+        const int32x4_t p_0 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_0lz, v1_0l), v0_0hz, v1_0h);
+        const int32x4_t p_1 = vdotq_s32(vdotq_s32(vdupq_n_s32(0), v0_1lz, v1_1l), v0_1hz, v1_1h);

         sumv0 = vmlaq_n_f32(sumv0, vcvtq_f32_s32(p_0), x0->d*y0->d);
         sumv1 = vmlaq_n_f32(sumv1, vcvtq_f32_s32(p_1), x1->d*y1->d);
 #else
-        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0ls), vget_low_s8 (v1_0ls));
-        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0ls), vget_high_s8(v1_0ls));
-        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hs), vget_low_s8 (v1_0hs));
-        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hs), vget_high_s8(v1_0hs));
+        const int16x8_t pl0l = vmull_s8(vget_low_s8 (v0_0lz), vget_low_s8 (v1_0l));
+        const int16x8_t pl0h = vmull_s8(vget_high_s8(v0_0lz), vget_high_s8(v1_0l));
+        const int16x8_t ph0l = vmull_s8(vget_low_s8 (v0_0hz), vget_low_s8 (v1_0h));
+        const int16x8_t ph0h = vmull_s8(vget_high_s8(v0_0hz), vget_high_s8(v1_0h));

-        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1ls), vget_low_s8 (v1_1ls));
-        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1ls), vget_high_s8(v1_1ls));
-        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hs), vget_low_s8 (v1_1hs));
-        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hs), vget_high_s8(v1_1hs));
+        const int16x8_t pl1l = vmull_s8(vget_low_s8 (v0_1lz), vget_low_s8 (v1_1l));
+        const int16x8_t pl1h = vmull_s8(vget_high_s8(v0_1lz), vget_high_s8(v1_1l));
+        const int16x8_t ph1l = vmull_s8(vget_low_s8 (v0_1hz), vget_low_s8 (v1_1h));
+        const int16x8_t ph1h = vmull_s8(vget_high_s8(v0_1hz), vget_high_s8(v1_1h));

         const int32x4_t pl0 = vaddq_s32(vpaddlq_s16(pl0l), vpaddlq_s16(pl0h));
         const int32x4_t ph0 = vaddq_s32(vpaddlq_s16(ph0l), vpaddlq_s16(ph0h));
@@ -3800,6 +3827,7 @@ static const char * GGML_OP_LABEL[GGML_OP_COUNT] = {
     "DIAG_MASK_INF",
     "SOFT_MAX",
     "ROPE",
+    "ALIBI",

     "CONV_1D_1S",
     "CONV_1D_2S",
@@ -3848,6 +3876,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = {
     "diag_mask_inf(x)",
     "soft_max(x)",
     "rope(x)",
+    "alibi(x)",

     "conv_1d_1s(x)",
     "conv_1d_2s(x)",
@@ -8245,8 +8274,6 @@ static void ggml_compute_forward_mul_mat_f16_f32(
     ggml_fp16_t * d_X = ggml_cuda_pool_malloc(sizeof(float) * x_ne, &x_size);
     ggml_fp16_t * d_Y = ggml_cuda_pool_malloc(sizeof(float) * y_ne, &y_size);
     float * d_D = ggml_cuda_pool_malloc(sizeof(float) * d_ne, &d_size);
-#else
-    float * const wdata = params->wdata;
 #endif
     for (int64_t i03 = 0; i03 < ne03; i03++) {
         for (int64_t i02 = 0; i02 < ne02; i02++) {
@@ -8263,8 +8290,11 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                     wdata[id++] = GGML_FP32_TO_FP16(*(float *) ((char *) src1->data + i03*nb13 + i02*nb12 + i01*nb11 + i00*nb10));
                 }
             }
+
+            assert(id*sizeof(ggml_fp16_t) <= params->wsize);
         }
 #else
+        float * const wdata = params->wdata;
         {
             size_t id = 0;
             for (int64_t i01 = 0; i01 < ne01; ++i01) {
@@ -8272,6 +8302,8 @@ static void ggml_compute_forward_mul_mat_f16_f32(
                     wdata[id++] = GGML_FP16_TO_FP32(*(ggml_fp16_t *) ((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01 + i00*nb00));
                 }
             }
+
+            assert(id*sizeof(float) <= params->wsize);
         }
 #endif

@@ -8537,7 +8569,10 @@ static void ggml_compute_forward_mul_mat_q_f32(
                 dequantize_row_q((char *) src0->data + i03*nb03 + i02*nb02 + i01*nb01, wdata + id, ne00);
                 id += ne00;
             }
+
+            assert(id*sizeof(float) <= params->wsize);
         }
+
         const float * x = wdata;
 #endif

@@ -9118,7 +9153,7 @@ static void ggml_compute_forward_alibi_f32(
     //const int nb3 = src0->nb[3];

     assert(nb0 == sizeof(float));
-    assert(ne1+n_past == ne0); (void) n_past;
+    assert(ne1 + n_past == ne0); (void) n_past;

     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -9179,7 +9214,7 @@ static void ggml_compute_forward_alibi_f16(
     //const int nb3 = src0->nb[3];

     assert(nb0 == sizeof(ggml_fp16_t));
-    assert(ne1+n_past == ne0); (void) n_past;
+    assert(ne1 + n_past == ne0); (void) n_past;

     // add alibi to src0 (KQ_scaled)
     const int n_heads_log2_floor = 1 << (int) floor(log2(n_head));
@@ -11571,10 +11606,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                             node->n_tasks = 1; // TODO: this actually is doing nothing
                                                //       the threads are still spinning
-                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*MAX(ggml_nelements(node->src1), ggml_nelements(node->src0));
-                            //printf("src0: ne0 = %d, ne1 = %d, ne = %d\n", node->src0->ne[0], node->src0->ne[1], node->src0->ne[0]*node->src0->ne[1]);
-                            //printf("src1: ne0 = %d, ne1 = %d, ne = %d\n", node->src1->ne[0], node->src1->ne[1], node->src1->ne[0]*node->src1->ne[1]);
-                            //printf("cur = %zu\n", cur);
+#if defined(GGML_USE_CUBLAS)
+                            // with cuBLAS, we need memory for the full 3D / 4D data of src1
+                            cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
+#else
+                            // here we need memory just for single 2D matrix from src0
+                            cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]);
+#endif
                         } else {
                             cur = GGML_TYPE_SIZE[GGML_TYPE_F16]*ggml_nelements(node->src1);
                         }
@@ -11583,7 +11621,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
 #endif
                     } else if (node->src0->type == GGML_TYPE_F32 && node->src1->type == GGML_TYPE_F32) {
                         cur = 0;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
                         if (ggml_compute_forward_mul_mat_use_blas(node->src0, node->src1, node)) {
                             node->n_tasks = 1;
                         }

diff --git a/ggml.h b/ggml.h
index 38ae9a6eeeb71..c1c5495c63f44 100644
--- a/ggml.h
+++ b/ggml.h
@@ -701,8 +701,8 @@ extern "C" {
             struct ggml_tensor * c1);

     // Mapping operations
-    GGML_API typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
-    GGML_API typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);
+    typedef void (*ggml_unary_op_f32_t)(const int, float *, const float *);
+    typedef void (*ggml_binary_op_f32_t)(const int, float *, const float *, const float *);

     GGML_API struct ggml_tensor * ggml_map_unary_f32(
             struct ggml_context * ctx,

diff --git a/llama_util.h b/llama-util.h
old mode 100755
new mode 100644
similarity index 99%
rename from llama_util.h
rename to llama-util.h
index 6e66d12a8041c..ca4dd162f59fe
--- a/llama_util.h
+++ b/llama-util.h
@@ -430,5 +430,4 @@ struct llama_ctx_buffer {
 typedef llama_buffer llama_ctx_buffer;
 #endif

-
 #endif

diff --git a/llama.cpp b/llama.cpp
index 1032fb9fa9363..f8b4c8e46b521 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -5,7 +5,7 @@
 #include
 #endif

-#include "llama_util.h"
+#include "llama-util.h"
 #include "llama.h"

 #include "ggml.h"
@@ -33,7 +33,6 @@
 #define LLAMA_USE_SCRATCH
 #define LLAMA_MAX_SCRATCH_BUFFERS 16

-
 // available llama models
 enum e_model {
     MODEL_UNKNOWN,
@@ -781,7 +780,7 @@ static bool kv_cache_init(
     const int n_embd = hparams.n_embd;
     const int n_layer = hparams.n_layer;

-    const int64_t n_mem = (int64_t)n_layer*n_ctx;
+    const int64_t n_mem = n_layer*n_ctx;
     const int64_t n_elements = n_embd*n_mem;

     cache.buf.resize(2u*n_elements*ggml_type_size(wtype) + 2u*MB);