Merge 'origin/master' into hipblas
SlyEcho committed Jun 25, 2023
2 parents df7346c + 66a2555 commit 35a6031
Showing 22 changed files with 452 additions and 211 deletions.
14 changes: 12 additions & 2 deletions README.md
@@ -9,6 +9,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++

**Hot topics:**

- Azure CI brainstorming: https://github.com/ggerganov/llama.cpp/discussions/1985
- p1 : LLM-based code completion engine at the edge : https://github.com/ggml-org/p1/discussions/1
- Roadmap June 2023: https://github.com/ggerganov/llama.cpp/discussions/1729

@@ -29,6 +30,7 @@ Inference of [LLaMA](https://arxiv.org/abs/2302.13971) model in pure C/C++
<li><a href="#quantization">Quantization</a></li>
<li><a href="#interactive-mode">Interactive mode</a></li>
<li><a href="#instruction-mode-with-alpaca">Instruction mode with Alpaca</a></li>
<li><a href="#using-openllama">Using OpenLLaMA</a></li>
<li><a href="#using-gpt4all">Using GPT4All</a></li>
<li><a href="#using-pygmalion-7b--metharme-7b">Using Pygmalion 7B & Metharme 7B</a></li>
<li><a href="#obtaining-the-facebook-llama-original-model-and-stanford-alpaca-model-data">Obtaining the Facebook LLaMA original model and Stanford Alpaca model data</a></li>
@@ -543,6 +545,13 @@ cadaver, cauliflower, cabbage (vegetable), catalpa (tree) and Cailleach.
>
```

### Using [OpenLLaMA](https://github.com/openlm-research/open_llama)

OpenLLaMA is an openly licensed reproduction of Meta's original LLaMA model. It uses the same architecture and is a drop-in replacement for the original LLaMA weights.
- Download the [3B](https://huggingface.co/openlm-research/open_llama_3b), [7B](https://huggingface.co/openlm-research/open_llama_7b), or [13B](https://huggingface.co/openlm-research/open_llama_13b) model from Hugging Face.
- Convert the model to ggml FP16 format using `python convert.py <path to OpenLLaMA directory>`
### Using [GPT4All](https://github.com/nomic-ai/gpt4all)
- Obtain the `tokenizer.model` file from LLaMA model and put it to `models`
@@ -672,12 +681,13 @@ Upon completion of the aforementioned steps, you will have successfully compiled
```
GGML_OPENCL_PLATFORM=0
GGML_OPENCL_DEVICE=0
export LD_LIBRARY_PATH=/system/vendor/lib64:$LD_LIBRARY_PATH
./main (...)
export LD_LIBRARY_PATH=/vendor/lib64:$LD_LIBRARY_PATH
```

For easy and swift re-execution, consider documenting this final part in a .sh script file. This will enable you to rerun the process with minimal hassle.

Place your desired model into the `/llama.cpp/models/` directory and execute the `./main (...)` script.

### Docker

#### Prerequisites
91 changes: 44 additions & 47 deletions build.zig
@@ -1,61 +1,58 @@
const std = @import("std");

// Zig Version: 0.11.0-dev.3379+629f0d23b
pub fn build(b: *std.build.Builder) void {
const target = b.standardTargetOptions(.{});
const optimize = b.standardReleaseOptions();
const want_lto = b.option(bool, "lto", "Want -fLTO");

const lib = b.addStaticLibrary("llama", null);
lib.want_lto = want_lto;
lib.setTarget(target);
lib.setBuildMode(optimize);
const optimize = b.standardOptimizeOption(.{});
const lib = b.addStaticLibrary(.{
.name = "llama",
.target = target,
.optimize = optimize,
});
lib.linkLibC();
lib.linkLibCpp();
lib.addIncludePath(".");
lib.addIncludePath("examples");
lib.addIncludePath("./examples");
lib.addCSourceFiles(&.{
"ggml.c",
}, &.{"-std=c11"});
lib.addCSourceFiles(&.{
"llama.cpp",
}, &.{"-std=c++11"});
lib.install();

const build_args = .{ .b = b, .lib = lib, .target = target, .optimize = optimize, .want_lto = want_lto };

const exe = build_example("main", build_args);
_ = build_example("quantize", build_args);
_ = build_example("perplexity", build_args);
_ = build_example("embedding", build_args);

// create "zig build run" command for ./main

const run_cmd = exe.run();
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| {
run_cmd.addArgs(args);
b.installArtifact(lib);

const examples = .{
"main",
"baby-llama",
"embedding",
// "metal",
"perplexity",
"quantize",
"quantize-stats",
"save-load-state",
// "server",
"simple",
"train-text-from-scratch",
};

inline for (examples) |example_name| {
const exe = b.addExecutable(.{
.name = example_name,
.target = target,
.optimize = optimize,
});
exe.addIncludePath(".");
exe.addIncludePath("./examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{example_name, example_name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
b.installArtifact(exe);
const run_cmd = b.addRunArtifact(exe);
run_cmd.step.dependOn(b.getInstallStep());
if (b.args) |args| run_cmd.addArgs(args);
const run_step = b.step("run_" ++ example_name, "Run the app");
run_step.dependOn(&run_cmd.step);
}

const run_step = b.step("run", "Run the app");
run_step.dependOn(&run_cmd.step);
}

fn build_example(comptime name: []const u8, args: anytype) *std.build.LibExeObjStep {
const b = args.b;
const lib = args.lib;
const want_lto = args.want_lto;

const exe = b.addExecutable(name, null);
exe.want_lto = want_lto;
lib.setTarget(args.target);
lib.setBuildMode(args.optimize);
exe.addIncludePath(".");
exe.addIncludePath("examples");
exe.addCSourceFiles(&.{
std.fmt.comptimePrint("examples/{s}/{s}.cpp", .{name, name}),
"examples/common.cpp",
}, &.{"-std=c++11"});
exe.linkLibrary(lib);
exe.install();

return exe;
}
4 changes: 2 additions & 2 deletions convert.py
@@ -998,9 +998,9 @@ def write_vocab(self, vocab: Vocab) -> None:
def write_vocab_only(fname_out: Path, vocab: Vocab) -> None:
of = OutputFile(fname_out)
params = Params(n_vocab=vocab.vocab_size, n_embd=0, n_mult=0,
n_head=1, n_layer=0, file_type=GGMLFileType.AllF32)
n_head=1, n_layer=0)
of = OutputFile(fname_out)
of.write_file_header(params)
of.write_file_header(params, file_type=GGMLFileType.AllF32)
of.write_vocab(vocab)
of.fout.close()

22 changes: 15 additions & 7 deletions examples/common.cpp
@@ -536,7 +536,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
return res;
}

struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params) {
auto lparams = llama_context_default_params();

lparams.n_ctx = params.n_ctx;
@@ -552,25 +552,33 @@ struct llama_context * llama_init_from_gpt_params(const gpt_params & params) {
lparams.logits_all = params.perplexity;
lparams.embedding = params.embedding;

llama_context * lctx = llama_init_from_file(params.model.c_str(), lparams);
llama_model * model = llama_load_model_from_file(params.model.c_str(), lparams);
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return std::make_tuple(nullptr, nullptr);
}

llama_context * lctx = llama_new_context_with_model(model, lparams);
if (lctx == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return NULL;
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}

if (!params.lora_adapter.empty()) {
int err = llama_apply_lora_from_file(lctx,
int err = llama_model_apply_lora_from_file(model,
params.lora_adapter.c_str(),
params.lora_base.empty() ? NULL : params.lora_base.c_str(),
params.n_threads);
if (err != 0) {
fprintf(stderr, "%s: error: failed to apply lora adapter\n", __func__);
return NULL;
llama_free(lctx);
llama_free_model(model);
return std::make_tuple(nullptr, nullptr);
}
}

return lctx;
return std::make_tuple(model, lctx);
}

void console_init(console_state & con_st) {
3 changes: 2 additions & 1 deletion examples/common.h
@@ -9,6 +9,7 @@
#include <random>
#include <thread>
#include <unordered_map>
#include <tuple>

#if !defined (_WIN32)
#include <stdio.h>
@@ -95,7 +96,7 @@ std::vector<llama_token> llama_tokenize(struct llama_context * ctx, const std::s
// Model utils
//

struct llama_context * llama_init_from_gpt_params(const gpt_params & params);
std::tuple<struct llama_model *, struct llama_context *> llama_init_from_gpt_params(const gpt_params & params);

//
// Console utils
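
Taken together, the `common.cpp` and `common.h` changes above switch `llama_init_from_gpt_params` from returning a bare `llama_context *` to returning a `(model, context)` tuple, and callers become responsible for freeing both objects, context first, then model. Below is a minimal caller sketch against the post-change API shown in this diff; the function name `run_once` and the surrounding scaffolding are illustrative, not part of the commit.

```cpp
// Sketch of a caller using the updated helper from examples/common.h.
// Assumes the post-change API in this diff: llama_init_from_gpt_params
// returns a std::tuple<llama_model *, llama_context *>, and both objects
// must be released, context first, then model.
#include <cstdio>
#include <tuple>

#include "common.h"
#include "llama.h"

int run_once(gpt_params & params) {
    llama_model   * model = nullptr;
    llama_context * ctx   = nullptr;

    // Unpack the (model, context) pair returned by the helper.
    std::tie(model, ctx) = llama_init_from_gpt_params(params);
    if (model == nullptr) {
        fprintf(stderr, "failed to load model '%s'\n", params.model.c_str());
        return 1;
    }

    // ... run evaluation / sampling with ctx here ...

    // Free the context before the model it was created from.
    llama_free(ctx);
    llama_free_model(model);
    return 0;
}
```
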
6 changes: 4 additions & 2 deletions examples/embedding/embedding.cpp
@@ -37,11 +37,12 @@ int main(int argc, char ** argv) {

llama_init_backend();

llama_model * model;
llama_context * ctx;

// load the model
ctx = llama_init_from_gpt_params(params);
if (ctx == NULL) {
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
@@ -90,6 +91,7 @@ int main(int argc, char ** argv) {

llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

return 0;
}
8 changes: 6 additions & 2 deletions examples/main/main.cpp
@@ -107,12 +107,13 @@ int main(int argc, char ** argv) {

llama_init_backend();

llama_model * model;
llama_context * ctx;
g_ctx = &ctx;

// load the model and apply lora adapter, if any
ctx = llama_init_from_gpt_params(params);
if (ctx == NULL) {
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
@@ -139,6 +140,7 @@ int main(int argc, char ** argv) {

llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

return 0;
}
@@ -147,6 +149,7 @@ int main(int argc, char ** argv) {
if (params.export_cgraph) {
llama_eval_export(ctx, "llama.ggml");
llama_free(ctx);
llama_free_model(model);

return 0;
}
@@ -666,6 +669,7 @@ int main(int argc, char ** argv) {

llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

return 0;
}
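
The underlying API split, visible in `common.cpp` above and in `quantize-stats.cpp` below, replaces the old `llama_init_from_file` with a `llama_load_model_from_file` + `llama_new_context_with_model` pair, and adds `llama_free_model` on every exit path. One plausible consequence, sketched below purely as an illustration, is that a single loaded model could back more than one context. The sketch uses only functions that appear in this diff; the model path, the `n_ctx` value, and the multi-context usage itself are assumptions, not something the commit demonstrates.

```cpp
// Illustrative sketch: one loaded model backing two independent contexts.
// Uses only API calls that appear in this diff; the path and parameter
// values are placeholders.
#include <cstdio>

#include "llama.h"

int main() {
    llama_context_params lparams = llama_context_default_params();
    lparams.n_ctx = 512; // illustrative context size

    // Load the weights once.
    llama_model * model = llama_load_model_from_file("models/7B/ggml-model-f16.bin", lparams);
    if (model == NULL) {
        fprintf(stderr, "failed to load model\n");
        return 1;
    }

    // Create two contexts that share the same model.
    llama_context * ctx_a = llama_new_context_with_model(model, lparams);
    llama_context * ctx_b = llama_new_context_with_model(model, lparams);
    if (ctx_a == NULL || ctx_b == NULL) {
        fprintf(stderr, "failed to create context\n");
        if (ctx_a) llama_free(ctx_a);
        if (ctx_b) llama_free(ctx_b);
        llama_free_model(model);
        return 1;
    }

    // ... evaluate prompts independently on ctx_a and ctx_b ...

    llama_free(ctx_a);
    llama_free(ctx_b);
    llama_free_model(model);
    return 0;
}
```
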
6 changes: 4 additions & 2 deletions examples/perplexity/perplexity.cpp
@@ -149,11 +149,12 @@ int main(int argc, char ** argv) {

llama_init_backend();

llama_model * model;
llama_context * ctx;

// load the model and apply lora adapter, if any
ctx = llama_init_from_gpt_params(params);
if (ctx == NULL) {
std::tie(model, ctx) = llama_init_from_gpt_params(params);
if (model == NULL) {
fprintf(stderr, "%s: error: unable to load model\n", __func__);
return 1;
}
@@ -169,6 +170,7 @@ int main(int argc, char ** argv) {

llama_print_timings(ctx);
llama_free(ctx);
llama_free_model(model);

return 0;
}
15 changes: 13 additions & 2 deletions examples/quantize-stats/quantize-stats.cpp
@@ -320,6 +320,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "Loading model\n");

const int64_t t_main_start_us = ggml_time_us();
llama_model * model;
llama_context * ctx;

{
@@ -330,12 +331,20 @@ int main(int argc, char ** argv) {
lparams.f16_kv = false;
lparams.use_mlock = false;

ctx = llama_init_from_file(params.model.c_str(), lparams);
model = llama_load_model_from_file(params.model.c_str(), lparams);

if (ctx == NULL) {
if (model == NULL) {
fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str());
return 1;
}

ctx = llama_new_context_with_model(model, lparams);

if (ctx == NULL) {
fprintf(stderr, "%s: error: failed to create context with model '%s'\n", __func__, params.model.c_str());
llama_free_model(model);
return 1;
}
}

const auto &tensors = llama_internal_get_tensor_map(ctx);
@@ -357,6 +366,7 @@ int main(int argc, char ** argv) {
fprintf(stderr, "%s: error: Quantization should be tested with a float model, "
"this model contains already quantized layers (%s is type %d)\n", __func__, kv_tensor.first.c_str(), kv_tensor.second->type);
llama_free(ctx);
llama_free_model(model);
return 1;
}
included_layers++;
@@ -415,6 +425,7 @@ int main(int argc, char ** argv) {


llama_free(ctx);
llama_free_model(model);
// report timing
{
const int64_t t_main_end_us = ggml_time_us();
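
Each example above now repeats the same manual cleanup pair, `llama_free(ctx)` followed by `llama_free_model(model)`, on every exit path; the commit keeps that cleanup explicit. Purely as a hedged alternative sketch, and not something this diff contains, the two frees could also be tied to scope exit with RAII wrappers. Every name below (`model_ptr`, `context_ptr`, `loaded_llama`, `load_llama`) is made up for illustration.

```cpp
// Illustrative only: RAII wrappers around the two-step lifetime introduced
// in this commit. The commit itself uses explicit llama_free/llama_free_model
// calls; the wrapper and helper names here are hypothetical.
#include <memory>

#include "llama.h"

struct llama_model_deleter {
    void operator()(llama_model * m) const { llama_free_model(m); }
};
struct llama_context_deleter {
    void operator()(llama_context * c) const { llama_free(c); }
};

using model_ptr   = std::unique_ptr<llama_model, llama_model_deleter>;
using context_ptr = std::unique_ptr<llama_context, llama_context_deleter>;

struct loaded_llama {
    model_ptr   model;
    context_ptr ctx; // members are destroyed in reverse order: ctx first, then model
};

// Hypothetical helper: load a model and create a context, returning owning
// pointers. Mirrors the error handling shown in the diff above.
inline loaded_llama load_llama(const char * path, llama_context_params lparams) {
    loaded_llama out;
    out.model.reset(llama_load_model_from_file(path, lparams));
    if (out.model) {
        out.ctx.reset(llama_new_context_with_model(out.model.get(), lparams));
        if (!out.ctx) {
            out.model.reset(); // context creation failed: release the model too
        }
    }
    return out;
}
```
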
(Diffs for the remaining changed files are not shown.)
