From 3221ab01ad34393b8ccd1a5f7de6068874fb0bf4 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 09:59:05 +0100 Subject: [PATCH 01/52] common: introduce llama_load_model_from_url to download model from hf url using libopenssl only --- common/CMakeLists.txt | 10 +++ common/common.cpp | 152 ++++++++++++++++++++++++++++++++++++- common/common.h | 10 +++ examples/server/server.cpp | 8 ++ 4 files changed, 179 insertions(+), 1 deletion(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 350bbdf7f7b1b..d275ef5a65a57 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -47,6 +47,16 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() +# Check for OpenSSL +find_package(OpenSSL QUIET) +if (OPENSSL_FOUND) + add_definitions(-DHAVE_OPENSSL) + include_directories(${OPENSSL_INCLUDE_DIR}) + link_libraries(${OPENSSL_LIBRARIES}) +else() + message(WARNING "OpenSSL not found. Building without model download support.") +endif () + set(TARGET common) diff --git a/common/common.cpp b/common/common.cpp index 4912237e0d0f1..baa2ad2f9d62f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1376,10 +1376,160 @@ void llama_batch_add( batch.n_tokens++; } +struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, + struct llama_model_params params) { +#ifdef HAVE_OPENSSL + // Initialize OpenSSL + SSL_library_init(); + SSL_load_error_strings(); + OpenSSL_add_all_algorithms(); + + // Parse the URL to extract host, path, user, and password + char host[256]; + char path[256]; + char userpass[256]; + + if (sscanf(model_url, "https://%255[^/]/%255s", host, path) != 2) { + fprintf(stderr, "%s: invalid URL format: %s\n", __func__, model_url); + return nullptr; + } + + if (strstr(host, "@")) { + sscanf(host, "%[^@]@%s", userpass, host); + } + + // Create an SSL context + auto ctx = SSL_CTX_new(TLS_client_method()); + if (!ctx) { + fprintf(stderr, "%s: error creating SSL context\n", __func__); + return nullptr; + } + + // Set up certificate verification + SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, nullptr); + + // Load trusted CA certificates based on platform + const char* ca_cert_path = nullptr; +#ifdef _WIN32 + ca_cert_path = "C:\\path\\to\\ca-certificates.crt"; // Windows path (FIXME) +#elif __APPLE__ + ca_cert_path = "/etc/ssl/cert.pem"; // macOS path +#else + ca_cert_path = "/etc/ssl/certs/ca-certificates.crt"; // Linux path +#endif + + if (!SSL_CTX_load_verify_locations(ctx, ca_cert_path, nullptr)) { + fprintf(stderr, "%s: error loading CA certificates\n", __func__); + SSL_CTX_free(ctx); + return nullptr; + } + + // Create an SSL connection + auto bio = BIO_new_ssl_connect(ctx); + if (!bio) { + fprintf(stderr, "%s: error creating SSL connection\n", __func__); + SSL_CTX_free(ctx); + return nullptr; + } + + // Set the hostname + if (!BIO_set_conn_hostname(bio, host)) { + fprintf(stderr, "%s: unable to set connection hostname %s\n", __func__, host); + BIO_free_all(bio); + SSL_CTX_free(ctx); + return nullptr; + } + + // Construct the HTTP request + char request[1024]; + snprintf(request, sizeof(request), "GET /%s HTTP/1.1\r\nHost: %s\r\nAccept: */*\r\nUser-Agent: llama-client\r\nConnection: close\r\n", path, host); + + // Add Authorization header if user credentials are available + if (strlen(userpass) > 0) { + char auth_header[256]; + snprintf(auth_header, sizeof(auth_header), "Authorization: Basic %s\r\n", userpass); + strcat(request, auth_header); + } + + // End of headers + 
strcat(request, "\r\n"); + + // Send the request + fprintf(stdout, "%s: downloading model from https://%s/%s to %s ...\n", __func__, host, path, path_model); + if (!BIO_puts(bio, request)) { + fprintf(stderr, "%s: error sending HTTP request https://%s/%s\n", __func__, host, path); + BIO_free_all(bio); + SSL_CTX_free(ctx); + return nullptr; + } + + // Read the response status line + char status_line[256]; + if (BIO_gets(bio, status_line, sizeof(status_line)) <= 0) { + fprintf(stderr, "%s: error reading response status line\n", __func__); + BIO_free_all(bio); + SSL_CTX_free(ctx); + return nullptr; + } + + // Verify HTTP status code + if (strncmp(status_line, "HTTP/1.1 200", 12) != 0) { + fprintf(stderr, "%s: HTTP request failed: %s\n", __func__, status_line); + BIO_free_all(bio); + SSL_CTX_free(ctx); + return nullptr; + } + + // Skip response headers + char buffer[4096]; + int n_bytes_received; + while ((n_bytes_received = BIO_read(bio, buffer, sizeof(buffer))) > 0) { + // Look for the end of headers (empty line) + if (strstr(buffer, "\r\n\r\n")) { + break; + } + } + + // Read and save the file content + FILE* outfile = fopen(path_model, "wb"); + if (!outfile) { + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); + BIO_free_all(bio); + SSL_CTX_free(ctx); + return nullptr; + } + + int n_bytes_received_total = 0; + while ((n_bytes_received = BIO_read(bio, buffer, sizeof(buffer))) > 0) { + fwrite(buffer, 1, n_bytes_received, outfile); + n_bytes_received_total += n_bytes_received; + if (n_bytes_received_total % (1024 * 1024) == 0) { + fprintf(stdout, "%s: model downloading %dGi %s ...\n", __func__, n_bytes_received_total / 1024 / 1024, path_model); + } + } + fclose(outfile); + + // Clean up + BIO_free_all(bio); + SSL_CTX_free(ctx); + fprintf(stdout, "%s: model downloaded from https://%s/%s to %s.\n", __func__, host, path, path_model); + + return llama_load_model_from_file(path_model, params); +#else + LLAMA_LOG_ERROR("llama.cpp built without SSL support, downloading from url not supported.\n", __func__); + return nullptr; +#endif +} + std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = llama_model_params_from_gpt_params(params); - llama_model * model = llama_load_model_from_file(params.model.c_str(), mparams); + llama_model * model = nullptr; + if (!params.model_url.empty()) { + model = llama_load_model_from_url(params.model_url.c_str(), params.model.c_str(), mparams); + } else { + model = llama_load_model_from_file(params.model.c_str(), mparams); + } if (model == NULL) { fprintf(stderr, "%s: error: failed to load model '%s'\n", __func__, params.model.c_str()); return std::make_tuple(nullptr, nullptr); diff --git a/common/common.h b/common/common.h index 687f3425e8544..b9b59211254f2 100644 --- a/common/common.h +++ b/common/common.h @@ -17,6 +17,12 @@ #include #include +#ifdef HAVE_OPENSSL +#include +#include +#include +#endif + #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' #else @@ -89,6 +95,7 @@ struct gpt_params { struct llama_sampling_params sparams; std::string model = "models/7B/ggml-model-f16.gguf"; // model path + std::string model_url = ""; // model path std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string prompt = ""; @@ -191,6 +198,9 @@ std::tuple llama_init_from_gpt_par struct llama_model_params llama_model_params_from_gpt_params (const gpt_params & params); struct llama_context_params llama_context_params_from_gpt_params(const gpt_params & 
params); +struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, + struct llama_model_params params); + // Batch utils void llama_batch_clear(struct llama_batch & batch); diff --git a/examples/server/server.cpp index 895d608fdcc06..5e1020009cbf1 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2195,6 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co } printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -u MODEL_URL, --url MODEL_URL\n"); + printf(" model url (default: %s)\n", params.model_url.c_str()); printf(" -a ALIAS, --alias ALIAS\n"); printf(" set an alias for the model, will be added as `model` field in completion response\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); @@ -2317,6 +2319,12 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, break; } params.model = argv[i]; + } else if (arg == "-u" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_url = argv[i]; } else if (arg == "-a" || arg == "--alias") { if (++i >= argc) { invalid_param = true; break; From a0ebdfcc5d27d0438fe1555b35596d847a47691f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 11:32:29 +0100 Subject: [PATCH 02/52] common: llama_load_model_from_url switch to libcurl dependency --- common/CMakeLists.txt | 14 +-- common/common.cpp | 173 +++++++++++--------------------------- examples/main/README.md | 1 + examples/server/README.md | 1 + examples/server/server.cpp | 6 +- 5 files changed, 64 insertions(+), 131 deletions(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index d275ef5a65a57..79c3abdfede8e 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -47,14 +47,14 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() -# Check for OpenSSL -find_package(OpenSSL QUIET) -if (OPENSSL_FOUND) - add_definitions(-DHAVE_OPENSSL) - include_directories(${OPENSSL_INCLUDE_DIR}) - link_libraries(${OPENSSL_LIBRARIES}) +# Check for curl +find_package(CURL QUIET) +if (CURL_FOUND) + add_definitions(-DHAVE_CURL) + include_directories(${CURL_INCLUDE_DIRS}) + link_libraries(${CURL_LIBRARIES}) else() - message(WARNING "OpenSSL not found. Building without model download support.") + message(INFO "libcurl not found. 
Building without model download support.") endif () diff --git a/common/common.cpp b/common/common.cpp index baa2ad2f9d62f..4f955df30a116 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -16,6 +16,9 @@ #include #include #include +#ifdef HAVE_CURL +#include +#endif #if defined(__APPLE__) && defined(__MACH__) #include @@ -531,6 +534,12 @@ bool gpt_params_parse_ex(int argc, char ** argv, gpt_params & params) { break; } params.model = argv[i]; + } else if (arg == "-mu" || arg == "--model-url") { + if (++i >= argc) { + invalid_param = true; + break; + } + params.model_url = argv[i]; } else if (arg == "-md" || arg == "--model-draft") { if (++i >= argc) { invalid_param = true; @@ -1131,6 +1140,8 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" layer range to apply the control vector(s) to, start and end inclusive\n"); printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); + printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); + printf(" model download url (default: %s)\n", params.model_url.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); printf(" draft model for speculative decoding\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); @@ -1376,150 +1387,70 @@ void llama_batch_add( batch.n_tokens++; } +#ifdef HAVE_CURL struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, struct llama_model_params params) { -#ifdef HAVE_OPENSSL - // Initialize OpenSSL - SSL_library_init(); - SSL_load_error_strings(); - OpenSSL_add_all_algorithms(); - - // Parse the URL to extract host, path, user, and password - char host[256]; - char path[256]; - char userpass[256]; - - if (sscanf(model_url, "https://%255[^/]/%255s", host, path) != 2) { - fprintf(stderr, "%s: invalid URL format: %s\n", __func__, model_url); - return nullptr; - } - - if (strstr(host, "@")) { - sscanf(host, "%[^@]@%s", userpass, host); - } - - // Create an SSL context - auto ctx = SSL_CTX_new(TLS_client_method()); - if (!ctx) { - fprintf(stderr, "%s: error creating SSL context\n", __func__); - return nullptr; - } - - // Set up certificate verification - SSL_CTX_set_verify(ctx, SSL_VERIFY_PEER, nullptr); - - // Load trusted CA certificates based on platform - const char* ca_cert_path = nullptr; -#ifdef _WIN32 - ca_cert_path = "C:\\path\\to\\ca-certificates.crt"; // Windows path (FIXME) -#elif __APPLE__ - ca_cert_path = "/etc/ssl/cert.pem"; // macOS path -#else - ca_cert_path = "/etc/ssl/certs/ca-certificates.crt"; // Linux path -#endif + // Initialize libcurl + curl_global_init(CURL_GLOBAL_DEFAULT); + auto curl = curl_easy_init(); - if (!SSL_CTX_load_verify_locations(ctx, ca_cert_path, nullptr)) { - fprintf(stderr, "%s: error loading CA certificates\n", __func__); - SSL_CTX_free(ctx); - return nullptr; - } - - // Create an SSL connection - auto bio = BIO_new_ssl_connect(ctx); - if (!bio) { - fprintf(stderr, "%s: error creating SSL connection\n", __func__); - SSL_CTX_free(ctx); - return nullptr; - } - // Set the hostname - if (!BIO_set_conn_hostname(bio, host)) { - fprintf(stderr, "%s: unable to set connection hostname %s\n", __func__, host); - BIO_free_all(bio); - SSL_CTX_free(ctx); + if (!curl) { + curl_global_cleanup(); + fprintf(stderr, "%s: error initializing lib curl\n", __func__); return nullptr; } - // Construct the HTTP request - char request[1024]; - snprintf(request, sizeof(request), "GET /%s HTTP/1.1\r\nHost: %s\r\nAccept: */*\r\nUser-Agent: llama-client\r\nConnection: close\r\n", path, host); + // Set the 
URL + curl_easy_setopt(curl, CURLOPT_URL, model_url); + curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); - // Add Authorization header if user credentials are available - if (strlen(userpass) > 0) { - char auth_header[256]; - snprintf(auth_header, sizeof(auth_header), "Authorization: Basic %s\r\n", userpass); - strcat(request, auth_header); - } - - // End of headers - strcat(request, "\r\n"); - - // Send the request - fprintf(stdout, "%s: downloading model from https://%s/%s to %s ...\n", __func__, host, path, path_model); - if (!BIO_puts(bio, request)) { - fprintf(stderr, "%s: error sending HTTP request https://%s/%s\n", __func__, host, path); - BIO_free_all(bio); - SSL_CTX_free(ctx); + // Set the output file + auto outfile = fopen(path_model, "wb"); + if (!outfile) { + curl_easy_cleanup(curl); + curl_global_cleanup(); + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); return nullptr; } + curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); - // Read the response status line - char status_line[256]; - if (BIO_gets(bio, status_line, sizeof(status_line)) <= 0) { - fprintf(stderr, "%s: error reading response status line\n", __func__); - BIO_free_all(bio); - SSL_CTX_free(ctx); + // start the download + fprintf(stdout, "%s: downloading model from %s to %s ...\n", __func__, model_url, path_model); + auto res = curl_easy_perform(curl); + if (res != CURLE_OK) { + fclose(outfile); + curl_easy_cleanup(curl); + curl_global_cleanup(); + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); return nullptr; } - // Verify HTTP status code - if (strncmp(status_line, "HTTP/1.1 200", 12) != 0) { - fprintf(stderr, "%s: HTTP request failed: %s\n", __func__, status_line); - BIO_free_all(bio); - SSL_CTX_free(ctx); + int http_code = 0; + curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code); + if (http_code < 200 || http_code >= 400) { + fclose(outfile); + curl_easy_cleanup(curl); + curl_global_cleanup(); + fprintf(stderr, "%s: invalid http status code failed: %d\n", __func__, http_code); return nullptr; } - // Skip response headers - char buffer[4096]; - int n_bytes_received; - while ((n_bytes_received = BIO_read(bio, buffer, sizeof(buffer))) > 0) { - // Look for the end of headers (empty line) - if (strstr(buffer, "\r\n\r\n")) { - break; - } - } - - // Read and save the file content - FILE* outfile = fopen(path_model, "wb"); - if (!outfile) { - fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); - BIO_free_all(bio); - SSL_CTX_free(ctx); - return nullptr; - } - - int n_bytes_received_total = 0; - while ((n_bytes_received = BIO_read(bio, buffer, sizeof(buffer))) > 0) { - fwrite(buffer, 1, n_bytes_received, outfile); - n_bytes_received_total += n_bytes_received; - if (n_bytes_received_total % (1024 * 1024) == 0) { - fprintf(stdout, "%s: model downloading %dGi %s ...\n", __func__, n_bytes_received_total / 1024 / 1024, path_model); - } - } - fclose(outfile); - // Clean up - BIO_free_all(bio); - SSL_CTX_free(ctx); - fprintf(stdout, "%s: model downloaded from https://%s/%s to %s.\n", __func__, host, path, path_model); + fclose(outfile); + curl_easy_cleanup(curl); + curl_global_cleanup(); return llama_load_model_from_file(path_model, params); +} #else - LLAMA_LOG_ERROR("llama.cpp built without SSL support, downloading from url not supported.\n", __func__); +struct llama_model * llama_load_model_from_url(const char *, const char *, + struct 
llama_model_params) { + fprintf(stderr, "%s: llama.cpp built without SSL support, downloading from url not supported.\n", __func__); return nullptr; -#endif } +#endif std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = llama_model_params_from_gpt_params(params); diff --git a/examples/main/README.md b/examples/main/README.md index 7f84e42623274..daaa807d55952 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -67,6 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: - `-m FNAME, --model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). +- `-mu MODEL_URL --model MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. diff --git a/examples/server/README.md b/examples/server/README.md index 8f8454affaecd..df1ccce9bebe0 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -20,6 +20,7 @@ The project is under active development, and we are [looking for feedback and co - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. - `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). +- `-mu MODEL_URL --model MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. 
diff --git a/examples/server/server.cpp b/examples/server/server.cpp index 5e1020009cbf1..d2a8e541d3305 100644 --- a/examples/server/server.cpp +++ b/examples/server/server.cpp @@ -2195,8 +2195,8 @@ static void server_print_usage(const char * argv0, const gpt_params & params, co } printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); - printf(" -u MODEL_URL, --url MODEL_URL\n"); - printf(" model url (default: %s)\n", params.model_url.c_str()); + printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); + printf(" model download url (default: %s)\n", params.model_url.c_str()); printf(" -a ALIAS, --alias ALIAS\n"); printf(" set an alias for the model, will be added as `model` field in completion response\n"); printf(" --lora FNAME apply LoRA adapter (implies --no-mmap)\n"); @@ -2319,7 +2319,7 @@ static void server_params_parse(int argc, char ** argv, server_params & sparams, break; } params.model = argv[i]; - } else if (arg == "-u" || arg == "--model-url") { + } else if (arg == "-mu" || arg == "--model-url") { if (++i >= argc) { invalid_param = true; break; From 42b25dacab6ddb90fc91ef6d479f3926692e30ae Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 13:27:05 +0100 Subject: [PATCH 03/52] common: PR feedback, rename the definition to LLAMA_USE_CURL --- common/CMakeLists.txt | 2 +- common/common.cpp | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 79c3abdfede8e..9e85c2337f815 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -50,7 +50,7 @@ endif() # Check for curl find_package(CURL QUIET) if (CURL_FOUND) - add_definitions(-DHAVE_CURL) + add_definitions(-DLLAMA_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) link_libraries(${CURL_LIBRARIES}) else() diff --git a/common/common.cpp b/common/common.cpp index 4f955df30a116..1f57493dfda35 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -16,7 +16,7 @@ #include #include #include -#ifdef HAVE_CURL +#ifdef LLAMA_USE_CURL #include #endif @@ -1387,7 +1387,7 @@ void llama_batch_add( batch.n_tokens++; } -#ifdef HAVE_CURL +#ifdef LLAMA_USE_CURL struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, struct llama_model_params params) { // Initialize libcurl From 7e782856bd1416877960c2164a0f35908438c8f9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 13:45:09 +0100 Subject: [PATCH 04/52] common: LLAMA_USE_CURL in make toolchain --- Makefile | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/Makefile b/Makefile index c0f1250366a64..2ef626737745b 100644 --- a/Makefile +++ b/Makefile @@ -595,6 +595,11 @@ include scripts/get-flags.mk CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic endif +ifdef LLAMA_USE_CURL +override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL +override LDFLAGS := $(LDFLAGS) -lcurl +endif + # # Print build information # From df0d82289c14dc3d03e54c45132723a6dbcbc548 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 13:52:17 +0100 Subject: [PATCH 05/52] ci: compile the server with curl, add make option curl example in default cmake --- .github/workflows/build.yml | 1 + .github/workflows/server.yml | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0da01d5ba6ead..386ab88f29c2d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -39,6 +39,7 @@ jobs: id: make_build env: 
LLAMA_FATAL_WARNINGS: 1 + LLAMA_USE_CURL: 1 run: | CC=gcc-8 make -j $(nproc) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 5e38b3547c659..51340662a277f 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -57,7 +57,8 @@ jobs: cmake \ python3-pip \ wget \ - language-pack-en + language-pack-en \ + libcurl4-openssl-dev - name: Build id: cmake_build From 80bec9890a57bc53d28c22669dbe9a6eed8ae1b9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 14:08:21 +0100 Subject: [PATCH 06/52] llama_load_model_from_url: try to make the windows build passing --- common/common.cpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1f57493dfda35..fc315e2fb4dc5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1394,7 +1394,6 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha curl_global_init(CURL_GLOBAL_DEFAULT); auto curl = curl_easy_init(); - if (!curl) { curl_global_cleanup(); fprintf(stderr, "%s: error initializing lib curl\n", __func__); @@ -1445,11 +1444,13 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha return llama_load_model_from_file(path_model, params); } #else -struct llama_model * llama_load_model_from_url(const char *, const char *, - struct llama_model_params) { - fprintf(stderr, "%s: llama.cpp built without SSL support, downloading from url not supported.\n", __func__); + +struct llama_model *llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/, + struct llama_model_params /*params*/) { + fprintf(stderr, "%s: llama.cpp built without curl support, downloading from an url not supported.\n", __func__); return nullptr; } + #endif std::tuple llama_init_from_gpt_params(gpt_params & params) { From 2c3a00e270bdcdde49cda0414eb4e4b848c96454 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 15:40:29 +0100 Subject: [PATCH 07/52] Update Makefile Co-authored-by: Georgi Gerganov --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 2ef626737745b..838daf5c02acd 100644 --- a/Makefile +++ b/Makefile @@ -595,7 +595,7 @@ include scripts/get-flags.mk CUDA_CXXFLAGS := $(BASE_CXXFLAGS) $(GF_CXXFLAGS) -Wno-pedantic endif -ifdef LLAMA_USE_CURL +ifdef LLAMA_CURL override CXXFLAGS := $(CXXFLAGS) -DLLAMA_USE_CURL override LDFLAGS := $(LDFLAGS) -lcurl endif From 4135d4a50564c9913b911c4d87458b50b09e4e6f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 14:26:17 +0100 Subject: [PATCH 08/52] llama_load_model_from_url: typo --- common/common.cpp | 4 ++-- common/common.h | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index fc315e2fb4dc5..45187a7c65f91 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1390,7 +1390,7 @@ void llama_batch_add( #ifdef LLAMA_USE_CURL struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, struct llama_model_params params) { - // Initialize libcurl + // Initialize libcurl globally curl_global_init(CURL_GLOBAL_DEFAULT); auto curl = curl_easy_init(); @@ -1400,7 +1400,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha return nullptr; } - // Set the URL + // Set the URL, allow to follow http redirection and display download progress curl_easy_setopt(curl, CURLOPT_URL, model_url); curl_easy_setopt(curl, 
CURLOPT_FOLLOWLOCATION, 1L); curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); diff --git a/common/common.h b/common/common.h index b9b59211254f2..3e5dd661c95aa 100644 --- a/common/common.h +++ b/common/common.h @@ -95,7 +95,7 @@ struct gpt_params { struct llama_sampling_params sparams; std::string model = "models/7B/ggml-model-f16.gguf"; // model path - std::string model_url = ""; // model path + std::string model_url = ""; // model url to download std::string model_draft = ""; // draft model for speculative decoding std::string model_alias = "unknown"; // model alias std::string prompt = ""; From 5d99f3224f19c98c568a5bcd3023dfcb33a9f046 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 16:27:06 +0100 Subject: [PATCH 09/52] llama_load_model_from_url: download the file only if modified based on etag and last-modified http headers --- common/common.cpp | 152 +++++++++++++++++++++++++++++++++++++--------- 1 file changed, 124 insertions(+), 28 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 45187a7c65f91..8b256e7fb09f9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1388,61 +1388,157 @@ void llama_batch_add( } #ifdef LLAMA_USE_CURL + struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, - struct llama_model_params params) { + struct llama_model_params params) { // Initialize libcurl globally curl_global_init(CURL_GLOBAL_DEFAULT); - auto curl = curl_easy_init(); + CURL *curl = curl_easy_init(); if (!curl) { curl_global_cleanup(); fprintf(stderr, "%s: error initializing lib curl\n", __func__); - return nullptr; + return NULL; } - // Set the URL, allow to follow http redirection and display download progress + // Set the URL, allow to follow http redirection curl_easy_setopt(curl, CURLOPT_URL, model_url); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); + + // Check if the file already exists locally + struct stat buffer; + int file_exists = (stat(path_model, &buffer) == 0); + + // If the file exists, check for an ETag file or a lastModified file + char etag[256] = {0}; + char etag_path[256] = {0}; + strcpy(etag_path, path_model); + strcat(etag_path, ".etag"); + + char last_modified[256] = {0}; + char last_modified_path[256] = {0}; + strcpy(last_modified_path, path_model); + strcat(last_modified_path, ".lastModified"); + + if (file_exists) { + FILE *f_etag = fopen(etag_path, "r"); + if (f_etag) { + fgets(etag, sizeof(etag), f_etag); + fclose(f_etag); + fprintf(stderr, "%s: previous model .etag file found %s: %s\n", __func__, path_model, etag); + } - // Set the output file - auto outfile = fopen(path_model, "wb"); - if (!outfile) { - curl_easy_cleanup(curl); - curl_global_cleanup(); - fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); - return nullptr; + FILE *f_last_modified = fopen(last_modified_path, "r"); + if (f_last_modified) { + fgets(last_modified, sizeof(last_modified), f_last_modified); + fclose(f_etag); + fprintf(stderr, "%s: previous model .lastModified file found %s: %s\n", __func__, last_modified_path, last_modified); + } } - curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); - // start the download - fprintf(stdout, "%s: downloading model from %s to %s ...\n", __func__, model_url, path_model); - auto res = curl_easy_perform(curl); + // Send a HEAD request to retrieve the ETag and Last-Modified headers + struct llama_load_model_from_url_headers { + 
char etag[256] = {0}; + char last_modified[256] = {0}; + }; + typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); + auto header_callback = [](char *buffer, size_t /*size*/, size_t n_items, void *userdata) -> size_t { + llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers*) userdata; + + const char *etag_prefix = "etag: "; + if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { + strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix)- 2); // Remove LRLF + } + + const char *last_modified_prefix = "last-modified: "; + if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { + strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), n_items - strlen(last_modified_prefix) - 2); // Remove LRLF + } + return n_items; + }; + + curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); + llama_load_model_from_url_headers headers; + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers); + + CURLcode res = curl_easy_perform(curl); if (res != CURLE_OK) { - fclose(outfile); curl_easy_cleanup(curl); curl_global_cleanup(); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); - return nullptr; - } + return NULL; + } + + // If only the ETag or the Last-Modified header are different, trigger a new download + if (strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { + // Set the output file + FILE *outfile = fopen(path_model, "wb"); + if (!outfile) { + curl_easy_cleanup(curl); + curl_global_cleanup(); + fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); + return NULL; + } + curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); + + // display download progress + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L); + + // start the download + fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, + model_url, path_model, headers.etag, headers.last_modified); + res = curl_easy_perform(curl); + if (res != CURLE_OK) { + fclose(outfile); + curl_easy_cleanup(curl); + curl_global_cleanup(); + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + return NULL; + } - int http_code = 0; - curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code); - if (http_code < 200 || http_code >= 400) { + long http_code = 0; + curl_easy_getinfo (curl, CURLINFO_RESPONSE_CODE, &http_code); + if (http_code < 200 || http_code >= 400) { + fclose(outfile); + curl_easy_cleanup(curl); + curl_global_cleanup(); + fprintf(stderr, "%s: invalid http status code failed: %ld\n", __func__, http_code); + return NULL; + } + + // Clean up fclose(outfile); - curl_easy_cleanup(curl); - curl_global_cleanup(); - fprintf(stderr, "%s: invalid http status code failed: %d\n", __func__, http_code); - return nullptr; + + // Write the new ETag to the .etag file + if (strlen( headers.etag) > 0) { + FILE *etag_file = fopen(etag_path, "w"); + if (etag_file) { + fputs( headers.etag, etag_file); + fclose(etag_file); + fprintf(stderr, "%s: model etag saved %s:%s\n", __func__, etag_path, etag); + } + } + + // Write the new lastModified to the .etag file + if (strlen( headers.last_modified) > 0) { + FILE *last_modified_file = fopen(last_modified_path, "w"); + if (last_modified_file) { + fputs(headers.last_modified, last_modified_file); + 
fclose(last_modified_file); + fprintf(stderr, "%s: model last modified saved %s:%s\n", __func__, last_modified_path, headers.last_modified); + } + } } - // Clean up - fclose(outfile); curl_easy_cleanup(curl); curl_global_cleanup(); return llama_load_model_from_file(path_model, params); } + #else struct llama_model *llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/, From 921e4af9302919a81513e8c1ba43004f81aa3c98 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 16:29:02 +0100 Subject: [PATCH 10/52] ci: build, fix the default build to use LLAMA_CURL --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 386ab88f29c2d..0977aa8ba93d4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -39,7 +39,7 @@ jobs: id: make_build env: LLAMA_FATAL_WARNINGS: 1 - LLAMA_USE_CURL: 1 + LLAMA_CURL: 1 run: | CC=gcc-8 make -j $(nproc) From 6633689fa5cd972bfa3de3c06477996fb554f79b Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 16:49:44 +0100 Subject: [PATCH 11/52] llama_load_model_from_url: cleanup code --- common/common.cpp | 130 +++++++++++++++++++++++++++------------------- 1 file changed, 77 insertions(+), 53 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 8b256e7fb09f9..89b5ee50113e2 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -53,6 +53,19 @@ #define GGML_USE_CUBLAS_SYCL_VULKAN #endif +#ifdef LLAMA_USE_CURL +#ifdef __linux__ +#include +#elif defined(_WIN32) +#include +#define PATH_MAX MAX_PATH +#else +#include +#endif +#define LLAMA_CURL_MAX_PATH_LENGTH PATH_MAX +#define LLAMA_CURL_MAX_HEADER_LENGTH 256 +#endif // LLAMA_USE_CURL + int32_t get_num_physical_cores() { #ifdef __linux__ // enumerate the set of thread siblings, num entries is num cores @@ -1389,11 +1402,17 @@ void llama_batch_add( #ifdef LLAMA_USE_CURL -struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, - struct llama_model_params params) { +struct llama_model *llama_load_model_from_url(const char *model_url, const char *path_model, + struct llama_model_params params) { + // Basic validation of the model_url + if (!model_url || strlen(model_url) == 0) { + fprintf(stderr, "%s: invalid model_url\n", __func__); + return NULL; + } + // Initialize libcurl globally curl_global_init(CURL_GLOBAL_DEFAULT); - CURL *curl = curl_easy_init(); + auto curl = curl_easy_init(); if (!curl) { curl_global_cleanup(); @@ -1408,73 +1427,77 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha // Check if the file already exists locally struct stat buffer; - int file_exists = (stat(path_model, &buffer) == 0); + auto file_exists = (stat(path_model, &buffer) == 0); - // If the file exists, check for an ETag file or a lastModified file - char etag[256] = {0}; - char etag_path[256] = {0}; - strcpy(etag_path, path_model); - strcat(etag_path, ".etag"); + // If the file exists, check for ${model_path}.etag or ${model_path}.lastModified files + char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; + strncpy(etag_path, path_model, LLAMA_CURL_MAX_PATH_LENGTH - 6); // 6 is the length of ".etag\0" + strncat(etag_path, ".etag", 6); - char last_modified[256] = {0}; - char last_modified_path[256] = {0}; - strcpy(last_modified_path, path_model); - strcat(last_modified_path, ".lastModified"); + char 
last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; + strncpy(last_modified_path, path_model, LLAMA_CURL_MAX_PATH_LENGTH - 15); // 15 is the length of ".lastModified\0" + strncat(last_modified_path, ".lastModified", 15); if (file_exists) { - FILE *f_etag = fopen(etag_path, "r"); + auto *f_etag = fopen(etag_path, "r"); if (f_etag) { fgets(etag, sizeof(etag), f_etag); fclose(f_etag); fprintf(stderr, "%s: previous model .etag file found %s: %s\n", __func__, path_model, etag); } - FILE *f_last_modified = fopen(last_modified_path, "r"); + auto *f_last_modified = fopen(last_modified_path, "r"); if (f_last_modified) { fgets(last_modified, sizeof(last_modified), f_last_modified); - fclose(f_etag); - fprintf(stderr, "%s: previous model .lastModified file found %s: %s\n", __func__, last_modified_path, last_modified); + fclose(f_last_modified); + fprintf(stderr, "%s: previous model .lastModified file found %s: %s\n", __func__, last_modified_path, + last_modified); } } - // Send a HEAD request to retrieve the ETag and Last-Modified headers + // Send a HEAD request to retrieve the etag and last-modified headers struct llama_load_model_from_url_headers { - char etag[256] = {0}; - char last_modified[256] = {0}; + char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; + char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; }; - typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); - auto header_callback = [](char *buffer, size_t /*size*/, size_t n_items, void *userdata) -> size_t { - llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers*) userdata; + llama_load_model_from_url_headers headers; + { + typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); + auto header_callback = [](char *buffer, size_t /*size*/, size_t n_items, void *userdata) -> size_t { + llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; - const char *etag_prefix = "etag: "; - if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { - strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix)- 2); // Remove LRLF - } + const char *etag_prefix = "etag: "; + if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { + strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove LRLF + } - const char *last_modified_prefix = "last-modified: "; - if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { - strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), n_items - strlen(last_modified_prefix) - 2); // Remove LRLF - } - return n_items; - }; + const char *last_modified_prefix = "last-modified: "; + if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { + strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), + n_items - strlen(last_modified_prefix) - 2); // Remove LRLF + } + return n_items; + }; - curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); - llama_load_model_from_url_headers headers; - curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast(header_callback)); - curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers); + curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); + curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast(header_callback)); + curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers); - CURLcode res = curl_easy_perform(curl); - if (res != CURLE_OK) { - curl_easy_cleanup(curl); - curl_global_cleanup(); - fprintf(stderr, 
"%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); - return NULL; + CURLcode res = curl_easy_perform(curl); + if (res != CURLE_OK) { + curl_easy_cleanup(curl); + curl_global_cleanup(); + fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); + return NULL; + } } // If only the ETag or the Last-Modified header are different, trigger a new download if (strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { // Set the output file - FILE *outfile = fopen(path_model, "wb"); + auto *outfile = fopen(path_model, "wb"); if (!outfile) { curl_easy_cleanup(curl); curl_global_cleanup(); @@ -1490,7 +1513,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha // start the download fprintf(stderr, "%s: downloading model from %s to %s (server_etag:%s, server_last_modified:%s)...\n", __func__, model_url, path_model, headers.etag, headers.last_modified); - res = curl_easy_perform(curl); + auto res = curl_easy_perform(curl); if (res != CURLE_OK) { fclose(outfile); curl_easy_cleanup(curl); @@ -1513,22 +1536,23 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha fclose(outfile); // Write the new ETag to the .etag file - if (strlen( headers.etag) > 0) { - FILE *etag_file = fopen(etag_path, "w"); + if (strlen(headers.etag) > 0) { + auto *etag_file = fopen(etag_path, "w"); if (etag_file) { - fputs( headers.etag, etag_file); + fputs(headers.etag, etag_file); fclose(etag_file); - fprintf(stderr, "%s: model etag saved %s:%s\n", __func__, etag_path, etag); + fprintf(stderr, "%s: model etag saved %s:%s\n", __func__, etag_path, headers.etag); } } // Write the new lastModified to the .etag file - if (strlen( headers.last_modified) > 0) { - FILE *last_modified_file = fopen(last_modified_path, "w"); + if (strlen(headers.last_modified) > 0) { + auto *last_modified_file = fopen(last_modified_path, "w"); if (last_modified_file) { fputs(headers.last_modified, last_modified_file); fclose(last_modified_file); - fprintf(stderr, "%s: model last modified saved %s:%s\n", __func__, last_modified_path, headers.last_modified); + fprintf(stderr, "%s: model last modified saved %s:%s\n", __func__, last_modified_path, + headers.last_modified); } } } @@ -1547,7 +1571,7 @@ struct llama_model *llama_load_model_from_url(const char * /*model_url*/, const return nullptr; } -#endif +#endif // LLAMA_USE_CURL std::tuple llama_init_from_gpt_params(gpt_params & params) { auto mparams = llama_model_params_from_gpt_params(params); From e84206d13203111e642e3bdc94ca34921078c176 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:46:18 +0100 Subject: [PATCH 12/52] Update examples/server/README.md Co-authored-by: Georgi Gerganov --- examples/server/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/server/README.md b/examples/server/README.md index df1ccce9bebe0..755e1d5384f55 100644 --- a/examples/server/README.md +++ b/examples/server/README.md @@ -20,7 +20,7 @@ The project is under active development, and we are [looking for feedback and co - `-tb N, --threads-batch N`: Set the number of threads to use during batch and prompt processing. If not specified, the number of threads will be set to the number of threads used for generation. 
- `--threads-http N`: number of threads in the http server pool to process requests (default: `max(std::thread::hardware_concurrency() - 1, --parallel N + 2)`) - `-m FNAME`, `--model FNAME`: Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.gguf`). -- `-mu MODEL_URL --model MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-a ALIAS`, `--alias ALIAS`: Set an alias for the model. The alias will be returned in API responses. - `-c N`, `--ctx-size N`: Set the size of the prompt context. The default is 512, but LLaMA models were built with a context of 2048, which will provide better results for longer input/inference. The size may differ in other models, for example, baichuan models were build with a context of 4096. - `-ngl N`, `--n-gpu-layers N`: When compiled with appropriate support (currently CLBlast or cuBLAS), this option allows offloading some layers to the GPU for computation. Generally results in increased performance. From 4bc47b75caef8fc17e150621b3c3617fd79acc7e Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:46:34 +0100 Subject: [PATCH 13/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index f35296274270b..52b120dc2f725 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1803,7 +1803,7 @@ struct llama_model *llama_load_model_from_url(const char *model_url, const char #else -struct llama_model *llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/, +struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/, struct llama_model_params /*params*/) { fprintf(stderr, "%s: llama.cpp built without curl support, downloading from an url not supported.\n", __func__); return nullptr; From 8751bd0c82d5768e73af17801e8e382c15ff47d8 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:46:46 +0100 Subject: [PATCH 14/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 52b120dc2f725..3c5fa79eb52ea 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1785,7 +1785,7 @@ struct llama_model *llama_load_model_from_url(const char *model_url, const char // Write the new lastModified to the .etag file if (strlen(headers.last_modified) > 0) { - auto *last_modified_file = fopen(last_modified_path, "w"); + auto * last_modified_file = fopen(last_modified_path, "w"); if (last_modified_file) { fputs(headers.last_modified, last_modified_file); fclose(last_modified_file); From f53bfd56afad0991ed140a7ccbbe5c2060bb06fc Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:46:53 +0100 Subject: [PATCH 15/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 3c5fa79eb52ea..d1654c59a1a07 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1735,7 +1735,7 @@ struct llama_model *llama_load_model_from_url(const char *model_url, const char // If only the ETag or the 
Last-Modified header are different, trigger a new download if (strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { // Set the output file - auto *outfile = fopen(path_model, "wb"); + auto * outfile = fopen(path_model, "wb"); if (!outfile) { curl_easy_cleanup(curl); curl_global_cleanup(); From b088122719a9409642134bb1d5fe0c865d7099b0 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:47:04 +0100 Subject: [PATCH 16/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index d1654c59a1a07..2720f93101ee3 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1775,7 +1775,7 @@ struct llama_model *llama_load_model_from_url(const char *model_url, const char // Write the new ETag to the .etag file if (strlen(headers.etag) > 0) { - auto *etag_file = fopen(etag_path, "w"); + auto * etag_file = fopen(etag_path, "w"); if (etag_file) { fputs(headers.etag, etag_file); fclose(etag_file); From f22456d8c33f50da15f7c98c74189dfba1855a0b Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:48:02 +0100 Subject: [PATCH 17/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 2720f93101ee3..b93ad05e3892d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1711,7 +1711,7 @@ struct llama_model *llama_load_model_from_url(const char *model_url, const char strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove LRLF } - const char *last_modified_prefix = "last-modified: "; + const char * last_modified_prefix = "last-modified: "; if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), n_items - strlen(last_modified_prefix) - 2); // Remove LRLF From 9565ae31878b17e902cc83a02a903231485f58ac Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:48:10 +0100 Subject: [PATCH 18/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index b93ad05e3892d..2b968b82b758a 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1706,7 +1706,7 @@ struct llama_model *llama_load_model_from_url(const char *model_url, const char auto header_callback = [](char *buffer, size_t /*size*/, size_t n_items, void *userdata) -> size_t { llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; - const char *etag_prefix = "etag: "; + const char * etag_prefix = "etag: "; if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove LRLF } From 330e28df084aeafdc77911eb3b2a3e3901a3eda6 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:48:20 +0100 Subject: [PATCH 19/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 2b968b82b758a..c49f0920f6a3b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1703,7 +1703,7 @@ struct llama_model *llama_load_model_from_url(const char *model_url, const char 
llama_load_model_from_url_headers headers; { typedef size_t(*CURLOPT_HEADERFUNCTION_PTR)(char *, size_t, size_t, void *); - auto header_callback = [](char *buffer, size_t /*size*/, size_t n_items, void *userdata) -> size_t { + auto header_callback = [](char * buffer, size_t /*size*/, size_t n_items, void * userdata) -> size_t { llama_load_model_from_url_headers *headers = (llama_load_model_from_url_headers *) userdata; const char * etag_prefix = "etag: "; From 89ab37a261cd50b00647c22a389ca938e14a1db9 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:48:27 +0100 Subject: [PATCH 20/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index c49f0920f6a3b..7630e0fbd160f 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1686,7 +1686,7 @@ struct llama_model *llama_load_model_from_url(const char *model_url, const char fprintf(stderr, "%s: previous model .etag file found %s: %s\n", __func__, path_model, etag); } - auto *f_last_modified = fopen(last_modified_path, "r"); + auto * f_last_modified = fopen(last_modified_path, "r"); if (f_last_modified) { fgets(last_modified, sizeof(last_modified), f_last_modified); fclose(f_last_modified); From be561a7ffd3d4ee86be3d782b45c7a6491d64530 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:48:32 +0100 Subject: [PATCH 21/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 7630e0fbd160f..f07ab444849cc 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1679,7 +1679,7 @@ struct llama_model *llama_load_model_from_url(const char *model_url, const char strncat(last_modified_path, ".lastModified", 15); if (file_exists) { - auto *f_etag = fopen(etag_path, "r"); + auto * f_etag = fopen(etag_path, "r"); if (f_etag) { fgets(etag, sizeof(etag), f_etag); fclose(f_etag); From eb9e52a21832e2c062548c1ece6b849ce725d504 Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:48:38 +0100 Subject: [PATCH 22/52] Update common/common.cpp Co-authored-by: Georgi Gerganov --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index f07ab444849cc..1bcf76ff6fe88 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1640,7 +1640,7 @@ void llama_batch_add( #ifdef LLAMA_USE_CURL -struct llama_model *llama_load_model_from_url(const char *model_url, const char *path_model, +struct llama_model * llama_load_model_from_url(const char * model_url, const char * path_model, struct llama_model_params params) { // Basic validation of the model_url if (!model_url || strlen(model_url) == 0) { From b0b49e0bb8de8cc272c10b2503486c4ccd4edb4f Mon Sep 17 00:00:00 2001 From: Pierrick Hymbert Date: Sat, 16 Mar 2024 17:48:48 +0100 Subject: [PATCH 23/52] Update examples/main/README.md Co-authored-by: Georgi Gerganov --- examples/main/README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/main/README.md b/examples/main/README.md index daaa807d55952..6a8d1e1c50cbb 100644 --- a/examples/main/README.md +++ b/examples/main/README.md @@ -67,7 +67,7 @@ main.exe -m models\7B\ggml-model.bin --ignore-eos -n -1 --random-prompt In this section, we cover the most commonly used options for running the `main` program with the LLaMA models: - `-m FNAME, --model FNAME`: 
Specify the path to the LLaMA model file (e.g., `models/7B/ggml-model.bin`). -- `-mu MODEL_URL --model MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). +- `-mu MODEL_URL --model-url MODEL_URL`: Specify a remote http url to download the file (e.g https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf). - `-i, --interactive`: Run the program in interactive mode, allowing you to provide input directly and receive real-time responses. - `-ins, --instruct`: Run the program in instruction mode, which is particularly useful when working with Alpaca models. - `-n N, --n-predict N`: Set the number of tokens to predict when generating text. Adjusting this value can influence the length of the generated text. From 545fef6e0ef24ea9663ae44b08c6a7096e090baa Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 18:01:55 +0100 Subject: [PATCH 24/52] llama_load_model_from_url: fix compilation warning, clearer logging --- common/common.cpp | 16 +++++++++++----- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 1bcf76ff6fe88..269a3afd88ecf 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1681,17 +1681,23 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha if (file_exists) { auto * f_etag = fopen(etag_path, "r"); if (f_etag) { - fgets(etag, sizeof(etag), f_etag); + if (!fgets(etag, sizeof(etag), f_etag)) { + fprintf(stderr, "%s: unable to read file %s\n", __func__, etag_path); + } else { + fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, etag_path, etag); + } fclose(f_etag); - fprintf(stderr, "%s: previous model .etag file found %s: %s\n", __func__, path_model, etag); } auto * f_last_modified = fopen(last_modified_path, "r"); if (f_last_modified) { - fgets(last_modified, sizeof(last_modified), f_last_modified); + if (!fgets(last_modified, sizeof(last_modified), f_last_modified)) { + fprintf(stderr, "%s: unable to read file %s\n", __func__, last_modified_path); + } else { + fprintf(stderr, "%s: previous model file found %s: %s\n", __func__, last_modified_path, + last_modified); + } fclose(f_last_modified); - fprintf(stderr, "%s: previous model .lastModified file found %s: %s\n", __func__, last_modified_path, - last_modified); } } From 4fadb072e93ed724c93353eeddd6e207eb245991 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 18:15:20 +0100 Subject: [PATCH 25/52] server: tests: add `--model-url` tests --- examples/server/tests/README.md | 2 +- .../server/tests/features/embeddings.feature | 3 ++- examples/server/tests/features/steps/steps.py | 17 ++++++++++++++++- 3 files changed, 19 insertions(+), 3 deletions(-) diff --git a/examples/server/tests/README.md b/examples/server/tests/README.md index 95a0353b6a9c5..feb2b1d6cf5de 100644 --- a/examples/server/tests/README.md +++ b/examples/server/tests/README.md @@ -57,7 +57,7 @@ Feature or Scenario must be annotated with `@llama.cpp` to be included in the de To run a scenario annotated with `@bug`, start: ```shell -DEBUG=ON ./tests.sh --no-skipped --tags bug +DEBUG=ON ./tests.sh --no-skipped --tags bug --stop ``` After changing logic in `steps.py`, ensure that `@bug` and `@wrong_usage` scenario are updated. 
diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index 57359b267a668..fb821f802596d 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -4,7 +4,8 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model file bert-bge-small/ggml-model-f16.gguf from HF repo ggml-org/models + And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf + And a model file /tmp/ggml-model-f16.gguf And a model alias bert-bge-small And 42 as server seed And 2 slots diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index a59a52d21748a..19d064dfd0304 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -32,6 +32,8 @@ def step_server_config(context, server_fqdn, server_port): context.base_url = f'http://{context.server_fqdn}:{context.server_port}' context.model_alias = None + context.model_file = None + context.model_url = None context.n_batch = None context.n_ubatch = None context.n_ctx = None @@ -65,6 +67,16 @@ def step_download_hf_model(context, hf_file, hf_repo): print(f"model file: {context.model_file}\n") +@step('a model file {model_file}') +def step_model_file(context, model_file): + context.model_file = model_file + + +@step('a model url {model_url}') +def step_model_url(context, model_url): + context.model_url = model_url + + @step('a model alias {model_alias}') def step_model_alias(context, model_alias): context.model_alias = model_alias @@ -1038,8 +1050,11 @@ def start_server_background(context): server_args = [ '--host', server_listen_addr, '--port', context.server_port, - '--model', context.model_file ] + if context.model_file: + server_args.extend(['--model', context.model_file]) + if context.model_url: + server_args.extend(['--model-url', context.model_url]) if context.n_batch: server_args.extend(['--batch-size', context.n_batch]) if context.n_ubatch: From 124c474bba8d339a3c3e9a555c6c2d46d9ff8b25 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 18:24:21 +0100 Subject: [PATCH 26/52] llama_load_model_from_url: coherent clearer logging --- common/common.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 269a3afd88ecf..5775840dd27d5 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1785,7 +1785,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha if (etag_file) { fputs(headers.etag, etag_file); fclose(etag_file); - fprintf(stderr, "%s: model etag saved %s:%s\n", __func__, etag_path, headers.etag); + fprintf(stderr, "%s: model etag saved %s: %s\n", __func__, etag_path, headers.etag); } } @@ -1795,7 +1795,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha if (last_modified_file) { fputs(headers.last_modified, last_modified_file); fclose(last_modified_file); - fprintf(stderr, "%s: model last modified saved %s:%s\n", __func__, last_modified_path, + fprintf(stderr, "%s: model last modified saved %s: %s\n", __func__, last_modified_path, headers.last_modified); } } From 064dc076bb7751785bc7f52039091ba23ea1dfdc Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 18:34:36 +0100 Subject: [PATCH 27/52] common: CMakeLists.txt fix typo in logging when lib curl is not found --- 
common/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 9e85c2337f815..0331788fd88ca 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -54,7 +54,7 @@ if (CURL_FOUND) include_directories(${CURL_INCLUDE_DIRS}) link_libraries(${CURL_LIBRARIES}) else() - message(INFO "libcurl not found. Building without model download support.") + message(INFO " libcurl not found. Building without model download support.") endif () From 838178a1969ed02e8378003a742182d1add218e6 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 18:34:53 +0100 Subject: [PATCH 28/52] ci: tests: windows tests add libcurl --- .github/workflows/server.yml | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 51340662a277f..519e0313fbd8f 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -102,6 +102,25 @@ jobs: with: fetch-depth: 0 + - name: Download libCURL + id: get_libcurl + env: + CURL_VERSION: 8.6.0 + run: | + curl.exe -o $env:RUNNER_TEMP/libcurl.tar.gz -L "https://github.com/curl/curl/releases/download/v${env:CURL_VERSION}/curl-v${env:CURL_VERSION}.tar.gz" + mkdir $env:RUNNER_TEMP/libcurl + tar.exe -xvf $env:RUNNER_TEMP/libcurl.tar.gz --strip-components=1 -C $env:RUNNER_TEMP/libcurl + + - name: Install libcurl + id: install_libcurl + run: | + cd $env:RUNNER_TEMP/libcurl + mkdir build + cd build + cmake .. -DCMAKE_BUILD_TYPE=Release ; + cmake --build . --config Release + make install + - name: Build id: cmake_build run: | From 176f039a91788b3b1e20527573eaf3fb866de73a Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 19:51:44 +0100 Subject: [PATCH 29/52] ci: tests: windows tests add libcurl --- .github/workflows/server.yml | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 519e0313fbd8f..5530e9bd80a23 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -105,28 +105,28 @@ jobs: - name: Download libCURL id: get_libcurl env: + CURL_TAG: 8_6_0 CURL_VERSION: 8.6.0 run: | - curl.exe -o $env:RUNNER_TEMP/libcurl.tar.gz -L "https://github.com/curl/curl/releases/download/v${env:CURL_VERSION}/curl-v${env:CURL_VERSION}.tar.gz" + curl.exe -o $env:RUNNER_TEMP/libcurl.tar.gz -L "https://github.com/curl/curl/releases/download/curl-${env:CURL_TAG}/curl-${env:CURL_VERSION}.tar.gz" mkdir $env:RUNNER_TEMP/libcurl tar.exe -xvf $env:RUNNER_TEMP/libcurl.tar.gz --strip-components=1 -C $env:RUNNER_TEMP/libcurl - - name: Install libcurl - id: install_libcurl + - name: Build libcurl + id: build_libcurl run: | cd $env:RUNNER_TEMP/libcurl mkdir build cd build - cmake .. -DCMAKE_BUILD_TYPE=Release ; + cmake .. -DCMAKE_BUILD_TYPE=Release cmake --build . --config Release - make install - name: Build id: cmake_build run: | mkdir build cd build - cmake .. -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release ; + cmake .. -DCURL_LIBRARY="${env:RUNNER_TEMP}/libcurl/lib/Release/libcurl_imp.lib" -DCURL_INCLUDE_DIR="${env:RUNNER_TEMP}/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release cmake --build . 
--config Release -j ${env:NUMBER_OF_PROCESSORS} --target server - name: Python setup From 5df5605b0267360b0d6f9bc219496eb34a2a79df Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 19:52:11 +0100 Subject: [PATCH 30/52] ci: build: add libcurl in default make toolchain step --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0977aa8ba93d4..375625beb36f6 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,7 +33,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential gcc-8 + sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev - name: Build id: make_build From 78812c6d638c3dba75a521d39bfeca5c30af6310 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 20:02:34 +0100 Subject: [PATCH 31/52] llama_load_model_from_url: PR feedback, use snprintf instead of strncp and strncat --- common/common.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 5775840dd27d5..90902542a6971 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1667,16 +1667,14 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha struct stat buffer; auto file_exists = (stat(path_model, &buffer) == 0); - // If the file exists, check for ${model_path}.etag or ${model_path}.lastModified files + // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; char etag_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - strncpy(etag_path, path_model, LLAMA_CURL_MAX_PATH_LENGTH - 6); // 6 is the length of ".etag\0" - strncat(etag_path, ".etag", 6); + snprintf(etag_path, sizeof(etag_path), "%s.etag", path_model); char last_modified[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; char last_modified_path[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; - strncpy(last_modified_path, path_model, LLAMA_CURL_MAX_PATH_LENGTH - 15); // 15 is the length of ".lastModified\0" - strncat(last_modified_path, ".lastModified", 15); + snprintf(last_modified_path, sizeof(last_modified_path), "%s.lastModified", path_model); if (file_exists) { auto * f_etag = fopen(etag_path, "r"); From 1ad5a45210d573d65e2243882f4d84e0b9d17c49 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 20:06:18 +0100 Subject: [PATCH 32/52] ci: build: add libcurl in default make toolchain step for tests --- .github/workflows/build.yml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 375625beb36f6..b36fad09da0a4 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -45,6 +45,8 @@ jobs: - name: Test id: make_test + env: + LLAMA_CURL: 1 run: | CC=gcc-8 make tests -j $(nproc) make test -j $(nproc) From 22b3bb3ceb4bd94a8028d4b4cdb3c4c3f790a71f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 21:50:37 +0100 Subject: [PATCH 33/52] common: fix windows build caused by double windows.h import --- common/common.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 90902542a6971..3cbd4dbae6f22 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -57,7 +57,6 @@ #ifdef __linux__ #include #elif defined(_WIN32) -#include #define PATH_MAX MAX_PATH #else #include From e6848ab0e699533579e60bbeb23c900e2e625a8c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 
Mar 2024 21:53:07 +0100 Subject: [PATCH 34/52] build: move the make build with env LLAMA_CURL to a dedicated place --- .github/workflows/build.yml | 25 ++++++++++++++++++++++--- 1 file changed, 22 insertions(+), 3 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index b36fad09da0a4..8130197461c8c 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -39,18 +39,37 @@ jobs: id: make_build env: LLAMA_FATAL_WARNINGS: 1 - LLAMA_CURL: 1 run: | CC=gcc-8 make -j $(nproc) - name: Test id: make_test - env: - LLAMA_CURL: 1 run: | CC=gcc-8 make tests -j $(nproc) make test -j $(nproc) + ubuntu-focal-make-curl: + runs-on: ubuntu-latest + + steps: + - name: Clone + id: checkout + uses: actions/checkout@v3 + + - name: Dependencies + id: depends + run: | + sudo apt-get update + sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev + + - name: Build + id: make_build + env: + LLAMA_FATAL_WARNINGS: 1 + LLAMA_CURL: 1 + run: | + CC=gcc-8 make -j $(nproc) + ubuntu-latest-cmake: runs-on: ubuntu-latest From d81acb68476d7fa05e443d63511d0da91ec39fb9 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 21:59:53 +0100 Subject: [PATCH 35/52] build: introduce cmake option LLAMA_CURL to trigger libcurl linking to be coherent with the make toolchain --- .github/workflows/server.yml | 3 ++- CMakeLists.txt | 1 + common/CMakeLists.txt | 7 +++---- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 5530e9bd80a23..8abe6f49619ef 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -68,6 +68,7 @@ jobs: cmake .. \ -DLLAMA_NATIVE=OFF \ -DLLAMA_BUILD_SERVER=ON \ + -DLLAMA_CURL=ON \ -DCMAKE_BUILD_TYPE=${{ matrix.build_type }} \ -DLLAMA_SANITIZE_${{ matrix.sanitizer }}=ON ; cmake --build . --config ${{ matrix.build_type }} -j $(nproc) --target server @@ -126,7 +127,7 @@ jobs: run: | mkdir build cd build - cmake .. -DCURL_LIBRARY="${env:RUNNER_TEMP}/libcurl/lib/Release/libcurl_imp.lib" -DCURL_INCLUDE_DIR="${env:RUNNER_TEMP}/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release + cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="${env:RUNNER_TEMP}/libcurl/lib/Release/libcurl_imp.lib" -DCURL_INCLUDE_DIR="${env:RUNNER_TEMP}/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server - name: Python setup diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ac2804a6881a..fc4cff28f44ac 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -99,6 +99,7 @@ option(LLAMA_CUDA_F16 "llama: use 16 bit floats for some set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for Q2_K/Q6_K") set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING "llama: max. 
batch size for using peer access") +option(LLAMA_CURL "llama: use libcurl to download model from an URL" OFF) option(LLAMA_HIPBLAS "llama: use hipBLAS" OFF) option(LLAMA_HIP_UMA "llama: use HIP unified memory architecture" OFF) option(LLAMA_CLBLAST "llama: use CLBlast" OFF) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 0331788fd88ca..c8a21a9c2b6e7 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -47,14 +47,13 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() + # Check for curl -find_package(CURL QUIET) -if (CURL_FOUND) +if (LLAMA_CURL) + find_package(CURL) add_definitions(-DLLAMA_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) link_libraries(${CURL_LIBRARIES}) -else() - message(INFO " libcurl not found. Building without model download support.") endif () From dbd969142e22997e801032b45f8ce145031742aa Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 22:01:19 +0100 Subject: [PATCH 36/52] build: move the make build with env LLAMA_CURL to a dedicated place --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 8130197461c8c..ded19606284d0 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -33,7 +33,7 @@ jobs: id: depends run: | sudo apt-get update - sudo apt-get install build-essential gcc-8 libcurl4-openssl-dev + sudo apt-get install build-essential gcc-8 - name: Build id: make_build From 9da4eec082fd3d7339485e453cde96763566fc84 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 22:13:46 +0100 Subject: [PATCH 37/52] llama_load_model_from_url: minor spacing and log message changes --- common/common.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 3cbd4dbae6f22..007424fd91a4b 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1391,7 +1391,7 @@ void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) { printf(" -m FNAME, --model FNAME\n"); printf(" model path (default: %s)\n", params.model.c_str()); printf(" -mu MODEL_URL, --model-url MODEL_URL\n"); - printf(" model download url (default: %s)\n", params.model_url.c_str()); + printf(" model download url (default: %s)\n", params.model_url.c_str()); printf(" -md FNAME, --model-draft FNAME\n"); printf(" draft model for speculative decoding\n"); printf(" -ld LOGDIR, --logdir LOGDIR\n"); @@ -1653,18 +1653,17 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha if (!curl) { curl_global_cleanup(); - fprintf(stderr, "%s: error initializing lib curl\n", __func__); + fprintf(stderr, "%s: error initializing libcurl\n", __func__); return NULL; } // Set the URL, allow to follow http redirection curl_easy_setopt(curl, CURLOPT_URL, model_url); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); - curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // Check if the file already exists locally - struct stat buffer; - auto file_exists = (stat(path_model, &buffer) == 0); + struct stat model_file_info; + auto file_exists = (stat(path_model, &model_file_info) == 0); // If the file exists, check for ${path_model}.etag or ${path_model}.lastModified files char etag[LLAMA_CURL_MAX_HEADER_LENGTH] = {0}; @@ -1722,7 +1721,8 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha return n_items; }; - curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); + 
curl_easy_setopt(curl, CURLOPT_NOBODY, 1L); // will trigger the HEAD verb + curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L); // hide head request progress curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, static_cast<CURLOPT_HEADERFUNCTION_PTR>(header_callback)); curl_easy_setopt(curl, CURLOPT_HEADERDATA, &headers);
@@ -1735,7 +1735,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
         }
     }
-    // If only the ETag or the Last-Modified header are different, trigger a new download
+    // If the ETag or the Last-Modified headers are different: trigger a new download
     if (strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) {
         // Set the output file
         auto * outfile = fopen(path_model, "wb");
@@ -1769,7 +1769,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
             fclose(outfile);
             curl_easy_cleanup(curl);
             curl_global_cleanup();
-            fprintf(stderr, "%s: invalid http status code failed: %ld\n", __func__, http_code);
+            fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code);
             return NULL;
         }
@@ -1808,7 +1808,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha
 struct llama_model * llama_load_model_from_url(const char * /*model_url*/, const char * /*path_model*/,
                                                struct llama_model_params /*params*/) {
-    fprintf(stderr, "%s: llama.cpp built without curl support, downloading from an url not supported.\n", __func__);
+    fprintf(stderr, "%s: llama.cpp built without libcurl, downloading from an url not supported.\n", __func__);
     return nullptr;
 }

From 89d3483860ea10ab7c49cbc910aa9a455969c279 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT 
Date: Sat, 16 Mar 2024 22:27:02 +0100
Subject: [PATCH 38/52] ci: build: fix ubuntu-focal-make-curl

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index ded19606284d0..945df42f886a6 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -49,7 +49,7 @@ jobs:
       make test -j $(nproc)

   ubuntu-focal-make-curl:
-    runs-on: ubuntu-latest
+    runs-on: ubuntu-20.04

     steps:
       - name: Clone

From 13d8817ce260dceeed4a776aec89c9c19cba31b8 Mon Sep 17 00:00:00 2001
From: Pierrick HYMBERT 
Date: Sat, 16 Mar 2024 22:34:01 +0100
Subject: [PATCH 39/52] ci: build: try to fix the windows build

---
 .github/workflows/server.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml
index 8abe6f49619ef..d0458629ad957 100644
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -127,7 +127,7 @@ jobs:
     run: |
       mkdir build
       cd build
-      cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="${env:RUNNER_TEMP}/libcurl/lib/Release/libcurl_imp.lib" -DCURL_INCLUDE_DIR="${env:RUNNER_TEMP}/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DCMAKE_BUILD_TYPE=Release
+      cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="${env:RUNNER_TEMP}/libcurl/lib/Release/libcurl_imp.lib" -DCURL_INCLUDE_DIR="${env:RUNNER_TEMP}/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DLLAMA_NATIVE=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release
       cmake --build . 
--config Release -j ${env:NUMBER_OF_PROCESSORS} --target server - name: Python setup From 1ddaf7109acd3eef882929e755e48f30e1aaec8c Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sat, 16 Mar 2024 22:43:05 +0100 Subject: [PATCH 40/52] common: remove old dependency to openssl --- common/common.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/common/common.h b/common/common.h index 3e5dd661c95aa..8dd8a3edc9c94 100644 --- a/common/common.h +++ b/common/common.h @@ -17,12 +17,6 @@ #include #include -#ifdef HAVE_OPENSSL -#include -#include -#include -#endif - #ifdef _WIN32 #define DIRECTORY_SEPARATOR '\\' #else From 73b4b44785d803e3c74e97bdea59d230a94f3ed1 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 00:43:35 +0100 Subject: [PATCH 41/52] common: fix build --- .github/workflows/server.yml | 20 +++++--------------- common/CMakeLists.txt | 17 ++++++++--------- common/common.cpp | 4 ++-- 3 files changed, 15 insertions(+), 26 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index d0458629ad957..bb321aa1c1ece 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -103,31 +103,21 @@ jobs: with: fetch-depth: 0 - - name: Download libCURL + - name: libCURL id: get_libcurl env: - CURL_TAG: 8_6_0 - CURL_VERSION: 8.6.0 + CURL_VERSION: 8.6.0_6 run: | - curl.exe -o $env:RUNNER_TEMP/libcurl.tar.gz -L "https://github.com/curl/curl/releases/download/curl-${env:CURL_TAG}/curl-${env:CURL_VERSION}.tar.gz" + curl.exe -o $env:RUNNER_TEMP/curl.zip -L "https://curl.se/windows/dl-${env:CURL_VERSION}/curl-${env:CURL_VERSION}-win64-mingw.zip" mkdir $env:RUNNER_TEMP/libcurl - tar.exe -xvf $env:RUNNER_TEMP/libcurl.tar.gz --strip-components=1 -C $env:RUNNER_TEMP/libcurl - - - name: Build libcurl - id: build_libcurl - run: | - cd $env:RUNNER_TEMP/libcurl - mkdir build - cd build - cmake .. -DCMAKE_BUILD_TYPE=Release - cmake --build . --config Release + tar.exe -xvf $env:RUNNER_TEMP/curl.zip --strip-components=1 -C $env:RUNNER_TEMP/libcurl - name: Build id: cmake_build run: | mkdir build cd build - cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="${env:RUNNER_TEMP}/libcurl/lib/Release/libcurl_imp.lib" -DCURL_INCLUDE_DIR="${env:RUNNER_TEMP}/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DLLAMA_NATIVE=OFF -DBUILD_SHARED_LIBS=ON -DCMAKE_BUILD_TYPE=Release + cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DLLAMA_NATIVE=OFF -DBUILD_SHARED_LIBS=ON cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server - name: Python setup diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index c8a21a9c2b6e7..3beda6d25caec 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -48,15 +48,6 @@ if (BUILD_SHARED_LIBS) endif() -# Check for curl -if (LLAMA_CURL) - find_package(CURL) - add_definitions(-DLLAMA_USE_CURL) - include_directories(${CURL_INCLUDE_DIRS}) - link_libraries(${CURL_LIBRARIES}) -endif () - - set(TARGET common) add_library(${TARGET} STATIC @@ -80,3 +71,11 @@ endif() target_include_directories(${TARGET} PUBLIC .) 
target_compile_features(${TARGET} PUBLIC cxx_std_11) target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama) + +# Use curl to download model url +if (LLAMA_CURL) + find_package(CURL) + add_definitions(-DLLAMA_USE_CURL) + target_include_directories(${TARGET} ${CURL_INCLUDE_DIRS}) + target_link_libraries(${TARGET} PRIVATE curl) +endif () diff --git a/common/common.cpp b/common/common.cpp index 007424fd91a4b..77b8f1d7c594d 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1710,13 +1710,13 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha const char * etag_prefix = "etag: "; if (strncmp(buffer, etag_prefix, strlen(etag_prefix)) == 0) { - strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove LRLF + strncpy(headers->etag, buffer + strlen(etag_prefix), n_items - strlen(etag_prefix) - 2); // Remove CRLF } const char * last_modified_prefix = "last-modified: "; if (strncmp(buffer, last_modified_prefix, strlen(last_modified_prefix)) == 0) { strncpy(headers->last_modified, buffer + strlen(last_modified_prefix), - n_items - strlen(last_modified_prefix) - 2); // Remove LRLF + n_items - strlen(last_modified_prefix) - 2); // Remove CRLF } return n_items; }; From a3ed3d48d30af5c096a0b301d0bef384a0b2c22d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 01:17:58 +0100 Subject: [PATCH 42/52] common: fix windows build --- common/CMakeLists.txt | 10 +++++----- common/common.cpp | 8 ++++---- 2 files changed, 9 insertions(+), 9 deletions(-) diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index 3beda6d25caec..cb4e538698337 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -68,14 +68,14 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() -target_include_directories(${TARGET} PUBLIC .) -target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama) - # Use curl to download model url if (LLAMA_CURL) find_package(CURL) add_definitions(-DLLAMA_USE_CURL) - target_include_directories(${TARGET} ${CURL_INCLUDE_DIRS}) + include_directories(${CURL_INCLUDE_DIRS}) target_link_libraries(${TARGET} PRIVATE curl) endif () + +target_include_directories(${TARGET} PUBLIC .) 
+target_compile_features(${TARGET} PUBLIC cxx_std_11) +target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama) diff --git a/common/common.cpp b/common/common.cpp index 77b8f1d7c594d..fd4ee9f1efa2e 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -16,9 +16,6 @@ #include #include #include -#ifdef LLAMA_USE_CURL -#include -#endif #if defined(__APPLE__) && defined(__MACH__) #include @@ -40,6 +37,9 @@ #include #include #endif +#if defined(LLAMA_USE_CURL) +#include +#endif #if defined(_MSC_VER) #pragma warning(disable: 4244 4267) // possible loss of data @@ -53,7 +53,7 @@ #define GGML_USE_CUBLAS_SYCL_VULKAN #endif -#ifdef LLAMA_USE_CURL +#if defined(LLAMA_USE_CURL) #ifdef __linux__ #include #elif defined(_WIN32) From 5e66ec80b33451ee9949e308f5ecf8637613af90 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 02:07:06 +0100 Subject: [PATCH 43/52] common: fix windows tests --- .github/workflows/server.yml | 4 +++- common/CMakeLists.txt | 9 ++++++--- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index bb321aa1c1ece..92268fe9dc4b0 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -117,7 +117,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DLLAMA_NATIVE=OFF -DBUILD_SHARED_LIBS=ON + cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DLLAMA_NATIVE=OFF -DBUILD_SHARED_LIBS=ON cmake --build . --config Release -j ${env:NUMBER_OF_PROCESSORS} --target server - name: Python setup @@ -136,6 +136,7 @@ jobs: if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | cd examples/server/tests + $env:PATH += ";$env:RUNNER_TEMP/libcurl/bin" behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests @@ -143,4 +144,5 @@ jobs: if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests + $env:PATH += ";$env:RUNNER_TEMP/libcurl/bin" behave.exe --stop --no-skipped --no-capture --tags slow diff --git a/common/CMakeLists.txt b/common/CMakeLists.txt index cb4e538698337..af2629a460b93 100644 --- a/common/CMakeLists.txt +++ b/common/CMakeLists.txt @@ -68,14 +68,17 @@ if (BUILD_SHARED_LIBS) set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON) endif() +set(LLAMA_COMMON_EXTRA_LIBS build_info) + # Use curl to download model url if (LLAMA_CURL) - find_package(CURL) + find_package(CURL REQUIRED) add_definitions(-DLLAMA_USE_CURL) include_directories(${CURL_INCLUDE_DIRS}) - target_link_libraries(${TARGET} PRIVATE curl) + find_library(CURL_LIBRARY curl REQUIRED) + set(LLAMA_COMMON_EXTRA_LIBS ${LLAMA_COMMON_EXTRA_LIBS} ${CURL_LIBRARY}) endif () target_include_directories(${TARGET} PUBLIC .) 
target_compile_features(${TARGET} PUBLIC cxx_std_11) -target_link_libraries(${TARGET} PRIVATE build_info PUBLIC llama) +target_link_libraries(${TARGET} PRIVATE ${LLAMA_COMMON_EXTRA_LIBS} PUBLIC llama) From 9ca4acc5fb4b77ce3369c52b9e5fa5c7bb52da1b Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 02:30:20 +0100 Subject: [PATCH 44/52] common: fix windows tests --- .github/workflows/server.yml | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 92268fe9dc4b0..e27daf0c3b1aa 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -135,8 +135,8 @@ jobs: id: server_integration_tests if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | + cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl.dll cd examples/server/tests - $env:PATH += ";$env:RUNNER_TEMP/libcurl/bin" behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp - name: Slow tests @@ -144,5 +144,4 @@ jobs: if: ${{ (github.event.schedule || github.event.inputs.slow_tests == 'true') && matrix.build_type == 'Release' }} run: | cd examples/server/tests - $env:PATH += ";$env:RUNNER_TEMP/libcurl/bin" behave.exe --stop --no-skipped --no-capture --tags slow From c1b002e06772fe39136651b192cd5cea1c2cc553 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 09:35:19 +0100 Subject: [PATCH 45/52] common: llama_load_model_from_url windows set CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA --- .github/workflows/server.yml | 6 +++++- common/common.cpp | 5 +++++ examples/server/tests/features/embeddings.feature | 2 +- examples/server/tests/features/environment.py | 10 ++++++++++ examples/server/tests/features/steps/steps.py | 2 ++ 5 files changed, 23 insertions(+), 2 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index e27daf0c3b1aa..79807f8971d41 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -131,11 +131,15 @@ jobs: run: | pip install -r examples/server/tests/requirements.txt + - name: Copy Libcurl + id: prepare_libcurl + run: | + cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl-x64.dll + - name: Tests id: server_integration_tests if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }} run: | - cp $env:RUNNER_TEMP/libcurl/bin/libcurl-x64.dll ./build/bin/Release/libcurl.dll cd examples/server/tests behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp diff --git a/common/common.cpp b/common/common.cpp index fd4ee9f1efa2e..789466fdaf0f9 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1660,6 +1660,11 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha // Set the URL, allow to follow http redirection curl_easy_setopt(curl, CURLOPT_URL, model_url); curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L); +#if defined(_WIN32) + // CURLSSLOPT_NATIVE_CA tells libcurl to use standard certificate store of + // operating system. Currently implemented under MS-Windows. 
+ curl_easy_setopt(curl, CURLOPT_SSL_OPTIONS, CURLSSLOPT_NATIVE_CA); +#endif // Check if the file already exists locally struct stat model_file_info; diff --git a/examples/server/tests/features/embeddings.feature b/examples/server/tests/features/embeddings.feature index fb821f802596d..dcf1434f97124 100644 --- a/examples/server/tests/features/embeddings.feature +++ b/examples/server/tests/features/embeddings.feature @@ -5,7 +5,7 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 And a model url https://huggingface.co/ggml-org/models/resolve/main/bert-bge-small/ggml-model-f16.gguf - And a model file /tmp/ggml-model-f16.gguf + And a model file ggml-model-f16.gguf And a model alias bert-bge-small And 42 as server seed And 2 slots diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 8ad987e1bb618..3b45de6bafc82 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -33,6 +33,16 @@ def after_scenario(context, scenario): print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") if not pid_exists(context.server_process.pid): + print("Trying to find server logs:") + out, err = context.server_process.communicate() + if out: + print("Server stdout:\n") + print(out) + print("\n") + if err: + print("Server stderr:\n") + print(err) + print("\n") assert False, f"Server not running pid={context.server_process.pid} ..." server_graceful_shutdown(context) diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 19d064dfd0304..9b25b1aebe587 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -1094,6 +1094,8 @@ def start_server_background(context): pkwargs = { 'creationflags': flags, + 'stderr': subprocess.PIPE, + 'stdout': subprocess.PIPE } context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], From cff7faaccbebdd64275fa801bd65f514b5d14699 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 16:28:01 +0100 Subject: [PATCH 46/52] ci: tests: print server logs in case of scenario failure --- examples/server/tests/features/environment.py | 28 ++++++++++++------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 3b45de6bafc82..9ae3954e9aa62 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -33,16 +33,7 @@ def after_scenario(context, scenario): print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") if not pid_exists(context.server_process.pid): - print("Trying to find server logs:") - out, err = context.server_process.communicate() - if out: - print("Server stdout:\n") - print(out) - print("\n") - if err: - print("Server stderr:\n") - print(err) - print("\n") + print_server_logs(context) assert False, f"Server not running pid={context.server_process.pid} ..." 
server_graceful_shutdown(context) @@ -58,6 +49,9 @@ def after_scenario(context, scenario): if attempts > 5: server_kill_hard(context) + if scenario.status == "failed" or context.debug: + print_server_logs(context) + def server_graceful_shutdown(context): print(f"shutting down server pid={context.server_process.pid} ...\n") @@ -108,3 +102,17 @@ def pid_exists(pid): return e.errno == errno.EPERM else: return True + + +def print_server_logs(context): + print("Trying to find server logs:") + out, err = context.server_process.communicate() + if out: + print("Server stdout:\n") + print(out.decode("utf-8")) + print("\n") + if err: + print("Server stderr:\n") + print(err.decode("utf-8")) + print("\n") + From 4fe431d429e5f887a62f07b0906e32c32717749a Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 16:31:34 +0100 Subject: [PATCH 47/52] common: llama_load_model_from_url: make it working on windows: disable global curl function, use a write callback. --- common/common.cpp | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/common/common.cpp b/common/common.cpp index 789466fdaf0f9..25d1ff4fa81bf 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1648,11 +1648,9 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha } // Initialize libcurl globally - curl_global_init(CURL_GLOBAL_DEFAULT); auto curl = curl_easy_init(); if (!curl) { - curl_global_cleanup(); fprintf(stderr, "%s: error initializing libcurl\n", __func__); return NULL; } @@ -1734,23 +1732,36 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha CURLcode res = curl_easy_perform(curl); if (res != CURLE_OK) { curl_easy_cleanup(curl); - curl_global_cleanup(); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); return NULL; } + + long http_code = 0; + curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &http_code); + if (http_code != 200) { + // HEAD not supported, we don't know if the file has changed + // force trigger downloading + file_exists = false; + fprintf(stderr, "%s: HEAD invalid http status code received: %ld\n", __func__, http_code); + } } // If the ETag or the Last-Modified headers are different: trigger a new download - if (strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { + if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { // Set the output file auto * outfile = fopen(path_model, "wb"); if (!outfile) { curl_easy_cleanup(curl); - curl_global_cleanup(); fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); return NULL; } + + typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); + auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { + return fwrite(data, size, nmemb, (FILE *)fd);; + }; curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); + curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast(write_callback)); curl_easy_setopt(curl, CURLOPT_WRITEDATA, outfile); // display download progress @@ -1763,7 +1774,6 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha if (res != CURLE_OK) { fclose(outfile); curl_easy_cleanup(curl); - curl_global_cleanup(); fprintf(stderr, "%s: curl_easy_perform() failed: %s\n", __func__, curl_easy_strerror(res)); return NULL; } @@ -1773,7 +1783,6 @@ struct llama_model * llama_load_model_from_url(const char * model_url, 
const cha if (http_code < 200 || http_code >= 400) { fclose(outfile); curl_easy_cleanup(curl); - curl_global_cleanup(); fprintf(stderr, "%s: invalid http status code received: %ld\n", __func__, http_code); return NULL; } @@ -1804,7 +1813,6 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha } curl_easy_cleanup(curl); - curl_global_cleanup(); return llama_load_model_from_file(path_model, params); } From 47a9e5d76c1cb30b71f6f79eb29bfe1cc58cda50 Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 16:37:40 +0100 Subject: [PATCH 48/52] ci: tests: increase timeout for windows --- examples/server/tests/features/steps/steps.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 9b25b1aebe587..93845244ad1b2 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -153,7 +153,8 @@ def step_start_server(context): async def step_wait_for_the_server_to_be_started(context, expecting_status): match expecting_status: case 'healthy': - await wait_for_health_status(context, context.base_url, 200, 'ok') + await wait_for_health_status(context, context.base_url, 200, 'ok', + timeout=30) case 'ready' | 'idle': await wait_for_health_status(context, context.base_url, 200, 'ok', From 31272c635a46722f0ec46813f63b038686b9652f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 16:46:53 +0100 Subject: [PATCH 49/52] common: fix typo --- common/common.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/common/common.cpp b/common/common.cpp index 25d1ff4fa81bf..de05018550f65 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1758,7 +1758,7 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha typedef size_t(*CURLOPT_WRITEFUNCTION_PTR)(void * data, size_t size, size_t nmemb, void * fd); auto write_callback = [](void * data, size_t size, size_t nmemb, void * fd) -> size_t { - return fwrite(data, size, nmemb, (FILE *)fd);; + return fwrite(data, size, nmemb, (FILE *)fd); }; curl_easy_setopt(curl, CURLOPT_NOBODY, 0L); curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, static_cast(write_callback)); From f902ab6de2b99abb6569f06642963bacfd0fc81d Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 16:37:02 +0100 Subject: [PATCH 50/52] common: llama_load_model_from_url use a temporary file for downloading --- .github/workflows/server.yml | 2 +- common/common.cpp | 11 ++++++++++- examples/server/tests/features/environment.py | 4 ++-- examples/server/tests/features/server.feature | 3 ++- 4 files changed, 15 insertions(+), 5 deletions(-) diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml index 79807f8971d41..4ea09115a3c44 100644 --- a/.github/workflows/server.yml +++ b/.github/workflows/server.yml @@ -117,7 +117,7 @@ jobs: run: | mkdir build cd build - cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" -DLLAMA_BUILD_SERVER=ON -DLLAMA_NATIVE=OFF -DBUILD_SHARED_LIBS=ON + cmake .. -DLLAMA_CURL=ON -DCURL_LIBRARY="$env:RUNNER_TEMP/libcurl/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="$env:RUNNER_TEMP/libcurl/include" cmake --build . 
--config Release -j ${env:NUMBER_OF_PROCESSORS} --target server - name: Python setup diff --git a/common/common.cpp b/common/common.cpp index de05018550f65..3ecd4e5cdbd11 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1748,8 +1748,11 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha // If the ETag or the Last-Modified headers are different: trigger a new download if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { + char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; + snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model); + // Set the output file - auto * outfile = fopen(path_model, "wb"); + auto * outfile = fopen(path_model_temporary, "wb"); if (!outfile) { curl_easy_cleanup(curl); fprintf(stderr, "%s: error opening local file for writing: %s\n", __func__, path_model); @@ -1810,6 +1813,12 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha headers.last_modified); } } + + if (rename(path_model_temporary, path_model) != 0) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: unable to rename file: %s to %s\n", __func__, path_model_temporary, path_model); + return NULL; + } } curl_easy_cleanup(curl); diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 9ae3954e9aa62..96751d71364d2 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -109,10 +109,10 @@ def print_server_logs(context): out, err = context.server_process.communicate() if out: print("Server stdout:\n") - print(out.decode("utf-8")) + print(out.decode('utf-8')) print("\n") if err: print("Server stderr:\n") - print(err.decode("utf-8")) + print(err.decode('utf-8')) print("\n") diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature index 5014f326dc050..7448986e75a49 100644 --- a/examples/server/tests/features/server.feature +++ b/examples/server/tests/features/server.feature @@ -4,7 +4,8 @@ Feature: llama.cpp server Background: Server startup Given a server listening on localhost:8080 - And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models + And a model url https://huggingface.co/ggml-org/models/resolve/main/tinyllamas/stories260K.gguf + And a model file stories260K.gguf And a model alias tinyllama-2 And 42 as server seed # KV Cache corresponds to the total amount of tokens From b24f30fdad741cf0178d29f008519b5349f52e9a Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 16:52:38 +0100 Subject: [PATCH 51/52] common: llama_load_model_from_url delete previous file before downloading --- common/common.cpp | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/common/common.cpp b/common/common.cpp index 3ecd4e5cdbd11..2f5d965d6511c 100644 --- a/common/common.cpp +++ b/common/common.cpp @@ -1750,6 +1750,14 @@ struct llama_model * llama_load_model_from_url(const char * model_url, const cha if (!file_exists || strcmp(etag, headers.etag) != 0 || strcmp(last_modified, headers.last_modified) != 0) { char path_model_temporary[LLAMA_CURL_MAX_PATH_LENGTH] = {0}; snprintf(path_model_temporary, sizeof(path_model_temporary), "%s.downloadInProgress", path_model); + if (file_exists) { + fprintf(stderr, "%s: deleting previous downloaded model file: %s\n", __func__, path_model); + if (remove(path_model) != 0) { + curl_easy_cleanup(curl); + fprintf(stderr, "%s: 
unable to delete file: %s\n", __func__, path_model); + return NULL; + } + } // Set the output file auto * outfile = fopen(path_model_temporary, "wb"); From fcf327f0e64002dfd9e5146f6eb74a3069fda38f Mon Sep 17 00:00:00 2001 From: Pierrick HYMBERT Date: Sun, 17 Mar 2024 17:45:09 +0100 Subject: [PATCH 52/52] ci: tests: fix behavior on windows --- examples/server/tests/features/environment.py | 113 ++++++++---------- examples/server/tests/features/steps/steps.py | 19 ++- examples/server/tests/requirements.txt | 1 + 3 files changed, 66 insertions(+), 67 deletions(-) diff --git a/examples/server/tests/features/environment.py b/examples/server/tests/features/environment.py index 96751d71364d2..82104e9202e5e 100644 --- a/examples/server/tests/features/environment.py +++ b/examples/server/tests/features/environment.py @@ -1,10 +1,12 @@ -import errno import os +import signal import socket -import subprocess +import sys import time +import traceback from contextlib import closing -import signal + +import psutil def before_scenario(context, scenario): @@ -20,37 +22,40 @@ def before_scenario(context, scenario): def after_scenario(context, scenario): - if context.server_process is None: - return - if scenario.status == "failed": - if 'GITHUB_ACTIONS' in os.environ: - print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") - if os.path.isfile('llama.log'): - with closing(open('llama.log', 'r')) as f: - for line in f: - print(line) - if not is_server_listening(context.server_fqdn, context.server_port): - print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") - - if not pid_exists(context.server_process.pid): - print_server_logs(context) - assert False, f"Server not running pid={context.server_process.pid} ..." - - server_graceful_shutdown(context) - - # Wait few for socket to free up - time.sleep(0.05) - - attempts = 0 - while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port): - server_kill(context) - time.sleep(0.1) - attempts += 1 - if attempts > 5: - server_kill_hard(context) - - if scenario.status == "failed" or context.debug: - print_server_logs(context) + try: + if 'server_process' not in context or context.server_process is None: + return + if scenario.status == "failed": + if 'GITHUB_ACTIONS' in os.environ: + print(f"\x1b[33;101mSCENARIO FAILED: {scenario.name} server logs:\x1b[0m\n\n") + if os.path.isfile('llama.log'): + with closing(open('llama.log', 'r')) as f: + for line in f: + print(line) + if not is_server_listening(context.server_fqdn, context.server_port): + print("\x1b[33;101mERROR: Server stopped listening\x1b[0m\n") + + if not pid_exists(context.server_process.pid): + assert False, f"Server not running pid={context.server_process.pid} ..." 
+ + server_graceful_shutdown(context) + + # Wait few for socket to free up + time.sleep(0.05) + + attempts = 0 + while pid_exists(context.server_process.pid) or is_server_listening(context.server_fqdn, context.server_port): + server_kill(context) + time.sleep(0.1) + attempts += 1 + if attempts > 5: + server_kill_hard(context) + except: + exc = sys.exception() + print("error in after scenario: \n") + print(exc) + print("*** print_tb: \n") + traceback.print_tb(exc.__traceback__, file=sys.stdout) def server_graceful_shutdown(context): @@ -71,11 +76,11 @@ def server_kill_hard(context): path = context.server_path print(f"Server dangling exits, hard killing force {pid}={path}...\n") - if os.name == 'nt': - process = subprocess.check_output(['taskkill', '/F', '/pid', str(pid)]).decode() - print(process) - else: - os.kill(-pid, signal.SIGKILL) + try: + psutil.Process(pid).kill() + except psutil.NoSuchProcess: + return False + return True def is_server_listening(server_fqdn, server_port): @@ -88,31 +93,9 @@ def is_server_listening(server_fqdn, server_port): def pid_exists(pid): - """Check whether pid exists in the current process table.""" - if pid < 0: + try: + psutil.Process(pid) + except psutil.NoSuchProcess: return False - if os.name == 'nt': - output = subprocess.check_output(['TASKLIST', '/FI', f'pid eq {pid}']).decode() - print(output) - return "No tasks are running" not in output - else: - try: - os.kill(pid, 0) - except OSError as e: - return e.errno == errno.EPERM - else: - return True - - -def print_server_logs(context): - print("Trying to find server logs:") - out, err = context.server_process.communicate() - if out: - print("Server stdout:\n") - print(out.decode('utf-8')) - print("\n") - if err: - print("Server stderr:\n") - print(err.decode('utf-8')) - print("\n") + return True diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py index 93845244ad1b2..9e348d5fc4c37 100644 --- a/examples/server/tests/features/steps/steps.py +++ b/examples/server/tests/features/steps/steps.py @@ -5,6 +5,8 @@ import re import socket import subprocess +import sys +import threading import time from contextlib import closing from re import RegexFlag @@ -1095,10 +1097,23 @@ def start_server_background(context): pkwargs = { 'creationflags': flags, - 'stderr': subprocess.PIPE, - 'stdout': subprocess.PIPE + 'stdout': subprocess.PIPE, + 'stderr': subprocess.PIPE } context.server_process = subprocess.Popen( [str(arg) for arg in [context.server_path, *server_args]], **pkwargs) + + def log_stdout(process): + for line in iter(process.stdout.readline, b''): + print(line.decode('utf-8'), end='') + thread_stdout = threading.Thread(target=log_stdout, args=(context.server_process,)) + thread_stdout.start() + + def log_stderr(process): + for line in iter(process.stderr.readline, b''): + print(line.decode('utf-8'), end='', file=sys.stderr) + thread_stderr = threading.Thread(target=log_stderr, args=(context.server_process,)) + thread_stderr.start() + print(f"server pid={context.server_process.pid}, behave pid={os.getpid()}") diff --git a/examples/server/tests/requirements.txt b/examples/server/tests/requirements.txt index 2e4f42ad28c23..c2c960102b523 100644 --- a/examples/server/tests/requirements.txt +++ b/examples/server/tests/requirements.txt @@ -3,4 +3,5 @@ behave~=1.2.6 huggingface_hub~=0.20.3 numpy~=1.24.4 openai~=0.25.0 +psutil~=5.9.8 prometheus-client~=0.20.0
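
Taken together, the libcurl commits above converge on a small flow: issue a HEAD request and capture the `etag`/`last-modified` validators through a header callback, compare them with the values cached next to the model (the `<model>.etag` and `<model>.lastModified` side files), and only when they differ stream the body through a write callback into a `<model>.downloadInProgress` file that is renamed over the target on success. The sketch below condenses that flow into one self-contained helper for readers who want to see it in one place. It is a minimal sketch, not the code from the patches: the helper name `fetch_if_changed`, the `std::string`-based header parsing, the boolean return, and passing the cached validators as parameters are simplifications assumed here, whereas the patches keep C-style buffers, llama.cpp's logging, and the side-file handling shown in the diffs.

```cpp
// Minimal sketch, assuming only libcurl. fetch_if_changed and its signature are
// illustrative names chosen for this example, not part of the llama.cpp patches.
#include <stdio.h>
#include <string>
#include <curl/curl.h>

struct remote_headers { std::string etag, last_modified; };

// Header callback for the HEAD request: capture the validators we want to cache.
// Note: a robust version should match header names case-insensitively.
static size_t on_header(char * buf, size_t size, size_t nitems, void * userdata) {
    auto * h = static_cast<remote_headers *>(userdata);
    std::string line(buf, size * nitems);
    if (line.rfind("etag: ", 0) == 0)          h->etag          = line.substr(6,  line.size() - 8);  // strip CRLF
    if (line.rfind("last-modified: ", 0) == 0) h->last_modified = line.substr(15, line.size() - 17); // strip CRLF
    return size * nitems;
}

// Write callback for the GET request: stream the body straight into the open file.
static size_t on_body(char * ptr, size_t size, size_t nmemb, void * userdata) {
    return fwrite(ptr, size, nmemb, static_cast<FILE *>(userdata));
}

// Download url to path only when the remote ETag/Last-Modified differ from the
// cached values; write to "<path>.downloadInProgress" first, rename on success.
// Call curl_global_init(CURL_GLOBAL_DEFAULT) once at program start.
static bool fetch_if_changed(const char * url, const char * path,
                             const std::string & cached_etag,
                             const std::string & cached_last_modified) {
    CURL * curl = curl_easy_init();
    if (!curl) return false;

    remote_headers h;
    curl_easy_setopt(curl, CURLOPT_URL, url);
    curl_easy_setopt(curl, CURLOPT_FOLLOWLOCATION, 1L);
    curl_easy_setopt(curl, CURLOPT_NOBODY, 1L);      // HEAD: fetch headers only
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 1L);
    curl_easy_setopt(curl, CURLOPT_HEADERFUNCTION, on_header);
    curl_easy_setopt(curl, CURLOPT_HEADERDATA, &h);
    if (curl_easy_perform(curl) != CURLE_OK) { curl_easy_cleanup(curl); return false; }

    bool cache_valid = (!cached_etag.empty() || !cached_last_modified.empty())
                    && h.etag == cached_etag && h.last_modified == cached_last_modified;
    if (cache_valid) { curl_easy_cleanup(curl); return true; } // local copy is up to date

    std::string tmp = std::string(path) + ".downloadInProgress";
    FILE * out = fopen(tmp.c_str(), "wb");
    if (!out) { curl_easy_cleanup(curl); return false; }

    curl_easy_setopt(curl, CURLOPT_HTTPGET, 1L);     // switch from HEAD back to a normal GET
    curl_easy_setopt(curl, CURLOPT_NOPROGRESS, 0L);  // let curl print download progress
    curl_easy_setopt(curl, CURLOPT_WRITEFUNCTION, on_body);
    curl_easy_setopt(curl, CURLOPT_WRITEDATA, out);
    CURLcode res = curl_easy_perform(curl);

    long status = 0;
    curl_easy_getinfo(curl, CURLINFO_RESPONSE_CODE, &status);
    fclose(out);
    curl_easy_cleanup(curl);
    if (res != CURLE_OK || status < 200 || status >= 400) return false;

    return rename(tmp.c_str(), path) == 0;           // publish the finished file
}
```

Writing into `<path>.downloadInProgress` and renaming only after a successful transfer mirrors the last two common.cpp patches above: an interrupted or failed download can never be mistaken for a complete model file, and a stale copy is removed before the new one is fetched.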