From 26eb8562e33d8047e108f270cb966d5041c651fa Mon Sep 17 00:00:00 2001 From: mqy Date: Fri, 26 May 2023 21:57:14 +0800 Subject: [PATCH] ggml.c: bugfix CBLAS profile #1 was not executed; misc minor refactors --- .../bench-out/7b.q4_0.accelerate.txt | 64 ++++----- examples/mulmat-tune/mulmat-tune.c | 84 ++++++------ examples/mulmat-tune/mulmat-tune.h | 3 +- ggml.c | 121 ++++++------------ 4 files changed, 117 insertions(+), 155 deletions(-) diff --git a/examples/mulmat-tune/bench-out/7b.q4_0.accelerate.txt b/examples/mulmat-tune/bench-out/7b.q4_0.accelerate.txt index 294b0eba2cd517..2f9f22f921a88e 100644 --- a/examples/mulmat-tune/bench-out/7b.q4_0.accelerate.txt +++ b/examples/mulmat-tune/bench-out/7b.q4_0.accelerate.txt @@ -3,38 +3,38 @@ -1 0 0 3 0 1 -1 0 0 0 1 0 3 0 1 -1 0 0 4096 4096 - 16 23 14046 0 0 0 0 11366 6297 0 - 32 36 26793 0 0 0 0 11244 6201 0 - 48 55 40187 0 0 0 0 11316 7811 0 - 64 78 54450 0 0 0 0 11149 7859 0 - 80 96 68095 0 0 0 0 11258 8748 0 - 96 114 81588 0 0 0 0 11017 10248 0 -112 134 96596 0 0 0 0 11186 10506 0 -128 157 112871 0 0 0 0 11179 11887 0 + 16 17 14400 0 0 20380 0 13643 6406 0 + 32 48 26184 0 0 17892 0 12759 6875 0 + 48 62 40950 0 0 20940 0 11344 6470 0 + 64 75 54959 0 0 19897 0 12056 8272 0 + 80 95 69812 0 0 23261 0 13296 10944 0 + 96 135 82530 0 0 20238 0 11363 9733 0 +112 135 97063 0 0 21620 0 11008 10231 0 +128 160 110596 0 0 22374 0 11130 12202 0 4096 11008 - 16 55 36520 0 0 0 0 29851 9467 0 - 32 103 73460 0 0 0 0 29815 11175 0 - 48 173 109619 0 0 0 0 29870 13368 0 - 64 206 147174 0 0 0 0 29571 16828 0 - 80 289 178721 0 0 0 0 29895 18013 0 - 96 343 219130 0 0 0 0 29633 21457 0 -112 550 257754 0 0 0 0 30342 23557 0 -128 594 298395 0 0 0 0 29683 24796 0 + 16 63 34214 0 0 43145 0 30377 9875 0 + 32 98 71625 0 0 43591 0 29675 11653 0 + 48 155 109818 0 0 44130 0 30964 14123 0 + 64 253 144841 0 0 46174 0 29843 17059 0 + 80 279 175670 0 0 47225 0 29574 16913 0 + 96 331 217921 0 0 48978 0 29582 19354 0 +112 408 254362 0 0 53326 0 29963 22962 0 +128 611 281834 0 0 57593 0 30629 25448 0 11008 4096 - 16 19 35077 0 0 0 0 30130 21051 0 - 32 43 71844 0 0 0 0 29937 21740 0 - 48 56 108664 0 0 0 0 30534 23017 0 - 64 94 148288 0 0 0 0 29848 26486 0 - 80 108 187098 0 0 0 0 29896 29687 0 - 96 116 224466 0 0 0 0 29931 31416 0 -112 137 264372 0 0 0 0 29797 34035 0 -128 178 300958 0 0 0 0 29713 37036 0 + 16 18 35422 0 0 53263 0 30608 20630 0 + 32 37 69747 0 0 54542 0 30501 23162 0 + 48 53 107693 0 0 56207 0 29500 23522 0 + 64 99 144891 0 0 60231 0 29461 23695 0 + 80 98 178384 0 0 60697 0 29281 26783 0 + 96 112 217583 0 0 63507 0 29741 31710 0 +112 170 253402 0 0 65329 0 28823 34861 0 +128 189 290395 0 0 70656 0 31023 35913 0 32000 4096 - 16 18 105077 0 0 0 0 87731 67479 0 - 32 36 205088 0 0 0 0 86620 72865 0 - 48 54 314438 0 0 0 0 87458 77700 0 - 64 75 420397 0 0 0 0 86515 83575 0 - 80 109 541305 0 0 0 0 86580 88873 0 - 96 121 646842 0 0 0 0 86500 96982 0 -112 134 761083 0 0 0 0 87326 102948 0 -128 155 872466 0 0 0 0 87668 112924 0 \ No newline at end of file + 16 18 104453 0 0 146992 0 86361 67977 0 + 32 36 203698 0 0 150361 0 87629 71108 0 + 48 89 312316 0 0 155162 0 86803 76783 0 + 64 104 428321 0 0 161366 0 89776 82720 0 + 80 93 532930 0 0 171931 0 87039 88321 0 + 96 113 642233 0 0 176509 0 86327 95598 0 +112 169 745426 0 0 186020 0 87538 102664 0 +128 202 860052 0 0 196480 0 88918 109959 0 diff --git a/examples/mulmat-tune/mulmat-tune.c b/examples/mulmat-tune/mulmat-tune.c index b7f0744e35e92e..d9b31d76ca4f83 100644 --- a/examples/mulmat-tune/mulmat-tune.c +++ 
b/examples/mulmat-tune/mulmat-tune.c
@@ -79,25 +79,31 @@ int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
         return rc;
     }
 
-    tune->items = malloc(sizeof(struct ggml_mulmat_tune_m) *
-                         (tune->n_shapes * tune->n_profiles * tune->m_num));
-    if (tune->items == NULL) {
-        fprintf(stderr, "failed to allocate memory\n");
-        return -2;
+    {
+        size_t item_size = sizeof(struct ggml_mulmat_tune_m) *
+                           (tune->n_shapes * tune->n_profiles * tune->m_num);
+        tune->items = malloc(item_size);
+        if (tune->items == NULL) {
+            fprintf(stderr, "failed to allocate memory\n");
+            return -2;
+        }
+        memset(tune->items, 0, item_size);
     }
 
-    size_t sz = sizeof(struct ggml_task_profile) * tune->n_profiles;
-    tune->profiles = malloc(sz);
-    GGML_ASSERT(tune->profiles);
-    memset(tune->profiles, 0, sz);
+    {
+        size_t sz = sizeof(struct ggml_task_profile) * tune->n_profiles;
+        tune->profiles = malloc(sz);
+        GGML_ASSERT(tune->profiles);
+        memset(tune->profiles, 0, sz);
+    }
 
     for (int ip = 0; ip < tune->n_profiles; ip++) {
         struct ggml_task_profile *profile = &tune->profiles[ip];
         for (int j = 0; j < 3; j++) {
             struct ggml_task_stage *ts = &profile->stages[j];
             int backend, parallel, wait;
-            rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait);
-            if (rc <= 0) {
+            if (rc = fscanf(fp, "%d %d %d", &backend, &parallel, &wait),
+                rc <= 0) {
                 return rc;
             }
             ts->backend = backend;
@@ -107,9 +113,9 @@ int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
     }
 
     for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) {
-        rc = fscanf(fp, "%d %d", &tune->shapes[i_shape].N,
-                    &tune->shapes[i_shape].K);
-        if (rc <= 0) {
+        if (rc = fscanf(fp, "%d %d", &tune->shapes[i_shape].N,
+                        &tune->shapes[i_shape].K),
+            rc <= 0) {
             return rc;
         }
 
@@ -117,8 +123,7 @@ int ggml_mulmat_tune_read_data(struct ggml_mulmat_tune *tune, FILE *fp) {
         int M;
         for (int ip = 0; ip < tune->n_profiles; ip++) {
             if (ip == 0) {
-                rc = fscanf(fp, "%d", &M);
-                if (rc <= 0) {
+                if (rc = fscanf(fp, "%d", &M), rc <= 0) {
                     return rc;
                 }
             }
@@ -126,9 +131,9 @@
                     ggml_mulmat_tune_get_item_index(tune, i_shape, ip, i_m);
                 struct ggml_mulmat_tune_m *item = &tune->items[index];
                 item->M = M;
-                rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
-                            &item->stages_time[1], &item->stages_time[2]);
-                if (rc <= 0) {
+                if (rc = fscanf(fp, "%d %d %d", &item->stages_time[0],
+                                &item->stages_time[1], &item->stages_time[2]),
+                    rc <= 0) {
                     return rc;
                 }
             }
@@ -139,11 +144,12 @@
 }
 
 int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp) {
-    int rc = fprintf(fp, "%d %s %d %s %d %s %d %d %d %d\n", tune->version,
+    int rc;
+    if (rc = fprintf(fp, "%d %s %d %s %d %s %d %d %d %d\n", tune->version,
                      tune->model, tune->type, tune->type_name, tune->backend,
                      tune->blas_vendor, tune->n_shapes, tune->m_step,
-                     tune->m_num, tune->n_profiles);
-    if (rc <= 0) {
+                     tune->m_num, tune->n_profiles),
+        rc <= 0) {
         return rc;
     }
 
@@ -151,28 +157,25 @@ int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp) {
         struct ggml_task_profile *profile = &tune->profiles[i];
         for (int j = 0; j < 3; j++) {
             struct ggml_task_stage *ts = &profile->stages[j];
-            rc = fprintf(fp, "%2d %d %d", ts->backend,
-                         ts->parallel ? 1 : 0, ts->wait ? 1 : 0);
-            if (rc <= 0) {
+            if (rc = fprintf(fp, "%2d %d %d", ts->backend, ts->parallel ? 1 : 0,
+                             ts->wait ? 
1 : 0), + rc <= 0) { return rc; } if (j < 2) { - rc = fprintf(fp, " "); - if (rc <= 0) { + if (rc = fprintf(fp, " "), rc <= 0) { return rc; } } } - rc = fprintf(fp, "\n"); - if (rc <= 0) { + if (rc = fprintf(fp, "\n"), rc <= 0) { return rc; } } for (int i_shape = 0; i_shape < tune->n_shapes; i_shape++) { const struct ggml_mulmat_tune_nk *shape = &tune->shapes[i_shape]; - rc = fprintf(fp, "%d %d\n", shape->N, shape->K); - if (rc <= 0) { + if (rc = fprintf(fp, "%d %d\n", shape->N, shape->K), rc <= 0) { return rc; } @@ -182,8 +185,7 @@ int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp) { ggml_mulmat_tune_get_item_index(tune, i_shape, ip, i_m); struct ggml_mulmat_tune_m *item = &tune->items[index]; if (ip == 0) { - rc = fprintf(fp, "%3d", item->M); - if (rc <= 0) { + if (rc = fprintf(fp, "%3d", item->M), rc <= 0) { return rc; } } @@ -191,20 +193,18 @@ int ggml_mulmat_tune_write_data(const struct ggml_mulmat_tune *tune, FILE *fp) { struct ggml_task_profile *profile = &tune->profiles[ip]; for (int k = 0; k < 3; k++) { if (profile->stages[k].backend != GGML_BACKEND_UNKNOWN) { - rc = fprintf(fp, "%9d", item->stages_time[k]); - if (rc <= 0) { + if (rc = fprintf(fp, "%9d", item->stages_time[k]), + rc <= 0) { return rc; } } else { - rc = fprintf(fp, " 0"); - if (rc <= 0) { + if (rc = fprintf(fp, " 0"), rc <= 0) { return rc; } } } } - rc = fprintf(fp, "\n"); - if (rc <= 0) { + if (rc = fprintf(fp, "\n"), rc <= 0) { return rc; } } @@ -298,8 +298,8 @@ void ggml_mulmat_tune_estimate_time( if (ts->parallel) { t /= nth; } - time_stats->profile_time[ip].stage_time[stage] = t; - time_stats->profile_time[ip].total_time += t; + time_stats->profile_time[ip].stage_time[stage] = (int)t; + time_stats->profile_time[ip].total_time += (int)t; } } } @@ -313,7 +313,7 @@ static const char *ggml_backend_names[] = { const char *ggml_get_backend_name(enum ggml_backend backend) { if (backend == GGML_BACKEND_UNKNOWN) { - return ""; + return "UNKNOWN"; } return ggml_backend_names[backend]; } diff --git a/examples/mulmat-tune/mulmat-tune.h b/examples/mulmat-tune/mulmat-tune.h index 5dee88935c1289..f98405795013b6 100644 --- a/examples/mulmat-tune/mulmat-tune.h +++ b/examples/mulmat-tune/mulmat-tune.h @@ -13,7 +13,7 @@ extern "C" { #define GGML_MULMAT_MAX_PROFILES 4 struct ggml_task_stage { - int backend; // enum ggml_backend + /*enum ggml_backend*/ int backend; bool parallel; bool wait; }; @@ -99,7 +99,6 @@ void ggml_mulmat_init_task_profiles(/*enum ggml_backend*/ int backend); int ggml_mulmat_get_task_profiles(struct ggml_task_profile **profiles, int src0_type, int src1_type); -// returns enum ggml_backend /*enum ggml_backend*/ int ggml_auto_detect_backend(void); const char *ggml_get_backend_name(/*enum ggml_backend*/ int backend); diff --git a/ggml.c b/ggml.c index 4e88d799f6fb0c..a0010f9b57b861 100644 --- a/ggml.c +++ b/ggml.c @@ -9534,14 +9534,6 @@ static void ggml_compute_forward_rms_norm_back( // } // #endif - - - - - -// TODO: allow compile CUDA/CL along with CBLAS - - static void ggml_compute_forward_mul_mat_f32( const struct ggml_compute_params * params, const struct ggml_tensor * src0, @@ -10060,11 +10052,12 @@ static void ggml_compute_forward_mul_mat_q_f32( 0.0f, d, ne01); } } + return; } - return; GGML_ASSERT(nth == 1); GGML_ASSERT(params->type == GGML_TASK_COMPUTE); + GGML_ASSERT(init_backend == GGML_BACKEND_UNKNOWN); GGML_ASSERT(compute_backend == GGML_BACKEND_CBLAS); float * const wdata = params->wdata; @@ -14410,13 +14403,19 @@ void ggml_graph_compute_mul_mat_set_task_profile(struct 
ggml_cgraph *cgraph) {
                 e->N = N;
                 e->K = K;
                 e->profile = profile;
+
+                GGML_PRINT_THREAD_DEBUG("(1) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
+                    M, N, K,
+                    profile->stages[0].backend,
+                    profile->stages[1].backend,
+                    profile->stages[2].backend);
             }
         }
     }
 
     memcpy(&node->task_profile, profile, sizeof(struct ggml_task_profile));
 
-    GGML_PRINT_THREAD_DEBUG("M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
+    GGML_PRINT_THREAD_DEBUG("(2) M: %d, N: %d, K: %d, backends: %2d, %2d %2d\n",
         M, N, K,
         profile->stages[0].backend,
         profile->stages[1].backend,
@@ -14482,14 +14481,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
     // thread scheduling for the different operations
     for (int i = 0; i < cgraph->n_nodes; i++) {
         struct ggml_tensor * node = cgraph->nodes[i];
+        struct ggml_task_stage *stages = node->task_profile.stages;
 
         switch (node->op) {
             case GGML_OP_CPY:
             case GGML_OP_DUP:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
 
                     size_t cur = 0;
                     if (ggml_is_quantized(node->type)) {
                         cur = GGML_TYPE_SIZE[GGML_TYPE_F32] * node->ne[0] * n_threads;
                     }
@@ -14500,10 +14498,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_ADD:
             case GGML_OP_ADD1:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;
 
                     size_t cur = 0;
 
@@ -14515,13 +14511,9 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
                 } break;
             case GGML_OP_ACC:
                 {
-                    node->task_profile.stages[GGML_TASK_INIT] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_INIT].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;
 
                     size_t cur = 0;
 
@@ -14546,9 +14538,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_STEP:
             case GGML_OP_RELU:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
                 } break;
             case GGML_OP_MUL:
             case GGML_OP_GELU:
@@ -14558,28 +14548,27 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph)
             case GGML_OP_SILU:
             case GGML_OP_SILU_BACK:
             case GGML_OP_NORM:
             case GGML_OP_RMS_NORM:
             case GGML_OP_RMS_NORM_BACK:
                 {
-                    node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){
-                        .backend = GGML_BACKEND_CPU,
-                        .parallel = true,
-                    };
+                    stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU;
+                    stages[GGML_TASK_COMPUTE].parallel = true;
                 } break;
             case GGML_OP_MUL_MAT:
                 {
                     size_t cur = 0;
+                    enum ggml_backend compute_backend = stages[GGML_TASK_COMPUTE].backend;
 
 #if defined(GGML_USE_CUBLAS)
-                    if (node->task_profile.stages[GGML_TASK_COMPUTE].backend != GGML_BACKEND_CPU) {
+                    if (compute_backend != GGML_BACKEND_CPU) {
                         cur = ggml_cuda_mul_mat_get_wsize(node->src0, node->src1, node);
                     } else
 #elif defined(GGML_USE_CLBLAST)
-                    if (node->task_profile.stages[GGML_TASK_COMPUTE].backend != GGML_BACKEND_CPU) {
+                    if (compute_backend != GGML_BACKEND_CPU) {
                         cur = ggml_cl_mul_mat_get_wsize(node->src0, node->src1, node);
                     } else
 #endif
                     if (node->src0->type == GGML_TYPE_F16 && node->src1->type == GGML_TYPE_F32) {
 #if 
defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (node->task_profile.stages[GGML_TASK_COMPUTE].backend != GGML_BACKEND_CPU) { + if (compute_backend != GGML_BACKEND_CPU) { // here we need memory just for single 2D matrix from src0 cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); } else { @@ -14592,7 +14581,7 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) cur = 0; } else if (ggml_is_quantized(node->src0->type) && node->src1->type == GGML_TYPE_F32) { #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) - if (node->task_profile.stages[GGML_TASK_COMPUTE].backend != GGML_BACKEND_CPU) { + if (compute_backend != GGML_BACKEND_CPU) { cur = GGML_TYPE_SIZE[GGML_TYPE_F32]*(node->src0->ne[0]*node->src0->ne[1]); } else #endif @@ -14608,19 +14597,13 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_SCALE: { - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - .parallel = true, - }; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_SET: { - node->task_profile.stages[GGML_TASK_INIT] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - }; - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - }; + stages[GGML_TASK_INIT].backend = GGML_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; } break; case GGML_OP_CONT: case GGML_OP_RESHAPE: @@ -14632,42 +14615,30 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_DIAG: case GGML_OP_DIAG_MASK_ZERO: { - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - }; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; } break; case GGML_OP_DIAG_MASK_INF: case GGML_OP_SOFT_MAX: case GGML_OP_ROPE: case GGML_OP_ROPE_BACK: { - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - .parallel = true, - }; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; } break; case GGML_OP_ALIBI: { - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - }; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; } break; case GGML_OP_CLAMP: { - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - }; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; } break; case GGML_OP_CONV_1D_1S: case GGML_OP_CONV_1D_2S: { - node->task_profile.stages[GGML_TASK_INIT] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - }; - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - .parallel = true, - }; + stages[GGML_TASK_INIT].backend = GGML_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; GGML_ASSERT(node->src0->ne[3] == 1); GGML_ASSERT(node->src1->ne[2] == 1); @@ -14696,10 +14667,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_FLASH_ATTN: { - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - .parallel = true, - }; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 
0; @@ -14719,10 +14688,8 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) } break; case GGML_OP_FLASH_FF: { - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - .parallel = true, - }; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; + stages[GGML_TASK_COMPUTE].parallel = true; size_t cur = 0; if (node->src1->type == GGML_TYPE_F32) { @@ -14740,15 +14707,11 @@ void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph) case GGML_OP_MAP_UNARY: case GGML_OP_MAP_BINARY: { - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - }; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; } break; case GGML_OP_NONE: { - node->task_profile.stages[GGML_TASK_COMPUTE] = (struct ggml_task_stage){ - .backend = GGML_BACKEND_CPU, - }; + stages[GGML_TASK_COMPUTE].backend = GGML_BACKEND_CPU; } break; case GGML_OP_COUNT: {
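--

Note on the recurring `if (rc = fscanf(...), rc <= 0)` pattern introduced by this patch:
the comma operator evaluates the call first and then tests `rc`, so the call and its error
check collapse into a single condition. A minimal standalone sketch of the same pattern
(the `read_pair` helper is hypothetical, for illustration only):

    #include <stdio.h>

    /* fscanf returns the number of items converted, or EOF on failure, so
     * `rc <= 0` covers both a match failure (0) and EOF/error (negative),
     * mirroring the checks in ggml_mulmat_tune_read_data. */
    static int read_pair(FILE *fp, int *n, int *k) {
        int rc;
        if (rc = fscanf(fp, "%d %d", n, k), rc <= 0) {
            return rc; /* propagate 0 or EOF to the caller */
        }
        return rc; /* 2 on success */
    }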
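Note on the headline fix ("CBLAS profile #1 was not executed"): in
ggml_compute_forward_mul_mat_q_f32, the early `return` previously sat after the closing
brace of the non-CBLAS branch, so it ran unconditionally and the CBLAS compute path below
it was dead code; the patch moves the `return` inside the branch. A reduced control-flow
sketch with hypothetical names:

    /* Before the fix, an unconditional `return;` at this point made the
     * CBLAS path unreachable. Returning only from inside the branch lets
     * execution fall through to profile #1. */
    static void forward_mul_mat(int other_profile_taken) {
        if (other_profile_taken) {
            /* ... non-CBLAS path ... */
            return; /* ends only this branch now */
        }
        /* CBLAS profile #1 compute path actually executes here */
    }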
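Note on the casts added in ggml_mulmat_tune_estimate_time: a stage's interpolated time is
divided by `nth` when the stage is parallel, then stored and accumulated as `int`. A
self-contained sketch of that arithmetic (struct and function names are illustrative, not
the patch's API):

    #include <stdbool.h>

    struct stage_est {
        bool   parallel; /* does this stage scale with thread count? */
        double t;        /* interpolated single-thread stage time */
    };

    /* Sum per-stage estimates into a profile total, dividing parallel
     * stages by the thread count and truncating to int, matching the
     * explicit casts the patch adds. */
    static int estimate_total(const struct stage_est *stages, int n, int nth) {
        int total = 0;
        for (int i = 0; i < n; i++) {
            double t = stages[i].t;
            if (stages[i].parallel) {
                t /= nth;
            }
            total += (int)t;
        }
        return total;
    }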