Date: Mon, 23 Mar 2026 21:37:15 +0000 From: Yuri Victorovich <yuri@FreeBSD.org> To: ports-committers@FreeBSD.org, dev-commits-ports-all@FreeBSD.org, dev-commits-ports-main@FreeBSD.org Cc: Eric Camachat <eric@camachat.org> Subject: git: f0dc3ed09464 - main - misc/ggml: update 0.9.7 → 0.9.8 Message-ID: <69c1b28b.2114e.6da73a3d@gitrepo.freebsd.org>
index | next in thread | raw e-mail
The branch main has been updated by yuri: URL: https://cgit.FreeBSD.org/ports/commit/?id=f0dc3ed094644932d9746ea52b293949b6034515 commit f0dc3ed094644932d9746ea52b293949b6034515 Author: Eric Camachat <eric@camachat.org> AuthorDate: 2026-03-23 21:35:55 +0000 Commit: Yuri Victorovich <yuri@FreeBSD.org> CommitDate: 2026-03-23 21:37:12 +0000 misc/ggml: update 0.9.7 → 0.9.8 PR: 293988 --- misc/ggml/Makefile | 5 +- misc/ggml/distinfo | 6 +- misc/ggml/files/patch-19504 | 563 -------------------------------------------- misc/ggml/pkg-plist | 9 +- 4 files changed, 11 insertions(+), 572 deletions(-) diff --git a/misc/ggml/Makefile b/misc/ggml/Makefile index 878ee6170627..b380698977df 100644 --- a/misc/ggml/Makefile +++ b/misc/ggml/Makefile @@ -1,7 +1,6 @@ PORTNAME= ggml DISTVERSIONPREFIX= v -DISTVERSION= 0.9.7 -PORTREVISION= 1 +DISTVERSION= 0.9.8 CATEGORIES= misc # machine-learning MAINTAINER= yuri@FreeBSD.org @@ -33,6 +32,8 @@ CMAKE_TESTING_ON= GGML_BUILD_TESTS BINARY_ALIAS= git=false +PLIST_SUB+= DISTVERSION=${DISTVERSION} + OPTIONS_DEFINE= VULKAN OPTIONS_DEFAULT= VULKAN OPTIONS_SUB= yes diff --git a/misc/ggml/distinfo b/misc/ggml/distinfo index b687229af04d..e64461696800 100644 --- a/misc/ggml/distinfo +++ b/misc/ggml/distinfo @@ -1,3 +1,3 @@ -TIMESTAMP = 1771220453 -SHA256 (ggml-org-ggml-v0.9.7_GH0.tar.gz) = 7288285b194cbf7fd7b532628c5f9ae86dda2568ec2276a8bb49b9eef65cca00 -SIZE (ggml-org-ggml-v0.9.7_GH0.tar.gz) = 2569901 +TIMESTAMP = 1774234893 +SHA256 (ggml-org-ggml-v0.9.8_GH0.tar.gz) = 9d8b38e473697e9014ea2275fadb4ed5c247b1ca82404875fe5ac336c0d0754c +SIZE (ggml-org-ggml-v0.9.8_GH0.tar.gz) = 2748285 diff --git a/misc/ggml/files/patch-19504 b/misc/ggml/files/patch-19504 deleted file mode 100644 index 8611182bb7b2..000000000000 --- a/misc/ggml/files/patch-19504 +++ /dev/null @@ -1,563 +0,0 @@ -- PR19504 from llama.cpp - ---- include/ggml.h -+++ include/ggml.h -@@ -556,6 +556,7 @@ extern "C" { - GGML_OP_GATED_LINEAR_ATTN, - GGML_OP_RWKV_WKV7, - GGML_OP_SOLVE_TRI, -+ 
GGML_OP_GATED_DELTA_NET, - - GGML_OP_UNARY, - -@@ -2463,6 +2464,15 @@ extern "C" { - bool lower, - bool uni); - -+ GGML_API struct ggml_tensor * ggml_gated_delta_net( -+ struct ggml_context * ctx, -+ struct ggml_tensor * q, -+ struct ggml_tensor * k, -+ struct ggml_tensor * v, -+ struct ggml_tensor * g, -+ struct ggml_tensor * beta, -+ struct ggml_tensor * state); -+ - // custom operators - - typedef void (*ggml_custom1_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, int ith, int nth, void * userdata); ---- src/ggml-cpu/ggml-cpu.c -+++ src/ggml-cpu/ggml-cpu.c -@@ -2021,6 +2021,10 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm - { - ggml_compute_forward_solve_tri(params, tensor); - } break; -+ case GGML_OP_GATED_DELTA_NET: -+ { -+ ggml_compute_forward_gated_delta_net(params, tensor); -+ } break; - case GGML_OP_MAP_CUSTOM1: - { - ggml_compute_forward_map_custom1(params, tensor); -@@ -2200,6 +2204,7 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) { - } break; - case GGML_OP_COUNT_EQUAL: - case GGML_OP_SOLVE_TRI: -+ case GGML_OP_GATED_DELTA_NET: - { - n_tasks = n_threads; - } break; -@@ -2905,6 +2910,11 @@ struct ggml_cplan ggml_graph_plan( - { - cur = ggml_type_size(node->type)*(n_tasks + node->src[0]->ne[0]*n_tasks); - } break; -+ case GGML_OP_GATED_DELTA_NET: -+ { -+ const int64_t S_v = node->src[2]->ne[0]; -+ cur = (S_v * S_v + S_v) * sizeof(float) * n_tasks; -+ } break; - case GGML_OP_COUNT: - { - GGML_ABORT("fatal error"); ---- src/ggml-cpu/ops.cpp -+++ src/ggml-cpu/ops.cpp -@@ -10380,6 +10380,192 @@ void ggml_compute_forward_solve_tri(const struct ggml_compute_params * params, s - } - } - -+// ggml_compute_forward_gated_delta_net -+static void ggml_compute_forward_gated_delta_net_one_chunk( -+ const ggml_compute_params * params, -+ ggml_tensor * dst, -+ int64_t ir0, -+ int64_t ir1) { -+ -+ ggml_tensor * src_q = dst->src[0]; -+ ggml_tensor * src_k = dst->src[1]; -+ ggml_tensor * src_v = 
dst->src[2]; -+ ggml_tensor * src_g = dst->src[3]; -+ ggml_tensor * src_beta = dst->src[4]; -+ ggml_tensor * src_state = dst->src[5]; -+ -+ const int64_t S_v = src_v->ne[0]; -+ const int64_t H = src_v->ne[1]; -+ const int64_t n_tokens = src_v->ne[2]; -+ const int64_t n_seqs = src_v->ne[3]; -+ -+ GGML_ASSERT(ggml_is_contiguous_rows(src_q)); -+ GGML_ASSERT(ggml_is_contiguous_rows(src_k)); -+ GGML_ASSERT(ggml_is_contiguous_rows(src_v)); -+ GGML_ASSERT(ggml_is_contiguous(src_g)); -+ GGML_ASSERT(ggml_is_contiguous(src_beta)); -+ GGML_ASSERT(ggml_is_contiguous(src_state)); -+ -+ // TODO: to support KDA -+ GGML_ASSERT(ggml_are_same_shape(src_beta, src_g)); -+ -+ GGML_TENSOR_LOCALS(int64_t, neq, src_q, ne); -+ GGML_TENSOR_LOCALS(size_t, nbq, src_q, nb); -+ GGML_TENSOR_LOCALS(int64_t, nek, src_k, ne); -+ GGML_TENSOR_LOCALS(size_t, nbk, src_k, nb); -+ GGML_TENSOR_LOCALS(int64_t, nev, src_v, ne); -+ GGML_TENSOR_LOCALS(size_t, nbv, src_v, nb); -+ GGML_TENSOR_LOCALS(int64_t, neg, src_g, ne); -+ GGML_TENSOR_LOCALS(size_t, nbg, src_g, nb); -+ -+ // scratch layout per thread: [s_t(S_v*S_v) | delta(S_v)] -+ // s_t holds the transposed (row-major) state for contiguous vector ops -+ const int64_t scratch_per_thread = S_v * S_v + S_v; -+ const int ith = params->ith; -+ -+ float * scratch = (float *)params->wdata + ith * scratch_per_thread + CACHE_LINE_SIZE_F32; -+ -+ float * s_t = scratch; -+ float * delta = scratch + S_v * S_v; -+ -+ // output layout: [attn_scores | new_states] -+ // attn_scores: S_v * H * n_tokens * n_seqs floats -+ // new_states: S_v * S_v * H * n_seqs floats -+ const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs; -+ float * attn_out_base = (float *)dst->data; -+ float * state_out_base = (float *)dst->data + attn_score_elems; -+ -+ const float * state_in_base = (const float *)src_state->data; -+ -+ const int64_t rq1 = nev1 / neq1; -+ const int64_t rk1 = nev1 / nek1; -+ const int64_t rq3 = nev3 / neq3; -+ const int64_t rk3 = nev3 / nek3; -+ -+ const float 
scale = 1.0f / sqrtf((float) S_v); -+ -+ for (int64_t ir = ir0; ir < ir1; ++ir) { -+ const int64_t iv1 = ir % H; // head_index -+ const int64_t iv3 = ir / H; // sequence -+ -+ const int64_t iq1 = iv1 / rq1; -+ const int64_t ik1 = iv1 / rk1; -+ -+ const int64_t iq3 = iv3 / rq3; -+ const int64_t ik3 = iv3 / rk3; -+ -+ float * s_out = state_out_base + (iv3 * H + iv1) * S_v * S_v; -+ -+ // tranpose -+ const float * s_in = state_in_base + (iv3 * H + iv1) * S_v * S_v; -+ for (int64_t j = 0; j < S_v; ++j) { -+ for (int64_t i = 0; i < S_v; ++i) { -+ s_t[j * S_v + i] = s_in[j + i * S_v]; -+ } -+ } -+ -+ // attn output pointer for first token of this (head, seq) -+ float * attn_data = attn_out_base + (iv3 * n_tokens * H + iv1) * S_v; -+ -+ for (int64_t t = 0; t < n_tokens; t++) { -+ const float * q_d = (const float *)((const char *)src_q->data + iq3 * nbq3 + t * nbq2 + iq1 * nbq1); -+ const float * k_d = (const float *)((const char *)src_k->data + ik3 * nbk3 + t * nbk2 + ik1 * nbk1); -+ const float * v_d = (const float *)((const char *)src_v->data + iv3 * nbv3 + t * nbv2 + iv1 * nbv1); -+ -+ const size_t gb_byte_offset = iv3 * nbg3 + t * nbg2 + iv1 * nbg1; -+ const float beta_val = *(const float *)((const char *)src_beta->data + gb_byte_offset); -+ const float g_val = expf(*(const float *)((const char *)src_g->data + gb_byte_offset)); -+ -+ ggml_vec_scale_f32(S_v * S_v, s_t, g_val); -+ -+ for (int64_t j = 0; j < S_v; ++j) { -+ float kv_j; -+ ggml_vec_dot_f32(S_v, &kv_j, 0, &s_t[j * S_v], 0, k_d, 0, 1); -+ delta[j] = (v_d[j] - kv_j) * beta_val; -+ } -+ -+ // outer product: S[j][i] += k[i] * delta[j] -+ for (int64_t j = 0; j < S_v; ++j) { -+ ggml_vec_mad_f32(S_v, &s_t[j * S_v], k_d, delta[j]); -+ } -+ -+ // attn_out[j] = sum_i S[j][i] * q[i] = dot(s_t[j*S_v:], q) -+ for (int64_t j = 0; j < S_v; ++j) { -+ ggml_vec_dot_f32(S_v, &attn_data[j], 0, &s_t[j * S_v], 0, q_d, 0, 1); -+ } -+ ggml_vec_scale_f32(S_v, attn_data, scale); -+ -+ attn_data += S_v * H; // advance to next token 
-+ } -+ -+ // transpose back -+ for (int64_t j = 0; j < S_v; ++j) { -+ for (int64_t i = 0; i < S_v; ++i) { -+ s_out[j + i * S_v] = s_t[j * S_v + i]; -+ } -+ } -+ } -+} -+ -+ -+static void ggml_compute_forward_gated_delta_net_f32( -+ const ggml_compute_params * params, -+ ggml_tensor * dst) { -+ -+ ggml_tensor * V = dst->src[2]; -+ int64_t nr = V->ne[1] * V->ne[3]; -+ -+ // disable for NUMA -+ const bool disable_chunking = ggml_is_numa(); -+ -+ int nth = params->nth; -+ int ith = params->ith; -+ -+ // 4x chunks per thread -+ int nth_scaled = nth * 4; -+ int64_t chunk_size = (nr + nth_scaled - 1) / nth_scaled; -+ int64_t nchunk = (nr + chunk_size - 1) / chunk_size; -+ -+ if (nth == 1 || nchunk < nth || disable_chunking) { -+ nchunk = nth; -+ } -+ -+ if (ith == 0) { -+ ggml_threadpool_chunk_set(params->threadpool, nth); -+ } -+ -+ ggml_barrier(params->threadpool); -+ -+ const int64_t dr = (nr + nchunk - 1) / nchunk; -+ -+ int current_chunk = ith; -+ -+ while (current_chunk < nchunk) { -+ const int64_t ir0 = dr * current_chunk; -+ const int64_t ir1 = MIN(ir0 + dr, nr); -+ -+ ggml_compute_forward_gated_delta_net_one_chunk(params, dst, ir0, ir1); -+ current_chunk = ggml_threadpool_chunk_add(params->threadpool, 1); -+ } -+} -+ -+void ggml_compute_forward_gated_delta_net( -+ const ggml_compute_params * params, -+ ggml_tensor * dst) { -+ const ggml_tensor * src0 = dst->src[0]; -+ -+ switch (src0->type) { -+ case GGML_TYPE_F32: -+ { -+ ggml_compute_forward_gated_delta_net_f32(params, dst); -+ } break; -+ default: -+ { -+ GGML_ABORT("fatal error"); -+ } -+ } -+} -+ - // ggml_compute_forward_rwkv_wkv7 - - static void ggml_compute_forward_rwkv_wkv7_f32( ---- src/ggml-cpu/ops.h -+++ src/ggml-cpu/ops.h -@@ -102,6 +102,7 @@ void ggml_compute_forward_rwkv_wkv6(const struct ggml_compute_params * params, s - void ggml_compute_forward_rwkv_wkv7(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_solve_tri(const struct ggml_compute_params * 
params, struct ggml_tensor * dst); - void ggml_compute_forward_gla(const struct ggml_compute_params * params, struct ggml_tensor * dst); -+void ggml_compute_forward_gated_delta_net(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_map_custom1(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_map_custom2(const struct ggml_compute_params * params, struct ggml_tensor * dst); - void ggml_compute_forward_map_custom3(const struct ggml_compute_params * params, struct ggml_tensor * dst); ---- /dev/null -+++ src/ggml-cuda/gated_delta_net.cu -@@ -0,0 +1,169 @@ -+#include "gated_delta_net.cuh" -+#include "ggml-cuda/common.cuh" -+ -+template <int S_v> -+__global__ void gated_delta_net_cuda(const float * q, -+ const float * k, -+ const float * v, -+ const float * g, -+ const float * beta, -+ const float * curr_state, -+ float * dst, -+ int64_t H, -+ int64_t n_tokens, -+ int64_t n_seqs, -+ int64_t sq1, -+ int64_t sq2, -+ int64_t sq3, -+ int64_t sv1, -+ int64_t sv2, -+ int64_t sv3, -+ int64_t sg1, -+ int64_t sg2, -+ int64_t sg3, -+ int64_t rq1, -+ int64_t rq3, -+ float scale) { -+ const int64_t h_idx = blockIdx.x; -+ const int64_t sequence = blockIdx.y; -+ const int col = threadIdx.x; // each thread owns one column -+ -+ const int64_t iq1 = h_idx / rq1; -+ const int64_t iq3 = sequence / rq3; -+ -+ const int64_t attn_score_elems = S_v * H * n_tokens * n_seqs; -+ float * attn_data = dst; -+ float * state = dst + attn_score_elems; -+ -+ const int64_t state_offset = (sequence * H + h_idx) * S_v * S_v; -+ state += state_offset; -+ curr_state += state_offset; -+ attn_data += (sequence * n_tokens * H + h_idx) * S_v; -+ -+ // Load state column into registers -+ float s[S_v]; -+#pragma unroll -+ for (int i = 0; i < S_v; i++) { -+ s[i] = curr_state[i * S_v + col]; -+ } -+ -+ for (int t = 0; t < n_tokens; t++) { -+ const float * q_t = q + iq3 * sq3 + t * sq2 + iq1 * sq1; -+ const float * k_t = k 
+ iq3 * sq3 + t * sq2 + iq1 * sq1; -+ const float * v_t = v + sequence * sv3 + t * sv2 + h_idx * sv1; -+ -+ const float * g_t = g + sequence * sg3 + t * sg2 + h_idx * sg1; -+ const float * beta_t = beta + sequence * sg3 + t * sg2 + h_idx * sg1; -+ -+ const float beta_val = *beta_t; -+ const float g_val = expf(*g_t); -+ -+ // kv[col] = (S^T @ k)[col] = sum_i S[i][col] * k[i] -+ float kv_col = 0.0f; -+#pragma unroll -+ for (int i = 0; i < S_v; i++) { -+ kv_col += s[i] * k_t[i]; -+ } -+ -+ // delta[col] = (v[col] - g * kv[col]) * beta -+ float delta_col = (v_t[col] - g_val * kv_col) * beta_val; -+ -+ // fused: S[i][col] = g * S[i][col] + k[i] * delta[col] -+ // attn[col] = (S^T @ q)[col] = sum_i S[i][col] * q[i] -+ float attn_col = 0.0f; -+#pragma unroll -+ for (int i = 0; i < S_v; i++) { -+ s[i] = g_val * s[i] + k_t[i] * delta_col; -+ attn_col += s[i] * q_t[i]; -+ } -+ -+ attn_data[col] = attn_col * scale; -+ attn_data += S_v * H; -+ } -+ -+ // Write state back to global memory -+#pragma unroll -+ for (int i = 0; i < S_v; i++) { -+ state[i * S_v + col] = s[i]; -+ } -+} -+ -+void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor * dst) { -+ ggml_tensor * src_q = dst->src[0]; -+ ggml_tensor * src_k = dst->src[1]; -+ ggml_tensor * src_v = dst->src[2]; -+ ggml_tensor * src_g = dst->src[3]; -+ ggml_tensor * src_beta = dst->src[4]; -+ ggml_tensor * src_state = dst->src[5]; -+ -+ GGML_TENSOR_LOCALS(int64_t, neq, src_q, ne); -+ GGML_TENSOR_LOCALS(size_t, nbq, src_q, nb); -+ GGML_TENSOR_LOCALS(int64_t, nev, src_v, ne); -+ GGML_TENSOR_LOCALS(size_t, nbv, src_v, nb); -+ GGML_TENSOR_LOCALS(size_t, nbg, src_g, nb); -+ -+ const int64_t S_v = nev0; -+ const int64_t H = nev1; -+ const int64_t n_tokens = nev2; -+ const int64_t n_seqs = nev3; -+ -+ const int64_t rq1 = nev1 / neq1; -+ const int64_t rq3 = nev3 / neq3; -+ -+ const float * q_d = (const float *) src_q->data; -+ const float * k_d = (const float *) src_k->data; -+ const float * v_d = (const float *) 
src_v->data; -+ const float * g_d = (const float *) src_g->data; -+ const float * b_d = (const float *) src_beta->data; -+ -+ const float * s_d = (const float *) src_state->data; -+ float * dst_d = (float *) dst->data; -+ -+ GGML_ASSERT(ggml_is_contiguous_rows(src_q)); -+ GGML_ASSERT(ggml_is_contiguous_rows(src_k)); -+ GGML_ASSERT(ggml_is_contiguous_rows(src_v)); -+ GGML_ASSERT(ggml_are_same_stride(src_q, src_k)); -+ GGML_ASSERT(ggml_are_same_stride(src_g, src_beta)); -+ GGML_ASSERT(ggml_is_contiguous(src_g)); -+ GGML_ASSERT(ggml_is_contiguous(src_beta)); -+ GGML_ASSERT(ggml_is_contiguous(src_state)); -+ -+ // strides in floats -+ const int64_t sq1 = nbq1 / sizeof(float); -+ const int64_t sq2 = nbq2 / sizeof(float); -+ const int64_t sq3 = nbq3 / sizeof(float); -+ const int64_t sv1 = nbv1 / sizeof(float); -+ const int64_t sv2 = nbv2 / sizeof(float); -+ const int64_t sv3 = nbv3 / sizeof(float); -+ const int64_t sg1 = nbg1 / sizeof(float); -+ const int64_t sg2 = nbg2 / sizeof(float); -+ const int64_t sg3 = nbg3 / sizeof(float); -+ -+ const float scale = 1.0f / sqrtf((float) S_v); -+ -+ dim3 grid_dims(H, n_seqs, 1); -+ dim3 block_dims(S_v, 1, 1); -+ -+ cudaStream_t stream = ctx.stream(); -+ -+ switch (S_v) { -+ case 32: -+ gated_delta_net_cuda<32><<<grid_dims, block_dims, 0, stream>>>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, -+ n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, -+ sv3, sg1, sg2, sg3, rq1, rq3, scale); -+ break; -+ case 64: -+ gated_delta_net_cuda<64><<<grid_dims, block_dims, 0, stream>>>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, -+ n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, -+ sv3, sg1, sg2, sg3, rq1, rq3, scale); -+ break; -+ case 128: -+ gated_delta_net_cuda<128><<<grid_dims, block_dims, 0, stream>>>(q_d, k_d, v_d, g_d, b_d, s_d, dst_d, H, -+ n_tokens, n_seqs, sq1, sq2, sq3, sv1, sv2, -+ sv3, sg1, sg2, sg3, rq1, rq3, scale); -+ break; -+ default: -+ GGML_ABORT("fatal error"); -+ break; -+ } -+} ---- /dev/null -+++ src/ggml-cuda/gated_delta_net.cuh -@@ -0,0 +1,4 
@@ -+#include "common.cuh" -+#include "ggml.h" -+ -+void ggml_cuda_op_gated_delta_net(ggml_backend_cuda_context & ctx, ggml_tensor * dst); ---- src/ggml-cuda/ggml-cuda.cu -+++ src/ggml-cuda/ggml-cuda.cu -@@ -53,6 +53,7 @@ - #include "ggml-cuda/upscale.cuh" - #include "ggml-cuda/wkv.cuh" - #include "ggml-cuda/gla.cuh" -+#include "ggml-cuda/gated_delta_net.cuh" - #include "ggml-cuda/set.cuh" - #include "ggml-cuda/set-rows.cuh" - #include "ggml-cuda/pad_reflect_1d.cuh" -@@ -2733,6 +2734,9 @@ static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct gg - case GGML_OP_GATED_LINEAR_ATTN: - ggml_cuda_op_gated_linear_attn(ctx, dst); - break; -+ case GGML_OP_GATED_DELTA_NET: -+ ggml_cuda_op_gated_delta_net(ctx, dst); -+ break; - case GGML_OP_RWKV_WKV7: - ggml_cuda_op_rwkv_wkv7(ctx, dst); - break; -@@ -4972,6 +4976,7 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g - case GGML_OP_LEAKY_RELU: - case GGML_OP_RWKV_WKV6: - case GGML_OP_GATED_LINEAR_ATTN: -+ case GGML_OP_GATED_DELTA_NET: - case GGML_OP_RWKV_WKV7: - return true; - case GGML_OP_FLASH_ATTN_EXT: ---- src/ggml.c -+++ src/ggml.c -@@ -1031,6 +1031,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "GATED_LINEAR_ATTN", - "RWKV_WKV7", - "SOLVE_TRI", -+ "GATED_DELTA_NET", - - "UNARY", - -@@ -1048,7 +1049,7 @@ static const char * GGML_OP_NAME[GGML_OP_COUNT] = { - "GLU", - }; - --static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 95"); -+static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); - - static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "none", -@@ -1140,6 +1141,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "gated_linear_attn(k, v, q, gate, s)", - "rwkv_wkv7(r, w, k, v, a, b, s)", - "A X = B, A triangular, solve X", -+ "gated_delta_net(q, k, v, g, beta, s)", - - "unary(x)", - -@@ -1157,7 +1159,7 @@ static const char * GGML_OP_SYMBOL[GGML_OP_COUNT] = { - "glu(x)", - }; - --static_assert(GGML_OP_COUNT == 95, "GGML_OP_COUNT != 
95"); -+static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT != 96"); - - static_assert(GGML_OP_POOL_COUNT == 2, "GGML_OP_POOL_COUNT != 2"); - -@@ -6124,6 +6126,53 @@ struct ggml_tensor * ggml_solve_tri( - return result; - } - -+// ggml_gated_delta_net -+ -+struct ggml_tensor * ggml_gated_delta_net( -+ struct ggml_context * ctx, -+ struct ggml_tensor * q, -+ struct ggml_tensor * k, -+ struct ggml_tensor * v, -+ struct ggml_tensor * g, -+ struct ggml_tensor * beta, -+ struct ggml_tensor * state) { -+ GGML_ASSERT(ggml_is_contiguous_rows(q)); -+ GGML_ASSERT(ggml_is_contiguous_rows(k)); -+ GGML_ASSERT(ggml_is_contiguous_rows(v)); -+ GGML_ASSERT(ggml_is_contiguous(g)); -+ GGML_ASSERT(ggml_is_contiguous(beta)); -+ GGML_ASSERT(ggml_is_contiguous(state)); -+ -+ GGML_ASSERT(q->type == GGML_TYPE_F32); -+ GGML_ASSERT(k->type == GGML_TYPE_F32); -+ GGML_ASSERT(v->type == GGML_TYPE_F32); -+ GGML_ASSERT(g->type == GGML_TYPE_F32); -+ GGML_ASSERT(beta->type == GGML_TYPE_F32); -+ GGML_ASSERT(state->type == GGML_TYPE_F32); -+ -+ const int64_t S_v = v->ne[0]; -+ const int64_t H = v->ne[1]; -+ const int64_t n_tokens = v->ne[2]; -+ const int64_t n_seqs = v->ne[3]; -+ -+ GGML_ASSERT(ggml_nelements(state) == S_v * S_v * H * n_seqs); -+ -+ // concat output and new_state into a single tensor -+ // output: S_v * H * n_tokens * n_seqs, state: S_v * S_v * H * n_seqs -+ const int64_t ne[4] = { S_v * H, n_tokens * n_seqs + S_v * n_seqs, 1, 1 }; -+ struct ggml_tensor * result = ggml_new_tensor(ctx, GGML_TYPE_F32, 4, ne); -+ -+ result->op = GGML_OP_GATED_DELTA_NET; -+ result->src[0] = q; -+ result->src[1] = k; -+ result->src[2] = v; -+ result->src[3] = g; -+ result->src[4] = beta; -+ result->src[5] = state; -+ -+ return result; -+} -+ - //////////////////////////////////////////////////////////////////////////////// - - struct ggml_hash_set ggml_hash_set_new(size_t size) { diff --git a/misc/ggml/pkg-plist b/misc/ggml/pkg-plist index 4df10cd80ecb..8c9cbaeae863 100644 --- a/misc/ggml/pkg-plist +++ 
b/misc/ggml/pkg-plist @@ -6,6 +6,7 @@ include/ggml-cpp.h include/ggml-cpu.h include/ggml-cuda.h include/ggml-metal.h +include/ggml-openvino.h include/ggml-opt.h include/ggml-rpc.h include/ggml-sycl.h @@ -19,14 +20,14 @@ lib/cmake/ggml/ggml-config.cmake lib/cmake/ggml/ggml-version.cmake lib/libggml-base.so lib/libggml-base.so.0 -lib/libggml-base.so.0.9.7 +lib/libggml-base.so.%%DISTVERSION%% lib/libggml-cpu.so lib/libggml-cpu.so.0 -lib/libggml-cpu.so.0.9.7 +lib/libggml-cpu.so.%%DISTVERSION%% %%VULKAN%%lib/libggml-vulkan.so %%VULKAN%%lib/libggml-vulkan.so.0 -%%VULKAN%%lib/libggml-vulkan.so.0.9.7 +%%VULKAN%%lib/libggml-vulkan.so.%%DISTVERSION%% lib/libggml.so lib/libggml.so.0 -lib/libggml.so.0.9.7 +lib/libggml.so.%%DISTVERSION%% share/pkgconfig/ggml.pc
home | help
Want to link to this message? Use this
URL: <https://mail-archive.FreeBSD.org/cgi/mid.cgi?69c1b28b.2114e.6da73a3d>
