diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.glsl new file mode 100644 index 00000000000..50827118a12 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.glsl @@ -0,0 +1,261 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * KHR Cooperative Matrix linear shader for prepacked weights. + * Drop-in replacement for linear_vec when storage=buffer and device + * supports GL_KHR_cooperative_matrix. + * + * Computes: D = A * W_packed (A: [M, K], W_packed: 4OC x 4IC blocked, D: [M, N]) + * + * Weight is prepacked by pack_fp_linear_weight into a 4OC x 4IC blocked layout: + * t_weight_packed[(k4 * N4 + n4) * 4 + dk] = vec4(w[k4*4+dk][n4*4+0..3]) + * + * fp16xfp16->fp32 MMA. When DTYPE=half, inputs are native fp16 (no + * conversion, half the bandwidth). When DTYPE=float, inputs are fp32 + * with on-the-fly packHalf2x16 conversion. + * + * Output is always fp32 (fp32 accumulator -> fp32 store) when DTYPE=float, + * or fp16 when DTYPE=half. + * + * Optional bias: when HAS_BIAS is defined, bias is added post-store via + * read-modify-write on the output buffer (one pass over the tile). 
+ */ + +#version 450 core + +#extension GL_KHR_cooperative_matrix : require +#extension GL_KHR_memory_scope_semantics : require +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_control_flow_attributes : enable + +#define PRECISION ${PRECISION} + +$if DTYPE == "half": + #define IS_FP16_INPUT +$if DTYPE == "float": + #define IS_FP32_INPUT + +$if HAS_BIAS: + #define HAS_BIAS + +layout(std430) buffer; + +#include "common.glslh" + +// Bindings: output(0), mat1(1), weight_packed(2), [bias(3)] +$if HAS_BIAS: + ${layout_declare_tensor(B, "rw", "t_output", DTYPE, "buffer", is_scalar_array=True)} +$else: + ${layout_declare_tensor(B, "w", "t_output", DTYPE, "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_mat1", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, "buffer", is_scalar_array=False)} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=True)} + +// UBOs +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Tile dimensions (same as matmul_coopmat) +const uint lM = 16; +const uint lN = 16; +const uint lK = 16; +const uint TILE_M = 64; +const uint TILE_N = 64; +const uint TILE_K = 32; + +// Workgroup: 4 subgroups in 2x2 grid, 64 threads each = 256 total +const uint WG_WIDTH = 2; +const uint WG_HEIGHT = 2; +const uint NUM_SUBGROUPS = 4; +const uint INVOCATIONS = 64 * NUM_SUBGROUPS; + +// Result tiles per subgroup: 2x2 +const uint C_ROWS = TILE_M / WG_HEIGHT / lM; // 2 +const uint C_COLS = TILE_N / WG_WIDTH / lN; // 2 + +// fp16: 8 elements per uvec4 (128-bit) +const uint FP16_PER_VEC4 = 8; + +// Shared memory with skew padding +const uint A_STRIDE_VEC4 = (TILE_K + FP16_PER_VEC4) / 
FP16_PER_VEC4; // 5 +const uint B_STRIDE_VEC4 = (TILE_N + FP16_PER_VEC4) / FP16_PER_VEC4; // 9 + +shared uvec4 Ash[TILE_M * A_STRIDE_VEC4]; // 5KB +shared uvec4 Bsh[TILE_K * B_STRIDE_VEC4]; // 4.5KB + +// Accumulator tiles (fp32) +coopmat result[C_ROWS][C_COLS]; + +#ifdef IS_FP32_INPUT +uvec2 f32x4_to_f16x4(vec4 v) { + return uvec2(packHalf2x16(v.xy), packHalf2x16(v.zw)); +} +#endif + +void main() { + const uvec2 tileID = uvec2(gl_WorkGroupID.xy); + const uvec2 warpInTile = uvec2( + gl_SubgroupID % WG_WIDTH, + gl_SubgroupID / WG_WIDTH); + + const uint K = uint(mat1_sizes.x); + const uint M = uint(mat1_sizes.y); + const uint N = uint(out_sizes.x); + const uint K4 = (K + 3u) / 4u; + const uint N4 = (N + 3u) / 4u; + + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + result[i][j] = coopmat(0.0); + } + } + + // Thread assignment for A tile (64 rows x 4 uvec4/row = single pass) + const uint INVS_PER_ROW_A = TILE_K / FP16_PER_VEC4; // 4 + const uint a_col = gl_LocalInvocationID.x % INVS_PER_ROW_A; + const uint a_row_offset = gl_LocalInvocationID.x / INVS_PER_ROW_A; + + // Thread assignment for B tile (32 rows x 8 uvec4/row = single pass) + const uint INVS_PER_ROW_B = TILE_N / FP16_PER_VEC4; // 8 + const uint b_col = gl_LocalInvocationID.x % INVS_PER_ROW_B; + const uint b_row_offset = gl_LocalInvocationID.x / INVS_PER_ROW_B; + + const uint a_row_base = TILE_M * tileID.y; + const uint b_col_base = TILE_N * tileID.x; + + for (uint chunkK = 0; chunkK < K; chunkK += TILE_K) { + + // --- Load A tile -> shared (same as matmul_coopmat) --- + { + uint row = a_row_base + a_row_offset; + uint k_elem = chunkK + a_col * FP16_PER_VEC4; + +#ifdef IS_FP16_INPUT + uint k_hv4 = k_elem / 4; + f16vec4 v0 = t_mat1[row * K4 + k_hv4]; + f16vec4 v1 = t_mat1[row * K4 + k_hv4 + 1]; + Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4( + packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)), + packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw))); 
+#else + uint k_vec4 = k_elem / 4; + vec4 v0 = t_mat1[row * K4 + k_vec4]; + vec4 v1 = t_mat1[row * K4 + k_vec4 + 1]; + uvec2 h0 = f32x4_to_f16x4(v0); + uvec2 h1 = f32x4_to_f16x4(v1); + Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4(h0, h1); +#endif + } + + // --- Load B tile from packed weight -> shared --- + // Packed weight format: t_weight_packed[(k4 * N4 + n4) * 4 + dk] + // returns vec4 of 4 N-elements at K-row (k4*4+dk). + // Load two vec4s to get 8 consecutive N-elements = one uvec4 in Bsh. + { + uint k_row = chunkK + b_row_offset; + uint k4 = k_row >> 2u; + uint dk = k_row & 3u; + uint n_elem = b_col_base + b_col * FP16_PER_VEC4; + uint n4_0 = n_elem >> 2u; + +#ifdef IS_FP16_INPUT + f16vec4 v0 = t_weight_packed[(k4 * N4 + n4_0) * 4u + dk]; + f16vec4 v1 = t_weight_packed[(k4 * N4 + n4_0 + 1u) * 4u + dk]; + Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4( + packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)), + packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw))); +#else + vec4 v0 = t_weight_packed[(k4 * N4 + n4_0) * 4u + dk]; + vec4 v1 = t_weight_packed[(k4 * N4 + n4_0 + 1u) * 4u + dk]; + uvec2 h0 = f32x4_to_f16x4(v0); + uvec2 h1 = f32x4_to_f16x4(v1); + Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4(h0, h1); +#endif + } + + barrier(); + + // --- Cooperative matrix MMA --- + [[unroll]] for (uint k = 0; k < TILE_K / lK; ++k) { + uint k_start = lK * k; + + coopmat matA[C_ROWS]; + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + uint row_a = lM * (C_ROWS * warpInTile.y + i); + coopMatLoad( + matA[i], Ash, + row_a * A_STRIDE_VEC4 + k_start / FP16_PER_VEC4, + A_STRIDE_VEC4, + gl_CooperativeMatrixLayoutRowMajor); + } + + coopmat matB; + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + uint col_b = lN * (C_COLS * warpInTile.x + j) / FP16_PER_VEC4; + coopMatLoad( + matB, Bsh, + k_start * B_STRIDE_VEC4 + col_b, + B_STRIDE_VEC4, + gl_CooperativeMatrixLayoutRowMajor); + + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + result[i][j] = coopMatMulAdd(matA[i], 
matB, result[i][j]); + } + } + } + + barrier(); + } + + // --- Store result --- + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + uint gi = TILE_M * tileID.y + lM * (C_ROWS * warpInTile.y + i); + uint gj = TILE_N * tileID.x + lN * (C_COLS * warpInTile.x + j); +#ifdef IS_FP16_INPUT + coopmat out_tile = + coopmat(result[i][j]); + coopMatStore( + out_tile, t_output, + gi * N + gj, N, + gl_CooperativeMatrixLayoutRowMajor); +#else + coopMatStore( + result[i][j], t_output, + gi * N + gj, N, + gl_CooperativeMatrixLayoutRowMajor); +#endif + } + } + +#ifdef HAS_BIAS + // Add bias via read-modify-write on the output buffer. + // barrier() ensures all coopMatStore writes within this workgroup are visible. + barrier(); + + const uint tile_m_start = TILE_M * tileID.y; + const uint tile_n_start = TILE_N * tileID.x; + // 64x64 tile = 4096 elements, 256 threads -> 16 elements per thread + for (uint idx = gl_LocalInvocationID.x; idx < TILE_M * TILE_N; idx += INVOCATIONS) { + uint local_m = idx / TILE_N; + uint local_n = idx % TILE_N; + uint gm = tile_m_start + local_m; + uint gn = tile_n_start + local_n; + if (gm < M && gn < N) { + uint out_idx = gm * N + gn; + t_output[out_idx] = t_output[out_idx] + t_bias[gn]; + } + } +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.yaml new file mode 100644 index 00000000000..669da1bbd26 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.yaml @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +linear_coopmat: + parameter_names_with_default_values: + DTYPE: float + PRECISION: highp + HAS_BIAS: false + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: linear_coopmat + - NAME: linear_coopmat_bias + HAS_BIAS: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.glsl new file mode 100644 index 00000000000..6a6ac6eb965 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.glsl @@ -0,0 +1,227 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * KHR Cooperative Matrix matmul following matmul_vec conventions. + * Drop-in replacement for matmul_vec when storage=buffer and device + * supports GL_KHR_cooperative_matrix. + * + * Computes: D = A * B (A: [M, K], B: [K, N], D: [M, N]) + * + * fp16×fp16→fp32 MMA. When DTYPE=half, inputs are native fp16 (no + * conversion, half the bandwidth). When DTYPE=float, inputs are fp32 + * with on-the-fly packHalf2x16 conversion. + * + * Output is always fp32 (fp32 accumulator → fp32 store) when DTYPE=float, + * or fp16 when DTYPE=half. + */ + +#version 450 core + +#extension GL_KHR_cooperative_matrix : require +#extension GL_KHR_memory_scope_semantics : require +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_control_flow_attributes : enable + +#define PRECISION ${PRECISION} + +$if DTYPE == "half": + #define IS_FP16_INPUT +$if DTYPE == "float": + #define IS_FP32_INPUT + +layout(std430) buffer; + +#include "common.glslh" + +// Bindings — same order as matmul_vec: output(0), mat1(1), mat2(2) +// Output uses same dtype as input. 
Inputs use vec4 (non-scalar) for wide loads. +${layout_declare_tensor(B, "w", "t_output", DTYPE, "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_mat1", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_mat2", DTYPE, "buffer", is_scalar_array=False)} + +// UBOs — same as matmul_vec +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Tile dimensions +const uint lM = 16; +const uint lN = 16; +const uint lK = 16; +const uint TILE_M = 64; +const uint TILE_N = 64; +const uint TILE_K = 32; + +// Workgroup: 4 subgroups in 2×2 grid, 64 threads each = 256 total +const uint WG_WIDTH = 2; +const uint WG_HEIGHT = 2; +const uint NUM_SUBGROUPS = 4; +const uint INVOCATIONS = 64 * NUM_SUBGROUPS; + +// Result tiles per subgroup: 2×2 +const uint C_ROWS = TILE_M / WG_HEIGHT / lM; // 2 +const uint C_COLS = TILE_N / WG_WIDTH / lN; // 2 + +// fp16: 8 elements per uvec4 (128-bit) +const uint FP16_PER_VEC4 = 8; + +// Shared memory with skew padding +const uint A_STRIDE_VEC4 = (TILE_K + FP16_PER_VEC4) / FP16_PER_VEC4; // 5 +const uint B_STRIDE_VEC4 = (TILE_N + FP16_PER_VEC4) / FP16_PER_VEC4; // 9 + +shared uvec4 Ash[TILE_M * A_STRIDE_VEC4]; // 5KB +shared uvec4 Bsh[TILE_K * B_STRIDE_VEC4]; // 4.5KB + +// Accumulator tiles (fp32) +coopmat result[C_ROWS][C_COLS]; + +#ifdef IS_FP32_INPUT +uvec2 f32x4_to_f16x4(vec4 v) { + return uvec2(packHalf2x16(v.xy), packHalf2x16(v.zw)); +} +#endif + +void main() { + const uvec2 tileID = uvec2(gl_WorkGroupID.xy); + const uvec2 warpInTile = uvec2( + gl_SubgroupID % WG_WIDTH, + gl_SubgroupID / WG_WIDTH); + + const uint K = uint(mat1_sizes.x); + const uint M = uint(mat1_sizes.y); + const uint N = uint(mat2_sizes.x); + const uint K4 = (K + 3u) / 4u; + const uint N4 = (N + 3u) / 4u; + + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + 
result[i][j] = coopmat(0.0); + } + } + + // Thread assignment for A tile (64 rows × 4 uvec4/row = single pass) + const uint INVS_PER_ROW_A = TILE_K / FP16_PER_VEC4; // 4 + const uint a_col = gl_LocalInvocationID.x % INVS_PER_ROW_A; + const uint a_row_offset = gl_LocalInvocationID.x / INVS_PER_ROW_A; + + // Thread assignment for B tile (32 rows × 8 uvec4/row = single pass) + const uint INVS_PER_ROW_B = TILE_N / FP16_PER_VEC4; // 8 + const uint b_col = gl_LocalInvocationID.x % INVS_PER_ROW_B; + const uint b_row_offset = gl_LocalInvocationID.x / INVS_PER_ROW_B; + + const uint a_row_base = TILE_M * tileID.y; + const uint b_col_base = TILE_N * tileID.x; + + for (uint chunkK = 0; chunkK < K; chunkK += TILE_K) { + + // --- Load A tile → shared (single pass) --- + { + uint row = a_row_base + a_row_offset; + uint k_elem = chunkK + a_col * FP16_PER_VEC4; + +#ifdef IS_FP16_INPUT + // Native fp16: two f16vec4 loads (64-bit each) → one uvec4 (128-bit) + uint k_hv4 = k_elem / 4; + f16vec4 v0 = t_mat1[row * K4 + k_hv4]; + f16vec4 v1 = t_mat1[row * K4 + k_hv4 + 1]; + Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4( + packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)), + packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw))); +#else + // fp32 inputs: load two vec4 (8 fp32), convert to 8 fp16 + uint k_vec4 = k_elem / 4; + vec4 v0 = t_mat1[row * K4 + k_vec4]; + vec4 v1 = t_mat1[row * K4 + k_vec4 + 1]; + uvec2 h0 = f32x4_to_f16x4(v0); + uvec2 h1 = f32x4_to_f16x4(v1); + Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4(h0, h1); +#endif + } + + // --- Load B tile → shared (single pass) --- + { + uint row = chunkK + b_row_offset; + uint n_elem = b_col_base + b_col * FP16_PER_VEC4; + +#ifdef IS_FP16_INPUT + uint n_hv4 = n_elem / 4; + f16vec4 v0 = t_mat2[row * N4 + n_hv4]; + f16vec4 v1 = t_mat2[row * N4 + n_hv4 + 1]; + Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4( + packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)), + packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw))); +#else 
+ uint n_vec4 = n_elem / 4; + vec4 v0 = t_mat2[row * N4 + n_vec4]; + vec4 v1 = t_mat2[row * N4 + n_vec4 + 1]; + uvec2 h0 = f32x4_to_f16x4(v0); + uvec2 h1 = f32x4_to_f16x4(v1); + Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4(h0, h1); +#endif + } + + barrier(); + + // --- Cooperative matrix MMA --- + [[unroll]] for (uint k = 0; k < TILE_K / lK; ++k) { + uint k_start = lK * k; + + coopmat matA[C_ROWS]; + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + uint row_a = lM * (C_ROWS * warpInTile.y + i); + coopMatLoad( + matA[i], Ash, + row_a * A_STRIDE_VEC4 + k_start / FP16_PER_VEC4, + A_STRIDE_VEC4, + gl_CooperativeMatrixLayoutRowMajor); + } + + coopmat matB; + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + uint col_b = lN * (C_COLS * warpInTile.x + j) / FP16_PER_VEC4; + coopMatLoad( + matB, Bsh, + k_start * B_STRIDE_VEC4 + col_b, + B_STRIDE_VEC4, + gl_CooperativeMatrixLayoutRowMajor); + + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + result[i][j] = coopMatMulAdd(matA[i], matB, result[i][j]); + } + } + } + + barrier(); + } + + // --- Store result --- + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + uint gi = TILE_M * tileID.y + lM * (C_ROWS * warpInTile.y + i); + uint gj = TILE_N * tileID.x + lN * (C_COLS * warpInTile.x + j); +#ifdef IS_FP16_INPUT + // Convert fp32 accumulator to fp16 for fp16 output buffer + coopmat out_tile = + coopmat(result[i][j]); + coopMatStore( + out_tile, t_output, + gi * N + gj, N, + gl_CooperativeMatrixLayoutRowMajor); +#else + coopMatStore( + result[i][j], t_output, + gi * N + gj, N, + gl_CooperativeMatrixLayoutRowMajor); +#endif + } + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.yaml new file mode 100644 index 00000000000..d120892c563 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +matmul_coopmat: + parameter_names_with_default_values: + DTYPE: float + PRECISION: highp + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: matmul_coopmat diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 62266473351..9bcfbad607d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -24,7 +24,8 @@ ValueRef prepack_fp_linear_weight( ComputeGraph& graph, const ValueRef weight_data, bool is_transposed, - int64_t B) { + int64_t B, + bool force_buffer) { std::vector weight_sizes = graph.sizes_of(weight_data); int64_t N, K; @@ -47,12 +48,17 @@ ValueRef prepack_fp_linear_weight( int64_t output_height = B * K4; int64_t output_width = N4 * 4 * 4; - utils::StorageType weight_storage = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - // output_width is in scalars; texture width in texels = output_width / 4 - if (output_width / 4 > max_extent || - static_cast(output_height) > max_extent) { + utils::StorageType weight_storage; + if (force_buffer) { weight_storage = utils::kBuffer; + } else { + weight_storage = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + // output_width is in scalars; texture width in texels = output_width / 4 + if (output_width / 4 > max_extent || + static_cast(output_height) > max_extent) { + weight_storage = utils::kBuffer; + } } ValueRef packed_weight = graph.add_tensor( @@ -233,6 +239,100 @@ void add_linear_tiled_node( resize_linear_node)); } +// ── Cooperative matrix linear ── + +static constexpr uint32_t kLinearCoopMatTileM = 64; +static constexpr uint32_t kLinearCoopMatTileN = 64; +static constexpr uint32_t 
kLinearCoopMatInvocations = 256; // 4 subgroups x 64 + +vkapi::ShaderInfo pick_linear_coopmat_shader( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef out = args.at(0).refs.at(0); + bool has_bias = graph->get_bool(resize_args.at(1)); + std::string kernel_name = + has_bias ? "linear_coopmat_bias" : "linear_coopmat"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, graph->dtype_of(out)); + return VK_KERNEL_FROM_STR(kernel_name); +} + +utils::uvec3 pick_linear_coopmat_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + const auto out_sizes = graph->sizes_of(out); + uint32_t M = out_sizes.at(out_sizes.size() - 2); + uint32_t N = out_sizes.at(out_sizes.size() - 1); + uint32_t num_tiles_n = utils::div_up(N, kLinearCoopMatTileN); + uint32_t num_tiles_m = utils::div_up(M, kLinearCoopMatTileM); + return {num_tiles_n * kLinearCoopMatInvocations, num_tiles_m, 1}; +} + +utils::uvec3 pick_linear_coopmat_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)global_workgroup_size; + (void)args; + (void)resize_args; + return {kLinearCoopMatInvocations, 1, 1}; +} + +void add_linear_coopmat_node( + ComputeGraph& graph, + const ValueRef input, + const ValueRef packed_weight, + const ValueRef packed_bias, + bool has_bias, + const ValueRef out, + int32_t weight_B) { + VK_CHECK_COND(graph.packed_dim_of(input) == WHCN::kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kWidthDim); + VK_CHECK_COND( + graph.storage_type_of(out) == utils::kBuffer, + "linear_coopmat requires buffer storage"); + + std::vector out_sizes = graph.sizes_of(out); + int32_t orig_N = 
utils::safe_downcast(out_sizes.back()); + ValueRef orig_N_ref = graph.add_scalar(static_cast(orig_N)); + ValueRef has_bias_ref = graph.add_scalar(has_bias); + + std::vector read_inputs = {input, packed_weight}; + if (has_bias) { + read_inputs.push_back(packed_bias); + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + pick_linear_coopmat_shader, + pick_linear_coopmat_global_wg_size, + pick_linear_coopmat_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {read_inputs, vkapi::kRead}}, + // Shader params buffers + {graph.sizes_ubo(input), graph.sizes_ubo(out)}, + // Push Constants + {}, + // Specialization Constants + {}, + // Resize Args + {orig_N_ref, has_bias_ref}, + // Resizing Logic + resize_linear_node)); +} + +// ── End cooperative matrix linear ── + void linear_packed_weight( ComputeGraph& graph, const std::vector& args) { @@ -241,18 +341,38 @@ void linear_packed_weight( ValueRef bias = args.at(2); ValueRef out = args.at(3); + bool has_bias = graph.val_is_not_none(bias); + // Coopmat shader assumes M is a multiple of TILE_M (64) because the store + // does not bounds-check. Fall back to the tiled shader otherwise. + // TODO: remove this guard once the coopmat shader gains partial-tile + // bounds checking. + auto input_sizes = graph.sizes_of(input); + int64_t M = input_sizes.size() >= 2 + ? 
input_sizes.at(input_sizes.size() - 2) + : 1; + bool use_coopmat = + graph.context()->adapter_ptr()->supports_cooperative_matrix() && + graph.storage_type_of(out) == utils::kBuffer && + M >= 64; + ValueRef packed_weight = prepack_fp_linear_weight( - graph, weight_data, /*is_transposed=*/true, /*B=*/1); + graph, weight_data, /*is_transposed=*/true, /*B=*/1, + /*force_buffer=*/use_coopmat); ValueRef packed_bias = kDummyValueRef; - bool has_bias = graph.val_is_not_none(bias); if (has_bias) { packed_bias = prepack_standard( - graph, bias, graph.storage_type_of(out), utils::kWidthPacked); + graph, bias, graph.storage_type_of(out), utils::kWidthPacked, + /*passthrough=*/use_coopmat); } - add_linear_tiled_node( - graph, input, packed_weight, packed_bias, has_bias, out); + if (use_coopmat) { + add_linear_coopmat_node( + graph, input, packed_weight, packed_bias, has_bias, out); + } else { + add_linear_tiled_node( + graph, input, packed_weight, packed_bias, has_bias, out); + } } REGISTER_OPERATORS { diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.h b/backends/vulkan/runtime/graph/ops/impl/Linear.h index d7efb8c8b08..c829487fc3c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.h +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.h @@ -16,7 +16,8 @@ ValueRef prepack_fp_linear_weight( ComputeGraph& graph, const ValueRef weight_data, bool is_transposed, - int64_t B); + int64_t B, + bool force_buffer = false); void add_linear_tiled_node( ComputeGraph& graph, @@ -29,4 +30,13 @@ void add_linear_tiled_node( float alpha = 1.0f, float beta = 1.0f); +void add_linear_coopmat_node( + ComputeGraph& graph, + const ValueRef input, + const ValueRef packed_weight, + const ValueRef packed_bias, + bool has_bias, + const ValueRef out, + int32_t weight_B = 1); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Matmul.cpp b/backends/vulkan/runtime/graph/ops/impl/Matmul.cpp index 53bb8d82e12..dad7d5c5f8f 100644 --- 
a/backends/vulkan/runtime/graph/ops/impl/Matmul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Matmul.cpp @@ -19,6 +19,95 @@ namespace vkcompute { +// Forward declaration +void resize_matmul_tiled_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args); + +// ── Cooperative matrix tile configuration (must match matmul_coopmat.glsl) ── + +static constexpr uint32_t kCoopMatTileM = 64; +static constexpr uint32_t kCoopMatTileN = 64; +static constexpr uint32_t kCoopMatInvocations = 256; // 4 subgroups × 64 + +vkapi::ShaderInfo pick_matmul_coopmat_shader( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + std::string kernel_name = "matmul_coopmat"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, graph->dtype_of(out)); + return VK_KERNEL_FROM_STR(kernel_name); +} + +utils::uvec3 pick_matmul_coopmat_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + const auto out_sizes = graph->sizes_of(out); + uint32_t M = out_sizes.at(out_sizes.size() - 2); + uint32_t N = out_sizes.at(out_sizes.size() - 1); + uint32_t num_tiles_n = utils::div_up(N, kCoopMatTileN); + uint32_t num_tiles_m = utils::div_up(M, kCoopMatTileM); + return {num_tiles_n * kCoopMatInvocations, num_tiles_m, 1}; +} + +utils::uvec3 pick_matmul_coopmat_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)global_workgroup_size; + (void)args; + (void)resize_args; + return {kCoopMatInvocations, 1, 1}; +} + +void add_matmul_coopmat_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2, + const ValueRef out) 
{ + VK_CHECK_COND(graph.packed_dim_of(mat1) == WHCN::kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(mat2) == WHCN::kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kWidthDim); + VK_CHECK_COND( + graph.storage_type_of(out) == utils::kBuffer, + "matmul_coopmat requires buffer storage"); + + ValueRef has_bias_ref = graph.add_scalar(false); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + pick_matmul_coopmat_shader, + pick_matmul_coopmat_global_wg_size, + pick_matmul_coopmat_local_wg_size, + // Inputs and Outputs — same binding order as matmul_vec + {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}}, + // Shader params buffers — same UBOs as matmul_vec + {graph.sizes_ubo(mat1), graph.sizes_ubo(mat2)}, + // Push Constants + {}, + // Specialization Constants (tile config hardcoded in shader) + {}, + // Resize Args + {has_bias_ref}, + // Resizing Logic + resize_matmul_tiled_node)); +} + +// ── End cooperative matrix section ── + void resize_matmul_tiled_node( ComputeGraph* graph, const std::vector& args, @@ -189,16 +278,30 @@ void matmul_tiled(ComputeGraph& graph, const std::vector& args) { if (graph.val_is_tref(mat2)) { auto mat2_sizes = graph.sizes_of(mat2); int64_t B = mat2_sizes.size() >= 3 ? 
mat2_sizes.at(0) : 1; - ValueRef packed = - prepack_fp_linear_weight(graph, mat2, /*is_transposed=*/false, B); - add_linear_tiled_node( - graph, - mat1, - packed, - kDummyValueRef, - false, - out, - utils::safe_downcast(B)); + bool use_coopmat = + graph.context()->adapter_ptr()->supports_cooperative_matrix() && + graph.storage_type_of(out) == utils::kBuffer; + ValueRef packed = prepack_fp_linear_weight( + graph, mat2, /*is_transposed=*/false, B, + /*force_buffer=*/use_coopmat); + if (use_coopmat) { + add_linear_coopmat_node( + graph, mat1, packed, kDummyValueRef, false, out, + utils::safe_downcast(B)); + } else { + add_linear_tiled_node( + graph, + mat1, + packed, + kDummyValueRef, + false, + out, + utils::safe_downcast(B)); + } + } else if ( + graph.context()->adapter_ptr()->supports_cooperative_matrix() && + graph.storage_type_of(out) == utils::kBuffer) { + add_matmul_coopmat_node(graph, mat1, mat2, out); } else { add_matmul_tiled_node(graph, mat1, mat2, out); } diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h index 3c503deab70..7a4aa94560d 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ b/backends/vulkan/runtime/vk_api/Adapter.h @@ -252,6 +252,15 @@ class Adapter final { #endif /* VK_NV_cooperative_matrix2 */ } + inline bool supports_cooperative_matrix() { +#ifdef VK_KHR_cooperative_matrix + return physical_device_.cooperative_matrix_features.cooperativeMatrix == + VK_TRUE; +#else + return false; +#endif /* VK_KHR_cooperative_matrix */ + } + inline bool supports_int16_shader_types() { #ifdef ETVK_FORCE_NO_EXTENSIONS return false; diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index d17ab94d194..c8fd9e41f2e 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -48,8 +48,10 @@ if(TARGET vulkan_backend) # Prototyping utility files set(PROTOTYPING_UTILS_HEADERS 
${CMAKE_CURRENT_SOURCE_DIR}) - set(PROTOTYPING_UTILS_CPP ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/conv2d_utils.cpp + set(PROTOTYPING_UTILS_CPP + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/conv2d_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cm_utils.cpp ) # Prototyping shaders @@ -103,4 +105,6 @@ if(TARGET vulkan_backend) add_operator_prototype(test_q8ta_conv2d) add_operator_prototype(test_q8ta_conv2d_pw) add_operator_prototype(test_q8ta_conv2d_dw) + add_operator_prototype(matmul_coopmat_bench) + add_operator_prototype(linear_coopmat_bench) endif() diff --git a/backends/vulkan/test/custom_ops/cm_utils.cpp b/backends/vulkan/test/custom_ops/cm_utils.cpp new file mode 100644 index 00000000000..e55e58a30be --- /dev/null +++ b/backends/vulkan/test/custom_ops/cm_utils.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "cm_utils.h" + +#include +#include + +#include +#include +#include +#include + +namespace executorch { +namespace vulkan { +namespace prototyping { + +static std::string componentTypeToString(VkComponentTypeKHR type) { + switch (type) { + case VK_COMPONENT_TYPE_FLOAT16_KHR: + return "float16"; + case VK_COMPONENT_TYPE_FLOAT32_KHR: + return "float32"; + case VK_COMPONENT_TYPE_FLOAT64_KHR: + return "float64"; + case VK_COMPONENT_TYPE_SINT8_KHR: + return "int8"; + case VK_COMPONENT_TYPE_SINT16_KHR: + return "int16"; + case VK_COMPONENT_TYPE_SINT32_KHR: + return "int32"; + case VK_COMPONENT_TYPE_SINT64_KHR: + return "int64"; + case VK_COMPONENT_TYPE_UINT8_KHR: + return "uint8"; + case VK_COMPONENT_TYPE_UINT16_KHR: + return "uint16"; + case VK_COMPONENT_TYPE_UINT32_KHR: + return "uint32"; + case VK_COMPONENT_TYPE_UINT64_KHR: + return "uint64"; + default: + return "unknown(" + std::to_string(static_cast(type)) + ")"; + } +} + +static std::string scopeToString(VkScopeKHR scope) { + switch (scope) { + case VK_SCOPE_DEVICE_KHR: + return "Device"; + case VK_SCOPE_WORKGROUP_KHR: + return "Workgroup"; + case VK_SCOPE_SUBGROUP_KHR: + return "Subgroup"; + case VK_SCOPE_QUEUE_FAMILY_KHR: + return "QueueFamily"; + default: + return "unknown(" + std::to_string(static_cast(scope)) + ")"; + } +} + +void queryCooperativeMatrixProperties() { +#ifdef VK_KHR_cooperative_matrix + auto* adapter = vkcompute::api::context()->adapter_ptr(); + VkPhysicalDevice physicalDevice = adapter->physical_handle(); + + if (!adapter->supports_cooperative_matrix()) { + std::cout << "VK_KHR_cooperative_matrix is NOT supported on this device." + << std::endl; + return; + } + + std::cout << "\n=== Cooperative Matrix Properties (KHR) ===" << std::endl; + + uint32_t count = 0; + VkResult result = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR( + physicalDevice, &count, nullptr); + + if (result != VK_SUCCESS || count == 0) { + std::cout << "No cooperative matrix configurations found." 
<< std::endl; + return; + } + + std::vector properties(count); + for (auto& prop : properties) { + prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; + prop.pNext = nullptr; + } + + result = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR( + physicalDevice, &count, properties.data()); + + if (result != VK_SUCCESS) { + std::cerr << "Failed to query cooperative matrix properties." << std::endl; + return; + } + + std::cout << "Found " << count << " cooperative matrix configurations:\n" + << std::endl; + + std::cout << std::left << std::setw(5) << "#" << std::setw(10) << "M" + << std::setw(10) << "N" << std::setw(10) << "K" << std::setw(12) + << "AType" << std::setw(12) << "BType" << std::setw(12) << "CType" + << std::setw(12) << "ResultType" << std::setw(12) << "Scope" + << std::endl; + + std::cout << std::string(95, '-') << std::endl; + + for (uint32_t i = 0; i < count; ++i) { + const auto& p = properties[i]; + std::cout << std::left << std::setw(5) << i << std::setw(10) << p.MSize + << std::setw(10) << p.NSize << std::setw(10) << p.KSize + << std::setw(12) << componentTypeToString(p.AType) + << std::setw(12) << componentTypeToString(p.BType) + << std::setw(12) << componentTypeToString(p.CType) + << std::setw(12) << componentTypeToString(p.ResultType) + << std::setw(12) << scopeToString(p.scope) << std::endl; + } + + std::cout << std::endl; + +#else + std::cout << "VK_KHR_cooperative_matrix not available at compile time." + << std::endl; +#endif +} + +} // namespace prototyping +} // namespace vulkan +} // namespace executorch diff --git a/backends/vulkan/test/custom_ops/cm_utils.h b/backends/vulkan/test/custom_ops/cm_utils.h new file mode 100644 index 00000000000..1c6c06bdac8 --- /dev/null +++ b/backends/vulkan/test/custom_ops/cm_utils.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

namespace executorch {
namespace vulkan {
namespace prototyping {

/// Queries the active device for VK_KHR_cooperative_matrix support and
/// prints every advertised configuration: supported M/N/K tile sizes,
/// component types, and scopes. Prints a diagnostic instead when the
/// extension is unavailable.
void queryCooperativeMatrixProperties();

} // namespace prototyping
} // namespace vulkan
} // namespace executorch
+// When mat2 is constant (set_constant(true)), aten.mm prepacks the weight +// and routes through the linear path: +// - texture3d output -> linear_vec (Stephen's tiled shader) +// - buffer output + coop mat device -> linear_coopmat (KHR cooperative matrix) +// +// For each matrix size, runs two variants: +// vec_tex: mat1=tex3d, mat2=tex3d(constant), out=tex3d -> linear_vec +// cm_fp32: mat1=buf, mat2=buf(constant), out=buf -> linear_coopmat + +#include +#include + +#include +#include +#include +#include + +#include "cm_utils.h" +#include "utils.h" + +using namespace executorch::vulkan::prototyping; + +std::vector generate_test_cases() { + std::vector test_cases; + + struct LinearConfig { + int64_t M, K, N; + std::string name; + }; + + std::vector configs = { + // BERT-like shapes + {256, 768, 3072, "BERT_FFN_up"}, + {256, 3072, 768, "BERT_FFN_down"}, + {128, 768, 768, "BERT_QKV"}, + // LLM-like shapes (single token) + {1, 4096, 4096, "LLM_QKV_1tok"}, + {1, 4096, 11008, "LLM_FFN_up_1tok"}, + {1, 11008, 4096, "LLM_FFN_down_1tok"}, + // LLM-like shapes (batch) + {32, 4096, 4096, "LLM_QKV_32tok"}, + {32, 4096, 11008, "LLM_FFN_up_32tok"}, + // Square stress + {256, 1024, 1024, "sq_1024"}, + {256, 4096, 4096, "sq_4096"}, + }; + + // Variant 1: linear_vec texture3d (baseline) + // mat2 is constant -> prepacked -> linear_vec path + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("vec_tex_" + cfg.name); + tc.set_operator_name("test_etvk.test_mm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::RANDOM); + input_B.set_constant(true); + ValueSpec impl_selector = ValueSpec::make_string("default"); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + 
tc.add_input_spec(impl_selector); + tc.add_output_spec(output); + tc.set_abs_tolerance(1e-2f); + tc.set_rel_tolerance(1e-1f); + test_cases.push_back(tc); + } + + // Variant 2: linear_coopmat buffer fp32 + // mat2 is constant + buffer -> prepack with buffer -> linear_coopmat + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("cm_fp32_" + cfg.name); + tc.set_operator_name("test_etvk.test_mm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + input_B.set_constant(true); + ValueSpec impl_selector = ValueSpec::make_string("default"); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + tc.add_input_spec(impl_selector); + tc.add_output_spec(output); + tc.set_abs_tolerance(5e-1f); + tc.set_rel_tolerance(5e-1f); + test_cases.push_back(tc); + } + + return test_cases; +} + +int64_t linear_flops(const TestCase& test_case) { + if (test_case.empty() || test_case.num_inputs() < 2) return 0; + const auto& A = test_case.inputs()[0].get_tensor_sizes(); + const auto& B = test_case.inputs()[1].get_tensor_sizes(); + int64_t M = A.at(A.size() - 2); + int64_t K = A.at(A.size() - 1); + int64_t N = B.at(B.size() - 1); + return 2 * M * N * K; +} + +static constexpr int64_t kRefLimit = 2048; + +void linear_reference(TestCase& test_case) { + const ValueSpec& A_spec = test_case.inputs().at(0); + const ValueSpec& B_spec = test_case.inputs().at(1); + ValueSpec& out_spec = test_case.outputs().at(0); + + const auto& A_sizes = A_spec.get_tensor_sizes(); + const auto& B_sizes = B_spec.get_tensor_sizes(); + int64_t M = A_sizes.at(A_sizes.size() - 2); + int64_t K = A_sizes.at(A_sizes.size() - 1); + int64_t N = B_sizes.at(B_sizes.size() - 1); + + if (M > kRefLimit || K > 
kRefLimit || N > kRefLimit) { + std::cerr << "Skipping reference for large matrix (" + << M << "x" << K << "x" << N << ")" << std::endl; + return; + } + + const auto& A_f = A_spec.get_float_data(); + const auto& B_f = B_spec.get_float_data(); + auto& ref = out_spec.get_ref_float_data(); + ref.resize(M * N, 0.0f); + + for (int64_t m = 0; m < M; ++m) + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) + sum += A_f[m * K + k] * B_f[k * N + n]; + ref[m * N + n] = sum; + } +} + +int main(int argc, char* argv[]) { + (void)argc; + (void)argv; + + set_print_output(false); + set_print_latencies(true); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Linear Coopmat vs Vec Microbenchmark" << std::endl; + print_separator(); + + try { + api::context()->initialize_querypool(); + } catch (const std::exception& e) { + std::cerr << "Failed to initialize Vulkan: " << e.what() << std::endl; + return 1; + } + + if (api::context()->adapter_ptr()->supports_cooperative_matrix()) { + std::cout << "Cooperative matrix: SUPPORTED" << std::endl; + queryCooperativeMatrixProperties(); + } else { + std::cout << "Cooperative matrix: NOT supported (buffer tests will use linear_vec)" << std::endl; + } + + auto results = execute_test_cases( + generate_test_cases, + linear_flops, + "LINEAR_COOPMAT_BENCH", + 3, // warmup + 10, // benchmark runs + linear_reference); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/matmul_coopmat_bench.cpp b/backends/vulkan/test/custom_ops/matmul_coopmat_bench.cpp new file mode 100644 index 00000000000..e7543f98a9a --- /dev/null +++ b/backends/vulkan/test/custom_ops/matmul_coopmat_bench.cpp @@ -0,0 +1,254 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// Microbenchmark: matmul_coopmat vs matmul_vec (texture3d and buffer). +// +// Uses test_etvk.test_gemm.default which routes to aten.mm.default. +// The shader selected depends on storage type and device capabilities: +// - texture3d storage → matmul_vec (texture path) +// - buffer storage + coop mat device → matmul_coopmat +// - buffer storage + no coop mat → matmul_vec (buffer path) +// +// For each matrix size, runs three variants: +// impl=0: aten.mm buffer (→ matmul_coopmat if coop mat available) +// impl=2: aten.mm texture3d (→ matmul_vec texture) +// impl=4: aten.mm buffer fp16 (→ matmul_coopmat fp16 if coop mat available) + +#include +#include + +#include +#include +#include +#include + +#include "cm_utils.h" +#include "utils.h" + +using namespace executorch::vulkan::prototyping; + +std::vector generate_test_cases() { + std::vector test_cases; + + struct MatmulConfig { + int64_t M, K, N; + std::string name; + }; + + std::vector configs = { + // Attention Q@K^T shapes (single-head) + {512, 64, 512, "attn_QKt_512x64x512"}, + {2048, 128, 2048, "attn_QKt_2048x128x2048"}, + // Attention attn@V + {512, 512, 64, "attn_AV_512x512x64"}, + // BERT-like projection + {256, 768, 3072, "proj_256x768x3072"}, + // Square stress + {256, 256, 256, "sq_256"}, + {512, 512, 512, "sq_512"}, + {1024, 1024, 1024, "sq_1024"}, + {2048, 2048, 2048, "sq_2048"}, + {4096, 4096, 4096, "sq_4096"}, + }; + + // impl=2: matmul_vec texture3d (baseline, fp32) + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("vec_tex_" + cfg.name); + tc.set_operator_name("test_etvk.test_gemm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_C( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + ValueSpec alpha_spec(1.0f); + 
ValueSpec beta_spec(0.0f); + ValueSpec impl_selector_spec(static_cast(2)); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + tc.add_input_spec(input_C); + tc.add_input_spec(alpha_spec); + tc.add_input_spec(beta_spec); + tc.add_input_spec(impl_selector_spec); + tc.add_output_spec(output); + tc.set_abs_tolerance(1e-2f); + tc.set_rel_tolerance(1e-1f); + test_cases.push_back(tc); + } + + // impl=0: aten.mm buffer fp32 (→ matmul_coopmat if device supports) + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("cm_fp32_" + cfg.name); + tc.set_operator_name("test_etvk.test_gemm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_C( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + ValueSpec alpha_spec(1.0f); + ValueSpec beta_spec(0.0f); + ValueSpec impl_selector_spec(static_cast(0)); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + tc.add_input_spec(input_C); + tc.add_input_spec(alpha_spec); + tc.add_input_spec(beta_spec); + tc.add_input_spec(impl_selector_spec); + tc.add_output_spec(output); + tc.set_abs_tolerance(5e-1f); + tc.set_rel_tolerance(5e-1f); + test_cases.push_back(tc); + } + + // impl=0: aten.mm buffer fp16 (→ matmul_coopmat fp16 if device supports) + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("cm_fp16_" + cfg.name); + tc.set_operator_name("test_etvk.test_gemm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kHalf, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, 
vkapi::kHalf, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_C( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + ValueSpec alpha_spec(1.0f); + ValueSpec beta_spec(0.0f); + ValueSpec impl_selector_spec(static_cast(0)); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kHalf, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + tc.add_input_spec(input_C); + tc.add_input_spec(alpha_spec); + tc.add_input_spec(beta_spec); + tc.add_input_spec(impl_selector_spec); + tc.add_output_spec(output); + tc.set_abs_tolerance(5e-1f); + tc.set_rel_tolerance(5e-1f); + test_cases.push_back(tc); + } + + return test_cases; +} + +int64_t matmul_flops(const TestCase& test_case) { + if (test_case.empty() || test_case.num_inputs() < 2) return 0; + const auto& A = test_case.inputs()[0].get_tensor_sizes(); + const auto& B = test_case.inputs()[1].get_tensor_sizes(); + int64_t M = A.at(A.size() - 2); + int64_t K = A.at(A.size() - 1); + int64_t N = B.at(B.size() - 1); + return 2 * M * N * K; +} + +static constexpr int64_t kRefLimit = 2048; + +void matmul_reference(TestCase& test_case) { + const ValueSpec& A_spec = test_case.inputs().at(0); + const ValueSpec& B_spec = test_case.inputs().at(1); + ValueSpec& out_spec = test_case.outputs().at(0); + + const auto& A_sizes = A_spec.get_tensor_sizes(); + const auto& B_sizes = B_spec.get_tensor_sizes(); + int64_t M = A_sizes.at(A_sizes.size() - 2); + int64_t K = A_sizes.at(A_sizes.size() - 1); + int64_t N = B_sizes.at(B_sizes.size() - 1); + + if (M > kRefLimit || K > kRefLimit || N > kRefLimit) { + std::cerr << "Skipping reference for large matrix (" + << M << "x" << K << "x" << N << ")" << std::endl; + return; + } + + auto& ref = out_spec.get_ref_float_data(); + ref.resize(M * N, 0.0f); + + if (A_spec.dtype == vkapi::kHalf) { + const auto& A_h = A_spec.get_half_data(); + const auto& B_h = B_spec.get_half_data(); 
+ for (int64_t m = 0; m < M; ++m) + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) + sum += half_to_float(A_h[m * K + k]) * half_to_float(B_h[k * N + n]); + ref[m * N + n] = sum; + } + } else { + const auto& A_f = A_spec.get_float_data(); + const auto& B_f = B_spec.get_float_data(); + for (int64_t m = 0; m < M; ++m) + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) + sum += A_f[m * K + k] * B_f[k * N + n]; + ref[m * N + n] = sum; + } + } +} + +int main(int argc, char* argv[]) { + (void)argc; + (void)argv; + + set_print_output(false); + set_print_latencies(true); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Matmul Coopmat vs Vec Microbenchmark" << std::endl; + print_separator(); + + try { + api::context()->initialize_querypool(); + } catch (const std::exception& e) { + std::cerr << "Failed to initialize Vulkan: " << e.what() << std::endl; + return 1; + } + + if (api::context()->adapter_ptr()->supports_cooperative_matrix()) { + std::cout << "Cooperative matrix: SUPPORTED" << std::endl; + queryCooperativeMatrixProperties(); + } else { + std::cout << "Cooperative matrix: NOT supported (buffer tests will use matmul_vec)" << std::endl; + } + + auto results = execute_test_cases( + generate_test_cases, + matmul_flops, + "MATMUL_COOPMAT_BENCH", + 3, // warmup + 10, // benchmark runs + matmul_reference); + + return 0; +}