diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.glsl b/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.glsl new file mode 100644 index 00000000000..50827118a12 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.glsl @@ -0,0 +1,261 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * KHR Cooperative Matrix linear shader for prepacked weights. + * Drop-in replacement for linear_vec when storage=buffer and device + * supports GL_KHR_cooperative_matrix. + * + * Computes: D = A * W_packed (A: [M, K], W_packed: 4OC x 4IC blocked, D: [M, N]) + * + * Weight is prepacked by pack_fp_linear_weight into a 4OC x 4IC blocked layout: + * t_weight_packed[(k4 * N4 + n4) * 4 + dk] = vec4(w[k4*4+dk][n4*4+0..3]) + * + * fp16xfp16->fp32 MMA. When DTYPE=half, inputs are native fp16 (no + * conversion, half the bandwidth). When DTYPE=float, inputs are fp32 + * with on-the-fly packHalf2x16 conversion. + * + * Output is always fp32 (fp32 accumulator -> fp32 store) when DTYPE=float, + * or fp16 when DTYPE=half. + * + * Optional bias: when HAS_BIAS is defined, bias is added post-store via + * read-modify-write on the output buffer (one pass over the tile). 
+ */ + +#version 450 core + +#extension GL_KHR_cooperative_matrix : require +#extension GL_KHR_memory_scope_semantics : require +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_control_flow_attributes : enable + +#define PRECISION ${PRECISION} + +$if DTYPE == "half": + #define IS_FP16_INPUT +$if DTYPE == "float": + #define IS_FP32_INPUT + +$if HAS_BIAS: + #define HAS_BIAS + +layout(std430) buffer; + +#include "common.glslh" + +// Bindings: output(0), mat1(1), weight_packed(2), [bias(3)] +$if HAS_BIAS: + ${layout_declare_tensor(B, "rw", "t_output", DTYPE, "buffer", is_scalar_array=True)} +$else: + ${layout_declare_tensor(B, "w", "t_output", DTYPE, "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_mat1", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_weight_packed", DTYPE, "buffer", is_scalar_array=False)} +$if HAS_BIAS: + ${layout_declare_tensor(B, "r", "t_bias", DTYPE, "buffer", is_scalar_array=True)} + +// UBOs +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "out_sizes")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Tile dimensions (same as matmul_coopmat) +const uint lM = 16; +const uint lN = 16; +const uint lK = 16; +const uint TILE_M = 64; +const uint TILE_N = 64; +const uint TILE_K = 32; + +// Workgroup: 4 subgroups in 2x2 grid, 64 threads each = 256 total +const uint WG_WIDTH = 2; +const uint WG_HEIGHT = 2; +const uint NUM_SUBGROUPS = 4; +const uint INVOCATIONS = 64 * NUM_SUBGROUPS; + +// Result tiles per subgroup: 2x2 +const uint C_ROWS = TILE_M / WG_HEIGHT / lM; // 2 +const uint C_COLS = TILE_N / WG_WIDTH / lN; // 2 + +// fp16: 8 elements per uvec4 (128-bit) +const uint FP16_PER_VEC4 = 8; + +// Shared memory with skew padding +const uint A_STRIDE_VEC4 = (TILE_K + FP16_PER_VEC4) / 
FP16_PER_VEC4; // 5 +const uint B_STRIDE_VEC4 = (TILE_N + FP16_PER_VEC4) / FP16_PER_VEC4; // 9 + +shared uvec4 Ash[TILE_M * A_STRIDE_VEC4]; // 5KB +shared uvec4 Bsh[TILE_K * B_STRIDE_VEC4]; // 4.5KB + +// Accumulator tiles (fp32) +coopmat result[C_ROWS][C_COLS]; + +#ifdef IS_FP32_INPUT +uvec2 f32x4_to_f16x4(vec4 v) { + return uvec2(packHalf2x16(v.xy), packHalf2x16(v.zw)); +} +#endif + +void main() { + const uvec2 tileID = uvec2(gl_WorkGroupID.xy); + const uvec2 warpInTile = uvec2( + gl_SubgroupID % WG_WIDTH, + gl_SubgroupID / WG_WIDTH); + + const uint K = uint(mat1_sizes.x); + const uint M = uint(mat1_sizes.y); + const uint N = uint(out_sizes.x); + const uint K4 = (K + 3u) / 4u; + const uint N4 = (N + 3u) / 4u; + + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + result[i][j] = coopmat(0.0); + } + } + + // Thread assignment for A tile (64 rows x 4 uvec4/row = single pass) + const uint INVS_PER_ROW_A = TILE_K / FP16_PER_VEC4; // 4 + const uint a_col = gl_LocalInvocationID.x % INVS_PER_ROW_A; + const uint a_row_offset = gl_LocalInvocationID.x / INVS_PER_ROW_A; + + // Thread assignment for B tile (32 rows x 8 uvec4/row = single pass) + const uint INVS_PER_ROW_B = TILE_N / FP16_PER_VEC4; // 8 + const uint b_col = gl_LocalInvocationID.x % INVS_PER_ROW_B; + const uint b_row_offset = gl_LocalInvocationID.x / INVS_PER_ROW_B; + + const uint a_row_base = TILE_M * tileID.y; + const uint b_col_base = TILE_N * tileID.x; + + for (uint chunkK = 0; chunkK < K; chunkK += TILE_K) { + + // --- Load A tile -> shared (same as matmul_coopmat) --- + { + uint row = a_row_base + a_row_offset; + uint k_elem = chunkK + a_col * FP16_PER_VEC4; + +#ifdef IS_FP16_INPUT + uint k_hv4 = k_elem / 4; + f16vec4 v0 = t_mat1[row * K4 + k_hv4]; + f16vec4 v1 = t_mat1[row * K4 + k_hv4 + 1]; + Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4( + packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)), + packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw))); 
+#else + uint k_vec4 = k_elem / 4; + vec4 v0 = t_mat1[row * K4 + k_vec4]; + vec4 v1 = t_mat1[row * K4 + k_vec4 + 1]; + uvec2 h0 = f32x4_to_f16x4(v0); + uvec2 h1 = f32x4_to_f16x4(v1); + Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4(h0, h1); +#endif + } + + // --- Load B tile from packed weight -> shared --- + // Packed weight format: t_weight_packed[(k4 * N4 + n4) * 4 + dk] + // returns vec4 of 4 N-elements at K-row (k4*4+dk). + // Load two vec4s to get 8 consecutive N-elements = one uvec4 in Bsh. + { + uint k_row = chunkK + b_row_offset; + uint k4 = k_row >> 2u; + uint dk = k_row & 3u; + uint n_elem = b_col_base + b_col * FP16_PER_VEC4; + uint n4_0 = n_elem >> 2u; + +#ifdef IS_FP16_INPUT + f16vec4 v0 = t_weight_packed[(k4 * N4 + n4_0) * 4u + dk]; + f16vec4 v1 = t_weight_packed[(k4 * N4 + n4_0 + 1u) * 4u + dk]; + Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4( + packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)), + packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw))); +#else + vec4 v0 = t_weight_packed[(k4 * N4 + n4_0) * 4u + dk]; + vec4 v1 = t_weight_packed[(k4 * N4 + n4_0 + 1u) * 4u + dk]; + uvec2 h0 = f32x4_to_f16x4(v0); + uvec2 h1 = f32x4_to_f16x4(v1); + Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4(h0, h1); +#endif + } + + barrier(); + + // --- Cooperative matrix MMA --- + [[unroll]] for (uint k = 0; k < TILE_K / lK; ++k) { + uint k_start = lK * k; + + coopmat matA[C_ROWS]; + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + uint row_a = lM * (C_ROWS * warpInTile.y + i); + coopMatLoad( + matA[i], Ash, + row_a * A_STRIDE_VEC4 + k_start / FP16_PER_VEC4, + A_STRIDE_VEC4, + gl_CooperativeMatrixLayoutRowMajor); + } + + coopmat matB; + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + uint col_b = lN * (C_COLS * warpInTile.x + j) / FP16_PER_VEC4; + coopMatLoad( + matB, Bsh, + k_start * B_STRIDE_VEC4 + col_b, + B_STRIDE_VEC4, + gl_CooperativeMatrixLayoutRowMajor); + + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + result[i][j] = coopMatMulAdd(matA[i], 
matB, result[i][j]); + } + } + } + + barrier(); + } + + // --- Store result --- + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + uint gi = TILE_M * tileID.y + lM * (C_ROWS * warpInTile.y + i); + uint gj = TILE_N * tileID.x + lN * (C_COLS * warpInTile.x + j); +#ifdef IS_FP16_INPUT + coopmat out_tile = + coopmat(result[i][j]); + coopMatStore( + out_tile, t_output, + gi * N + gj, N, + gl_CooperativeMatrixLayoutRowMajor); +#else + coopMatStore( + result[i][j], t_output, + gi * N + gj, N, + gl_CooperativeMatrixLayoutRowMajor); +#endif + } + } + +#ifdef HAS_BIAS + // Add bias via read-modify-write on the output buffer. + // barrier() ensures all coopMatStore writes within this workgroup are visible. + barrier(); + + const uint tile_m_start = TILE_M * tileID.y; + const uint tile_n_start = TILE_N * tileID.x; + // 64x64 tile = 4096 elements, 256 threads -> 16 elements per thread + for (uint idx = gl_LocalInvocationID.x; idx < TILE_M * TILE_N; idx += INVOCATIONS) { + uint local_m = idx / TILE_N; + uint local_n = idx % TILE_N; + uint gm = tile_m_start + local_m; + uint gn = tile_n_start + local_n; + if (gm < M && gn < N) { + uint out_idx = gm * N + gn; + t_output[out_idx] = t_output[out_idx] + t_bias[gn]; + } + } +#endif +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.yaml b/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.yaml new file mode 100644 index 00000000000..669da1bbd26 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/linear_coopmat.yaml @@ -0,0 +1,19 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+ +linear_coopmat: + parameter_names_with_default_values: + DTYPE: float + PRECISION: highp + HAS_BIAS: false + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: linear_coopmat + - NAME: linear_coopmat_bias + HAS_BIAS: true diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.glsl b/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.glsl new file mode 100644 index 00000000000..6a6ac6eb965 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.glsl @@ -0,0 +1,227 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. + */ + +/* + * KHR Cooperative Matrix matmul following matmul_vec conventions. + * Drop-in replacement for matmul_vec when storage=buffer and device + * supports GL_KHR_cooperative_matrix. + * + * Computes: D = A * B (A: [M, K], B: [K, N], D: [M, N]) + * + * fp16×fp16→fp32 MMA. When DTYPE=half, inputs are native fp16 (no + * conversion, half the bandwidth). When DTYPE=float, inputs are fp32 + * with on-the-fly packHalf2x16 conversion. + * + * Output is always fp32 (fp32 accumulator → fp32 store) when DTYPE=float, + * or fp16 when DTYPE=half. + */ + +#version 450 core + +#extension GL_KHR_cooperative_matrix : require +#extension GL_KHR_memory_scope_semantics : require +#extension GL_KHR_shader_subgroup_basic : enable +#extension GL_EXT_shader_explicit_arithmetic_types : require +#extension GL_EXT_shader_explicit_arithmetic_types_float16 : require +#extension GL_EXT_control_flow_attributes : enable + +#define PRECISION ${PRECISION} + +$if DTYPE == "half": + #define IS_FP16_INPUT +$if DTYPE == "float": + #define IS_FP32_INPUT + +layout(std430) buffer; + +#include "common.glslh" + +// Bindings — same order as matmul_vec: output(0), mat1(1), mat2(2) +// Output uses same dtype as input. 
Inputs use vec4 (non-scalar) for wide loads. +${layout_declare_tensor(B, "w", "t_output", DTYPE, "buffer", is_scalar_array=True)} +${layout_declare_tensor(B, "r", "t_mat1", DTYPE, "buffer", is_scalar_array=False)} +${layout_declare_tensor(B, "r", "t_mat2", DTYPE, "buffer", is_scalar_array=False)} + +// UBOs — same as matmul_vec +${layout_declare_ubo(B, "ivec4", "mat1_sizes")} +${layout_declare_ubo(B, "ivec4", "mat2_sizes")} + +layout(local_size_x_id = 0, local_size_y_id = 1, local_size_z_id = 2) in; + +// Tile dimensions +const uint lM = 16; +const uint lN = 16; +const uint lK = 16; +const uint TILE_M = 64; +const uint TILE_N = 64; +const uint TILE_K = 32; + +// Workgroup: 4 subgroups in 2×2 grid, 64 threads each = 256 total +const uint WG_WIDTH = 2; +const uint WG_HEIGHT = 2; +const uint NUM_SUBGROUPS = 4; +const uint INVOCATIONS = 64 * NUM_SUBGROUPS; + +// Result tiles per subgroup: 2×2 +const uint C_ROWS = TILE_M / WG_HEIGHT / lM; // 2 +const uint C_COLS = TILE_N / WG_WIDTH / lN; // 2 + +// fp16: 8 elements per uvec4 (128-bit) +const uint FP16_PER_VEC4 = 8; + +// Shared memory with skew padding +const uint A_STRIDE_VEC4 = (TILE_K + FP16_PER_VEC4) / FP16_PER_VEC4; // 5 +const uint B_STRIDE_VEC4 = (TILE_N + FP16_PER_VEC4) / FP16_PER_VEC4; // 9 + +shared uvec4 Ash[TILE_M * A_STRIDE_VEC4]; // 5KB +shared uvec4 Bsh[TILE_K * B_STRIDE_VEC4]; // 4.5KB + +// Accumulator tiles (fp32) +coopmat result[C_ROWS][C_COLS]; + +#ifdef IS_FP32_INPUT +uvec2 f32x4_to_f16x4(vec4 v) { + return uvec2(packHalf2x16(v.xy), packHalf2x16(v.zw)); +} +#endif + +void main() { + const uvec2 tileID = uvec2(gl_WorkGroupID.xy); + const uvec2 warpInTile = uvec2( + gl_SubgroupID % WG_WIDTH, + gl_SubgroupID / WG_WIDTH); + + const uint K = uint(mat1_sizes.x); + const uint M = uint(mat1_sizes.y); + const uint N = uint(mat2_sizes.x); + const uint K4 = (K + 3u) / 4u; + const uint N4 = (N + 3u) / 4u; + + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + 
result[i][j] = coopmat(0.0); + } + } + + // Thread assignment for A tile (64 rows × 4 uvec4/row = single pass) + const uint INVS_PER_ROW_A = TILE_K / FP16_PER_VEC4; // 4 + const uint a_col = gl_LocalInvocationID.x % INVS_PER_ROW_A; + const uint a_row_offset = gl_LocalInvocationID.x / INVS_PER_ROW_A; + + // Thread assignment for B tile (32 rows × 8 uvec4/row = single pass) + const uint INVS_PER_ROW_B = TILE_N / FP16_PER_VEC4; // 8 + const uint b_col = gl_LocalInvocationID.x % INVS_PER_ROW_B; + const uint b_row_offset = gl_LocalInvocationID.x / INVS_PER_ROW_B; + + const uint a_row_base = TILE_M * tileID.y; + const uint b_col_base = TILE_N * tileID.x; + + for (uint chunkK = 0; chunkK < K; chunkK += TILE_K) { + + // --- Load A tile → shared (single pass) --- + { + uint row = a_row_base + a_row_offset; + uint k_elem = chunkK + a_col * FP16_PER_VEC4; + +#ifdef IS_FP16_INPUT + // Native fp16: two f16vec4 loads (64-bit each) → one uvec4 (128-bit) + uint k_hv4 = k_elem / 4; + f16vec4 v0 = t_mat1[row * K4 + k_hv4]; + f16vec4 v1 = t_mat1[row * K4 + k_hv4 + 1]; + Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4( + packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)), + packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw))); +#else + // fp32 inputs: load two vec4 (8 fp32), convert to 8 fp16 + uint k_vec4 = k_elem / 4; + vec4 v0 = t_mat1[row * K4 + k_vec4]; + vec4 v1 = t_mat1[row * K4 + k_vec4 + 1]; + uvec2 h0 = f32x4_to_f16x4(v0); + uvec2 h1 = f32x4_to_f16x4(v1); + Ash[a_row_offset * A_STRIDE_VEC4 + a_col] = uvec4(h0, h1); +#endif + } + + // --- Load B tile → shared (single pass) --- + { + uint row = chunkK + b_row_offset; + uint n_elem = b_col_base + b_col * FP16_PER_VEC4; + +#ifdef IS_FP16_INPUT + uint n_hv4 = n_elem / 4; + f16vec4 v0 = t_mat2[row * N4 + n_hv4]; + f16vec4 v1 = t_mat2[row * N4 + n_hv4 + 1]; + Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4( + packHalf2x16(vec2(v0.xy)), packHalf2x16(vec2(v0.zw)), + packHalf2x16(vec2(v1.xy)), packHalf2x16(vec2(v1.zw))); +#else 
+ uint n_vec4 = n_elem / 4; + vec4 v0 = t_mat2[row * N4 + n_vec4]; + vec4 v1 = t_mat2[row * N4 + n_vec4 + 1]; + uvec2 h0 = f32x4_to_f16x4(v0); + uvec2 h1 = f32x4_to_f16x4(v1); + Bsh[b_row_offset * B_STRIDE_VEC4 + b_col] = uvec4(h0, h1); +#endif + } + + barrier(); + + // --- Cooperative matrix MMA --- + [[unroll]] for (uint k = 0; k < TILE_K / lK; ++k) { + uint k_start = lK * k; + + coopmat matA[C_ROWS]; + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + uint row_a = lM * (C_ROWS * warpInTile.y + i); + coopMatLoad( + matA[i], Ash, + row_a * A_STRIDE_VEC4 + k_start / FP16_PER_VEC4, + A_STRIDE_VEC4, + gl_CooperativeMatrixLayoutRowMajor); + } + + coopmat matB; + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + uint col_b = lN * (C_COLS * warpInTile.x + j) / FP16_PER_VEC4; + coopMatLoad( + matB, Bsh, + k_start * B_STRIDE_VEC4 + col_b, + B_STRIDE_VEC4, + gl_CooperativeMatrixLayoutRowMajor); + + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + result[i][j] = coopMatMulAdd(matA[i], matB, result[i][j]); + } + } + } + + barrier(); + } + + // --- Store result --- + [[unroll]] for (uint i = 0; i < C_ROWS; ++i) { + [[unroll]] for (uint j = 0; j < C_COLS; ++j) { + uint gi = TILE_M * tileID.y + lM * (C_ROWS * warpInTile.y + i); + uint gj = TILE_N * tileID.x + lN * (C_COLS * warpInTile.x + j); +#ifdef IS_FP16_INPUT + // Convert fp32 accumulator to fp16 for fp16 output buffer + coopmat out_tile = + coopmat(result[i][j]); + coopMatStore( + out_tile, t_output, + gi * N + gj, N, + gl_CooperativeMatrixLayoutRowMajor); +#else + coopMatStore( + result[i][j], t_output, + gi * N + gj, N, + gl_CooperativeMatrixLayoutRowMajor); +#endif + } + } +} diff --git a/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.yaml b/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.yaml new file mode 100644 index 00000000000..d120892c563 --- /dev/null +++ b/backends/vulkan/runtime/graph/ops/glsl/matmul_coopmat.yaml @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. 
+# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. + +matmul_coopmat: + parameter_names_with_default_values: + DTYPE: float + PRECISION: highp + generate_variant_forall: + DTYPE: + - VALUE: float + - VALUE: half + shader_variants: + - NAME: matmul_coopmat diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp index 62266473351..9bcfbad607d 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.cpp @@ -24,7 +24,8 @@ ValueRef prepack_fp_linear_weight( ComputeGraph& graph, const ValueRef weight_data, bool is_transposed, - int64_t B) { + int64_t B, + bool force_buffer) { std::vector weight_sizes = graph.sizes_of(weight_data); int64_t N, K; @@ -47,12 +48,17 @@ ValueRef prepack_fp_linear_weight( int64_t output_height = B * K4; int64_t output_width = N4 * 4 * 4; - utils::StorageType weight_storage = utils::kTexture2D; - uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); - // output_width is in scalars; texture width in texels = output_width / 4 - if (output_width / 4 > max_extent || - static_cast(output_height) > max_extent) { + utils::StorageType weight_storage; + if (force_buffer) { weight_storage = utils::kBuffer; + } else { + weight_storage = utils::kTexture2D; + uint32_t max_extent = graph.context()->adapter_ptr()->max_texture2d_dim(); + // output_width is in scalars; texture width in texels = output_width / 4 + if (output_width / 4 > max_extent || + static_cast(output_height) > max_extent) { + weight_storage = utils::kBuffer; + } } ValueRef packed_weight = graph.add_tensor( @@ -233,6 +239,100 @@ void add_linear_tiled_node( resize_linear_node)); } +// ── Cooperative matrix linear ── + +static constexpr uint32_t kLinearCoopMatTileM = 64; +static constexpr uint32_t kLinearCoopMatTileN = 64; +static constexpr uint32_t 
kLinearCoopMatInvocations = 256; // 4 subgroups x 64 + +vkapi::ShaderInfo pick_linear_coopmat_shader( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + const ValueRef out = args.at(0).refs.at(0); + bool has_bias = graph->get_bool(resize_args.at(1)); + std::string kernel_name = + has_bias ? "linear_coopmat_bias" : "linear_coopmat"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, graph->dtype_of(out)); + return VK_KERNEL_FROM_STR(kernel_name); +} + +utils::uvec3 pick_linear_coopmat_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + const auto out_sizes = graph->sizes_of(out); + uint32_t M = out_sizes.at(out_sizes.size() - 2); + uint32_t N = out_sizes.at(out_sizes.size() - 1); + uint32_t num_tiles_n = utils::div_up(N, kLinearCoopMatTileN); + uint32_t num_tiles_m = utils::div_up(M, kLinearCoopMatTileM); + return {num_tiles_n * kLinearCoopMatInvocations, num_tiles_m, 1}; +} + +utils::uvec3 pick_linear_coopmat_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)global_workgroup_size; + (void)args; + (void)resize_args; + return {kLinearCoopMatInvocations, 1, 1}; +} + +void add_linear_coopmat_node( + ComputeGraph& graph, + const ValueRef input, + const ValueRef packed_weight, + const ValueRef packed_bias, + bool has_bias, + const ValueRef out, + int32_t weight_B) { + VK_CHECK_COND(graph.packed_dim_of(input) == WHCN::kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kWidthDim); + VK_CHECK_COND( + graph.storage_type_of(out) == utils::kBuffer, + "linear_coopmat requires buffer storage"); + + std::vector out_sizes = graph.sizes_of(out); + int32_t orig_N = 
utils::safe_downcast(out_sizes.back()); + ValueRef orig_N_ref = graph.add_scalar(static_cast(orig_N)); + ValueRef has_bias_ref = graph.add_scalar(has_bias); + + std::vector read_inputs = {input, packed_weight}; + if (has_bias) { + read_inputs.push_back(packed_bias); + } + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + pick_linear_coopmat_shader, + pick_linear_coopmat_global_wg_size, + pick_linear_coopmat_local_wg_size, + // Inputs and Outputs + {{out, vkapi::kWrite}, {read_inputs, vkapi::kRead}}, + // Shader params buffers + {graph.sizes_ubo(input), graph.sizes_ubo(out)}, + // Push Constants + {}, + // Specialization Constants + {}, + // Resize Args + {orig_N_ref, has_bias_ref}, + // Resizing Logic + resize_linear_node)); +} + +// ── End cooperative matrix linear ── + void linear_packed_weight( ComputeGraph& graph, const std::vector& args) { @@ -241,18 +341,38 @@ void linear_packed_weight( ValueRef bias = args.at(2); ValueRef out = args.at(3); + bool has_bias = graph.val_is_not_none(bias); + // Coopmat shader assumes M is a multiple of TILE_M (64) because the store + // does not bounds-check. Fall back to the tiled shader otherwise. + // TODO: remove this guard once the coopmat shader gains partial-tile + // bounds checking. + auto input_sizes = graph.sizes_of(input); + int64_t M = input_sizes.size() >= 2 + ? 
input_sizes.at(input_sizes.size() - 2) + : 1; + bool use_coopmat = + graph.context()->adapter_ptr()->supports_cooperative_matrix() && + graph.storage_type_of(out) == utils::kBuffer && + M >= 64; + ValueRef packed_weight = prepack_fp_linear_weight( - graph, weight_data, /*is_transposed=*/true, /*B=*/1); + graph, weight_data, /*is_transposed=*/true, /*B=*/1, + /*force_buffer=*/use_coopmat); ValueRef packed_bias = kDummyValueRef; - bool has_bias = graph.val_is_not_none(bias); if (has_bias) { packed_bias = prepack_standard( - graph, bias, graph.storage_type_of(out), utils::kWidthPacked); + graph, bias, graph.storage_type_of(out), utils::kWidthPacked, + /*passthrough=*/use_coopmat); } - add_linear_tiled_node( - graph, input, packed_weight, packed_bias, has_bias, out); + if (use_coopmat) { + add_linear_coopmat_node( + graph, input, packed_weight, packed_bias, has_bias, out); + } else { + add_linear_tiled_node( + graph, input, packed_weight, packed_bias, has_bias, out); + } } REGISTER_OPERATORS { diff --git a/backends/vulkan/runtime/graph/ops/impl/Linear.h b/backends/vulkan/runtime/graph/ops/impl/Linear.h index d7efb8c8b08..c829487fc3c 100644 --- a/backends/vulkan/runtime/graph/ops/impl/Linear.h +++ b/backends/vulkan/runtime/graph/ops/impl/Linear.h @@ -16,7 +16,8 @@ ValueRef prepack_fp_linear_weight( ComputeGraph& graph, const ValueRef weight_data, bool is_transposed, - int64_t B); + int64_t B, + bool force_buffer = false); void add_linear_tiled_node( ComputeGraph& graph, @@ -29,4 +30,13 @@ void add_linear_tiled_node( float alpha = 1.0f, float beta = 1.0f); +void add_linear_coopmat_node( + ComputeGraph& graph, + const ValueRef input, + const ValueRef packed_weight, + const ValueRef packed_bias, + bool has_bias, + const ValueRef out, + int32_t weight_B = 1); + } // namespace vkcompute diff --git a/backends/vulkan/runtime/graph/ops/impl/Matmul.cpp b/backends/vulkan/runtime/graph/ops/impl/Matmul.cpp index 53bb8d82e12..dad7d5c5f8f 100644 --- 
a/backends/vulkan/runtime/graph/ops/impl/Matmul.cpp +++ b/backends/vulkan/runtime/graph/ops/impl/Matmul.cpp @@ -19,6 +19,95 @@ namespace vkcompute { +// Forward declaration +void resize_matmul_tiled_node( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args); + +// ── Cooperative matrix tile configuration (must match matmul_coopmat.glsl) ── + +static constexpr uint32_t kCoopMatTileM = 64; +static constexpr uint32_t kCoopMatTileN = 64; +static constexpr uint32_t kCoopMatInvocations = 256; // 4 subgroups × 64 + +vkapi::ShaderInfo pick_matmul_coopmat_shader( + ComputeGraph* graph, + const std::vector& args, + const std::vector& resize_args) { + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + std::string kernel_name = "matmul_coopmat"; + kernel_name.reserve(kShaderNameReserve); + add_dtype_suffix(kernel_name, graph->dtype_of(out)); + return VK_KERNEL_FROM_STR(kernel_name); +} + +utils::uvec3 pick_matmul_coopmat_global_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const std::vector& args, + const std::vector& resize_args) { + (void)shader; + (void)resize_args; + const ValueRef out = args.at(0).refs.at(0); + const auto out_sizes = graph->sizes_of(out); + uint32_t M = out_sizes.at(out_sizes.size() - 2); + uint32_t N = out_sizes.at(out_sizes.size() - 1); + uint32_t num_tiles_n = utils::div_up(N, kCoopMatTileN); + uint32_t num_tiles_m = utils::div_up(M, kCoopMatTileM); + return {num_tiles_n * kCoopMatInvocations, num_tiles_m, 1}; +} + +utils::uvec3 pick_matmul_coopmat_local_wg_size( + ComputeGraph* graph, + const vkapi::ShaderInfo& shader, + const utils::uvec3& global_workgroup_size, + const std::vector& args, + const std::vector& resize_args) { + (void)graph; + (void)shader; + (void)global_workgroup_size; + (void)args; + (void)resize_args; + return {kCoopMatInvocations, 1, 1}; +} + +void add_matmul_coopmat_node( + ComputeGraph& graph, + const ValueRef mat1, + const ValueRef mat2, + const ValueRef out) 
{ + VK_CHECK_COND(graph.packed_dim_of(mat1) == WHCN::kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(mat2) == WHCN::kWidthDim); + VK_CHECK_COND(graph.packed_dim_of(out) == WHCN::kWidthDim); + VK_CHECK_COND( + graph.storage_type_of(out) == utils::kBuffer, + "matmul_coopmat requires buffer storage"); + + ValueRef has_bias_ref = graph.add_scalar(false); + + graph.execute_nodes().emplace_back(new DynamicDispatchNode( + graph, + pick_matmul_coopmat_shader, + pick_matmul_coopmat_global_wg_size, + pick_matmul_coopmat_local_wg_size, + // Inputs and Outputs — same binding order as matmul_vec + {{out, vkapi::kWrite}, {{mat1, mat2}, vkapi::kRead}}, + // Shader params buffers — same UBOs as matmul_vec + {graph.sizes_ubo(mat1), graph.sizes_ubo(mat2)}, + // Push Constants + {}, + // Specialization Constants (tile config hardcoded in shader) + {}, + // Resize Args + {has_bias_ref}, + // Resizing Logic + resize_matmul_tiled_node)); +} + +// ── End cooperative matrix section ── + void resize_matmul_tiled_node( ComputeGraph* graph, const std::vector& args, @@ -189,16 +278,30 @@ void matmul_tiled(ComputeGraph& graph, const std::vector& args) { if (graph.val_is_tref(mat2)) { auto mat2_sizes = graph.sizes_of(mat2); int64_t B = mat2_sizes.size() >= 3 ? 
mat2_sizes.at(0) : 1; - ValueRef packed = - prepack_fp_linear_weight(graph, mat2, /*is_transposed=*/false, B); - add_linear_tiled_node( - graph, - mat1, - packed, - kDummyValueRef, - false, - out, - utils::safe_downcast(B)); + bool use_coopmat = + graph.context()->adapter_ptr()->supports_cooperative_matrix() && + graph.storage_type_of(out) == utils::kBuffer; + ValueRef packed = prepack_fp_linear_weight( + graph, mat2, /*is_transposed=*/false, B, + /*force_buffer=*/use_coopmat); + if (use_coopmat) { + add_linear_coopmat_node( + graph, mat1, packed, kDummyValueRef, false, out, + utils::safe_downcast(B)); + } else { + add_linear_tiled_node( + graph, + mat1, + packed, + kDummyValueRef, + false, + out, + utils::safe_downcast(B)); + } + } else if ( + graph.context()->adapter_ptr()->supports_cooperative_matrix() && + graph.storage_type_of(out) == utils::kBuffer) { + add_matmul_coopmat_node(graph, mat1, mat2, out); } else { add_matmul_tiled_node(graph, mat1, mat2, out); } diff --git a/backends/vulkan/runtime/vk_api/Adapter.h b/backends/vulkan/runtime/vk_api/Adapter.h index 3c503deab70..7a4aa94560d 100644 --- a/backends/vulkan/runtime/vk_api/Adapter.h +++ b/backends/vulkan/runtime/vk_api/Adapter.h @@ -252,6 +252,15 @@ class Adapter final { #endif /* VK_NV_cooperative_matrix2 */ } + inline bool supports_cooperative_matrix() { +#ifdef VK_KHR_cooperative_matrix + return physical_device_.cooperative_matrix_features.cooperativeMatrix == + VK_TRUE; +#else + return false; +#endif /* VK_KHR_cooperative_matrix */ + } + inline bool supports_int16_shader_types() { #ifdef ETVK_FORCE_NO_EXTENSIONS return false; diff --git a/backends/vulkan/test/custom_ops/CMakeLists.txt b/backends/vulkan/test/custom_ops/CMakeLists.txt index d17ab94d194..c8fd9e41f2e 100644 --- a/backends/vulkan/test/custom_ops/CMakeLists.txt +++ b/backends/vulkan/test/custom_ops/CMakeLists.txt @@ -48,8 +48,10 @@ if(TARGET vulkan_backend) # Prototyping utility files set(PROTOTYPING_UTILS_HEADERS 
${CMAKE_CURRENT_SOURCE_DIR}) - set(PROTOTYPING_UTILS_CPP ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp - ${CMAKE_CURRENT_SOURCE_DIR}/conv2d_utils.cpp + set(PROTOTYPING_UTILS_CPP + ${CMAKE_CURRENT_SOURCE_DIR}/utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/conv2d_utils.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/cm_utils.cpp ) # Prototyping shaders @@ -103,4 +105,6 @@ if(TARGET vulkan_backend) add_operator_prototype(test_q8ta_conv2d) add_operator_prototype(test_q8ta_conv2d_pw) add_operator_prototype(test_q8ta_conv2d_dw) + add_operator_prototype(matmul_coopmat_bench) + add_operator_prototype(linear_coopmat_bench) endif() diff --git a/backends/vulkan/test/custom_ops/cm_utils.cpp b/backends/vulkan/test/custom_ops/cm_utils.cpp new file mode 100644 index 00000000000..e55e58a30be --- /dev/null +++ b/backends/vulkan/test/custom_ops/cm_utils.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +#include "cm_utils.h" + +#include +#include + +#include +#include +#include +#include + +namespace executorch { +namespace vulkan { +namespace prototyping { + +static std::string componentTypeToString(VkComponentTypeKHR type) { + switch (type) { + case VK_COMPONENT_TYPE_FLOAT16_KHR: + return "float16"; + case VK_COMPONENT_TYPE_FLOAT32_KHR: + return "float32"; + case VK_COMPONENT_TYPE_FLOAT64_KHR: + return "float64"; + case VK_COMPONENT_TYPE_SINT8_KHR: + return "int8"; + case VK_COMPONENT_TYPE_SINT16_KHR: + return "int16"; + case VK_COMPONENT_TYPE_SINT32_KHR: + return "int32"; + case VK_COMPONENT_TYPE_SINT64_KHR: + return "int64"; + case VK_COMPONENT_TYPE_UINT8_KHR: + return "uint8"; + case VK_COMPONENT_TYPE_UINT16_KHR: + return "uint16"; + case VK_COMPONENT_TYPE_UINT32_KHR: + return "uint32"; + case VK_COMPONENT_TYPE_UINT64_KHR: + return "uint64"; + default: + return "unknown(" + std::to_string(static_cast(type)) + ")"; + } +} + +static std::string scopeToString(VkScopeKHR scope) { + switch (scope) { + case VK_SCOPE_DEVICE_KHR: + return "Device"; + case VK_SCOPE_WORKGROUP_KHR: + return "Workgroup"; + case VK_SCOPE_SUBGROUP_KHR: + return "Subgroup"; + case VK_SCOPE_QUEUE_FAMILY_KHR: + return "QueueFamily"; + default: + return "unknown(" + std::to_string(static_cast(scope)) + ")"; + } +} + +void queryCooperativeMatrixProperties() { +#ifdef VK_KHR_cooperative_matrix + auto* adapter = vkcompute::api::context()->adapter_ptr(); + VkPhysicalDevice physicalDevice = adapter->physical_handle(); + + if (!adapter->supports_cooperative_matrix()) { + std::cout << "VK_KHR_cooperative_matrix is NOT supported on this device." + << std::endl; + return; + } + + std::cout << "\n=== Cooperative Matrix Properties (KHR) ===" << std::endl; + + uint32_t count = 0; + VkResult result = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR( + physicalDevice, &count, nullptr); + + if (result != VK_SUCCESS || count == 0) { + std::cout << "No cooperative matrix configurations found." 
<< std::endl; + return; + } + + std::vector properties(count); + for (auto& prop : properties) { + prop.sType = VK_STRUCTURE_TYPE_COOPERATIVE_MATRIX_PROPERTIES_KHR; + prop.pNext = nullptr; + } + + result = vkGetPhysicalDeviceCooperativeMatrixPropertiesKHR( + physicalDevice, &count, properties.data()); + + if (result != VK_SUCCESS) { + std::cerr << "Failed to query cooperative matrix properties." << std::endl; + return; + } + + std::cout << "Found " << count << " cooperative matrix configurations:\n" + << std::endl; + + std::cout << std::left << std::setw(5) << "#" << std::setw(10) << "M" + << std::setw(10) << "N" << std::setw(10) << "K" << std::setw(12) + << "AType" << std::setw(12) << "BType" << std::setw(12) << "CType" + << std::setw(12) << "ResultType" << std::setw(12) << "Scope" + << std::endl; + + std::cout << std::string(95, '-') << std::endl; + + for (uint32_t i = 0; i < count; ++i) { + const auto& p = properties[i]; + std::cout << std::left << std::setw(5) << i << std::setw(10) << p.MSize + << std::setw(10) << p.NSize << std::setw(10) << p.KSize + << std::setw(12) << componentTypeToString(p.AType) + << std::setw(12) << componentTypeToString(p.BType) + << std::setw(12) << componentTypeToString(p.CType) + << std::setw(12) << componentTypeToString(p.ResultType) + << std::setw(12) << scopeToString(p.scope) << std::endl; + } + + std::cout << std::endl; + +#else + std::cout << "VK_KHR_cooperative_matrix not available at compile time." + << std::endl; +#endif +} + +} // namespace prototyping +} // namespace vulkan +} // namespace executorch diff --git a/backends/vulkan/test/custom_ops/cm_utils.h b/backends/vulkan/test/custom_ops/cm_utils.h new file mode 100644 index 00000000000..1c6c06bdac8 --- /dev/null +++ b/backends/vulkan/test/custom_ops/cm_utils.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
/*
 * Copyright (c) Meta Platforms, Inc. and affiliates.
 * All rights reserved.
 *
 * This source code is licensed under the BSD-style license found in the
 * LICENSE file in the root directory of this source tree.
 */

#pragma once

namespace executorch {
namespace vulkan {
namespace prototyping {

/// Queries the active device for VK_KHR_cooperative_matrix support and
/// prints every advertised configuration: supported M/N/K tile sizes,
/// component types, and scopes. Prints a diagnostic instead when the
/// extension is unavailable.
void queryCooperativeMatrixProperties();

} // namespace prototyping
} // namespace vulkan
} // namespace executorch
+// When mat2 is constant (set_constant(true)), aten.mm prepacks the weight +// and routes through the linear path: +// - texture3d output -> linear_vec (Stephen's tiled shader) +// - buffer output + coop mat device -> linear_coopmat (KHR cooperative matrix) +// +// For each matrix size, runs two variants: +// vec_tex: mat1=tex3d, mat2=tex3d(constant), out=tex3d -> linear_vec +// cm_fp32: mat1=buf, mat2=buf(constant), out=buf -> linear_coopmat + +#include +#include + +#include +#include +#include +#include + +#include "cm_utils.h" +#include "utils.h" + +using namespace executorch::vulkan::prototyping; + +std::vector generate_test_cases() { + std::vector test_cases; + + struct LinearConfig { + int64_t M, K, N; + std::string name; + }; + + std::vector configs = { + // BERT-like shapes + {256, 768, 3072, "BERT_FFN_up"}, + {256, 3072, 768, "BERT_FFN_down"}, + {128, 768, 768, "BERT_QKV"}, + // LLM-like shapes (single token) + {1, 4096, 4096, "LLM_QKV_1tok"}, + {1, 4096, 11008, "LLM_FFN_up_1tok"}, + {1, 11008, 4096, "LLM_FFN_down_1tok"}, + // LLM-like shapes (batch) + {32, 4096, 4096, "LLM_QKV_32tok"}, + {32, 4096, 11008, "LLM_FFN_up_32tok"}, + // Square stress + {256, 1024, 1024, "sq_1024"}, + {256, 4096, 4096, "sq_4096"}, + }; + + // Variant 1: linear_vec texture3d (baseline) + // mat2 is constant -> prepacked -> linear_vec path + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("vec_tex_" + cfg.name); + tc.set_operator_name("test_etvk.test_mm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::RANDOM); + input_B.set_constant(true); + ValueSpec impl_selector = ValueSpec::make_string("default"); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + 
tc.add_input_spec(impl_selector); + tc.add_output_spec(output); + tc.set_abs_tolerance(1e-2f); + tc.set_rel_tolerance(1e-1f); + test_cases.push_back(tc); + } + + // Variant 2: linear_coopmat buffer fp32 + // mat2 is constant + buffer -> prepack with buffer -> linear_coopmat + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("cm_fp32_" + cfg.name); + tc.set_operator_name("test_etvk.test_mm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + input_B.set_constant(true); + ValueSpec impl_selector = ValueSpec::make_string("default"); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + tc.add_input_spec(impl_selector); + tc.add_output_spec(output); + tc.set_abs_tolerance(5e-1f); + tc.set_rel_tolerance(5e-1f); + test_cases.push_back(tc); + } + + return test_cases; +} + +int64_t linear_flops(const TestCase& test_case) { + if (test_case.empty() || test_case.num_inputs() < 2) return 0; + const auto& A = test_case.inputs()[0].get_tensor_sizes(); + const auto& B = test_case.inputs()[1].get_tensor_sizes(); + int64_t M = A.at(A.size() - 2); + int64_t K = A.at(A.size() - 1); + int64_t N = B.at(B.size() - 1); + return 2 * M * N * K; +} + +static constexpr int64_t kRefLimit = 2048; + +void linear_reference(TestCase& test_case) { + const ValueSpec& A_spec = test_case.inputs().at(0); + const ValueSpec& B_spec = test_case.inputs().at(1); + ValueSpec& out_spec = test_case.outputs().at(0); + + const auto& A_sizes = A_spec.get_tensor_sizes(); + const auto& B_sizes = B_spec.get_tensor_sizes(); + int64_t M = A_sizes.at(A_sizes.size() - 2); + int64_t K = A_sizes.at(A_sizes.size() - 1); + int64_t N = B_sizes.at(B_sizes.size() - 1); + + if (M > kRefLimit || K > 
kRefLimit || N > kRefLimit) { + std::cerr << "Skipping reference for large matrix (" + << M << "x" << K << "x" << N << ")" << std::endl; + return; + } + + const auto& A_f = A_spec.get_float_data(); + const auto& B_f = B_spec.get_float_data(); + auto& ref = out_spec.get_ref_float_data(); + ref.resize(M * N, 0.0f); + + for (int64_t m = 0; m < M; ++m) + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) + sum += A_f[m * K + k] * B_f[k * N + n]; + ref[m * N + n] = sum; + } +} + +int main(int argc, char* argv[]) { + (void)argc; + (void)argv; + + set_print_output(false); + set_print_latencies(true); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Linear Coopmat vs Vec Microbenchmark" << std::endl; + print_separator(); + + try { + api::context()->initialize_querypool(); + } catch (const std::exception& e) { + std::cerr << "Failed to initialize Vulkan: " << e.what() << std::endl; + return 1; + } + + if (api::context()->adapter_ptr()->supports_cooperative_matrix()) { + std::cout << "Cooperative matrix: SUPPORTED" << std::endl; + queryCooperativeMatrixProperties(); + } else { + std::cout << "Cooperative matrix: NOT supported (buffer tests will use linear_vec)" << std::endl; + } + + auto results = execute_test_cases( + generate_test_cases, + linear_flops, + "LINEAR_COOPMAT_BENCH", + 3, // warmup + 10, // benchmark runs + linear_reference); + + return 0; +} diff --git a/backends/vulkan/test/custom_ops/matmul_coopmat_bench.cpp b/backends/vulkan/test/custom_ops/matmul_coopmat_bench.cpp new file mode 100644 index 00000000000..e7543f98a9a --- /dev/null +++ b/backends/vulkan/test/custom_ops/matmul_coopmat_bench.cpp @@ -0,0 +1,254 @@ +/* + * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under the BSD-style license found in the + * LICENSE file in the root directory of this source tree. 
+ */ + +// Microbenchmark: matmul_coopmat vs matmul_vec (texture3d and buffer). +// +// Uses test_etvk.test_gemm.default which routes to aten.mm.default. +// The shader selected depends on storage type and device capabilities: +// - texture3d storage → matmul_vec (texture path) +// - buffer storage + coop mat device → matmul_coopmat +// - buffer storage + no coop mat → matmul_vec (buffer path) +// +// For each matrix size, runs three variants: +// impl=0: aten.mm buffer (→ matmul_coopmat if coop mat available) +// impl=2: aten.mm texture3d (→ matmul_vec texture) +// impl=4: aten.mm buffer fp16 (→ matmul_coopmat fp16 if coop mat available) + +#include +#include + +#include +#include +#include +#include + +#include "cm_utils.h" +#include "utils.h" + +using namespace executorch::vulkan::prototyping; + +std::vector generate_test_cases() { + std::vector test_cases; + + struct MatmulConfig { + int64_t M, K, N; + std::string name; + }; + + std::vector configs = { + // Attention Q@K^T shapes (single-head) + {512, 64, 512, "attn_QKt_512x64x512"}, + {2048, 128, 2048, "attn_QKt_2048x128x2048"}, + // Attention attn@V + {512, 512, 64, "attn_AV_512x512x64"}, + // BERT-like projection + {256, 768, 3072, "proj_256x768x3072"}, + // Square stress + {256, 256, 256, "sq_256"}, + {512, 512, 512, "sq_512"}, + {1024, 1024, 1024, "sq_1024"}, + {2048, 2048, 2048, "sq_2048"}, + {4096, 4096, 4096, "sq_4096"}, + }; + + // impl=2: matmul_vec texture3d (baseline, fp32) + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("vec_tex_" + cfg.name); + tc.set_operator_name("test_etvk.test_gemm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_C( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + ValueSpec alpha_spec(1.0f); + 
ValueSpec beta_spec(0.0f); + ValueSpec impl_selector_spec(static_cast(2)); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kTexture3D, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + tc.add_input_spec(input_C); + tc.add_input_spec(alpha_spec); + tc.add_input_spec(beta_spec); + tc.add_input_spec(impl_selector_spec); + tc.add_output_spec(output); + tc.set_abs_tolerance(1e-2f); + tc.set_rel_tolerance(1e-1f); + test_cases.push_back(tc); + } + + // impl=0: aten.mm buffer fp32 (→ matmul_coopmat if device supports) + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("cm_fp32_" + cfg.name); + tc.set_operator_name("test_etvk.test_gemm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_C( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + ValueSpec alpha_spec(1.0f); + ValueSpec beta_spec(0.0f); + ValueSpec impl_selector_spec(static_cast(0)); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + tc.add_input_spec(input_C); + tc.add_input_spec(alpha_spec); + tc.add_input_spec(beta_spec); + tc.add_input_spec(impl_selector_spec); + tc.add_output_spec(output); + tc.set_abs_tolerance(5e-1f); + tc.set_rel_tolerance(5e-1f); + test_cases.push_back(tc); + } + + // impl=0: aten.mm buffer fp16 (→ matmul_coopmat fp16 if device supports) + for (const auto& cfg : configs) { + TestCase tc; + tc.set_name("cm_fp16_" + cfg.name); + tc.set_operator_name("test_etvk.test_gemm.default"); + + ValueSpec input_A( + {cfg.M, cfg.K}, vkapi::kHalf, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_B( + {cfg.K, cfg.N}, 
vkapi::kHalf, utils::kBuffer, + utils::kWidthPacked, DataGenType::RANDOM); + ValueSpec input_C( + {cfg.M, cfg.N}, vkapi::kFloat, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + ValueSpec alpha_spec(1.0f); + ValueSpec beta_spec(0.0f); + ValueSpec impl_selector_spec(static_cast(0)); + ValueSpec output( + {cfg.M, cfg.N}, vkapi::kHalf, utils::kBuffer, + utils::kWidthPacked, DataGenType::ZEROS); + + tc.add_input_spec(input_A); + tc.add_input_spec(input_B); + tc.add_input_spec(input_C); + tc.add_input_spec(alpha_spec); + tc.add_input_spec(beta_spec); + tc.add_input_spec(impl_selector_spec); + tc.add_output_spec(output); + tc.set_abs_tolerance(5e-1f); + tc.set_rel_tolerance(5e-1f); + test_cases.push_back(tc); + } + + return test_cases; +} + +int64_t matmul_flops(const TestCase& test_case) { + if (test_case.empty() || test_case.num_inputs() < 2) return 0; + const auto& A = test_case.inputs()[0].get_tensor_sizes(); + const auto& B = test_case.inputs()[1].get_tensor_sizes(); + int64_t M = A.at(A.size() - 2); + int64_t K = A.at(A.size() - 1); + int64_t N = B.at(B.size() - 1); + return 2 * M * N * K; +} + +static constexpr int64_t kRefLimit = 2048; + +void matmul_reference(TestCase& test_case) { + const ValueSpec& A_spec = test_case.inputs().at(0); + const ValueSpec& B_spec = test_case.inputs().at(1); + ValueSpec& out_spec = test_case.outputs().at(0); + + const auto& A_sizes = A_spec.get_tensor_sizes(); + const auto& B_sizes = B_spec.get_tensor_sizes(); + int64_t M = A_sizes.at(A_sizes.size() - 2); + int64_t K = A_sizes.at(A_sizes.size() - 1); + int64_t N = B_sizes.at(B_sizes.size() - 1); + + if (M > kRefLimit || K > kRefLimit || N > kRefLimit) { + std::cerr << "Skipping reference for large matrix (" + << M << "x" << K << "x" << N << ")" << std::endl; + return; + } + + auto& ref = out_spec.get_ref_float_data(); + ref.resize(M * N, 0.0f); + + if (A_spec.dtype == vkapi::kHalf) { + const auto& A_h = A_spec.get_half_data(); + const auto& B_h = B_spec.get_half_data(); 
+ for (int64_t m = 0; m < M; ++m) + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) + sum += half_to_float(A_h[m * K + k]) * half_to_float(B_h[k * N + n]); + ref[m * N + n] = sum; + } + } else { + const auto& A_f = A_spec.get_float_data(); + const auto& B_f = B_spec.get_float_data(); + for (int64_t m = 0; m < M; ++m) + for (int64_t n = 0; n < N; ++n) { + float sum = 0.0f; + for (int64_t k = 0; k < K; ++k) + sum += A_f[m * K + k] * B_f[k * N + n]; + ref[m * N + n] = sum; + } + } +} + +int main(int argc, char* argv[]) { + (void)argc; + (void)argv; + + set_print_output(false); + set_print_latencies(true); + set_use_gpu_timestamps(true); + + print_performance_header(); + std::cout << "Matmul Coopmat vs Vec Microbenchmark" << std::endl; + print_separator(); + + try { + api::context()->initialize_querypool(); + } catch (const std::exception& e) { + std::cerr << "Failed to initialize Vulkan: " << e.what() << std::endl; + return 1; + } + + if (api::context()->adapter_ptr()->supports_cooperative_matrix()) { + std::cout << "Cooperative matrix: SUPPORTED" << std::endl; + queryCooperativeMatrixProperties(); + } else { + std::cout << "Cooperative matrix: NOT supported (buffer tests will use matmul_vec)" << std::endl; + } + + auto results = execute_test_cases( + generate_test_cases, + matmul_flops, + "MATMUL_COOPMAT_BENCH", + 3, // warmup + 10, // benchmark runs + matmul_reference); + + return 0; +}