14template <index_t MNXdlPerWave, index_t MNWaves, index_t MNPerXdl,
typename TileDesc_K0_MN_K1>
15__host__ __device__
static constexpr auto
16MakeGemmMmaTileDescriptor_MN0_MN1_MN2_K(
const TileDesc_K0_MN_K1&)
34 typename AK0MK1BlockDesc,
35 typename BK0NK1BlockDesc,
41 typename ComputeTypeA = FloatA,
42 typename ComputeTypeB = FloatB>
59 BK0NK1BlockDesc{}.GetLength(
I0) * BK0NK1BlockDesc{}.GetLength(
I2);
89 return threadid_to_wave_idx_adaptor.CalculateBottomIndex(
make_multi_index(thread_id));
95 const auto waveId_m = wave_idx[
I0];
96 const auto xdlops_a_idx =
xdlops_gemm.CalculateAThreadOriginDataIndex();
104 const auto waveId_n = wave_idx[
I1];
105 const auto xdlops_b_idx =
xdlops_gemm.CalculateBThreadOriginDataIndex();
110 template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
111 __device__
static auto
115 const auto waveId_m = wave_idx[
I0];
116 const auto waveId_n = wave_idx[
I1];
118 const auto blk_idx =
xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i);
130 const index_t c_thread_m = mrepeat_mwave_mperxdl_to_m_adaptor.CalculateBottomIndex(
132 const index_t c_thread_n = nrepeat_nwave_nperxdl_to_n_adaptor.CalculateBottomIndex(
138 template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
139 __device__
static auto
143 const auto waveId_m = wave_idx[
I0];
144 const auto waveId_n = wave_idx[
I1];
146 const auto blk_idx =
xdlops_gemm.GetBeginOfThreadBlk4D(xdlops_i, blk_i);
160#if defined(__HIP_DEVICE_COMPILE__)
161 static_assert(AK0MK1BlockDesc::IsKnownAtCompileTime() &&
162 BK0NK1BlockDesc::IsKnownAtCompileTime(),
163 "wrong! Desc should be known at compile-time");
166 "ThisThreadBlock::GetNumOfThread() != MWaves * NWaves * WaveSize\n");
168 static_assert(
MPerBlock % (MPerXDL * MRepeat) == 0,
169 "MPerBlock must be divisible by MPerXDL * MRepeat");
170 static_assert(
NPerBlock % (NPerXDL * NRepeat) == 0,
171 "NPerBlock must be divisible by NPerXDL * NRepeat");
174 KPack % (16 *
sizeof(ComputeTypeA)) == 0,
175 "KPack must be divisbile by number of elements processed in single smfmac instruction");
181 constexpr auto c_m0_m1_m2_n_tblk_lens =
xdlops_gemm.GetCM0M1M2NThreadBlkLengths();
183 constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[
I0];
184 constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[
I1];
185 constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[
I2];
186 constexpr auto N = c_m0_m1_m2_n_tblk_lens[
I3];
194 constexpr auto c_m0_m1_m2_n_tblk_lens =
xdlops_gemm.GetCM0M1M2NThreadBlkLengths();
196 constexpr auto M0 = c_m0_m1_m2_n_tblk_lens[
I0];
197 constexpr auto M1 = c_m0_m1_m2_n_tblk_lens[
I1];
198 constexpr auto M2 = c_m0_m1_m2_n_tblk_lens[
I2];
199 constexpr auto N = c_m0_m1_m2_n_tblk_lens[
I3];
207 constexpr auto c_block_desc_m0_n0_m1_n1_m2_n2 =
215 return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_block_desc_m0_n0_m1_n1_m2_n2);
220 constexpr auto c_block_desc_g_m0_n0_m1_n1_m2_n2 =
229 return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(
230 c_block_desc_g_m0_n0_m1_n1_m2_n2);
233 template <
typename CGr
idDesc_M_N>
234 __host__ __device__
static constexpr auto
237 const auto M = c_grid_desc_m_n.GetLength(
I0);
238 const auto N = c_grid_desc_m_n.GetLength(
I1);
247 return xdlops_gemm.MakeCDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(c_grid_desc_m0_n0_m1_n1_m2_n2);
250 template <
typename CGr
idDesc_G_M_N>
251 __host__ __device__
static constexpr auto
254 const auto G = c_grid_desc_g_m_n.GetLength(
I0);
255 const auto M = c_grid_desc_g_m_n.GetLength(
I1);
256 const auto N = c_grid_desc_g_m_n.GetLength(
I2);
266 return xdlops_gemm.MakeCDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(
267 c_grid_desc_g_m0_n0_m1_n1_m2_n2);
300 template <
typename AThreadBuf,
typename IdxBuf,
int32_t num_elems>
303 static constexpr int32_t bit_clear_masks[4] = {0b11, 0b1100, 0b110000, 0b11000000};
304 static constexpr int32_t processed_elems = 16 /
sizeof(ComputeTypeA);
307 constexpr int idx_reg_num = i / (16 *
sizeof(ComputeTypeA));
308 constexpr int idx_reg_part = (i % 32) / processed_elems;
312 a_thread_vec.template AsType<ComputeTypeA>()(j) = a_thread_buf
317 for(
int j = 0; j < processed_elems; j += 4)
319 int32_t a_pos = idx_reg_part * processed_elems + j;
321 ComputeTypeA nonzero_elems[2] = {a_thread_vec[j + 2], a_thread_vec[j + 3]};
322 for(
int k = 0; k < 3; k += 1)
324 if(a_thread_vec[j + k] != 0.0f)
326 nonzero_elems[nonzero_pos] = a_thread_vec[j + k];
327 idx &= ~bit_clear_masks[j / 2 + nonzero_pos];
328 idx |= k << 2 * (j / 2 + nonzero_pos);
332 a_thread_vec[j / 2] = nonzero_elems[0];
333 a_thread_vec[j / 2 + 1] = nonzero_elems[1];
337 static_for<0, processed_elems / 2, 1>{}([&](
auto j) {
339 make_tuple(0, 0, 0, i / 2 + j))>{}] = a_thread_vec[j];
344 template <
typename ABlockBuffer,
typename BBlockBuffer,
typename CThreadBuffer>
345 __device__
void Run(
const ABlockBuffer& a_block_buf,
346 const BBlockBuffer& b_block_buf,
347 CThreadBuffer& c_thread_buf)
const
353 static constexpr int32_t elems_per_idx = 16 *
sizeof(ComputeTypeA);
355 (
a_thread_desc_.GetElementSpaceSize() + elems_per_idx - 1) / elems_per_idx);
384 a_thread_vec.template AsType<ComputeTypeA>()(i) =
390 b_thread_vec.template AsType<ComputeTypeB>()(2 * i) = b_thread_buf
394 static_for<0, KPack / elems_per_idx, 1>{}([&](
auto i) {
395 idx_vec.template AsType<int32_t>()(i) = idx_buf[k / elems_per_idx + i];
399 using mfma_input_type_a =
401 using mfma_input_type_b =
408 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type_a>(),
409 b_thread_vec.template AsType<mfma_input_type_b>(),
410 idx_vec.template AsType<mfma_input_type_idx>(),
__host__ __device__ constexpr auto make_multi_index(Xs &&... xs)
Definition array_multi_index.hpp:15
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
int32_t index_t
Definition ck.hpp:299
__host__ __device__ constexpr auto make_single_stage_tensor_adaptor(const Transforms &transforms, LowerDimensionOldTopIdss, UpperDimensionNewTopIdss)
Definition tensor_description/tensor_adaptor.hpp:425
typename vector_type< int8_t, 4 >::type int8x4_t
Definition dtype_vector.hpp:2177
integral_constant< index_t, N > Number
Definition number.hpp:12
@ Vgpr
Definition amd_address_space.hpp:20
__host__ __device__ constexpr auto make_merge_transform(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:55
__host__ __device__ constexpr auto make_merge_transform_v3_division_mod(const LowLengths &low_lengths)
Definition multi_index_transform_helper.hpp:84
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
signed int int32_t
Definition stdint.h:123
unsigned char uint8_t
Definition stdint.h:124
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::KPerBlock static constexpr index_t KPerBlock
Definition blockwise_gemm_smfmac_xdlops.hpp:58
__host__ static __device__ constexpr auto MakeABlockDescriptor_M0_M1_M2_K()
Definition blockwise_gemm_smfmac_xdlops.hpp:270
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::A_K1 static constexpr index_t A_K1
Definition blockwise_gemm_smfmac_xdlops.hpp:63
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::c_thread_desc_ static constexpr auto c_thread_desc_
Definition blockwise_gemm_smfmac_xdlops.hpp:427
__host__ __device__ BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1()
Definition blockwise_gemm_smfmac_xdlops.hpp:158
__host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_smfmac_xdlops.hpp:78
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::I2 static constexpr auto I2
Definition blockwise_gemm_smfmac_xdlops.hpp:47
__host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_smfmac_xdlops.hpp:252
static __device__ auto CalculateBThreadOriginDataIndex()
Definition blockwise_gemm_smfmac_xdlops.hpp:101
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::WaveSize static constexpr index_t WaveSize
Definition blockwise_gemm_smfmac_xdlops.hpp:54
__host__ static __device__ constexpr auto MakeBBlockDescriptor_N0_N1_N2_K()
Definition blockwise_gemm_smfmac_xdlops.hpp:282
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::KPerThread static constexpr index_t KPerThread
Definition blockwise_gemm_smfmac_xdlops.hpp:69
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::B_K1 static constexpr index_t B_K1
Definition blockwise_gemm_smfmac_xdlops.hpp:64
static __device__ auto CalculateAThreadOriginDataIndex()
Definition blockwise_gemm_smfmac_xdlops.hpp:92
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::MPerBlock static constexpr index_t MPerBlock
Definition blockwise_gemm_smfmac_xdlops.hpp:56
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::c_thread_buf_ StaticBufferTupleOfVector< AddressSpaceEnum::Vgpr, FloatAcc, MRepeat *NRepeat, xdlops_gemm.GetRegSizePerXdlops(), true > c_thread_buf_
Definition blockwise_gemm_smfmac_xdlops.hpp:76
__host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_smfmac_xdlops.hpp:218
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::b_block_desc_n0_n1_n2_k static constexpr auto b_block_desc_n0_n1_n2_k
Definition blockwise_gemm_smfmac_xdlops.hpp:295
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::NPerBlock static constexpr index_t NPerBlock
Definition blockwise_gemm_smfmac_xdlops.hpp:57
static __device__ auto CalculateCThreadOriginDataIndex8D(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_smfmac_xdlops.hpp:140
__host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_smfmac_xdlops.hpp:192
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_smfmac_xdlops.hpp:205
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::I0 static constexpr auto I0
Definition blockwise_gemm_smfmac_xdlops.hpp:45
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::a_thread_desc_ static constexpr auto a_thread_desc_
Definition blockwise_gemm_smfmac_xdlops.hpp:419
__device__ void SetIdxSqueezeA(AThreadBuf &a_thread_buf, IdxBuf &idx_buf)
Definition blockwise_gemm_smfmac_xdlops.hpp:301
ThreadwiseTensorSliceTransfer_v4< FloatB, ComputeTypeB, decltype(b_block_desc_n0_n1_n2_k), decltype(b_thread_desc_), Sequence< 1, 1, 1, KPerThread >, Sequence< 0, 1, 2, 3 >, 3, B_K1, B_K1 > BThreadCopy
Definition blockwise_gemm_smfmac_xdlops.hpp:440
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::b_thread_copy_ BThreadCopy b_thread_copy_
Definition blockwise_gemm_smfmac_xdlops.hpp:451
ThisThreadBlock< BlockSize > ThisThreadBlock
Definition blockwise_gemm_smfmac_xdlops.hpp:50
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_smfmac_xdlops.hpp:235
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::a_block_desc_m0_m1_m2_k static constexpr auto a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_smfmac_xdlops.hpp:294
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::a_thread_copy_ AThreadCopy a_thread_copy_
Definition blockwise_gemm_smfmac_xdlops.hpp:450
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::NWaves static constexpr index_t NWaves
Definition blockwise_gemm_smfmac_xdlops.hpp:53
ThreadwiseTensorSliceTransfer_v4< FloatA, ComputeTypeA, decltype(a_block_desc_m0_m1_m2_k), decltype(a_thread_desc_), Sequence< 1, 1, 1, KPerThread >, Sequence< 0, 1, 2, 3 >, 3, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_smfmac_xdlops.hpp:430
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::xdlops_gemm static constexpr auto xdlops_gemm
Definition blockwise_gemm_smfmac_xdlops.hpp:66
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::B_K0 static constexpr index_t B_K0
Definition blockwise_gemm_smfmac_xdlops.hpp:62
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::b_thread_desc_ static constexpr auto b_thread_desc_
Definition blockwise_gemm_smfmac_xdlops.hpp:423
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::A_K0 static constexpr index_t A_K0
Definition blockwise_gemm_smfmac_xdlops.hpp:61
__device__ void Run(const ABlockBuffer &a_block_buf, const BBlockBuffer &b_block_buf, CThreadBuffer &c_thread_buf) const
Definition blockwise_gemm_smfmac_xdlops.hpp:345
static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_smfmac_xdlops.hpp:112
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::I3 static constexpr auto I3
Definition blockwise_gemm_smfmac_xdlops.hpp:48
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::I1 static constexpr auto I1
Definition blockwise_gemm_smfmac_xdlops.hpp:46
static __device__ auto GetWaveIdx()
Definition blockwise_gemm_smfmac_xdlops.hpp:80
ck::BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1< BlockSize, FloatA, FloatB, FloatAcc, AK0MK1BlockDesc, BK0NK1BlockDesc, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack, ComputeTypeA, ComputeTypeB >::MWaves static constexpr index_t MWaves
Definition blockwise_gemm_smfmac_xdlops.hpp:52
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_smfmac_xdlops.hpp:179
Definition utility/sequence.hpp:43
Definition smfmac_xdlops_gemm.hpp:215
Definition static_buffer.hpp:75
static __device__ constexpr index_t GetNumOfThread()
Definition thread_group.hpp:15
static __device__ index_t GetThreadId()
Definition thread_group.hpp:19
Definition threadwise_tensor_slice_transfer.hpp:1260
Definition functional2.hpp:33
Definition dtype_vector.hpp:10