blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp Source File

blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp Source File#

Composable Kernel: blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp Source File
blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2025, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
7
8namespace ck {
9
10// Compute optimized pipeline
11// GlobalPrefetchStages: 2
12// LocalPreFillStages: 1
13// LocalPreFetchStages: 1
14// LocalSharedMemoryBuffer: 1
15
16template <BlockGemmPipelineScheduler BlkGemmPipelineVer,
17 index_t BlockSize,
18 typename ADataType,
19 typename BDataType,
20 typename ComputeDataType,
21 typename AccDataType,
22 typename ATileDesc,
23 typename BTileDesc,
24 typename AMmaTileDesc,
25 typename BMmaTileDesc,
26 index_t ABlockTransferSrcScalarPerVector,
27 index_t BBlockTransferSrcScalarPerVector,
28 index_t MPerBlock,
29 index_t NPerBlock,
30 index_t KPerBlock,
31 index_t MPerXDL,
32 index_t NPerXDL,
33 index_t MRepeat,
34 index_t NRepeat,
35 index_t KPacks>
39
40template <index_t BlockSize,
41 typename ADataType,
42 typename BDataType,
43 typename ComputeDataType,
44 typename AccDataType,
45 typename ATileDesc,
46 typename BTileDesc,
47 typename AMmaTileDesc,
48 typename BMmaTileDesc,
49 index_t ABlockTransferSrcScalarPerVector,
50 index_t BBlockTransferSrcScalarPerVector,
51 index_t MPerBlock,
52 index_t NPerBlock,
53 index_t KPerBlock,
54 index_t MPerXDL,
55 index_t NPerXDL,
56 index_t MRepeat,
57 index_t NRepeat,
58 index_t KPack
59 // ,bool TransposeC //disable transposec right now...
60 >
62 BlockSize,
63 ADataType,
64 BDataType,
65 ComputeDataType,
66 AccDataType,
67 ATileDesc,
68 BTileDesc,
69 AMmaTileDesc,
70 BMmaTileDesc,
71 ABlockTransferSrcScalarPerVector,
72 BBlockTransferSrcScalarPerVector,
73 MPerBlock,
74 NPerBlock,
75 KPerBlock,
76 MPerXDL,
77 NPerXDL,
78 MRepeat,
79 NRepeat,
80 KPack>
82 ADataType,
83 BDataType,
84 ComputeDataType,
85 AccDataType,
86 ATileDesc,
87 BTileDesc,
88 AMmaTileDesc,
89 BMmaTileDesc,
90 ABlockTransferSrcScalarPerVector,
91 BBlockTransferSrcScalarPerVector,
92 MPerBlock,
93 NPerBlock,
94 KPerBlock,
95 MPerXDL,
96 NPerXDL,
97 MRepeat,
98 NRepeat,
99 KPack>
100
101{
103 ADataType,
104 BDataType,
105 ComputeDataType,
106 AccDataType,
107 ATileDesc,
108 BTileDesc,
109 AMmaTileDesc,
110 BMmaTileDesc,
111 ABlockTransferSrcScalarPerVector,
112 BBlockTransferSrcScalarPerVector,
113 MPerBlock,
114 NPerBlock,
115 KPerBlock,
116 MPerXDL,
117 NPerXDL,
118 MRepeat,
119 NRepeat,
120 KPack>;
121 using Base::A_K1;
122 using Base::B_K1;
123 using Base::I0;
124 using Base::I1;
125 using Base::KRepeat;
126 using Base::xdlops_gemm;
127 using typename Base::HotLoopInstList;
128
141
142 using Base::AMmaKStride;
143 using Base::BMmaKStride;
144 using Base::WaveSize;
145
146 static constexpr index_t PrefetchStages = 2;
147 static constexpr index_t PrefillStages = 1;
148 static constexpr index_t GlobalBufferNum = 2;
149
150 template <typename TileDesc_M0_M1_M2_K>
151 __host__ __device__ static constexpr auto MakeAGemmMmaTileDescriptor(const TileDesc_M0_M1_M2_K&)
152 {
153 constexpr index_t M0 = TileDesc_M0_M1_M2_K{}.GetLength(Number<0>{});
154 constexpr index_t M1 = TileDesc_M0_M1_M2_K{}.GetLength(Number<1>{});
155 constexpr index_t M2 = TileDesc_M0_M1_M2_K{}.GetLength(Number<2>{});
156 constexpr index_t K2 = KPack;
157 constexpr index_t K1 = WaveSize / NPerXDL;
158 constexpr index_t K0 = KRepeat;
159
161 TileDesc_M0_M1_M2_K{},
169 }
170
171 static constexpr auto a_block_desc_m0_m1_m2_k0_k1_k2 =
173
174 __host__ __device__ static constexpr bool BlockHasHotloop(index_t num_loop)
175 {
176 return num_loop > PrefetchStages;
177 }
178
179 __host__ __device__ static constexpr TailNumber BlockLoopTailNum(index_t num_loop)
180 {
181 return num_loop % 2 == 0 ? TailNumber::Even : TailNumber::Odd;
182 }
183
184 __device__ static constexpr auto HotLoopScheduler()
185 {
186 constexpr auto num_ds_read_inst_a = HotLoopInstList::A_LDS_Read_Inst_Num;
187 constexpr auto num_buffer_load_inst_a = HotLoopInstList::A_Buffer_Load_Inst_Num;
188 constexpr auto num_buffer_load_inst_b = HotLoopInstList::B_Buffer_Load_Inst_Num;
189
190 // B global
192 ignore = i;
193 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
194 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
195 });
196
197 // A global
199 ignore = i;
200 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
201 __builtin_amdgcn_sched_group_barrier(0x200, 1, 0); // DS write
202 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
203 __builtin_amdgcn_sched_group_barrier(0x020, 1, 0); // VMEM read
204 });
205
206 // A local
207 static_for<0, num_ds_read_inst_a / 2, 1>{}([&](auto i) {
208 ignore = i;
209 __builtin_amdgcn_sched_group_barrier(0x008, 1, 0); // MFMA
210 __builtin_amdgcn_sched_group_barrier(0x100, 2, 0); // DS read
211 });
212 }
213
214 template <bool HasMainLoop,
215 TailNumber TailNum,
216 typename AGridDesc,
217 typename ABlockDesc,
218 typename ABlockTransfer,
219 typename AGridBuffer,
220 typename ABlockBuffer,
221 typename ABlockTransferStep,
222 typename BGridDesc,
223 typename BBlockTransfer,
224 typename BGridBuffer,
225 typename BBlockBuffer,
226 typename BBlockTransferStep,
227 typename CThreadBuffer>
228 __device__ void Run(const AGridDesc& a_grid_desc,
229 const ABlockDesc& a_block_desc,
230 ABlockTransfer& a_blockwise_copy,
231 const AGridBuffer& a_grid_buf,
232 ABlockBuffer& a_block_buf,
233 const ABlockTransferStep& a_block_copy_step,
234 const BGridDesc& b_grid_desc,
235 BBlockTransfer& b_blockwise_copy,
236 const BGridBuffer& b_grid_buf,
237 BBlockBuffer& b_block_buf,
238 const BBlockTransferStep& b_block_copy_step,
239 CThreadBuffer& c_thread_buf,
240 index_t num_loop) const
241 {
242 ignore = b_block_buf;
243 __builtin_amdgcn_sched_barrier(0);
245 a_thread_desc_.GetElementSpaceSize());
247 b_thread_desc_.GetElementSpaceSize());
248
250 b_thread_desc_.GetElementSpaceSize());
251
252 StaticallyIndexedArray<decltype(b_thread_buf), Number<2>{}> b_thread_bufs;
253 constexpr auto b_block_origin_idx = make_tuple(I0, I0, I0, I0);
254
255 StaticallyIndexedArray<decltype(b_thread_dequant_buf), Number<2>{}> b_thread_dequant_bufs;
256
257 // Global prefetch A1 B1
258 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
259 b_blockwise_copy.Run(b_grid_desc,
260 b_grid_buf,
262 b_block_origin_idx,
263 b_thread_bufs(I0));
264
265 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
266 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
267 __builtin_amdgcn_sched_barrier(0);
268
269 // // Local prefill A1
270 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, I0);
271
272 // // Global prefetch A2
273 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, I0);
274 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
275
276 // Local prefetch A1
278 static_for<0, MRepeat, 1>{}([&](auto m0) {
279 static_for<0, KRepeat, 1>{}([&](auto k0) {
281 make_tuple(m0, I0, I0, k0, I0, I0),
282 a_block_buf,
284 make_tuple(m0, I0, I0, k0, I0, I0),
285 a_thread_buf);
286 });
287 });
288 // B VGPR->VGPR dequant
290 b_block_origin_idx,
291 b_thread_bufs(I0),
293 make_tuple(I0, I0, I0, I0),
294 b_thread_dequant_bufs(I0));
295
296 // Initialize C
297 c_thread_buf.Clear();
298
299 __builtin_amdgcn_sched_barrier(0);
300
301 // main body
302 if constexpr(HasMainLoop)
303 {
304 index_t i = 0;
305 do
306 {
307 auto LoopFunc = [&](auto mfma_reg_buf, auto local_read_buf) {
308 b_blockwise_copy.Run(b_grid_desc,
309 b_grid_buf,
311 b_block_origin_idx,
312 b_thread_bufs(local_read_buf));
313 b_blockwise_copy.MoveSrcSliceWindow(b_grid_desc, b_block_copy_step);
314
316 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf, mfma_reg_buf);
317
318 a_blockwise_copy.RunRead(a_grid_desc, a_grid_buf, local_read_buf);
319 a_blockwise_copy.MoveSrcSliceWindow(a_grid_desc, a_block_copy_step);
320
321 static_for<0, MRepeat, 1>{}([&](auto m0) {
322 static_for<0, NRepeat, 1>{}([&](auto n0) {
323 static_for<0, KRepeat, 1>{}([&](auto k0) {
326
327 static_for<0, KPack, 1>{}([&](auto ik) {
328 a_thread_vec.template AsType<ComputeDataType>()(ik) =
329 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
330 make_tuple(m0, I0, I0, k0, I0, ik))>{}];
331 b_thread_vec.template AsType<ComputeDataType>()(ik) =
332 b_thread_dequant_bufs[mfma_reg_buf]
333 [Number<b_thread_desc_.CalculateOffset(
334 make_tuple(n0, I0, k0, ik))>{}];
335 });
336 using mfma_input_type =
337 typename vector_type<ComputeDataType,
338 xdlops_gemm.K1PerXdlops>::type;
339
340 constexpr index_t c_offset =
341 c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
342
343 xdlops_gemm.Run(
344 a_thread_vec.template AsType<mfma_input_type>(),
345 b_thread_vec.template AsType<mfma_input_type>(),
346 c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
347 });
348 });
349 });
350
352
353 static_for<0, MRepeat, 1>{}([&](auto m0) {
354 static_for<0, KRepeat, 1>{}([&](auto k0) {
356 make_tuple(m0, I0, I0, k0, I0, I0),
357 a_block_buf,
359 make_tuple(m0, I0, I0, k0, I0, I0),
360 a_thread_buf);
361 });
362 });
363 // B VGPR->VGPR dequant
365 b_block_origin_idx,
366 b_thread_bufs(local_read_buf),
368 make_tuple(I0, I0, I0, I0),
369 b_thread_dequant_bufs(local_read_buf));
370
372 __builtin_amdgcn_sched_barrier(0);
373 };
374
375 LoopFunc(I0, I1);
376 LoopFunc(I1, I0);
377
378 i += 2;
379 } while(i < (num_loop - 2));
380 }
381 // tail
382 if constexpr(TailNum == TailNumber::Even)
383 {
384 b_blockwise_copy.Run(b_grid_desc,
385 b_grid_buf,
387 b_block_origin_idx,
388 b_thread_bufs(I1));
389
391 a_blockwise_copy.RunWrite(a_block_desc, a_block_buf);
392
393 static_for<0, MRepeat, 1>{}([&](auto m0) {
394 static_for<0, NRepeat, 1>{}([&](auto n0) {
395 static_for<0, KRepeat, 1>{}([&](auto k0) {
398
399 static_for<0, KPack, 1>{}([&](auto ik) {
400 a_thread_vec.template AsType<ComputeDataType>()(ik) =
401 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
402 make_tuple(m0, I0, I0, k0, I0, ik))>{}];
403 b_thread_vec.template AsType<ComputeDataType>()(ik) =
404 b_thread_dequant_bufs[I0][Number<b_thread_desc_.CalculateOffset(
405 make_tuple(n0, I0, k0, ik))>{}];
406 });
407
408 using mfma_input_type =
409 typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
410
411 constexpr index_t c_offset =
412 c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
413
414 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
415 b_thread_vec.template AsType<mfma_input_type>(),
416 c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
417 });
418 });
419 });
420
422
423 static_for<0, MRepeat, 1>{}([&](auto m0) {
424 static_for<0, KRepeat, 1>{}([&](auto k0) {
426 make_tuple(m0, I0, I0, k0, I0, I0),
427 a_block_buf,
429 make_tuple(m0, I0, I0, k0, I0, I0),
430 a_thread_buf);
431 });
432 });
433 // B VGPR->VGPR dequant
435 b_block_origin_idx,
436 b_thread_bufs(I1),
438 make_tuple(I0, I0, I0, I0),
439 b_thread_dequant_bufs(I1));
440
441 __builtin_amdgcn_sched_barrier(0);
442
443 static_for<0, MRepeat, 1>{}([&](auto m0) {
444 static_for<0, NRepeat, 1>{}([&](auto n0) {
445 static_for<0, KRepeat, 1>{}([&](auto k0) {
448
449 static_for<0, KPack, 1>{}([&](auto ik) {
450 a_thread_vec.template AsType<ComputeDataType>()(ik) =
451 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
452 make_tuple(m0, I0, I0, k0, I0, ik))>{}];
453 b_thread_vec.template AsType<ComputeDataType>()(ik) =
454 b_thread_dequant_bufs[I1][Number<b_thread_desc_.CalculateOffset(
455 make_tuple(n0, I0, k0, ik))>{}];
456 });
457
458 using mfma_input_type =
459 typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
460
461 constexpr index_t c_offset =
462 c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
463
464 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
465 b_thread_vec.template AsType<mfma_input_type>(),
466 c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
467 });
468 });
469 });
470 // Let's leak last MFMA block to epilogue region, cover the potential lds-shuffle
471 // latency
472 // __builtin_amdgcn_sched_barrier(0);
473 }
474 else
475 {
476 static_for<0, MRepeat, 1>{}([&](auto m0) {
477 static_for<0, NRepeat, 1>{}([&](auto n0) {
478 static_for<0, KRepeat, 1>{}([&](auto k0) {
481
482 static_for<0, KPack, 1>{}([&](auto ik) {
483 a_thread_vec.template AsType<ComputeDataType>()(ik) =
484 a_thread_buf[Number<a_thread_desc_.CalculateOffset(
485 make_tuple(m0, I0, I0, k0, I0, ik))>{}];
486 b_thread_vec.template AsType<ComputeDataType>()(ik) =
487 b_thread_dequant_bufs[I0][Number<b_thread_desc_.CalculateOffset(
488 make_tuple(n0, I0, k0, ik))>{}];
489 });
490
491 using mfma_input_type =
492 typename vector_type<ComputeDataType, xdlops_gemm.K1PerXdlops>::type;
493
494 constexpr index_t c_offset =
495 c_thread_desc_.CalculateOffset(make_tuple(m0, n0, 0));
496
497 xdlops_gemm.Run(a_thread_vec.template AsType<mfma_input_type>(),
498 b_thread_vec.template AsType<mfma_input_type>(),
499 c_thread_buf.GetVectorTypeReference(Number<c_offset>{}));
500 });
501 });
502 });
503 }
504 }
505
506 protected:
507 // MRepeat MWave MLane KRepeat KLane KPack
508 // KRepeat -> MRepeat-> Mwave->KLane->MLane->KPack
511
513 ComputeDataType,
515 decltype(a_thread_desc_),
518 5,
519 A_K1,
520 A_K1>;
521
523
526
527 static constexpr BTileDesc b_block_desc_n0_n1_k0_k1;
528
530
532
540 Sequence<1, 2, 0, 3>,
541 3,
542 KPack>;
543
546};
547
548} // namespace ck
Definition ck.hpp:268
__host__ __device__ constexpr auto make_static_buffer(Number< N >)
Definition static_buffer.hpp:186
__host__ __device__ constexpr auto make_pass_through_transform(const LowLength &low_length)
Definition multi_index_transform_helper.hpp:12
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
integral_constant< index_t, N > Number
Definition number.hpp:12
TailNumber
Definition blkgemmpipe_scheduler.hpp:31
@ Even
Definition blkgemmpipe_scheduler.hpp:34
@ Odd
Definition blkgemmpipe_scheduler.hpp:33
constexpr detail::ignore_t ignore
Definition utility/ignore.hpp:20
BlockGemmPipelineScheduler
Definition blkgemmpipe_scheduler.hpp:25
@ Intrawave
Definition blkgemmpipe_scheduler.hpp:26
__host__ __device__ constexpr auto make_naive_tensor_descriptor_packed(const Tuple< Lengths... > &lengths)
Definition tensor_descriptor_helper.hpp:101
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr auto transform_tensor_descriptor(const OldTensorDescriptor &old_tensor_desc, const NewTransforms &new_transforms, NewLowerDimensionOldVisibleIdss, NewUpperDimensionNewVisibleIdss)
Definition tensor_description/tensor_descriptor.hpp:319
__device__ void block_sync_lds()
Definition synchronization.hpp:16
__host__ __device__ constexpr auto make_unmerge_transform(const UpLengths &up_lengths, integral_constant< bool, Use24BitIntegerCalculation >=integral_constant< bool, false >{})
Definition multi_index_transform_helper.hpp:90
__host__ __device__ BlockwiseGemmXdlops_pipeline_base(Tuple4 a_origin=CalculateAThreadOriginDataIndex(), Tuple4 b_origin=CalculateBThreadOriginDataIndex())
Constructor for BlockwiseGemmXdlops_pipeline_base.
Definition blockwise_gemm_pipeline_xdlops_base.hpp:222
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:280
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_N2_N3_N4()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:239
static constexpr auto c_thread_desc_
Definition blockwise_gemm_pipeline_xdlops_base.hpp:378
static constexpr auto xdlops_gemm
Definition blockwise_gemm_pipeline_xdlops_base.hpp:54
static constexpr auto I1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:37
__host__ static __device__ constexpr auto GetCThreadDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:266
__host__ static __device__ constexpr auto GetCBlockDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:294
static constexpr index_t AMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:60
static __device__ auto CalculateAThreadOriginDataIndex6D()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:136
static constexpr index_t WaveSize
Definition blockwise_gemm_pipeline_xdlops_base.hpp:46
__host__ static __device__ constexpr auto GetCThreadDescriptor_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:253
static constexpr index_t B_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:51
ck::BlockwiseGemmXdlops_pipeline_hotloop_inst< BlockSize, MPerBlock, NPerBlock, KPerBlock, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, A_K1, B_K1, A_K1, B_K1, MRepeat, NRepeat, MPerXDL, NPerXDL, xdlops_gemm.KPerXdlops > HotLoopInstList
Definition blockwise_gemm_pipeline_xdlops_base.hpp:82
__host__ __device__ constexpr auto & GetCThreadBuffer()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:111
static constexpr auto I0
Definition blockwise_gemm_pipeline_xdlops_base.hpp:36
static __device__ auto CalculateCThreadOriginDataIndex(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:160
static __device__ auto CalculateCThreadOriginDataIndex8D(Number< m0 >, Number< n0 >, Number< xdlops_i >, Number< blk_i >)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:189
static constexpr index_t KRepeat
Definition blockwise_gemm_pipeline_xdlops_base.hpp:64
static constexpr AMmaTileDesc a_block_desc_m0_m1_m2_k
Definition blockwise_gemm_pipeline_xdlops_base.hpp:359
static constexpr index_t A_K1
Definition blockwise_gemm_pipeline_xdlops_base.hpp:50
static constexpr index_t BMmaKStride
Definition blockwise_gemm_pipeline_xdlops_base.hpp:61
__host__ static __device__ constexpr auto MakeCGridDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_G_M_N &c_grid_desc_g_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:341
__host__ static __device__ constexpr auto GetCBlockDescriptor_G_M0_N0_M1_N1_M2_M3_M4_N2()
Definition blockwise_gemm_pipeline_xdlops_base.hpp:307
__host__ static __device__ constexpr auto MakeCGridDescriptor_M0_N0_M1_N1_M2_M3_M4_N2(const CGridDesc_M_N &c_grid_desc_m_n)
Definition blockwise_gemm_pipeline_xdlops_base.hpp:324
__device__ void Run(const AGridDesc &a_grid_desc, const ABlockDesc &a_block_desc, ABlockTransfer &a_blockwise_copy, const AGridBuffer &a_grid_buf, ABlockBuffer &a_block_buf, const ABlockTransferStep &a_block_copy_step, const BGridDesc &b_grid_desc, BBlockTransfer &b_blockwise_copy, const BGridBuffer &b_grid_buf, BBlockBuffer &b_block_buf, const BBlockTransferStep &b_block_copy_step, CThreadBuffer &c_thread_buf, index_t num_loop) const
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp:228
ThreadwiseTensorSliceTransfer_StaticToStatic< BDataType, ComputeDataType, decltype(b_block_desc_n0_n1_k0_k1), decltype(b_block_desc_n0_n1_k0_k1), tensor_operation::element_wise::PassThrough, Sequence< Number< NRepeat >{}, I1, Number< KRepeat >{}, Number< KPack >{}>, Sequence< 1, 2, 0, 3 >, 3, KPack > BThreadDequantCopy
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp:533
ThreadwiseTensorSliceTransfer_v4< ADataType, ComputeDataType, decltype(a_block_desc_m0_m1_m2_k0_k1_k2), decltype(a_thread_desc_), Sequence< 1, 1, 1, 1, 1, KPack >, Sequence< 0, 1, 2, 3, 4, 5 >, 5, A_K1, A_K1 > AThreadCopy
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp:512
BlockwiseGemmXdlops_pipeline_base< BlockSize, ADataType, BDataType, ComputeDataType, AccDataType, ATileDesc, BTileDesc, AMmaTileDesc, BMmaTileDesc, ABlockTransferSrcScalarPerVector, BBlockTransferSrcScalarPerVector, MPerBlock, NPerBlock, KPerBlock, MPerXDL, NPerXDL, MRepeat, NRepeat, KPack > Base
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp:102
Definition blockwise_gemm_pipeline_xdlops_b_preshuffle_dequant_v1.hpp:37
Definition utility/sequence.hpp:43
Threadwise data transfer.
Definition threadwise_tensor_slice_transfer.hpp:1720
Definition threadwise_tensor_slice_transfer.hpp:1260
Definition functional2.hpp:33
Definition tensor_operation/gpu/element/unary_element_wise_operation.hpp:340
Definition dtype_vector.hpp:10