ThreadGroupTensorSliceTransfer_v4r1_gather< ThreadGroup, SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch > Struct Template Reference#
ck::ThreadGroupTensorSliceTransfer_v4r1_gather< ThreadGroup, SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch > Struct Template Reference
Blockwise data transfer. More...
#include <thread_group_tensor_slice_transfer_v4r1_gather.hpp>
Public Types | |
| using | Index = MultiIndex<nDim> |
Public Member Functions | |
| __device__ constexpr | ThreadGroupTensorSliceTransfer_v4r1_gather (const SrcDesc &src_desc, const Index &src_block_slice_origin, const SrcElementwiseOperation &src_element_op, const DstDesc &dst_desc, const Index &dst_block_slice_origin, const DstElementwiseOperation &dst_element_op, const StaticallyIndexedArray< IndexType, gather_num > &gather_offsets) |
| __device__ void | SetSrcSliceOrigin (const SrcDesc &src_desc, const Index &src_block_slice_origin) |
| template<typename SeqIdx, index_t ThreadScratchId = 0> | |
| __device__ constexpr auto | GetSrcThreadScratchIdx () |
| template<typename SrcBuffer, index_t ThreadScratchId = 0> | |
| __device__ void | RunRead (const SrcDesc &src_desc, const SrcBuffer &src_buf, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{}) |
| template<typename DstBuffer, index_t ThreadScratchId = 0> | |
| __device__ void | RunWrite (const DstDesc &dst_desc, DstBuffer &dst_buf, Number< ThreadScratchId > thread_scratch_id=Number< ThreadScratchId >{}) |
| template<typename SrcBuffer, typename DstBuffer, index_t ThreadScratchId> | |
| __device__ void | Run (const SrcDesc &src_desc, const SrcBuffer &src_buf, const DstDesc &dst_desc, DstBuffer &dst_buf, Number< ThreadScratchId > thread_scratch_id) |
| __device__ void | MoveSrcSliceWindow (const SrcDesc &src_desc, const Index &step) |
| __device__ void | MoveDstSliceWindow (const DstDesc &dst_desc, const Index &step) |
Static Public Attributes | |
| static constexpr auto | I0 = Number<0>{} |
| static constexpr index_t | nDim = remove_reference_t<SrcDesc>::GetNumOfDimension() |
| static constexpr auto | thread_slice_lengths = BlockSliceLengths{} / ThreadClusterLengths{} |
| static constexpr index_t | gather_num = thread_slice_lengths.At(Number<GatherDim>{}) |
Detailed Description
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
struct ck::ThreadGroupTensorSliceTransfer_v4r1_gather< ThreadGroup, SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >
struct ck::ThreadGroupTensorSliceTransfer_v4r1_gather< ThreadGroup, SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >
Blockwise data transfer.
This version does following things to avoid scratch memory issue
- Use StaticallyIndexedArray instead of C array for thread buffer
- ThreadwiseTensorSliceTransfer_v3 does not keep reference to tensor descriptor
- ThreadwiseTensorSliceTransfer_v3::Run() does not construct new tensor coordinate
Member Typedef Documentation
◆ Index
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
| using ck::ThreadGroupTensorSliceTransfer_v4r1_gather< ThreadGroup, SrcElementwiseOperation, DstElementwiseOperation, DstInMemOp, BlockSliceLengths, ThreadClusterLengths, ThreadClusterArrangeOrder, SrcData, DstData, SrcDesc, DstDesc, SrcDimAccessOrder, DstDimAccessOrder, SrcVectorDim, DstVectorDim, SrcScalarPerVector, DstScalarPerVector, SrcScalarStrideInVector, DstScalarStrideInVector, ThreadTransferSrcResetCoordinateAfterRun, ThreadTransferDstResetCoordinateAfterRun, IndexType, GatherDim, NumThreadScratch >::Index = MultiIndex<nDim> |
Constructor & Destructor Documentation
◆ ThreadGroupTensorSliceTransfer_v4r1_gather()
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
|
inlineconstexpr |
Member Function Documentation
◆ GetSrcThreadScratchIdx()
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
template<typename SeqIdx, index_t ThreadScratchId = 0>
|
inlineconstexpr |
◆ MoveDstSliceWindow()
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
|
inline |
◆ MoveSrcSliceWindow()
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
|
inline |
◆ Run()
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
template<typename SrcBuffer, typename DstBuffer, index_t ThreadScratchId>
|
inline |
◆ RunRead()
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
template<typename SrcBuffer, index_t ThreadScratchId = 0>
|
inline |
◆ RunWrite()
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
template<typename DstBuffer, index_t ThreadScratchId = 0>
|
inline |
◆ SetSrcSliceOrigin()
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
|
inline |
Member Data Documentation
◆ gather_num
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
|
staticconstexpr |
◆ I0
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
|
staticconstexpr |
◆ nDim
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
|
staticconstexpr |
◆ thread_slice_lengths
template<typename ThreadGroup, typename SrcElementwiseOperation, typename DstElementwiseOperation, InMemoryDataOperationEnum DstInMemOp, typename BlockSliceLengths, typename ThreadClusterLengths, typename ThreadClusterArrangeOrder, typename SrcData, typename DstData, typename SrcDesc, typename DstDesc, typename SrcDimAccessOrder, typename DstDimAccessOrder, index_t SrcVectorDim, index_t DstVectorDim, index_t SrcScalarPerVector, index_t DstScalarPerVector, index_t SrcScalarStrideInVector, index_t DstScalarStrideInVector, bool ThreadTransferSrcResetCoordinateAfterRun, bool ThreadTransferDstResetCoordinateAfterRun, typename IndexType, index_t GatherDim = 1, index_t NumThreadScratch = 1>
|
staticconstexpr |
The documentation for this struct was generated from the following file: