25 BufferResource<T> wave_buffer_resource;
30 wave_buffer_resource.range(
Number<2>{}) = element_space_size *
sizeof(T);
34 return wave_buffer_resource.content;
45 wave_buffer_resource.range(
Number<2>{}) = 0xffffffff;
49 return wave_buffer_resource.content;
59 int32_t num = element_space_size *
sizeof(T);
62 return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
74 return __builtin_amdgcn_make_buffer_rsrc(p, stride, num, flags);
83 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.v2f16");
91 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.add.i32");
99 index_t glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fadd.f32");
107 int glc_slc) __asm(
"llvm.amdgcn.raw.buffer.atomic.fmax.f64");
133template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
134__device__
typename vector_type<int8_t, N>::type
136 index_t src_thread_addr_offset,
139 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
140 "wrong! not implemented");
144 return __builtin_amdgcn_raw_buffer_load_b8(src_wave_buffer_resource,
145 src_thread_addr_offset,
146 src_wave_addr_offset,
147 static_cast<index_t>(coherence));
149 else if constexpr(N == 2)
152 int16_t tmp = __builtin_amdgcn_raw_buffer_load_b16(src_wave_buffer_resource,
153 src_thread_addr_offset,
154 src_wave_addr_offset,
155 static_cast<index_t>(coherence));
159 else if constexpr(N == 4)
161 int32_t tmp = __builtin_amdgcn_raw_buffer_load_b32(src_wave_buffer_resource,
162 src_thread_addr_offset,
163 src_wave_addr_offset,
164 static_cast<index_t>(coherence));
168 else if constexpr(N == 8)
170 int32x2_t tmp = __builtin_amdgcn_raw_buffer_load_b64(src_wave_buffer_resource,
171 src_thread_addr_offset,
172 src_wave_addr_offset,
173 static_cast<index_t>(coherence));
177 else if constexpr(N == 16)
179 int32x4_t tmp = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
180 src_thread_addr_offset,
181 src_wave_addr_offset,
182 static_cast<index_t>(coherence));
185 else if constexpr(N == 32)
187 int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
188 src_thread_addr_offset,
189 src_wave_addr_offset,
190 static_cast<index_t>(coherence));
192 __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
193 src_thread_addr_offset,
194 src_wave_addr_offset + 4 *
sizeof(
int32_t),
195 static_cast<index_t>(coherence));
203 else if constexpr(N == 64)
205 int32x4_t tmp0 = __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
206 src_thread_addr_offset,
207 src_wave_addr_offset,
208 static_cast<index_t>(coherence));
210 __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
211 src_thread_addr_offset,
212 src_wave_addr_offset + 4 *
sizeof(
int32_t),
213 static_cast<index_t>(coherence));
215 __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
216 src_thread_addr_offset,
217 src_wave_addr_offset + 8 *
sizeof(
int32_t),
218 static_cast<index_t>(coherence));
220 __builtin_amdgcn_raw_buffer_load_b128(src_wave_buffer_resource,
221 src_thread_addr_offset,
222 src_wave_addr_offset + 12 *
sizeof(
int32_t),
223 static_cast<index_t>(coherence));
239__device__
typename vector_type<T, N>::type
241 index_t src_thread_addr_offset,
255 "wrong! not implemented");
259 src_wave_buffer_resource, src_thread_addr_offset, src_wave_addr_offset);
263template <index_t N, AmdBufferCoherenceEnum coherence = AmdBufferCoherenceEnum::DefaultCoherence>
266 __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
267 index_t dst_thread_addr_offset,
270 static_assert(N == 1 || N == 2 || N == 4 || N == 8 || N == 16 || N == 32 || N == 64,
271 "wrong! not implemented");
275 __builtin_amdgcn_raw_buffer_store_b8(src_thread_data,
276 dst_wave_buffer_resource,
277 dst_thread_addr_offset,
278 dst_wave_addr_offset,
279 static_cast<index_t>(coherence));
281 else if constexpr(N == 2)
285 dst_wave_buffer_resource,
286 dst_thread_addr_offset,
287 dst_wave_addr_offset,
288 static_cast<index_t>(coherence));
290 else if constexpr(N == 4)
293 dst_wave_buffer_resource,
294 dst_thread_addr_offset,
295 dst_wave_addr_offset,
296 static_cast<index_t>(coherence));
298 else if constexpr(N == 8)
301 dst_wave_buffer_resource,
302 dst_thread_addr_offset,
303 dst_wave_addr_offset,
304 static_cast<index_t>(coherence));
306 else if constexpr(N == 16)
309 dst_wave_buffer_resource,
310 dst_thread_addr_offset,
311 dst_wave_addr_offset,
312 static_cast<index_t>(coherence));
314 else if constexpr(N == 32)
318 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<0>{}],
319 dst_wave_buffer_resource,
320 dst_thread_addr_offset,
321 dst_wave_addr_offset,
322 static_cast<index_t>(coherence));
324 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<1>{}],
325 dst_wave_buffer_resource,
326 dst_thread_addr_offset,
327 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
328 static_cast<index_t>(coherence));
330 else if constexpr(N == 64)
334 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<0>{}],
335 dst_wave_buffer_resource,
336 dst_thread_addr_offset,
337 dst_wave_addr_offset,
338 static_cast<index_t>(coherence));
340 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<1>{}],
341 dst_wave_buffer_resource,
342 dst_thread_addr_offset,
343 dst_wave_addr_offset +
sizeof(
int32_t) * 4,
344 static_cast<index_t>(coherence));
346 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<2>{}],
347 dst_wave_buffer_resource,
348 dst_thread_addr_offset,
349 dst_wave_addr_offset +
sizeof(
int32_t) * 8,
350 static_cast<index_t>(coherence));
352 __builtin_amdgcn_raw_buffer_store_b128(tmp.template AsType<int32x4_t>()[
Number<3>{}],
353 dst_wave_buffer_resource,
354 dst_thread_addr_offset,
355 dst_wave_addr_offset +
sizeof(
int32_t) * 12,
356 static_cast<index_t>(coherence));
364 __amdgpu_buffer_rsrc_t dst_wave_buffer_resource,
365 index_t dst_thread_addr_offset,
377 (N == 1 || N == 2 || N == 4 || N == 8 || N == 16)) ||
379 "wrong! not implemented");
384 dst_wave_buffer_resource,
385 dst_thread_addr_offset,
386 dst_wave_addr_offset);
389template <
typename T, index_t N>
395 "wrong! not implemented");
399 vector_type<half_t, N> tmp{src_thread_data};
402 tmp.template AsType<half2_t>()[i]);
405#if defined(__gfx942__) || defined(__gfx950__) || defined(__gfx12__)
411 tmp.template AsType<bhalf2_t>()[i]);
417template <
typename T, index_t N>
420 index_t dst_thread_addr_offset,
426 "wrong! not implemented");
433 dst_wave_buffer_resource,
434 dst_thread_addr_offset,
435 dst_wave_addr_offset,
438 else if constexpr(N == 2)
443 dst_wave_buffer_resource,
444 dst_thread_addr_offset,
445 dst_wave_addr_offset,
449 dst_wave_buffer_resource,
450 dst_thread_addr_offset,
451 dst_wave_addr_offset +
sizeof(
float),
454 else if constexpr(N == 4)
459 dst_wave_buffer_resource,
460 dst_thread_addr_offset,
461 dst_wave_addr_offset,
465 dst_wave_buffer_resource,
466 dst_thread_addr_offset,
467 dst_wave_addr_offset +
sizeof(
float),
471 dst_wave_buffer_resource,
472 dst_thread_addr_offset,
473 dst_wave_addr_offset + 2 *
sizeof(
float),
477 dst_wave_buffer_resource,
478 dst_thread_addr_offset,
479 dst_wave_addr_offset + 3 *
sizeof(
float),
488 dst_wave_buffer_resource,
489 dst_thread_addr_offset,
490 dst_wave_addr_offset,
493 else if constexpr(N == 4)
499 dst_wave_buffer_resource,
500 dst_thread_addr_offset,
501 dst_wave_addr_offset + i *
sizeof(
half2_t),
505 else if constexpr(N == 8)
511 dst_wave_buffer_resource,
512 dst_thread_addr_offset,
513 dst_wave_addr_offset + i *
sizeof(
half2_t),
523 dst_wave_buffer_resource,
524 dst_thread_addr_offset,
525 dst_wave_addr_offset,
528 else if constexpr(N == 2)
533 dst_wave_buffer_resource,
534 dst_thread_addr_offset,
535 dst_wave_addr_offset,
539 dst_wave_buffer_resource,
540 dst_thread_addr_offset,
541 dst_wave_addr_offset +
sizeof(
int32_t),
544 else if constexpr(N == 4)
549 dst_wave_buffer_resource,
550 dst_thread_addr_offset,
551 dst_wave_addr_offset,
555 dst_wave_buffer_resource,
556 dst_thread_addr_offset,
557 dst_wave_addr_offset +
sizeof(
int32_t),
561 dst_wave_buffer_resource,
562 dst_thread_addr_offset,
563 dst_wave_addr_offset + 2 *
sizeof(
int32_t),
567 dst_wave_buffer_resource,
568 dst_thread_addr_offset,
569 dst_wave_addr_offset + 3 *
sizeof(
int32_t),
575template <
typename T, index_t N>
578 index_t dst_thread_addr_offset,
582 "wrong! not implemented");
588 dst_wave_buffer_resource,
589 dst_thread_addr_offset,
590 dst_wave_addr_offset,
593 else if constexpr(N == 2)
598 dst_wave_buffer_resource,
599 dst_thread_addr_offset,
600 dst_wave_addr_offset,
604 dst_wave_buffer_resource,
605 dst_thread_addr_offset,
606 dst_wave_addr_offset +
sizeof(
double),
609 else if constexpr(N == 4)
614 dst_wave_buffer_resource,
615 dst_thread_addr_offset,
616 dst_wave_addr_offset,
620 dst_wave_buffer_resource,
621 dst_thread_addr_offset,
622 dst_wave_addr_offset +
sizeof(
double),
626 dst_wave_buffer_resource,
627 dst_thread_addr_offset,
628 dst_wave_addr_offset + 2 *
sizeof(
double),
632 dst_wave_buffer_resource,
633 dst_thread_addr_offset,
634 dst_wave_addr_offset + 3 *
sizeof(
double),
649 index_t src_thread_element_offset,
650 bool src_thread_element_valid,
651 index_t src_element_space_size)
653 const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
656 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
663#if CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK
664 uint32_t src_addr_shift = src_thread_element_valid ? 0 : 0x80000000;
666 src_wave_buffer_resource, src_addr_shift + src_thread_addr_offset, 0);
671 src_wave_buffer_resource, src_thread_addr_offset, 0)};
672 return src_thread_element_valid ? tmp : vector_t(0);
685 index_t src_thread_element_offset,
686 bool src_thread_element_valid,
687 index_t src_element_space_size,
690 const __amdgpu_buffer_rsrc_t src_wave_buffer_resource =
693 index_t src_thread_addr_offset = src_thread_element_offset *
sizeof(T);
701 src_wave_buffer_resource, src_thread_addr_offset, 0)};
703 return src_thread_element_valid ? tmp : vector_t(customized_value);
715 const index_t dst_thread_element_offset,
716 const bool dst_thread_element_valid,
717 const index_t dst_element_space_size)
719 const __amdgpu_buffer_rsrc_t dst_wave_buffer_resource =
722 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
728#if CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK
729 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
731 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
733 if(dst_thread_element_valid)
736 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
745template <
typename T, index_t N>
749 const index_t dst_thread_element_offset,
750 const bool dst_thread_element_valid,
751 const index_t dst_element_space_size)
753 const int32x4_t dst_wave_buffer_resource =
756 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
764 if(dst_thread_element_valid)
767 src_thread_data, p_dst_wave + dst_thread_element_offset);
772#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK
773 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
776 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
778 if(dst_thread_element_valid)
781 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
791template <
typename T, index_t N>
795 const index_t dst_thread_element_offset,
796 const bool dst_thread_element_valid,
797 const index_t dst_element_space_size)
799 const int32x4_t dst_wave_buffer_resource =
802 index_t dst_thread_addr_offset = dst_thread_element_offset *
sizeof(T);
808#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
809 uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x80000000;
812 src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
814 if(dst_thread_element_valid)
817 src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
825 __attribute__((address_space(3)))
uint32_t* lds_ptr,
830 index_t aux) __asm(
"llvm.amdgcn.raw.buffer.load.lds");
833template <
typename T, index_t NumElemsPerThread>
839 const index_t src_element_space_size)
844 constexpr auto bytes_per_thread =
sizeof(T) * NumElemsPerThread;
845#if defined(__gfx950__)
846 constexpr auto dword_bytes = 4;
847 static_assert(bytes_per_thread == dword_bytes || bytes_per_thread == dword_bytes * 3 ||
848 bytes_per_thread == dword_bytes * 4);
849#elif defined(__gfx942__)
850 constexpr auto dword_bytes = 4;
851 static_assert(bytes_per_thread == dword_bytes);
856 const index_t global_offset_bytes = is_valid ? global_offset *
sizeof(T) : 0x80000000;
858#if CK_USE_AMD_LDS_DIRECT_LOAD_INLINE_ASM
859 T* lds_ptr = lds_base_ptr + lds_offset;
860#ifndef CK_CODE_GEN_RTC
861 auto const lds_ptr_sgpr =
862 __builtin_amdgcn_readfirstlane((
reinterpret_cast<uintptr_t>(lds_ptr)));
864 auto const lds_ptr_sgpr = __builtin_amdgcn_readfirstlane((
reinterpret_cast<size_t>(lds_ptr)));
866 asm volatile(
"s_mov_b32 m0, %0; \n\t"
867 "buffer_load_dword %1, %2, 0 offen lds;\n\t" ::
"s"(lds_ptr_sgpr),
868 "v"(global_offset_bytes),
873 __attribute__((address_space(3)))
uint32_t* lds_ptr =
874#ifndef CK_CODE_GEN_RTC
875 reinterpret_cast<__attribute__((address_space(3)))
uint32_t*
>(
876 reinterpret_cast<uintptr_t>(lds_base_ptr + lds_offset));
878 reinterpret_cast<__attribute__((address_space(3)))
uint32_t*
>(
879 reinterpret_cast<size_t>(lds_base_ptr + lds_offset));
883 src_resource, lds_ptr, bytes_per_thread, global_offset_bytes, 0, 0, 0);
#define CK_BUFFER_RESOURCE_3RD_DWORD
Definition ck.hpp:80
__device__ int32x4_t make_wave_buffer_resource_with_default_range(T *p_wave)
Definition utility/amd_buffer_addressing.hpp:38
__device__ void amd_buffer_store(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition utility/amd_buffer_addressing.hpp:894
__device__ void amd_direct_load_global_to_lds(const T *global_base_ptr, const index_t global_offset, T *lds_base_ptr, const index_t lds_offset, const bool is_valid, const index_t src_element_space_size)
Definition utility/amd_buffer_addressing.hpp:1015
__device__ void amd_buffer_atomic_max(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition utility/amd_buffer_addressing.hpp:974
typename detail::StaticallyIndexedArrayImpl< T, N >::type StaticallyIndexedArray
Definition utility/statically_indexed_array.hpp:45
int32_t index_t
Definition ck.hpp:299
__device__ void amd_buffer_store_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:544
AmdBufferCoherenceEnum
Definition utility/amd_buffer_addressing.hpp:295
@ GLC
Definition utility/amd_buffer_addressing.hpp:297
@ SYSTEM_NT1
Definition utility/amd_buffer_addressing.hpp:310
@ WAVE_NT0
Definition utility/amd_buffer_addressing.hpp:303
@ GLC_SLC
Definition utility/amd_buffer_addressing.hpp:299
@ SLC
Definition utility/amd_buffer_addressing.hpp:298
@ DefaultCoherence
Definition utility/amd_buffer_addressing.hpp:296
@ DEVICE_NT1
Definition utility/amd_buffer_addressing.hpp:308
@ SYSTEM_NT0
Definition utility/amd_buffer_addressing.hpp:309
@ GROUP_NT1
Definition utility/amd_buffer_addressing.hpp:306
@ DEVICE_NT0
Definition utility/amd_buffer_addressing.hpp:307
@ GROUP_NT0
Definition utility/amd_buffer_addressing.hpp:305
@ WAVE_NT1
Definition utility/amd_buffer_addressing.hpp:304
__device__ int32x4_t make_wave_buffer_resource(T *p_wave, index_t element_space_size)
Definition utility/amd_buffer_addressing.hpp:23
__device__ void llvm_amdgcn_raw_buffer_load_lds(int32x4_t rsrc, uint32_t *lds_ptr, index_t size, index_t voffset, index_t soffset, index_t offset, index_t aux) __asm("llvm.amdgcn.raw.buffer.load.lds")
integral_constant< index_t, N > Number
Definition number.hpp:12
__device__ void amd_buffer_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:599
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_customized_value(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size, T customized_value)
Definition utility/amd_buffer_addressing.hpp:865
__device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(float vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32")
__device__ void amd_global_atomic_add_impl(const typename vector_type< T, N >::type src_thread_data, T *addr)
Definition utility/amd_buffer_addressing.hpp:571
typename vector_type< int32_t, 4 >::type int32x4_t
Definition dtype_vector.hpp:2168
__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2(half2_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16")
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_new(T *p_wave, index_t element_space_size)
Definition utility/amd_buffer_addressing_builtins.hpp:53
typename vector_type< int32_t, 2 >::type int32x2_t
Definition dtype_vector.hpp:2167
__device__ vector_type< int8_t, N >::type amd_buffer_load_impl_raw(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:315
__device__ void amd_buffer_atomic_add(const typename vector_type_maker< T, N >::type::type src_thread_data, T *p_dst_wave, const index_t dst_thread_element_offset, const bool dst_thread_element_valid, const index_t dst_element_space_size)
Definition utility/amd_buffer_addressing.hpp:928
__device__ double llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata, int32x4_t rsrc, int voffset, int soffset, int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64")
typename vector_type< half_t, 2 >::type half2_t
Definition dtype_vector.hpp:2153
__device__ vector_type_maker< T, N >::type::type amd_buffer_load_invalid_element_return_zero(const T *p_src_wave, index_t src_thread_element_offset, bool src_thread_element_valid, index_t src_element_space_size)
Definition utility/amd_buffer_addressing.hpp:829
__device__ void amd_buffer_atomic_max_impl(const typename vector_type< T, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:757
__device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(int32_t vdata, int32x4_t rsrc, index_t voffset, index_t soffset, index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32")
__device__ vector_type< T, N >::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource, index_t src_thread_addr_offset, index_t src_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:419
__host__ __device__ constexpr Y bit_cast(const X &x)
Definition type.hpp:306
__device__ void amd_buffer_store_impl_raw(const typename vector_type< int8_t, N >::type src_thread_data, int32x4_t dst_wave_buffer_resource, index_t dst_thread_addr_offset, index_t dst_wave_addr_offset)
Definition utility/amd_buffer_addressing.hpp:446
typename remove_cv< T >::type remove_cv_t
Definition type.hpp:295
__device__ __amdgpu_buffer_rsrc_t make_wave_buffer_resource_with_default_range_new(T *p_wave)
Definition utility/amd_buffer_addressing_builtins.hpp:66
signed short int16_t
Definition stdint.h:122
_W64 unsigned int uintptr_t
Definition stdint.h:164
unsigned int uint32_t
Definition stdint.h:126
signed int int32_t
Definition stdint.h:123
signed char int8_t
Definition stdint.h:121
static constexpr value_type value
Definition utility/integral_constant.hpp:13
Definition data_type.hpp:39
Definition functional2.hpp:33
Definition dtype_vector.hpp:30
Definition dtype_vector.hpp:10
Definition utility/amd_buffer_addressing.hpp:11
int32x4_t content
Definition utility/amd_buffer_addressing.hpp:16
StaticallyIndexedArray< int32_t, 4 > config
Definition utility/amd_buffer_addressing.hpp:19
StaticallyIndexedArray< int32_t, 4 > range
Definition utility/amd_buffer_addressing.hpp:18
StaticallyIndexedArray< T *, 2 > address
Definition utility/amd_buffer_addressing.hpp:17
__device__ constexpr BufferResource()
Definition utility/amd_buffer_addressing_builtins.hpp:12