magic_division.hpp Source File

magic_division.hpp Source File#

Composable Kernel: magic_division.hpp Source File
magic_division.hpp
Go to the documentation of this file.
1// SPDX-License-Identifier: MIT
2// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
3
4#pragma once
5
6#include "ck/ck.hpp"
7#include "numeric_limits.hpp"
9#include "number.hpp"
10#include "type.hpp"
11#include "tuple.hpp"
12
13namespace ck {
14
15// magic number division
16// Caution:
17// 1. For uint32_t as dividend: magic number division implementation being used would produce
18// correct result if the dividend is uint32_t and its value is within 31-bit value range.
19// 2. For int32_t as dividendd: magic number division for int32_t dividened has not been
20// implemented, the int32_t dividend would be bit-wise interpreted as uint32_t and magic number
21// division implementation for uint32_t is then used. Therefore, dividend value need to be
22// non-negative.
23// TODO:
24// 1. Implement magic number divison for int32_t
25// 2. Implement magic number divison for unit32_t with 32-bit value range
27{
28 // uint32_t
29 __host__ __device__ static constexpr auto CalculateMagicNumbers(uint32_t divisor)
30 {
31 // WARNING: magic division is only applicable for division inside this range.
32 // You should use the return value of CalculateMagicNumbers, if division is not inside this
33 // range. The "else" logic below is to quiet down run-time error.
34 if(divisor >= 1 && divisor <= ck::NumericLimits<int32_t>::Max())
35 {
36 uint32_t shift = 0;
37 for(shift = 0; shift < 32; ++shift)
38 {
39 if((1U << shift) >= divisor)
40 {
41 break;
42 }
43 }
44
45 uint64_t one = 1;
46 uint64_t multiplier = ((one << 32) * ((one << shift) - divisor)) / divisor + 1;
47 // assert(multiplier <= 0xffffffffUL);
48
49 return make_tuple(uint32_t(multiplier), shift);
50 }
51 else
52 {
53 return make_tuple(uint32_t(0), uint32_t(0));
54 }
55 }
56
57 __host__ __device__ static constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)
58 {
59 auto tmp = CalculateMagicNumbers(divisor);
60
61 return tmp[Number<0>{}];
62 }
63
64 __host__ __device__ static constexpr uint32_t CalculateMagicShift(uint32_t divisor)
65 {
66 auto tmp = CalculateMagicNumbers(divisor);
67
68 return tmp[Number<1>{}];
69 }
70
71 // integral_constant<uint32_t, .>
72 template <uint32_t Divisor>
73 __host__ __device__ static constexpr auto
75 {
76 constexpr auto tmp = CalculateMagicNumbers(uint32_t{Divisor});
77
78 constexpr uint32_t multiplier = tmp[Number<0>{}];
79 constexpr uint32_t shift = tmp[Number<1>{}];
80
83 }
84
85 template <uint32_t Divisor>
86 __host__ __device__ static constexpr auto
93
94 template <uint32_t Divisor>
95 __host__ __device__ static constexpr auto
102
103 // integral_constant<int32_t, .>
104 template <int32_t Divisor>
105 __host__ __device__ static constexpr auto
110
111 template <int32_t Divisor>
112 __host__ __device__ static constexpr auto
117
118 template <int32_t Divisor>
119 __host__ __device__ static constexpr auto
124
125 // magic division for uint32_t
126 __device__ static constexpr uint32_t
127 DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
128 {
129 uint32_t tmp = __umulhi(dividend, multiplier);
130 return (tmp + dividend) >> shift;
131 }
132
133 __host__ static constexpr uint32_t
134 DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
135 {
136 uint32_t tmp = static_cast<uint64_t>(dividend) * multiplier >> 32;
137 return (tmp + dividend) >> shift;
138 }
139
140 // magic division for int32_t
141 // HACK: use dividend_i32 as if it's uint32_t, dividend_i32 need to be
142 // non-negative for result to be correct
143 // TODO: figure out how to do magic number divison for int32_t as dividended
144 __device__ static constexpr int32_t
145 DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
146 {
147 uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
148 uint32_t tmp = __umulhi(dividend_u32, multiplier);
149 return (tmp + dividend_u32) >> shift;
150 }
151
152 __host__ static constexpr int32_t
153 DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
154 {
155 uint32_t dividend_u32 = bit_cast<uint32_t>(dividend_i32);
156 uint32_t tmp = static_cast<uint64_t>(dividend_u32) * multiplier >> 32;
157 return (tmp + dividend_u32) >> shift;
158 }
159};
160
161struct MDiv
162{
163 // 1 dword -> 3 dword storage
166 uint32_t shift; // TODO: 8 bit is enough
167
168 // prefer construct on host
169 __host__ __device__ MDiv(uint32_t divisor_) : divisor(divisor_)
170 {
171 auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
172
173 multiplier = tmp[Number<0>{}];
174 shift = tmp[Number<1>{}];
175 }
176
177 __host__ __device__ MDiv() : divisor(0), multiplier(0), shift(0) {}
178
179 __host__ __device__ void update(uint32_t divisor_)
180 {
181 divisor = divisor_;
182 auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
183
184 multiplier = tmp[Number<0>{}];
185 shift = tmp[Number<1>{}];
186 }
187
188 __host__ __device__ uint32_t div(uint32_t dividend_) const
189 {
191 }
192
193 __host__ __device__ void
194 divmod(uint32_t dividend_, uint32_t& quotient_, uint32_t& remainder_) const
195 {
196 quotient_ = div(dividend_);
197 remainder_ = dividend_ - (quotient_ * divisor);
198 }
199
200 __host__ __device__ uint32_t get() const { return divisor; }
201};
202
203struct MDiv2
204{
205 // 1 dword -> 2 dword storage, divisor need compute from runtime
207 uint32_t shift; // TODO: 8 bit is enough
208
209 // prefer construct on host
210 __host__ __device__ MDiv2(uint32_t divisor_)
211 {
212 auto tmp = MagicDivision::CalculateMagicNumbers(divisor_);
213
214 multiplier = tmp[Number<0>{}];
215 shift = tmp[Number<1>{}];
216 }
217
218 __host__ __device__ MDiv2() : multiplier(0), shift(0) {}
219
220 __host__ __device__ uint32_t div(uint32_t dividend_) const
221 {
223 }
224
225 __host__ __device__ void
226 divmod(uint32_t dividend_, uint32_t divisor_, uint32_t& quotient_, uint32_t& remainder_) const
227 {
228 quotient_ = div(dividend_);
229 remainder_ = dividend_ - (quotient_ * divisor_);
230 }
231};
232
233} // namespace ck
Definition ck.hpp:268
integral_constant< index_t, N > Number
Definition number.hpp:12
__host__ __device__ constexpr auto make_tuple(Xs &&... xs)
Definition utility/tuple.hpp:211
__host__ __device__ constexpr Y bit_cast(const X &x)
Definition type.hpp:306
unsigned int uint32_t
Definition stdint.h:126
signed int int32_t
Definition stdint.h:123
unsigned __int64 uint64_t
Definition stdint.h:136
__host__ __device__ MDiv2(uint32_t divisor_)
Definition magic_division.hpp:210
uint32_t shift
Definition magic_division.hpp:207
uint32_t multiplier
Definition magic_division.hpp:206
__host__ __device__ void divmod(uint32_t dividend_, uint32_t divisor_, uint32_t &quotient_, uint32_t &remainder_) const
Definition magic_division.hpp:226
__host__ __device__ uint32_t div(uint32_t dividend_) const
Definition magic_division.hpp:220
__host__ __device__ MDiv2()
Definition magic_division.hpp:218
__host__ __device__ MDiv()
Definition magic_division.hpp:177
uint32_t divisor
Definition magic_division.hpp:164
__host__ __device__ uint32_t get() const
Definition magic_division.hpp:200
__host__ __device__ void divmod(uint32_t dividend_, uint32_t &quotient_, uint32_t &remainder_) const
Definition magic_division.hpp:194
__host__ __device__ void update(uint32_t divisor_)
Definition magic_division.hpp:179
__host__ __device__ MDiv(uint32_t divisor_)
Definition magic_division.hpp:169
uint32_t multiplier
Definition magic_division.hpp:165
uint32_t shift
Definition magic_division.hpp:166
__host__ __device__ uint32_t div(uint32_t dividend_) const
Definition magic_division.hpp:188
Definition magic_division.hpp:27
static __host__ constexpr int32_t DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
Definition magic_division.hpp:153
__host__ static __device__ constexpr auto CalculateMagicNumbers(integral_constant< uint32_t, Divisor >)
Definition magic_division.hpp:74
__host__ static __device__ constexpr auto CalculateMagicMultiplier(integral_constant< int32_t, Divisor >)
Definition magic_division.hpp:113
__host__ static __device__ constexpr uint32_t CalculateMagicShift(uint32_t divisor)
Definition magic_division.hpp:64
__host__ static __device__ constexpr auto CalculateMagicNumbers(integral_constant< int32_t, Divisor >)
Definition magic_division.hpp:106
static __host__ constexpr uint32_t DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
Definition magic_division.hpp:134
__host__ static __device__ constexpr auto CalculateMagicShift(integral_constant< uint32_t, Divisor >)
Definition magic_division.hpp:96
__host__ static __device__ constexpr auto CalculateMagicNumbers(uint32_t divisor)
Definition magic_division.hpp:29
__host__ static __device__ constexpr auto CalculateMagicMultiplier(integral_constant< uint32_t, Divisor >)
Definition magic_division.hpp:87
static __device__ constexpr uint32_t DoMagicDivision(uint32_t dividend, uint32_t multiplier, uint32_t shift)
Definition magic_division.hpp:127
__host__ static __device__ constexpr auto CalculateMagicShift(integral_constant< int32_t, Divisor >)
Definition magic_division.hpp:120
__host__ static __device__ constexpr uint32_t CalculateMagicMultiplier(uint32_t divisor)
Definition magic_division.hpp:57
static __device__ constexpr int32_t DoMagicDivision(int32_t dividend_i32, uint32_t multiplier, uint32_t shift)
Definition magic_division.hpp:145
__host__ static __device__ constexpr T Max()
Definition numeric_limits.hpp:311
Definition utility/integral_constant.hpp:20