We use __fls() to find the most significant bit. Using that, the loop can be avoided. A second trick is to use the behaviour of the rotate instructions to expand the range of the unsigned int to float conversion to the full 32 bits in a branchless way.
The routine is now exact up to 2^24. Above that, we truncate which is equivalent to rounding towards zero.
Signed-off-by: Steven Fuerst svfuerst@gmail.com --- drivers/gpu/drm/radeon/r600_blit.c | 50 ++++++++++++++++++++---------------- 1 file changed, 28 insertions(+), 22 deletions(-)
diff --git a/drivers/gpu/drm/radeon/r600_blit.c b/drivers/gpu/drm/radeon/r600_blit.c index 3c031a4..326a8da 100644 --- a/drivers/gpu/drm/radeon/r600_blit.c +++ b/drivers/gpu/drm/radeon/r600_blit.c @@ -489,29 +489,35 @@ set_default_state(drm_radeon_private_t *dev_priv) ADVANCE_RING(); }
-static uint32_t i2f(uint32_t input) +/* 23 bits of float fractional data */ +#define I2F_FRAC_BITS 23 +#define I2F_MASK ((1 << I2F_FRAC_BITS) - 1) + +/* + * Converts unsigned integer into 32-bit IEEE floating point representation. + * Will be exact from 0 to 2^24. Above that, we round towards zero + * as the fractional bits will not fit in a float. (It would be better to + * round towards even as the fpu does, but that is slower.) + */ +static uint32_t i2f(uint32_t x) { - u32 result, i, exponent, fraction; - - if ((input & 0x3fff) == 0) - result = 0; /* 0 is a special case */ - else { - exponent = 140; /* exponent biased by 127; */ - fraction = (input & 0x3fff) << 10; /* cheat and only - handle numbers below 2^^15 */ - for (i = 0; i < 14; i++) { - if (fraction & 0x800000) - break; - else { - fraction = fraction << 1; /* keep - shifting left until top bit = 1 */ - exponent = exponent - 1; - } - } - result = exponent << 23 | (fraction & 0x7fffff); /* mask - off top bit; assumed 1 */ - } - return result; + uint32_t msb, exponent, fraction; + + /* Zero is special */ + if (!x) return 0; + + /* Get location of the most significant bit */ + msb = __fls(x); + + /* + * Use a rotate instead of a shift because that works both leftwards + * and rightwards due to the mod(32) behaviour. This means we don't + * need to check to see if we are above 2^24 or not. + */ + fraction = ror32(x, (msb - I2F_FRAC_BITS) & 0x1f) & I2F_MASK; + exponent = (127 + msb) << I2F_FRAC_BITS; + + return fraction + exponent; }
We use __fls() to find the most significant bit. Using that, the loop can be avoided. A second trick is to use the behaviour of the rotate instructions to expand the range of the unsigned int to float conversion to the full 32 bits in a branchless way.
The routine is now exact up to 2^24. Above that, we truncate which is equivalent to rounding towards zero.
Signed-off-by: Steven Fuerst svfuerst@gmail.com --- drivers/gpu/drm/radeon/r600_blit_kms.c | 51 +++++++++++++------------------- 1 file changed, 21 insertions(+), 30 deletions(-)
diff --git a/drivers/gpu/drm/radeon/r600_blit_kms.c b/drivers/gpu/drm/radeon/r600_blit_kms.c index 2bef854..e5a40ca 100644 --- a/drivers/gpu/drm/radeon/r600_blit_kms.c +++ b/drivers/gpu/drm/radeon/r600_blit_kms.c @@ -455,44 +455,35 @@ set_default_state(struct radeon_device *rdev) radeon_ring_write(ring, sq_stack_resource_mgmt_2); }
-#define I2F_MAX_BITS 15 -#define I2F_MAX_INPUT ((1 << I2F_MAX_BITS) - 1) -#define I2F_SHIFT (24 - I2F_MAX_BITS) +/* 23 bits of float fractional data */ +#define I2F_FRAC_BITS 23 +#define I2F_MASK ((1 << I2F_FRAC_BITS) - 1)
/* * Converts unsigned integer into 32-bit IEEE floating point representation. - * Conversion is not universal and only works for the range from 0 - * to 2^I2F_MAX_BITS-1. Currently we only use it with inputs between - * 0 and 16384 (inclusive), so I2F_MAX_BITS=15 is enough. If necessary, - * I2F_MAX_BITS can be increased, but that will add to the loop iterations - * and slow us down. Conversion is done by shifting the input and counting - * down until the first 1 reaches bit position 23. The resulting counter - * and the shifted input are, respectively, the exponent and the fraction. - * The sign is always zero. + * Will be exact from 0 to 2^24. Above that, we round towards zero + * as the fractional bits will not fit in a float. (It would be better to + * round towards even as the fpu does, but that is slower.) */ -static uint32_t i2f(uint32_t input) +static uint32_t i2f(uint32_t x) { - u32 result, i, exponent, fraction; + uint32_t msb, exponent, fraction;
- WARN_ON_ONCE(input > I2F_MAX_INPUT); + /* Zero is special */ + if (!x) return 0;
- if ((input & I2F_MAX_INPUT) == 0) - result = 0; - else { - exponent = 126 + I2F_MAX_BITS; - fraction = (input & I2F_MAX_INPUT) << I2F_SHIFT; + /* Get location of the most significant bit */ + msb = __fls(x);
- for (i = 0; i < I2F_MAX_BITS; i++) { - if (fraction & 0x800000) - break; - else { - fraction = fraction << 1; - exponent = exponent - 1; - } - } - result = exponent << 23 | (fraction & 0x7fffff); - } - return result; + /* + * Use a rotate instead of a shift because that works both leftwards + * and rightwards due to the mod(32) behaviour. This means we don't + * need to check to see if we are above 2^24 or not. + */ + fraction = ror32(x, (msb - I2F_FRAC_BITS) & 0x1f) & I2F_MASK; + exponent = (127 + msb) << I2F_FRAC_BITS; + + return fraction + exponent; }
int r600_blit_init(struct radeon_device *rdev)
Remove the copy of i2f() in r600_blit_kms.c We rename the function to something longer now that it is a global symbol. This reduces the likelyhood of unintended clashes later.
This might be a candidate for inclusion inside general drm infrastructure. However, at the moment only the radeon driver uses it.
Signed-off-by: Steven Fuerst svfuerst@gmail.com
Conflicts: drivers/gpu/drm/radeon/r600_blit_kms.c --- drivers/gpu/drm/radeon/r600_blit.c | 67 ++++++++++++++-------------- drivers/gpu/drm/radeon/r600_blit_kms.c | 43 +++--------------- drivers/gpu/drm/radeon/r600_blit_shaders.h | 1 + 3 files changed, 40 insertions(+), 71 deletions(-)
diff --git a/drivers/gpu/drm/radeon/r600_blit.c b/drivers/gpu/drm/radeon/r600_blit.c index 326a8da..54980d8 100644 --- a/drivers/gpu/drm/radeon/r600_blit.c +++ b/drivers/gpu/drm/radeon/r600_blit.c @@ -499,7 +499,7 @@ set_default_state(drm_radeon_private_t *dev_priv) * as the fractional bits will not fit in a float. (It would be better to * round towards even as the fpu does, but that is slower.) */ -static uint32_t i2f(uint32_t x) +uint32_t int2float(uint32_t x) { uint32_t msb, exponent, fraction;
@@ -520,7 +520,6 @@ static uint32_t i2f(uint32_t x) return fraction + exponent; }
- static int r600_nomm_get_vb(struct drm_device *dev) { drm_radeon_private_t *dev_priv = dev->dev_private; @@ -638,20 +637,20 @@ r600_blit_copy(struct drm_device *dev, vb = r600_nomm_get_vb_ptr(dev); }
- vb[0] = i2f(dst_x); + vb[0] = int2float(dst_x); vb[1] = 0; - vb[2] = i2f(src_x); + vb[2] = int2float(src_x); vb[3] = 0;
- vb[4] = i2f(dst_x); - vb[5] = i2f(h); - vb[6] = i2f(src_x); - vb[7] = i2f(h); + vb[4] = int2float(dst_x); + vb[5] = int2float(h); + vb[6] = int2float(src_x); + vb[7] = int2float(h);
- vb[8] = i2f(dst_x + cur_size); - vb[9] = i2f(h); - vb[10] = i2f(src_x + cur_size); - vb[11] = i2f(h); + vb[8] = int2float(dst_x + cur_size); + vb[9] = int2float(h); + vb[10] = int2float(src_x + cur_size); + vb[11] = int2float(h);
/* src */ set_tex_resource(dev_priv, FMT_8, @@ -727,20 +726,20 @@ r600_blit_copy(struct drm_device *dev, vb = r600_nomm_get_vb_ptr(dev); }
- vb[0] = i2f(dst_x / 4); + vb[0] = int2float(dst_x / 4); vb[1] = 0; - vb[2] = i2f(src_x / 4); + vb[2] = int2float(src_x / 4); vb[3] = 0;
- vb[4] = i2f(dst_x / 4); - vb[5] = i2f(h); - vb[6] = i2f(src_x / 4); - vb[7] = i2f(h); + vb[4] = int2float(dst_x / 4); + vb[5] = int2float(h); + vb[6] = int2float(src_x / 4); + vb[7] = int2float(h);
- vb[8] = i2f((dst_x + cur_size) / 4); - vb[9] = i2f(h); - vb[10] = i2f((src_x + cur_size) / 4); - vb[11] = i2f(h); + vb[8] = int2float((dst_x + cur_size) / 4); + vb[9] = int2float(h); + vb[10] = int2float((src_x + cur_size) / 4); + vb[11] = int2float(h);
/* src */ set_tex_resource(dev_priv, FMT_8_8_8_8, @@ -810,20 +809,20 @@ r600_blit_swap(struct drm_device *dev, dx2 = dx + w; dy2 = dy + h;
- vb[0] = i2f(dx); - vb[1] = i2f(dy); - vb[2] = i2f(sx); - vb[3] = i2f(sy); + vb[0] = int2float(dx); + vb[1] = int2float(dy); + vb[2] = int2float(sx); + vb[3] = int2float(sy);
- vb[4] = i2f(dx); - vb[5] = i2f(dy2); - vb[6] = i2f(sx); - vb[7] = i2f(sy2); + vb[4] = int2float(dx); + vb[5] = int2float(dy2); + vb[6] = int2float(sx); + vb[7] = int2float(sy2);
- vb[8] = i2f(dx2); - vb[9] = i2f(dy2); - vb[10] = i2f(sx2); - vb[11] = i2f(sy2); + vb[8] = int2float(dx2); + vb[9] = int2float(dy2); + vb[10] = int2float(sx2); + vb[11] = int2float(sy2);
switch(cpp) { case 4: diff --git a/drivers/gpu/drm/radeon/r600_blit_kms.c b/drivers/gpu/drm/radeon/r600_blit_kms.c index e5a40ca..1c7ed3a 100644 --- a/drivers/gpu/drm/radeon/r600_blit_kms.c +++ b/drivers/gpu/drm/radeon/r600_blit_kms.c @@ -455,37 +455,6 @@ set_default_state(struct radeon_device *rdev) radeon_ring_write(ring, sq_stack_resource_mgmt_2); }
-/* 23 bits of float fractional data */ -#define I2F_FRAC_BITS 23 -#define I2F_MASK ((1 << I2F_FRAC_BITS) - 1) - -/* - * Converts unsigned integer into 32-bit IEEE floating point representation. - * Will be exact from 0 to 2^24. Above that, we round towards zero - * as the fractional bits will not fit in a float. (It would be better to - * round towards even as the fpu does, but that is slower.) - */ -static uint32_t i2f(uint32_t x) -{ - uint32_t msb, exponent, fraction; - - /* Zero is special */ - if (!x) return 0; - - /* Get location of the most significant bit */ - msb = __fls(x); - - /* - * Use a rotate instead of a shift because that works both leftwards - * and rightwards due to the mod(32) behaviour. This means we don't - * need to check to see if we are above 2^24 or not. - */ - fraction = ror32(x, (msb - I2F_FRAC_BITS) & 0x1f) & I2F_MASK; - exponent = (127 + msb) << I2F_FRAC_BITS; - - return fraction + exponent; -} - int r600_blit_init(struct radeon_device *rdev) { u32 obj_size; @@ -757,14 +726,14 @@ void r600_kms_blit_copy(struct radeon_device *rdev, vb_cpu_addr[3] = 0;
vb_cpu_addr[4] = 0; - vb_cpu_addr[5] = i2f(h); + vb_cpu_addr[5] = int2float(h); vb_cpu_addr[6] = 0; - vb_cpu_addr[7] = i2f(h); + vb_cpu_addr[7] = int2float(h);
- vb_cpu_addr[8] = i2f(w); - vb_cpu_addr[9] = i2f(h); - vb_cpu_addr[10] = i2f(w); - vb_cpu_addr[11] = i2f(h); + vb_cpu_addr[8] = int2float(w); + vb_cpu_addr[9] = int2float(h); + vb_cpu_addr[10] = int2float(w); + vb_cpu_addr[11] = int2float(h);
rdev->r600_blit.primitives.set_tex_resource(rdev, FMT_8_8_8_8, w, h, w, src_gpu_addr, size_in_bytes); diff --git a/drivers/gpu/drm/radeon/r600_blit_shaders.h b/drivers/gpu/drm/radeon/r600_blit_shaders.h index f437d36..e17c2cb 100644 --- a/drivers/gpu/drm/radeon/r600_blit_shaders.h +++ b/drivers/gpu/drm/radeon/r600_blit_shaders.h @@ -35,4 +35,5 @@ extern const u32 r6xx_default_state[]; extern const u32 r6xx_ps_size, r6xx_vs_size; extern const u32 r6xx_default_size, r7xx_default_size;
+uint32_t int2float(uint32_t x); #endif
This allows gcc to fold duplicate calls into a single call. Since the current users do actually call it multiple times with the same arguments, this is an obvious win. --- drivers/gpu/drm/radeon/r600_blit.c | 2 +- drivers/gpu/drm/radeon/r600_blit_shaders.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-)
diff --git a/drivers/gpu/drm/radeon/r600_blit.c b/drivers/gpu/drm/radeon/r600_blit.c index 54980d8..b4304288 100644 --- a/drivers/gpu/drm/radeon/r600_blit.c +++ b/drivers/gpu/drm/radeon/r600_blit.c @@ -499,7 +499,7 @@ set_default_state(drm_radeon_private_t *dev_priv) * as the fractional bits will not fit in a float. (It would be better to * round towards even as the fpu does, but that is slower.) */ -uint32_t int2float(uint32_t x) +__pure uint32_t int2float(uint32_t x) { uint32_t msb, exponent, fraction;
diff --git a/drivers/gpu/drm/radeon/r600_blit_shaders.h b/drivers/gpu/drm/radeon/r600_blit_shaders.h index e17c2cb..2f3ce7a 100644 --- a/drivers/gpu/drm/radeon/r600_blit_shaders.h +++ b/drivers/gpu/drm/radeon/r600_blit_shaders.h @@ -35,5 +35,5 @@ extern const u32 r6xx_default_state[]; extern const u32 r6xx_ps_size, r6xx_vs_size; extern const u32 r6xx_default_size, r7xx_default_size;
-uint32_t int2float(uint32_t x); +__pure uint32_t int2float(uint32_t x); #endif
On Sam, 2012-08-11 at 10:30 -0700, Steven Fuerst wrote:
We use __fls() to find the most significant bit. Using that, the loop can be avoided. A second trick is to use the behaviour of the rotate instructions to expand the range of the unsigned int to float conversion to the full 32 bits in a branchless way.
The routine is now exact up to 2^24. Above that, we truncate which is equivalent to rounding towards zero.
Signed-off-by: Steven Fuerst svfuerst@gmail.com
It might be better to reorder the series to use a shared int2float first and then optimize that. Either way though, although I haven't really looked into the floating point encoding aspects, the series is
Reviewed-by: Michel Dänzer michel.daenzer@amd.com
dri-devel@lists.freedesktop.org