平方根算法之二(翻译) - lhf 的窝
平方根算法之二(翻译)
LiHengFeng
posted @ 2008年3月22日 23:02
in 算法
, 1943 阅读
本blog所有文章,除声明了作者外,均为原创。欢迎转载,转载请注明作者。
二、实现
2a. (doble)sqrt(double)
下面是一个C程序的例子,在32位系统上对64位浮点数double(如X86)实现了第三种方法。
下面是一个C程序的例子,在32位系统上对64位浮点数double(如X86)实现了第三种方法。
double fsqrt (double y)
{
double x, z, tempf;
unsigned long *tfptr = ((unsigned long *)&tempf) + 1;
tempf = y;
*tfptr = (0xbfcd4600 - *tfptr)>>1; /* estimate of 1/sqrt(y) */
x = tempf;
z = y*0.5; /* hoist out the “/2” */
x = (1.5*x) - (x*x)*(x*z); /* iteration formula */
x = (1.5*x)– (x*x)*(x*z);
x = (1.5*x)– (x*x)*(x*z);
x = (1.5*x)– (x*x)*(x*z);
return x*y;
}
{
double x, z, tempf;
unsigned long *tfptr = ((unsigned long *)&tempf) + 1;
tempf = y;
*tfptr = (0xbfcd4600 - *tfptr)>>1; /* estimate of 1/sqrt(y) */
x = tempf;
z = y*0.5; /* hoist out the “/2” */
x = (1.5*x) - (x*x)*(x*z); /* iteration formula */
x = (1.5*x)– (x*x)*(x*z);
x = (1.5*x)– (x*x)*(x*z);
x = (1.5*x)– (x*x)*(x*z);
return x*y;
}
进一步的优化,只有32位对估计值是有效的,迭代公式只是产生任意精度。迭代公式中的括号将引起编译器在支持管道浮点操作(和乱序执行)的处理器上产生效率更好的代码。下面是一个类似的程序,用内联汇编写的(在MSVC编译器下)。
需要指出的是,这些方法不会产生精确的截断值,它们只能产生近似值。同样的技术在CPU和编译器生产商中是很常见的,因此不要期望会超过现有的算法,或者比C函数库中的sqrt()函数更精确。不过,与那些需要产生精确值的标准库或CPU微代码得到的值相比,我们通过这些函数可以很容易在较高的执行性能下得到一个较低精度的估计值。
一般说来,对完全的浮点近似,在现实的实现中,上述第一,二种方法比第三种方法要慢。接下来,我们看看几种比较简单的情况,对这些情况来说,简化是很重要的。
2b. (float)sqrt(float)
一个改进是利用IEEE-754格式,用一个查找表找出尾数的最高位。Norbert Juffa用这个方法实现了32位精度的平方根倒数。
; invsqrt2 computes an approximation to the inverse square root of an
; IEEE-754 single precision number. The algorithm used is that
; described in this paper: Masayuki Ito, Naofumi Takagi, Shuzo Yajima:
; "Efficient Initial Approximation for Multiplicative Division
; and Square Root by a Multiplication with Operand Modification". IEEE
; Transactions on Computers, Vol. 46, No. 4, April 1997,
; pp 495-498.
;
; This code delivers inverse square root results that differ by at most 1
; ulp from correctly rounded single precision results. The correctly
; rounded single precision result is returned for 99% of all arguments
; if the FPU precision is set to either extended precision (default under
; DOS) or double precision (default under WindowsNT). This routine does
; not work correctly if the input is negative, zero, a denormal, an
; infinity or a NaN. It works properly for arguments in [2^-126, 2^128),
; which is the range of positive, normalized IEEE single precision numbers
; and is roughly equivalent to [1.1755e-38,3.4028e38].
;
; In case all memory accesses are cache hits, this code has been timed to
; have a latency of 39 cycles on a PentiumMMX.
;
;
; invsqrt2 - compute an approximation to the inverse square root of an
; IEEE-754 single precision number.
;
; input:
; ebx = pointer to IEEE-754 single precision number, argument
; esi = pointer to IEEE-754 single precision number, res = 1/sqrt(arg)
;
; output:
; [esi] = 1/sqrt(arg)
;
; destroys:
; eax, ecx, edx, edi
; eflags
;
INVSQRT2 MACRO
.DATA
d1tab dw 0B0DDh,0A901h,0A1B5h,09AECh,0949Ah,08EB2h,0892Ch,083FFh
dw 07E47h,07524h,06C8Ah,0646Eh,05CC6h,05589h,04EAFh,04831h
dw 04208h,03C2Fh,0369Fh,03154h,02C49h,0277Ah,022E3h,01E81h
dw 01A51h,0164Fh,01278h,00ECBh,00B45h,007E3h,004A3h,00184h
dw 0FA1Fh,0EF02h,0E4B1h,0DB18h,0D227h,0C9CEh,0C1FEh,0BAACh
dw 0B3CDh,0AD57h,0A742h,0A186h,09C1Ch,096FEh,09226h,08D8Fh
dw 08934h,08511h,08122h,07AC7h,073A6h,06CD9h,0665Ch,06029h
dw 05A3Ch,05491h,04F24h,049F1h,044F4h,0402Ch,03B94h,0372Ah
half ds 0.5
three ds 3.0
.CODE
mov eax, [ebx] ; arg (single precision floating-point)
mov ecx, 0be7fffffh ; 0xbe7fffff
mov edi, eax ; arg
mov edx, eax ; arg
shr edi, 18 ; (arg >18)
sub ecx, eax ; (0xbe7fffff - arg)
shr eax, 1 ; (arg >1)
and edi, 3fh ; index = ((arg >18) & 0x3f)
shr ecx, 1 ; ((0xbe7fffff - arg) >1)
and eax, 00001ffffh ; (arg >1) & 0x1ffff
movzx edi, d1tab[edi*2]; d1tab[index]
and edx, 0007e0000h ; arg & 0x7e0000
and ecx, 07f800000h ; exo=((0xbe7fffff - arg) >1) & 0x7f800000
or eax, edx ; (arg & 0x7e0000) | ((arg >1) & 0x1fff)
shl edi, 8 ; d1tab[index] << 8
xor eax, 00002ffffh ; (arg&0x7e0000)|((arg>>1)&0x1fff)^0x2ffff
add edi, 03f000000h ; d1 = (d1tab[index] << 8) | 0x3f000000
or eax, ecx ; xhat = ((arg&0x7e0000)|((arg>>1)&0x1fff)^0x2ffff)|expo
push edi ; d1
push eax ; xhat
fld dword ptr [esp+4]; d1
fmul dword ptr [esp] ; approx = d1*xhat
fld st(0) ; approx approx
fmul st,st(0) ; approx*approx approx
fxch st(1) ; approx approx*approx
fmul dword ptr [half] ; approx*0.5 approx*approx
fxch st(1) ; approx*approx approx*0.5
fmul dword ptr [ebx] ; approx*approx*arg approx*0.5
fsubr dword ptr [three]; 3-approx*approx*arg approx*0.5
fmulp st(1),st ; res = (3-approx*approx*arg)*approx*0.5
fstp dword ptr [esi] ; store result
add esp,8 ; remove floating-point temps
ENDM
; IEEE-754 single precision number. The algorithm used is that
; described in this paper: Masayuki Ito, Naofumi Takagi, Shuzo Yajima:
; "Efficient Initial Approximation for Multiplicative Division
; and Square Root by a Multiplication with Operand Modification". IEEE
; Transactions on Computers, Vol. 46, No. 4, April 1997,
; pp 495-498.
;
; This code delivers inverse square root results that differ by at most 1
; ulp from correctly rounded single precision results. The correctly
; rounded single precision result is returned for 99% of all arguments
; if the FPU precision is set to either extended precision (default under
; DOS) or double precision (default under WindowsNT). This routine does
; not work correctly if the input is negative, zero, a denormal, an
; infinity or a NaN. It works properly for arguments in [2^-126, 2^128),
; which is the range of positive, normalized IEEE single precision numbers
; and is roughly equivalent to [1.1755e-38,3.4028e38].
;
; In case all memory accesses are cache hits, this code has been timed to
; have a latency of 39 cycles on a PentiumMMX.
;
;
; invsqrt2 - compute an approximation to the inverse square root of an
; IEEE-754 single precision number.
;
; input:
; ebx = pointer to IEEE-754 single precision number, argument
; esi = pointer to IEEE-754 single precision number, res = 1/sqrt(arg)
;
; output:
; [esi] = 1/sqrt(arg)
;
; destroys:
; eax, ecx, edx, edi
; eflags
;
INVSQRT2 MACRO
.DATA
d1tab dw 0B0DDh,0A901h,0A1B5h,09AECh,0949Ah,08EB2h,0892Ch,083FFh
dw 07E47h,07524h,06C8Ah,0646Eh,05CC6h,05589h,04EAFh,04831h
dw 04208h,03C2Fh,0369Fh,03154h,02C49h,0277Ah,022E3h,01E81h
dw 01A51h,0164Fh,01278h,00ECBh,00B45h,007E3h,004A3h,00184h
dw 0FA1Fh,0EF02h,0E4B1h,0DB18h,0D227h,0C9CEh,0C1FEh,0BAACh
dw 0B3CDh,0AD57h,0A742h,0A186h,09C1Ch,096FEh,09226h,08D8Fh
dw 08934h,08511h,08122h,07AC7h,073A6h,06CD9h,0665Ch,06029h
dw 05A3Ch,05491h,04F24h,049F1h,044F4h,0402Ch,03B94h,0372Ah
half ds 0.5
three ds 3.0
.CODE
mov eax, [ebx] ; arg (single precision floating-point)
mov ecx, 0be7fffffh ; 0xbe7fffff
mov edi, eax ; arg
mov edx, eax ; arg
shr edi, 18 ; (arg >18)
sub ecx, eax ; (0xbe7fffff - arg)
shr eax, 1 ; (arg >1)
and edi, 3fh ; index = ((arg >18) & 0x3f)
shr ecx, 1 ; ((0xbe7fffff - arg) >1)
and eax, 00001ffffh ; (arg >1) & 0x1ffff
movzx edi, d1tab[edi*2]; d1tab[index]
and edx, 0007e0000h ; arg & 0x7e0000
and ecx, 07f800000h ; exo=((0xbe7fffff - arg) >1) & 0x7f800000
or eax, edx ; (arg & 0x7e0000) | ((arg >1) & 0x1fff)
shl edi, 8 ; d1tab[index] << 8
xor eax, 00002ffffh ; (arg&0x7e0000)|((arg>>1)&0x1fff)^0x2ffff
add edi, 03f000000h ; d1 = (d1tab[index] << 8) | 0x3f000000
or eax, ecx ; xhat = ((arg&0x7e0000)|((arg>>1)&0x1fff)^0x2ffff)|expo
push edi ; d1
push eax ; xhat
fld dword ptr [esp+4]; d1
fmul dword ptr [esp] ; approx = d1*xhat
fld st(0) ; approx approx
fmul st,st(0) ; approx*approx approx
fxch st(1) ; approx approx*approx
fmul dword ptr [half] ; approx*0.5 approx*approx
fxch st(1) ; approx*approx approx*0.5
fmul dword ptr [ebx] ; approx*approx*arg approx*0.5
fsubr dword ptr [three]; 3-approx*approx*arg approx*0.5
fmulp st(1),st ; res = (3-approx*approx*arg)*approx*0.5
fstp dword ptr [esi] ; store result
add esp,8 ; remove floating-point temps
ENDM