平方根算法之二(翻译)

LiHengFeng posted @ 2008年3月22日 23:02 in 算法 , 1954 阅读

本blog所有文章，除声明了作者外，均为原创。欢迎转载，转载请注明作者。

二、实现

2a. (doble)sqrt(double)
下面是一个C程序的例子，在32位系统上对64位浮点数double（如X86）实现了第三种方法。

double fsqrt (double y) 

{

    double x, z, tempf;

    unsigned long *tfptr = ((unsigned long *)&tempf) + 1;

        tempf = y;

        *tfptr = (0xbfcd4600 - *tfptr)>>1; /* estimate of 1/sqrt(y) */

        x =  tempf;

        z =  y*0.5;                               /* hoist out the “/2”    */

        x = (1.5*x) - (x*x)*(x*z);         /* iteration formula     */

        x = (1.5*x)– (x*x)*(x*z);

        x = (1.5*x)– (x*x)*(x*z);

        x = (1.5*x)– (x*x)*(x*z);

        return x*y;

}

进一步的优化，只有32位对估计值是有效的，迭代公式只是产生任意精度。迭代公式中的括号将引起编译器在支持管道浮点操作（和乱序执行）的处理器上产生效率更好的代码。下面是一个类似的程序，用内联汇编写的（在MSVC编译器下）。

需要指出的是，这些方法不会产生精确的截断值，它们只能产生近似值。同样的技术在CPU和编译器生产商中是很常见的，因此不要期望会超过现有的算法，或者比C函数库中的sqrt()函数更精确。不过，与那些需要产生精确值的标准库或CPU微代码得到的值相比，我们通过这些函数可以很容易在较高的执行性能下得到一个较低精度的估计值。

一般说来，对完全的浮点近似，在现实的实现中，上述第一，二种方法比第三种方法要慢。接下来，我们看看几种比较简单的情况，对这些情况来说，简化是很重要的。

2b. (float)sqrt(float)

一个改进是利用IEEE-754格式，用一个查找表找出尾数的最高位。Norbert Juffa用这个方法实现了32位精度的平方根倒数。

; invsqrt2 computes an approximation to the inverse square root of an

; IEEE-754 single precision number. The algorithm used is that

; described in this paper: Masayuki Ito, Naofumi Takagi, Shuzo Yajima:

; "Efficient Initial Approximation for Multiplicative Division

; and Square Root by a Multiplication with Operand Modification". IEEE

; Transactions on Computers, Vol. 46, No. 4, April 1997,

; pp 495-498.

;

; This code delivers inverse square root results that differ by at most 1

; ulp from correctly rounded single precision results. The correctly

; rounded single precision result is returned for 99% of all arguments

; if the FPU precision is set to either extended precision (default under

; DOS) or double precision (default under WindowsNT). This routine does 

; not work correctly if the input is negative, zero, a denormal, an 

; infinity or a NaN. It works properly for arguments in [2^-126, 2^128), 

; which is the range of positive, normalized IEEE single precision numbers 

; and is roughly equivalent to [1.1755e-38,3.4028e38].

;

; In case all memory accesses are cache hits, this code has been timed to 

; have a latency of 39 cycles on a PentiumMMX.

;

;

; invsqrt2 - compute an approximation to the inverse square root of an

;            IEEE-754 single precision number.

;

; input:

;    ebx  = pointer to IEEE-754 single precision number, argument

;    esi  = pointer to IEEE-754 single precision number, res = 1/sqrt(arg)

;

; output:

;   [esi] = 1/sqrt(arg)

;

; destroys:

;   eax, ecx, edx, edi

;   eflags

;

INVSQRT2 MACRO

         .DATA

d1tab    dw      0B0DDh,0A901h,0A1B5h,09AECh,0949Ah,08EB2h,0892Ch,083FFh

         dw      07E47h,07524h,06C8Ah,0646Eh,05CC6h,05589h,04EAFh,04831h

         dw      04208h,03C2Fh,0369Fh,03154h,02C49h,0277Ah,022E3h,01E81h

         dw      01A51h,0164Fh,01278h,00ECBh,00B45h,007E3h,004A3h,00184h

         dw      0FA1Fh,0EF02h,0E4B1h,0DB18h,0D227h,0C9CEh,0C1FEh,0BAACh

         dw      0B3CDh,0AD57h,0A742h,0A186h,09C1Ch,096FEh,09226h,08D8Fh

         dw      08934h,08511h,08122h,07AC7h,073A6h,06CD9h,0665Ch,06029h

         dw      05A3Ch,05491h,04F24h,049F1h,044F4h,0402Ch,03B94h,0372Ah

half     ds      0.5

three    ds      3.0

         .CODE

         mov     eax, [ebx]       ; arg (single precision floating-point)

         mov     ecx, 0be7fffffh  ; 0xbe7fffff

         mov     edi, eax         ; arg

         mov     edx, eax         ; arg

         shr     edi, 18          ; (arg >18)

         sub     ecx, eax         ; (0xbe7fffff - arg)

         shr     eax, 1           ; (arg >1)

         and     edi, 3fh         ; index = ((arg >18) & 0x3f)

         shr     ecx, 1           ; ((0xbe7fffff - arg) >1)

         and     eax, 00001ffffh  ; (arg >1) & 0x1ffff

         movzx   edi, d1tab[edi*2]; d1tab[index]

         and     edx, 0007e0000h  ; arg & 0x7e0000

         and     ecx, 07f800000h  ; exo=((0xbe7fffff - arg) >1) & 0x7f800000

         or      eax, edx         ; (arg & 0x7e0000) | ((arg >1) & 0x1fff)

         shl     edi, 8           ; d1tab[index] << 8

         xor     eax, 00002ffffh  ; (arg&0x7e0000)|((arg>>1)&0x1fff)^0x2ffff

         add     edi, 03f000000h  ; d1 = (d1tab[index] << 8) | 0x3f000000

         or      eax, ecx         ; xhat = ((arg&0x7e0000)|((arg>>1)&0x1fff)^0x2ffff)|expo

         push    edi              ; d1

         push    eax              ; xhat

         fld     dword ptr [esp+4]; d1

         fmul    dword ptr [esp]  ; approx = d1*xhat

         fld     st(0)            ; approx approx

         fmul    st,st(0)         ; approx*approx approx

         fxch    st(1)            ; approx approx*approx

         fmul    dword ptr [half] ; approx*0.5 approx*approx

         fxch    st(1)            ; approx*approx approx*0.5

         fmul    dword ptr [ebx]  ; approx*approx*arg approx*0.5

         fsubr   dword ptr [three]; 3-approx*approx*arg approx*0.5

         fmulp   st(1),st         ; res = (3-approx*approx*arg)*approx*0.5

         fstp    dword ptr [esi]  ; store result

         add     esp,8            ; remove floating-point temps

         ENDM