平方根算法之三(翻译)

LiHengFeng posted @ 2008年3月22日 23:23 in 算法 , 5199 阅读

本blog所有文章，除声明了作者外，均为原创。欢迎转载，转载请注明作者。

2c. (int)sqrt(int)

当需要输出整数时，头两种方法就变得有优势了。我们先看看输出输出都为整数的平方根函数。下面是一个更快的实现，它采用了第一个算法。

/*

// Integer Square Root function

// Contributors include Arne Steinarson for the basic approximation idea,

// Dann Corbit and Mathew Hendry for the first cut at the algorithm,

// Lawrence Kirby for the rearrangement, improvments and range optimization

// and Paul Hsieh for the round-then-adjust idea.

*/

static unsigned fred_sqrt(unsigned long x) {

         static const unsigned char sqq_table[] = {

         0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57,

         59, 61, 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83,

         84, 86, 87, 89, 90, 91, 93, 94, 96, 97, 98, 99, 101, 102,

         103, 104, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118,

         119, 120, 121, 122, 123, 124, 125, 126, 128, 128, 129, 130, 131, 132,

         133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145,

         146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, 156, 157,

         158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,

         169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178,

         179, 180, 181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188,

         189, 189, 190, 191, 192, 192, 193, 193, 194, 195, 195, 196, 197, 197,

         198, 199, 199, 200, 201, 201, 202, 203, 203, 204, 204, 205, 206, 206,

         207, 208, 208, 209, 209, 210, 211, 211, 212, 212, 213, 214, 214, 215,

         215, 216, 217, 217, 218, 218, 219, 219, 220, 221, 221, 222, 222, 223,

         224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, 230, 231,

         231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,

         239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246,

         246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253,

         253, 254, 254, 255

         };

         unsigned long xn;

         if (x >= 0x10000)

              if (x >= 0x1000000)

                  if (x >= 0x10000000)

                      if (x >= 0x40000000) {

                           if (x >= 65535UL*65535UL)

                                 return 65535;

                             xn = sqq_table[x>>24] << 8;

                      } else

                             xn = sqq_table[x>>22] << 7;

                  else

                      if (x >= 0x4000000)

                             xn = sqq_table[x>>20] << 6;

                      else

                             xn = sqq_table[x>>18] << 5;

              else {

                     if (x >= 0x100000)

                         if (x >= 0x400000)

                             xn = sqq_table[x>>16] << 4;

                         else

                             xn = sqq_table[x>>14] << 3;

                     else

                         if (x >= 0x40000)

                             xn = sqq_table[x>>12] << 2;

                         else

                             xn = sqq_table[x>>10] << 1;

                     goto nr1;

                 }

         else

             if (x >= 0x100) {

                 if (x >= 0x1000)

                     if (x >= 0x4000)

                             xn = (sqq_table[x>>8] >> 0) + 1;

                     else

                             xn = (sqq_table[x>>6] >> 1) + 1;

                 else

                     if (x >= 0x400)

                             xn = (sqq_table[x>>4] >> 2) + 1;

                     else

                             xn = (sqq_table[x>>2] >> 3) + 1;

                     goto adj;

             } else

                     return sqq_table[x] >> 4;

/* Run two iterations of the standard convergence formula */

         xn = (xn + 1 + x / xn) / 2;

nr1:

         xn = (xn + 1 + x / xn) / 2;

adj:

         if (xn * xn > x) /* Correct rounding if necessary */

          xn--;

         return xn;

}

这个版本的截断值是完全精确的（需要一个整数参数）。但是，我们可以看到代码充满了不可预测的分支。对现代CPU提高执行性能的分支预测功能来说这是个问题。下面的代码更加适应带快速FPU（这需要32位长度，且该FPU支持IEEE-754规格32位浮点数）的顺序执行处理器。

int isqrt (long r) {

float tempf, x, y, rr;

int is;

rr = (long) r;

y = rr*0.5;

*(unsigned long *) &tempf = (0xbe6f0000 - *(unsigned long *) &rr) >> 1;

x = tempf;

x = (1.5*x) - (x*x)*(x*y);

if (r > 101123) x = (1.5*x) - (x*x)*(x*y);

is = (int) (x*rr + 0.5);

return is + ((signed int) (r - is*is)) >> 31;

}

让我们看一看第二种方法的实现：

/* by Jim Ulery */

static unsigned julery_isqrt(unsigned long val) {

         unsigned long temp, g=0, b = 0x8000, bshft = 15;

         do {

               if (val >= (temp = (((g << 1) + b)<<bshft--))) {

                  g += b;

                  val -= temp;

               }

           } while (b >>= 1);

         return g;

}

算法中除法这一步用一位精度的除法代替了。Jim Ulery算法的完全版本来自于这里：
http://www.azillionmonkeys.com/qed/ulerysqroot.pdf 。不过这里仍然有15个不能很好预测的分支。另外还有一个大的循环。我们的解决办法首先是下面的第一个改进：

  /* by Mark Crowne */

static unsigned int mcrowne_isqrt (unsigned long val) {

unsigned int temp, g=0;

if (val >= 0x40000000) {

        g = 0x8000; 

        val -= 0x40000000;

}

#define INNER_ISQRT(s)                        \

temp = (g << (s)) + (1 << ((s) * 2 - 2));   \

if (val >= temp) {                          \

        g += 1 << ((s)-1);                   \

        val -= temp;                         \

}

INNER_ISQRT (15)

INNER_ISQRT (14)

INNER_ISQRT (13)

INNER_ISQRT (12)

INNER_ISQRT (11)

INNER_ISQRT (10)

INNER_ISQRT ( 9)

INNER_ISQRT ( 8)

INNER_ISQRT ( 7)

INNER_ISQRT ( 6)

INNER_ISQRT ( 5)

INNER_ISQRT ( 4)

INNER_ISQRT ( 3)

INNER_ISQRT ( 2)

#undef INNER_ISQRT

temp = g+g+1;

if (val >= temp) g++;

return g;

}

再次把它改为内联汇编（MSVC格式的），我们可以用条件编译替换分支代码。

   /* by Norbert Juffa */

         __declspec (naked) unsigned long __stdcall isqrt (unsigned long x) {

         /* based on posting by Wilco Dijkstra in comp.sys.arm in 1996 */

         #define iter(N)               \

           trial = root2 + (1 << (N)); \

           foo = trial << (N);         \

           if (n >= foo) {             \

             n -= foo;                 \

             root2 |= (2 << (N));      \

           }

         /*

          * For processors that provide single cycle latency

          * LEA, the first four instructions of the following 

          * macro could be replaced as follows

          * __asm   lea    edx, [eax+(1<<(N))]

          * __asm   lea    ebx, [eax+(2<<(N))]

          */

         #if CPU == ATHLON || CPU == ATHLON64

         #define iterasm(N) \

           __asm   mov    edx, (1 << (N))  \

           __asm   mov    ebx, (2 << (N))  \

           __asm   or     edx, eax         \

           __asm   or     ebx, eax         \

           __asm   shl    edx, (N)         \

           __asm   mov    esi, ecx         \

           __asm   sub    ecx, edx         \

           __asm   cmovnc eax, ebx         \

           __asm   cmovc  ecx, esi         

         #else /* generic 386+ */

         #define iterasm(N) \

           __asm   mov    edx, (1 << (N))  \

           __asm   mov    ebx, (2 << (N))  \

           __asm   or     edx, eax         \

           __asm   or     ebx, eax         \

           __asm   shl    edx, (N)         \

           __asm   sub    ecx, edx         \

           __asm   sbb    esi, esi         \

           __asm   sub    eax, ebx         \

           __asm   and    eax, esi         \

           __asm   add    eax, ebx         \

           __asm   and    edx, esi         \

           __asm   add    ecx, edx         

         #endif

           __asm {

             mov    ecx, [esp+4] ; x

             push   ebx          ; save as per calling convention 

             push   esi          ; save as per calling convention 

             xor    eax, eax     ; 2*root

             /* iteration 15 */

             mov    ebx, (2 << (15))

             mov    esi, ecx        

             sub    ecx, (1 << (30))

             cmovnc eax, ebx                 

             cmovc  ecx, esi

             iterasm (14);

             iterasm (13);

             iterasm (12);

             iterasm (11);

             iterasm (10);

             iterasm (9);

             iterasm (8);

             iterasm (7);

             iterasm (6);

             iterasm (5);

             iterasm (4);

             iterasm (3);

             iterasm (2);

             iterasm (1);

             /* iteration 0 */

             mov    edx, 1

             mov    ebx, 2

             add    edx, eax

             add    ebx, eax

             sub    ecx, edx

             cmovnc eax, ebx                 

             shr    eax, 1

             mov    esi, [esp]   ; restore as per calling convention

             mov    ebx, [esp+4] ; restore as per calling convention

             add    esp, 8       ; remove temp variables

             ret    4            ; pop one DWORD arg and return

           }

         }

请注意，尽管X86处理器可以执行cmovCC指令，算法也不能在除了基于Athlon处理器的情况下使性能商有所提高。这个方案明显超过第一个方法。但是 Norbert写信告诉我，cmovCC, sbb 和setCC指令在P4平台上慢得令人难以忍受。

…目前为止最快的ISQRT()是在FPU上实现的。即使改变FP中的控制字，它也很快。看起来P4有一个为FLDCW准备的优化选项，比Athlon的要好。既然大部分应用只使用两个不同的控制字(CW)，我猜他们或许在内部缓存了最后两个控制字。就是说，this is like renaming limited to 2 rename registers。