平方根算法之三(翻译) - lhf 的窝
平方根算法之三(翻译)
LiHengFeng
posted @ 2008年3月22日 23:23
in 算法
, 5190 阅读
本blog所有文章,除声明了作者外,均为原创。欢迎转载,转载请注明作者。
2c. (int)sqrt(int)
当需要输出整数时,头两种方法就变得有优势了。我们先看看输出输出都为整数的平方根函数。下面是一个更快的实现,它采用了第一个算法。
/*
// Integer Square Root function
// Contributors include Arne Steinarson for the basic approximation idea,
// Dann Corbit and Mathew Hendry for the first cut at the algorithm,
// Lawrence Kirby for the rearrangement, improvments and range optimization
// and Paul Hsieh for the round-then-adjust idea.
*/
static unsigned fred_sqrt(unsigned long x) {
static const unsigned char sqq_table[] = {
0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57,
59, 61, 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83,
84, 86, 87, 89, 90, 91, 93, 94, 96, 97, 98, 99, 101, 102,
103, 104, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118,
119, 120, 121, 122, 123, 124, 125, 126, 128, 128, 129, 130, 131, 132,
133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145,
146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, 156, 157,
158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178,
179, 180, 181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188,
189, 189, 190, 191, 192, 192, 193, 193, 194, 195, 195, 196, 197, 197,
198, 199, 199, 200, 201, 201, 202, 203, 203, 204, 204, 205, 206, 206,
207, 208, 208, 209, 209, 210, 211, 211, 212, 212, 213, 214, 214, 215,
215, 216, 217, 217, 218, 218, 219, 219, 220, 221, 221, 222, 222, 223,
224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, 230, 231,
231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246,
246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253,
253, 254, 254, 255
};
unsigned long xn;
if (x >= 0x10000)
if (x >= 0x1000000)
if (x >= 0x10000000)
if (x >= 0x40000000) {
if (x >= 65535UL*65535UL)
return 65535;
xn = sqq_table[x>>24] << 8;
} else
xn = sqq_table[x>>22] << 7;
else
if (x >= 0x4000000)
xn = sqq_table[x>>20] << 6;
else
xn = sqq_table[x>>18] << 5;
else {
if (x >= 0x100000)
if (x >= 0x400000)
xn = sqq_table[x>>16] << 4;
else
xn = sqq_table[x>>14] << 3;
else
if (x >= 0x40000)
xn = sqq_table[x>>12] << 2;
else
xn = sqq_table[x>>10] << 1;
goto nr1;
}
else
if (x >= 0x100) {
if (x >= 0x1000)
if (x >= 0x4000)
xn = (sqq_table[x>>8] >> 0) + 1;
else
xn = (sqq_table[x>>6] >> 1) + 1;
else
if (x >= 0x400)
xn = (sqq_table[x>>4] >> 2) + 1;
else
xn = (sqq_table[x>>2] >> 3) + 1;
goto adj;
} else
return sqq_table[x] >> 4;
/* Run two iterations of the standard convergence formula */
xn = (xn + 1 + x / xn) / 2;
nr1:
xn = (xn + 1 + x / xn) / 2;
adj:
if (xn * xn > x) /* Correct rounding if necessary */
xn--;
return xn;
}
// Integer Square Root function
// Contributors include Arne Steinarson for the basic approximation idea,
// Dann Corbit and Mathew Hendry for the first cut at the algorithm,
// Lawrence Kirby for the rearrangement, improvments and range optimization
// and Paul Hsieh for the round-then-adjust idea.
*/
static unsigned fred_sqrt(unsigned long x) {
static const unsigned char sqq_table[] = {
0, 16, 22, 27, 32, 35, 39, 42, 45, 48, 50, 53, 55, 57,
59, 61, 64, 65, 67, 69, 71, 73, 75, 76, 78, 80, 81, 83,
84, 86, 87, 89, 90, 91, 93, 94, 96, 97, 98, 99, 101, 102,
103, 104, 106, 107, 108, 109, 110, 112, 113, 114, 115, 116, 117, 118,
119, 120, 121, 122, 123, 124, 125, 126, 128, 128, 129, 130, 131, 132,
133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 144, 145,
146, 147, 148, 149, 150, 150, 151, 152, 153, 154, 155, 155, 156, 157,
158, 159, 160, 160, 161, 162, 163, 163, 164, 165, 166, 167, 167, 168,
169, 170, 170, 171, 172, 173, 173, 174, 175, 176, 176, 177, 178, 178,
179, 180, 181, 181, 182, 183, 183, 184, 185, 185, 186, 187, 187, 188,
189, 189, 190, 191, 192, 192, 193, 193, 194, 195, 195, 196, 197, 197,
198, 199, 199, 200, 201, 201, 202, 203, 203, 204, 204, 205, 206, 206,
207, 208, 208, 209, 209, 210, 211, 211, 212, 212, 213, 214, 214, 215,
215, 216, 217, 217, 218, 218, 219, 219, 220, 221, 221, 222, 222, 223,
224, 224, 225, 225, 226, 226, 227, 227, 228, 229, 229, 230, 230, 231,
231, 232, 232, 233, 234, 234, 235, 235, 236, 236, 237, 237, 238, 238,
239, 240, 240, 241, 241, 242, 242, 243, 243, 244, 244, 245, 245, 246,
246, 247, 247, 248, 248, 249, 249, 250, 250, 251, 251, 252, 252, 253,
253, 254, 254, 255
};
unsigned long xn;
if (x >= 0x10000)
if (x >= 0x1000000)
if (x >= 0x10000000)
if (x >= 0x40000000) {
if (x >= 65535UL*65535UL)
return 65535;
xn = sqq_table[x>>24] << 8;
} else
xn = sqq_table[x>>22] << 7;
else
if (x >= 0x4000000)
xn = sqq_table[x>>20] << 6;
else
xn = sqq_table[x>>18] << 5;
else {
if (x >= 0x100000)
if (x >= 0x400000)
xn = sqq_table[x>>16] << 4;
else
xn = sqq_table[x>>14] << 3;
else
if (x >= 0x40000)
xn = sqq_table[x>>12] << 2;
else
xn = sqq_table[x>>10] << 1;
goto nr1;
}
else
if (x >= 0x100) {
if (x >= 0x1000)
if (x >= 0x4000)
xn = (sqq_table[x>>8] >> 0) + 1;
else
xn = (sqq_table[x>>6] >> 1) + 1;
else
if (x >= 0x400)
xn = (sqq_table[x>>4] >> 2) + 1;
else
xn = (sqq_table[x>>2] >> 3) + 1;
goto adj;
} else
return sqq_table[x] >> 4;
/* Run two iterations of the standard convergence formula */
xn = (xn + 1 + x / xn) / 2;
nr1:
xn = (xn + 1 + x / xn) / 2;
adj:
if (xn * xn > x) /* Correct rounding if necessary */
xn--;
return xn;
}
这个版本的截断值是完全精确的(需要一个整数参数)。但是,我们可以看到代码充满了不可预测的分支。对现代CPU提高执行性能的分支预测功能来说这是个问题。下面的代码更加适应带快速FPU(这需要32位长度,且该FPU支持IEEE-754规格32位浮点数)的顺序执行处理器。
int isqrt (long r) {
float tempf, x, y, rr;
int is;
rr = (long) r;
y = rr*0.5;
*(unsigned long *) &tempf = (0xbe6f0000 - *(unsigned long *) &rr) >> 1;
x = tempf;
x = (1.5*x) - (x*x)*(x*y);
if (r > 101123) x = (1.5*x) - (x*x)*(x*y);
is = (int) (x*rr + 0.5);
return is + ((signed int) (r - is*is)) >> 31;
}
float tempf, x, y, rr;
int is;
rr = (long) r;
y = rr*0.5;
*(unsigned long *) &tempf = (0xbe6f0000 - *(unsigned long *) &rr) >> 1;
x = tempf;
x = (1.5*x) - (x*x)*(x*y);
if (r > 101123) x = (1.5*x) - (x*x)*(x*y);
is = (int) (x*rr + 0.5);
return is + ((signed int) (r - is*is)) >> 31;
}
让我们看一看第二种方法的实现:
/* by Jim Ulery */
static unsigned julery_isqrt(unsigned long val) {
unsigned long temp, g=0, b = 0x8000, bshft = 15;
do {
if (val >= (temp = (((g << 1) + b)<<bshft--))) {
g += b;
val -= temp;
}
} while (b >>= 1);
return g;
}
static unsigned julery_isqrt(unsigned long val) {
unsigned long temp, g=0, b = 0x8000, bshft = 15;
do {
if (val >= (temp = (((g << 1) + b)<<bshft--))) {
g += b;
val -= temp;
}
} while (b >>= 1);
return g;
}
算法中除法这一步用一位精度的除法代替了。Jim Ulery算法的完全版本来自于这里:
http://www.azillionmonkeys.com/qed/ulerysqroot.pdf 。不过这里仍然有15个不能很好预测的分支。另外还有一个大的循环。我们的解决办法首先是下面的第一个改进:
/* by Mark Crowne */
static unsigned int mcrowne_isqrt (unsigned long val) {
unsigned int temp, g=0;
if (val >= 0x40000000) {
g = 0x8000;
val -= 0x40000000;
}
#define INNER_ISQRT(s) \
temp = (g << (s)) + (1 << ((s) * 2 - 2)); \
if (val >= temp) { \
g += 1 << ((s)-1); \
val -= temp; \
}
INNER_ISQRT (15)
INNER_ISQRT (14)
INNER_ISQRT (13)
INNER_ISQRT (12)
INNER_ISQRT (11)
INNER_ISQRT (10)
INNER_ISQRT ( 9)
INNER_ISQRT ( 8)
INNER_ISQRT ( 7)
INNER_ISQRT ( 6)
INNER_ISQRT ( 5)
INNER_ISQRT ( 4)
INNER_ISQRT ( 3)
INNER_ISQRT ( 2)
#undef INNER_ISQRT
temp = g+g+1;
if (val >= temp) g++;
return g;
}
static unsigned int mcrowne_isqrt (unsigned long val) {
unsigned int temp, g=0;
if (val >= 0x40000000) {
g = 0x8000;
val -= 0x40000000;
}
#define INNER_ISQRT(s) \
temp = (g << (s)) + (1 << ((s) * 2 - 2)); \
if (val >= temp) { \
g += 1 << ((s)-1); \
val -= temp; \
}
INNER_ISQRT (15)
INNER_ISQRT (14)
INNER_ISQRT (13)
INNER_ISQRT (12)
INNER_ISQRT (11)
INNER_ISQRT (10)
INNER_ISQRT ( 9)
INNER_ISQRT ( 8)
INNER_ISQRT ( 7)
INNER_ISQRT ( 6)
INNER_ISQRT ( 5)
INNER_ISQRT ( 4)
INNER_ISQRT ( 3)
INNER_ISQRT ( 2)
#undef INNER_ISQRT
temp = g+g+1;
if (val >= temp) g++;
return g;
}
再次把它改为内联汇编(MSVC格式的),我们可以用条件编译替换分支代码。
/* by Norbert Juffa */
__declspec (naked) unsigned long __stdcall isqrt (unsigned long x) {
/* based on posting by Wilco Dijkstra in comp.sys.arm in 1996 */
#define iter(N) \
trial = root2 + (1 << (N)); \
foo = trial << (N); \
if (n >= foo) { \
n -= foo; \
root2 |= (2 << (N)); \
}
/*
* For processors that provide single cycle latency
* LEA, the first four instructions of the following
* macro could be replaced as follows
* __asm lea edx, [eax+(1<<(N))]
* __asm lea ebx, [eax+(2<<(N))]
*/
#if CPU == ATHLON || CPU == ATHLON64
#define iterasm(N) \
__asm mov edx, (1 << (N)) \
__asm mov ebx, (2 << (N)) \
__asm or edx, eax \
__asm or ebx, eax \
__asm shl edx, (N) \
__asm mov esi, ecx \
__asm sub ecx, edx \
__asm cmovnc eax, ebx \
__asm cmovc ecx, esi
#else /* generic 386+ */
#define iterasm(N) \
__asm mov edx, (1 << (N)) \
__asm mov ebx, (2 << (N)) \
__asm or edx, eax \
__asm or ebx, eax \
__asm shl edx, (N) \
__asm sub ecx, edx \
__asm sbb esi, esi \
__asm sub eax, ebx \
__asm and eax, esi \
__asm add eax, ebx \
__asm and edx, esi \
__asm add ecx, edx
#endif
__asm {
mov ecx, [esp+4] ; x
push ebx ; save as per calling convention
push esi ; save as per calling convention
xor eax, eax ; 2*root
/* iteration 15 */
mov ebx, (2 << (15))
mov esi, ecx
sub ecx, (1 << (30))
cmovnc eax, ebx
cmovc ecx, esi
iterasm (14);
iterasm (13);
iterasm (12);
iterasm (11);
iterasm (10);
iterasm (9);
iterasm (8);
iterasm (7);
iterasm (6);
iterasm (5);
iterasm (4);
iterasm (3);
iterasm (2);
iterasm (1);
/* iteration 0 */
mov edx, 1
mov ebx, 2
add edx, eax
add ebx, eax
sub ecx, edx
cmovnc eax, ebx
shr eax, 1
mov esi, [esp] ; restore as per calling convention
mov ebx, [esp+4] ; restore as per calling convention
add esp, 8 ; remove temp variables
ret 4 ; pop one DWORD arg and return
}
}
__declspec (naked) unsigned long __stdcall isqrt (unsigned long x) {
/* based on posting by Wilco Dijkstra in comp.sys.arm in 1996 */
#define iter(N) \
trial = root2 + (1 << (N)); \
foo = trial << (N); \
if (n >= foo) { \
n -= foo; \
root2 |= (2 << (N)); \
}
/*
* For processors that provide single cycle latency
* LEA, the first four instructions of the following
* macro could be replaced as follows
* __asm lea edx, [eax+(1<<(N))]
* __asm lea ebx, [eax+(2<<(N))]
*/
#if CPU == ATHLON || CPU == ATHLON64
#define iterasm(N) \
__asm mov edx, (1 << (N)) \
__asm mov ebx, (2 << (N)) \
__asm or edx, eax \
__asm or ebx, eax \
__asm shl edx, (N) \
__asm mov esi, ecx \
__asm sub ecx, edx \
__asm cmovnc eax, ebx \
__asm cmovc ecx, esi
#else /* generic 386+ */
#define iterasm(N) \
__asm mov edx, (1 << (N)) \
__asm mov ebx, (2 << (N)) \
__asm or edx, eax \
__asm or ebx, eax \
__asm shl edx, (N) \
__asm sub ecx, edx \
__asm sbb esi, esi \
__asm sub eax, ebx \
__asm and eax, esi \
__asm add eax, ebx \
__asm and edx, esi \
__asm add ecx, edx
#endif
__asm {
mov ecx, [esp+4] ; x
push ebx ; save as per calling convention
push esi ; save as per calling convention
xor eax, eax ; 2*root
/* iteration 15 */
mov ebx, (2 << (15))
mov esi, ecx
sub ecx, (1 << (30))
cmovnc eax, ebx
cmovc ecx, esi
iterasm (14);
iterasm (13);
iterasm (12);
iterasm (11);
iterasm (10);
iterasm (9);
iterasm (8);
iterasm (7);
iterasm (6);
iterasm (5);
iterasm (4);
iterasm (3);
iterasm (2);
iterasm (1);
/* iteration 0 */
mov edx, 1
mov ebx, 2
add edx, eax
add ebx, eax
sub ecx, edx
cmovnc eax, ebx
shr eax, 1
mov esi, [esp] ; restore as per calling convention
mov ebx, [esp+4] ; restore as per calling convention
add esp, 8 ; remove temp variables
ret 4 ; pop one DWORD arg and return
}
}
请注意,尽管X86处理器可以执行cmovCC指令,算法也不能在除了基于Athlon处理器的情况下使性能商有所提高。这个方案明显超过第一个方法。但是 Norbert写信告诉我,cmovCC, sbb 和setCC指令在P4平台上慢得令人难以忍受。
…目前为止最快的ISQRT()是在FPU上实现的。即使改变FP中的控制字,它也很快。看起来P4有一个为FLDCW准备的优化选项,比Athlon的要好。既然大部分应用只使用两个不同的控制字(CW),我猜他们或许在内部缓存了最后两个控制字。就是说,this is like renaming limited to 2 rename registers。