--- 24d7bc8da7fedcdfcb68d5f007f1210530eb6ce5 +++ 4365df31a1d52122cd670d5ad2fe321fc887337f @@ -134,214 +134,104 @@ int skip_atoi(const char **s) /* Decimal conversion is by far the most typical, and is used * for /proc and /sys data. This directly impacts e.g. top performance * with many processes running. We optimize it for speed - * using ideas described at - * (with permission from the author, Douglas W. Jones). - */ - -#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 -/* Formats correctly any integer in [0, 999999999] */ + * using code from + * http://www.cs.uiowa.edu/~jones/bcd/decimal.html + * (with permission from the author, Douglas W. Jones). */ + +/* Formats correctly any integer in [0,99999]. + * Outputs from one to five digits depending on input. + * On i386 gcc 4.1.2 -O2: ~250 bytes of code. */ static noinline_for_stack -char *put_dec_full9(char *buf, unsigned q) +char *put_dec_trunc(char *buf, unsigned q) { - unsigned r; + unsigned d3, d2, d1, d0; + d1 = (q>>4) & 0xf; + d2 = (q>>8) & 0xf; + d3 = (q>>12); + + d0 = 6*(d3 + d2 + d1) + (q & 0xf); + q = (d0 * 0xcd) >> 11; + d0 = d0 - 10*q; + *buf++ = d0 + '0'; /* least significant digit */ + d1 = q + 9*d3 + 5*d2 + d1; + if (d1 != 0) { + q = (d1 * 0xcd) >> 11; + d1 = d1 - 10*q; + *buf++ = d1 + '0'; /* next digit */ + + d2 = q + 2*d2; + if ((d2 != 0) || (d3 != 0)) { + q = (d2 * 0xd) >> 7; + d2 = d2 - 10*q; + *buf++ = d2 + '0'; /* next digit */ + + d3 = q + 4*d3; + if (d3 != 0) { + q = (d3 * 0xcd) >> 11; + d3 = d3 - 10*q; + *buf++ = d3 + '0'; /* next digit */ + if (q != 0) + *buf++ = q + '0'; /* most sign. digit */ + } + } + } - /* Possible ways to approx. divide by 10 - * (x * 0x1999999a) >> 32 x < 1073741829 (multiply must be 64-bit) - * (x * 0xcccd) >> 19 x < 81920 (x < 262149 when 64-bit mul) - * (x * 0x6667) >> 18 x < 43699 - * (x * 0x3334) >> 17 x < 16389 - * (x * 0x199a) >> 16 x < 16389 - * (x * 0x0ccd) >> 15 x < 16389 - * (x * 0x0667) >> 14 x < 2739 - * (x * 0x0334) >> 13 x < 1029 - * (x * 0x019a) >> 12 x < 1029 - * (x * 0x00cd) >> 11 x < 1029 shorter code than * 0x67 (on i386) - * (x * 0x0067) >> 10 x < 179 - * (x * 0x0034) >> 9 x < 69 same - * (x * 0x001a) >> 8 x < 69 same - * (x * 0x000d) >> 7 x < 69 same, shortest code (on i386) - * (x * 0x0007) >> 6 x < 19 - * See - */ - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 1 */ - q = (r * (uint64_t)0x1999999a) >> 32; - *buf++ = (r - 10 * q) + '0'; /* 2 */ - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 3 */ - q = (r * (uint64_t)0x1999999a) >> 32; - *buf++ = (r - 10 * q) + '0'; /* 4 */ - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 5 */ - /* Now value is under 10000, can avoid 64-bit multiply */ - q = (r * 0x199a) >> 16; - *buf++ = (r - 10 * q) + '0'; /* 6 */ - r = (q * 0xcd) >> 11; - *buf++ = (q - 10 * r) + '0'; /* 7 */ - q = (r * 0xcd) >> 11; - *buf++ = (r - 10 * q) + '0'; /* 8 */ - *buf++ = q + '0'; /* 9 */ return buf; } -#endif - -/* Similar to above but do not pad with zeros. - * Code can be easily arranged to print 9 digits too, but our callers - * always call put_dec_full9() instead when the number has 9 decimal digits. - */ +/* Same with if's removed. Always emits five digits */ static noinline_for_stack -char *put_dec_trunc8(char *buf, unsigned r) +char *put_dec_full(char *buf, unsigned q) { - unsigned q; + /* BTW, if q is in [0,9999], 8-bit ints will be enough, */ + /* but anyway, gcc produces better code with full-sized ints */ + unsigned d3, d2, d1, d0; + d1 = (q>>4) & 0xf; + d2 = (q>>8) & 0xf; + d3 = (q>>12); + + /* + * Possible ways to approx. divide by 10 + * gcc -O2 replaces multiply with shifts and adds + * (x * 0xcd) >> 11: 11001101 - shorter code than * 0x67 (on i386) + * (x * 0x67) >> 10: 1100111 + * (x * 0x34) >> 9: 110100 - same + * (x * 0x1a) >> 8: 11010 - same + * (x * 0x0d) >> 7: 1101 - same, shortest code (on i386) + */ + d0 = 6*(d3 + d2 + d1) + (q & 0xf); + q = (d0 * 0xcd) >> 11; + d0 = d0 - 10*q; + *buf++ = d0 + '0'; + d1 = q + 9*d3 + 5*d2 + d1; + q = (d1 * 0xcd) >> 11; + d1 = d1 - 10*q; + *buf++ = d1 + '0'; + + d2 = q + 2*d2; + q = (d2 * 0xd) >> 7; + d2 = d2 - 10*q; + *buf++ = d2 + '0'; + + d3 = q + 4*d3; + q = (d3 * 0xcd) >> 11; /* - shorter code */ + /* q = (d3 * 0x67) >> 10; - would also work */ + d3 = d3 - 10*q; + *buf++ = d3 + '0'; + *buf++ = q + '0'; - /* Copy of previous function's body with added early returns */ - q = (r * (uint64_t)0x1999999a) >> 32; - *buf++ = (r - 10 * q) + '0'; /* 2 */ - if (q == 0) return buf; - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 3 */ - if (r == 0) return buf; - q = (r * (uint64_t)0x1999999a) >> 32; - *buf++ = (r - 10 * q) + '0'; /* 4 */ - if (q == 0) return buf; - r = (q * (uint64_t)0x1999999a) >> 32; - *buf++ = (q - 10 * r) + '0'; /* 5 */ - if (r == 0) return buf; - q = (r * 0x199a) >> 16; - *buf++ = (r - 10 * q) + '0'; /* 6 */ - if (q == 0) return buf; - r = (q * 0xcd) >> 11; - *buf++ = (q - 10 * r) + '0'; /* 7 */ - if (r == 0) return buf; - q = (r * 0xcd) >> 11; - *buf++ = (r - 10 * q) + '0'; /* 8 */ - if (q == 0) return buf; - *buf++ = q + '0'; /* 9 */ return buf; } -/* There are two algorithms to print larger numbers. - * One is generic: divide by 1000000000 and repeatedly print - * groups of (up to) 9 digits. It's conceptually simple, - * but requires a (unsigned long long) / 1000000000 division. - * - * Second algorithm splits 64-bit unsigned long long into 16-bit chunks, - * manipulates them cleverly and generates groups of 4 decimal digits. - * It so happens that it does NOT require long long division. - * - * If long is > 32 bits, division of 64-bit values is relatively easy, - * and we will use the first algorithm. - * If long long is > 64 bits (strange architecture with VERY large long long), - * second algorithm can't be used, and we again use the first one. - * - * Else (if long is 32 bits and long long is 64 bits) we use second one. - */ - -#if BITS_PER_LONG != 32 || BITS_PER_LONG_LONG != 64 - -/* First algorithm: generic */ - -static -char *put_dec(char *buf, unsigned long long n) -{ - if (n >= 100*1000*1000) { - while (n >= 1000*1000*1000) - buf = put_dec_full9(buf, do_div(n, 1000*1000*1000)); - if (n >= 100*1000*1000) - return put_dec_full9(buf, n); - } - return put_dec_trunc8(buf, n); -} - -#else - -/* Second algorithm: valid only for 64-bit long longs */ - +/* No inlining helps gcc to use registers better */ static noinline_for_stack -char *put_dec_full4(char *buf, unsigned q) +char *put_dec(char *buf, unsigned long long num) { - unsigned r; - r = (q * 0xcccd) >> 19; - *buf++ = (q - 10 * r) + '0'; - q = (r * 0x199a) >> 16; - *buf++ = (r - 10 * q) + '0'; - r = (q * 0xcd) >> 11; - *buf++ = (q - 10 * r) + '0'; - *buf++ = r + '0'; - return buf; -} - -/* Based on code by Douglas W. Jones found at - * - * (with permission from the author). - * Performs no 64-bit division and hence should be fast on 32-bit machines. - */ -static -char *put_dec(char *buf, unsigned long long n) -{ - uint32_t d3, d2, d1, q, h; - - if (n < 100*1000*1000) - return put_dec_trunc8(buf, n); - - d1 = ((uint32_t)n >> 16); /* implicit "& 0xffff" */ - h = (n >> 32); - d2 = (h ) & 0xffff; - d3 = (h >> 16); /* implicit "& 0xffff" */ - - q = 656 * d3 + 7296 * d2 + 5536 * d1 + ((uint32_t)n & 0xffff); - - buf = put_dec_full4(buf, q % 10000); - q = q / 10000; - - d1 = q + 7671 * d3 + 9496 * d2 + 6 * d1; - buf = put_dec_full4(buf, d1 % 10000); - q = d1 / 10000; - - d2 = q + 4749 * d3 + 42 * d2; - buf = put_dec_full4(buf, d2 % 10000); - q = d2 / 10000; - - d3 = q + 281 * d3; - if (!d3) - goto done; - buf = put_dec_full4(buf, d3 % 10000); - q = d3 / 10000; - if (!q) - goto done; - buf = put_dec_full4(buf, q); -done: - while (buf[-1] == '0') - --buf; - - return buf; -} - -#endif - -/* - * Convert passed number to decimal string. - * Returns the length of string. On buffer overflow, returns 0. - * - * If speed is not important, use snprintf(). It's easy to read the code. - */ -int num_to_str(char *buf, int size, unsigned long long num) -{ - char tmp[sizeof(num) * 3]; - int idx, len; - - /* put_dec() may work incorrectly for num = 0 (generate "", not "0") */ - if (num <= 9) { - tmp[0] = '0' + num; - len = 1; - } else { - len = put_dec(tmp, num) - tmp; + while (1) { + unsigned rem; + if (num < 100000) + return put_dec_trunc(buf, num); + rem = do_div(num, 100000); + buf = put_dec_full(buf, rem); } - - if (len > size) - return 0; - for (idx = 0; idx < len; ++idx) - buf[idx] = tmp[len - idx - 1]; - return len; } #define ZEROPAD 1 /* pad with zero */ @@ -424,8 +314,8 @@ char *number(char *buf, char *end, unsig /* generate full string in tmp[], in reverse order */ i = 0; - if (num < spec.base) - tmp[i++] = digits[num] | locase; + if (num == 0) + tmp[i++] = '0'; /* Generic code, for any base: else do { tmp[i++] = (digits[do_div(num,base)] | locase); @@ -719,7 +609,7 @@ char *ip4_string(char *p, const u8 *addr } for (i = 0; i < 4; i++) { char temp[3]; /* hold each IP quad in reverse order */ - int digits = put_dec_trunc8(temp, addr[index]) - temp; + int digits = put_dec_trunc(temp, addr[index]) - temp; if (leading_zeros) { if (digits < 3) *p++ = '0';