crc32: changeset 3:6483683ac857 (default, tip)
*: add profiling code too; expand x86 to use all eight XMM registers
basically ported verbatim from the assembly
| author | Paper <paper@tflc.us> |
|---|---|
| date | Mon, 09 Feb 2026 21:30:30 -0500 |
| parents | ead9f84d11db |
| children | |
| files | Makefile crc32-test.c crc32.c crc32c.c crc32i.h crc32x86-tab.h crc32x86.c |
| diffstat | 7 files changed, 1132 insertions(+), 44 deletions(-) |
--- a/Makefile	Mon Feb 09 01:21:00 2026 -0500
+++ b/Makefile	Mon Feb 09 21:30:30 2026 -0500
@@ -1,5 +1,7 @@
+CFLAGS := -g -fvisibility=hidden -O2 $(CFLAGS)
+
 crc32: crc32.o crc32-table.o crc32-test.o crc32c.o crc32qw.o crc32x86.o
-	$(CC) -o $@ $^
+	$(CC) $(CFLAGS) -o $@ $^
 
 clean:
 	$(RM) crc32 *.o
--- a/crc32-test.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32-test.c	Mon Feb 09 21:30:30 2026 -0500
@@ -1,17 +1,22 @@
 #include "crc32i.h"
 #include <stdio.h>
+#include <inttypes.h>
+#include <time.h>
 
 /* Test implementations and make sure they agree with each other */
 int crc32_test(void)
 {
 	/* Force alignment :) */
-	static const __attribute__((__aligned__(CRC32_MAX_ALIGNMENT))) unsigned char testdata[1024] =
+	static const CRC32_ALIGN(CRC32_MAX_ALIGNMENT) unsigned char testdata[(1ul << 23) + 19] =
 #define DOUBLE(x) x x
-DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE("\x01\x02\x04\x08\x10\x20\x40\x80")))))))
+DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE("\x01\x02\x04\x08\x10\x20\x40\x80"))))))))
 #undef DOUBLE
 	;
-	static const crc32_r_spec crc[] = {
-#define CRC32_IMPL(name) crc32##name##_r,
+	static const struct {
+		crc32_r_spec f;
+		const char *name;
+	} crc[] = {
+#define CRC32_IMPL(name) {crc32##name##_r, #name},
 #include "crc32-impls.h"
 	};
 	size_t i;
@@ -19,10 +24,18 @@
 	uint32_t crcc = crc32(testdata, sizeof(testdata));
 
 	for (i = 0; i < ARRAY_SIZE(crc); i++) {
-		uint32_t thiscrc = ~crc[i](0xFFFFFFFF, testdata, sizeof(testdata));
+		clock_t start, end;
+
+		start = clock();
+		uint32_t thiscrc = crc[i].f(0xFFFFFFFF, testdata, sizeof(testdata));
+		end = clock();
+
+		printf("%s: took %f secs\n", crc[i].name, (double)(end - start) / CLOCKS_PER_SEC);
+
+		thiscrc = ~thiscrc;
 		if (thiscrc != crcc) {
-			fprintf(stderr, "%zu, mismatch: %08" PRIX32 ", %08" PRIx32 "\n", i, crcc, thiscrc);
+			fprintf(stderr, "%s: mismatch: %08" PRIX32 ", %08" PRIx32 "\n", crc[i].name, crcc, thiscrc);
 			return -1;
 		}
 	}
 
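The test now times each implementation with clock(), which measures CPU time rather than wall time. For reference, the timing pattern reduces to this standalone sketch (the crc32 prototype and buffer size are assumptions inferred from the code above):

#include <stdio.h>
#include <stdint.h>
#include <stdlib.h>
#include <time.h>

/* Assumed one-shot entry point, matching the crc32(testdata, sizeof(testdata)) call above */
uint32_t crc32(const void *data, size_t sz);

int main(void)
{
	size_t sz = (1ul << 23) + 19;       /* same odd size the test uses */
	unsigned char *buf = calloc(sz, 1); /* zero-filled stand-in for testdata */
	if (!buf) return 1;

	clock_t start = clock();
	uint32_t crc = crc32(buf, sz);
	clock_t end = clock();

	double secs = (double)(end - start) / CLOCKS_PER_SEC;
	printf("%08X: %f secs (%.1f MB/s)\n", crc, secs, secs > 0 ? sz / secs / 1e6 : 0.0);

	free(buf);
	return 0;
}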
--- a/crc32.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32.c	Mon Feb 09 21:30:30 2026 -0500
@@ -17,14 +17,12 @@
 		return;
 
 	/* Calculate size needed to align */
-	sz8 = align - ((uintptr_t)message % align);
+	sz8 = align - ((uintptr_t)*message % align);
 	szs = MIN(*sz, sz8);
 
 	*crc = crcfunc(*crc, *message, sz8);
 	*message += sz8;
 	*sz -= sz8;
-
-	if (szs == sz8) assert(ALIGNED(*message, align));
 }
 
 CRC32_API
@@ -39,12 +37,17 @@
 	crc = 0xFFFFFFFF;
 	crc32_align(&crc, crc32c_r, ALIGNOF(uint32_t), &message, &sz);
 	if (!sz) return ~crc;
-#ifdef __x86_64__
-	crc32_align(&crc, crc32qw_r, 16, &message, &sz);
-	if (!sz) return ~crc;
-	return ~crc32x86_vpclmulqdq_r(crc, message, sz);
-#else
+#if defined(__x86_64__) && defined(__GNUC__)
+	/* Check at runtime if we can use vpclmulqdq */
+	if (__builtin_cpu_supports("vpclmulqdq")) {
+		/* Align and do the rest with vpclmulqdq */
+		crc32_align(&crc, crc32qw_r, 16, &message, &sz);
+		if (!sz) return ~crc;
+
+		return ~crc32x86_vpclmulqdq_r(crc, message, sz);
+	}
 	/* Otherwise just use 32-bit impl */
+#endif
+
 	return ~crc32qw_r(crc, message, sz);
-#endif
 }
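The dispatch above pairs a compile-time check with a runtime probe: __builtin_cpu_supports() is a GCC/Clang builtin backed by a cached CPUID query, so one binary can ship VPCLMULQDQ code and still run on older CPUs. A minimal sketch of the shape (function names here are illustrative, not the repo's):

#include <stddef.h>
#include <stdint.h>

uint32_t crc32_portable_r(uint32_t crc, const unsigned char *p, size_t n);
uint32_t crc32_clmul_r(uint32_t crc, const unsigned char *p, size_t n);

uint32_t crc32_r_dispatch(uint32_t crc, const unsigned char *p, size_t n)
{
#if defined(__x86_64__) && defined(__GNUC__)
	/* The feature string names an ISA extension known to the compiler,
	 * e.g. "pclmul" or "vpclmulqdq"; the probe itself is cheap. */
	if (__builtin_cpu_supports("vpclmulqdq"))
		return crc32_clmul_r(crc, p, n);
#endif
	return crc32_portable_r(crc, p, n);
}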
--- a/crc32c.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32c.c	Mon Feb 09 21:30:30 2026 -0500
@@ -2,10 +2,8 @@
 
 uint32_t crc32c_r(uint32_t crc, const unsigned char *message, size_t sz)
 {
-	size_t i;
-
-	for (i = 0; i < sz; i++)
-		crc = (crc >> 8) ^ crc32_tab[(crc ^ message[i]) & 0xFF];
+	while (sz--)
+		crc = (crc >> 8) ^ crc32_tab[(crc ^ *message++) & 0xFF];
 
 	return crc;
 }
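This is the classic byte-at-a-time table method: each lookup folds one message byte into the running CRC. crc32_tab itself is presumably built in crc32-table.c (linked by the Makefile); a typical generator for such a table from CRC32_POLYNOMIAL looks like this (a sketch; the repo's generator may differ):

#include <stdint.h>

#define CRC32_POLYNOMIAL 0xedb88320 /* reflected polynomial, as defined in crc32i.h */

static uint32_t crc32_tab[256];

static void crc32_tab_init(void)
{
	uint32_t i, c;
	int k;

	for (i = 0; i < 256; i++) {
		c = i;
		/* Run 8 steps of the bitwise (reflected) CRC on each byte value */
		for (k = 0; k < 8; k++)
			c = (c & 1) ? (c >> 1) ^ CRC32_POLYNOMIAL : c >> 1;
		crc32_tab[i] = c;
	}
}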
--- a/crc32i.h	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32i.h	Mon Feb 09 21:30:30 2026 -0500
@@ -13,11 +13,22 @@
  * to be destroyed or if it can be cached. */
 #define CRC32_POLYNOMIAL 0xedb88320
 
-/* crc32b.c */
+#if (__STDC_VERSION__ >= 201112L)
+# define CRC32_ALIGN(N) alignas(N)
+#elif defined(__GNUC__)
+# define CRC32_ALIGN(N) __attribute__((__aligned__(N)))
+#elif defined(_MSC_VER)
+# define CRC32_ALIGN(N) __declspec(align(N))
+#else
+# error fuck
+#endif
+
 #ifdef __GNUC__
-# define CRC32_PURE __attribute__((__pure__))
+# define CRC32_FORCEINLINE static inline __attribute__((__always_inline__))
+#elif defined(_MSC_VER)
+# define CRC32_FORCEINLINE static __forceinline
 #else
-# define CRC32_PURE
+# define CRC32_FORCEINLINE static inline
 #endif
 
 #define ALIGNOF(type) offsetof(struct { type a; char b; }, b)
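A side note on the ALIGNOF context line: with char b placed after the member, offsetof(..., b) equals sizeof(type), which coincides with the alignment for the scalar types used here; the classic variant puts the char first and yields the alignment even when size and alignment differ. A standalone illustration (not from the repo):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* Variant used in crc32i.h: b sits right after a, so its offset is sizeof(type). */
#define ALIGNOF_AFTER(type)  offsetof(struct { type a; char b; }, b)
/* Classic variant: a is padded up to its alignment boundary after c. */
#define ALIGNOF_BEFORE(type) offsetof(struct { char c; type a; }, a)

int main(void)
{
	printf("%zu %zu\n", ALIGNOF_AFTER(uint32_t), ALIGNOF_BEFORE(uint32_t)); /* 4 4 */
	printf("%zu %zu\n", ALIGNOF_AFTER(uint64_t), ALIGNOF_BEFORE(uint64_t)); /* 8 8 on x86-64 */
	return 0;
}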
--- a/crc32x86-tab.h	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32x86-tab.h	Mon Feb 09 21:30:30 2026 -0500
@@ -1086,3 +1086,963 @@
 XNDIVP_DIV_ITER(543, 542)
 XNDIVP_MOD_ITER(544, 543)
 XNDIVP_DIV_ITER(544, 543)
+XNDIVP_MOD_ITER(545, 544)
+XNDIVP_DIV_ITER(545, 544)
+XNDIVP_MOD_ITER(546, 545)
+XNDIVP_DIV_ITER(546, 545)
+XNDIVP_MOD_ITER(547, 546)
+XNDIVP_DIV_ITER(547, 546)
+XNDIVP_MOD_ITER(548, 547)
+XNDIVP_DIV_ITER(548, 547)
+XNDIVP_MOD_ITER(549, 548)
+XNDIVP_DIV_ITER(549, 548)
+XNDIVP_MOD_ITER(550, 549)
+XNDIVP_DIV_ITER(550, 549)
+XNDIVP_MOD_ITER(551, 550)
+XNDIVP_DIV_ITER(551, 550)
+XNDIVP_MOD_ITER(552, 551)
+XNDIVP_DIV_ITER(552, 551)
+XNDIVP_MOD_ITER(553, 552)
+XNDIVP_DIV_ITER(553, 552)
+XNDIVP_MOD_ITER(554, 553)
+XNDIVP_DIV_ITER(554, 553)
+XNDIVP_MOD_ITER(555, 554)
+XNDIVP_DIV_ITER(555, 554)
+XNDIVP_MOD_ITER(556, 555)
+XNDIVP_DIV_ITER(556, 555)
+XNDIVP_MOD_ITER(557, 556)
+XNDIVP_DIV_ITER(557, 556)
+XNDIVP_MOD_ITER(558, 557)
+XNDIVP_DIV_ITER(558, 557)
+XNDIVP_MOD_ITER(559, 558)
+XNDIVP_DIV_ITER(559, 558)
+XNDIVP_MOD_ITER(560, 559)
+XNDIVP_DIV_ITER(560, 559)
+XNDIVP_MOD_ITER(561, 560)
+XNDIVP_DIV_ITER(561, 560)
+XNDIVP_MOD_ITER(562, 561)
+XNDIVP_DIV_ITER(562, 561)
+XNDIVP_MOD_ITER(563, 562)
+XNDIVP_DIV_ITER(563, 562)
+XNDIVP_MOD_ITER(564, 563)
+XNDIVP_DIV_ITER(564, 563)
+XNDIVP_MOD_ITER(565, 564)
+XNDIVP_DIV_ITER(565, 564)
+XNDIVP_MOD_ITER(566, 565)
+XNDIVP_DIV_ITER(566, 565)
+XNDIVP_MOD_ITER(567, 566)
+XNDIVP_DIV_ITER(567, 566)
+XNDIVP_MOD_ITER(568, 567)
+XNDIVP_DIV_ITER(568, 567)
+XNDIVP_MOD_ITER(569, 568)
+XNDIVP_DIV_ITER(569, 568)
+XNDIVP_MOD_ITER(570, 569)
+XNDIVP_DIV_ITER(570, 569)
+XNDIVP_MOD_ITER(571, 570)
+XNDIVP_DIV_ITER(571, 570)
+XNDIVP_MOD_ITER(572, 571)
+XNDIVP_DIV_ITER(572, 571)
+XNDIVP_MOD_ITER(573, 572)
+XNDIVP_DIV_ITER(573, 572)
+XNDIVP_MOD_ITER(574, 573)
+XNDIVP_DIV_ITER(574, 573)
+XNDIVP_MOD_ITER(575, 574)
+XNDIVP_DIV_ITER(575, 574)
+XNDIVP_MOD_ITER(576, 575)
+XNDIVP_DIV_ITER(576, 575)
+XNDIVP_MOD_ITER(577, 576)
+XNDIVP_DIV_ITER(577, 576)
+XNDIVP_MOD_ITER(578, 577)
+XNDIVP_DIV_ITER(578, 577)
+XNDIVP_MOD_ITER(579, 578)
+XNDIVP_DIV_ITER(579, 578)
+XNDIVP_MOD_ITER(580, 579)
+XNDIVP_DIV_ITER(580, 579)
+XNDIVP_MOD_ITER(581, 580)
+XNDIVP_DIV_ITER(581, 580)
+XNDIVP_MOD_ITER(582, 581)
+XNDIVP_DIV_ITER(582, 581)
+XNDIVP_MOD_ITER(583, 582)
+XNDIVP_DIV_ITER(583, 582)
+XNDIVP_MOD_ITER(584, 583)
+XNDIVP_DIV_ITER(584, 583)
+XNDIVP_MOD_ITER(585, 584)
+XNDIVP_DIV_ITER(585, 584)
+XNDIVP_MOD_ITER(586, 585)
+XNDIVP_DIV_ITER(586, 585)
+XNDIVP_MOD_ITER(587, 586)
+XNDIVP_DIV_ITER(587, 586)
+XNDIVP_MOD_ITER(588, 587)
+XNDIVP_DIV_ITER(588, 587)
+XNDIVP_MOD_ITER(589, 588)
+XNDIVP_DIV_ITER(589, 588)
+XNDIVP_MOD_ITER(590, 589)
+XNDIVP_DIV_ITER(590, 589)
+XNDIVP_MOD_ITER(591, 590)
+XNDIVP_DIV_ITER(591, 590)
+XNDIVP_MOD_ITER(592, 591)
+XNDIVP_DIV_ITER(592, 591)
+XNDIVP_MOD_ITER(593, 592)
+XNDIVP_DIV_ITER(593, 592)
+XNDIVP_MOD_ITER(594, 593)
+XNDIVP_DIV_ITER(594, 593)
+XNDIVP_MOD_ITER(595, 594)
+XNDIVP_DIV_ITER(595, 594)
+XNDIVP_MOD_ITER(596, 595)
+XNDIVP_DIV_ITER(596, 595)
+XNDIVP_MOD_ITER(597, 596)
+XNDIVP_DIV_ITER(597, 596)
+XNDIVP_MOD_ITER(598, 597)
+XNDIVP_DIV_ITER(598, 597)
+XNDIVP_MOD_ITER(599, 598)
+XNDIVP_DIV_ITER(599, 598)
+XNDIVP_MOD_ITER(600, 599)
+XNDIVP_DIV_ITER(600, 599)
+XNDIVP_MOD_ITER(601, 600)
+XNDIVP_DIV_ITER(601, 600)
+XNDIVP_MOD_ITER(602, 601)
+XNDIVP_DIV_ITER(602, 601)
+XNDIVP_MOD_ITER(603, 602)
+XNDIVP_DIV_ITER(603, 602)
+XNDIVP_MOD_ITER(604, 603)
+XNDIVP_DIV_ITER(604, 603)
+XNDIVP_MOD_ITER(605, 604)
+XNDIVP_DIV_ITER(605, 604)
+XNDIVP_MOD_ITER(606, 605)
+XNDIVP_DIV_ITER(606, 605)
+XNDIVP_MOD_ITER(607, 606)
+XNDIVP_DIV_ITER(607, 606)
+XNDIVP_MOD_ITER(608, 607)
+XNDIVP_DIV_ITER(608, 607)
+XNDIVP_MOD_ITER(609, 608)
+XNDIVP_DIV_ITER(609, 608)
+XNDIVP_MOD_ITER(610, 609)
+XNDIVP_DIV_ITER(610, 609)
+XNDIVP_MOD_ITER(611, 610)
+XNDIVP_DIV_ITER(611, 610)
+XNDIVP_MOD_ITER(612, 611)
+XNDIVP_DIV_ITER(612, 611)
+XNDIVP_MOD_ITER(613, 612)
+XNDIVP_DIV_ITER(613, 612)
+XNDIVP_MOD_ITER(614, 613)
+XNDIVP_DIV_ITER(614, 613)
+XNDIVP_MOD_ITER(615, 614)
+XNDIVP_DIV_ITER(615, 614)
+XNDIVP_MOD_ITER(616, 615)
+XNDIVP_DIV_ITER(616, 615)
+XNDIVP_MOD_ITER(617, 616)
+XNDIVP_DIV_ITER(617, 616)
+XNDIVP_MOD_ITER(618, 617)
+XNDIVP_DIV_ITER(618, 617)
+XNDIVP_MOD_ITER(619, 618)
+XNDIVP_DIV_ITER(619, 618)
+XNDIVP_MOD_ITER(620, 619)
+XNDIVP_DIV_ITER(620, 619)
+XNDIVP_MOD_ITER(621, 620)
+XNDIVP_DIV_ITER(621, 620)
+XNDIVP_MOD_ITER(622, 621)
+XNDIVP_DIV_ITER(622, 621)
+XNDIVP_MOD_ITER(623, 622)
+XNDIVP_DIV_ITER(623, 622)
+XNDIVP_MOD_ITER(624, 623)
+XNDIVP_DIV_ITER(624, 623)
+XNDIVP_MOD_ITER(625, 624)
+XNDIVP_DIV_ITER(625, 624)
+XNDIVP_MOD_ITER(626, 625)
+XNDIVP_DIV_ITER(626, 625)
+XNDIVP_MOD_ITER(627, 626)
+XNDIVP_DIV_ITER(627, 626)
+XNDIVP_MOD_ITER(628, 627)
+XNDIVP_DIV_ITER(628, 627)
+XNDIVP_MOD_ITER(629, 628)
+XNDIVP_DIV_ITER(629, 628)
+XNDIVP_MOD_ITER(630, 629)
+XNDIVP_DIV_ITER(630, 629)
+XNDIVP_MOD_ITER(631, 630)
+XNDIVP_DIV_ITER(631, 630)
+XNDIVP_MOD_ITER(632, 631)
+XNDIVP_DIV_ITER(632, 631)
+XNDIVP_MOD_ITER(633, 632)
+XNDIVP_DIV_ITER(633, 632)
+XNDIVP_MOD_ITER(634, 633)
+XNDIVP_DIV_ITER(634, 633)
+XNDIVP_MOD_ITER(635, 634)
+XNDIVP_DIV_ITER(635, 634)
+XNDIVP_MOD_ITER(636, 635)
+XNDIVP_DIV_ITER(636, 635)
+XNDIVP_MOD_ITER(637, 636)
+XNDIVP_DIV_ITER(637, 636)
+XNDIVP_MOD_ITER(638, 637)
+XNDIVP_DIV_ITER(638, 637)
+XNDIVP_MOD_ITER(639, 638)
+XNDIVP_DIV_ITER(639, 638)
+XNDIVP_MOD_ITER(640, 639)
+XNDIVP_DIV_ITER(640, 639)
+XNDIVP_MOD_ITER(641, 640)
+XNDIVP_DIV_ITER(641, 640)
+XNDIVP_MOD_ITER(642, 641)
+XNDIVP_DIV_ITER(642, 641)
+XNDIVP_MOD_ITER(643, 642)
+XNDIVP_DIV_ITER(643, 642)
+XNDIVP_MOD_ITER(644, 643)
+XNDIVP_DIV_ITER(644, 643)
+XNDIVP_MOD_ITER(645, 644)
+XNDIVP_DIV_ITER(645, 644)
+XNDIVP_MOD_ITER(646, 645)
+XNDIVP_DIV_ITER(646, 645)
+XNDIVP_MOD_ITER(647, 646)
+XNDIVP_DIV_ITER(647, 646)
+XNDIVP_MOD_ITER(648, 647)
+XNDIVP_DIV_ITER(648, 647)
+XNDIVP_MOD_ITER(649, 648)
+XNDIVP_DIV_ITER(649, 648)
+XNDIVP_MOD_ITER(650, 649)
+XNDIVP_DIV_ITER(650, 649)
+XNDIVP_MOD_ITER(651, 650)
+XNDIVP_DIV_ITER(651, 650)
+XNDIVP_MOD_ITER(652, 651)
+XNDIVP_DIV_ITER(652, 651)
+XNDIVP_MOD_ITER(653, 652)
+XNDIVP_DIV_ITER(653, 652)
+XNDIVP_MOD_ITER(654, 653)
+XNDIVP_DIV_ITER(654, 653)
+XNDIVP_MOD_ITER(655, 654)
+XNDIVP_DIV_ITER(655, 654)
+XNDIVP_MOD_ITER(656, 655)
+XNDIVP_DIV_ITER(656, 655)
+XNDIVP_MOD_ITER(657, 656)
+XNDIVP_DIV_ITER(657, 656)
+XNDIVP_MOD_ITER(658, 657)
+XNDIVP_DIV_ITER(658, 657)
+XNDIVP_MOD_ITER(659, 658)
+XNDIVP_DIV_ITER(659, 658)
+XNDIVP_MOD_ITER(660, 659)
+XNDIVP_DIV_ITER(660, 659)
+XNDIVP_MOD_ITER(661, 660)
+XNDIVP_DIV_ITER(661, 660)
+XNDIVP_MOD_ITER(662, 661)
+XNDIVP_DIV_ITER(662, 661)
+XNDIVP_MOD_ITER(663, 662)
+XNDIVP_DIV_ITER(663, 662)
+XNDIVP_MOD_ITER(664, 663)
+XNDIVP_DIV_ITER(664, 663)
+XNDIVP_MOD_ITER(665, 664)
+XNDIVP_DIV_ITER(665, 664)
+XNDIVP_MOD_ITER(666, 665)
+XNDIVP_DIV_ITER(666, 665)
+XNDIVP_MOD_ITER(667, 666)
+XNDIVP_DIV_ITER(667, 666)
+XNDIVP_MOD_ITER(668, 667)
+XNDIVP_DIV_ITER(668, 667)
+XNDIVP_MOD_ITER(669, 668)
+XNDIVP_DIV_ITER(669, 668)
+XNDIVP_MOD_ITER(670, 669)
+XNDIVP_DIV_ITER(670, 669)
+XNDIVP_MOD_ITER(671, 670)
+XNDIVP_DIV_ITER(671, 670)
+XNDIVP_MOD_ITER(672, 671)
+XNDIVP_DIV_ITER(672, 671)
+XNDIVP_MOD_ITER(673, 672)
+XNDIVP_DIV_ITER(673, 672)
+XNDIVP_MOD_ITER(674, 673)
+XNDIVP_DIV_ITER(674, 673)
+XNDIVP_MOD_ITER(675, 674)
+XNDIVP_DIV_ITER(675, 674)
+XNDIVP_MOD_ITER(676, 675)
+XNDIVP_DIV_ITER(676, 675)
+XNDIVP_MOD_ITER(677, 676)
+XNDIVP_DIV_ITER(677, 676)
+XNDIVP_MOD_ITER(678, 677)
+XNDIVP_DIV_ITER(678, 677)
+XNDIVP_MOD_ITER(679, 678)
+XNDIVP_DIV_ITER(679, 678)
+XNDIVP_MOD_ITER(680, 679)
+XNDIVP_DIV_ITER(680, 679)
+XNDIVP_MOD_ITER(681, 680)
+XNDIVP_DIV_ITER(681, 680)
+XNDIVP_MOD_ITER(682, 681)
+XNDIVP_DIV_ITER(682, 681)
+XNDIVP_MOD_ITER(683, 682)
+XNDIVP_DIV_ITER(683, 682)
+XNDIVP_MOD_ITER(684, 683)
+XNDIVP_DIV_ITER(684, 683)
+XNDIVP_MOD_ITER(685, 684)
+XNDIVP_DIV_ITER(685, 684)
+XNDIVP_MOD_ITER(686, 685)
+XNDIVP_DIV_ITER(686, 685)
+XNDIVP_MOD_ITER(687, 686)
+XNDIVP_DIV_ITER(687, 686)
+XNDIVP_MOD_ITER(688, 687)
+XNDIVP_DIV_ITER(688, 687)
+XNDIVP_MOD_ITER(689, 688)
+XNDIVP_DIV_ITER(689, 688)
+XNDIVP_MOD_ITER(690, 689)
+XNDIVP_DIV_ITER(690, 689)
+XNDIVP_MOD_ITER(691, 690)
+XNDIVP_DIV_ITER(691, 690)
+XNDIVP_MOD_ITER(692, 691)
+XNDIVP_DIV_ITER(692, 691)
+XNDIVP_MOD_ITER(693, 692)
+XNDIVP_DIV_ITER(693, 692)
+XNDIVP_MOD_ITER(694, 693)
+XNDIVP_DIV_ITER(694, 693)
+XNDIVP_MOD_ITER(695, 694)
+XNDIVP_DIV_ITER(695, 694)
+XNDIVP_MOD_ITER(696, 695)
+XNDIVP_DIV_ITER(696, 695)
+XNDIVP_MOD_ITER(697, 696)
+XNDIVP_DIV_ITER(697, 696)
+XNDIVP_MOD_ITER(698, 697)
+XNDIVP_DIV_ITER(698, 697)
+XNDIVP_MOD_ITER(699, 698)
+XNDIVP_DIV_ITER(699, 698)
+XNDIVP_MOD_ITER(700, 699)
+XNDIVP_DIV_ITER(700, 699)
+XNDIVP_MOD_ITER(701, 700)
+XNDIVP_DIV_ITER(701, 700)
+XNDIVP_MOD_ITER(702, 701)
+XNDIVP_DIV_ITER(702, 701)
+XNDIVP_MOD_ITER(703, 702)
+XNDIVP_DIV_ITER(703, 702)
+XNDIVP_MOD_ITER(704, 703)
+XNDIVP_DIV_ITER(704, 703)
+XNDIVP_MOD_ITER(705, 704)
+XNDIVP_DIV_ITER(705, 704)
+XNDIVP_MOD_ITER(706, 705)
+XNDIVP_DIV_ITER(706, 705)
+XNDIVP_MOD_ITER(707, 706)
+XNDIVP_DIV_ITER(707, 706)
+XNDIVP_MOD_ITER(708, 707)
+XNDIVP_DIV_ITER(708, 707)
+XNDIVP_MOD_ITER(709, 708)
+XNDIVP_DIV_ITER(709, 708)
+XNDIVP_MOD_ITER(710, 709)
+XNDIVP_DIV_ITER(710, 709)
+XNDIVP_MOD_ITER(711, 710)
+XNDIVP_DIV_ITER(711, 710)
+XNDIVP_MOD_ITER(712, 711)
+XNDIVP_DIV_ITER(712, 711)
+XNDIVP_MOD_ITER(713, 712)
+XNDIVP_DIV_ITER(713, 712)
+XNDIVP_MOD_ITER(714, 713)
+XNDIVP_DIV_ITER(714, 713)
+XNDIVP_MOD_ITER(715, 714)
+XNDIVP_DIV_ITER(715, 714)
+XNDIVP_MOD_ITER(716, 715)
+XNDIVP_DIV_ITER(716, 715)
+XNDIVP_MOD_ITER(717, 716)
+XNDIVP_DIV_ITER(717, 716)
+XNDIVP_MOD_ITER(718, 717)
+XNDIVP_DIV_ITER(718, 717)
+XNDIVP_MOD_ITER(719, 718)
+XNDIVP_DIV_ITER(719, 718)
+XNDIVP_MOD_ITER(720, 719)
+XNDIVP_DIV_ITER(720, 719)
+XNDIVP_MOD_ITER(721, 720)
+XNDIVP_DIV_ITER(721, 720)
+XNDIVP_MOD_ITER(722, 721)
+XNDIVP_DIV_ITER(722, 721)
+XNDIVP_MOD_ITER(723, 722)
+XNDIVP_DIV_ITER(723, 722)
+XNDIVP_MOD_ITER(724, 723)
+XNDIVP_DIV_ITER(724, 723)
+XNDIVP_MOD_ITER(725, 724)
+XNDIVP_DIV_ITER(725, 724)
+XNDIVP_MOD_ITER(726, 725)
+XNDIVP_DIV_ITER(726, 725)
+XNDIVP_MOD_ITER(727, 726)
+XNDIVP_DIV_ITER(727, 726)
+XNDIVP_MOD_ITER(728, 727)
+XNDIVP_DIV_ITER(728, 727)
+XNDIVP_MOD_ITER(729, 728)
+XNDIVP_DIV_ITER(729, 728)
+XNDIVP_MOD_ITER(730, 729)
+XNDIVP_DIV_ITER(730, 729)
+XNDIVP_MOD_ITER(731, 730)
+XNDIVP_DIV_ITER(731, 730)
+XNDIVP_MOD_ITER(732, 731)
+XNDIVP_DIV_ITER(732, 731)
+XNDIVP_MOD_ITER(733, 732)
+XNDIVP_DIV_ITER(733, 732)
+XNDIVP_MOD_ITER(734, 733)
+XNDIVP_DIV_ITER(734, 733)
+XNDIVP_MOD_ITER(735, 734)
+XNDIVP_DIV_ITER(735, 734)
+XNDIVP_MOD_ITER(736, 735)
+XNDIVP_DIV_ITER(736, 735)
+XNDIVP_MOD_ITER(737, 736)
+XNDIVP_DIV_ITER(737, 736)
+XNDIVP_MOD_ITER(738, 737)
+XNDIVP_DIV_ITER(738, 737)
+XNDIVP_MOD_ITER(739, 738)
+XNDIVP_DIV_ITER(739, 738)
+XNDIVP_MOD_ITER(740, 739)
+XNDIVP_DIV_ITER(740, 739)
+XNDIVP_MOD_ITER(741, 740)
+XNDIVP_DIV_ITER(741, 740)
+XNDIVP_MOD_ITER(742, 741)
+XNDIVP_DIV_ITER(742, 741)
+XNDIVP_MOD_ITER(743, 742)
+XNDIVP_DIV_ITER(743, 742)
+XNDIVP_MOD_ITER(744, 743)
+XNDIVP_DIV_ITER(744, 743)
+XNDIVP_MOD_ITER(745, 744)
+XNDIVP_DIV_ITER(745, 744)
+XNDIVP_MOD_ITER(746, 745)
+XNDIVP_DIV_ITER(746, 745)
+XNDIVP_MOD_ITER(747, 746)
+XNDIVP_DIV_ITER(747, 746)
+XNDIVP_MOD_ITER(748, 747)
+XNDIVP_DIV_ITER(748, 747)
+XNDIVP_MOD_ITER(749, 748)
+XNDIVP_DIV_ITER(749, 748)
+XNDIVP_MOD_ITER(750, 749)
+XNDIVP_DIV_ITER(750, 749)
+XNDIVP_MOD_ITER(751, 750)
+XNDIVP_DIV_ITER(751, 750)
+XNDIVP_MOD_ITER(752, 751)
+XNDIVP_DIV_ITER(752, 751)
+XNDIVP_MOD_ITER(753, 752)
+XNDIVP_DIV_ITER(753, 752)
+XNDIVP_MOD_ITER(754, 753)
+XNDIVP_DIV_ITER(754, 753)
+XNDIVP_MOD_ITER(755, 754)
+XNDIVP_DIV_ITER(755, 754)
+XNDIVP_MOD_ITER(756, 755)
+XNDIVP_DIV_ITER(756, 755)
+XNDIVP_MOD_ITER(757, 756)
+XNDIVP_DIV_ITER(757, 756)
+XNDIVP_MOD_ITER(758, 757)
+XNDIVP_DIV_ITER(758, 757)
+XNDIVP_MOD_ITER(759, 758)
+XNDIVP_DIV_ITER(759, 758)
+XNDIVP_MOD_ITER(760, 759)
+XNDIVP_DIV_ITER(760, 759)
+XNDIVP_MOD_ITER(761, 760)
+XNDIVP_DIV_ITER(761, 760)
+XNDIVP_MOD_ITER(762, 761)
+XNDIVP_DIV_ITER(762, 761)
+XNDIVP_MOD_ITER(763, 762)
+XNDIVP_DIV_ITER(763, 762)
+XNDIVP_MOD_ITER(764, 763)
+XNDIVP_DIV_ITER(764, 763)
+XNDIVP_MOD_ITER(765, 764)
+XNDIVP_DIV_ITER(765, 764)
+XNDIVP_MOD_ITER(766, 765)
+XNDIVP_DIV_ITER(766, 765)
+XNDIVP_MOD_ITER(767, 766)
+XNDIVP_DIV_ITER(767, 766)
+XNDIVP_MOD_ITER(768, 767)
+XNDIVP_DIV_ITER(768, 767)
+XNDIVP_MOD_ITER(769, 768)
+XNDIVP_DIV_ITER(769, 768)
+XNDIVP_MOD_ITER(770, 769)
+XNDIVP_DIV_ITER(770, 769)
+XNDIVP_MOD_ITER(771, 770)
+XNDIVP_DIV_ITER(771, 770)
+XNDIVP_MOD_ITER(772, 771)
+XNDIVP_DIV_ITER(772, 771)
+XNDIVP_MOD_ITER(773, 772)
+XNDIVP_DIV_ITER(773, 772)
+XNDIVP_MOD_ITER(774, 773)
+XNDIVP_DIV_ITER(774, 773)
+XNDIVP_MOD_ITER(775, 774)
+XNDIVP_DIV_ITER(775, 774)
+XNDIVP_MOD_ITER(776, 775)
+XNDIVP_DIV_ITER(776, 775)
+XNDIVP_MOD_ITER(777, 776)
+XNDIVP_DIV_ITER(777, 776)
+XNDIVP_MOD_ITER(778, 777)
+XNDIVP_DIV_ITER(778, 777)
+XNDIVP_MOD_ITER(779, 778)
+XNDIVP_DIV_ITER(779, 778)
+XNDIVP_MOD_ITER(780, 779)
+XNDIVP_DIV_ITER(780, 779)
+XNDIVP_MOD_ITER(781, 780)
+XNDIVP_DIV_ITER(781, 780)
+XNDIVP_MOD_ITER(782, 781)
+XNDIVP_DIV_ITER(782, 781)
+XNDIVP_MOD_ITER(783, 782)
+XNDIVP_DIV_ITER(783, 782)
+XNDIVP_MOD_ITER(784, 783)
+XNDIVP_DIV_ITER(784, 783)
+XNDIVP_MOD_ITER(785, 784)
+XNDIVP_DIV_ITER(785, 784)
+XNDIVP_MOD_ITER(786, 785)
+XNDIVP_DIV_ITER(786, 785)
+XNDIVP_MOD_ITER(787, 786)
+XNDIVP_DIV_ITER(787, 786)
+XNDIVP_MOD_ITER(788, 787)
+XNDIVP_DIV_ITER(788, 787)
+XNDIVP_MOD_ITER(789, 788)
+XNDIVP_DIV_ITER(789, 788)
+XNDIVP_MOD_ITER(790, 789)
+XNDIVP_DIV_ITER(790, 789)
+XNDIVP_MOD_ITER(791, 790)
+XNDIVP_DIV_ITER(791, 790)
+XNDIVP_MOD_ITER(792, 791)
+XNDIVP_DIV_ITER(792, 791)
+XNDIVP_MOD_ITER(793, 792)
+XNDIVP_DIV_ITER(793, 792)
+XNDIVP_MOD_ITER(794, 793)
+XNDIVP_DIV_ITER(794, 793)
+XNDIVP_MOD_ITER(795, 794)
+XNDIVP_DIV_ITER(795, 794)
+XNDIVP_MOD_ITER(796, 795)
+XNDIVP_DIV_ITER(796, 795)
+XNDIVP_MOD_ITER(797, 796)
+XNDIVP_DIV_ITER(797, 796)
+XNDIVP_MOD_ITER(798, 797)
+XNDIVP_DIV_ITER(798, 797)
+XNDIVP_MOD_ITER(799, 798)
+XNDIVP_DIV_ITER(799, 798)
+XNDIVP_MOD_ITER(800, 799)
+XNDIVP_DIV_ITER(800, 799)
+XNDIVP_MOD_ITER(801, 800)
+XNDIVP_DIV_ITER(801, 800)
+XNDIVP_MOD_ITER(802, 801)
+XNDIVP_DIV_ITER(802, 801)
+XNDIVP_MOD_ITER(803, 802)
+XNDIVP_DIV_ITER(803, 802)
+XNDIVP_MOD_ITER(804, 803)
+XNDIVP_DIV_ITER(804, 803)
+XNDIVP_MOD_ITER(805, 804)
+XNDIVP_DIV_ITER(805, 804)
+XNDIVP_MOD_ITER(806, 805)
+XNDIVP_DIV_ITER(806, 805)
+XNDIVP_MOD_ITER(807, 806)
+XNDIVP_DIV_ITER(807, 806)
+XNDIVP_MOD_ITER(808, 807)
+XNDIVP_DIV_ITER(808, 807)
+XNDIVP_MOD_ITER(809, 808)
+XNDIVP_DIV_ITER(809, 808)
+XNDIVP_MOD_ITER(810, 809)
+XNDIVP_DIV_ITER(810, 809)
+XNDIVP_MOD_ITER(811, 810)
+XNDIVP_DIV_ITER(811, 810)
+XNDIVP_MOD_ITER(812, 811)
+XNDIVP_DIV_ITER(812, 811)
+XNDIVP_MOD_ITER(813, 812)
+XNDIVP_DIV_ITER(813, 812)
+XNDIVP_MOD_ITER(814, 813)
+XNDIVP_DIV_ITER(814, 813)
+XNDIVP_MOD_ITER(815, 814)
+XNDIVP_DIV_ITER(815, 814)
+XNDIVP_MOD_ITER(816, 815)
+XNDIVP_DIV_ITER(816, 815)
+XNDIVP_MOD_ITER(817, 816)
+XNDIVP_DIV_ITER(817, 816)
+XNDIVP_MOD_ITER(818, 817)
+XNDIVP_DIV_ITER(818, 817)
+XNDIVP_MOD_ITER(819, 818)
+XNDIVP_DIV_ITER(819, 818)
+XNDIVP_MOD_ITER(820, 819)
+XNDIVP_DIV_ITER(820, 819)
+XNDIVP_MOD_ITER(821, 820)
+XNDIVP_DIV_ITER(821, 820)
+XNDIVP_MOD_ITER(822, 821)
+XNDIVP_DIV_ITER(822, 821)
+XNDIVP_MOD_ITER(823, 822)
+XNDIVP_DIV_ITER(823, 822)
+XNDIVP_MOD_ITER(824, 823)
+XNDIVP_DIV_ITER(824, 823)
+XNDIVP_MOD_ITER(825, 824)
+XNDIVP_DIV_ITER(825, 824)
+XNDIVP_MOD_ITER(826, 825)
+XNDIVP_DIV_ITER(826, 825)
+XNDIVP_MOD_ITER(827, 826)
+XNDIVP_DIV_ITER(827, 826)
+XNDIVP_MOD_ITER(828, 827)
+XNDIVP_DIV_ITER(828, 827)
+XNDIVP_MOD_ITER(829, 828)
+XNDIVP_DIV_ITER(829, 828)
+XNDIVP_MOD_ITER(830, 829)
+XNDIVP_DIV_ITER(830, 829)
+XNDIVP_MOD_ITER(831, 830)
+XNDIVP_DIV_ITER(831, 830)
+XNDIVP_MOD_ITER(832, 831)
+XNDIVP_DIV_ITER(832, 831)
+XNDIVP_MOD_ITER(833, 832)
+XNDIVP_DIV_ITER(833, 832)
+XNDIVP_MOD_ITER(834, 833)
+XNDIVP_DIV_ITER(834, 833)
+XNDIVP_MOD_ITER(835, 834)
+XNDIVP_DIV_ITER(835, 834)
+XNDIVP_MOD_ITER(836, 835)
+XNDIVP_DIV_ITER(836, 835)
+XNDIVP_MOD_ITER(837, 836)
+XNDIVP_DIV_ITER(837, 836)
+XNDIVP_MOD_ITER(838, 837)
+XNDIVP_DIV_ITER(838, 837)
+XNDIVP_MOD_ITER(839, 838)
+XNDIVP_DIV_ITER(839, 838)
+XNDIVP_MOD_ITER(840, 839)
+XNDIVP_DIV_ITER(840, 839)
+XNDIVP_MOD_ITER(841, 840)
+XNDIVP_DIV_ITER(841, 840)
+XNDIVP_MOD_ITER(842, 841)
+XNDIVP_DIV_ITER(842, 841)
+XNDIVP_MOD_ITER(843, 842)
+XNDIVP_DIV_ITER(843, 842)
+XNDIVP_MOD_ITER(844, 843)
+XNDIVP_DIV_ITER(844, 843)
+XNDIVP_MOD_ITER(845, 844)
+XNDIVP_DIV_ITER(845, 844)
+XNDIVP_MOD_ITER(846, 845)
+XNDIVP_DIV_ITER(846, 845)
+XNDIVP_MOD_ITER(847, 846)
+XNDIVP_DIV_ITER(847, 846)
+XNDIVP_MOD_ITER(848, 847)
+XNDIVP_DIV_ITER(848, 847)
+XNDIVP_MOD_ITER(849, 848)
+XNDIVP_DIV_ITER(849, 848)
+XNDIVP_MOD_ITER(850, 849)
+XNDIVP_DIV_ITER(850, 849)
+XNDIVP_MOD_ITER(851, 850)
+XNDIVP_DIV_ITER(851, 850)
+XNDIVP_MOD_ITER(852, 851)
+XNDIVP_DIV_ITER(852, 851)
+XNDIVP_MOD_ITER(853, 852)
+XNDIVP_DIV_ITER(853, 852)
+XNDIVP_MOD_ITER(854, 853)
+XNDIVP_DIV_ITER(854, 853)
+XNDIVP_MOD_ITER(855, 854)
+XNDIVP_DIV_ITER(855, 854)
+XNDIVP_MOD_ITER(856, 855)
+XNDIVP_DIV_ITER(856, 855)
+XNDIVP_MOD_ITER(857, 856)
+XNDIVP_DIV_ITER(857, 856)
+XNDIVP_MOD_ITER(858, 857)
+XNDIVP_DIV_ITER(858, 857)
+XNDIVP_MOD_ITER(859, 858)
+XNDIVP_DIV_ITER(859, 858)
+XNDIVP_MOD_ITER(860, 859)
+XNDIVP_DIV_ITER(860, 859)
+XNDIVP_MOD_ITER(861, 860)
+XNDIVP_DIV_ITER(861, 860)
+XNDIVP_MOD_ITER(862, 861)
+XNDIVP_DIV_ITER(862, 861)
+XNDIVP_MOD_ITER(863, 862)
+XNDIVP_DIV_ITER(863, 862)
+XNDIVP_MOD_ITER(864, 863)
+XNDIVP_DIV_ITER(864, 863)
+XNDIVP_MOD_ITER(865, 864)
+XNDIVP_DIV_ITER(865, 864)
+XNDIVP_MOD_ITER(866, 865)
+XNDIVP_DIV_ITER(866, 865)
+XNDIVP_MOD_ITER(867, 866)
+XNDIVP_DIV_ITER(867, 866)
+XNDIVP_MOD_ITER(868, 867)
+XNDIVP_DIV_ITER(868, 867)
+XNDIVP_MOD_ITER(869, 868)
+XNDIVP_DIV_ITER(869, 868)
+XNDIVP_MOD_ITER(870, 869)
+XNDIVP_DIV_ITER(870, 869)
+XNDIVP_MOD_ITER(871, 870)
+XNDIVP_DIV_ITER(871, 870)
+XNDIVP_MOD_ITER(872, 871)
+XNDIVP_DIV_ITER(872, 871)
+XNDIVP_MOD_ITER(873, 872)
+XNDIVP_DIV_ITER(873, 872)
+XNDIVP_MOD_ITER(874, 873)
+XNDIVP_DIV_ITER(874, 873)
+XNDIVP_MOD_ITER(875, 874)
+XNDIVP_DIV_ITER(875, 874)
+XNDIVP_MOD_ITER(876, 875)
+XNDIVP_DIV_ITER(876, 875)
+XNDIVP_MOD_ITER(877, 876)
+XNDIVP_DIV_ITER(877, 876)
+XNDIVP_MOD_ITER(878, 877)
+XNDIVP_DIV_ITER(878, 877)
+XNDIVP_MOD_ITER(879, 878)
+XNDIVP_DIV_ITER(879, 878)
+XNDIVP_MOD_ITER(880, 879)
+XNDIVP_DIV_ITER(880, 879)
+XNDIVP_MOD_ITER(881, 880)
+XNDIVP_DIV_ITER(881, 880)
+XNDIVP_MOD_ITER(882, 881)
+XNDIVP_DIV_ITER(882, 881)
+XNDIVP_MOD_ITER(883, 882)
+XNDIVP_DIV_ITER(883, 882)
+XNDIVP_MOD_ITER(884, 883)
+XNDIVP_DIV_ITER(884, 883)
+XNDIVP_MOD_ITER(885, 884)
+XNDIVP_DIV_ITER(885, 884)
+XNDIVP_MOD_ITER(886, 885)
+XNDIVP_DIV_ITER(886, 885)
+XNDIVP_MOD_ITER(887, 886)
+XNDIVP_DIV_ITER(887, 886)
+XNDIVP_MOD_ITER(888, 887)
+XNDIVP_DIV_ITER(888, 887)
+XNDIVP_MOD_ITER(889, 888)
+XNDIVP_DIV_ITER(889, 888)
+XNDIVP_MOD_ITER(890, 889)
+XNDIVP_DIV_ITER(890, 889)
+XNDIVP_MOD_ITER(891, 890)
+XNDIVP_DIV_ITER(891, 890)
+XNDIVP_MOD_ITER(892, 891)
+XNDIVP_DIV_ITER(892, 891)
+XNDIVP_MOD_ITER(893, 892)
+XNDIVP_DIV_ITER(893, 892)
+XNDIVP_MOD_ITER(894, 893)
+XNDIVP_DIV_ITER(894, 893)
+XNDIVP_MOD_ITER(895, 894)
+XNDIVP_DIV_ITER(895, 894)
+XNDIVP_MOD_ITER(896, 895)
+XNDIVP_DIV_ITER(896, 895)
+XNDIVP_MOD_ITER(897, 896)
+XNDIVP_DIV_ITER(897, 896)
+XNDIVP_MOD_ITER(898, 897)
+XNDIVP_DIV_ITER(898, 897)
+XNDIVP_MOD_ITER(899, 898)
+XNDIVP_DIV_ITER(899, 898)
+XNDIVP_MOD_ITER(900, 899)
+XNDIVP_DIV_ITER(900, 899)
+XNDIVP_MOD_ITER(901, 900)
+XNDIVP_DIV_ITER(901, 900)
+XNDIVP_MOD_ITER(902, 901)
+XNDIVP_DIV_ITER(902, 901)
+XNDIVP_MOD_ITER(903, 902)
+XNDIVP_DIV_ITER(903, 902)
+XNDIVP_MOD_ITER(904, 903)
+XNDIVP_DIV_ITER(904, 903)
+XNDIVP_MOD_ITER(905, 904)
+XNDIVP_DIV_ITER(905, 904)
+XNDIVP_MOD_ITER(906, 905)
+XNDIVP_DIV_ITER(906, 905)
+XNDIVP_MOD_ITER(907, 906)
+XNDIVP_DIV_ITER(907, 906)
+XNDIVP_MOD_ITER(908, 907)
+XNDIVP_DIV_ITER(908, 907)
+XNDIVP_MOD_ITER(909, 908)
+XNDIVP_DIV_ITER(909, 908)
+XNDIVP_MOD_ITER(910, 909)
+XNDIVP_DIV_ITER(910, 909)
+XNDIVP_MOD_ITER(911, 910)
+XNDIVP_DIV_ITER(911, 910)
+XNDIVP_MOD_ITER(912, 911)
+XNDIVP_DIV_ITER(912, 911)
+XNDIVP_MOD_ITER(913, 912)
+XNDIVP_DIV_ITER(913, 912)
+XNDIVP_MOD_ITER(914, 913)
+XNDIVP_DIV_ITER(914, 913)
+XNDIVP_MOD_ITER(915, 914)
+XNDIVP_DIV_ITER(915, 914)
+XNDIVP_MOD_ITER(916, 915)
+XNDIVP_DIV_ITER(916, 915)
+XNDIVP_MOD_ITER(917, 916)
+XNDIVP_DIV_ITER(917, 916)
+XNDIVP_MOD_ITER(918, 917)
+XNDIVP_DIV_ITER(918, 917)
+XNDIVP_MOD_ITER(919, 918)
+XNDIVP_DIV_ITER(919, 918)
+XNDIVP_MOD_ITER(920, 919)
+XNDIVP_DIV_ITER(920, 919)
+XNDIVP_MOD_ITER(921, 920)
+XNDIVP_DIV_ITER(921, 920)
+XNDIVP_MOD_ITER(922, 921)
+XNDIVP_DIV_ITER(922, 921)
+XNDIVP_MOD_ITER(923, 922)
+XNDIVP_DIV_ITER(923, 922)
+XNDIVP_MOD_ITER(924, 923)
+XNDIVP_DIV_ITER(924, 923)
+XNDIVP_MOD_ITER(925, 924)
+XNDIVP_DIV_ITER(925, 924)
+XNDIVP_MOD_ITER(926, 925)
+XNDIVP_DIV_ITER(926, 925)
+XNDIVP_MOD_ITER(927, 926)
+XNDIVP_DIV_ITER(927, 926)
+XNDIVP_MOD_ITER(928, 927)
+XNDIVP_DIV_ITER(928, 927)
+XNDIVP_MOD_ITER(929, 928)
+XNDIVP_DIV_ITER(929, 928)
+XNDIVP_MOD_ITER(930, 929)
+XNDIVP_DIV_ITER(930, 929)
+XNDIVP_MOD_ITER(931, 930)
+XNDIVP_DIV_ITER(931, 930)
+XNDIVP_MOD_ITER(932, 931)
+XNDIVP_DIV_ITER(932, 931)
+XNDIVP_MOD_ITER(933, 932)
+XNDIVP_DIV_ITER(933, 932)
+XNDIVP_MOD_ITER(934, 933)
+XNDIVP_DIV_ITER(934, 933)
+XNDIVP_MOD_ITER(935, 934)
+XNDIVP_DIV_ITER(935, 934)
+XNDIVP_MOD_ITER(936, 935)
+XNDIVP_DIV_ITER(936, 935)
+XNDIVP_MOD_ITER(937, 936)
+XNDIVP_DIV_ITER(937, 936)
+XNDIVP_MOD_ITER(938, 937)
+XNDIVP_DIV_ITER(938, 937)
+XNDIVP_MOD_ITER(939, 938)
+XNDIVP_DIV_ITER(939, 938)
+XNDIVP_MOD_ITER(940, 939)
+XNDIVP_DIV_ITER(940, 939)
+XNDIVP_MOD_ITER(941, 940)
+XNDIVP_DIV_ITER(941, 940)
+XNDIVP_MOD_ITER(942, 941)
+XNDIVP_DIV_ITER(942, 941)
+XNDIVP_MOD_ITER(943, 942)
+XNDIVP_DIV_ITER(943, 942)
+XNDIVP_MOD_ITER(944, 943)
+XNDIVP_DIV_ITER(944, 943)
+XNDIVP_MOD_ITER(945, 944)
+XNDIVP_DIV_ITER(945, 944)
+XNDIVP_MOD_ITER(946, 945)
+XNDIVP_DIV_ITER(946, 945)
+XNDIVP_MOD_ITER(947, 946)
+XNDIVP_DIV_ITER(947, 946)
+XNDIVP_MOD_ITER(948, 947)
+XNDIVP_DIV_ITER(948, 947)
+XNDIVP_MOD_ITER(949, 948)
+XNDIVP_DIV_ITER(949, 948)
+XNDIVP_MOD_ITER(950, 949)
+XNDIVP_DIV_ITER(950, 949)
+XNDIVP_MOD_ITER(951, 950)
+XNDIVP_DIV_ITER(951, 950)
+XNDIVP_MOD_ITER(952, 951)
+XNDIVP_DIV_ITER(952, 951)
+XNDIVP_MOD_ITER(953, 952)
+XNDIVP_DIV_ITER(953, 952)
+XNDIVP_MOD_ITER(954, 953)
+XNDIVP_DIV_ITER(954, 953)
+XNDIVP_MOD_ITER(955, 954)
+XNDIVP_DIV_ITER(955, 954)
+XNDIVP_MOD_ITER(956, 955)
+XNDIVP_DIV_ITER(956, 955)
+XNDIVP_MOD_ITER(957, 956)
+XNDIVP_DIV_ITER(957, 956)
+XNDIVP_MOD_ITER(958, 957)
+XNDIVP_DIV_ITER(958, 957)
+XNDIVP_MOD_ITER(959, 958)
+XNDIVP_DIV_ITER(959, 958)
+XNDIVP_MOD_ITER(960, 959)
+XNDIVP_DIV_ITER(960, 959)
+XNDIVP_MOD_ITER(961, 960)
+XNDIVP_DIV_ITER(961, 960)
+XNDIVP_MOD_ITER(962, 961)
+XNDIVP_DIV_ITER(962, 961)
+XNDIVP_MOD_ITER(963, 962)
+XNDIVP_DIV_ITER(963, 962)
+XNDIVP_MOD_ITER(964, 963)
+XNDIVP_DIV_ITER(964, 963)
+XNDIVP_MOD_ITER(965, 964)
+XNDIVP_DIV_ITER(965, 964)
+XNDIVP_MOD_ITER(966, 965)
+XNDIVP_DIV_ITER(966, 965)
+XNDIVP_MOD_ITER(967, 966)
+XNDIVP_DIV_ITER(967, 966)
+XNDIVP_MOD_ITER(968, 967)
+XNDIVP_DIV_ITER(968, 967)
+XNDIVP_MOD_ITER(969, 968)
+XNDIVP_DIV_ITER(969, 968)
+XNDIVP_MOD_ITER(970, 969)
+XNDIVP_DIV_ITER(970, 969)
+XNDIVP_MOD_ITER(971, 970)
+XNDIVP_DIV_ITER(971, 970)
+XNDIVP_MOD_ITER(972, 971)
+XNDIVP_DIV_ITER(972, 971)
+XNDIVP_MOD_ITER(973, 972)
+XNDIVP_DIV_ITER(973, 972)
+XNDIVP_MOD_ITER(974, 973)
+XNDIVP_DIV_ITER(974, 973)
+XNDIVP_MOD_ITER(975, 974)
+XNDIVP_DIV_ITER(975, 974)
+XNDIVP_MOD_ITER(976, 975)
+XNDIVP_DIV_ITER(976, 975)
+XNDIVP_MOD_ITER(977, 976)
+XNDIVP_DIV_ITER(977, 976)
+XNDIVP_MOD_ITER(978, 977)
+XNDIVP_DIV_ITER(978, 977)
+XNDIVP_MOD_ITER(979, 978)
+XNDIVP_DIV_ITER(979, 978)
+XNDIVP_MOD_ITER(980, 979)
+XNDIVP_DIV_ITER(980, 979)
+XNDIVP_MOD_ITER(981, 980)
+XNDIVP_DIV_ITER(981, 980)
+XNDIVP_MOD_ITER(982, 981)
+XNDIVP_DIV_ITER(982, 981)
+XNDIVP_MOD_ITER(983, 982)
+XNDIVP_DIV_ITER(983, 982)
+XNDIVP_MOD_ITER(984, 983)
+XNDIVP_DIV_ITER(984, 983)
+XNDIVP_MOD_ITER(985, 984)
+XNDIVP_DIV_ITER(985, 984)
+XNDIVP_MOD_ITER(986, 985)
+XNDIVP_DIV_ITER(986, 985)
+XNDIVP_MOD_ITER(987, 986)
+XNDIVP_DIV_ITER(987, 986)
+XNDIVP_MOD_ITER(988, 987)
+XNDIVP_DIV_ITER(988, 987)
+XNDIVP_MOD_ITER(989, 988)
+XNDIVP_DIV_ITER(989, 988)
+XNDIVP_MOD_ITER(990, 989)
+XNDIVP_DIV_ITER(990, 989)
+XNDIVP_MOD_ITER(991, 990)
+XNDIVP_DIV_ITER(991, 990)
+XNDIVP_MOD_ITER(992, 991)
+XNDIVP_DIV_ITER(992, 991)
+XNDIVP_MOD_ITER(993, 992)
+XNDIVP_DIV_ITER(993, 992)
+XNDIVP_MOD_ITER(994, 993)
+XNDIVP_DIV_ITER(994, 993)
+XNDIVP_MOD_ITER(995, 994)
+XNDIVP_DIV_ITER(995, 994)
+XNDIVP_MOD_ITER(996, 995)
+XNDIVP_DIV_ITER(996, 995)
+XNDIVP_MOD_ITER(997, 996)
+XNDIVP_DIV_ITER(997, 996)
+XNDIVP_MOD_ITER(998, 997)
+XNDIVP_DIV_ITER(998, 997)
+XNDIVP_MOD_ITER(999, 998)
+XNDIVP_DIV_ITER(999, 998)
+XNDIVP_MOD_ITER(1000, 999)
+XNDIVP_DIV_ITER(1000, 999)
+XNDIVP_MOD_ITER(1001, 1000)
+XNDIVP_DIV_ITER(1001, 1000)
+XNDIVP_MOD_ITER(1002, 1001)
+XNDIVP_DIV_ITER(1002, 1001)
+XNDIVP_MOD_ITER(1003, 1002)
+XNDIVP_DIV_ITER(1003, 1002)
+XNDIVP_MOD_ITER(1004, 1003)
+XNDIVP_DIV_ITER(1004, 1003)
+XNDIVP_MOD_ITER(1005, 1004)
+XNDIVP_DIV_ITER(1005, 1004)
+XNDIVP_MOD_ITER(1006, 1005)
+XNDIVP_DIV_ITER(1006, 1005)
+XNDIVP_MOD_ITER(1007, 1006)
+XNDIVP_DIV_ITER(1007, 1006)
+XNDIVP_MOD_ITER(1008, 1007)
+XNDIVP_DIV_ITER(1008, 1007)
+XNDIVP_MOD_ITER(1009, 1008)
+XNDIVP_DIV_ITER(1009, 1008)
+XNDIVP_MOD_ITER(1010, 1009)
+XNDIVP_DIV_ITER(1010, 1009)
+XNDIVP_MOD_ITER(1011, 1010)
+XNDIVP_DIV_ITER(1011, 1010)
+XNDIVP_MOD_ITER(1012, 1011)
+XNDIVP_DIV_ITER(1012, 1011)
+XNDIVP_MOD_ITER(1013, 1012)
+XNDIVP_DIV_ITER(1013, 1012)
+XNDIVP_MOD_ITER(1014, 1013)
+XNDIVP_DIV_ITER(1014, 1013)
+XNDIVP_MOD_ITER(1015, 1014)
+XNDIVP_DIV_ITER(1015, 1014)
+XNDIVP_MOD_ITER(1016, 1015)
+XNDIVP_DIV_ITER(1016, 1015)
+XNDIVP_MOD_ITER(1017, 1016)
+XNDIVP_DIV_ITER(1017, 1016)
+XNDIVP_MOD_ITER(1018, 1017)
+XNDIVP_DIV_ITER(1018, 1017)
+XNDIVP_MOD_ITER(1019, 1018)
+XNDIVP_DIV_ITER(1019, 1018)
+XNDIVP_MOD_ITER(1020, 1019)
+XNDIVP_DIV_ITER(1020, 1019)
+XNDIVP_MOD_ITER(1021, 1020)
+XNDIVP_DIV_ITER(1021, 1020)
+XNDIVP_MOD_ITER(1022, 1021)
+XNDIVP_DIV_ITER(1022, 1021)
+XNDIVP_MOD_ITER(1023, 1022)
+XNDIVP_DIV_ITER(1023, 1022)
+XNDIVP_MOD_ITER(1024, 1023)
+XNDIVP_DIV_ITER(1024, 1023)
--- a/crc32x86.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32x86.c	Mon Feb 09 21:30:30 2026 -0500
@@ -2,6 +2,16 @@
 
 #ifdef __x86_64__
 
+/* NOTE: None of this is really x86-specific.
+ * There are probably many other architectures with
+ * native 64x64->128.
+ *
+ * We could adapt this to use just the gcc uint128_t
+ * instead of x86 intrinsics, but it may slow things
+ * down a bit. */
+
+#define VPCLMULQDQ_TARGET __attribute__((__target__("vpclmulqdq")))
+
 #include "crc32.h"
 #include "crc32i.h"
 #include <stdio.h>
@@ -132,7 +142,7 @@
 {
 	unsigned i;
 
-	for (i = 1; i <= (4*128+32); i++) {
+	for (i = 1; i <= 1024; i++) {
 		printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1);
 		printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1);
 	}
@@ -155,44 +165,135 @@
 #define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31)
 	RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
 	RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128),
+	RK03 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_960),
+	RK04 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_1024),
 	RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
 	RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32),
 	RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32),
 	RK08 = XNDIVP_RK08R,
+	RK09 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_832),
+	RK10 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_896),
+	RK11 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_704),
+	RK12 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_768),
+	RK13 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_576),
+	RK14 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_640),
+	RK15 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_448),
+	RK16 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_512),
+	RK17 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_320),
+	RK18 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_384),
+	RK19 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_192),
+	RK20 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_256),
 #undef FIXUPCONSTANTS
 };
 
-__attribute__((__target__("vpclmulqdq")))
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+uint32_t crc32x86_barrett_reduction(__m128i msgxmm)
+{
+	static const CRC32_ALIGN(16) uint64_t rk05[2] = {RK05, RK06},
+		rk07[2] = {RK07, RK08},
+		mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
+	__m128i rk;
+
+	rk = _mm_load_si128((__m128i *)rk05);
+
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
+
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
+
+	/* Barrett Reduction */
+	rk = _mm_load_si128((__m128i *)rk07);
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
+
+	return _mm_extract_epi32(msgxmm, 2);
+}
+
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+__m128i crc32x86_fold(__m128i xmm, __m128i rk, __m128i next)
+{
+	return _mm_xor_si128(next, _mm_xor_si128(_mm_clmulepi64_si128(xmm, rk, 0x01), _mm_clmulepi64_si128(xmm, rk, 0x10)));
+}
+
+/* GCC-specific shit */
+VPCLMULQDQ_TARGET
 uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz)
 {
+	static const CRC32_ALIGN(16) uint64_t rk01[2] = {RK01, RK02},
+		rk03[2] = {RK03, RK04},
+		rk09[2] = {RK09, RK10},
+		rk11[2] = {RK11, RK12},
+		rk13[2] = {RK13, RK14},
+		rk15[2] = {RK15, RK16},
+		rk17[2] = {RK17, RK18},
+		rk19[2] = {RK19, RK20};
+	__m128i msgxmm;
+
+	if (sz >= 256) {
+		__m128i rk, msgxmma[8], xmm8;
+
+		/* receive first 128 bytes */
+		msgxmma[0] = _mm_load_si128((__m128i *)msg + 0);
+		msgxmma[1] = _mm_load_si128((__m128i *)msg + 1);
+		msgxmma[2] = _mm_load_si128((__m128i *)msg + 2);
+		msgxmma[3] = _mm_load_si128((__m128i *)msg + 3);
+		msgxmma[4] = _mm_load_si128((__m128i *)msg + 4);
+		msgxmma[5] = _mm_load_si128((__m128i *)msg + 5);
+		msgxmma[6] = _mm_load_si128((__m128i *)msg + 6);
+		msgxmma[7] = _mm_load_si128((__m128i *)msg + 7);
+		msg += 128;
+		sz -= 128;
+
+		/* XOR the initial CRC */
+		msgxmma[0] = _mm_xor_si128(msgxmma[0], _mm_cvtsi32_si128(crc));
+
+		rk = _mm_load_si128((__m128i *)rk03);
+
+		for (; sz >= 128; msg += 128, sz -= 128) {
+			/* loop unrolled */
+			msgxmma[0] = crc32x86_fold(msgxmma[0], rk, _mm_load_si128((__m128i *)msg + 0));
+			msgxmma[1] = crc32x86_fold(msgxmma[1], rk, _mm_load_si128((__m128i *)msg + 1));
+			msgxmma[2] = crc32x86_fold(msgxmma[2], rk, _mm_load_si128((__m128i *)msg + 2));
+			msgxmma[3] = crc32x86_fold(msgxmma[3], rk, _mm_load_si128((__m128i *)msg + 3));
+			msgxmma[4] = crc32x86_fold(msgxmma[4], rk, _mm_load_si128((__m128i *)msg + 4));
+			msgxmma[5] = crc32x86_fold(msgxmma[5], rk, _mm_load_si128((__m128i *)msg + 5));
+			msgxmma[6] = crc32x86_fold(msgxmma[6], rk, _mm_load_si128((__m128i *)msg + 6));
+			msgxmma[7] = crc32x86_fold(msgxmma[7], rk, _mm_load_si128((__m128i *)msg + 7));
		}
+
+		/* Fold it all into one xmm register */
+		msgxmm = msgxmma[7];
+
+		msgxmm = crc32x86_fold(msgxmma[0], _mm_load_si128((__m128i *)rk09), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[1], _mm_load_si128((__m128i *)rk11), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[2], _mm_load_si128((__m128i *)rk13), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[3], _mm_load_si128((__m128i *)rk15), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[4], _mm_load_si128((__m128i *)rk17), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[5], _mm_load_si128((__m128i *)rk19), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[6], _mm_load_si128((__m128i *)rk01), msgxmm);
+
+		/* Jump across into the 16-byte code, skipping the loading.
+		 * This is much simpler than either doing two barrett reductions or
+		 * adding a whole ton of branches... */
+		goto jmpFrom128byte;
+	}
+
 	/* This actually works for 16-byte buffers too, but whether it's actually
 	 * useful or faster is another question entirely */
 	if (sz >= 32) {
-		static const __attribute__((__aligned__(16))) uint64_t rk01[2] = {RK01, RK02},
-			rk05[2] = {RK05, RK06},
-			rk07[2] = {RK07, RK08},
-			mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
-		__m128i rk, msgxmm;
+		__m128i rk;
 
 		msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc));
+		msg += 16;
+		sz -= 16;
 
+jmpFrom128byte:
 		rk = _mm_load_si128((__m128i *)rk01);
 
-		for (msg += 16, sz -= 16; sz >= 16; msg += 16, sz -= 16) {
-			msgxmm = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x10), _mm_clmulepi64_si128(msgxmm, rk, 0x01)), _mm_load_si128((__m128i *)msg));
-		}
-
-		rk = _mm_load_si128((__m128i *)rk05);
-
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
+		for (; sz >= 16; msg += 16, sz -= 16)
+			msgxmm = crc32x86_fold(msgxmm, rk, _mm_load_si128((__m128i *)msg));
 
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
-
-		/* Barrett Reduction */
-		rk = _mm_load_si128((__m128i *)rk07);
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
-
-		crc = _mm_extract_epi32(msgxmm, 2);
+		crc = crc32x86_barrett_reduction(msgxmm);
 	}
 
 	if (!sz) return crc;
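As the NOTE at the top of this file says, none of this is conceptually x86-specific: every fold step is a pair of carry-less 64x64->128 multiplies by precomputed (bit-reversed) x^n mod P constants from the table above. A portable, much slower sketch of what one fold step computes, using GCC's unsigned __int128 as the note suggests (illustrative only, not the repo's code):

#include <stdint.h>

#ifdef __GNUC__
typedef unsigned __int128 u128;

/* Carry-less multiply: schoolbook multiplication over GF(2), i.e. with
 * XOR instead of addition, so no carries between bits. One lane of
 * _mm_clmulepi64_si128 computes exactly this. */
static u128 clmul64(uint64_t a, uint64_t b)
{
	u128 r = 0;
	while (b) {
		r ^= (u128)a << __builtin_ctzll(b); /* XOR in a shifted copy per set bit */
		b &= b - 1;                         /* clear the lowest set bit */
	}
	return r;
}

/* Equivalent of crc32x86_fold: multiply each 64-bit half of the accumulator
 * by its folding constant and XOR with the next 128 bits of message. */
static u128 fold128(u128 acc, uint64_t rk_lo, uint64_t rk_hi, u128 next)
{
	return next ^ clmul64((uint64_t)acc, rk_hi) ^ clmul64((uint64_t)(acc >> 64), rk_lo);
}
#endif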
