changeset 3:6483683ac857 default tip

*: add profiling code too; expand x86 to use all eight XMM registers, basically ported verbatim from the assembly
author Paper <paper@tflc.us>
date Mon, 09 Feb 2026 21:30:30 -0500
parents ead9f84d11db
children
files Makefile crc32-test.c crc32.c crc32c.c crc32i.h crc32x86-tab.h crc32x86.c
diffstat 7 files changed, 1132 insertions(+), 44 deletions(-)
--- a/Makefile	Mon Feb 09 01:21:00 2026 -0500
+++ b/Makefile	Mon Feb 09 21:30:30 2026 -0500
@@ -1,5 +1,7 @@
+CFLAGS := -g -fvisibility=hidden -O2 $(CFLAGS)
+
 crc32: crc32.o crc32-table.o crc32-test.o crc32c.o crc32qw.o crc32x86.o
-	$(CC) -o $@ $^
+	$(CC) $(CFLAGS) -o $@ $^
 
 clean:
 	$(RM) crc32 *.o
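
A note on the new CFLAGS line: because the assignment ends with $(CFLAGS), any
flags inherited from the environment are appended after the defaults, so an
invocation like

	CFLAGS=-march=native make crc32

still builds with -g and -O2 plus the extra flag. (Passing CFLAGS on make's
command line instead overrides the makefile assignment entirely.)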
--- a/crc32-test.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32-test.c	Mon Feb 09 21:30:30 2026 -0500
@@ -1,17 +1,22 @@
 #include "crc32i.h"
 #include <stdio.h>
+#include <inttypes.h>
+#include <time.h>
 
 /* Test implementations and make sure they agree with each other */
 int crc32_test(void)
 {
 	/* Force alignment :) */
-	static const __attribute__((__aligned__(CRC32_MAX_ALIGNMENT))) unsigned char testdata[1024] =
+	static const CRC32_ALIGN(CRC32_MAX_ALIGNMENT) unsigned char testdata[(1ul << 23) + 19] =
 #define DOUBLE(x) x x
-DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE("\x01\x02\x04\x08\x10\x20\x40\x80")))))))
+DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE(DOUBLE("\x01\x02\x04\x08\x10\x20\x40\x80"))))))))
 #undef DOUBLE
 	;
-	static const crc32_r_spec crc[] = {
-#define CRC32_IMPL(name) crc32##name##_r,
+	static const struct {
+		crc32_r_spec f;
+		const char *name;
+	} crc[] = {
+#define CRC32_IMPL(name) {crc32##name##_r, #name},
 #include "crc32-impls.h"
 	};
 	size_t i;
@@ -19,10 +24,18 @@
 	uint32_t crcc = crc32(testdata, sizeof(testdata));
 
 	for (i = 0; i < ARRAY_SIZE(crc); i++) {
-		uint32_t thiscrc = ~crc[i](0xFFFFFFFF, testdata, sizeof(testdata));
+		clock_t start, end;
+
+		start = clock();
+		uint32_t thiscrc = crc[i].f(0xFFFFFFFF, testdata, sizeof(testdata));
+		end = clock();
+
+		printf("%s: took %f secs\n", crc[i].name, (double)(end - start) / CLOCKS_PER_SEC);
+
+		thiscrc = ~thiscrc;
 
 		if (thiscrc != crcc) {
-			fprintf(stderr, "%zu, mismatch: %08" PRIX32 ", %08" PRIx32 "\n", i, crcc, thiscrc);
+			fprintf(stderr, "%s: mismatch: %08" PRIX32 ", %08" PRIX32 "\n", crc[i].name, crcc, thiscrc);
 			return -1;
 		}
 	}
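
The timing added above uses clock(), which measures processor time rather than
wall time; for a single-threaded benchmark that is usually the right thing. A
wall-clock variant would swap in POSIX clock_gettime; a minimal sketch,
assuming a POSIX system (<time.h> is already included above):

	struct timespec t0, t1;

	clock_gettime(CLOCK_MONOTONIC, &t0);
	uint32_t thiscrc = crc[i].f(0xFFFFFFFF, testdata, sizeof(testdata));
	clock_gettime(CLOCK_MONOTONIC, &t1);

	printf("%s: took %f secs\n", crc[i].name,
	       (t1.tv_sec - t0.tv_sec) + (t1.tv_nsec - t0.tv_nsec) / 1e9);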
--- a/crc32.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32.c	Mon Feb 09 21:30:30 2026 -0500
@@ -17,14 +17,12 @@
 		return;
 
 	/* Calculate size needed to align */
-	sz8 = align - ((uintptr_t)message % align);
+	sz8 = align - ((uintptr_t)*message % align);
 	szs = MIN(*sz, sz8);
 
 	*crc = crcfunc(*crc, *message, szs);
 	*message += szs;
 	*sz -= szs;
-
-	if (szs == sz8) assert(ALIGNED(*message, align));
 }
 
 CRC32_API
@@ -39,12 +37,17 @@
 	crc = 0xFFFFFFFF;
 	crc32_align(&crc, crc32c_r, ALIGNOF(uint32_t), &message, &sz);
 	if (!sz) return ~crc;
-#ifdef __x86_64__
-	crc32_align(&crc, crc32qw_r, 16, &message, &sz);
-	if (!sz) return ~crc;
 
-	return ~crc32x86_vpclmulqdq_r(crc, message, sz);
-#else
+#if defined(__x86_64__) && defined(__GNUC__)
+	/* Check at runtime if we can use vpclmulqdq */
+	if (__builtin_cpu_supports("vpclmulqdq")) {
+		/* Align and do the rest with vpclmulqdq */
+		crc32_align(&crc, crc32qw_r, 16, &message, &sz);
+		if (!sz) return ~crc;
+
+		return ~crc32x86_vpclmulqdq_r(crc, message, sz);
+	} /* Otherwise just use 32-bit impl */
+#endif
+
 	return ~crc32qw_r(crc, message, sz);
-#endif
 }
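
__builtin_cpu_supports, used above, reads CPUID results that GCC caches via a
startup constructor; __builtin_cpu_init() only needs to be called explicitly
from code that runs before main. A minimal standalone probe of the same
builtin (assuming GCC or Clang on x86-64, and a compiler recent enough to
know these feature names):

	#include <stdio.h>

	int main(void)
	{
		__builtin_cpu_init();
		printf("pclmul:     %d\n", __builtin_cpu_supports("pclmul"));
		printf("vpclmulqdq: %d\n", __builtin_cpu_supports("vpclmulqdq"));
		return 0;
	}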
--- a/crc32c.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32c.c	Mon Feb 09 21:30:30 2026 -0500
@@ -2,10 +2,8 @@
 
 uint32_t crc32c_r(uint32_t crc, const unsigned char *message, size_t sz)
 {
-	size_t i;
-
-	for (i = 0; i < sz; i++)
-		crc = (crc >> 8) ^ crc32_tab[(crc ^ message[i]) & 0xFF];
+	while (sz--)
+		crc = (crc >> 8) ^ crc32_tab[(crc ^ *message++) & 0xFF];
 
 	return crc;
 }
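
For reference: crc32_tab itself lives in crc32-table.c, which this changeset
does not touch. A table like it is conventionally generated from the reflected
polynomial 0xedb88320 (CRC32_POLYNOMIAL in crc32i.h) along these lines; a
sketch, not this project's actual generator:

	#include <stdint.h>

	static uint32_t crc32_tab[256];

	static void crc32_tab_init(void)
	{
		uint32_t n, c;
		int k;

		for (n = 0; n < 256; n++) {
			c = n;
			for (k = 0; k < 8; k++)
				c = (c & 1) ? (c >> 1) ^ 0xedb88320 : (c >> 1);
			crc32_tab[n] = c;
		}
	}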
--- a/crc32i.h	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32i.h	Mon Feb 09 21:30:30 2026 -0500
@@ -13,11 +13,22 @@
  * to be destroyed or if it can be cached. */
 #define CRC32_POLYNOMIAL 0xedb88320
 
-/* crc32b.c */
+#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
+# define CRC32_ALIGN(N) _Alignas(N)
+#elif defined(__GNUC__)
+# define CRC32_ALIGN(N) __attribute__((__aligned__(N)))
+#elif defined(_MSC_VER)
+# define CRC32_ALIGN(N) __declspec(align(N))
+#else
+# error no known alignment attribute for this compiler
+#endif
+
 #ifdef __GNUC__
-# define CRC32_PURE __attribute__((__pure__))
+# define CRC32_FORCEINLINE static inline __attribute__((__always_inline__))
+#elif defined(_MSC_VER)
+# define CRC32_FORCEINLINE static __forceinline
 #else
-# define CRC32_PURE
+# define CRC32_FORCEINLINE static inline
 #endif
 
 #define ALIGNOF(type) offsetof(struct { char b; type a; }, a)
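
A quick demonstration of the two macros above (a hypothetical standalone use,
not part of the changeset):

	#include <stdio.h>
	#include <stdint.h>
	#include "crc32i.h"

	/* a 16-byte-aligned buffer, whichever dialect CRC32_ALIGN expands to */
	static CRC32_ALIGN(16) unsigned char buf[64];

	int main(void)
	{
		printf("ALIGNOF(uint32_t) = %zu\n", ALIGNOF(uint32_t));
		printf("buf %% 16 == %zu\n", (size_t)((uintptr_t)buf % 16));
		return 0;
	}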
--- a/crc32x86-tab.h	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32x86-tab.h	Mon Feb 09 21:30:30 2026 -0500
@@ -1086,3 +1086,963 @@
 XNDIVP_DIV_ITER(543, 542)
 XNDIVP_MOD_ITER(544, 543)
 XNDIVP_DIV_ITER(544, 543)
+XNDIVP_MOD_ITER(545, 544)
+XNDIVP_DIV_ITER(545, 544)
+XNDIVP_MOD_ITER(546, 545)
+XNDIVP_DIV_ITER(546, 545)
+XNDIVP_MOD_ITER(547, 546)
+XNDIVP_DIV_ITER(547, 546)
+XNDIVP_MOD_ITER(548, 547)
+XNDIVP_DIV_ITER(548, 547)
+XNDIVP_MOD_ITER(549, 548)
+XNDIVP_DIV_ITER(549, 548)
+XNDIVP_MOD_ITER(550, 549)
+XNDIVP_DIV_ITER(550, 549)
+XNDIVP_MOD_ITER(551, 550)
+XNDIVP_DIV_ITER(551, 550)
+XNDIVP_MOD_ITER(552, 551)
+XNDIVP_DIV_ITER(552, 551)
+XNDIVP_MOD_ITER(553, 552)
+XNDIVP_DIV_ITER(553, 552)
+XNDIVP_MOD_ITER(554, 553)
+XNDIVP_DIV_ITER(554, 553)
+XNDIVP_MOD_ITER(555, 554)
+XNDIVP_DIV_ITER(555, 554)
+XNDIVP_MOD_ITER(556, 555)
+XNDIVP_DIV_ITER(556, 555)
+XNDIVP_MOD_ITER(557, 556)
+XNDIVP_DIV_ITER(557, 556)
+XNDIVP_MOD_ITER(558, 557)
+XNDIVP_DIV_ITER(558, 557)
+XNDIVP_MOD_ITER(559, 558)
+XNDIVP_DIV_ITER(559, 558)
+XNDIVP_MOD_ITER(560, 559)
+XNDIVP_DIV_ITER(560, 559)
+XNDIVP_MOD_ITER(561, 560)
+XNDIVP_DIV_ITER(561, 560)
+XNDIVP_MOD_ITER(562, 561)
+XNDIVP_DIV_ITER(562, 561)
+XNDIVP_MOD_ITER(563, 562)
+XNDIVP_DIV_ITER(563, 562)
+XNDIVP_MOD_ITER(564, 563)
+XNDIVP_DIV_ITER(564, 563)
+XNDIVP_MOD_ITER(565, 564)
+XNDIVP_DIV_ITER(565, 564)
+XNDIVP_MOD_ITER(566, 565)
+XNDIVP_DIV_ITER(566, 565)
+XNDIVP_MOD_ITER(567, 566)
+XNDIVP_DIV_ITER(567, 566)
+XNDIVP_MOD_ITER(568, 567)
+XNDIVP_DIV_ITER(568, 567)
+XNDIVP_MOD_ITER(569, 568)
+XNDIVP_DIV_ITER(569, 568)
+XNDIVP_MOD_ITER(570, 569)
+XNDIVP_DIV_ITER(570, 569)
+XNDIVP_MOD_ITER(571, 570)
+XNDIVP_DIV_ITER(571, 570)
+XNDIVP_MOD_ITER(572, 571)
+XNDIVP_DIV_ITER(572, 571)
+XNDIVP_MOD_ITER(573, 572)
+XNDIVP_DIV_ITER(573, 572)
+XNDIVP_MOD_ITER(574, 573)
+XNDIVP_DIV_ITER(574, 573)
+XNDIVP_MOD_ITER(575, 574)
+XNDIVP_DIV_ITER(575, 574)
+XNDIVP_MOD_ITER(576, 575)
+XNDIVP_DIV_ITER(576, 575)
+XNDIVP_MOD_ITER(577, 576)
+XNDIVP_DIV_ITER(577, 576)
+XNDIVP_MOD_ITER(578, 577)
+XNDIVP_DIV_ITER(578, 577)
+XNDIVP_MOD_ITER(579, 578)
+XNDIVP_DIV_ITER(579, 578)
+XNDIVP_MOD_ITER(580, 579)
+XNDIVP_DIV_ITER(580, 579)
+XNDIVP_MOD_ITER(581, 580)
+XNDIVP_DIV_ITER(581, 580)
+XNDIVP_MOD_ITER(582, 581)
+XNDIVP_DIV_ITER(582, 581)
+XNDIVP_MOD_ITER(583, 582)
+XNDIVP_DIV_ITER(583, 582)
+XNDIVP_MOD_ITER(584, 583)
+XNDIVP_DIV_ITER(584, 583)
+XNDIVP_MOD_ITER(585, 584)
+XNDIVP_DIV_ITER(585, 584)
+XNDIVP_MOD_ITER(586, 585)
+XNDIVP_DIV_ITER(586, 585)
+XNDIVP_MOD_ITER(587, 586)
+XNDIVP_DIV_ITER(587, 586)
+XNDIVP_MOD_ITER(588, 587)
+XNDIVP_DIV_ITER(588, 587)
+XNDIVP_MOD_ITER(589, 588)
+XNDIVP_DIV_ITER(589, 588)
+XNDIVP_MOD_ITER(590, 589)
+XNDIVP_DIV_ITER(590, 589)
+XNDIVP_MOD_ITER(591, 590)
+XNDIVP_DIV_ITER(591, 590)
+XNDIVP_MOD_ITER(592, 591)
+XNDIVP_DIV_ITER(592, 591)
+XNDIVP_MOD_ITER(593, 592)
+XNDIVP_DIV_ITER(593, 592)
+XNDIVP_MOD_ITER(594, 593)
+XNDIVP_DIV_ITER(594, 593)
+XNDIVP_MOD_ITER(595, 594)
+XNDIVP_DIV_ITER(595, 594)
+XNDIVP_MOD_ITER(596, 595)
+XNDIVP_DIV_ITER(596, 595)
+XNDIVP_MOD_ITER(597, 596)
+XNDIVP_DIV_ITER(597, 596)
+XNDIVP_MOD_ITER(598, 597)
+XNDIVP_DIV_ITER(598, 597)
+XNDIVP_MOD_ITER(599, 598)
+XNDIVP_DIV_ITER(599, 598)
+XNDIVP_MOD_ITER(600, 599)
+XNDIVP_DIV_ITER(600, 599)
+XNDIVP_MOD_ITER(601, 600)
+XNDIVP_DIV_ITER(601, 600)
+XNDIVP_MOD_ITER(602, 601)
+XNDIVP_DIV_ITER(602, 601)
+XNDIVP_MOD_ITER(603, 602)
+XNDIVP_DIV_ITER(603, 602)
+XNDIVP_MOD_ITER(604, 603)
+XNDIVP_DIV_ITER(604, 603)
+XNDIVP_MOD_ITER(605, 604)
+XNDIVP_DIV_ITER(605, 604)
+XNDIVP_MOD_ITER(606, 605)
+XNDIVP_DIV_ITER(606, 605)
+XNDIVP_MOD_ITER(607, 606)
+XNDIVP_DIV_ITER(607, 606)
+XNDIVP_MOD_ITER(608, 607)
+XNDIVP_DIV_ITER(608, 607)
+XNDIVP_MOD_ITER(609, 608)
+XNDIVP_DIV_ITER(609, 608)
+XNDIVP_MOD_ITER(610, 609)
+XNDIVP_DIV_ITER(610, 609)
+XNDIVP_MOD_ITER(611, 610)
+XNDIVP_DIV_ITER(611, 610)
+XNDIVP_MOD_ITER(612, 611)
+XNDIVP_DIV_ITER(612, 611)
+XNDIVP_MOD_ITER(613, 612)
+XNDIVP_DIV_ITER(613, 612)
+XNDIVP_MOD_ITER(614, 613)
+XNDIVP_DIV_ITER(614, 613)
+XNDIVP_MOD_ITER(615, 614)
+XNDIVP_DIV_ITER(615, 614)
+XNDIVP_MOD_ITER(616, 615)
+XNDIVP_DIV_ITER(616, 615)
+XNDIVP_MOD_ITER(617, 616)
+XNDIVP_DIV_ITER(617, 616)
+XNDIVP_MOD_ITER(618, 617)
+XNDIVP_DIV_ITER(618, 617)
+XNDIVP_MOD_ITER(619, 618)
+XNDIVP_DIV_ITER(619, 618)
+XNDIVP_MOD_ITER(620, 619)
+XNDIVP_DIV_ITER(620, 619)
+XNDIVP_MOD_ITER(621, 620)
+XNDIVP_DIV_ITER(621, 620)
+XNDIVP_MOD_ITER(622, 621)
+XNDIVP_DIV_ITER(622, 621)
+XNDIVP_MOD_ITER(623, 622)
+XNDIVP_DIV_ITER(623, 622)
+XNDIVP_MOD_ITER(624, 623)
+XNDIVP_DIV_ITER(624, 623)
+XNDIVP_MOD_ITER(625, 624)
+XNDIVP_DIV_ITER(625, 624)
+XNDIVP_MOD_ITER(626, 625)
+XNDIVP_DIV_ITER(626, 625)
+XNDIVP_MOD_ITER(627, 626)
+XNDIVP_DIV_ITER(627, 626)
+XNDIVP_MOD_ITER(628, 627)
+XNDIVP_DIV_ITER(628, 627)
+XNDIVP_MOD_ITER(629, 628)
+XNDIVP_DIV_ITER(629, 628)
+XNDIVP_MOD_ITER(630, 629)
+XNDIVP_DIV_ITER(630, 629)
+XNDIVP_MOD_ITER(631, 630)
+XNDIVP_DIV_ITER(631, 630)
+XNDIVP_MOD_ITER(632, 631)
+XNDIVP_DIV_ITER(632, 631)
+XNDIVP_MOD_ITER(633, 632)
+XNDIVP_DIV_ITER(633, 632)
+XNDIVP_MOD_ITER(634, 633)
+XNDIVP_DIV_ITER(634, 633)
+XNDIVP_MOD_ITER(635, 634)
+XNDIVP_DIV_ITER(635, 634)
+XNDIVP_MOD_ITER(636, 635)
+XNDIVP_DIV_ITER(636, 635)
+XNDIVP_MOD_ITER(637, 636)
+XNDIVP_DIV_ITER(637, 636)
+XNDIVP_MOD_ITER(638, 637)
+XNDIVP_DIV_ITER(638, 637)
+XNDIVP_MOD_ITER(639, 638)
+XNDIVP_DIV_ITER(639, 638)
+XNDIVP_MOD_ITER(640, 639)
+XNDIVP_DIV_ITER(640, 639)
+XNDIVP_MOD_ITER(641, 640)
+XNDIVP_DIV_ITER(641, 640)
+XNDIVP_MOD_ITER(642, 641)
+XNDIVP_DIV_ITER(642, 641)
+XNDIVP_MOD_ITER(643, 642)
+XNDIVP_DIV_ITER(643, 642)
+XNDIVP_MOD_ITER(644, 643)
+XNDIVP_DIV_ITER(644, 643)
+XNDIVP_MOD_ITER(645, 644)
+XNDIVP_DIV_ITER(645, 644)
+XNDIVP_MOD_ITER(646, 645)
+XNDIVP_DIV_ITER(646, 645)
+XNDIVP_MOD_ITER(647, 646)
+XNDIVP_DIV_ITER(647, 646)
+XNDIVP_MOD_ITER(648, 647)
+XNDIVP_DIV_ITER(648, 647)
+XNDIVP_MOD_ITER(649, 648)
+XNDIVP_DIV_ITER(649, 648)
+XNDIVP_MOD_ITER(650, 649)
+XNDIVP_DIV_ITER(650, 649)
+XNDIVP_MOD_ITER(651, 650)
+XNDIVP_DIV_ITER(651, 650)
+XNDIVP_MOD_ITER(652, 651)
+XNDIVP_DIV_ITER(652, 651)
+XNDIVP_MOD_ITER(653, 652)
+XNDIVP_DIV_ITER(653, 652)
+XNDIVP_MOD_ITER(654, 653)
+XNDIVP_DIV_ITER(654, 653)
+XNDIVP_MOD_ITER(655, 654)
+XNDIVP_DIV_ITER(655, 654)
+XNDIVP_MOD_ITER(656, 655)
+XNDIVP_DIV_ITER(656, 655)
+XNDIVP_MOD_ITER(657, 656)
+XNDIVP_DIV_ITER(657, 656)
+XNDIVP_MOD_ITER(658, 657)
+XNDIVP_DIV_ITER(658, 657)
+XNDIVP_MOD_ITER(659, 658)
+XNDIVP_DIV_ITER(659, 658)
+XNDIVP_MOD_ITER(660, 659)
+XNDIVP_DIV_ITER(660, 659)
+XNDIVP_MOD_ITER(661, 660)
+XNDIVP_DIV_ITER(661, 660)
+XNDIVP_MOD_ITER(662, 661)
+XNDIVP_DIV_ITER(662, 661)
+XNDIVP_MOD_ITER(663, 662)
+XNDIVP_DIV_ITER(663, 662)
+XNDIVP_MOD_ITER(664, 663)
+XNDIVP_DIV_ITER(664, 663)
+XNDIVP_MOD_ITER(665, 664)
+XNDIVP_DIV_ITER(665, 664)
+XNDIVP_MOD_ITER(666, 665)
+XNDIVP_DIV_ITER(666, 665)
+XNDIVP_MOD_ITER(667, 666)
+XNDIVP_DIV_ITER(667, 666)
+XNDIVP_MOD_ITER(668, 667)
+XNDIVP_DIV_ITER(668, 667)
+XNDIVP_MOD_ITER(669, 668)
+XNDIVP_DIV_ITER(669, 668)
+XNDIVP_MOD_ITER(670, 669)
+XNDIVP_DIV_ITER(670, 669)
+XNDIVP_MOD_ITER(671, 670)
+XNDIVP_DIV_ITER(671, 670)
+XNDIVP_MOD_ITER(672, 671)
+XNDIVP_DIV_ITER(672, 671)
+XNDIVP_MOD_ITER(673, 672)
+XNDIVP_DIV_ITER(673, 672)
+XNDIVP_MOD_ITER(674, 673)
+XNDIVP_DIV_ITER(674, 673)
+XNDIVP_MOD_ITER(675, 674)
+XNDIVP_DIV_ITER(675, 674)
+XNDIVP_MOD_ITER(676, 675)
+XNDIVP_DIV_ITER(676, 675)
+XNDIVP_MOD_ITER(677, 676)
+XNDIVP_DIV_ITER(677, 676)
+XNDIVP_MOD_ITER(678, 677)
+XNDIVP_DIV_ITER(678, 677)
+XNDIVP_MOD_ITER(679, 678)
+XNDIVP_DIV_ITER(679, 678)
+XNDIVP_MOD_ITER(680, 679)
+XNDIVP_DIV_ITER(680, 679)
+XNDIVP_MOD_ITER(681, 680)
+XNDIVP_DIV_ITER(681, 680)
+XNDIVP_MOD_ITER(682, 681)
+XNDIVP_DIV_ITER(682, 681)
+XNDIVP_MOD_ITER(683, 682)
+XNDIVP_DIV_ITER(683, 682)
+XNDIVP_MOD_ITER(684, 683)
+XNDIVP_DIV_ITER(684, 683)
+XNDIVP_MOD_ITER(685, 684)
+XNDIVP_DIV_ITER(685, 684)
+XNDIVP_MOD_ITER(686, 685)
+XNDIVP_DIV_ITER(686, 685)
+XNDIVP_MOD_ITER(687, 686)
+XNDIVP_DIV_ITER(687, 686)
+XNDIVP_MOD_ITER(688, 687)
+XNDIVP_DIV_ITER(688, 687)
+XNDIVP_MOD_ITER(689, 688)
+XNDIVP_DIV_ITER(689, 688)
+XNDIVP_MOD_ITER(690, 689)
+XNDIVP_DIV_ITER(690, 689)
+XNDIVP_MOD_ITER(691, 690)
+XNDIVP_DIV_ITER(691, 690)
+XNDIVP_MOD_ITER(692, 691)
+XNDIVP_DIV_ITER(692, 691)
+XNDIVP_MOD_ITER(693, 692)
+XNDIVP_DIV_ITER(693, 692)
+XNDIVP_MOD_ITER(694, 693)
+XNDIVP_DIV_ITER(694, 693)
+XNDIVP_MOD_ITER(695, 694)
+XNDIVP_DIV_ITER(695, 694)
+XNDIVP_MOD_ITER(696, 695)
+XNDIVP_DIV_ITER(696, 695)
+XNDIVP_MOD_ITER(697, 696)
+XNDIVP_DIV_ITER(697, 696)
+XNDIVP_MOD_ITER(698, 697)
+XNDIVP_DIV_ITER(698, 697)
+XNDIVP_MOD_ITER(699, 698)
+XNDIVP_DIV_ITER(699, 698)
+XNDIVP_MOD_ITER(700, 699)
+XNDIVP_DIV_ITER(700, 699)
+XNDIVP_MOD_ITER(701, 700)
+XNDIVP_DIV_ITER(701, 700)
+XNDIVP_MOD_ITER(702, 701)
+XNDIVP_DIV_ITER(702, 701)
+XNDIVP_MOD_ITER(703, 702)
+XNDIVP_DIV_ITER(703, 702)
+XNDIVP_MOD_ITER(704, 703)
+XNDIVP_DIV_ITER(704, 703)
+XNDIVP_MOD_ITER(705, 704)
+XNDIVP_DIV_ITER(705, 704)
+XNDIVP_MOD_ITER(706, 705)
+XNDIVP_DIV_ITER(706, 705)
+XNDIVP_MOD_ITER(707, 706)
+XNDIVP_DIV_ITER(707, 706)
+XNDIVP_MOD_ITER(708, 707)
+XNDIVP_DIV_ITER(708, 707)
+XNDIVP_MOD_ITER(709, 708)
+XNDIVP_DIV_ITER(709, 708)
+XNDIVP_MOD_ITER(710, 709)
+XNDIVP_DIV_ITER(710, 709)
+XNDIVP_MOD_ITER(711, 710)
+XNDIVP_DIV_ITER(711, 710)
+XNDIVP_MOD_ITER(712, 711)
+XNDIVP_DIV_ITER(712, 711)
+XNDIVP_MOD_ITER(713, 712)
+XNDIVP_DIV_ITER(713, 712)
+XNDIVP_MOD_ITER(714, 713)
+XNDIVP_DIV_ITER(714, 713)
+XNDIVP_MOD_ITER(715, 714)
+XNDIVP_DIV_ITER(715, 714)
+XNDIVP_MOD_ITER(716, 715)
+XNDIVP_DIV_ITER(716, 715)
+XNDIVP_MOD_ITER(717, 716)
+XNDIVP_DIV_ITER(717, 716)
+XNDIVP_MOD_ITER(718, 717)
+XNDIVP_DIV_ITER(718, 717)
+XNDIVP_MOD_ITER(719, 718)
+XNDIVP_DIV_ITER(719, 718)
+XNDIVP_MOD_ITER(720, 719)
+XNDIVP_DIV_ITER(720, 719)
+XNDIVP_MOD_ITER(721, 720)
+XNDIVP_DIV_ITER(721, 720)
+XNDIVP_MOD_ITER(722, 721)
+XNDIVP_DIV_ITER(722, 721)
+XNDIVP_MOD_ITER(723, 722)
+XNDIVP_DIV_ITER(723, 722)
+XNDIVP_MOD_ITER(724, 723)
+XNDIVP_DIV_ITER(724, 723)
+XNDIVP_MOD_ITER(725, 724)
+XNDIVP_DIV_ITER(725, 724)
+XNDIVP_MOD_ITER(726, 725)
+XNDIVP_DIV_ITER(726, 725)
+XNDIVP_MOD_ITER(727, 726)
+XNDIVP_DIV_ITER(727, 726)
+XNDIVP_MOD_ITER(728, 727)
+XNDIVP_DIV_ITER(728, 727)
+XNDIVP_MOD_ITER(729, 728)
+XNDIVP_DIV_ITER(729, 728)
+XNDIVP_MOD_ITER(730, 729)
+XNDIVP_DIV_ITER(730, 729)
+XNDIVP_MOD_ITER(731, 730)
+XNDIVP_DIV_ITER(731, 730)
+XNDIVP_MOD_ITER(732, 731)
+XNDIVP_DIV_ITER(732, 731)
+XNDIVP_MOD_ITER(733, 732)
+XNDIVP_DIV_ITER(733, 732)
+XNDIVP_MOD_ITER(734, 733)
+XNDIVP_DIV_ITER(734, 733)
+XNDIVP_MOD_ITER(735, 734)
+XNDIVP_DIV_ITER(735, 734)
+XNDIVP_MOD_ITER(736, 735)
+XNDIVP_DIV_ITER(736, 735)
+XNDIVP_MOD_ITER(737, 736)
+XNDIVP_DIV_ITER(737, 736)
+XNDIVP_MOD_ITER(738, 737)
+XNDIVP_DIV_ITER(738, 737)
+XNDIVP_MOD_ITER(739, 738)
+XNDIVP_DIV_ITER(739, 738)
+XNDIVP_MOD_ITER(740, 739)
+XNDIVP_DIV_ITER(740, 739)
+XNDIVP_MOD_ITER(741, 740)
+XNDIVP_DIV_ITER(741, 740)
+XNDIVP_MOD_ITER(742, 741)
+XNDIVP_DIV_ITER(742, 741)
+XNDIVP_MOD_ITER(743, 742)
+XNDIVP_DIV_ITER(743, 742)
+XNDIVP_MOD_ITER(744, 743)
+XNDIVP_DIV_ITER(744, 743)
+XNDIVP_MOD_ITER(745, 744)
+XNDIVP_DIV_ITER(745, 744)
+XNDIVP_MOD_ITER(746, 745)
+XNDIVP_DIV_ITER(746, 745)
+XNDIVP_MOD_ITER(747, 746)
+XNDIVP_DIV_ITER(747, 746)
+XNDIVP_MOD_ITER(748, 747)
+XNDIVP_DIV_ITER(748, 747)
+XNDIVP_MOD_ITER(749, 748)
+XNDIVP_DIV_ITER(749, 748)
+XNDIVP_MOD_ITER(750, 749)
+XNDIVP_DIV_ITER(750, 749)
+XNDIVP_MOD_ITER(751, 750)
+XNDIVP_DIV_ITER(751, 750)
+XNDIVP_MOD_ITER(752, 751)
+XNDIVP_DIV_ITER(752, 751)
+XNDIVP_MOD_ITER(753, 752)
+XNDIVP_DIV_ITER(753, 752)
+XNDIVP_MOD_ITER(754, 753)
+XNDIVP_DIV_ITER(754, 753)
+XNDIVP_MOD_ITER(755, 754)
+XNDIVP_DIV_ITER(755, 754)
+XNDIVP_MOD_ITER(756, 755)
+XNDIVP_DIV_ITER(756, 755)
+XNDIVP_MOD_ITER(757, 756)
+XNDIVP_DIV_ITER(757, 756)
+XNDIVP_MOD_ITER(758, 757)
+XNDIVP_DIV_ITER(758, 757)
+XNDIVP_MOD_ITER(759, 758)
+XNDIVP_DIV_ITER(759, 758)
+XNDIVP_MOD_ITER(760, 759)
+XNDIVP_DIV_ITER(760, 759)
+XNDIVP_MOD_ITER(761, 760)
+XNDIVP_DIV_ITER(761, 760)
+XNDIVP_MOD_ITER(762, 761)
+XNDIVP_DIV_ITER(762, 761)
+XNDIVP_MOD_ITER(763, 762)
+XNDIVP_DIV_ITER(763, 762)
+XNDIVP_MOD_ITER(764, 763)
+XNDIVP_DIV_ITER(764, 763)
+XNDIVP_MOD_ITER(765, 764)
+XNDIVP_DIV_ITER(765, 764)
+XNDIVP_MOD_ITER(766, 765)
+XNDIVP_DIV_ITER(766, 765)
+XNDIVP_MOD_ITER(767, 766)
+XNDIVP_DIV_ITER(767, 766)
+XNDIVP_MOD_ITER(768, 767)
+XNDIVP_DIV_ITER(768, 767)
+XNDIVP_MOD_ITER(769, 768)
+XNDIVP_DIV_ITER(769, 768)
+XNDIVP_MOD_ITER(770, 769)
+XNDIVP_DIV_ITER(770, 769)
+XNDIVP_MOD_ITER(771, 770)
+XNDIVP_DIV_ITER(771, 770)
+XNDIVP_MOD_ITER(772, 771)
+XNDIVP_DIV_ITER(772, 771)
+XNDIVP_MOD_ITER(773, 772)
+XNDIVP_DIV_ITER(773, 772)
+XNDIVP_MOD_ITER(774, 773)
+XNDIVP_DIV_ITER(774, 773)
+XNDIVP_MOD_ITER(775, 774)
+XNDIVP_DIV_ITER(775, 774)
+XNDIVP_MOD_ITER(776, 775)
+XNDIVP_DIV_ITER(776, 775)
+XNDIVP_MOD_ITER(777, 776)
+XNDIVP_DIV_ITER(777, 776)
+XNDIVP_MOD_ITER(778, 777)
+XNDIVP_DIV_ITER(778, 777)
+XNDIVP_MOD_ITER(779, 778)
+XNDIVP_DIV_ITER(779, 778)
+XNDIVP_MOD_ITER(780, 779)
+XNDIVP_DIV_ITER(780, 779)
+XNDIVP_MOD_ITER(781, 780)
+XNDIVP_DIV_ITER(781, 780)
+XNDIVP_MOD_ITER(782, 781)
+XNDIVP_DIV_ITER(782, 781)
+XNDIVP_MOD_ITER(783, 782)
+XNDIVP_DIV_ITER(783, 782)
+XNDIVP_MOD_ITER(784, 783)
+XNDIVP_DIV_ITER(784, 783)
+XNDIVP_MOD_ITER(785, 784)
+XNDIVP_DIV_ITER(785, 784)
+XNDIVP_MOD_ITER(786, 785)
+XNDIVP_DIV_ITER(786, 785)
+XNDIVP_MOD_ITER(787, 786)
+XNDIVP_DIV_ITER(787, 786)
+XNDIVP_MOD_ITER(788, 787)
+XNDIVP_DIV_ITER(788, 787)
+XNDIVP_MOD_ITER(789, 788)
+XNDIVP_DIV_ITER(789, 788)
+XNDIVP_MOD_ITER(790, 789)
+XNDIVP_DIV_ITER(790, 789)
+XNDIVP_MOD_ITER(791, 790)
+XNDIVP_DIV_ITER(791, 790)
+XNDIVP_MOD_ITER(792, 791)
+XNDIVP_DIV_ITER(792, 791)
+XNDIVP_MOD_ITER(793, 792)
+XNDIVP_DIV_ITER(793, 792)
+XNDIVP_MOD_ITER(794, 793)
+XNDIVP_DIV_ITER(794, 793)
+XNDIVP_MOD_ITER(795, 794)
+XNDIVP_DIV_ITER(795, 794)
+XNDIVP_MOD_ITER(796, 795)
+XNDIVP_DIV_ITER(796, 795)
+XNDIVP_MOD_ITER(797, 796)
+XNDIVP_DIV_ITER(797, 796)
+XNDIVP_MOD_ITER(798, 797)
+XNDIVP_DIV_ITER(798, 797)
+XNDIVP_MOD_ITER(799, 798)
+XNDIVP_DIV_ITER(799, 798)
+XNDIVP_MOD_ITER(800, 799)
+XNDIVP_DIV_ITER(800, 799)
+XNDIVP_MOD_ITER(801, 800)
+XNDIVP_DIV_ITER(801, 800)
+XNDIVP_MOD_ITER(802, 801)
+XNDIVP_DIV_ITER(802, 801)
+XNDIVP_MOD_ITER(803, 802)
+XNDIVP_DIV_ITER(803, 802)
+XNDIVP_MOD_ITER(804, 803)
+XNDIVP_DIV_ITER(804, 803)
+XNDIVP_MOD_ITER(805, 804)
+XNDIVP_DIV_ITER(805, 804)
+XNDIVP_MOD_ITER(806, 805)
+XNDIVP_DIV_ITER(806, 805)
+XNDIVP_MOD_ITER(807, 806)
+XNDIVP_DIV_ITER(807, 806)
+XNDIVP_MOD_ITER(808, 807)
+XNDIVP_DIV_ITER(808, 807)
+XNDIVP_MOD_ITER(809, 808)
+XNDIVP_DIV_ITER(809, 808)
+XNDIVP_MOD_ITER(810, 809)
+XNDIVP_DIV_ITER(810, 809)
+XNDIVP_MOD_ITER(811, 810)
+XNDIVP_DIV_ITER(811, 810)
+XNDIVP_MOD_ITER(812, 811)
+XNDIVP_DIV_ITER(812, 811)
+XNDIVP_MOD_ITER(813, 812)
+XNDIVP_DIV_ITER(813, 812)
+XNDIVP_MOD_ITER(814, 813)
+XNDIVP_DIV_ITER(814, 813)
+XNDIVP_MOD_ITER(815, 814)
+XNDIVP_DIV_ITER(815, 814)
+XNDIVP_MOD_ITER(816, 815)
+XNDIVP_DIV_ITER(816, 815)
+XNDIVP_MOD_ITER(817, 816)
+XNDIVP_DIV_ITER(817, 816)
+XNDIVP_MOD_ITER(818, 817)
+XNDIVP_DIV_ITER(818, 817)
+XNDIVP_MOD_ITER(819, 818)
+XNDIVP_DIV_ITER(819, 818)
+XNDIVP_MOD_ITER(820, 819)
+XNDIVP_DIV_ITER(820, 819)
+XNDIVP_MOD_ITER(821, 820)
+XNDIVP_DIV_ITER(821, 820)
+XNDIVP_MOD_ITER(822, 821)
+XNDIVP_DIV_ITER(822, 821)
+XNDIVP_MOD_ITER(823, 822)
+XNDIVP_DIV_ITER(823, 822)
+XNDIVP_MOD_ITER(824, 823)
+XNDIVP_DIV_ITER(824, 823)
+XNDIVP_MOD_ITER(825, 824)
+XNDIVP_DIV_ITER(825, 824)
+XNDIVP_MOD_ITER(826, 825)
+XNDIVP_DIV_ITER(826, 825)
+XNDIVP_MOD_ITER(827, 826)
+XNDIVP_DIV_ITER(827, 826)
+XNDIVP_MOD_ITER(828, 827)
+XNDIVP_DIV_ITER(828, 827)
+XNDIVP_MOD_ITER(829, 828)
+XNDIVP_DIV_ITER(829, 828)
+XNDIVP_MOD_ITER(830, 829)
+XNDIVP_DIV_ITER(830, 829)
+XNDIVP_MOD_ITER(831, 830)
+XNDIVP_DIV_ITER(831, 830)
+XNDIVP_MOD_ITER(832, 831)
+XNDIVP_DIV_ITER(832, 831)
+XNDIVP_MOD_ITER(833, 832)
+XNDIVP_DIV_ITER(833, 832)
+XNDIVP_MOD_ITER(834, 833)
+XNDIVP_DIV_ITER(834, 833)
+XNDIVP_MOD_ITER(835, 834)
+XNDIVP_DIV_ITER(835, 834)
+XNDIVP_MOD_ITER(836, 835)
+XNDIVP_DIV_ITER(836, 835)
+XNDIVP_MOD_ITER(837, 836)
+XNDIVP_DIV_ITER(837, 836)
+XNDIVP_MOD_ITER(838, 837)
+XNDIVP_DIV_ITER(838, 837)
+XNDIVP_MOD_ITER(839, 838)
+XNDIVP_DIV_ITER(839, 838)
+XNDIVP_MOD_ITER(840, 839)
+XNDIVP_DIV_ITER(840, 839)
+XNDIVP_MOD_ITER(841, 840)
+XNDIVP_DIV_ITER(841, 840)
+XNDIVP_MOD_ITER(842, 841)
+XNDIVP_DIV_ITER(842, 841)
+XNDIVP_MOD_ITER(843, 842)
+XNDIVP_DIV_ITER(843, 842)
+XNDIVP_MOD_ITER(844, 843)
+XNDIVP_DIV_ITER(844, 843)
+XNDIVP_MOD_ITER(845, 844)
+XNDIVP_DIV_ITER(845, 844)
+XNDIVP_MOD_ITER(846, 845)
+XNDIVP_DIV_ITER(846, 845)
+XNDIVP_MOD_ITER(847, 846)
+XNDIVP_DIV_ITER(847, 846)
+XNDIVP_MOD_ITER(848, 847)
+XNDIVP_DIV_ITER(848, 847)
+XNDIVP_MOD_ITER(849, 848)
+XNDIVP_DIV_ITER(849, 848)
+XNDIVP_MOD_ITER(850, 849)
+XNDIVP_DIV_ITER(850, 849)
+XNDIVP_MOD_ITER(851, 850)
+XNDIVP_DIV_ITER(851, 850)
+XNDIVP_MOD_ITER(852, 851)
+XNDIVP_DIV_ITER(852, 851)
+XNDIVP_MOD_ITER(853, 852)
+XNDIVP_DIV_ITER(853, 852)
+XNDIVP_MOD_ITER(854, 853)
+XNDIVP_DIV_ITER(854, 853)
+XNDIVP_MOD_ITER(855, 854)
+XNDIVP_DIV_ITER(855, 854)
+XNDIVP_MOD_ITER(856, 855)
+XNDIVP_DIV_ITER(856, 855)
+XNDIVP_MOD_ITER(857, 856)
+XNDIVP_DIV_ITER(857, 856)
+XNDIVP_MOD_ITER(858, 857)
+XNDIVP_DIV_ITER(858, 857)
+XNDIVP_MOD_ITER(859, 858)
+XNDIVP_DIV_ITER(859, 858)
+XNDIVP_MOD_ITER(860, 859)
+XNDIVP_DIV_ITER(860, 859)
+XNDIVP_MOD_ITER(861, 860)
+XNDIVP_DIV_ITER(861, 860)
+XNDIVP_MOD_ITER(862, 861)
+XNDIVP_DIV_ITER(862, 861)
+XNDIVP_MOD_ITER(863, 862)
+XNDIVP_DIV_ITER(863, 862)
+XNDIVP_MOD_ITER(864, 863)
+XNDIVP_DIV_ITER(864, 863)
+XNDIVP_MOD_ITER(865, 864)
+XNDIVP_DIV_ITER(865, 864)
+XNDIVP_MOD_ITER(866, 865)
+XNDIVP_DIV_ITER(866, 865)
+XNDIVP_MOD_ITER(867, 866)
+XNDIVP_DIV_ITER(867, 866)
+XNDIVP_MOD_ITER(868, 867)
+XNDIVP_DIV_ITER(868, 867)
+XNDIVP_MOD_ITER(869, 868)
+XNDIVP_DIV_ITER(869, 868)
+XNDIVP_MOD_ITER(870, 869)
+XNDIVP_DIV_ITER(870, 869)
+XNDIVP_MOD_ITER(871, 870)
+XNDIVP_DIV_ITER(871, 870)
+XNDIVP_MOD_ITER(872, 871)
+XNDIVP_DIV_ITER(872, 871)
+XNDIVP_MOD_ITER(873, 872)
+XNDIVP_DIV_ITER(873, 872)
+XNDIVP_MOD_ITER(874, 873)
+XNDIVP_DIV_ITER(874, 873)
+XNDIVP_MOD_ITER(875, 874)
+XNDIVP_DIV_ITER(875, 874)
+XNDIVP_MOD_ITER(876, 875)
+XNDIVP_DIV_ITER(876, 875)
+XNDIVP_MOD_ITER(877, 876)
+XNDIVP_DIV_ITER(877, 876)
+XNDIVP_MOD_ITER(878, 877)
+XNDIVP_DIV_ITER(878, 877)
+XNDIVP_MOD_ITER(879, 878)
+XNDIVP_DIV_ITER(879, 878)
+XNDIVP_MOD_ITER(880, 879)
+XNDIVP_DIV_ITER(880, 879)
+XNDIVP_MOD_ITER(881, 880)
+XNDIVP_DIV_ITER(881, 880)
+XNDIVP_MOD_ITER(882, 881)
+XNDIVP_DIV_ITER(882, 881)
+XNDIVP_MOD_ITER(883, 882)
+XNDIVP_DIV_ITER(883, 882)
+XNDIVP_MOD_ITER(884, 883)
+XNDIVP_DIV_ITER(884, 883)
+XNDIVP_MOD_ITER(885, 884)
+XNDIVP_DIV_ITER(885, 884)
+XNDIVP_MOD_ITER(886, 885)
+XNDIVP_DIV_ITER(886, 885)
+XNDIVP_MOD_ITER(887, 886)
+XNDIVP_DIV_ITER(887, 886)
+XNDIVP_MOD_ITER(888, 887)
+XNDIVP_DIV_ITER(888, 887)
+XNDIVP_MOD_ITER(889, 888)
+XNDIVP_DIV_ITER(889, 888)
+XNDIVP_MOD_ITER(890, 889)
+XNDIVP_DIV_ITER(890, 889)
+XNDIVP_MOD_ITER(891, 890)
+XNDIVP_DIV_ITER(891, 890)
+XNDIVP_MOD_ITER(892, 891)
+XNDIVP_DIV_ITER(892, 891)
+XNDIVP_MOD_ITER(893, 892)
+XNDIVP_DIV_ITER(893, 892)
+XNDIVP_MOD_ITER(894, 893)
+XNDIVP_DIV_ITER(894, 893)
+XNDIVP_MOD_ITER(895, 894)
+XNDIVP_DIV_ITER(895, 894)
+XNDIVP_MOD_ITER(896, 895)
+XNDIVP_DIV_ITER(896, 895)
+XNDIVP_MOD_ITER(897, 896)
+XNDIVP_DIV_ITER(897, 896)
+XNDIVP_MOD_ITER(898, 897)
+XNDIVP_DIV_ITER(898, 897)
+XNDIVP_MOD_ITER(899, 898)
+XNDIVP_DIV_ITER(899, 898)
+XNDIVP_MOD_ITER(900, 899)
+XNDIVP_DIV_ITER(900, 899)
+XNDIVP_MOD_ITER(901, 900)
+XNDIVP_DIV_ITER(901, 900)
+XNDIVP_MOD_ITER(902, 901)
+XNDIVP_DIV_ITER(902, 901)
+XNDIVP_MOD_ITER(903, 902)
+XNDIVP_DIV_ITER(903, 902)
+XNDIVP_MOD_ITER(904, 903)
+XNDIVP_DIV_ITER(904, 903)
+XNDIVP_MOD_ITER(905, 904)
+XNDIVP_DIV_ITER(905, 904)
+XNDIVP_MOD_ITER(906, 905)
+XNDIVP_DIV_ITER(906, 905)
+XNDIVP_MOD_ITER(907, 906)
+XNDIVP_DIV_ITER(907, 906)
+XNDIVP_MOD_ITER(908, 907)
+XNDIVP_DIV_ITER(908, 907)
+XNDIVP_MOD_ITER(909, 908)
+XNDIVP_DIV_ITER(909, 908)
+XNDIVP_MOD_ITER(910, 909)
+XNDIVP_DIV_ITER(910, 909)
+XNDIVP_MOD_ITER(911, 910)
+XNDIVP_DIV_ITER(911, 910)
+XNDIVP_MOD_ITER(912, 911)
+XNDIVP_DIV_ITER(912, 911)
+XNDIVP_MOD_ITER(913, 912)
+XNDIVP_DIV_ITER(913, 912)
+XNDIVP_MOD_ITER(914, 913)
+XNDIVP_DIV_ITER(914, 913)
+XNDIVP_MOD_ITER(915, 914)
+XNDIVP_DIV_ITER(915, 914)
+XNDIVP_MOD_ITER(916, 915)
+XNDIVP_DIV_ITER(916, 915)
+XNDIVP_MOD_ITER(917, 916)
+XNDIVP_DIV_ITER(917, 916)
+XNDIVP_MOD_ITER(918, 917)
+XNDIVP_DIV_ITER(918, 917)
+XNDIVP_MOD_ITER(919, 918)
+XNDIVP_DIV_ITER(919, 918)
+XNDIVP_MOD_ITER(920, 919)
+XNDIVP_DIV_ITER(920, 919)
+XNDIVP_MOD_ITER(921, 920)
+XNDIVP_DIV_ITER(921, 920)
+XNDIVP_MOD_ITER(922, 921)
+XNDIVP_DIV_ITER(922, 921)
+XNDIVP_MOD_ITER(923, 922)
+XNDIVP_DIV_ITER(923, 922)
+XNDIVP_MOD_ITER(924, 923)
+XNDIVP_DIV_ITER(924, 923)
+XNDIVP_MOD_ITER(925, 924)
+XNDIVP_DIV_ITER(925, 924)
+XNDIVP_MOD_ITER(926, 925)
+XNDIVP_DIV_ITER(926, 925)
+XNDIVP_MOD_ITER(927, 926)
+XNDIVP_DIV_ITER(927, 926)
+XNDIVP_MOD_ITER(928, 927)
+XNDIVP_DIV_ITER(928, 927)
+XNDIVP_MOD_ITER(929, 928)
+XNDIVP_DIV_ITER(929, 928)
+XNDIVP_MOD_ITER(930, 929)
+XNDIVP_DIV_ITER(930, 929)
+XNDIVP_MOD_ITER(931, 930)
+XNDIVP_DIV_ITER(931, 930)
+XNDIVP_MOD_ITER(932, 931)
+XNDIVP_DIV_ITER(932, 931)
+XNDIVP_MOD_ITER(933, 932)
+XNDIVP_DIV_ITER(933, 932)
+XNDIVP_MOD_ITER(934, 933)
+XNDIVP_DIV_ITER(934, 933)
+XNDIVP_MOD_ITER(935, 934)
+XNDIVP_DIV_ITER(935, 934)
+XNDIVP_MOD_ITER(936, 935)
+XNDIVP_DIV_ITER(936, 935)
+XNDIVP_MOD_ITER(937, 936)
+XNDIVP_DIV_ITER(937, 936)
+XNDIVP_MOD_ITER(938, 937)
+XNDIVP_DIV_ITER(938, 937)
+XNDIVP_MOD_ITER(939, 938)
+XNDIVP_DIV_ITER(939, 938)
+XNDIVP_MOD_ITER(940, 939)
+XNDIVP_DIV_ITER(940, 939)
+XNDIVP_MOD_ITER(941, 940)
+XNDIVP_DIV_ITER(941, 940)
+XNDIVP_MOD_ITER(942, 941)
+XNDIVP_DIV_ITER(942, 941)
+XNDIVP_MOD_ITER(943, 942)
+XNDIVP_DIV_ITER(943, 942)
+XNDIVP_MOD_ITER(944, 943)
+XNDIVP_DIV_ITER(944, 943)
+XNDIVP_MOD_ITER(945, 944)
+XNDIVP_DIV_ITER(945, 944)
+XNDIVP_MOD_ITER(946, 945)
+XNDIVP_DIV_ITER(946, 945)
+XNDIVP_MOD_ITER(947, 946)
+XNDIVP_DIV_ITER(947, 946)
+XNDIVP_MOD_ITER(948, 947)
+XNDIVP_DIV_ITER(948, 947)
+XNDIVP_MOD_ITER(949, 948)
+XNDIVP_DIV_ITER(949, 948)
+XNDIVP_MOD_ITER(950, 949)
+XNDIVP_DIV_ITER(950, 949)
+XNDIVP_MOD_ITER(951, 950)
+XNDIVP_DIV_ITER(951, 950)
+XNDIVP_MOD_ITER(952, 951)
+XNDIVP_DIV_ITER(952, 951)
+XNDIVP_MOD_ITER(953, 952)
+XNDIVP_DIV_ITER(953, 952)
+XNDIVP_MOD_ITER(954, 953)
+XNDIVP_DIV_ITER(954, 953)
+XNDIVP_MOD_ITER(955, 954)
+XNDIVP_DIV_ITER(955, 954)
+XNDIVP_MOD_ITER(956, 955)
+XNDIVP_DIV_ITER(956, 955)
+XNDIVP_MOD_ITER(957, 956)
+XNDIVP_DIV_ITER(957, 956)
+XNDIVP_MOD_ITER(958, 957)
+XNDIVP_DIV_ITER(958, 957)
+XNDIVP_MOD_ITER(959, 958)
+XNDIVP_DIV_ITER(959, 958)
+XNDIVP_MOD_ITER(960, 959)
+XNDIVP_DIV_ITER(960, 959)
+XNDIVP_MOD_ITER(961, 960)
+XNDIVP_DIV_ITER(961, 960)
+XNDIVP_MOD_ITER(962, 961)
+XNDIVP_DIV_ITER(962, 961)
+XNDIVP_MOD_ITER(963, 962)
+XNDIVP_DIV_ITER(963, 962)
+XNDIVP_MOD_ITER(964, 963)
+XNDIVP_DIV_ITER(964, 963)
+XNDIVP_MOD_ITER(965, 964)
+XNDIVP_DIV_ITER(965, 964)
+XNDIVP_MOD_ITER(966, 965)
+XNDIVP_DIV_ITER(966, 965)
+XNDIVP_MOD_ITER(967, 966)
+XNDIVP_DIV_ITER(967, 966)
+XNDIVP_MOD_ITER(968, 967)
+XNDIVP_DIV_ITER(968, 967)
+XNDIVP_MOD_ITER(969, 968)
+XNDIVP_DIV_ITER(969, 968)
+XNDIVP_MOD_ITER(970, 969)
+XNDIVP_DIV_ITER(970, 969)
+XNDIVP_MOD_ITER(971, 970)
+XNDIVP_DIV_ITER(971, 970)
+XNDIVP_MOD_ITER(972, 971)
+XNDIVP_DIV_ITER(972, 971)
+XNDIVP_MOD_ITER(973, 972)
+XNDIVP_DIV_ITER(973, 972)
+XNDIVP_MOD_ITER(974, 973)
+XNDIVP_DIV_ITER(974, 973)
+XNDIVP_MOD_ITER(975, 974)
+XNDIVP_DIV_ITER(975, 974)
+XNDIVP_MOD_ITER(976, 975)
+XNDIVP_DIV_ITER(976, 975)
+XNDIVP_MOD_ITER(977, 976)
+XNDIVP_DIV_ITER(977, 976)
+XNDIVP_MOD_ITER(978, 977)
+XNDIVP_DIV_ITER(978, 977)
+XNDIVP_MOD_ITER(979, 978)
+XNDIVP_DIV_ITER(979, 978)
+XNDIVP_MOD_ITER(980, 979)
+XNDIVP_DIV_ITER(980, 979)
+XNDIVP_MOD_ITER(981, 980)
+XNDIVP_DIV_ITER(981, 980)
+XNDIVP_MOD_ITER(982, 981)
+XNDIVP_DIV_ITER(982, 981)
+XNDIVP_MOD_ITER(983, 982)
+XNDIVP_DIV_ITER(983, 982)
+XNDIVP_MOD_ITER(984, 983)
+XNDIVP_DIV_ITER(984, 983)
+XNDIVP_MOD_ITER(985, 984)
+XNDIVP_DIV_ITER(985, 984)
+XNDIVP_MOD_ITER(986, 985)
+XNDIVP_DIV_ITER(986, 985)
+XNDIVP_MOD_ITER(987, 986)
+XNDIVP_DIV_ITER(987, 986)
+XNDIVP_MOD_ITER(988, 987)
+XNDIVP_DIV_ITER(988, 987)
+XNDIVP_MOD_ITER(989, 988)
+XNDIVP_DIV_ITER(989, 988)
+XNDIVP_MOD_ITER(990, 989)
+XNDIVP_DIV_ITER(990, 989)
+XNDIVP_MOD_ITER(991, 990)
+XNDIVP_DIV_ITER(991, 990)
+XNDIVP_MOD_ITER(992, 991)
+XNDIVP_DIV_ITER(992, 991)
+XNDIVP_MOD_ITER(993, 992)
+XNDIVP_DIV_ITER(993, 992)
+XNDIVP_MOD_ITER(994, 993)
+XNDIVP_DIV_ITER(994, 993)
+XNDIVP_MOD_ITER(995, 994)
+XNDIVP_DIV_ITER(995, 994)
+XNDIVP_MOD_ITER(996, 995)
+XNDIVP_DIV_ITER(996, 995)
+XNDIVP_MOD_ITER(997, 996)
+XNDIVP_DIV_ITER(997, 996)
+XNDIVP_MOD_ITER(998, 997)
+XNDIVP_DIV_ITER(998, 997)
+XNDIVP_MOD_ITER(999, 998)
+XNDIVP_DIV_ITER(999, 998)
+XNDIVP_MOD_ITER(1000, 999)
+XNDIVP_DIV_ITER(1000, 999)
+XNDIVP_MOD_ITER(1001, 1000)
+XNDIVP_DIV_ITER(1001, 1000)
+XNDIVP_MOD_ITER(1002, 1001)
+XNDIVP_DIV_ITER(1002, 1001)
+XNDIVP_MOD_ITER(1003, 1002)
+XNDIVP_DIV_ITER(1003, 1002)
+XNDIVP_MOD_ITER(1004, 1003)
+XNDIVP_DIV_ITER(1004, 1003)
+XNDIVP_MOD_ITER(1005, 1004)
+XNDIVP_DIV_ITER(1005, 1004)
+XNDIVP_MOD_ITER(1006, 1005)
+XNDIVP_DIV_ITER(1006, 1005)
+XNDIVP_MOD_ITER(1007, 1006)
+XNDIVP_DIV_ITER(1007, 1006)
+XNDIVP_MOD_ITER(1008, 1007)
+XNDIVP_DIV_ITER(1008, 1007)
+XNDIVP_MOD_ITER(1009, 1008)
+XNDIVP_DIV_ITER(1009, 1008)
+XNDIVP_MOD_ITER(1010, 1009)
+XNDIVP_DIV_ITER(1010, 1009)
+XNDIVP_MOD_ITER(1011, 1010)
+XNDIVP_DIV_ITER(1011, 1010)
+XNDIVP_MOD_ITER(1012, 1011)
+XNDIVP_DIV_ITER(1012, 1011)
+XNDIVP_MOD_ITER(1013, 1012)
+XNDIVP_DIV_ITER(1013, 1012)
+XNDIVP_MOD_ITER(1014, 1013)
+XNDIVP_DIV_ITER(1014, 1013)
+XNDIVP_MOD_ITER(1015, 1014)
+XNDIVP_DIV_ITER(1015, 1014)
+XNDIVP_MOD_ITER(1016, 1015)
+XNDIVP_DIV_ITER(1016, 1015)
+XNDIVP_MOD_ITER(1017, 1016)
+XNDIVP_DIV_ITER(1017, 1016)
+XNDIVP_MOD_ITER(1018, 1017)
+XNDIVP_DIV_ITER(1018, 1017)
+XNDIVP_MOD_ITER(1019, 1018)
+XNDIVP_DIV_ITER(1019, 1018)
+XNDIVP_MOD_ITER(1020, 1019)
+XNDIVP_DIV_ITER(1020, 1019)
+XNDIVP_MOD_ITER(1021, 1020)
+XNDIVP_DIV_ITER(1021, 1020)
+XNDIVP_MOD_ITER(1022, 1021)
+XNDIVP_DIV_ITER(1022, 1021)
+XNDIVP_MOD_ITER(1023, 1022)
+XNDIVP_DIV_ITER(1023, 1022)
+XNDIVP_MOD_ITER(1024, 1023)
+XNDIVP_DIV_ITER(1024, 1023)
--- a/crc32x86.c	Mon Feb 09 01:21:00 2026 -0500
+++ b/crc32x86.c	Mon Feb 09 21:30:30 2026 -0500
@@ -2,6 +2,16 @@
 
 #ifdef __x86_64__
 
+/* NOTE: None of this is really x86-specific.
+ * There are probably many other architectures with
+ * a native 64x64->128 carryless multiply.
+ *
+ * We could adapt this to use GCC's unsigned __int128
+ * instead of the x86 intrinsics, but it would likely
+ * slow things down a bit. */
+
+#define VPCLMULQDQ_TARGET __attribute__((__target__("vpclmulqdq")))
+
 #include "crc32.h"
 #include "crc32i.h"
 #include <stdio.h>
@@ -132,7 +142,7 @@
 {
 	unsigned i;
 
-	for (i = 1; i <= (4*128+32); i++) {
+	for (i = 1; i <= 1024; i++) {
 		printf("XNDIVP_MOD_ITER(%u, %u)\n", i, i - 1);
 		printf("XNDIVP_DIV_ITER(%u, %u)\n", i, i - 1);
 	}
@@ -155,44 +165,135 @@
 #define FIXUPCONSTANTS(x) (BITREVERSE64(x) >> 31)
 	RK01 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
 	RK02 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_128),
+	RK03 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_960),
+	RK04 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_1024),
 	RK05 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_64),
 	RK06 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_32),
 	RK07 = FIXUPCONSTANTS(XNDIVP_DIV_ITER_32),
 	RK08 = XNDIVP_RK08R,
+	RK09 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_832),
+	RK10 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_896),
+	RK11 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_704),
+	RK12 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_768),
+	RK13 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_576),
+	RK14 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_640),
+	RK15 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_448),
+	RK16 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_512),
+	RK17 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_320),
+	RK18 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_384),
+	RK19 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_192),
+	RK20 = FIXUPCONSTANTS(XNDIVP_MOD_ITER_256),
 #undef FIXUPCONSTANTS
 };
 
-__attribute__((__target__("vpclmulqdq")))
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+uint32_t crc32x86_barrett_reduction(__m128i msgxmm)
+{
+	static const CRC32_ALIGN(16) uint64_t rk05[2] = {RK05, RK06},
+			rk07[2] = {RK07, RK08},
+			mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
+	__m128i rk;
+
+	rk = _mm_load_si128((__m128i *)rk05);
+
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
+
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
+
+	/* Barrett Reduction */
+	rk = _mm_load_si128((__m128i *)rk07);
+	msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
+
+	return _mm_extract_epi32(msgxmm, 2);
+}
+
+VPCLMULQDQ_TARGET
+CRC32_FORCEINLINE
+__m128i crc32x86_fold(__m128i xmm, __m128i rk, __m128i next)
+{
+	return _mm_xor_si128(next, _mm_xor_si128(_mm_clmulepi64_si128(xmm, rk, 0x01), _mm_clmulepi64_si128(xmm, rk, 0x10)));
+}
+
+/* GCC-specific code */
+VPCLMULQDQ_TARGET
 uint32_t crc32x86_vpclmulqdq_r(uint32_t crc, const unsigned char *msg, size_t sz)
 {
+	static const CRC32_ALIGN(16) uint64_t rk01[2] = {RK01, RK02},
+			rk03[2] = {RK03, RK04},
+			rk09[2] = {RK09, RK10},
+			rk11[2] = {RK11, RK12},
+			rk13[2] = {RK13, RK14},
+			rk15[2] = {RK15, RK16},
+			rk17[2] = {RK17, RK18},
+			rk19[2] = {RK19, RK20};
+	__m128i msgxmm;
+
+	if (sz >= 256) {
+		__m128i rk, msgxmma[8], xmm8;
+
+		/* Load the first 128 bytes */
+		msgxmma[0] = _mm_load_si128((__m128i *)msg + 0);
+		msgxmma[1] = _mm_load_si128((__m128i *)msg + 1);
+		msgxmma[2] = _mm_load_si128((__m128i *)msg + 2);
+		msgxmma[3] = _mm_load_si128((__m128i *)msg + 3);
+		msgxmma[4] = _mm_load_si128((__m128i *)msg + 4);
+		msgxmma[5] = _mm_load_si128((__m128i *)msg + 5);
+		msgxmma[6] = _mm_load_si128((__m128i *)msg + 6);
+		msgxmma[7] = _mm_load_si128((__m128i *)msg + 7);
+		msg += 128;
+		sz -= 128;
+
+		/* XOR the initial CRC */
+		msgxmma[0] = _mm_xor_si128(msgxmma[0], _mm_cvtsi32_si128(crc));
+
+		rk = _mm_load_si128((__m128i *)rk03);
+
+		for (; sz >= 128; msg += 128, sz -= 128) {
+			/* loop unrolled */
+			msgxmma[0] = crc32x86_fold(msgxmma[0], rk, _mm_load_si128((__m128i *)msg + 0));
+			msgxmma[1] = crc32x86_fold(msgxmma[1], rk, _mm_load_si128((__m128i *)msg + 1));
+			msgxmma[2] = crc32x86_fold(msgxmma[2], rk, _mm_load_si128((__m128i *)msg + 2));
+			msgxmma[3] = crc32x86_fold(msgxmma[3], rk, _mm_load_si128((__m128i *)msg + 3));
+			msgxmma[4] = crc32x86_fold(msgxmma[4], rk, _mm_load_si128((__m128i *)msg + 4));
+			msgxmma[5] = crc32x86_fold(msgxmma[5], rk, _mm_load_si128((__m128i *)msg + 5));
+			msgxmma[6] = crc32x86_fold(msgxmma[6], rk, _mm_load_si128((__m128i *)msg + 6));
+			msgxmma[7] = crc32x86_fold(msgxmma[7], rk, _mm_load_si128((__m128i *)msg + 7));
+		}
+
+		/* Fold it all into one xmm register */
+		msgxmm = msgxmma[7];
+
+		msgxmm = crc32x86_fold(msgxmma[0], _mm_load_si128((__m128i *)rk09), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[1], _mm_load_si128((__m128i *)rk11), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[2], _mm_load_si128((__m128i *)rk13), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[3], _mm_load_si128((__m128i *)rk15), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[4], _mm_load_si128((__m128i *)rk17), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[5], _mm_load_si128((__m128i *)rk19), msgxmm);
+		msgxmm = crc32x86_fold(msgxmma[6], _mm_load_si128((__m128i *)rk01), msgxmm);
+
+		/* Jump across into the 16-byte code, skipping the loading.
+		 * This is much simpler than either doing two Barrett reductions
+		 * or adding a whole ton of branches... */
+		goto jmpFrom128byte;
+	}
+
 	/* This also works for 16-byte buffers, but whether it's actually
 	 * useful or faster is another question entirely */
 	if (sz >= 32) {
-		static const __attribute__((__aligned__(16))) uint64_t rk01[2] = {RK01, RK02},
-				rk05[2] = {RK05, RK06},
-				rk07[2] = {RK07, RK08},
-				mask2[2] = {0xFFFFFFFF00000000, 0xFFFFFFFFFFFFFFFF};
-		__m128i rk, msgxmm;
+		__m128i rk;
 
 		msgxmm = _mm_xor_si128(_mm_load_si128((__m128i *)msg), _mm_cvtsi32_si128(crc));
+		msg += 16;
+		sz -= 16;
 
+jmpFrom128byte:
 		rk = _mm_load_si128((__m128i *)rk01);
 
-		for (msg += 16, sz -= 16; sz >= 16; msg += 16, sz -= 16) {
-			msgxmm = _mm_xor_si128(_mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x10), _mm_clmulepi64_si128(msgxmm, rk, 0x01)), _mm_load_si128((__m128i *)msg));
-		}
-
-		rk = _mm_load_si128((__m128i *)rk05);
-
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), _mm_srli_si128(msgxmm, 8));
+		for (; sz >= 16; msg += 16, sz -= 16)
+			msgxmm = crc32x86_fold(msgxmm, rk, _mm_load_si128((__m128i *)msg));
 
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_slli_si128(msgxmm, 12), rk, 0x11), _mm_and_si128(msgxmm, _mm_load_si128((__m128i *)mask2)));
-
-		/* Barrett Reduction */
-		rk = _mm_load_si128((__m128i *)rk07);
-		msgxmm = _mm_xor_si128(_mm_clmulepi64_si128(_mm_clmulepi64_si128(msgxmm, rk, 0x00), rk, 0x10), msgxmm);
-
-		crc = _mm_extract_epi32(msgxmm, 2);
+		crc = crc32x86_barrett_reduction(msgxmm);
 	}
 
 	if (!sz) return crc;
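
As the note at the top of crc32x86.c says, little of this is inherently
x86-specific: the primitive everything rests on is a 64x64->128 carryless
multiply, which _mm_clmulepi64_si128 provides in hardware. A portable scalar
model of that primitive, for reference only (not a drop-in replacement):

	#include <stdint.h>

	/* carryless (GF(2)) multiply: 64x64 -> 128 */
	static void clmul64(uint64_t a, uint64_t b, uint64_t *lo, uint64_t *hi)
	{
		uint64_t l = 0, h = 0;
		int i;

		for (i = 0; i < 64; i++) {
			if ((b >> i) & 1) {
				l ^= a << i;
				if (i)
					h ^= a >> (64 - i);
			}
		}

		*lo = l;
		*hi = h;
	}

crc32x86_fold is then just two such multiplies: each 64-bit half of the running
remainder is multiplied by a precomputed (x^n mod P) constant and the results
are XORed together with the next 16-byte block. The new RK09..RK20 pairs apply
that same fold at the successively shorter distances (832/896 down through
192/256 bits) needed to merge the eight parallel lanes back into one register
before the final Barrett reduction.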