[git commit] factor: we can pack 21, not 20, 3-bit elements into packed wheel words

Denys Vlasenko vda.linux at googlemail.com
Sun Apr 23 12:02:20 UTC 2023


commit: https://git.busybox.net/busybox/commit/?id=382e1634972f7aab883259dd22cf721d1c205055
branch: https://git.busybox.net/busybox/commit/?id=refs/heads/master

function                                             old     new   delta
packed_wheel                                         192     184      -8
factor_main                                          171     163      -8
------------------------------------------------------------------------------
(add/remove: 0/0 grow/shrink: 0/2 up/down: 0/-16)             Total: -16 bytes

Signed-off-by: Denys Vlasenko <vda.linux at googlemail.com>
---
 coreutils/factor.c | 71 +++++++++++++++++++++++++++---------------------------
 1 file changed, 35 insertions(+), 36 deletions(-)

diff --git a/coreutils/factor.c b/coreutils/factor.c
index de8ea4e11..46dd2a0d7 100644
--- a/coreutils/factor.c
+++ b/coreutils/factor.c
@@ -48,38 +48,40 @@ typedef unsigned long half_t;
  * Larger wheels improve sieving only slightly, but quickly grow in size
  * (adding just one prime, 13, results in 5766 element sieve).
  */
-#define R(a,b,c,d,e,f,g,h,i,j,A,B,C,D,E,F,G,H,I,J) \
-	(((uint64_t)(a<<0) | (b<<3) | (c<<6) | (d<<9) | (e<<12) | (f<<15) | (g<<18) | (h<<21) | (i<<24) | (j<<27)) << 1) | \
-	(((uint64_t)(A<<0) | (B<<3) | (C<<6) | (D<<9) | (E<<12) | (F<<15) | (G<<18) | (H<<21) | (I<<24) | (J<<27)) << 31)
-#define P(a,b,c,d,e,f,g,h,i,j,A,B,C,D,E,F,G,H,I,J) \
+#define R(a,b,c,d,e,f,g,h,i,j,A,B,C,D,E,F,G,H,I,J,x) \
+	(((uint64_t)(a<<0) | (b<<3) | (c<<6) | (d<<9) | (e<<12) | (f<<15) | (g<<18) | (h<<21) | (i<<24) | (j<<27)) << 1)  | \
+	(((uint64_t)(A<<0) | (B<<3) | (C<<6) | (D<<9) | (E<<12) | (F<<15) | (G<<18) | (H<<21) | (I<<24) | (J<<27)) << 31) | \
+	((uint64_t)x << 61)
+#define P(a,b,c,d,e,f,g,h,i,j,A,B,C,D,E,F,G,H,I,J,x) \
 	R(	(a/2),(b/2),(c/2),(d/2),(e/2),(f/2),(g/2),(h/2),(i/2),(j/2), \
-		(A/2),(B/2),(C/2),(D/2),(E/2),(F/2),(G/2),(H/2),(I/2),(J/2)  )
+		(A/2),(B/2),(C/2),(D/2),(E/2),(F/2),(G/2),(H/2),(I/2),(J/2), \
+		(x/2) \
+	)
 static const uint64_t packed_wheel[] = {
-	/*1, 2, 2, 4, 2,*/
-	P( 4, 2, 4, 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 4, 2, 6, 4, 6, 8, 4), //01
-	P( 2, 4, 2, 4,14, 4, 6, 2,10, 2, 6, 6, 4, 2, 4, 6, 2,10, 2, 4), //02
-	P( 2,12,10, 2, 4, 2, 4, 6, 2, 6, 4, 6, 6, 6, 2, 6, 4, 2, 6, 4), //03
-	P( 6, 8, 4, 2, 4, 6, 8, 6,10, 2, 4, 6, 2, 6, 6, 4, 2, 4, 6, 2), //04
-	P( 6, 4, 2, 6,10, 2,10, 2, 4, 2, 4, 6, 8, 4, 2, 4,12, 2, 6, 4), //05
-	P( 2, 6, 4, 6,12, 2, 4, 2, 4, 8, 6, 4, 6, 2, 4, 6, 2, 6,10, 2), //06
-	P( 4, 6, 2, 6, 4, 2, 4, 2,10, 2,10, 2, 4, 6, 6, 2, 6, 6, 4, 6), //07
-	P( 6, 2, 6, 4, 2, 6, 4, 6, 8, 4, 2, 6, 4, 8, 6, 4, 6, 2, 4, 6), //08
-	P( 8, 6, 4, 2,10, 2, 6, 4, 2, 4, 2,10, 2,10, 2, 4, 2, 4, 8, 6), //09
-	P( 4, 2, 4, 6, 6, 2, 6, 4, 8, 4, 6, 8, 4, 2, 4, 2, 4, 8, 6, 4), //10
-	P( 6, 6, 6, 2, 6, 6, 4, 2, 4, 6, 2, 6, 4, 2, 4, 2,10, 2,10, 2), //11
-	P( 6, 4, 6, 2, 6, 4, 2, 4, 6, 6, 8, 4, 2, 6,10, 8, 4, 2, 4, 2), //12
-	P( 4, 8,10, 6, 2, 4, 8, 6, 6, 4, 2, 4, 6, 2, 6, 4, 6, 2,10, 2), //13
-	P(10, 2, 4, 2, 4, 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 6, 6, 4, 6, 8), //14
-	P( 4, 2, 4, 2, 4, 8, 6, 4, 8, 4, 6, 2, 6, 6, 4, 2, 4, 6, 8, 4), //15
-	P( 2, 4, 2,10, 2,10, 2, 4, 2, 4, 6, 2,10, 2, 4, 6, 8, 6, 4, 2), //16
-	P( 6, 4, 6, 8, 4, 6, 2, 4, 8, 6, 4, 6, 2, 4, 6, 2, 6, 6, 4, 6), //17
-	P( 6, 2, 6, 6, 4, 2,10, 2,10, 2, 4, 2, 4, 6, 2, 6, 4, 2,10, 6), //18
-	P( 2, 6, 4, 2, 6, 4, 6, 8, 4, 2, 4, 2,12, 6, 4, 6, 2, 4, 6, 2), //19
-	P(12, 4, 2, 4, 8, 6, 4, 2, 4, 2,10, 2,10, 6, 2, 4, 6, 2, 6, 4), //20
-	P( 2, 4, 6, 6, 2, 6, 4, 2,10, 6, 8, 6, 4, 2, 4, 8, 6, 4, 6, 2), //21
-	P( 4, 6, 2, 6, 6, 6, 4, 6, 2, 6, 4, 2, 4, 2,10,12, 2, 4, 2,10), //22
-	P( 2, 6, 4, 2, 4, 6, 6, 2,10, 2, 6, 4,14, 4, 2, 4, 2, 4, 8, 6), //23
-	P( 4, 6, 2, 4, 6, 2, 6, 6, 4, 2, 4, 6, 2, 6, 4, 2, 4,12, 2,12), //24
+	/* 1, 2, */
+	P( 2, 4, 2, 4, 2, 4, 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 4, 2, 6, 4, 6),
+	P( 8, 4, 2, 4, 2, 4,14, 4, 6, 2,10, 2, 6, 6, 4, 2, 4, 6, 2,10, 2),
+	P( 4, 2,12,10, 2, 4, 2, 4, 6, 2, 6, 4, 6, 6, 6, 2, 6, 4, 2, 6, 4),
+	P( 6, 8, 4, 2, 4, 6, 8, 6,10, 2, 4, 6, 2, 6, 6, 4, 2, 4, 6, 2, 6),
+	P( 4, 2, 6,10, 2,10, 2, 4, 2, 4, 6, 8, 4, 2, 4,12, 2, 6, 4, 2, 6),
+	P( 4, 6,12, 2, 4, 2, 4, 8, 6, 4, 6, 2, 4, 6, 2, 6,10, 2, 4, 6, 2),
+	P( 6, 4, 2, 4, 2,10, 2,10, 2, 4, 6, 6, 2, 6, 6, 4, 6, 6, 2, 6, 4),
+	P( 2, 6, 4, 6, 8, 4, 2, 6, 4, 8, 6, 4, 6, 2, 4, 6, 8, 6, 4, 2,10),
+	P( 2, 6, 4, 2, 4, 2,10, 2,10, 2, 4, 2, 4, 8, 6, 4, 2, 4, 6, 6, 2),
+	P( 6, 4, 8, 4, 6, 8, 4, 2, 4, 2, 4, 8, 6, 4, 6, 6, 6, 2, 6, 6, 4),
+	P( 2, 4, 6, 2, 6, 4, 2, 4, 2,10, 2,10, 2, 6, 4, 6, 2, 6, 4, 2, 4),
+	P( 6, 6, 8, 4, 2, 6,10, 8, 4, 2, 4, 2, 4, 8,10, 6, 2, 4, 8, 6, 6),
+	P( 4, 2, 4, 6, 2, 6, 4, 6, 2,10, 2,10, 2, 4, 2, 4, 6, 2, 6, 4, 2),
+	P( 4, 6, 6, 2, 6, 6, 6, 4, 6, 8, 4, 2, 4, 2, 4, 8, 6, 4, 8, 4, 6),
+	P( 2, 6, 6, 4, 2, 4, 6, 8, 4, 2, 4, 2,10, 2,10, 2, 4, 2, 4, 6, 2),
+	P(10, 2, 4, 6, 8, 6, 4, 2, 6, 4, 6, 8, 4, 6, 2, 4, 8, 6, 4, 6, 2),
+	P( 4, 6, 2, 6, 6, 4, 6, 6, 2, 6, 6, 4, 2,10, 2,10, 2, 4, 2, 4, 6),
+	P( 2, 6, 4, 2,10, 6, 2, 6, 4, 2, 6, 4, 6, 8, 4, 2, 4, 2,12, 6, 4),
+	P( 6, 2, 4, 6, 2,12, 4, 2, 4, 8, 6, 4, 2, 4, 2,10, 2,10, 6, 2, 4),
+	P( 6, 2, 6, 4, 2, 4, 6, 6, 2, 6, 4, 2,10, 6, 8, 6, 4, 2, 4, 8, 6),
+	P( 4, 6, 2, 4, 6, 2, 6, 6, 6, 4, 6, 2, 6, 4, 2, 4, 2,10,12, 2, 4),
+	P( 2,10, 2, 6, 4, 2, 4, 6, 6, 2,10, 2, 6, 4,14, 4, 2, 4, 2, 4, 8),
+	P( 6, 4, 6, 2, 4, 6, 2, 6, 6, 4, 2, 4, 6, 2, 6, 4, 2, 4,12, 2,12),
 };
 #undef P
 #undef R
@@ -93,8 +95,8 @@ static const uint64_t packed_wheel[] = {
  *	function                old     new   delta
  *	wheel_tab                 -     485    +485
  * 3-bit-packed insanity:
- *	packed_wheel              -     192    +192
- *	factor_main             108     171     +63
+ *	packed_wheel              -     184    +184
+ *	factor_main             108     163     +55
  */
 static void unpack_wheel(void)
 {
@@ -104,10 +106,7 @@ static void unpack_wheel(void)
 	setup_common_bufsiz();
 	wheel_tab[0] = 1;
 	wheel_tab[1] = 2;
-	wheel_tab[2] = 2;
-	wheel_tab[3] = 4;
-	wheel_tab[4] = 2;
-	p = &wheel_tab[5];
+	p = &wheel_tab[2];
 	for (i = 0; i < ARRAY_SIZE(packed_wheel); i++) {
 		uint64_t v = packed_wheel[i];
 		while ((v & 0xe) != 0) {


More information about the busybox-cvs mailing list