Speedup size hash for 17to128 by using independent accumulators.
Faster (better IPC) to make the computation slightly more independent.

Results are the geometric mean of N = 6 runs.

Machine:
```
11th Gen Intel(R) Core(TM) i7-11850H @ 2.50GHz
```

CC:
```
gcc (Ubuntu 11.3.0-1ubuntu1~22.04) 11.3.0
```

Bench Command:
```
// Change is only for 17-128 range.
$> ./benchHash xxh3 --mins=1 --maxs=256 --minl=0 --maxl=0
```

Aggregated results for the [17, 128] size range, reported as the geometric mean of all speedups:

Latency, small inputs of fixed size:
    - 1.073
Throughput, small inputs of fixed size (from 1 to 256 bytes):
    - 1.173
Latency, small inputs of random size [1-N]:
    - 1.051
Throughput, small inputs of random size [1-N]:
    - 1.134

So roughly a 5-17% improvement.
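
As a rough illustration of the "better IPC through independence" reasoning (a generic sketch of the accumulator-splitting idea, not the xxhash code itself; the real change is in the diff below): with a single accumulator every addition waits on the previous one, while two independent accumulators split the dependency chain and are merged only once at the end, as the patch does with XXH3_avalanche(acc + acc_end).

```c
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* One accumulator: every add depends on the previous add, forming a single
 * long dependency chain. */
static uint64_t sum_one_acc(const uint64_t* v, size_t n)
{
    uint64_t acc = 0;
    for (size_t i = 0; i < n; i++)
        acc += v[i];
    return acc;
}

/* Two accumulators: the chains are independent, so an out-of-order core can
 * execute adds from both chains in the same cycle; the chains are merged
 * only once at the end. */
static uint64_t sum_two_acc(const uint64_t* v, size_t n)
{
    uint64_t acc = 0, acc_end = 0;
    for (size_t i = 0; i + 1 < n; i += 2) {
        acc     += v[i];
        acc_end += v[i + 1];
    }
    if (n & 1)
        acc += v[n - 1];
    return acc + acc_end;
}

int main(void)
{
    const uint64_t v[6] = {1, 2, 3, 4, 5, 6};
    printf("%llu %llu\n",
           (unsigned long long)sum_one_acc(v, 6),
           (unsigned long long)sum_two_acc(v, 6));
    return 0;
}
```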
goldsteinn committed Nov 9, 2022
1 parent b2929c4 commit 7221be4
20 changes: 11 additions & 9 deletions xxhash.h
```diff
@@ -3960,31 +3960,33 @@ XXH3_len_17to128_64b(const xxh_u8* XXH_RESTRICT input, size_t len,
     XXH_ASSERT(secretSize >= XXH3_SECRET_SIZE_MIN); (void)secretSize;
     XXH_ASSERT(16 < len && len <= 128);
 
-    { xxh_u64 acc = len * XXH_PRIME64_1;
+    { xxh_u64 acc = len * XXH_PRIME64_1, acc_end;
 #if XXH_SIZE_OPT >= 1
         /* Smaller and cleaner, but slightly slower. */
         unsigned int i = (unsigned int)(len - 1) / 32;
         do {
             acc += XXH3_mix16B(input+16 * i, secret+32*i, seed);
             acc += XXH3_mix16B(input+len-16*(i+1), secret+32*i+16, seed);
         } while (i-- != 0);
+        acc_end = 0;
 #else
+        acc += XXH3_mix16B(input+0, secret+0, seed);
+        acc_end = XXH3_mix16B(input+len-16, secret+16, seed);
         if (len > 32) {
+            acc += XXH3_mix16B(input+16, secret+32, seed);
+            acc_end += XXH3_mix16B(input+len-32, secret+48, seed);
             if (len > 64) {
+                acc += XXH3_mix16B(input+32, secret+64, seed);
+                acc_end += XXH3_mix16B(input+len-48, secret+80, seed);
+
                 if (len > 96) {
                     acc += XXH3_mix16B(input+48, secret+96, seed);
-                    acc += XXH3_mix16B(input+len-64, secret+112, seed);
+                    acc_end += XXH3_mix16B(input+len-64, secret+112, seed);
                 }
-                acc += XXH3_mix16B(input+32, secret+64, seed);
-                acc += XXH3_mix16B(input+len-48, secret+80, seed);
             }
-            acc += XXH3_mix16B(input+16, secret+32, seed);
-            acc += XXH3_mix16B(input+len-32, secret+48, seed);
         }
-        acc += XXH3_mix16B(input+0, secret+0, seed);
-        acc += XXH3_mix16B(input+len-16, secret+16, seed);
 #endif
-        return XXH3_avalanche(acc);
+        return XXH3_avalanche(acc + acc_end);
     }
 }
```

15 comments on commit 7221be4

@Cyan4973
Owner

It could be that this strategy is better for more modern CPUs, such as Tiger Lake, and more recent compilers, such as gcc-11. On Skylake CPUs, using an older gcc-9, this strategy seems less advantageous.

Surprisingly, I would have expected a relatively modern architecture like the M1 Pro to benefit from this more parallel design, but I don't see any benefit either. Could it be that the compiler already figured it out?

@goldsteinn
Contributor Author

> It could be that this strategy is better for more modern CPUs, such as Tiger Lake, and more recent compilers, such as gcc-11. On Skylake CPUs, using an older gcc-9, this strategy seems less advantageous.

I would guess it's more CPU related. Looking at the disassembly, gcc-9 -march=skylake -O3 vs. gcc-11 -march=tigerlake -O3 produces very similar results.

Neither gcc-9 nor gcc-11 uses two accumulators before this change, and both use two accumulators after this change, although both gcc-9 and gcc-11 compute the 2x mix16B in parallel.

There is a serialization point between each branch, however, whereas the new code doesn't have that.

The biggest difference in the ASM is the BB structure. The ASM basically follows the C code, so the new code has a lot more unconditional execution and fewer jumps.

I would be curious what the following perf counters look like on SKL: branch-misses, lsd.uops, idq.dsb_uops, dsb2mite_switches.count, as it may be that the new code layout/branch structure causes some catastrophic FE behavior.
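
(A sketch of how those counters could be collected with Linux perf; event availability and exact spelling vary by CPU and kernel:)

```
$> perf stat -e branch-misses,lsd.uops,idq.dsb_uops,dsb2mite_switches.count \
       ./benchHash xxh3 --mins=1 --maxs=256 --minl=0 --maxl=0
```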

> Surprisingly, I would have expected a relatively modern architecture like the M1 Pro to benefit from this more parallel design, but I don't see any benefit either. Could it be that the compiler already figured it out?

I will look at that later, although I'm not really familiar enough with the M1 to know for sure. What clang version were you using? And was it Apple's version (which has HW tuning) or Asahi Linux?

@goldsteinn
Contributor Author
Commented on 7221be4, Jul 16, 2023

> It could be that this strategy is better for more modern CPUs, such as Tiger Lake, and more recent compilers, such as gcc-11. On Skylake CPUs, using an older gcc-9, this strategy seems less advantageous.
>
> Surprisingly, I would have expected a relatively modern architecture like the M1 Pro to benefit from this more parallel design, but I don't see any benefit either. Could it be that the compiler already figured it out?

Are you seeing a regression, or just no difference?

@easyaspi314
Contributor
Commented on 7221be4, Jul 16, 2023

> Surprisingly, I would have expected a relatively modern architecture like the M1 Pro to benefit from this more parallel design, but I don't see any benefit either.

As I mentioned in the other issue, AArch64 is always going to be bottlenecked by the multiplies.

x64 has mulq, which is usually really fast (3 cycles on recent AMD/Intel P-cores, 6 on Intel E-cores) and calculates the full 128-bit result. (And the mov to deal with RDX:RAX is usually zero-cycle register renaming.)

| Microarch | Name | Type | Latency | Throughput |
| --- | --- | --- | --- | --- |
| Prescott | Pentium 4 x64 | Garbage | 11 | ? |
| Merom/Wolfdale | Core 2 | General | 7 | 1/4 |
| Nehalem | Gen1 Core | General | 3 | 1/2 |
| Sandy Bridge and later | Gen2+ Core | General | 3 | 1 |
| Atom | Atom x64 | Efficiency | 14 | 1/14 |
| Silvermont | Atom Bay Trail | Efficiency | 7 | 1/7 |
| Goldmont and later | Lake E-cores | Efficiency | 6 | 1/2 |
| Xeon Phi | Xeon Phi AVX512 | HPC | 8 | ? |
| K8 | Opteron | General | 4-5 | 1/2 |
| Bulldozer | FX-6xxx | General | 6 | 1/4 |
| Excavator | FX-9xxx | General | 5 | 1/4 |
| Bobcat | E350 | Efficiency | 6-7 | ? |
| Jaguar | A4-5000, PS4/XB1 | Efficiency | 6 | 1/5 |
| Zen | Ryzen 1xxx | All | 3 | 1/2 |
| Zen 2 and later | Ryzen 3xxx | All | 3 | 1 |

AArch64 needs to compute the low and high halves with separate multiplies (mul and umulh), and currently no processors I know of can fuse them. Therefore, more time is spent on latency cycles and stalls, since it has to wait for two multiplies, and compilers like to put the low and high halves together in case of future fusing.

| Processor | Internal multiplier | Performance target | Latency mul | Throughput mul | Latency umulh | Throughput umulh | Notes |
| --- | --- | --- | --- | --- | --- | --- | --- |
| Cortex-A55 | 32-bit | Efficiency | 4-5 | 1/3-1/2 | 6 | 1/2 | in-order |
| Cortex-A57/A72 | 32-bit | Midrange | 5 | 1/3 | 6 | 1/4 | umulh 3 cycle stall |
| Cortex-A76 | 32-bit | Midrange | 4 | 1/3 | 5 | 1/4 | umulh 2 cycle stall |
| Cortex-X1 | 64-bit | Performance | 2 | 2 | 3 | 2 | |
| Apple M1 Fire, Ice | 64-bit | Performance, Midrange | 3 | 2 | 3 | 2 | |
| Fujitsu A64FX | 64-bit | Performance | 5 | 1? | 5 | 1? | documents pipelines used instead of throughput |
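
For reference, a minimal, generic sketch of the 64x64 -> 128-bit multiply-and-fold these tables are measuring, assuming a 64-bit GCC/Clang target with __uint128_t (an illustration, not the exact xxhash helper). On x86-64 the 128-bit product typically lowers to a single mulq with the result in RDX:RAX; on AArch64 it compiles to a mul for the low half plus a umulh for the high half.

```c
#include <stdint.h>
#include <stdio.h>

/* 64x64 -> 128-bit multiply, folded back to 64 bits by XORing the high and
 * low halves -- the same shape of operation XXH3's mixing step is built on.
 * x86-64: one mulq (plus cheap register moves); AArch64: a mul + umulh pair. */
static uint64_t mul128_fold64(uint64_t lhs, uint64_t rhs)
{
    __uint128_t product = (__uint128_t)lhs * (__uint128_t)rhs;
    return (uint64_t)product ^ (uint64_t)(product >> 64);
}

int main(void)
{
    /* Arbitrary demo operands. */
    printf("%016llx\n",
           (unsigned long long)mul128_fold64(0x0123456789ABCDEFULL,
                                             0xFEDCBA9876543210ULL));
    return 0;
}
```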

@Cyan4973
Owner

> Are you seeing a regression, or just no difference?

Just no difference

@goldsteinn
Contributor Author

> Are you seeing a regression, or just no difference?
>
> Just no difference

I see.

Looking at the disassembly, however, I'd say the second acc_end variable isn't really needed and probably does more to interfere with register allocation than anything else.

I might post a patch for it soon.

@Cyan4973
Owner
Commented on 7221be4, Jul 16, 2023

From follow-up experiments, it looks like the majority of the speed impact can be attributed to the changed input access pattern implemented within the nested if() { ... }.

Note that 2 accumulators vs. 1 can also contribute to the speed impact, but rather modestly by comparison.

In contrast, the change of input read order can result in wild swings, from sometimes no impact at all to significant ones, depending on the exact compiler, version, and flags.

Note that I also find that the new if() nest order, implemented in this commit, feels more "natural" to read, since it mimics the behavior of the simpler XXH_SIZE_OPT variant, which simply reads more data with each loop iteration. So readability is rather improved. But the impact on performance is non-trivial, unfortunately.

For the time being, it's essentially an observation. I don't have a formal explanation yet. I suspect a relation with input read pointer predictability. Maybe analyzing HW counters would help produce a more accurate diagnosis.

It also gives an obvious solution: just revert the input access pattern to what it was before this commit. This would restore most of the performance as it was.

@goldsteinn
Contributor Author
Commented on 7221be4, Jul 17, 2023

> From follow-up experiments, it looks like the majority of the speed impact can be attributed to the changed input access pattern implemented within the nested if() { ... }.

Proper regressions?

Random, latency, or throughput benchmarks?

Unfortunately I have no SKL at hand to reproduce on.

> Note that 2 accumulators vs. 1 can also contribute to the speed impact, but rather modestly by comparison.
>
> In contrast, the change of input read order can result in wild swings, from sometimes no impact at all to significant ones, depending on the exact compiler, version, and flags.
>
> Note that I also find that the new if() nest order, implemented in this commit, feels more "natural" to read, since it mimics the behavior of the simpler XXH_SIZE_OPT variant, which simply reads more data with each loop iteration. So readability is rather improved. But the impact on performance is non-trivial, unfortunately.
>
> For the time being, it's essentially an observation. I don't have a formal explanation yet. I suspect a relation with input read pointer predictability. Maybe analyzing HW counters would help produce a more accurate diagnosis.

I would bet it's not about the ptr access pattern, but the FE counters. On TGL there is no regression for things like lsd, dsb, mite, branch-misses, etc., but that may not be true for SKL. At that point, however, the alignment of the caller/exact code layout starts to play a big role, so generally it will be that configuration A is faster for certain alignments and B is faster for others.

> It also gives an obvious solution: just revert the input access pattern to what it was before this commit. This would restore most of the performance as it was.

@Cyan4973
Owner

> I would bet it's not about the ptr access pattern, but the FE counters. On TGL there is no regression for things like lsd, dsb, mite, branch-misses, etc., but that may not be true for SKL. At that point, however, the alignment of the caller/exact code layout starts to play a big role, so generally it will be that configuration A is faster for certain alignments and B is faster for others.

Side effects of instruction address alignment are a bane for benchmarking high-performance loops. If that's what is happening here, I would expect some of these side effects to leave visible traces in a well-chosen list of hardware counters.

Another aspect to consider is that, if one expression of the nested if() is more susceptible to these random side effects, and the other is not, it's still beneficial to select the least vulnerable expression.

@goldsteinn
Contributor Author
Commented on 7221be4, Jul 18, 2023

> I would bet it's not about the ptr access pattern, but the FE counters. On TGL there is no regression for things like lsd, dsb, mite, branch-misses, etc., but that may not be true for SKL. At that point, however, the alignment of the caller/exact code layout starts to play a big role, so generally it will be that configuration A is faster for certain alignments and B is faster for others.
>
> Side effects of instruction address alignment are a bane for benchmarking high-performance loops. If that's what is happening here, I would expect some of these side effects to leave visible traces in a well-chosen list of hardware counters.

So the counters above are generally signals for this sort of thing. I would expect at least one of them to differ before/after the change on SKL.

> Another aspect to consider is that, if one expression of the nested if() is more susceptible to these random side effects, and the other is not, it's still beneficial to select the least vulnerable expression.

Maybe, although it's pretty arbitrary. You are deciding the sizes of the common head/common tail BBs.

If the 2x unconditional mix16B calls come first, the head BB is large (less freedom) but the tail is smaller (more freedom). If the 2x unconditional mix16B calls come last, the head BB is small (more freedom) but the tail is large (less freedom). More/less freedom here means freedom for inlining (or outlining a common prefix).

Now I'd argue that since every path MUST go through the function entry, making that block larger has less side effect on compiler freedom. But there is a meaningful decision about inlining/outlining the tail, so it would make slightly more sense to keep it smaller.

The most important factor in this case, however, is just the BB order and which instructions are in them, but that is important the same way a butterfly flapping its wings is important. It may have dramatic consequences, but not in any reasonably predictable manner.

If you look at the actual codegen, putting the 2x unconditional mix16B at the end produces an extra unconditional jmp for each of the BBs (except size <= 32), so more jumps probably implies more that can go wrong in the FE. But in this case, the difference in BB structure in the IR neither enforces nor implies that. As well, whether two or three branches happen to hash to the same BTB entry will ultimately be what plays the biggest role, and that can happen either way ("luck" given the codegen logic of either LLVM or GCC AFAIK, minus of course inline asm, which has plenty of its own de-optimizing characteristics that rarely show up in microbenchmarks, or function alignment, but we want this function to inline).

Edit: It's also worth noting that these FE states are whole-program dependent, so where the branches in the benchmark loop land is ultimately just as important. IMHO it makes sense to optimize for these things if you control the inner AND the outer loop, but not if you only control one, beyond maybe "fewer basic blocks is better".

@goldsteinn
Contributor Author

Do you want a patch to drop the second accum?

@Cyan4973
Owner

> Do you want a patch to drop the second accum?

Not necessarily; it seems to have less impact. I will run an experiment to determine.

@goldsteinn
Contributor Author

> Do you want a patch to drop the second accum?
>
> Not necessarily; it seems to have less impact. I will run an experiment to determine.

Okay. Just based on the ASM (and some benchmarks on TGL), it seems more likely to get in the way of register allocation than to do any good. So my thinking was that unless microbenchmarks (which typically don't stress register allocation much) really push for it, it makes sense to drop it. But LMK what you decide (or just post it yourself :)

@Cyan4973
Owner

After testing, I tried the double accumulator, but the impact was negative. I think I will simply revert this change and keep the single-accumulator design, coupled with the original nest order. This is the simplest way to restore performance to what it was.

It will still be possible to revisit this topic after the release.

@goldsteinn
Contributor Author
Commented on 7221be4, Jul 18, 2023

> Can you link cpu/compiler details?

cpu: i7-9700K
compiler: Ubuntu clang version 12.0.0-3ubuntu1~20.04.5

I tried multiple compilers, but the one above felt the most "stable" and therefore I used it more.

I also used gcc version 9.4.0 (Ubuntu 9.4.0-1ubuntu1~20.04.1) quite a bit, but was more concerned by the large variations I could measure on the produced binaries.
