Created
February 1, 2026 01:10
-
-
Save nmoinvaz/556d98fa31c391bda5afff3b43645d2d to your computer and use it in GitHub Desktop.
Zlib-ng benchmark for deflate tallying
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| /* benchmark_tally.cc -- benchmark sym_buf read/write strategies | |
| * Copyright (C) 2024 zlib-ng contributors | |
| * For conditions of distribution and use, see copyright notice in zlib.h | |
| * | |
| * Compares: | |
| * 1. LIT_MEM (separate d_buf/l_buf arrays) | |
| * 2. sym_buf with zng_memread_4/zng_memwrite_4 (batched) | |
| * 3. sym_buf with byte-by-byte access (original) | |
| */ | |
| #include <benchmark/benchmark.h> | |
| #include <cstdlib> | |
| #include <cstring> | |
| #include <cstdint> | |
| #include <vector> | |
| extern "C" { | |
| # include "zbuild.h" | |
| # include "zendian.h" | |
| # include "zmemory.h" | |
| } | |
/* Simulate realistic symbol buffer sizes (matches zlib-ng's default
 * lit_bufsize). NOTE: the original also defined SYM_END, which was never
 * referenced anywhere in this file; the dead macro has been removed. */
#define LIT_BUFSIZE 16384

/* Test data: one deflate tally symbol.
 * dist == 0 marks a literal, where len_or_lit holds the literal byte.
 * dist != 0 marks a match of that distance, where len_or_lit holds the
 * match length 3-258 stored as 0-255 (i.e. length - 3). */
struct Symbol {
    uint16_t dist;       /* match distance, or 0 for a literal */
    uint8_t len_or_lit;  /* literal byte, or match length - 3 */
};
| class tally_benchmark : public benchmark::Fixture { | |
| protected: | |
| /* LIT_MEM style buffers */ | |
| uint16_t *d_buf; | |
| uint8_t *l_buf; | |
| /* sym_buf style buffer (3 bytes per symbol) */ | |
| uint8_t *sym_buf; | |
| /* Test symbols to write/read */ | |
| std::vector<Symbol> test_symbols; | |
| /* Number of symbols to process per iteration */ | |
| static constexpr int NUM_SYMBOLS = 10000; | |
| public: | |
| void SetUp(::benchmark::State&) { | |
| d_buf = (uint16_t *)malloc(LIT_BUFSIZE * sizeof(uint16_t)); | |
| l_buf = (uint8_t *)malloc(LIT_BUFSIZE * sizeof(uint8_t)); | |
| sym_buf = (uint8_t *)malloc(LIT_BUFSIZE * 3 + 4); /* +4 for 4-byte read safety */ | |
| memset(d_buf, 0, LIT_BUFSIZE * sizeof(uint16_t)); | |
| memset(l_buf, 0, LIT_BUFSIZE * sizeof(uint8_t)); | |
| memset(sym_buf, 0, LIT_BUFSIZE * 3 + 4); | |
| /* Generate realistic test data: ~70% literals, ~30% matches */ | |
| test_symbols.clear(); | |
| test_symbols.reserve(NUM_SYMBOLS); | |
| for (int i = 0; i < NUM_SYMBOLS; i++) { | |
| Symbol s; | |
| if ((i % 10) < 7) { | |
| /* Literal */ | |
| s.dist = 0; | |
| s.len_or_lit = (uint8_t)(i & 0xff); | |
| } else { | |
| /* Match: distance 1-32768, length 3-258 (stored as 0-255) */ | |
| s.dist = (uint16_t)((i * 7) % 32768 + 1); | |
| s.len_or_lit = (uint8_t)((i * 3) % 256); | |
| } | |
| test_symbols.push_back(s); | |
| } | |
| } | |
| void TearDown(const ::benchmark::State&) { | |
| free(d_buf); | |
| free(l_buf); | |
| free(sym_buf); | |
| } | |
| /* ========== WRITE BENCHMARKS ========== */ | |
| /* LIT_MEM write: separate arrays */ | |
| void BenchWriteLitMem(benchmark::State& state) { | |
| for (auto _ : state) { | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| d_buf[sym_next] = s.dist; | |
| l_buf[sym_next] = s.len_or_lit; | |
| sym_next++; | |
| } | |
| benchmark::DoNotOptimize(sym_next); | |
| benchmark::ClobberMemory(); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* sym_buf write: original byte-by-byte */ | |
| void BenchWriteSymBufOriginal(benchmark::State& state) { | |
| for (auto _ : state) { | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| sym_buf[sym_next] = (uint8_t)(s.dist); | |
| sym_buf[sym_next + 1] = (uint8_t)(s.dist >> 8); | |
| sym_buf[sym_next + 2] = s.len_or_lit; | |
| sym_next += 3; | |
| } | |
| benchmark::DoNotOptimize(sym_next); | |
| benchmark::ClobberMemory(); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* sym_buf write: zng_memwrite_4 (writes 4 bytes, only 3 used) */ | |
| /* Layout: [dist_lo, dist_hi, len, 0] */ | |
| void BenchWriteSymBufMemwrite4(benchmark::State& state) { | |
| for (auto _ : state) { | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| /* Pack: [dist_lo, dist_hi, len, 0] on little-endian */ | |
| uint32_t val = s.dist | ((uint32_t)s.len_or_lit << 16); | |
| zng_memwrite_4(&sym_buf[sym_next], val); | |
| sym_next += 3; | |
| } | |
| benchmark::DoNotOptimize(sym_next); | |
| benchmark::ClobberMemory(); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* sym_buf write: swapped layout [len, dist_lo, dist_hi, 0] */ | |
| void BenchWriteSymBufSwapped(benchmark::State& state) { | |
| for (auto _ : state) { | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| /* Pack: [len, dist_lo, dist_hi, 0] on little-endian */ | |
| uint32_t val = s.len_or_lit | ((uint32_t)s.dist << 8); | |
| zng_memwrite_4(&sym_buf[sym_next], val); | |
| sym_next += 3; | |
| } | |
| benchmark::DoNotOptimize(sym_next); | |
| benchmark::ClobberMemory(); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* sym_buf write: zng_memwrite_2 + strb (hybrid) */ | |
| void BenchWriteSymBufHybrid(benchmark::State& state) { | |
| for (auto _ : state) { | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| zng_memwrite_2(&sym_buf[sym_next], s.dist); | |
| sym_buf[sym_next + 2] = s.len_or_lit; | |
| sym_next += 3; | |
| } | |
| benchmark::DoNotOptimize(sym_next); | |
| benchmark::ClobberMemory(); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* ========== READ BENCHMARKS ========== */ | |
| /* First, populate buffers for read benchmarks */ | |
| void PopulateBuffers() { | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| d_buf[sym_next] = s.dist; | |
| l_buf[sym_next] = s.len_or_lit; | |
| sym_buf[sym_next * 3] = (uint8_t)(s.dist); | |
| sym_buf[sym_next * 3 + 1] = (uint8_t)(s.dist >> 8); | |
| sym_buf[sym_next * 3 + 2] = s.len_or_lit; | |
| sym_next++; | |
| } | |
| } | |
| /* LIT_MEM read: separate arrays */ | |
| void BenchReadLitMem(benchmark::State& state) { | |
| PopulateBuffers(); | |
| for (auto _ : state) { | |
| uint64_t checksum = 0; | |
| for (int i = 0; i < NUM_SYMBOLS; i++) { | |
| uint16_t dist = d_buf[i]; | |
| uint8_t lc = l_buf[i]; | |
| checksum += dist + lc; | |
| } | |
| benchmark::DoNotOptimize(checksum); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* sym_buf read: original byte-by-byte */ | |
| void BenchReadSymBufOriginal(benchmark::State& state) { | |
| PopulateBuffers(); | |
| for (auto _ : state) { | |
| uint64_t checksum = 0; | |
| unsigned int sx = 0; | |
| for (int i = 0; i < NUM_SYMBOLS; i++) { | |
| uint16_t dist = sym_buf[sx++] & 0xff; | |
| dist += (unsigned)(sym_buf[sx++] & 0xff) << 8; | |
| uint8_t lc = sym_buf[sx++]; | |
| checksum += dist + lc; | |
| } | |
| benchmark::DoNotOptimize(checksum); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* sym_buf read: zng_memread_4 - layout [dist_lo, dist_hi, len] */ | |
| void BenchReadSymBufMemread4(benchmark::State& state) { | |
| PopulateBuffers(); | |
| for (auto _ : state) { | |
| uint64_t checksum = 0; | |
| unsigned int sx = 0; | |
| for (int i = 0; i < NUM_SYMBOLS; i++) { | |
| uint32_t val = zng_memread_4(&sym_buf[sx]); | |
| uint16_t dist = val & 0xffff; | |
| uint8_t lc = (val >> 16) & 0xff; | |
| sx += 3; | |
| checksum += dist + lc; | |
| } | |
| benchmark::DoNotOptimize(checksum); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* sym_buf read: swapped layout [len, dist_lo, dist_hi] */ | |
| void BenchReadSymBufSwapped(benchmark::State& state) { | |
| /* Populate with swapped layout */ | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| sym_buf[sym_next * 3] = s.len_or_lit; | |
| sym_buf[sym_next * 3 + 1] = (uint8_t)(s.dist); | |
| sym_buf[sym_next * 3 + 2] = (uint8_t)(s.dist >> 8); | |
| sym_next++; | |
| } | |
| for (auto _ : state) { | |
| uint64_t checksum = 0; | |
| unsigned int sx = 0; | |
| for (int i = 0; i < NUM_SYMBOLS; i++) { | |
| uint32_t val = zng_memread_4(&sym_buf[sx]); | |
| uint8_t lc = val & 0xff; | |
| uint16_t dist = (val >> 8) & 0xffff; | |
| sx += 3; | |
| checksum += dist + lc; | |
| } | |
| benchmark::DoNotOptimize(checksum); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* sym_buf read: zng_memread_2 + byte (hybrid) */ | |
| void BenchReadSymBufHybrid(benchmark::State& state) { | |
| PopulateBuffers(); | |
| for (auto _ : state) { | |
| uint64_t checksum = 0; | |
| unsigned int sx = 0; | |
| for (int i = 0; i < NUM_SYMBOLS; i++) { | |
| uint16_t dist = zng_memread_2(&sym_buf[sx]); | |
| uint8_t lc = sym_buf[sx + 2]; | |
| sx += 3; | |
| checksum += dist + lc; | |
| } | |
| benchmark::DoNotOptimize(checksum); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS); | |
| } | |
| /* ========== COMBINED READ+WRITE (realistic scenario) ========== */ | |
| /* LIT_MEM: write then read */ | |
| void BenchRoundtripLitMem(benchmark::State& state) { | |
| for (auto _ : state) { | |
| /* Write phase */ | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| d_buf[sym_next] = s.dist; | |
| l_buf[sym_next] = s.len_or_lit; | |
| sym_next++; | |
| } | |
| /* Read phase */ | |
| uint64_t checksum = 0; | |
| for (unsigned int i = 0; i < sym_next; i++) { | |
| uint16_t dist = d_buf[i]; | |
| uint8_t lc = l_buf[i]; | |
| checksum += dist + lc; | |
| } | |
| benchmark::DoNotOptimize(checksum); | |
| benchmark::ClobberMemory(); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS * 2); | |
| } | |
| /* sym_buf original: write then read byte-by-byte */ | |
| void BenchRoundtripSymBufOriginal(benchmark::State& state) { | |
| for (auto _ : state) { | |
| /* Write phase */ | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| sym_buf[sym_next] = (uint8_t)(s.dist); | |
| sym_buf[sym_next + 1] = (uint8_t)(s.dist >> 8); | |
| sym_buf[sym_next + 2] = s.len_or_lit; | |
| sym_next += 3; | |
| } | |
| /* Read phase */ | |
| uint64_t checksum = 0; | |
| unsigned int sx = 0; | |
| while (sx < sym_next) { | |
| uint16_t dist = sym_buf[sx++] & 0xff; | |
| dist += (unsigned)(sym_buf[sx++] & 0xff) << 8; | |
| uint8_t lc = sym_buf[sx++]; | |
| checksum += dist + lc; | |
| } | |
| benchmark::DoNotOptimize(checksum); | |
| benchmark::ClobberMemory(); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS * 2); | |
| } | |
| /* sym_buf optimized: write with memwrite_4, read with memread_4 */ | |
| /* Layout: [dist_lo, dist_hi, len] */ | |
| void BenchRoundtripSymBufOptimized(benchmark::State& state) { | |
| for (auto _ : state) { | |
| /* Write phase - use zng_memwrite_4 */ | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| uint32_t val = s.dist | ((uint32_t)s.len_or_lit << 16); | |
| zng_memwrite_4(&sym_buf[sym_next], val); | |
| sym_next += 3; | |
| } | |
| /* Read phase - use zng_memread_4 */ | |
| uint64_t checksum = 0; | |
| unsigned int sx = 0; | |
| while (sx < sym_next) { | |
| uint32_t val = zng_memread_4(&sym_buf[sx]); | |
| uint16_t dist = val & 0xffff; | |
| uint8_t lc = (val >> 16) & 0xff; | |
| sx += 3; | |
| checksum += dist + lc; | |
| } | |
| benchmark::DoNotOptimize(checksum); | |
| benchmark::ClobberMemory(); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS * 2); | |
| } | |
| /* sym_buf swapped: layout [len, dist_lo, dist_hi] */ | |
| void BenchRoundtripSymBufSwapped(benchmark::State& state) { | |
| for (auto _ : state) { | |
| /* Write phase */ | |
| unsigned int sym_next = 0; | |
| for (const auto& s : test_symbols) { | |
| uint32_t val = s.len_or_lit | ((uint32_t)s.dist << 8); | |
| zng_memwrite_4(&sym_buf[sym_next], val); | |
| sym_next += 3; | |
| } | |
| /* Read phase */ | |
| uint64_t checksum = 0; | |
| unsigned int sx = 0; | |
| while (sx < sym_next) { | |
| uint32_t val = zng_memread_4(&sym_buf[sx]); | |
| uint8_t lc = val & 0xff; | |
| uint16_t dist = (val >> 8) & 0xffff; | |
| sx += 3; | |
| checksum += dist + lc; | |
| } | |
| benchmark::DoNotOptimize(checksum); | |
| benchmark::ClobberMemory(); | |
| } | |
| state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS * 2); | |
| } | |
| }; | |
/* Write benchmarks */
/* Baseline: LIT_MEM separate d_buf/l_buf stores. */
BENCHMARK_DEFINE_F(tally_benchmark, write_lit_mem)(benchmark::State& state) {
    BenchWriteLitMem(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, write_lit_mem);
/* sym_buf written one byte at a time (the original strategy). */
BENCHMARK_DEFINE_F(tally_benchmark, write_sym_buf_original)(benchmark::State& state) {
    BenchWriteSymBufOriginal(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, write_sym_buf_original);
/* sym_buf written via one 4-byte zng_memwrite_4 per symbol. */
BENCHMARK_DEFINE_F(tally_benchmark, write_sym_buf_memwrite4)(benchmark::State& state) {
    BenchWriteSymBufMemwrite4(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, write_sym_buf_memwrite4);
/* 4-byte writes with the swapped [len, dist_lo, dist_hi] layout. */
BENCHMARK_DEFINE_F(tally_benchmark, write_sym_buf_swapped)(benchmark::State& state) {
    BenchWriteSymBufSwapped(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, write_sym_buf_swapped);
/* Hybrid: 2-byte distance write plus a single byte store. */
BENCHMARK_DEFINE_F(tally_benchmark, write_sym_buf_hybrid)(benchmark::State& state) {
    BenchWriteSymBufHybrid(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, write_sym_buf_hybrid);
/* Read benchmarks */
/* Baseline: LIT_MEM separate d_buf/l_buf loads. */
BENCHMARK_DEFINE_F(tally_benchmark, read_lit_mem)(benchmark::State& state) {
    BenchReadLitMem(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, read_lit_mem);
/* sym_buf read one byte at a time (the original strategy). */
BENCHMARK_DEFINE_F(tally_benchmark, read_sym_buf_original)(benchmark::State& state) {
    BenchReadSymBufOriginal(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, read_sym_buf_original);
/* sym_buf read via one 4-byte zng_memread_4 per symbol. */
BENCHMARK_DEFINE_F(tally_benchmark, read_sym_buf_memread4)(benchmark::State& state) {
    BenchReadSymBufMemread4(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, read_sym_buf_memread4);
/* 4-byte reads with the swapped [len, dist_lo, dist_hi] layout. */
BENCHMARK_DEFINE_F(tally_benchmark, read_sym_buf_swapped)(benchmark::State& state) {
    BenchReadSymBufSwapped(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, read_sym_buf_swapped);
/* Hybrid: 2-byte distance read plus a single byte load. */
BENCHMARK_DEFINE_F(tally_benchmark, read_sym_buf_hybrid)(benchmark::State& state) {
    BenchReadSymBufHybrid(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, read_sym_buf_hybrid);
/* Roundtrip benchmarks (write phase followed by read phase) */
/* Baseline: LIT_MEM separate arrays. */
BENCHMARK_DEFINE_F(tally_benchmark, roundtrip_lit_mem)(benchmark::State& state) {
    BenchRoundtripLitMem(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, roundtrip_lit_mem);
/* sym_buf accessed byte-by-byte in both phases. */
BENCHMARK_DEFINE_F(tally_benchmark, roundtrip_sym_buf_original)(benchmark::State& state) {
    BenchRoundtripSymBufOriginal(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, roundtrip_sym_buf_original);
/* sym_buf using 4-byte batched access in both phases. */
BENCHMARK_DEFINE_F(tally_benchmark, roundtrip_sym_buf_optimized)(benchmark::State& state) {
    BenchRoundtripSymBufOptimized(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, roundtrip_sym_buf_optimized);
/* 4-byte batched access with the swapped [len, dist_lo, dist_hi] layout. */
BENCHMARK_DEFINE_F(tally_benchmark, roundtrip_sym_buf_swapped)(benchmark::State& state) {
    BenchRoundtripSymBufSwapped(state);
}
BENCHMARK_REGISTER_F(tally_benchmark, roundtrip_sym_buf_swapped);
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment