Skip to content

Instantly share code, notes, and snippets.

@nmoinvaz
Created February 1, 2026 01:10
Show Gist options
  • Select an option

  • Save nmoinvaz/556d98fa31c391bda5afff3b43645d2d to your computer and use it in GitHub Desktop.

Select an option

Save nmoinvaz/556d98fa31c391bda5afff3b43645d2d to your computer and use it in GitHub Desktop.
Zlib-ng micro-benchmark comparing sym_buf read/write strategies for deflate symbol tallying (LIT_MEM split arrays vs. packed 3-byte sym_buf with byte, 2-byte, and 4-byte access).
/* benchmark_tally.cc -- benchmark sym_buf read/write strategies
* Copyright (C) 2024 zlib-ng contributors
* For conditions of distribution and use, see copyright notice in zlib.h
*
* Compares:
* 1. LIT_MEM (separate d_buf/l_buf arrays)
* 2. sym_buf with zng_memread_4/zng_memwrite_4 (batched)
* 3. sym_buf with byte-by-byte access (original)
*/
#include <benchmark/benchmark.h>
#include <cstdlib>
#include <cstring>
#include <cstdint>
#include <vector>
extern "C" {
# include "zbuild.h"
# include "zendian.h"
# include "zmemory.h"
}
/* Simulate realistic symbol buffer sizes.
 * constexpr instead of #define: typed, scoped, debugger-visible. */
constexpr int LIT_BUFSIZE = 16384;
/* Byte offset just past the last full 3-byte symbol slot; kept for parity
 * with deflate's sym_end even though the benchmarks below size their own
 * loops via NUM_SYMBOLS. */
[[maybe_unused]] constexpr int SYM_END = (LIT_BUFSIZE - 1) * 3;
/* Test data: mix of literals (dist=0) and matches.
 * One deflate tally entry as fed to the write benchmarks below.  The field
 * widths (16-bit distance + 8-bit length/literal) mirror the 3 payload bytes
 * packed into sym_buf, so the layout itself is part of what is measured —
 * do not reorder or widen these members. */
struct Symbol {
uint16_t dist;     /* match distance; 0 marks a literal */
uint8_t len_or_lit; /* literal byte, or match length stored as 0-255 */
};
/* Fixture holding the three buffer layouts under test plus a deterministic
 * symbol stream.  Buffers are std::vector members (RAII): the previous
 * malloc() calls were unchecked and memset() on a null return would have
 * been UB; vectors throw on allocation failure and free themselves, so
 * TearDown has nothing to release.  The benchmark loop bodies are kept
 * logically identical so the measured access patterns are unchanged. */
class tally_benchmark : public benchmark::Fixture {
protected:
    /* LIT_MEM style: parallel arrays, entry i holds one symbol's distance
     * (d_buf) and length-or-literal byte (l_buf). */
    std::vector<uint16_t> d_buf;
    std::vector<uint8_t> l_buf;
    /* sym_buf style: 3 packed bytes per symbol.  Sized with 4 bytes of
     * slack so the zng_memread_4/zng_memwrite_4 variants never touch
     * memory past the end of the allocation. */
    std::vector<uint8_t> sym_buf;
    /* Pre-generated input shared by every benchmark body. */
    std::vector<Symbol> test_symbols;
    /* Number of symbols to process per iteration */
    static constexpr int NUM_SYMBOLS = 10000;
public:
    /* Allocate and zero the buffers, then generate a deterministic stream
     * of ~70% literals / ~30% matches (realistic deflate mix). */
    void SetUp(::benchmark::State&) override {
        d_buf.assign(LIT_BUFSIZE, 0);
        l_buf.assign(LIT_BUFSIZE, 0);
        sym_buf.assign(LIT_BUFSIZE * 3 + 4, 0); /* +4 for 4-byte read safety */
        test_symbols.clear();
        test_symbols.reserve(NUM_SYMBOLS);
        for (int i = 0; i < NUM_SYMBOLS; i++) {
            Symbol s;
            if ((i % 10) < 7) {
                /* Literal */
                s.dist = 0;
                s.len_or_lit = (uint8_t)(i & 0xff);
            } else {
                /* Match: distance 1-32768, length 3-258 (stored as 0-255) */
                s.dist = (uint16_t)((i * 7) % 32768 + 1);
                s.len_or_lit = (uint8_t)((i * 3) % 256);
            }
            test_symbols.push_back(s);
        }
    }
    /* Buffers are RAII members; nothing to release explicitly. */
    void TearDown(const ::benchmark::State&) override {}
    /* ========== WRITE BENCHMARKS ========== */
    /* LIT_MEM write: one store into each of two separate arrays. */
    void BenchWriteLitMem(benchmark::State& state) {
        for (auto _ : state) {
            unsigned int sym_next = 0;
            for (const auto& s : test_symbols) {
                d_buf[sym_next] = s.dist;
                l_buf[sym_next] = s.len_or_lit;
                sym_next++;
            }
            benchmark::DoNotOptimize(sym_next);
            benchmark::ClobberMemory();
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* sym_buf write: original byte-by-byte stores (three 1-byte writes). */
    void BenchWriteSymBufOriginal(benchmark::State& state) {
        for (auto _ : state) {
            unsigned int sym_next = 0;
            for (const auto& s : test_symbols) {
                sym_buf[sym_next] = (uint8_t)(s.dist);
                sym_buf[sym_next + 1] = (uint8_t)(s.dist >> 8);
                sym_buf[sym_next + 2] = s.len_or_lit;
                sym_next += 3;
            }
            benchmark::DoNotOptimize(sym_next);
            benchmark::ClobberMemory();
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* sym_buf write: zng_memwrite_4 (writes 4 bytes, only 3 used; the 4th
     * byte is overwritten by the next symbol's write).
     * Layout: [dist_lo, dist_hi, len, 0] — assumes little-endian packing,
     * per the comments in the original deflate code. */
    void BenchWriteSymBufMemwrite4(benchmark::State& state) {
        for (auto _ : state) {
            unsigned int sym_next = 0;
            for (const auto& s : test_symbols) {
                /* Pack: [dist_lo, dist_hi, len, 0] on little-endian */
                uint32_t val = s.dist | ((uint32_t)s.len_or_lit << 16);
                zng_memwrite_4(&sym_buf[sym_next], val);
                sym_next += 3;
            }
            benchmark::DoNotOptimize(sym_next);
            benchmark::ClobberMemory();
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* sym_buf write: swapped layout [len, dist_lo, dist_hi, 0]. */
    void BenchWriteSymBufSwapped(benchmark::State& state) {
        for (auto _ : state) {
            unsigned int sym_next = 0;
            for (const auto& s : test_symbols) {
                /* Pack: [len, dist_lo, dist_hi, 0] on little-endian */
                uint32_t val = s.len_or_lit | ((uint32_t)s.dist << 8);
                zng_memwrite_4(&sym_buf[sym_next], val);
                sym_next += 3;
            }
            benchmark::DoNotOptimize(sym_next);
            benchmark::ClobberMemory();
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* sym_buf write: zng_memwrite_2 for the distance + a byte store for the
     * length (hybrid: one 2-byte write, one 1-byte write). */
    void BenchWriteSymBufHybrid(benchmark::State& state) {
        for (auto _ : state) {
            unsigned int sym_next = 0;
            for (const auto& s : test_symbols) {
                zng_memwrite_2(&sym_buf[sym_next], s.dist);
                sym_buf[sym_next + 2] = s.len_or_lit;
                sym_next += 3;
            }
            benchmark::DoNotOptimize(sym_next);
            benchmark::ClobberMemory();
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* ========== READ BENCHMARKS ========== */
    /* Fill d_buf/l_buf and sym_buf (default [dist_lo, dist_hi, len] layout)
     * from test_symbols so the read benchmarks have real data. */
    void PopulateBuffers() {
        unsigned int sym_next = 0;
        for (const auto& s : test_symbols) {
            d_buf[sym_next] = s.dist;
            l_buf[sym_next] = s.len_or_lit;
            sym_buf[sym_next * 3] = (uint8_t)(s.dist);
            sym_buf[sym_next * 3 + 1] = (uint8_t)(s.dist >> 8);
            sym_buf[sym_next * 3 + 2] = s.len_or_lit;
            sym_next++;
        }
    }
    /* LIT_MEM read: indexed loads from two separate arrays. */
    void BenchReadLitMem(benchmark::State& state) {
        PopulateBuffers();
        for (auto _ : state) {
            uint64_t checksum = 0;
            for (int i = 0; i < NUM_SYMBOLS; i++) {
                uint16_t dist = d_buf[i];
                uint8_t lc = l_buf[i];
                checksum += dist + lc;
            }
            benchmark::DoNotOptimize(checksum);
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* sym_buf read: original byte-by-byte loads (mirrors compress_block). */
    void BenchReadSymBufOriginal(benchmark::State& state) {
        PopulateBuffers();
        for (auto _ : state) {
            uint64_t checksum = 0;
            unsigned int sx = 0;
            for (int i = 0; i < NUM_SYMBOLS; i++) {
                uint16_t dist = sym_buf[sx++] & 0xff;
                dist += (unsigned)(sym_buf[sx++] & 0xff) << 8;
                uint8_t lc = sym_buf[sx++];
                checksum += dist + lc;
            }
            benchmark::DoNotOptimize(checksum);
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* sym_buf read: one zng_memread_4 per symbol, top byte discarded.
     * Layout: [dist_lo, dist_hi, len]; the +4 buffer slack keeps the final
     * 4-byte read in bounds. */
    void BenchReadSymBufMemread4(benchmark::State& state) {
        PopulateBuffers();
        for (auto _ : state) {
            uint64_t checksum = 0;
            unsigned int sx = 0;
            for (int i = 0; i < NUM_SYMBOLS; i++) {
                uint32_t val = zng_memread_4(&sym_buf[sx]);
                uint16_t dist = val & 0xffff;
                uint8_t lc = (val >> 16) & 0xff;
                sx += 3;
                checksum += dist + lc;
            }
            benchmark::DoNotOptimize(checksum);
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* sym_buf read: swapped layout [len, dist_lo, dist_hi]; buffer is
     * populated here (not via PopulateBuffers) because the layout differs. */
    void BenchReadSymBufSwapped(benchmark::State& state) {
        /* Populate with swapped layout */
        unsigned int sym_next = 0;
        for (const auto& s : test_symbols) {
            sym_buf[sym_next * 3] = s.len_or_lit;
            sym_buf[sym_next * 3 + 1] = (uint8_t)(s.dist);
            sym_buf[sym_next * 3 + 2] = (uint8_t)(s.dist >> 8);
            sym_next++;
        }
        for (auto _ : state) {
            uint64_t checksum = 0;
            unsigned int sx = 0;
            for (int i = 0; i < NUM_SYMBOLS; i++) {
                uint32_t val = zng_memread_4(&sym_buf[sx]);
                uint8_t lc = val & 0xff;
                uint16_t dist = (val >> 8) & 0xffff;
                sx += 3;
                checksum += dist + lc;
            }
            benchmark::DoNotOptimize(checksum);
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* sym_buf read: zng_memread_2 for the distance + a byte load (hybrid). */
    void BenchReadSymBufHybrid(benchmark::State& state) {
        PopulateBuffers();
        for (auto _ : state) {
            uint64_t checksum = 0;
            unsigned int sx = 0;
            for (int i = 0; i < NUM_SYMBOLS; i++) {
                uint16_t dist = zng_memread_2(&sym_buf[sx]);
                uint8_t lc = sym_buf[sx + 2];
                sx += 3;
                checksum += dist + lc;
            }
            benchmark::DoNotOptimize(checksum);
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS);
    }
    /* ========== COMBINED READ+WRITE (realistic scenario) ========== */
    /* LIT_MEM roundtrip: write all symbols, then read them back. */
    void BenchRoundtripLitMem(benchmark::State& state) {
        for (auto _ : state) {
            /* Write phase */
            unsigned int sym_next = 0;
            for (const auto& s : test_symbols) {
                d_buf[sym_next] = s.dist;
                l_buf[sym_next] = s.len_or_lit;
                sym_next++;
            }
            /* Read phase */
            uint64_t checksum = 0;
            for (unsigned int i = 0; i < sym_next; i++) {
                uint16_t dist = d_buf[i];
                uint8_t lc = l_buf[i];
                checksum += dist + lc;
            }
            benchmark::DoNotOptimize(checksum);
            benchmark::ClobberMemory();
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS * 2);
    }
    /* sym_buf roundtrip: byte-by-byte write, then byte-by-byte read. */
    void BenchRoundtripSymBufOriginal(benchmark::State& state) {
        for (auto _ : state) {
            /* Write phase */
            unsigned int sym_next = 0;
            for (const auto& s : test_symbols) {
                sym_buf[sym_next] = (uint8_t)(s.dist);
                sym_buf[sym_next + 1] = (uint8_t)(s.dist >> 8);
                sym_buf[sym_next + 2] = s.len_or_lit;
                sym_next += 3;
            }
            /* Read phase */
            uint64_t checksum = 0;
            unsigned int sx = 0;
            while (sx < sym_next) {
                uint16_t dist = sym_buf[sx++] & 0xff;
                dist += (unsigned)(sym_buf[sx++] & 0xff) << 8;
                uint8_t lc = sym_buf[sx++];
                checksum += dist + lc;
            }
            benchmark::DoNotOptimize(checksum);
            benchmark::ClobberMemory();
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS * 2);
    }
    /* sym_buf roundtrip, optimized: zng_memwrite_4 write + zng_memread_4
     * read.  Layout: [dist_lo, dist_hi, len]. */
    void BenchRoundtripSymBufOptimized(benchmark::State& state) {
        for (auto _ : state) {
            /* Write phase - use zng_memwrite_4 */
            unsigned int sym_next = 0;
            for (const auto& s : test_symbols) {
                uint32_t val = s.dist | ((uint32_t)s.len_or_lit << 16);
                zng_memwrite_4(&sym_buf[sym_next], val);
                sym_next += 3;
            }
            /* Read phase - use zng_memread_4 */
            uint64_t checksum = 0;
            unsigned int sx = 0;
            while (sx < sym_next) {
                uint32_t val = zng_memread_4(&sym_buf[sx]);
                uint16_t dist = val & 0xffff;
                uint8_t lc = (val >> 16) & 0xff;
                sx += 3;
                checksum += dist + lc;
            }
            benchmark::DoNotOptimize(checksum);
            benchmark::ClobberMemory();
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS * 2);
    }
    /* sym_buf roundtrip, swapped layout [len, dist_lo, dist_hi]. */
    void BenchRoundtripSymBufSwapped(benchmark::State& state) {
        for (auto _ : state) {
            /* Write phase */
            unsigned int sym_next = 0;
            for (const auto& s : test_symbols) {
                uint32_t val = s.len_or_lit | ((uint32_t)s.dist << 8);
                zng_memwrite_4(&sym_buf[sym_next], val);
                sym_next += 3;
            }
            /* Read phase */
            uint64_t checksum = 0;
            unsigned int sx = 0;
            while (sx < sym_next) {
                uint32_t val = zng_memread_4(&sym_buf[sx]);
                uint8_t lc = val & 0xff;
                uint16_t dist = (val >> 8) & 0xffff;
                sx += 3;
                checksum += dist + lc;
            }
            benchmark::DoNotOptimize(checksum);
            benchmark::ClobberMemory();
        }
        state.SetItemsProcessed(int64_t(state.iterations()) * NUM_SYMBOLS * 2);
    }
};
/* Write benchmarks.  BENCHMARK_F defines and registers in one step,
 * equivalent to the BENCHMARK_DEFINE_F + BENCHMARK_REGISTER_F pair. */
BENCHMARK_F(tally_benchmark, write_lit_mem)(benchmark::State& state) {
    BenchWriteLitMem(state);
}
BENCHMARK_F(tally_benchmark, write_sym_buf_original)(benchmark::State& state) {
    BenchWriteSymBufOriginal(state);
}
BENCHMARK_F(tally_benchmark, write_sym_buf_memwrite4)(benchmark::State& state) {
    BenchWriteSymBufMemwrite4(state);
}
BENCHMARK_F(tally_benchmark, write_sym_buf_swapped)(benchmark::State& state) {
    BenchWriteSymBufSwapped(state);
}
BENCHMARK_F(tally_benchmark, write_sym_buf_hybrid)(benchmark::State& state) {
    BenchWriteSymBufHybrid(state);
}
/* Read benchmarks.  BENCHMARK_F defines and registers in one step,
 * equivalent to the BENCHMARK_DEFINE_F + BENCHMARK_REGISTER_F pair. */
BENCHMARK_F(tally_benchmark, read_lit_mem)(benchmark::State& state) {
    BenchReadLitMem(state);
}
BENCHMARK_F(tally_benchmark, read_sym_buf_original)(benchmark::State& state) {
    BenchReadSymBufOriginal(state);
}
BENCHMARK_F(tally_benchmark, read_sym_buf_memread4)(benchmark::State& state) {
    BenchReadSymBufMemread4(state);
}
BENCHMARK_F(tally_benchmark, read_sym_buf_swapped)(benchmark::State& state) {
    BenchReadSymBufSwapped(state);
}
BENCHMARK_F(tally_benchmark, read_sym_buf_hybrid)(benchmark::State& state) {
    BenchReadSymBufHybrid(state);
}
/* Roundtrip benchmarks.  BENCHMARK_F defines and registers in one step,
 * equivalent to the BENCHMARK_DEFINE_F + BENCHMARK_REGISTER_F pair. */
BENCHMARK_F(tally_benchmark, roundtrip_lit_mem)(benchmark::State& state) {
    BenchRoundtripLitMem(state);
}
BENCHMARK_F(tally_benchmark, roundtrip_sym_buf_original)(benchmark::State& state) {
    BenchRoundtripSymBufOriginal(state);
}
BENCHMARK_F(tally_benchmark, roundtrip_sym_buf_optimized)(benchmark::State& state) {
    BenchRoundtripSymBufOptimized(state);
}
BENCHMARK_F(tally_benchmark, roundtrip_sym_buf_swapped)(benchmark::State& state) {
    BenchRoundtripSymBufSwapped(state);
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment