A* Pathfinder optimized for 64x64 grids on ARM64 mobile SoCs
/*
 * A* Pathfinder optimized for 64x64 grids on ARM64 mobile SoCs.
 *
 * Features:
 * - Deterministic memory (strict indexed priority queue, so no duplicate
 *   entries can bloat the heap).
 * - Cache-friendly SoA layout (Structure of Arrays), ~52 KB working set
 *   sized to stay resident in a big ARM core's L1 data cache.
 * - Register-level map caching (loading bitmap rows into uint64_t registers).
 * - Fully NEON-accelerated initialization.
 * - Branchless heuristic and collision checking.
 * - Zero allocations (the context is reusable across calls).
 */
// Based on:
// https://www.youtube.com/watch?v=hUZbkqLRYus
// https://www.youtube.com/watch?v=NAVbI1HIzCE
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <arm_neon.h>
#include <limits.h>
#define cast(t,p) ((t)(p))
#define castus(p) cast(uint16_t, p)
#define GRID_SIZE 64
#define MAP_BITS (GRID_SIZE * GRID_SIZE)
#define MAP_WORDS (MAP_BITS / 64)
#define HEAP_CAPACITY MAP_BITS
#define STRAIGHT_COST 100
#define DIAG_COST 141
#define DIAG_EXTRA (DIAG_COST - STRAIGHT_COST)
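// Fixed-point step costs: DIAG_COST approximates 100 * sqrt(2) ~= 141, so a
// diagonal step costs DIAG_EXTRA = 41 more than a straight step.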
#define HEAP_INDEX_NONE 0xFFFF
// Members ordered so every array starts 128-bit (16-byte) aligned for NEON stores
struct as_ctx {
    uint32_t g_scores[MAP_BITS];  // [16 KB] Offset: 0     (64-byte aligned)
    uint32_t f_scores[MAP_BITS];  // [16 KB] Offset: 16384 (16-byte aligned)
    uint16_t heap[HEAP_CAPACITY]; // [8 KB]  Offset: 32768 (16-byte aligned)
    uint16_t indices[MAP_BITS];   // [8 KB]  Offset: 40960 (16-byte aligned)
    uint64_t closed[MAP_WORDS];   // [512 B] Offset: 49152 (16-byte aligned)
    uint8_t parents[MAP_BITS];    // [4 KB]  Offset: 49664 (16-byte aligned)
    uint16_t heap_cnt;
    uint16_t goal;
    uint8_t was_found;
    uint8_t _padding[11];         // Explicit tail padding; aligned(64) rounds sizeof up
} __attribute__((aligned(64)));
static_assert(sizeof(uint64_t) == 8, "64-bit platform required");
static_assert(MAP_BITS == 4096, "Fixed grid size mismatch");
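/*
 * Working-set arithmetic from the member sizes above:
 * 16 KB + 16 KB + 8 KB + 8 KB + 512 B + 4 KB = 52.5 KB,
 * the working-set figure quoted in the header comment.
 */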
static void
as_assert_ctx_state(const struct as_ctx *ctx) {
    assert(ctx != NULL);
    assert(ctx->heap_cnt <= HEAP_CAPACITY);
    for (int i = 0; i < ctx->heap_cnt; i++) {
        uint16_t node = ctx->heap[i];
        assert(node < MAP_BITS);
        assert(ctx->indices[node] == i);
    }
    for (int i = 0; i < MAP_BITS; i++) {
        assert(ctx->parents[i] <= 7);
        int word = i >> 6;
        int bit = i & 63;
        if (ctx->closed[word] & (1ULL << (unsigned)bit)) {
            // A closed node must have been given a finite g-score
            assert(ctx->g_scores[i] != UINT_MAX);
        }
        if (ctx->indices[i] != HEAP_INDEX_NONE) {
            assert(ctx->indices[i] < ctx->heap_cnt);
            assert(ctx->heap[ctx->indices[i]] == i);
        }
    }
    if (ctx->was_found) {
        assert(ctx->goal < MAP_BITS);
        int word = ctx->goal >> 6;
        int bit = ctx->goal & 63;
        assert(ctx->closed[word] & (1ULL << (unsigned)bit));
    }
}
static inline int
as_abs(int v) {
    // Branchless absolute value
    int msk = v >> 31;
    return (v + msk) ^ msk;
}
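// Worked example (illustrative): v = -5 gives msk = -1 (all bits set), so
// (v + msk) ^ msk = -6 ^ -1 = 5. For v >= 0, msk = 0 and v passes through
// unchanged. Right-shifting a negative int is implementation-defined in C,
// but is an arithmetic shift on ARM64 compilers in practice.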
static inline uint32_t
as_heuristic(int x1, int y1, int x2, int y2) {
    int dx = as_abs(x1 - x2);
    int dy = as_abs(y1 - y2);
    int min_d = (dx < dy ? dx : dy);
    int max_d = (dx + dy) - min_d;
    // Octile distance: 100 * max + 41 * min
    return (uint32_t)(STRAIGHT_COST * max_d + DIAG_EXTRA * min_d);
}
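// Worked example (illustrative): from (3,5) to (0,0), dx = 3 and dy = 5, so
// max_d = 5, min_d = 3 and h = 100*5 + 41*3 = 623. That equals the octile
// cost of 3 diagonal steps (3 * 141) plus 2 straight steps (2 * 100).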
static inline void
as_heap_swap(struct as_ctx *ctx, int i, int j) {
    uint16_t n1 = ctx->heap[i];
    uint16_t n2 = ctx->heap[j];
    ctx->heap[i] = n2;
    ctx->heap[j] = n1;
    ctx->indices[n1] = castus(j);
    ctx->indices[n2] = castus(i);
}
static void
as_heap_sift_up(struct as_ctx *ctx, int idx) {
    while (idx > 0) {
        int parent = (idx - 1) >> 1;
        if (ctx->f_scores[ctx->heap[idx]] >= ctx->f_scores[ctx->heap[parent]]) {
            break;
        }
        as_heap_swap(ctx, idx, parent);
        idx = parent;
    }
}
static void
as_heap_push_or_update(struct as_ctx *ctx, uint16_t node, uint32_t g, uint32_t f) {
    assert(ctx != NULL);
    assert(node < MAP_BITS);
    uint16_t existing_idx = ctx->indices[node];
    ctx->g_scores[node] = g;
    ctx->f_scores[node] = f;
    if (existing_idx == HEAP_INDEX_NONE) {
        assert(ctx->heap_cnt < HEAP_CAPACITY);
        int idx = ctx->heap_cnt;
        ctx->heap[idx] = node;
        ctx->indices[node] = castus(idx);
        ctx->heap_cnt++;
        as_heap_sift_up(ctx, idx);
    } else {
        // Decrease key: node is already in the heap, just fix its position
        as_heap_sift_up(ctx, existing_idx);
    }
}
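// Illustrative decrease-key (hypothetical values): if node 5 already sits at
// heap index 3 with f = 700 and a cheaper path yields f = 550, the scores
// are overwritten in place and index 3 is sifted up. Since A* calls this
// only when new_g improves, f never increases and sift-up alone suffices.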
static uint16_t
as_heap_pop(struct as_ctx *ctx) {
    assert(ctx->heap_cnt > 0);
    uint16_t res = ctx->heap[0];
    ctx->indices[res] = HEAP_INDEX_NONE;
    ctx->heap_cnt--;
    if (ctx->heap_cnt > 0) {
        ctx->heap[0] = ctx->heap[ctx->heap_cnt];
        ctx->indices[ctx->heap[0]] = 0;
        int idx = 0;
        int half = ctx->heap_cnt >> 1;
        while (idx < half) {
            int lo = idx;
            int lhs = (idx << 1) + 1;
            int rhs = lhs + 1;
            if (ctx->f_scores[ctx->heap[lhs]] < ctx->f_scores[ctx->heap[lo]]) {
                lo = lhs;
            }
            if (rhs < ctx->heap_cnt && ctx->f_scores[ctx->heap[rhs]] < ctx->f_scores[ctx->heap[lo]]) {
                lo = rhs;
            }
            if (lo == idx) {
                break;
            }
            as_heap_swap(ctx, idx, lo);
            idx = lo;
        }
    }
    return res;
}
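// Illustrative pop (hypothetical values): with heap = [a, b, c] and
// f(a) = 5, f(b) = 9, f(c) = 7, popping returns a, moves c to the root, and
// the sift-down loop stops immediately since 7 <= 9. indices[] is patched
// on every move so later decrease-key calls stay O(log n).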
extern void
as_solve(struct as_ctx *ctx, const uint64_t *map,
         int start_x, int start_y, int goal_x, int goal_y) {
    assert(ctx != NULL);
    assert(map != NULL);
    assert(0 <= start_x && start_x < GRID_SIZE);
    assert(0 <= start_y && start_y < GRID_SIZE);
    assert(0 <= goal_x && goal_x < GRID_SIZE);
    assert(0 <= goal_y && goal_y < GRID_SIZE);
    static const int8_t dx[8] = { 0, 0, -1, 1, -1, -1, 1, 1 };
    static const int8_t dy[8] = {-1, 1,  0, 0, -1,  1, -1, 1 };
    static const uint8_t dcost[8] = {
        STRAIGHT_COST, STRAIGHT_COST, STRAIGHT_COST, STRAIGHT_COST,
        DIAG_COST, DIAG_COST, DIAG_COST, DIAG_COST
    };
    int src = start_y * GRID_SIZE + start_x;
    int dst = goal_y * GRID_SIZE + goal_x;
    // Reset all per-solve state before any validation or early-out. This
    // keeps the context consistent (and the state assertion valid) even on
    // the very first call, when the memory behind ctx is uninitialized.
    uint64x2_t zero_u64 = vdupq_n_u64(0);
    for (int i = 0; i < MAP_WORDS; i += 2) {
        vst1q_u64(&ctx->closed[i], zero_u64);
    }
    uint32x4_t max_u32 = vdupq_n_u32(UINT_MAX);
    for (int i = 0; i < MAP_BITS; i += 4) {
        vst1q_u32(&ctx->g_scores[i], max_u32);
        vst1q_u32(&ctx->f_scores[i], max_u32);
    }
    uint16x8_t max_u16 = vdupq_n_u16(HEAP_INDEX_NONE);
    for (int i = 0; i < MAP_BITS; i += 8) {
        vst1q_u16(&ctx->indices[i], max_u16);
    }
    uint8x16_t zero_u8 = vdupq_n_u8(0);
    for (int i = 0; i < MAP_BITS; i += 16) {
        vst1q_u8(&ctx->parents[i], zero_u8);
    }
    ctx->heap_cnt = 0;
    ctx->was_found = 0;
    ctx->goal = 0;
    as_assert_ctx_state(ctx);
    // Bail out if either endpoint sits on a blocked cell (was_found stays 0)
    if ((map[start_y] & (1ULL << start_x)) || (map[goal_y] & (1ULL << goal_x))) {
        return;
    }
    uint32_t h = as_heuristic(start_x, start_y, goal_x, goal_y);
    as_heap_push_or_update(ctx, castus(src), 0, h);
    while (ctx->heap_cnt > 0) {
        uint16_t cur = as_heap_pop(ctx);
        int cx = cur & 63;
        int cy = cur >> 6;
        ctx->closed[cy] |= (1ULL << cx);
        if (cur == dst) {
            ctx->was_found = 1;
            ctx->goal = cur;
            break;
        }
        // Load rows into registers to avoid repeated L1 cache loads inside
        // the neighbor loop. map[] is const (read-only) and closed[] is
        // modified only at bit (cx, cy) above, so these copies stay valid.
        // Out-of-bounds rows read as ~0ULL, i.e. fully blocked.
        uint64_t row_curr = map[cy] | ctx->closed[cy];
        uint64_t row_top = ~0ULL;
        if (cy > 0) {
            row_top = map[cy - 1] | ctx->closed[cy - 1];
        }
        uint64_t row_bot = ~0ULL;
        if (cy < GRID_SIZE - 1) {
            row_bot = map[cy + 1] | ctx->closed[cy + 1];
        }
        uint64_t rows[3] = { row_top, row_curr, row_bot };
        for (int i = 0; i < 8; i++) {
            int nx = cx + dx[i];
            // 1. Grid bounds check: the unsigned cast folds nx < 0 and
            //    nx >= GRID_SIZE into a single comparison.
            if ((unsigned)nx >= GRID_SIZE) {
                continue;
            }
            // 2. Collision & closed check (bitwise register op)
            int ridx = dy[i] + 1;
            if (rows[ridx] & (1ULL << nx)) {
                continue;
            }
            int ny = cy + dy[i];
            uint16_t neighbor = castus(ny * GRID_SIZE + nx);
            uint32_t new_g = ctx->g_scores[cur] + dcost[i];
            // 3. Path improvement check
            if (new_g < ctx->g_scores[neighbor]) {
                ctx->parents[neighbor] = (uint8_t)i;
                uint32_t new_f = new_g + as_heuristic(nx, ny, goal_x, goal_y);
                as_heap_push_or_update(ctx, neighbor, new_g, new_f);
            }
        }
    }
    as_assert_ctx_state(ctx);
}
extern int
as_path(struct as_ctx *ctx, uint16_t *path) {
    // path must have room for up to MAP_BITS entries
    assert(ctx != NULL);
    assert(path != NULL);
    assert(ctx->was_found);
    as_assert_ctx_state(ctx);
    // Inverse directions for backtracking
    static const int8_t rev_dx[8] = { 0, 0, 1, -1, 1, 1, -1, -1 };
    static const int8_t rev_dy[8] = { 1, -1, 0, 0, 1, -1, 1, -1 };
    int path_len = 0;
    int node = (int)ctx->goal;
    int x = node & 63;
    int y = node >> 6;
    int steps = 0;
    // Walk parent links from the goal back to the start (g == 0); the step
    // counter guards against cycles in case of corrupted state
    while (ctx->g_scores[node] != 0 && ++steps < MAP_BITS) {
        path[path_len++] = castus(node);
        int didx = ctx->parents[node];
        x += rev_dx[didx];
        y += rev_dy[didx];
        node = y * GRID_SIZE + x;
    }
    path[path_len++] = castus(node); // Add start
    // Reverse in place so the path runs start -> goal
    for (int i = 0; i < path_len / 2; i++) {
        uint16_t tmp = path[i];
        path[i] = path[path_len - 1 - i];
        path[path_len - 1 - i] = tmp;
    }
    as_assert_ctx_state(ctx);
    return path_len;
}
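/*
 * Example usage: a minimal sketch, not part of the original pathfinder. The
 * obstacle layout (a wall down column 32 with gaps at the top and bottom
 * rows) and the start/goal coordinates are hypothetical; bit x of map[y]
 * being set marks cell (x, y) as blocked, matching as_solve() above.
 */
#include <stdio.h>

int main(void) {
    static struct as_ctx ctx;      // static storage honors the 64-byte alignment
    uint64_t map[GRID_SIZE] = {0}; // all cells walkable by default
    for (int y = 8; y < 56; y++) {
        map[y] |= 1ULL << 32;      // hypothetical wall at column 32, rows 8..55
    }
    as_solve(&ctx, map, 2, 2, 60, 60);
    if (ctx.was_found) {
        uint16_t path[MAP_BITS];   // as_path() may write up to MAP_BITS nodes
        int n = as_path(&ctx, path);
        for (int i = 0; i < n; i++) {
            printf("(%d,%d)\n", path[i] & 63, path[i] >> 6);
        }
    }
    return 0;
}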