A* Pathfinder optimized for 64x64 grids on ARM64 mobile SoCs
/*
 * A* Pathfinder optimized for 64x64 grids on ARM64 mobile SoCs.
 *
 * Features:
 * - Deterministic memory (Strict Indexed Priority Queue, no heap bloat).
 * - Cache-coherent SoA layout (Structure of Arrays); the ~52 KB context is
 *   sized to stay resident in the L1 data cache.
 * - Register-level map caching (loading rows into uint64_t registers).
 * - Fully NEON-accelerated initialization.
 * - Branchless heuristics and collision checking.
 * - Zero allocations (Context is reusable).
 */
// based on
// https://www.youtube.com/watch?v=hUZbkqLRYus
// https://www.youtube.com/watch?v=NAVbI1HIzCE
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
#include <assert.h>
#include <arm_neon.h>
#include <limits.h>
#define cast(t,p) ((t)(p))
#define castus(p) cast(uint16_t,p)
#define GRID_SIZE 64
#define MAP_BITS (GRID_SIZE * GRID_SIZE)
#define MAP_WORDS (MAP_BITS / 64)
#define HEAP_CAPACITY MAP_BITS
#define STRAIGHT_COST 100
#define DIAG_COST 141
#define DIAG_EXTRA (DIAG_COST - STRAIGHT_COST)
#define HEAP_INDEX_NONE 0xFFFF
// Reordered for perfect 128-bit (16-byte) NEON alignment
struct as_ctx {
  uint32_t g_scores[MAP_BITS];   // [16 KB] Offset: 0     (Aligned 64)
  uint32_t f_scores[MAP_BITS];   // [16 KB] Offset: 16384 (Aligned 16)
  uint16_t heap[HEAP_CAPACITY];  // [8 KB]  Offset: 32768 (Aligned 16)
  uint16_t indices[MAP_BITS];    // [8 KB]  Offset: 40960 (Aligned 16)
  uint64_t closed[MAP_WORDS];    // [512 B] Offset: 49152 (Aligned 16)
  uint8_t parents[MAP_BITS];     // [4 KB]  Offset: 49664 (Aligned 16)
  uint16_t heap_cnt;
  uint16_t goal;
  uint8_t was_found;
  uint8_t _padding[11];          // Rounds the scalar tail to 16 bytes; aligned(64) pads sizeof to a 64-byte multiple
} __attribute__((aligned(64)));
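/* Array payload: 16384 + 16384 + 8192 + 8192 + 512 + 4096 = 53,760 bytes (~52.5 KB);
 * with the 16-byte scalar tail and the aligned(64) attribute, sizeof(struct as_ctx)
 * rounds up to 53,824 bytes. */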
static_assert(sizeof(uint64_t) == 8, "64-bit platform required");
static_assert(MAP_BITS == 4096, "Fixed grid size mismatch");
static void
as_assert_ctx_state(const struct as_ctx *ctx) {
  assert(ctx != NULL);
  assert(ctx->heap_cnt <= HEAP_CAPACITY);
  for (int i = 0; i < ctx->heap_cnt; i++) {
    uint16_t node = ctx->heap[i];
    assert(node < MAP_BITS);
    assert(ctx->indices[node] == i);
  }
  for (int i = 0; i < MAP_BITS; i++) {
    uint32_t g = ctx->g_scores[i];
    uint8_t p = ctx->parents[i];
    assert(p <= 7);
    int word = i >> 6;
    int bit = i & 63;
    if (ctx->closed[word] & (1ULL << (unsigned)bit)) {
      assert(g != UINT_MAX);
    }
    if (ctx->indices[i] != HEAP_INDEX_NONE) {
      assert(ctx->indices[i] < ctx->heap_cnt);
      assert(ctx->heap[ctx->indices[i]] == i);
    }
  }
  if (ctx->was_found) {
    assert(ctx->goal < MAP_BITS);
    int word = ctx->goal >> 6;
    int bit = ctx->goal & 63;
    assert(ctx->closed[word] & (1ULL << (unsigned)bit));
  }
}
static inline int
as_abs(int v) {
  // Branchless absolute value
  int msk = v >> 31;
  return (v + msk) ^ msk;
}
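/* as_abs() example: for v = -5, msk = -1 (assumes an arithmetic right shift for
 * signed ints, which ARM64 compilers provide), so (v + msk) ^ msk = (-6) ^ (-1) = 5;
 * for v >= 0, msk = 0 and the value passes through unchanged. */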
static inline uint32_t
as_heuristic(int x1, int y1, int x2, int y2) {
  int dx = as_abs(x1 - x2);
  int dy = as_abs(y1 - y2);
  int min_d = (dx < dy ? dx : dy);
  int max_d = (dx + dy) - min_d;
  // 100 * max + 41 * min
  return (uint32_t)(STRAIGHT_COST * max_d + DIAG_EXTRA * min_d);
}
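/* as_heuristic() returns the octile distance in fixed-point cost units:
 * min(dx,dy) diagonal steps at 141 plus (max - min) straight steps at 100,
 * which simplifies to 100*max + 41*min. Example: dx = 5, dy = 3 ->
 * 100*5 + 41*3 = 623, i.e. 3 diagonals (3*141) plus 2 straights (2*100). */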
static inline void
as_heap_swap(struct as_ctx *ctx, int i, int j) {
  uint16_t n1 = ctx->heap[i];
  uint16_t n2 = ctx->heap[j];
  ctx->heap[i] = n2;
  ctx->heap[j] = n1;
  ctx->indices[n1] = castus(j);
  ctx->indices[n2] = castus(i);
}
static void
as_heap_sift_up(struct as_ctx *ctx, int idx) {
  while (idx > 0) {
    int parent = (idx - 1) >> 1;
    if (ctx->f_scores[ctx->heap[idx]] >= ctx->f_scores[ctx->heap[parent]]) {
      break;
    }
    as_heap_swap(ctx, idx, parent);
    idx = parent;
  }
}
static void
as_heap_push_or_update(struct as_ctx *ctx, uint16_t node, uint32_t g, uint32_t f) {
  assert(ctx != NULL);
  assert(node < MAP_BITS);
  uint16_t existing_idx = ctx->indices[node];
  ctx->g_scores[node] = g;
  ctx->f_scores[node] = f;
  if (existing_idx == HEAP_INDEX_NONE) {
    assert(ctx->heap_cnt < HEAP_CAPACITY);
    int idx = ctx->heap_cnt;
    ctx->heap[idx] = node;
    ctx->indices[node] = castus(idx);
    ctx->heap_cnt++;
    as_heap_sift_up(ctx, idx);
  } else {
    // Decrease Key: node is already in heap, just fix position
    as_heap_sift_up(ctx, existing_idx);
  }
}
static uint16_t
as_heap_pop(struct as_ctx *ctx) {
  assert(ctx->heap_cnt > 0);
  uint16_t res = ctx->heap[0];
  ctx->indices[res] = HEAP_INDEX_NONE;
  ctx->heap_cnt--;
  if (ctx->heap_cnt > 0) {
    ctx->heap[0] = ctx->heap[ctx->heap_cnt];
    ctx->indices[ctx->heap[0]] = 0;
    int idx = 0;
    int half = ctx->heap_cnt >> 1;
    while (idx < half) {
      int lo = idx;
      int lhs = (idx << 1) + 1;
      int rhs = lhs + 1;
      if (ctx->f_scores[ctx->heap[lhs]] < ctx->f_scores[ctx->heap[lo]]) {
        lo = lhs;
      }
      if (rhs < ctx->heap_cnt && ctx->f_scores[ctx->heap[rhs]] < ctx->f_scores[ctx->heap[lo]]) {
        lo = rhs;
      }
      if (lo == idx) {
        break;
      }
      as_heap_swap(ctx, idx, lo);
      idx = lo;
    }
  }
  return res;
}
extern void
as_solve(struct as_ctx *ctx, const uint64_t *map,
         int start_x, int start_y, int goal_x, int goal_y) {
  assert(ctx != NULL);
  assert(map != NULL);
  assert(0 <= start_x && start_x < GRID_SIZE);
  assert(0 <= start_y && start_y < GRID_SIZE);
  assert(0 <= goal_x && goal_x < GRID_SIZE);
  assert(0 <= goal_y && goal_y < GRID_SIZE);
  as_assert_ctx_state(ctx);
  static const int8_t dx[8] = { 0, 0, -1, 1, -1, -1, 1, 1 };
  static const int8_t dy[8] = {-1, 1,  0, 0, -1,  1, -1, 1 };
  static const uint8_t dcost[8] = {
    STRAIGHT_COST, STRAIGHT_COST, STRAIGHT_COST, STRAIGHT_COST,
    DIAG_COST, DIAG_COST, DIAG_COST, DIAG_COST
  };
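  // parents[] stores the index into dx/dy of the step taken from a node's
  // parent to that node; as_path() negates it via rev_dx/rev_dy to walk back.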
  int src = start_y * GRID_SIZE + start_x;
  int dst = goal_y * GRID_SIZE + goal_x;
  if ((map[start_y] & (1ULL << start_x)) || (map[goal_y] & (1ULL << goal_x))) {
    ctx->was_found = 0;
    as_assert_ctx_state(ctx);
    return;
  }
  uint64x2_t zero_u64 = vdupq_n_u64(0);
  for (int i = 0; i < MAP_WORDS; i += 2) {
    vst1q_u64(&ctx->closed[i], zero_u64);
  }
  uint32x4_t max_u32 = vdupq_n_u32(UINT_MAX);
  for (int i = 0; i < MAP_BITS; i += 4) {
    vst1q_u32(&ctx->g_scores[i], max_u32);
    vst1q_u32(&ctx->f_scores[i], max_u32);
  }
  uint16x8_t max_u16 = vdupq_n_u16(HEAP_INDEX_NONE);
  for (int i = 0; i < MAP_BITS; i += 8) {
    vst1q_u16(&ctx->indices[i], max_u16);
  }
  uint8x16_t zero_u8 = vdupq_n_u8(0);
  for (int i = 0; i < MAP_BITS; i += 16) {
    vst1q_u8(&ctx->parents[i], zero_u8);
  }
  ctx->heap_cnt = 0;
  ctx->was_found = 0;
  ctx->goal = 0;
  uint32_t h = as_heuristic(start_x, start_y, goal_x, goal_y);
  as_heap_push_or_update(ctx, castus(src), 0, h);
  while (ctx->heap_cnt > 0) {
    uint16_t cur = as_heap_pop(ctx);
    int cx = cur & 63;
    int cy = cur >> 6;
    ctx->closed[cy] |= (1ULL << cx);
    if (cur == dst) {
      ctx->was_found = 1;
      ctx->goal = cur;
      break;
    }
    // Load rows into registers to avoid repeated L1 loads inside the neighbor loop.
    // map[] is const (read-only), closed[] is modified only at bit (cx, cy) above.
    uint64_t row_curr = map[cy] | ctx->closed[cy];
    uint64_t row_top = ~0ULL;
    if (cy > 0) {
      row_top = map[cy - 1] | ctx->closed[cy - 1];
    }
    uint64_t row_bot = ~0ULL;
    if (cy < GRID_SIZE - 1) {
      row_bot = map[cy + 1] | ctx->closed[cy + 1];
    }
    uint64_t rows[3] = { row_top, row_curr, row_bot };
    for (int i = 0; i < 8; i++) {
      int nx = cx + dx[i];
      // 1. Grid Bounds Check (unsigned cast trick)
      if ((unsigned)nx >= GRID_SIZE) {
        continue;
      }
      // 2. Collision & Closed Check (Bitwise Register Op)
      int ridx = dy[i] + 1;
      if (rows[ridx] & (1ULL << nx)) {
        continue;
      }
      int ny = cy + dy[i];
      uint16_t neighbor = castus(ny * GRID_SIZE + nx);
      uint32_t new_g = ctx->g_scores[cur] + dcost[i];
      // 3. Path improvement check
      if (new_g < ctx->g_scores[neighbor]) {
        ctx->parents[neighbor] = (uint8_t)i;
        uint32_t new_f = new_g + as_heuristic(nx, ny, goal_x, goal_y);
        as_heap_push_or_update(ctx, neighbor, new_g, new_f);
      }
    }
  }
  as_assert_ctx_state(ctx);
}
extern int
as_path(struct as_ctx *ctx, uint16_t *path) {
  assert(ctx != NULL);
  assert(path != NULL);
  assert(ctx->was_found);
  as_assert_ctx_state(ctx);
  // Inverse Directions for Backtracking
  static const int8_t rev_dx[8] = { 0,  0, 1, -1, 1,  1, -1, -1 };
  static const int8_t rev_dy[8] = { 1, -1, 0,  0, 1, -1,  1, -1 };
  int path_len = 0;
  int node = (int)ctx->goal;
  int x = node & 63;
  int y = node >> 6;
  int steps = 0;
  while (ctx->g_scores[node] != 0 && ++steps < MAP_BITS) {
    path[path_len++] = castus(node);
    int didx = ctx->parents[node];
    x += rev_dx[didx];
    y += rev_dy[didx];
    node = y * GRID_SIZE + x;
  }
  path[path_len++] = castus(node); // Add start
  // Reverse Array
  for (int i = 0; i < path_len / 2; i++) {
    uint16_t tmp = path[i];
    path[i] = path[path_len - 1 - i];
    path[path_len - 1 - i] = tmp;
  }
  as_assert_ctx_state(ctx);
  return path_len;
}
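/*
 * Minimal usage sketch (illustrative only, not part of the pathfinder): builds
 * an obstacle bitmap, runs one query, and prints the path. The grid/route names
 * and the stdout output are assumptions made for this example. Before the first
 * call, indices[] is filled with HEAP_INDEX_NONE so that a freshly zeroed
 * context satisfies as_assert_ctx_state().
 */
#include <stdio.h>

static struct as_ctx g_ctx;

int main(void) {
  uint64_t grid[GRID_SIZE] = {0};        /* bit x of grid[y] set == cell (x,y) is blocked */
  grid[10] = 0x00000000FFFFFFF0ULL;      /* a wall across columns 4..31 of row 10 */

  for (int i = 0; i < MAP_BITS; i++) {   /* first-use init: mark every node as "not in heap" */
    g_ctx.indices[i] = HEAP_INDEX_NONE;
  }
  as_solve(&g_ctx, grid, 0, 0, 63, 63);  /* search from (0,0) to (63,63) */
  if (g_ctx.was_found) {
    static uint16_t route[MAP_BITS];     /* worst-case path length is MAP_BITS nodes */
    int len = as_path(&g_ctx, route);
    for (int i = 0; i < len; i++) {
      printf("(%d,%d)\n", route[i] & 63, route[i] >> 6);
    }
  }
  return 0;
}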