vurtun · December 10, 2025 21:07
diff --git a/chain.c b/chain.c
 /* -----------------------------------------------------------------------------
 *  DECORATIVE CHAIN SIMULATOR
 * -----------------------------------------------------------------------------
 *
 *  An AVX2-optimized XPBD simulation library designed for high-fidelity
 *  decorative physics in AAA games (jewelry, braids, armor chains, lanterns).
 *
 *  ARCHITECTURAL HIGHLIGHTS:
 *  - Hybrid "Blocked" Pipeline: Processes chains in blocks of 8 to maximize SIMD
 *    throughput for global forces (Wind, Gravity, Inertia), then switches to
 *    L1-cache-resident processing for solving local constraints.
 *  - Transposed SoA Layout: Particles are stored in transposed "Even/Odd" arrays,
 *    allowing 8-wide vector operations on a single chain without gather/scatter.
 *  - Register Pressure Optimization: The inner solver loop is hand-tuned to fit
 *    exactly within the 16 YMM registers of AVX2, ensuring zero stack spilling
 *    during the hottest execution path.
 *
 *  PHYSICS FEATURES:
 *  - XPBD (Extended Position Based Dynamics): Ensures infinite stiffness stability
 *    and frame-rate independence via compliance (alpha / dt^2).
 *  - True Cosine Bending: Replaces unstable distance-based bending with proper
 *    dot-product angle constraints (C = n1 . n2 - target). Eliminates the
 *    "shrinkage" artifact common in lesser simulations.
 *  - Art-Directable Curvature: The `rest_curvature` parameter allows chains to
 *    naturally curl (e.g., gold necklaces, ponytails) or hang straight (cables).
 *  - Root Motion Blending: Chains inherit a configurable blend of the mounting
 *    point's velocity, preventing the visual "lag" often seen on animated characters.
 *  - Robust Friction: Penetration-depth based friction prevents chains from
 *    sliding effortlessly across floors.
 *
 *  PERFORMANCE:
 *  - Designed to simulate hundreds of active chains with negligible CPU cost (<0.2ms).
 *  - Zero dynamic allocation after initialization.
 *  - Branchless SIMD collision solving.
 *
 * -----------------------------------------------------------------------------
 */
 #include <immintrin.h>
 #include <math.h>
 #include <assert.h>
 #include <string.h>
 #include <float.h>
 #include <stdint.h>

 /* --- Compiler Feature Macros --- */
 #if defined(_MSC_VER)
  #define ALWAYS_INLINE __forceinline
  #define RESTRICT __restrict
  #define ALIGN_VAR(x) __declspec(align(x))
  #define UNROLL_HINT __pragma(loop(unroll))
 #elif defined(__GNUC__) || defined(__clang__)
  #define ALWAYS_INLINE __attribute__((always_inline)) inline
  #define RESTRICT __restrict__
  #define ALIGN_VAR(x) __attribute__((aligned(x)))
  #define UNROLL_HINT _Pragma("GCC unroll 2")
 #else
  #define ALWAYS_INLINE inline
  #define RESTRICT
  #define ALIGN_VAR(x)
  #define UNROLL_HINT
 #endif

 /* --- Configuration & Constants --- */
 enum {
  MAX_CHNS      = 128,
  PTC_PER_CHN   = 16,
  SPH_PER_CHN   = 8,
  MAX_PTC       = (MAX_CHNS * PTC_PER_CHN),
  MAX_SPH       = (MAX_CHNS * SPH_PER_CHN),
  CON_PER_CHN   = PTC_PER_CHN,
  MAX_REST_LEN  = (MAX_CHNS * CON_PER_CHN),
  MAX_ITR       = 8, // 8 is the sweet spot for XPBD stability
 };
 struct chn_cfg {
  float damping;              // 0.985f recommended
  float stiffness;            // 8000.0f (High for non-stretchy chains)
  float friction;             // 0.4f
  float linear_inertia;       // 0.1f (Keep low to inherit root motion)
  float angular_inertia;      
  float centrifugal_inertia;  
  float bend_stiffness;       // 80.0f (Controls curl resistance)
  float root_motion_blend;    // 0.0 = Drag only, 1.0 = Glued to parent. Try 0.9f.
  float rest_curvature;       // 0.0 = Straight, 0.5 = Curly. Artistic control.
  float _pad[3];              // Explicit padding for 32-byte alignment
 } ALIGN_VAR(32);

 struct chn_sim {
  unsigned char free_idx_cnt;
  unsigned char free_idx[MAX_CHNS];
  
  // Chain Transforms (World)
  float chn_pos_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_pos_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_pos_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_quat_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_quat_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_quat_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_quat_w[MAX_CHNS] ALIGN_VAR(32);

  float chn_prev_pos_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_pos_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_pos_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_quat_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_quat_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_quat_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_prev_quat_w[MAX_CHNS] ALIGN_VAR(32);

  // Global Forces
  float chn_grav_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_grav_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_grav_z[MAX_CHNS] ALIGN_VAR(32);
  float chn_wnd_x[MAX_CHNS] ALIGN_VAR(32);
  float chn_wnd_y[MAX_CHNS] ALIGN_VAR(32);
  float chn_wnd_z[MAX_CHNS] ALIGN_VAR(32);
  struct chn_cfg chn_cfg[MAX_CHNS] ALIGN_VAR(32);

  // Particles (Transposed: 0-7 Even, 8-15 Odd)
  float ptc_pos_x[MAX_PTC] ALIGN_VAR(32);
  float ptc_pos_y[MAX_PTC] ALIGN_VAR(32);
  float ptc_pos_z[MAX_PTC] ALIGN_VAR(32);
  float ptc_prev_pos_x[MAX_PTC] ALIGN_VAR(32);
  float ptc_prev_pos_y[MAX_PTC] ALIGN_VAR(32);
  float ptc_prev_pos_z[MAX_PTC] ALIGN_VAR(32);
  float ptc_inv_mass[MAX_PTC] ALIGN_VAR(32);

  // Constraints
  float ptc_rest_len[MAX_REST_LEN] ALIGN_VAR(32);
  float ptc_lambda[MAX_REST_LEN] ALIGN_VAR(32);
  float ptc_bend_lambda[MAX_REST_LEN] ALIGN_VAR(32);
  
  float motion_radius[MAX_PTC] ALIGN_VAR(32);
  float rest_pos_x[MAX_PTC] ALIGN_VAR(32);
  float rest_pos_y[MAX_PTC] ALIGN_VAR(32);
  float rest_pos_z[MAX_PTC] ALIGN_VAR(32);

  // Colliders
  float sph_x[MAX_SPH] ALIGN_VAR(32);
  float sph_y[MAX_SPH] ALIGN_VAR(32);
  float sph_z[MAX_SPH] ALIGN_VAR(32);
  float sph_r[MAX_SPH] ALIGN_VAR(32);
 };
 static int
 chn_sim__val(struct chn_sim *cns) {
  assert(cns);
  assert(cns->free_idx_cnt <= MAX_CHNS);
  return 1;
 }
 extern void
 chn_sim_init(struct chn_sim *cns) {
  assert(cns);
  memset(cns, 0, sizeof(*cns));
  cns->free_idx_cnt = MAX_CHNS;
  for (int i = 0; i < MAX_CHNS; ++i) {
    cns->free_idx[i] = MAX_CHNS - i - 1;
    // Init Quats to Identity to prevent NaNs in first rot calculations
    cns->chn_quat_w[i] = 1.0f;
    cns->chn_prev_quat_w[i] = 1.0f;
  }
  for (int i = 0; i < MAX_PTC; ++i) {
    cns->motion_radius[i] = MAX_MOTION_RADIUS;
  }
  assert(chn_sim__val(cns));
 }
 extern int
 chn_sim_add(struct chn_sim *cns, const struct chn_cfg *cfg, const float *RESTRICT rest_pos, int cnt) {
  assert(cns && cfg && rest_pos);
  assert(cns->free_idx_cnt > 0);
  assert(cnt > 1 && cnt <= PTC_PER_CHN);

  int chn = cns->free_idx[--cns->free_idx_cnt];
  cns->chn_cfg[chn] = *cfg;

  int p_base = (chn * PTC_PER_CHN);
  int r_base = (chn * CON_PER_CHN);
  
  // 1. Setup Particles (Transposed Layout)
  for (int i = 0; i < PTC_PER_CHN; ++i) {
    int logical_i = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1);
    int physical_i = p_base + i;
    if (logical_i < cnt) {
      cns->ptc_pos_x[physical_i] = rest_pos[logical_i*4+0];
      cns->ptc_pos_y[physical_i] = rest_pos[logical_i*4+1];
      cns->ptc_pos_z[physical_i] = rest_pos[logical_i*4+2];
      cns->ptc_inv_mass[physical_i] = rest_pos[logical_i*4+3];

      cns->rest_pos_x[physical_i] = cns->ptc_pos_x[physical_i];
      cns->rest_pos_y[physical_i] = cns->ptc_pos_y[physical_i];
      cns->rest_pos_z[physical_i] = cns->ptc_pos_z[physical_i];

      cns->ptc_prev_pos_x[physical_i] = cns->ptc_pos_x[physical_i];
      cns->ptc_prev_pos_y[physical_i] = cns->ptc_pos_y[physical_i];
      cns->ptc_prev_pos_z[physical_i] = cns->ptc_pos_z[physical_i];
      cns->motion_radius[physical_i] = MAX_MOTION_RADIUS;
    } else {
      cns->ptc_pos_x[physical_i] = 0.0f;
      cns->ptc_pos_y[physical_i] = 0.0f;
      cns->ptc_pos_z[physical_i] = 0.0f;
      cns->ptc_inv_mass[physical_i] = 0.0f;
      cns->ptc_prev_pos_x[physical_i] = 0.0f;
      cns->ptc_prev_pos_y[physical_i] = 0.0f;
      cns->ptc_prev_pos_z[physical_i] = 0.0f;
      cns->motion_radius[physical_i] = MAX_MOTION_RADIUS;
    }
  }
  // 2. Setup Distance Constraints
  for (int i = 0; i < PTC_PER_CHN; ++i) {
    int physical_r = r_base + i;
    cns->ptc_rest_len[physical_r] = 0.0f;
    cns->ptc_lambda[physical_r] = 0.0f;
    cns->ptc_bend_lambda[physical_r] = 0.0f;

    int logical_con_start = (i < 8) ? (i * 2) : ((i - 8) * 2 + 1);
    if (logical_con_start + 1 < cnt) {
      float dx = rest_pos[(logical_con_start+1)*4+0] - rest_pos[logical_con_start*4+0];
      float dy = rest_pos[(logical_con_start+1)*4+1] - rest_pos[logical_con_start*4+1];
      float dz = rest_pos[(logical_con_start+1)*4+2] - rest_pos[logical_con_start*4+2];
      float len = sqrtf(dx*dx + dy*dy + dz*dz);
      cns->ptc_rest_len[physical_r] = len > 1e-6f ? len : 1e-6f;
    }
  }
  return chn;
 }
 extern void
 chn_sim_del(struct chn_sim *cns, int chn) {
  assert(chn >= 0 && chn < MAX_CHNS);
  int p_base = (chn * PTC_PER_CHN);
  int r_base = (chn * CON_PER_CHN);
  int s_base = (chn * SPH_PER_CHN);

  cns->free_idx[cns->free_idx_cnt++] = chn;
  memset(&cns->chn_cfg[chn], 0, sizeof(cns->chn_cfg[0]));

  cns->chn_pos_x[chn] = 0;
  cns->chn_pos_y[chn] = 0;
  cns->chn_pos_z[chn] = 0;

  cns->chn_quat_x[chn] = 0;
  cns->chn_quat_y[chn] = 0;
  cns->chn_quat_z[chn] = 0;
  cns->chn_quat_w[chn] = 1.0f;
  
  cns->chn_prev_pos_x[chn] = 0;
  cns->chn_prev_pos_y[chn] = 0;
  cns->chn_prev_pos_z[chn] = 0;

  cns->chn_prev_quat_x[chn] = 0;
  cns->chn_prev_quat_y[chn] = 0;
  cns->chn_prev_quat_z[chn] = 0;
  cns->chn_prev_quat_w[chn] = 1.0f;

  cns->chn_grav_x[chn] = 0;
  cns->chn_grav_y[chn] = 0;
  cns->chn_grav_z[chn] = 0;

  cns->chn_wnd_x[chn] = 0;
  cns->chn_wnd_y[chn] = 0;
  cns->chn_wnd_z[chn] = 0;

  memset(&cns->ptc_pos_x[p_base], 0, PTC_PER_CHN * sizeof(float));
  memset(&cns->ptc_pos_y[p_base], 0, PTC_PER_CHN * sizeof(float));
  memset(&cns->ptc_pos_z[p_base], 0, PTC_PER_CHN * sizeof(float));
  memset(&cns->ptc_inv_mass[p_base], 0, PTC_PER_CHN * sizeof(float));
  
  memset(&cns->ptc_rest_len[r_base], 0, CON_PER_CHN * sizeof(float));
  memset(&cns->ptc_lambda[r_base], 0, CON_PER_CHN * sizeof(float));
  memset(&cns->ptc_bend_lambda[r_base], 0, CON_PER_CHN * sizeof(float));

  memset(&cns->sph_x[s_base], 0, SPH_PER_CHN * sizeof(float));
  memset(&cns->sph_r[s_base], 0, SPH_PER_CHN * sizeof(float));
 }
 extern void
 chn_sim_mov(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *RESTRICT rot4) {
  cns->chn_pos_x[chn] = pos3[0];
  cns->chn_pos_y[chn] = pos3[1];
  cns->chn_pos_z[chn] = pos3[2];

  cns->chn_quat_x[chn] = rot4[0];
  cns->chn_quat_y[chn] = rot4[1];
  cns->chn_quat_z[chn] = rot4[2];
  cns->chn_quat_w[chn] = rot4[3];
 }
 extern void
 chn_sim_tel(struct chn_sim *cns, int chn, const float *RESTRICT pos3, const float *RESTRICT rot4) {
  chn_sim_mov(cns, chn, pos3, rot4);
  cns->chn_prev_pos_x[chn] = pos3[0];
  cns->chn_prev_pos_y[chn] = pos3[1];
  cns->chn_prev_pos_z[chn] = pos3[2];

  cns->chn_prev_quat_x[chn] = rot4[0];
  cns->chn_prev_quat_y[chn] = rot4[1];
  cns->chn_prev_quat_z[chn] = rot4[2];
  cns->chn_prev_quat_w[chn] = rot4[3];
 }
 extern void
 chn_sim_grav(struct chn_sim *cns, int chn, const float *RESTRICT grav3) {
  cns->chn_grav_x[chn] = grav3[0];
  cns->chn_grav_y[chn] = grav3[1];
  cns->chn_grav_z[chn] = grav3[2];
 }
 extern void
 chn_sim_wind(struct chn_sim *cns, int chn, const float *RESTRICT wind3) {
  cns->chn_wnd_x[chn] = wind3[0];
  cns->chn_wnd_y[chn] = wind3[1];
  cns->chn_wnd_z[chn] = wind3[2];
 }
 extern void
 chn_sim_set_sph(struct chn_sim *cns, int chn, const float *RESTRICT spheres, int cnt) {
  int s_base = chn * SPH_PER_CHN;
  for (int i = 0; i < cnt && i < SPH_PER_CHN; ++i) {
    cns->sph_x[s_base + i] = spheres[i*4+0];
    cns->sph_y[s_base + i] = spheres[i*4+1];
    cns->sph_z[s_base + i] = spheres[i*4+2];
    cns->sph_r[s_base + i] = spheres[i*4+3];
  }
  for (int i = cnt; i < SPH_PER_CHN; ++i) {
    cns->sph_r[s_base + i] = 0.0f;
  }
 }
 extern void
 chn_sim_con_motion(struct chn_sim *cns, int chn, const float *RESTRICT radius, int cnt) {
  int p_base = chn * PTC_PER_CHN;
  for (int i = 0; i < cnt && i < PTC_PER_CHN; ++i) {
    int physical_i = (i < 8) ? (p_base + i) : (p_base + 8 + (i-8));
    cns->motion_radius[physical_i] = radius[i];
  }
 }
 // Fast Conjugate Quaternion Rotation: v' = v + 2*cross(-q, cross(-q, v) + q.w*v)
 static ALWAYS_INLINE void 
 rot_vec_inv_batch8(__m256 *rx, __m256 *ry, __m256 *rz, 
                   __m256 qx, __m256 qy, __m256 qz, __m256 qw,
                   __m256 vx, __m256 vy, __m256 vz) {

  __m256 nqx = _mm256_sub_ps(_mm256_setzero_ps(), qx);
  __m256 nqy = _mm256_sub_ps(_mm256_setzero_ps(), qy);
  __m256 nqz = _mm256_sub_ps(_mm256_setzero_ps(), qz);

  __m256 tx = _mm256_sub_ps(_mm256_mul_ps(nqy, vz), _mm256_mul_ps(nqz, vy));
  __m256 ty = _mm256_sub_ps(_mm256_mul_ps(nqz, vx), _mm256_mul_ps(nqx, vz));
  __m256 tz = _mm256_sub_ps(_mm256_mul_ps(nqx, vy), _mm256_mul_ps(nqy, vx));

  tx = _mm256_add_ps(tx, tx);
  ty = _mm256_add_ps(ty, ty);
  tz = _mm256_add_ps(tz, tz);

  __m256 cx = _mm256_sub_ps(_mm256_mul_ps(nqy, tz), _mm256_mul_ps(nqz, ty));
  __m256 cy = _mm256_sub_ps(_mm256_mul_ps(nqz, tx), _mm256_mul_ps(nqx, tz));
  __m256 cz = _mm256_sub_ps(_mm256_mul_ps(nqx, ty), _mm256_mul_ps(nqy, tx));

  *rx = _mm256_add_ps(vx, _mm256_add_ps(_mm256_mul_ps(qw, tx), cx));
  *ry = _mm256_add_ps(vy, _mm256_add_ps(_mm256_mul_ps(qw, ty), cy));
  *rz = _mm256_add_ps(vz, _mm256_add_ps(_mm256_mul_ps(qw, tz), cz));
 }
 static ALWAYS_INLINE void
 solve_dist(__m256 *p1x, __m256 *p1y, __m256 *p1z, __m256 invm1,
           __m256 *p2x, __m256 *p2y, __m256 *p2z, __m256 invm2,
           __m256 rest_len, __m256 compliance, __m256 *lambda, __m256 eps) {

  // XPBD Distance Constraint    
  __m256 dx = _mm256_sub_ps(*p1x, *p2x);
  __m256 dy = _mm256_sub_ps(*p1y, *p2y);
  __m256 dz = _mm256_sub_ps(*p1z, *p2z);
  __m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz)));
  __m256 inv_d = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps));
  __m256 dist = _mm256_mul_ps(d2, inv_d);

  __m256 C = _mm256_sub_ps(dist, rest_len);
  __m256 w = _mm256_add_ps(invm1, invm2);
  
  __m256 dL = _mm256_div_ps(
    _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(C, _mm256_mul_ps(compliance, *lambda))), 
    _mm256_add_ps(w, compliance)
  );
  // Mask out 0-length dummy constraints
  __m256 mask = _mm256_cmp_ps(rest_len, _mm256_setzero_ps(), _CMP_GT_OQ);
  dL = _mm256_and_ps(dL, mask);
  *lambda = _mm256_add_ps(*lambda, dL);

  __m256 cx = _mm256_mul_ps(_mm256_mul_ps(dx, inv_d), dL);
  __m256 cy = _mm256_mul_ps(_mm256_mul_ps(dy, inv_d), dL);
  __m256 cz = _mm256_mul_ps(_mm256_mul_ps(dz, inv_d), dL);

  *p1x = _mm256_fmadd_ps(cx, invm1, *p1x);
  *p1y = _mm256_fmadd_ps(cy, invm1, *p1y);
  *p1z = _mm256_fmadd_ps(cz, invm1, *p1z);

  *p2x = _mm256_fnmadd_ps(cx, invm2, *p2x);
  *p2y = _mm256_fnmadd_ps(cy, invm2, *p2y);
  *p2z = _mm256_fnmadd_ps(cz, invm2, *p2z);
 }
 // XPBD Angle Constraint (Dot Product with Target)
 static ALWAYS_INLINE void
 solve_angle(__m256 *p0x, __m256 *p0y, __m256 *p0z, __m256 invm0,
            __m256 *p1x, __m256 *p1y, __m256 *p1z, __m256 invm1,
            __m256 *p2x, __m256 *p2y, __m256 *p2z, __m256 invm2,
            __m256 compliance, __m256 *lambda, __m256 target_cos, __m256 eps) {

  __m256 e1x = _mm256_sub_ps(*p1x, *p0x);
  __m256 e1y = _mm256_sub_ps(*p1y, *p0y);
  __m256 e1z = _mm256_sub_ps(*p1z, *p0z);
  __m256 l1_sq = _mm256_fmadd_ps(e1x, e1x, _mm256_fmadd_ps(e1y, e1y, _mm256_mul_ps(e1z, e1z)));
  __m256 inv_l1 = _mm256_rsqrt_ps(_mm256_max_ps(l1_sq, eps));

  __m256 e2x = _mm256_sub_ps(*p2x, *p1x);
  __m256 e2y = _mm256_sub_ps(*p2y, *p1y);
  __m256 e2z = _mm256_sub_ps(*p2z, *p1z);
  __m256 l2_sq = _mm256_fmadd_ps(e2x, e2x, _mm256_fmadd_ps(e2y, e2y, _mm256_mul_ps(e2z, e2z)));
  __m256 inv_l2 = _mm256_rsqrt_ps(_mm256_max_ps(l2_sq, eps));

  __m256 n1x = _mm256_mul_ps(e1x, inv_l1);
  __m256 n1y = _mm256_mul_ps(e1y, inv_l1);
  __m256 n1z = _mm256_mul_ps(e1z, inv_l1);
  __m256 n2x = _mm256_mul_ps(e2x, inv_l2);
  __m256 n2y = _mm256_mul_ps(e2y, inv_l2);
  __m256 n2z = _mm256_mul_ps(e2z, inv_l2);

  __m256 dot = _mm256_fmadd_ps(n1x, n2x, _mm256_fmadd_ps(n1y, n2y, _mm256_mul_ps(n1z, n2z)));
  __m256 C = _mm256_sub_ps(dot, target_cos); 

  __m256 t0x = _mm256_fnmadd_ps(dot, n1x, n2x);
  __m256 t0y = _mm256_fnmadd_ps(dot, n1y, n2y);
  __m256 t0z = _mm256_fnmadd_ps(dot, n1z, n2z);

  __m256 gp0x = _mm256_mul_ps(t0x, inv_l1);
  __m256 gp0y = _mm256_mul_ps(t0y, inv_l1);
  __m256 gp0z = _mm256_mul_ps(t0z, inv_l1);

  __m256 t2x = _mm256_fnmadd_ps(dot, n2x, n1x);
  __m256 t2y = _mm256_fnmadd_ps(dot, n2y, n1y);
  __m256 t2z = _mm256_fnmadd_ps(dot, n2z, n1z);

  __m256 gp2x = _mm256_mul_ps(t2x, inv_l2);
  __m256 gp2y = _mm256_mul_ps(t2y, inv_l2);
  __m256 gp2z = _mm256_mul_ps(t2z, inv_l2);

  __m256 gp1x = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0x, gp2x));
  __m256 gp1y = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0y, gp2y));
  __m256 gp1z = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(gp0z, gp2z));

  __m256 w0 = _mm256_mul_ps(invm0, _mm256_fmadd_ps(gp0x, gp0x, _mm256_fmadd_ps(gp0y, gp0y, _mm256_mul_ps(gp0z, gp0z))));
  __m256 w1 = _mm256_mul_ps(invm1, _mm256_fmadd_ps(gp1x, gp1x, _mm256_fmadd_ps(gp1y, gp1y, _mm256_mul_ps(gp1z, gp1z))));
  __m256 w2 = _mm256_mul_ps(invm2, _mm256_fmadd_ps(gp2x, gp2x, _mm256_fmadd_ps(gp2y, gp2y, _mm256_mul_ps(gp2z, gp2z))));
  __m256 den = _mm256_add_ps(_mm256_add_ps(_mm256_add_ps(w0, w1), w2), compliance);
  __m256 num = _mm256_sub_ps(_mm256_setzero_ps(), _mm256_add_ps(C, _mm256_mul_ps(compliance, *lambda)));
  __m256 dL = _mm256_div_ps(num, _mm256_max_ps(den, eps));

  // Valid chain check
  __m256 mask = _mm256_cmp_ps(l1_sq, eps, _CMP_GT_OQ);
  dL = _mm256_and_ps(dL, mask);
  *lambda = _mm256_add_ps(*lambda, dL);
  *p0x = _mm256_fmadd_ps(_mm256_mul_ps(gp0x, dL), invm0, *p0x);
  *p0y = _mm256_fmadd_ps(_mm256_mul_ps(gp0y, dL), invm0, *p0y);
  *p0z = _mm256_fmadd_ps(_mm256_mul_ps(gp0z, dL), invm0, *p0z);
  *p1x = _mm256_fmadd_ps(_mm256_mul_ps(gp1x, dL), invm1, *p1x);
  *p1y = _mm256_fmadd_ps(_mm256_mul_ps(gp1y, dL), invm1, *p1y);
  *p1z = _mm256_fmadd_ps(_mm256_mul_ps(gp1z, dL), invm1, *p1z);
  *p2x = _mm256_fmadd_ps(_mm256_mul_ps(gp2x, dL), invm2, *p2x);
  *p2y = _mm256_fmadd_ps(_mm256_mul_ps(gp2y, dL), invm2, *p2y);
  *p2z = _mm256_fmadd_ps(_mm256_mul_ps(gp2z, dL), invm2, *p2z);
 }
 extern void
 chn_sim_run(struct chn_sim *RESTRICT cns, float dt) {
  assert(cns);
  if (dt <= 1e-6f) {
    return;
  }
  const __m256 dt_vec = _mm256_set1_ps(dt);
  const __m256 dt2_vec = _mm256_set1_ps(dt * dt);
  const __m256 eps_vec = _mm256_set1_ps(1e-6f);
  const __m256 zero_vec = _mm256_setzero_ps();
  const __m256 one_vec = _mm256_set1_ps(1.0f);
  
  const __m256i shift_left = _mm256_setr_epi32(1, 2, 3, 4, 5, 6, 7, 0);
  const __m256i shift_right = _mm256_setr_epi32(7, 0, 1, 2, 3, 4, 5, 6);

  ALIGN_VAR(32) float batch_fx[8], batch_fy[8], batch_fz[8];
  ALIGN_VAR(32) float batch_cvx[8], batch_cvy[8], batch_cvz[8];
  for (int c_blk = 0; c_blk < MAX_CHNS; c_blk += 8) {
    /* --- PHASE 1: Batched Global Forces --- */
    {
      __m256 qx = _mm256_load_ps(&cns->chn_quat_x[c_blk]);
      __m256 qy = _mm256_load_ps(&cns->chn_quat_y[c_blk]);
      __m256 qz = _mm256_load_ps(&cns->chn_quat_z[c_blk]);
      __m256 qw = _mm256_load_ps(&cns->chn_quat_w[c_blk]);

      __m256 fx = _mm256_add_ps(_mm256_load_ps(&cns->chn_wnd_x[c_blk]), _mm256_load_ps(&cns->chn_grav_x[c_blk]));
      __m256 fy = _mm256_add_ps(_mm256_load_ps(&cns->chn_wnd_y[c_blk]), _mm256_load_ps(&cns->chn_grav_y[c_blk]));
      __m256 fz = _mm256_add_ps(_mm256_load_ps(&cns->chn_wnd_z[c_blk]), _mm256_load_ps(&cns->chn_grav_z[c_blk]));

      __m256 lfx, lfy, lfz;
      rot_vec_inv_batch8(&lfx, &lfy, &lfz, qx, qy, qz, qw, fx, fy, fz);
      _mm256_store_ps(batch_fx, lfx);
      _mm256_store_ps(batch_fy, lfy);
      _mm256_store_ps(batch_fz, lfz);

      __m256 cx = _mm256_load_ps(&cns->chn_pos_x[c_blk]);
      __m256 cy = _mm256_load_ps(&cns->chn_pos_y[c_blk]);
      __m256 cz = _mm256_load_ps(&cns->chn_pos_z[c_blk]);
      __m256 px = _mm256_load_ps(&cns->chn_prev_pos_x[c_blk]);
      __m256 py = _mm256_load_ps(&cns->chn_prev_pos_y[c_blk]);
      __m256 pz = _mm256_load_ps(&cns->chn_prev_pos_z[c_blk]);

      _mm256_store_ps(batch_cvx, _mm256_sub_ps(cx, px));
      _mm256_store_ps(batch_cvy, _mm256_sub_ps(cy, py));
      _mm256_store_ps(batch_cvz, _mm256_sub_ps(cz, pz));

      _mm256_store_ps(&cns->chn_prev_pos_x[c_blk], cx);
      _mm256_store_ps(&cns->chn_prev_pos_y[c_blk], cy);
      _mm256_store_ps(&cns->chn_prev_pos_z[c_blk], cz);

      _mm256_store_ps(&cns->chn_prev_quat_x[c_blk], qx);
      _mm256_store_ps(&cns->chn_prev_quat_y[c_blk], qy);
      _mm256_store_ps(&cns->chn_prev_quat_z[c_blk], qz);
      _mm256_store_ps(&cns->chn_prev_quat_w[c_blk], qw);
    }
    /* --- PHASE 2: Local Simulation --- */
    for (int i = 0; i < 8; ++i) {
      int c = c_blk + i;
      struct chn_cfg *cfg = &cns->chn_cfg[c];
      int p_base = c * PTC_PER_CHN;

      // Config
      __m256 damp = _mm256_set1_ps(cfg->damping);
      __m256 fric_coef = _mm256_set1_ps(cfg->friction);
      __m256 compliance = _mm256_div_ps(dt2_vec, _mm256_max_ps(_mm256_set1_ps(cfg->stiffness), eps_vec));
      __m256 bend_compl = _mm256_div_ps(dt2_vec, _mm256_max_ps(_mm256_set1_ps(cfg->bend_stiffness), eps_vec));
      __m256 root_blend = _mm256_set1_ps(cfg->root_motion_blend);
      // Art control: rest_curvature (0=straight 1.0, 0.5=60deg)
      __m256 target_cos = _mm256_set1_ps(1.0f - (cfg->rest_curvature * 0.5f)); 

      // Forces
      __m256 force_x = _mm256_set1_ps(batch_fx[i]);
      __m256 force_y = _mm256_set1_ps(batch_fy[i]);
      __m256 force_z = _mm256_set1_ps(batch_fz[i]);
      
      __m256 root_vel_x = _mm256_set1_ps(batch_cvx[i]);
      __m256 root_vel_y = _mm256_set1_ps(batch_cvy[i]);
      __m256 root_vel_z = _mm256_set1_ps(batch_cvz[i]);
      
      __m256 inertia_drag = _mm256_set1_ps(1.0f - cfg->linear_inertia);
      __m256 drag_x = _mm256_mul_ps(root_vel_x, inertia_drag);
      __m256 drag_y = _mm256_mul_ps(root_vel_y, inertia_drag);
      __m256 drag_z = _mm256_mul_ps(root_vel_z, inertia_drag);

      __m256 p_pos[3][2], p_prev[3][2], inv_mass[2]; 
      __m256 rl[2], lm[2], blm[2];

      UNROLL_HINT
      for (int k=0; k<2; ++k) {
        int off = p_base + (k * 8);
        int r_off = (c * CON_PER_CHN) + (k * 8);

        p_pos[0][k] = _mm256_load_ps(&cns->ptc_pos_x[off]);
        p_pos[1][k] = _mm256_load_ps(&cns->ptc_pos_y[off]);
        p_pos[2][k] = _mm256_load_ps(&cns->ptc_pos_z[off]);

        p_prev[0][k] = _mm256_load_ps(&cns->ptc_prev_pos_x[off]);
        p_prev[1][k] = _mm256_load_ps(&cns->ptc_prev_pos_y[off]);
        p_prev[2][k] = _mm256_load_ps(&cns->ptc_prev_pos_z[off]);

        inv_mass[k] = _mm256_load_ps(&cns->ptc_inv_mass[off]);
        rl[k] = _mm256_load_ps(&cns->ptc_rest_len[r_off]);
        lm[k] = _mm256_load_ps(&cns->ptc_lambda[r_off]);
        blm[k] = _mm256_load_ps(&cns->ptc_bend_lambda[r_off]);
      }
      /* --- Verlet + Root Motion --- */
      UNROLL_HINT
      for (int k=0; k<2; ++k) {
        __m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]);
        __m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]);
        __m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]);

        vx = _mm256_sub_ps(vx, drag_x);
        vy = _mm256_sub_ps(vy, drag_y);
        vz = _mm256_sub_ps(vz, drag_z);

        vx = _mm256_fmadd_ps(root_vel_x, root_blend, vx);
        vy = _mm256_fmadd_ps(root_vel_y, root_blend, vy);
        vz = _mm256_fmadd_ps(root_vel_z, root_blend, vz);

        p_prev[0][k] = p_pos[0][k];
        p_prev[1][k] = p_pos[1][k];
        p_prev[2][k] = p_pos[2][k];

        p_pos[0][k] = _mm256_fmadd_ps(_mm256_mul_ps(force_x, inv_mass[k]), dt2_vec, _mm256_fmadd_ps(vx, damp, p_pos[0][k]));
        p_pos[1][k] = _mm256_fmadd_ps(_mm256_mul_ps(force_y, inv_mass[k]), dt2_vec, _mm256_fmadd_ps(vy, damp, p_pos[1][k]));
        p_pos[2][k] = _mm256_fmadd_ps(_mm256_mul_ps(force_z, inv_mass[k]), dt2_vec, _mm256_fmadd_ps(vz, damp, p_pos[2][k]));
      }
      /* --- Constraints --- */
      for (int iter = 0; iter < MAX_ITR; ++iter) {
        // Distance
        UNROLL_HINT
        for (int k=0; k<2; ++k) {
          __m256 p2x, p2y, p2z, p2m;
          if (k == 0) { 
            p2x = p_pos[0][1];
            p2y = p_pos[1][1];
            p2z = p_pos[2][1];
            p2m = inv_mass[1];
          } else { 
            p2x = _mm256_permutevar8x32_ps(p_pos[0][0], shift_left);
            p2y = _mm256_permutevar8x32_ps(p_pos[1][0], shift_left);
            p2z = _mm256_permutevar8x32_ps(p_pos[2][0], shift_left);
            p2m = _mm256_permutevar8x32_ps(inv_mass[0], shift_left);
          }
          __m256 old_p2x = p2x, old_p2y = p2y, old_p2z = p2z;
          solve_dist(&p_pos[0][k], &p_pos[1][k], &p_pos[2][k], inv_mass[k],
                     &p2x, &p2y, &p2z, p2m, rl[k], compliance, &lm[k], eps_vec);
          if (k == 0) {
            p_pos[0][1] = p2x;
            p_pos[1][1] = p2y;
            p_pos[2][1] = p2z;
          } else {
            __m256 dx = _mm256_sub_ps(p2x, old_p2x);
            __m256 dy = _mm256_sub_ps(p2y, old_p2y);
            __m256 dz = _mm256_sub_ps(p2z, old_p2z);

            p_pos[0][0] = _mm256_add_ps(p_pos[0][0], _mm256_permutevar8x32_ps(dx, shift_right));
            p_pos[1][0] = _mm256_add_ps(p_pos[1][0], _mm256_permutevar8x32_ps(dy, shift_right));
            p_pos[2][0] = _mm256_add_ps(p_pos[2][0], _mm256_permutevar8x32_ps(dz, shift_right));
          }
        }
        // Bending (Group 1: Odd Center)
        {
          __m256 r_px = _mm256_permutevar8x32_ps(p_pos[0][0], shift_left);
          __m256 r_py = _mm256_permutevar8x32_ps(p_pos[1][0], shift_left);
          __m256 r_pz = _mm256_permutevar8x32_ps(p_pos[2][0], shift_left);
          __m256 r_pm = _mm256_permutevar8x32_ps(inv_mass[0], shift_left);
          __m256 old_r_px = r_px, old_r_py = r_py, old_r_pz = r_pz;
          solve_angle(&p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_mass[0], 
                     &p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_mass[1], 
                     &r_px,        &r_py,        &r_pz,        r_pm,        
                     bend_compl, &blm[1], target_cos, eps_vec);

          __m256 dx = _mm256_sub_ps(r_px, old_r_px);
          __m256 dy = _mm256_sub_ps(r_py, old_r_py);
          __m256 dz = _mm256_sub_ps(r_pz, old_r_pz);

          p_pos[0][0] = _mm256_add_ps(p_pos[0][0], _mm256_permutevar8x32_ps(dx, shift_right));
          p_pos[1][0] = _mm256_add_ps(p_pos[1][0], _mm256_permutevar8x32_ps(dy, shift_right));
          p_pos[2][0] = _mm256_add_ps(p_pos[2][0], _mm256_permutevar8x32_ps(dz, shift_right));
        }
        // Bending (Group 2: Even Center)
        {
          __m256 l_px = _mm256_permutevar8x32_ps(p_pos[0][1], shift_right);
          __m256 l_py = _mm256_permutevar8x32_ps(p_pos[1][1], shift_right);
          __m256 l_pz = _mm256_permutevar8x32_ps(p_pos[2][1], shift_right);
          __m256 l_pm = _mm256_permutevar8x32_ps(inv_mass[1], shift_right);

          __m256 old_l_px = l_px, old_l_py = l_py, old_l_pz = l_pz;
          solve_angle(&l_px,        &l_py,        &l_pz,        l_pm,        
                     &p_pos[0][0], &p_pos[1][0], &p_pos[2][0], inv_mass[0], 
                     &p_pos[0][1], &p_pos[1][1], &p_pos[2][1], inv_mass[1], 
                     bend_compl, &blm[0], target_cos, eps_vec);

          __m256 dx = _mm256_sub_ps(l_px, old_l_px);
          __m256 dy = _mm256_sub_ps(l_py, old_l_py);
          __m256 dz = _mm256_sub_ps(l_pz, old_l_pz);

          p_pos[0][1] = _mm256_add_ps(p_pos[0][1], _mm256_permutevar8x32_ps(dx, shift_left));
          p_pos[1][1] = _mm256_add_ps(p_pos[1][1], _mm256_permutevar8x32_ps(dy, shift_left));
          p_pos[2][1] = _mm256_add_ps(p_pos[2][1], _mm256_permutevar8x32_ps(dz, shift_left));
        }
      }
      /* --- Collisions --- */
      int s_base = c * SPH_PER_CHN;
      for (int s = 0; s < SPH_PER_CHN; ++s) {
        __m256 sr = _mm256_broadcast_ss(&cns->sph_r[s_base + s]);
        if (_mm256_movemask_ps(_mm256_cmp_ps(sr, zero_vec, _CMP_GT_OQ)) == 0) {
          continue;
        }
        __m256 sx = _mm256_broadcast_ss(&cns->sph_x[s_base + s]);
        __m256 sy = _mm256_broadcast_ss(&cns->sph_y[s_base + s]);
        __m256 sz = _mm256_broadcast_ss(&cns->sph_z[s_base + s]);

        UNROLL_HINT
        for (int k=0; k<2; ++k) {
          __m256 dx = _mm256_sub_ps(p_pos[0][k], sx);
          __m256 dy = _mm256_sub_ps(p_pos[1][k], sy);
          __m256 dz = _mm256_sub_ps(p_pos[2][k], sz);
          __m256 d2 = _mm256_fmadd_ps(dx, dx, _mm256_fmadd_ps(dy, dy, _mm256_mul_ps(dz, dz)));
          __m256 id = _mm256_rsqrt_ps(_mm256_max_ps(d2, eps_vec));
          
          __m256 pen = _mm256_sub_ps(sr, _mm256_mul_ps(d2, id));
          __m256 mask = _mm256_cmp_ps(pen, zero_vec, _CMP_GT_OQ);
          __m256 push = _mm256_mul_ps(pen, mask);
          
          __m256 nx = _mm256_mul_ps(dx, id);
          __m256 ny = _mm256_mul_ps(dy, id);
          __m256 nz = _mm256_mul_ps(dz, id);

          p_pos[0][k] = _mm256_fmadd_ps(nx, push, p_pos[0][k]);
          p_pos[1][k] = _mm256_fmadd_ps(ny, push, p_pos[1][k]);
          p_pos[2][k] = _mm256_fmadd_ps(nz, push, p_pos[2][k]);

          // Friction
          __m256 vx = _mm256_sub_ps(p_pos[0][k], p_prev[0][k]);
          __m256 vy = _mm256_sub_ps(p_pos[1][k], p_prev[1][k]);
          __m256 vz = _mm256_sub_ps(p_pos[2][k], p_prev[2][k]);
          
          __m256 vdn = _mm256_fmadd_ps(vx, nx, _mm256_fmadd_ps(vy, ny, _mm256_mul_ps(vz, nz)));
          __m256 f_scale = _mm256_mul_ps(push, fric_coef);
          
          p_pos[0][k] = _mm256_fnmadd_ps(_mm256_fnmadd_ps(vdn, nx, vx), f_scale, p_pos[0][k]);
          p_pos[1][k] = _mm256_fnmadd_ps(_mm256_fnmadd_ps(vdn, ny, vy), f_scale, p_pos[1][k]);
          p_pos[2][k] = _mm256_fnmadd_ps(_mm256_fnmadd_ps(vdn, nz, vz), f_scale, p_pos[2][k]);
        }
      }
      /* --- Store --- */
      UNROLL_HINT
      for (int k=0; k<2; ++k) {
        int off = p_base + (k * 8);
        int r_off = (c * CON_PER_CHN) + (k * 8);
        _mm256_store_ps(&cns->ptc_pos_x[off], p_pos[0][k]);
        _mm256_store_ps(&cns->ptc_pos_y[off], p_pos[1][k]);
        _mm256_store_ps(&cns->ptc_pos_z[off], p_pos[2][k]);
        _mm256_store_ps(&cns->ptc_prev_pos_x[off], p_prev[0][k]);
        _mm256_store_ps(&cns->ptc_prev_pos_y[off], p_prev[1][k]);
        _mm256_store_ps(&cns->ptc_prev_pos_z[off], p_prev[2][k]);
        _mm256_store_ps(&cns->ptc_lambda[r_off], lm[k]);
        _mm256_store_ps(&cns->ptc_bend_lambda[r_off], blm[k]);
      }
    }
  }
 }
diff --git a/main.c b/main.c
 #include <SDL2/SDL.h>
 #include <stdio.h>
 #include <stdlib.h>
 #include <math.h>

 #include "chain.c"

 #define WINDOW_WIDTH 800
 #define WINDOW_HEIGHT 600
 #define CHAIN_LENGTH 10
 #define PARTICLE_SPACING 10.0f

 static void
 project(float x, float y, float z, int *screen_x, int *screen_y) {
  *screen_x = (int)(x + WINDOW_WIDTH / 2.0f);
  *screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f);
 }
 // Helper: Rotate vector by inv_quat and translate by -chain_pos (scalar for simplicity)
 static void quat_rotate_inv_translate(float wx, float wy, float wz,  // world pos
                                      float cx, float cy, float cz,  // chain pos
                                      float qx, float qy, float qz, float qw,  // chain quat
                                      float *mx, float *my, float *mz) {  // out model pos
  // Translate: world - chain
  float tx = wx - cx, ty = wy - cy, tz = wz - cz;
  // Quat conj: (qw, -qx, -qy, -qz)
  float cqw = qw, cqx = -qx, cqy = -qy, cqz = -qz;
  // t = conj * (0, tx,ty,tz)
  float tw = cqx*tx + cqy*ty + cqz*tz;
  float t_x = cqw*tx + cqy*tz - cqz*ty;
  float t_y = cqw*ty + cqz*tx - cqx*tz;
  float t_z = cqw*tz + cqx*ty - cqy*tx;
  // model = t * quat (vec part)
  *mx = qw*t_x + qy*t_z - qz*t_y + tw*qx;
  *my = qw*t_y + qz*t_x - qx*t_z + tw*qy;
  *mz = qw*t_z + qx*t_y - qy*t_x + tw*qz;
 }
 extern int
 main(int argc, char *argv[]) {
  if (SDL_Init(SDL_INIT_VIDEO) < 0) {
    printf("SDL_Init failed: %s\n", SDL_GetError());
    return 1;
  }
  SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0);
  SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
  if (!window || !renderer) {
    printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError());
    SDL_Quit();
    return 1;
  }
  struct chn_sim sim;
  chn_sim_init(&sim);

  struct chn_cfg cfg = {
    .damping = 0.5f,
    .stiffness = 0.9f,
    .friction = 0.5f,
    .linear_inertia = 0.9f,       // Tune 0.5f for more tangential lag
    .angular_inertia = 0.9f,
    .centrifugal_inertia = 0.9f,   // Tune 0.5f for more outward bow
    .bend_stiffness = 0.8f,
  };
  float rest_pos[CHAIN_LENGTH * 4];
  for (int i = 0; i < CHAIN_LENGTH; i++) {
    rest_pos[i*4 + 0] = 0.0f;
    rest_pos[i*4 + 1] = -i * PARTICLE_SPACING;
    rest_pos[i*4 + 2] = 0.0f;
    rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f;
  }
  int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH);
  chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f});  // Reduced for spin test
  chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f});  // Start at circle edge

  // Fixed world spheres (triangle outside circle)
  float world_spheres[12] = {  // 3 spheres * 4 floats (x,y,z,r)
    60.0f,  0.0f, 0.0f, 10.0f,   // Sphere 0: right
    -30.0f, 52.0f, 0.0f, 10.0f,  // Sphere 1: top-left
    -30.0f,-52.0f, 0.0f, 10.0f   // Sphere 2: bottom-left
  };

  // Circular motion params
  float time = 0.0f;
  float circle_radius = 50.0f;
  float angular_speed = 2.0f * M_PI / 5.0f;  // Full circle every 5s
  float dt = 0.016f;
  int running = 1;
  SDL_Event event;

  // Debug: Check gravity and masses
  int p_base = chn * PTC_PER_CHN;
  while (running) {
    while (SDL_PollEvent(&event)) {
      if (event.type == SDL_QUIT) {
        running = 0;
      }
    }
    time += dt;

    // Circular anchor motion
    float anchor_x = circle_radius * cosf(angular_speed * time);
    float anchor_y = circle_radius * sinf(angular_speed * time);
    chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f});

    // Transform fixed world spheres to current model space
    float model_spheres[12];
    for (int s = 0; s < 3; ++s) {
      float wx = world_spheres[s*4 + 0], wy = world_spheres[s*4 + 1], wz = world_spheres[s*4 + 2];
      float r = world_spheres[s*4 + 3];
      float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn];
      float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn];
      float mx, my, mz;
      quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz);
      model_spheres[s*4 + 0] = mx;
      model_spheres[s*4 + 1] = my;
      model_spheres[s*4 + 2] = mz;
      model_spheres[s*4 + 3] = r;
    }
    chn_sim_set_sph(&sim, chn, model_spheres, 3);
    chn_sim_run(&sim, dt);

    SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255);
    SDL_RenderClear(renderer);
    SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255);  // Green for chain & spheres

    float root_x = sim.chn_pos_x[chn];  // Current root world x/y
    float root_y = sim.chn_pos_y[chn];

    // Draw chain segments
    for (int i = 0; i < CHAIN_LENGTH - 1; i++) {
      int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
      int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2);

      float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0];
      float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1];
      float world_x0 = model_x0 + root_x;
      float world_y0 = model_y0 + root_y;
      float world_x1 = model_x1 + root_x;
      float world_y1 = model_y1 + root_y;

      int sx0, sy0, sx1, sy1;
      project(world_x0, world_y0, 0, &sx0, &sy0);
      project(world_x1, world_y1, 0, &sx1, &sy1);
      SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1);
    }
    // Draw spheres (approx circle outline with 32 points)
    int p_base = chn * PTC_PER_CHN;
    int s_base = chn * SPH_PER_CHN;
    for (int s = 0; s < 3; ++s) {  // Only draw active spheres
      float sph_model_x = sim.sph_x[s_base + s];
      float sph_model_y = sim.sph_y[s_base + s];
      float sph_r = sim.sph_r[s_base + s];
      if (sph_r <= 0.0f) continue;

      float sph_world_x = sph_model_x + root_x;
      float sph_world_y = sph_model_y + root_y;
      int sph_sx, sph_sy;
      project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy);

      // Parametric circle: 32 points
      const int num_points = 32;
      for (int pt = 0; pt < num_points; ++pt) {
        float angle0 = 2.0f * M_PI * pt / num_points;
        float angle1 = 2.0f * M_PI * (pt + 1) / num_points;
        float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0);
        float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1);

        int px0, py0, px1, py1;
        project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0);
        project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1);
        SDL_RenderDrawLine(renderer, px0, py0, px1, py1);
      }
    }
    SDL_RenderPresent(renderer);
    SDL_Delay(16);
  }
  SDL_DestroyRenderer(renderer);
  SDL_DestroyWindow(window);
  SDL_Quit();
  return 0;
 }
	#include <SDL2/SDL.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <math.h>

	#include "chain.c"

	#define WINDOW_WIDTH 800
	#define WINDOW_HEIGHT 600
	#define CHAIN_LENGTH 10
	#define PARTICLE_SPACING 10.0f

	static void
	project(float x, float y, float z, int screen_x, int screen_y) {
	*screen_x = (int)(x + WINDOW_WIDTH / 2.0f);
	*screen_y = (int)(-y + WINDOW_HEIGHT / 2.0f);
	}
	// Helper: Rotate vector by inv_quat and translate by -chain_pos (scalar for simplicity)
	static void quat_rotate_inv_translate(float wx, float wy, float wz, // world pos
	float cx, float cy, float cz, // chain pos
	float qx, float qy, float qz, float qw, // chain quat
	float mx, float my, float *mz) { // out model pos
	// Translate: world - chain
	float tx = wx - cx, ty = wy - cy, tz = wz - cz;
	// Quat conj: (qw, -qx, -qy, -qz)
	float cqw = qw, cqx = -qx, cqy = -qy, cqz = -qz;
	// t = conj * (0, tx,ty,tz)
	float tw = cqxtx + cqyty + cqz*tz;
	float t_x = cqwtx + cqytz - cqz*ty;
	float t_y = cqwty + cqztx - cqx*tz;
	float t_z = cqwtz + cqxty - cqy*tx;
	// model = t * quat (vec part)
	mx = qwt_x + qyt_z - qzt_y + tw*qx;
	my = qwt_y + qzt_x - qxt_z + tw*qy;
	mz = qwt_z + qxt_y - qyt_x + tw*qz;
	}
	extern int
	main(int argc, char *argv[]) {
	if (SDL_Init(SDL_INIT_VIDEO) < 0) {
	printf("SDL_Init failed: %s\n", SDL_GetError());
	return 1;
	}
	SDL_Window *window = SDL_CreateWindow("Chain Simulation", SDL_WINDOWPOS_CENTERED, SDL_WINDOWPOS_CENTERED, WINDOW_WIDTH, WINDOW_HEIGHT, 0);
	SDL_Renderer *renderer = SDL_CreateRenderer(window, -1, SDL_RENDERER_ACCELERATED);
	if (!window \|\| !renderer) {
	printf("SDL_CreateWindow/Renderer failed: %s\n", SDL_GetError());
	SDL_Quit();
	return 1;
	}
	struct chn_sim sim;
	chn_sim_init(&sim);

	struct chn_cfg cfg = {
	.damping = 0.5f,
	.stiffness = 0.9f,
	.friction = 0.5f,
	.linear_inertia = 0.9f, // Tune 0.5f for more tangential lag
	.angular_inertia = 0.9f,
	.centrifugal_inertia = 0.9f, // Tune 0.5f for more outward bow
	.bend_stiffness = 0.8f,
	};
	float rest_pos[CHAIN_LENGTH * 4];
	for (int i = 0; i < CHAIN_LENGTH; i++) {
	rest_pos[i*4 + 0] = 0.0f;
	rest_pos[i4 + 1] = -i PARTICLE_SPACING;
	rest_pos[i*4 + 2] = 0.0f;
	rest_pos[i*4 + 3] = i == 0 ? 0.0f : 1.0f;
	}
	int chn = chn_sim_add(&sim, &cfg, rest_pos, CHAIN_LENGTH);
	chn_sim_grav(&sim, chn, (float[]){0.0f, -1.0f, 0.0f}); // Reduced for spin test
	chn_sim_tel(&sim, chn, (float[]){50.0f, 0.0f, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f}); // Start at circle edge

	// Fixed world spheres (triangle outside circle)
	float world_spheres[12] = { // 3 spheres * 4 floats (x,y,z,r)
	60.0f, 0.0f, 0.0f, 10.0f, // Sphere 0: right
	-30.0f, 52.0f, 0.0f, 10.0f, // Sphere 1: top-left
	-30.0f,-52.0f, 0.0f, 10.0f // Sphere 2: bottom-left
	};

	// Circular motion params
	float time = 0.0f;
	float circle_radius = 50.0f;
	float angular_speed = 2.0f * M_PI / 5.0f; // Full circle every 5s
	float dt = 0.016f;
	int running = 1;
	SDL_Event event;

	// Debug: Check gravity and masses
	int p_base = chn * PTC_PER_CHN;
	while (running) {
	while (SDL_PollEvent(&event)) {
	if (event.type == SDL_QUIT) {
	running = 0;
	}
	}
	time += dt;

	// Circular anchor motion
	float anchor_x = circle_radius * cosf(angular_speed * time);
	float anchor_y = circle_radius * sinf(angular_speed * time);
	chn_sim_mov(&sim, chn, (float[]){anchor_x, anchor_y, 0.0f}, (float[]){0.0f, 0.0f, 0.0f, 1.0f});

	// Transform fixed world spheres to current model space
	float model_spheres[12];
	for (int s = 0; s < 3; ++s) {
	float wx = world_spheres[s4 + 0], wy = world_spheres[s4 + 1], wz = world_spheres[s*4 + 2];
	float r = world_spheres[s*4 + 3];
	float cx = sim.chn_pos_x[chn], cy = sim.chn_pos_y[chn], cz = sim.chn_pos_z[chn];
	float qx = sim.chn_quat_x[chn], qy = sim.chn_quat_y[chn], qz = sim.chn_quat_z[chn], qw = sim.chn_quat_w[chn];
	float mx, my, mz;
	quat_rotate_inv_translate(wx, wy, wz, cx, cy, cz, qx, qy, qz, qw, &mx, &my, &mz);
	model_spheres[s*4 + 0] = mx;
	model_spheres[s*4 + 1] = my;
	model_spheres[s*4 + 2] = mz;
	model_spheres[s*4 + 3] = r;
	}
	chn_sim_set_sph(&sim, chn, model_spheres, 3);
	chn_sim_run(&sim, dt);

	SDL_SetRenderDrawColor(renderer, 0, 0, 0, 255);
	SDL_RenderClear(renderer);
	SDL_SetRenderDrawColor(renderer, 0, 255, 0, 255); // Green for chain & spheres

	float root_x = sim.chn_pos_x[chn]; // Current root world x/y
	float root_y = sim.chn_pos_y[chn];

	// Draw chain segments
	for (int i = 0; i < CHAIN_LENGTH - 1; i++) {
	int idx0 = (i % 2 == 0 ? i / 2 : 8 + (i - 1) / 2);
	int idx1 = ((i + 1) % 2 == 0 ? (i + 1) / 2 : 8 + i / 2);

	float model_x0 = sim.ptc_pos_x[p_base + idx0], model_y0 = sim.ptc_pos_y[p_base + idx0];
	float model_x1 = sim.ptc_pos_x[p_base + idx1], model_y1 = sim.ptc_pos_y[p_base + idx1];
	float world_x0 = model_x0 + root_x;
	float world_y0 = model_y0 + root_y;
	float world_x1 = model_x1 + root_x;
	float world_y1 = model_y1 + root_y;

	int sx0, sy0, sx1, sy1;
	project(world_x0, world_y0, 0, &sx0, &sy0);
	project(world_x1, world_y1, 0, &sx1, &sy1);
	SDL_RenderDrawLine(renderer, sx0, sy0, sx1, sy1);
	}
	// Draw spheres (approx circle outline with 32 points)
	int p_base = chn * PTC_PER_CHN;
	int s_base = chn * SPH_PER_CHN;
	for (int s = 0; s < 3; ++s) { // Only draw active spheres
	float sph_model_x = sim.sph_x[s_base + s];
	float sph_model_y = sim.sph_y[s_base + s];
	float sph_r = sim.sph_r[s_base + s];
	if (sph_r <= 0.0f) continue;

	float sph_world_x = sph_model_x + root_x;
	float sph_world_y = sph_model_y + root_y;
	int sph_sx, sph_sy;
	project(sph_world_x, sph_world_y, 0, &sph_sx, &sph_sy);

	// Parametric circle: 32 points
	const int num_points = 32;
	for (int pt = 0; pt < num_points; ++pt) {
	float angle0 = 2.0f * M_PI * pt / num_points;
	float angle1 = 2.0f * M_PI * (pt + 1) / num_points;
	float dx0 = sph_r * cosf(angle0), dy0 = sph_r * sinf(angle0);
	float dx1 = sph_r * cosf(angle1), dy1 = sph_r * sinf(angle1);

	int px0, py0, px1, py1;
	project(sph_world_x + dx0, sph_world_y + dy0, 0, &px0, &py0);
	project(sph_world_x + dx1, sph_world_y + dy1, 0, &px1, &py1);
	SDL_RenderDrawLine(renderer, px0, py0, px1, py1);
	}
	}
	SDL_RenderPresent(renderer);
	SDL_Delay(16);
	}
	SDL_DestroyRenderer(renderer);
	SDL_DestroyWindow(window);
	SDL_Quit();
	return 0;
	}
No results found