Skip to content

Instantly share code, notes, and snippets.

@daitj
Last active February 3, 2026 10:18
Show Gist options
  • Select an option

  • Save daitj/cffc52658b05515ef5af52001fc27014 to your computer and use it in GitHub Desktop.

Select an option

Save daitj/cffc52658b05515ef5af52001fc27014 to your computer and use it in GitHub Desktop.
Build script for llama.cpp with both the CUDA and ROCm backends, currently targeting gfx906 (AMD MI50) and CUDA compute capability 7.5 (NVIDIA RTX 2080 Ti)
--- /usr/local/cuda-13.1/targets/x86_64-linux/include/crt/math_functions.h.org 2025-08-21 00:50:27.000000000 +0200
+++ /usr/local/cuda-13.1/targets/x86_64-linux/include/crt/math_functions.h 2025-11-30 14:27:58.286886587 +0100
@@ -629,7 +629,7 @@
*
* \note_accuracy_double
*/
-extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x) noexcept (true);
/**
* \ingroup CUDA_MATH_SINGLE
@@ -653,7 +653,7 @@
*
* \note_accuracy_single
*/
-extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x);
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x) noexcept (true);
#if defined(__QNX__) && !defined(_LIBCPP_VERSION)
namespace std {
@@ -2594,7 +2594,7 @@
*
* \note_accuracy_double
*/
-extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x) __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x) noexcept (true);
/**
* \ingroup CUDA_MATH_SINGLE
* \brief Calculate the sine of the input argument
@@ -2617,7 +2617,7 @@
*
* \note_accuracy_single
*/
-extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x) __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x) noexcept (true);
/**
* \ingroup CUDA_MATH_DOUBLE
* \brief Calculate the cosine of the input argument
@@ -2639,7 +2639,7 @@
*
* \note_accuracy_double
*/
-extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x) __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x) noexcept (true);
/**
* \ingroup CUDA_MATH_SINGLE
* \brief Calculate the cosine of the input argument
@@ -2661,7 +2661,7 @@
*
* \note_accuracy_single
*/
-extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x) __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER;
+extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x) noexcept (true);
#undef __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER
/**
#!/bin/bash
# Builds llama.cpp with both the HIP/ROCm backend (gfx906) and the CUDA
# backend (architecture 75) and installs the result, together with the CUDA
# runtime libraries it needs, into /home/user/.local/llama-duo.
#
# Run me in a podman/docker container.
# Prerequisites:
#   - cuda-repo-debian13-13-1-local_13.1.1-590.48.01-1_amd64.deb must exist at /apps
#   - cuda_math_functions_noexcept.patch must also be at /apps
#     (take it from the other file in this gist)

# Stop at the first failing step: without this a failed package install,
# patch, clone or build would be ignored and the later steps would install
# or copy stale/missing artifacts.
set -euo pipefail

# --- system packages: build tools + ROCm/HIP development libraries ---------
apt update
apt upgrade -y
apt install -y git cmake build-essential libcurl4-openssl-dev librocblas-dev hipcc libamdhip64-dev libhipblas-dev librocsolver0 librocsolver-dev

# --- CUDA toolkit from the local repository package ------------------------
dpkg -i /apps/cuda-repo-debian13-13-1-local_13.1.1-590.48.01-1_amd64.deb
cp /var/cuda-repo-debian13-13-1-local/cuda-*-keyring.gpg /usr/share/keyrings/
apt update
apt install -y cuda-libraries-13-1 cuda-compiler-13-1 libcublas-dev-13-1

# cuda needs a patch
# The file names inside the patch start with /usr/local/cuda-13.1/...; -p3
# strips "/usr/local/", so the patch has to be applied from /usr/local.
# -d makes that independent of the directory the script was started in.
patch -d /usr/local -p3 < /apps/cuda_math_functions_noexcept.patch

# --- fetch sources ----------------------------------------------------------
cd /tmp
rm -rf /tmp/llama.cpp
git clone --shallow-submodules --recurse-submodules --depth=1 https://github.com/ggml-org/llama.cpp /tmp/llama.cpp
cd /tmp/llama.cpp
# rocWMMA is cloned inside the llama.cpp checkout; its headers are referenced
# by absolute path below.
git clone --depth=1 --recurse-submodules --shallow-submodules -b rocm-6.4.3 https://github.com/ROCm/rocWMMA
rm -rf ./build
mkdir -p ./build
mkdir -p /home/user/.local/lib
mkdir -p /home/user/.local/bin

# Compilers picked up by CMake for the HIP and CUDA languages.
export HIPCXX=clang-21
export CUDACXX="/usr/local/cuda-13/bin/nvcc"

# rocwmma-version patch: write rocwmma-version.hpp by hand into the cloned
# tree. It is unclear whether this is still required with current sources.
cat > "/tmp/llama.cpp/rocWMMA/library/include/rocwmma/rocwmma-version.hpp" <<'EOF'
#ifndef ROCWMMA_VERSION_HPP
#define ROCWMMA_VERSION_HPP
#define ROCWMMA_VERSION_MAJOR 1
#define ROCWMMA_VERSION_MINOR 4
#define ROCWMMA_VERSION_PATCH 0
#define ROCWMMA_VERSION_STRING "1.4.0"
#endif // ROCWMMA_VERSION_HPP
EOF

# apply rocwmma patch: point the llama.cpp includes at the cloned rocWMMA
# headers by absolute path instead of relying on the include search path.
sed -i 's|#include <rocwmma/rocwmma-version.hpp>|#include "/tmp/llama.cpp/rocWMMA/library/include/rocwmma/rocwmma-version.hpp"|' ggml/src/ggml-cuda/vendors/hip.h
sed -i 's|#include <rocwmma/rocwmma.hpp>|#include "/tmp/llama.cpp/rocWMMA/library/include/rocwmma/rocwmma.hpp"|' ggml/src/ggml-cuda/fattn-wmma-f16.cu

# --- configure, build, install ---------------------------------------------
cmake -B build \
    -DCMAKE_INSTALL_PREFIX="/home/user/.local/llama-duo" \
    -DCMAKE_INSTALL_RPATH="/home/user/.local/llama-duo/lib" \
    -DGGML_NATIVE=OFF \
    -DCMAKE_BUILD_TYPE=RelWithDebInfo \
    -DGGML_BUILD_TESTS=OFF \
    -DGGML_BUILD_EXAMPLES=OFF \
    -DGGML_HIP=ON \
    -DGPU_TARGETS="gfx906" \
    -DGGML_HIP_ROCWMMA_FATTN=ON \
    -DGGML_USE_WMMA_FATTN=ON \
    -DCMAKE_CXX_FLAGS="-I/tmp/llama.cpp/rocWMMA/library/include" \
    -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="75" \
    -DGGML_RPC=ON \
    -DGGML_BACKEND_DL=ON \
    -DGGML_CPU_ALL_VARIANTS=ON
# --config is kept in sync with CMAKE_BUILD_TYPE above (it only matters for
# multi-config generators); parallelism follows the number of available CPUs.
cmake --build build --parallel "$(nproc)" --config RelWithDebInfo
cmake --install build

# rpc-server doesn't automatically get copied by the install step
cp ./build/bin/rpc-server /home/user/.local/llama-duo/bin/

# cuda libraries so that it would work outside of the container
cp /usr/local/cuda/targets/x86_64-linux/lib/libcublasLt.so.13 /home/user/.local/llama-duo/lib/
cp /usr/local/cuda/targets/x86_64-linux/lib/libcudart.so.13 /home/user/.local/llama-duo/lib/
cp /usr/local/cuda/targets/x86_64-linux/lib/libcublas.so.13 /home/user/.local/llama-duo/lib/
cp /usr/lib/x86_64-linux-gnu/libcuda.so.1 /home/user/.local/llama-duo/lib/
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment