Last active
February 3, 2026 10:18
-
-
Save daitj/cffc52658b05515ef5af52001fc27014 to your computer and use it in GitHub Desktop.
llama.cpp builder for both the CUDA and ROCm backends, currently targeting gfx906 (AMD MI50) and CUDA compute capability 7.5 (NVIDIA RTX 2080 Ti)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| --- /usr/local/cuda-13.1/targets/x86_64-linux/include/crt/math_functions.h.org 2025-08-21 00:50:27.000000000 +0200 | |
| +++ /usr/local/cuda-13.1/targets/x86_64-linux/include/crt/math_functions.h 2025-11-30 14:27:58.286886587 +0100 | |
| @@ -629,7 +629,7 @@ | |
| * | |
| * \note_accuracy_double | |
| */ | |
| -extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x); | |
| +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double rsqrt(double x) noexcept (true); | |
| /** | |
| * \ingroup CUDA_MATH_SINGLE | |
| @@ -653,7 +653,7 @@ | |
| * | |
| * \note_accuracy_single | |
| */ | |
| -extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x); | |
| +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float rsqrtf(float x) noexcept (true); | |
| #if defined(__QNX__) && !defined(_LIBCPP_VERSION) | |
| namespace std { | |
| @@ -2594,7 +2594,7 @@ | |
| * | |
| * \note_accuracy_double | |
| */ | |
| -extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x) __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER; | |
| +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double sinpi(double x) noexcept (true); | |
| /** | |
| * \ingroup CUDA_MATH_SINGLE | |
| * \brief Calculate the sine of the input argument | |
| @@ -2617,7 +2617,7 @@ | |
| * | |
| * \note_accuracy_single | |
| */ | |
| -extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x) __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER; | |
| +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float sinpif(float x) noexcept (true); | |
| /** | |
| * \ingroup CUDA_MATH_DOUBLE | |
| * \brief Calculate the cosine of the input argument | |
| @@ -2639,7 +2639,7 @@ | |
| * | |
| * \note_accuracy_double | |
| */ | |
| -extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x) __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER; | |
| +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ double cospi(double x) noexcept (true); | |
| /** | |
| * \ingroup CUDA_MATH_SINGLE | |
| * \brief Calculate the cosine of the input argument | |
| @@ -2661,7 +2661,7 @@ | |
| * | |
| * \note_accuracy_single | |
| */ | |
| -extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x) __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER; | |
| +extern __DEVICE_FUNCTIONS_DECL__ __device_builtin__ float cospif(float x) noexcept (true); | |
| #undef __NV_IEC_60559_FUNCS_EXCEPTION_SPECIFIER | |
| /** |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Build llama.cpp with both the HIP (ROCm, gfx906) and CUDA (arch 75) backends
# in a single tree and install it together with the CUDA runtime libraries.
#
# Run me in a podman/docker container.
#
# Required files in ${APPS_DIR} (default: /apps):
#   - cuda-repo-debian13-13-1-local_13.1.1-590.48.01-1_amd64.deb
#   - cuda_math_functions_noexcept.patch (take it from another file in this gist)
#
# Optional environment overrides (defaults keep the original hard-coded paths):
#   APPS_DIR        directory holding the .deb and the patch      (/apps)
#   LLAMA_SRC       where llama.cpp is cloned and built           (/tmp/llama.cpp)
#   INSTALL_PREFIX  cmake install prefix                          (/home/user/.local/llama-duo)
#   BUILD_JOBS      parallel build jobs                           (number of CPUs)

# Stop at the first failing step instead of building on top of a broken state.
set -euo pipefail

APPS_DIR="${APPS_DIR:-/apps}"
LLAMA_SRC="${LLAMA_SRC:-/tmp/llama.cpp}"
INSTALL_PREFIX="${INSTALL_PREFIX:-/home/user/.local/llama-duo}"
BUILD_JOBS="${BUILD_JOBS:-$(nproc)}"

# Avoid interactive package configuration prompts inside the container.
export DEBIAN_FRONTEND=noninteractive

# --- system packages: build tools and ROCm/HIP development libraries ---
apt-get update
apt-get upgrade -y
apt-get install -y git cmake build-essential libcurl4-openssl-dev \
    hipcc libamdhip64-dev libhipblas-dev librocblas-dev \
    librocsolver0 librocsolver-dev

# --- CUDA toolkit from the local repository package ---
dpkg -i "${APPS_DIR}/cuda-repo-debian13-13-1-local_13.1.1-590.48.01-1_amd64.deb"
cp /var/cuda-repo-debian13-13-1-local/cuda-*-keyring.gpg /usr/share/keyrings/
apt-get update
apt-get install -y cuda-libraries-13-1 cuda-compiler-13-1 libcublas-dev-13-1

# --- CUDA needs a patch ---
# The patch uses absolute paths (/usr/local/cuda-13.1/...); -p3 strips
# "/usr/local/", so the remainder must be resolved relative to /usr/local
# no matter what the current directory is.
cuda_patch="${APPS_DIR}/cuda_math_functions_noexcept.patch"
if patch -d /usr/local -p3 -R --dry-run --force --silent < "${cuda_patch}" >/dev/null 2>&1; then
    # Reverse dry-run succeeded, so the header already carries the change.
    echo "CUDA math_functions.h patch already applied, skipping"
else
    patch -d /usr/local -p3 < "${cuda_patch}"
fi

# --- fetch sources ---
cd /tmp
rm -rf "${LLAMA_SRC}"
git clone --shallow-submodules --recurse-submodules --depth=1 \
    https://github.com/ggml-org/llama.cpp "${LLAMA_SRC}"
cd "${LLAMA_SRC}"
git clone --depth=1 --recurse-submodules --shallow-submodules -b rocm-6.4.3 \
    https://github.com/ROCm/rocWMMA "${LLAMA_SRC}/rocWMMA"

rm -rf ./build
mkdir -p ./build
mkdir -p /home/user/.local/lib
mkdir -p /home/user/.local/bin

export HIPCXX=clang-21
export CUDACXX="/usr/local/cuda-13/bin/nvcc"

# --- rocWMMA version header ---
# rocwmma-version patch; may not be needed anymore (unverified).
rocwmma_inc="${LLAMA_SRC}/rocWMMA/library/include"
cat > "${rocwmma_inc}/rocwmma/rocwmma-version.hpp" <<'EOF'
#ifndef ROCWMMA_VERSION_HPP
#define ROCWMMA_VERSION_HPP
#define ROCWMMA_VERSION_MAJOR 1
#define ROCWMMA_VERSION_MINOR 4
#define ROCWMMA_VERSION_PATCH 0
#define ROCWMMA_VERSION_STRING "1.4.0"
#endif // ROCWMMA_VERSION_HPP
EOF

# Apply rocWMMA patch: point the ggml sources at the checked-out headers by absolute path.
sed -i "s|#include <rocwmma/rocwmma-version.hpp>|#include \"${rocwmma_inc}/rocwmma/rocwmma-version.hpp\"|" \
    ggml/src/ggml-cuda/vendors/hip.h
sed -i "s|#include <rocwmma/rocwmma.hpp>|#include \"${rocwmma_inc}/rocwmma/rocwmma.hpp\"|" \
    ggml/src/ggml-cuda/fattn-wmma-f16.cu

# --- configure, build, install ---
cmake -B build \
    -DCMAKE_INSTALL_PREFIX="${INSTALL_PREFIX}" \
    -DCMAKE_INSTALL_RPATH="${INSTALL_PREFIX}/lib" \
    -DGGML_NATIVE=OFF \
    -DCMAKE_BUILD_TYPE=RelWithDebInfo \
    -DGGML_BUILD_TESTS=OFF \
    -DGGML_BUILD_EXAMPLES=OFF \
    -DGGML_HIP=ON \
    -DGPU_TARGETS="gfx906" \
    -DGGML_HIP_ROCWMMA_FATTN=ON \
    -DGGML_USE_WMMA_FATTN=ON \
    -DCMAKE_CXX_FLAGS="-I${rocwmma_inc}" \
    -DGGML_CUDA=ON -DCMAKE_CUDA_ARCHITECTURES="75" \
    -DGGML_RPC=ON \
    -DGGML_BACKEND_DL=ON \
    -DGGML_CPU_ALL_VARIANTS=ON
# Keep --config in line with CMAKE_BUILD_TYPE above (it only matters for
# multi-config generators, but "Debug" contradicted the configure step).
cmake --build build --parallel "${BUILD_JOBS}" --config RelWithDebInfo
cmake --install build

# --- extra artifacts ---
install_bin="${INSTALL_PREFIX}/bin"
install_lib="${INSTALL_PREFIX}/lib"
mkdir -p "${install_bin}" "${install_lib}"

# rpc-server doesn't automatically get copied by the install step.
cp ./build/bin/rpc-server "${install_bin}/"

# Bundle the CUDA libraries so the result works outside of the container.
cuda_lib_dir=/usr/local/cuda/targets/x86_64-linux/lib
cp "${cuda_lib_dir}/libcublasLt.so.13" "${install_lib}/"
cp "${cuda_lib_dir}/libcudart.so.13" "${install_lib}/"
cp "${cuda_lib_dir}/libcublas.so.13" "${install_lib}/"
cp /usr/lib/x86_64-linux-gnu/libcuda.so.1 "${install_lib}/"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment