diff --git a/CMakeLists.txt b/CMakeLists.txt
index 567a58d..349cc4d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -203,6 +203,9 @@ if (WHISPER_BLAS)
     endif ()
 endif ()

+
+add_compile_definitions(GGML_USE_CUBLAS)
+
 if (WHISPER_CUBLAS)
     cmake_minimum_required(VERSION 3.17)

@@ -214,9 +217,7 @@ if (WHISPER_CUBLAS)
     enable_language(CUDA)

     set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
-
-    add_compile_definitions(GGML_USE_CUBLAS)
-
+
     if (WHISPER_STATIC)
         if (WIN32)
             # As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
@@ -483,6 +484,12 @@ add_library(${TARGET}

 include(DefaultTargetOptions)

+target_compile_options(${TARGET} PRIVATE
+    -Wno-unused-function
+    -Wimplicit-int
+    -Wint-conversion
+)
+
 target_include_directories(${TARGET} PUBLIC
     .
     )
diff --git a/ggml-backend.c b/ggml-backend.c
index 53e741c..4756a85 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -286,8 +286,12 @@ static void ggml_backend_registry_init(void) {

     // add forward decls here to avoid including the backend headers
 #ifdef GGML_USE_CUBLAS
-    extern void ggml_backend_cuda_reg_devices(void);
-    ggml_backend_cuda_reg_devices();
+    if (cuda_handle) {
+        void (*ggml_backend_cuda_reg_devices)(void) = dlsym(cuda_handle, "ggml_backend_cuda_reg_devices");
+        if (ggml_backend_cuda_reg_devices) {
+            ggml_backend_cuda_reg_devices();
+        }
+    }
 #endif

 #ifdef GGML_USE_METAL
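The hunk above replaces the direct call to ggml_backend_cuda_reg_devices() with a runtime lookup through cuda_handle: the symbol is resolved with dlsym() and only called when both the handle and the symbol are non-NULL. A minimal, self-contained sketch of that pattern follows; the library name and the symbol come from the patch, while the standalone main() and the build line ("cc demo.c -ldl") are illustrative only.

/* Sketch (not part of the patch) of the guarded dlopen/dlsym pattern used above. */
#include <dlfcn.h>
#include <stdio.h>

int main(void) {
    /* Library name taken from the patch; adjust the path as needed. */
    void * cuda_handle = dlopen("libwhisper_cuda_shared.so", RTLD_LAZY);
    if (!cuda_handle) {
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1; /* fall back to the CPU-only path */
    }

    /* Resolve the registration entry point exported by the CUDA library. */
    void (*reg_devices)(void) = (void (*)(void)) dlsym(cuda_handle, "ggml_backend_cuda_reg_devices");
    if (reg_devices) {
        reg_devices(); /* register CUDA devices with the backend registry */
    } else {
        fprintf(stderr, "dlsym failed: %s\n", dlerror());
    }

    dlclose(cuda_handle);
    return 0;
}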
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 7578d21..bb02610 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9,6 +9,7 @@
 #include <stdint.h>
 #include <stdio.h>
 #include <vector>
+#include <iostream>


 #if defined(GGML_USE_HIPBLAS)
@@ -6806,6 +6807,8 @@ bool ggml_cublas_loaded(void) {
 void ggml_init_cublas() {
     static bool initialized = false;

+    std::cout << "Initializing CUDA" << std::endl;
+
     if (!initialized) {

 #ifdef __HIP_PLATFORM_AMD__
@@ -6816,6 +6819,7 @@ void ggml_init_cublas() {
 #endif

         if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+            std::cout << "ERROR: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
             initialized = true;
             g_cublas_loaded = false;
             return;
@@ -6883,7 +6887,7 @@ void ggml_init_cublas() {

         // configure logging to stdout
         // CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-
+        std::cout << "CUDA initialized" << std::endl;
         initialized = true;
         g_cublas_loaded = true;
     }
@@ -9433,16 +9437,13 @@ void ggml_cuda_free_scratch() {

 bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
     if (!g_cublas_loaded) return false;
-
     ggml_cuda_func_t func;
     const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
         || (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
         || (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
-
     if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
         return false;
     }
-
     if (tensor->op == GGML_OP_MUL_MAT) {
         if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
 #ifndef NDEBUG
diff --git a/ggml-cuda.h b/ggml-cuda.h
index cdb0c0c..18ba59a 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -21,7 +21,7 @@ extern "C" {
 GGML_API void   ggml_init_cublas(void);

 // Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API bool   ggml_cublas_loaded(void);
+GGML_API bool   ggml_cublas_loaded(void);

 GGML_API void * ggml_cuda_host_malloc(size_t size);
 GGML_API void   ggml_cuda_host_free(void * ptr);
diff --git a/ggml.c b/ggml.c
index b124f14..332e837 100644
--- a/ggml.c
+++ b/ggml.c
@@ -109,6 +109,7 @@ typedef void * thread_ret_t;

 #include <sys/wait.h>

+
 void ggml_print_backtrace(void) {
     /*
     #include <execinfo.h>
@@ -2261,7 +2262,17 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
     }

 #if defined(GGML_USE_CUBLAS)
-    ggml_init_cublas();
+    cuda_handle = dlopen("libwhisper_cuda_shared.so", RTLD_LAZY);
+    if(cuda_handle) {
+        // load ggml_init_cublas from libwhisper_cuda_shared.so
+        void (*init_cublas)(void) = dlsym(cuda_handle, "ggml_init_cublas");
+        if(init_cublas) {
+            init_cublas();
+        }
+    } else {
+        GGML_PRINT("WARNING: libwhisper_cuda_shared.so not found, CUDA won't be used for Whisper\n");
+        GGML_PRINT("WARNING: %s\n", dlerror());
+    }
 #elif defined(GGML_USE_CLBLAST)
     ggml_cl_init();
 #endif
@@ -14425,9 +14436,12 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
     }

 #ifdef GGML_USE_CUBLAS
-    bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
-    if (skip_cpu) {
-        return;
+    if(cuda_handle) {
+        bool(*cuda_compute_forward)(const struct ggml_compute_params *, struct ggml_tensor *) = dlsym(cuda_handle, "ggml_cuda_compute_forward");
+        bool skip_cpu = cuda_compute_forward(params, tensor);
+        if (skip_cpu) {
+            return;
+        }
     }
     GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
     GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
@@ -16303,9 +16317,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
                 //printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);

 #if defined(GGML_USE_CUBLAS)
-                if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
-                    n_tasks = 1; // TODO: this actually is doing nothing
-                                 //       the threads are still spinning
+                if(cuda_handle) {
+                    bool (*cuda_can_mul_mat)(struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c) = dlsym(cuda_handle, "ggml_cuda_can_mul_mat");
+                    if (cuda_can_mul_mat(node->src[0], node->src[1], node)) {
+                        n_tasks = 1; // TODO: this actually is doing nothing
+                                     //       the threads are still spinning
+                    }
                 }
 #elif defined(GGML_USE_CLBLAST)
                 if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
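In the ggml.c hunks above, dlsym() is re-run inside ggml_compute_forward() and ggml_get_n_tasks() on every call, and the returned function pointers are called without a NULL check (unlike the ggml-backend.c hunk). A compile-only sketch of one possible refinement, not part of the patch: resolve the symbol once, cache it, and fall back to the CPU path when the handle or the symbol is missing. The opaque struct declarations stand in for the real ggml types.

#include <dlfcn.h>
#include <stdbool.h>
#include <stddef.h>

struct ggml_compute_params;   /* opaque stand-ins for the real ggml types */
struct ggml_tensor;

typedef bool (*cuda_forward_fn)(const struct ggml_compute_params *, struct ggml_tensor *);

/* Resolve ggml_cuda_compute_forward once, cache the result, and return false
 * (keep using the CPU path) when it cannot be resolved. */
static bool cuda_compute_forward_cached(void * cuda_handle,
                                        const struct ggml_compute_params * params,
                                        struct ggml_tensor * tensor) {
    static cuda_forward_fn fn = NULL;
    static bool resolved = false;

    if (!resolved && cuda_handle) {
        fn = (cuda_forward_fn) dlsym(cuda_handle, "ggml_cuda_compute_forward");
        resolved = true;
    }
    return fn ? fn(params, tensor) : false;
}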
diff --git a/ggml.h b/ggml.h
index 64f4e45..85fb039 100644
--- a/ggml.h
+++ b/ggml.h
@@ -246,7 +246,6 @@
         if (!(x)) { \
             fflush(stdout); \
             fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
-            ggml_print_backtrace(); \
             abort(); \
         } \
     } while (0)
@@ -261,6 +260,11 @@
 #define GGML_UNREACHABLE() ((void) 0)
 #endif

+#ifdef GGML_USE_CUBLAS
+    #include <dlfcn.h>
+    static void* cuda_handle;
+#endif
+
 // used to copy the number of elements and stride in bytes of tensors into local variables.
 // main purpose is to reduce code duplication and improve readability.
 //
@@ -635,8 +639,6 @@ extern "C" {
     GGML_API int64_t ggml_cycles(void);
     GGML_API int64_t ggml_cycles_per_ms(void);

-    GGML_API void    ggml_print_backtrace(void);
-
     GGML_API void    ggml_numa_init(void); // call once for better performance on NUMA systems
     GGML_API bool    ggml_is_numa(void); // true if init detected that system has >1 NUMA node

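One design note on the hunk above: declaring `static void* cuda_handle;` in ggml.h gives every translation unit that includes the header its own copy of the handle, and as far as this diff shows only the copy in ggml.c is assigned by ggml_init(). A sketch of the usual alternative, not part of the patch: declare the handle extern in the header and define it once, so ggml.c, ggml-backend.c and whisper.cpp all share the same dlopen() result.

/* Sketch of an alternative (not what the patch does). */

/* ggml.h */
#ifdef GGML_USE_CUBLAS
    #include <dlfcn.h>
    extern void * cuda_handle;   /* one shared handle, defined in ggml.c */
#endif

/* ggml.c */
#ifdef GGML_USE_CUBLAS
    void * cuda_handle = NULL;   /* set by ggml_init() via dlopen() */
#endif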
diff --git a/whisper.cpp b/whisper.cpp
index f6ba822..485c013 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -10,6 +10,7 @@

 #ifdef GGML_USE_CUBLAS
 #include "ggml-cuda.h"
+#include <dlfcn.h>
 #endif

 #ifdef WHISPER_USE_OPENVINO
@@ -1058,8 +1059,12 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params

     // initialize the backends
 #ifdef GGML_USE_CUBLAS
-    if (params.use_gpu && ggml_cublas_loaded()) {
+    // load ggml_cublas_loaded() function from the shared library
+    auto ggml_cublas_loaded = (bool (*)()) dlsym(cuda_handle, "ggml_cublas_loaded");
+    if (cuda_handle && params.use_gpu && ggml_cublas_loaded()) {
         WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
+        // load ggml_backend_cuda_init() function from the shared library
+        auto ggml_backend_cuda_init = (ggml_backend_t (*)(int)) dlsym(cuda_handle, "ggml_backend_cuda_init");
         backend_gpu = ggml_backend_cuda_init(0);
         if (!backend_gpu) {
             WHISPER_LOG_ERROR("%s: ggml_backend_cuda_init() failed\n", __func__);
@@ -4483,7 +4488,7 @@ static const std::vector<std::string> non_speech_tokens = {
     "\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
     "_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
     "---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
-    "♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯"
+    "♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯", "!", ".", ",", "?", "!", "¡", "¿", "‼", "⁉",
 };

 // process the logits for the selected decoder
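Since every CUDA entry point is now resolved from libwhisper_cuda_shared.so at runtime, a small diagnostic program can confirm that the library actually exports the symbols the patched code looks up. A sketch, not part of the patch; the library name and the symbol list are taken from the hunks above.

/* Diagnostic sketch: report any symbol the patched code needs that the
 * shared library fails to export. Build with: cc check_syms.c -ldl */
#include <dlfcn.h>
#include <stddef.h>
#include <stdio.h>

int main(void) {
    const char * symbols[] = {
        "ggml_init_cublas",
        "ggml_cublas_loaded",
        "ggml_backend_cuda_init",
        "ggml_backend_cuda_reg_devices",
        "ggml_cuda_compute_forward",
        "ggml_cuda_can_mul_mat",
    };

    void * handle = dlopen("libwhisper_cuda_shared.so", RTLD_LAZY);
    if (!handle) {
        fprintf(stderr, "dlopen failed: %s\n", dlerror());
        return 1;
    }

    int missing = 0;
    for (size_t i = 0; i < sizeof(symbols) / sizeof(symbols[0]); i++) {
        if (!dlsym(handle, symbols[i])) {
            fprintf(stderr, "missing symbol: %s\n", symbols[i]);
            missing++;
        }
    }

    dlclose(handle);
    return missing ? 1 : 0;
}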