typesense/bazel/whisper.patch
Last commit 002b188b56 by Ozan Armağan (2024-06-05 14:53:39 +05:30): Force whisper to not transcribe punctuations (#1771)

* Force whisper to not transcribe punctuations
* Fix the test
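
The bulk of the patch below replaces direct calls into ggml's CUDA backend with runtime lookups through dlopen/dlsym, so the same binary can use libwhisper_cuda_shared.so when it is present and fall back to the CPU path otherwise. A minimal sketch of that pattern, assuming the library and symbol names used in the hunks below (the wrapper function itself is illustrative and not part of the patch):

    // Illustrative sketch only: load the optional CUDA backend at runtime,
    // mirroring the dlopen/dlsym calls the patch adds to ggml.c.
    #include <dlfcn.h>
    #include <cstdio>

    static void * cuda_handle = nullptr;

    static void try_init_cuda(void) {
        cuda_handle = dlopen("libwhisper_cuda_shared.so", RTLD_LAZY);
        if (cuda_handle == nullptr) {
            // Shared library missing: CUDA stays disabled and the CPU path is used.
            std::fprintf(stderr, "WARNING: %s\n", dlerror());
            return;
        }
        // Resolve the init entry point by name instead of linking against it.
        auto init_cublas = reinterpret_cast<void (*)(void)>(dlsym(cuda_handle, "ggml_init_cublas"));
        if (init_cublas != nullptr) {
            init_cublas();
        }
    }

The same handle is then reused wherever the original code called the CUDA backend directly (ggml-backend.c, ggml.c, whisper.cpp).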

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 567a58d..349cc4d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -203,6 +203,9 @@ if (WHISPER_BLAS)
endif ()
endif ()
+
+add_compile_definitions(GGML_USE_CUBLAS)
+
if (WHISPER_CUBLAS)
cmake_minimum_required(VERSION 3.17)
@@ -214,9 +217,7 @@ if (WHISPER_CUBLAS)
enable_language(CUDA)
set(GGML_SOURCES_CUDA ggml-cuda.cu ggml-cuda.h)
-
- add_compile_definitions(GGML_USE_CUBLAS)
-
+
if (WHISPER_STATIC)
if (WIN32)
# As of 12.3.1 CUDA Tookit for Windows does not offer a static cublas library
@@ -483,6 +484,12 @@ add_library(${TARGET}
include(DefaultTargetOptions)
+target_compile_options(${TARGET} PRIVATE
+ -Wno-unused-function
+ -Wimplicit-int
+ -Wint-conversion
+)
+
target_include_directories(${TARGET} PUBLIC
.
)
diff --git a/ggml-backend.c b/ggml-backend.c
index 53e741c..4756a85 100644
--- a/ggml-backend.c
+++ b/ggml-backend.c
@@ -286,8 +286,12 @@ static void ggml_backend_registry_init(void) {
// add forward decls here to avoid including the backend headers
#ifdef GGML_USE_CUBLAS
- extern void ggml_backend_cuda_reg_devices(void);
- ggml_backend_cuda_reg_devices();
+ if (cuda_handle) {
+ void (*ggml_backend_cuda_reg_devices)(void) = dlsym(cuda_handle, "ggml_backend_cuda_reg_devices");
+ if (ggml_backend_cuda_reg_devices) {
+ ggml_backend_cuda_reg_devices();
+ }
+ }
#endif
#ifdef GGML_USE_METAL
diff --git a/ggml-cuda.cu b/ggml-cuda.cu
index 7578d21..bb02610 100644
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -9,6 +9,7 @@
#include <stdint.h>
#include <stdio.h>
#include <vector>
+#include <iostream>
#if defined(GGML_USE_HIPBLAS)
@@ -6806,6 +6807,8 @@ bool ggml_cublas_loaded(void) {
void ggml_init_cublas() {
static bool initialized = false;
+ std::cout << "Initializing CUDA" << std::endl;
+
if (!initialized) {
#ifdef __HIP_PLATFORM_AMD__
@@ -6816,6 +6819,7 @@ void ggml_init_cublas() {
#endif
if (cudaGetDeviceCount(&g_device_count) != cudaSuccess) {
+ std::cout << "ERROR: " << cudaGetErrorString(cudaGetLastError()) << std::endl;
initialized = true;
g_cublas_loaded = false;
return;
@@ -6883,7 +6887,7 @@ void ggml_init_cublas() {
// configure logging to stdout
// CUBLAS_CHECK(cublasLoggerConfigure(1, 1, 0, nullptr));
-
+ std::cout << "CUDA initialized" << std::endl;
initialized = true;
g_cublas_loaded = true;
}
@@ -9433,16 +9437,13 @@ void ggml_cuda_free_scratch() {
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
if (!g_cublas_loaded) return false;
-
ggml_cuda_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
-
if (!any_on_device && tensor->op != GGML_OP_MUL_MAT && tensor->op != GGML_OP_MUL_MAT_ID) {
return false;
}
-
if (tensor->op == GGML_OP_MUL_MAT) {
if (tensor->src[0]->ne[3] != tensor->src[1]->ne[3]) {
#ifndef NDEBUG
diff --git a/ggml-cuda.h b/ggml-cuda.h
index cdb0c0c..18ba59a 100644
--- a/ggml-cuda.h
+++ b/ggml-cuda.h
@@ -21,7 +21,7 @@ extern "C" {
GGML_API void ggml_init_cublas(void);
// Returns `true` if there are available CUDA devices and cublas loads successfully; otherwise, it returns `false`.
-GGML_API bool ggml_cublas_loaded(void);
+GGML_API bool ggml_cublas_loaded(void);
GGML_API void * ggml_cuda_host_malloc(size_t size);
GGML_API void ggml_cuda_host_free(void * ptr);
diff --git a/ggml.c b/ggml.c
index b124f14..332e837 100644
--- a/ggml.c
+++ b/ggml.c
@@ -109,6 +109,7 @@ typedef void * thread_ret_t;
#include <sys/wait.h>
+
void ggml_print_backtrace(void) {
/*
#include <execinfo.h>
@@ -2261,7 +2262,17 @@ struct ggml_context * ggml_init(struct ggml_init_params params) {
}
#if defined(GGML_USE_CUBLAS)
- ggml_init_cublas();
+ cuda_handle = dlopen("libwhisper_cuda_shared.so", RTLD_LAZY);
+ if(cuda_handle) {
+ // load ggml_init_cublas from libwhisper_cuda_shared.so
+ void (*init_cublas)(void) = dlsym(cuda_handle, "ggml_init_cublas");
+ if(init_cublas) {
+ init_cublas();
+ }
+ } else {
+ GGML_PRINT("WARNING: libwhisper_cuda_shared.so not found, CUDA won't be used for Whisper\n");
+ GGML_PRINT("WARNING: %s\n", dlerror());
+ }
#elif defined(GGML_USE_CLBLAST)
ggml_cl_init();
#endif
@@ -14425,9 +14436,12 @@ static void ggml_compute_forward(struct ggml_compute_params * params, struct ggm
}
#ifdef GGML_USE_CUBLAS
- bool skip_cpu = ggml_cuda_compute_forward(params, tensor);
- if (skip_cpu) {
- return;
+ if(cuda_handle) {
+ bool(*cuda_compute_forward)(const struct ggml_compute_params *, struct ggml_tensor *) = dlsym(cuda_handle, "ggml_cuda_compute_forward");
+ bool skip_cpu = cuda_compute_forward(params, tensor);
+ if (skip_cpu) {
+ return;
+ }
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
@@ -16303,9 +16317,12 @@ static int ggml_get_n_tasks(struct ggml_tensor * node, int n_threads) {
//printf("nr0 = %8d, nr1 = %8d, nr0*nr1 = %8d, n_tasks%d\n", nr0, nr1, nr0*nr1, n_tasks);
#if defined(GGML_USE_CUBLAS)
- if (ggml_cuda_can_mul_mat(node->src[0], node->src[1], node)) {
- n_tasks = 1; // TODO: this actually is doing nothing
- // the threads are still spinning
+ if(cuda_handle) {
+ bool (*cuda_can_mul_mat)(struct ggml_tensor * a, struct ggml_tensor * b, struct ggml_tensor * c) = dlsym(cuda_handle, "ggml_cuda_can_mul_mat");
+ if (cuda_can_mul_mat(node->src[0], node->src[1], node)) {
+ n_tasks = 1; // TODO: this actually is doing nothing
+ // the threads are still spinning
+ }
}
#elif defined(GGML_USE_CLBLAST)
if (ggml_cl_can_mul_mat(node->src[0], node->src[1], node)) {
diff --git a/ggml.h b/ggml.h
index 64f4e45..85fb039 100644
--- a/ggml.h
+++ b/ggml.h
@@ -246,7 +246,6 @@
if (!(x)) { \
fflush(stdout); \
fprintf(stderr, "GGML_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
- ggml_print_backtrace(); \
abort(); \
} \
} while (0)
@@ -261,6 +260,11 @@
#define GGML_UNREACHABLE() ((void) 0)
#endif
+#ifdef GGML_USE_CUBLAS
+ #include <dlfcn.h>
+ static void* cuda_handle;
+#endif
+
// used to copy the number of elements and stride in bytes of tensors into local variables.
// main purpose is to reduce code duplication and improve readability.
//
@@ -635,8 +639,6 @@ extern "C" {
GGML_API int64_t ggml_cycles(void);
GGML_API int64_t ggml_cycles_per_ms(void);
- GGML_API void ggml_print_backtrace(void);
-
GGML_API void ggml_numa_init(void); // call once for better performance on NUMA systems
GGML_API bool ggml_is_numa(void); // true if init detected that system has >1 NUMA node
diff --git a/whisper.cpp b/whisper.cpp
index f6ba822..485c013 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -10,6 +10,7 @@
#ifdef GGML_USE_CUBLAS
#include "ggml-cuda.h"
+#include <dlfcn.h>
#endif
#ifdef WHISPER_USE_OPENVINO
@@ -1058,8 +1059,12 @@ static ggml_backend_t whisper_backend_init(const whisper_context_params & params
// initialize the backends
#ifdef GGML_USE_CUBLAS
- if (params.use_gpu && ggml_cublas_loaded()) {
+ // load ggml_cublas_loaded() function from the shared library
+ auto ggml_cublas_loaded = (bool (*)()) dlsym(cuda_handle, "ggml_cublas_loaded");
+ if (cuda_handle && params.use_gpu && ggml_cublas_loaded()) {
WHISPER_LOG_INFO("%s: using CUDA backend\n", __func__);
+ // load ggml_backend_cuda_init() function from the shared library
+ auto ggml_backend_cuda_init = (ggml_backend_t (*)(int)) dlsym(cuda_handle, "ggml_backend_cuda_init");
backend_gpu = ggml_backend_cuda_init(0);
if (!backend_gpu) {
WHISPER_LOG_ERROR("%s: ggml_backend_cuda_init() failed\n", __func__);
@@ -4483,7 +4488,7 @@ static const std::vector<std::string> non_speech_tokens = {
"\"", "#", "(", ")", "*", "+", "/", ":", ";", "<", "=", ">", "@", "[", "\\", "]", "^",
"_", "`", "{", "|", "}", "~", "「", "」", "『", "』", "<<", ">>", "<<<", ">>>", "--",
"---", "-(", "-[", "('", "(\"", "((", "))", "(((", ")))", "[[", "]]", "{{", "}}", "♪♪",
- "♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯"
+ "♪♪♪","♩", "♪", "♫", "♬", "♭", "♮", "♯", "!", ".", ",", "?", "!", "¡", "¿", "‼", "⁉",
};
// process the logits for the selected decoder
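
The final hunk is what the commit title refers to: punctuation characters are appended to whisper.cpp's non_speech_tokens list, which the decoder suppresses while processing logits, keeping them out of the transcript. A rough, hypothetical sketch of that kind of suppression (simplified, not the upstream implementation; the vocabulary map is a stand-in for the model's real token table):

    // Rough sketch: banning a token means forcing its logit to -infinity before
    // sampling, so the decoder can never emit it.
    #include <limits>
    #include <map>
    #include <string>
    #include <vector>

    static void suppress_tokens(std::vector<float> & logits,
                                const std::map<std::string, int> & vocab,
                                const std::vector<std::string> & banned) {
        for (const std::string & tok : banned) {
            const auto it = vocab.find(tok);
            if (it != vocab.end()) {
                logits[it->second] = -std::numeric_limits<float>::infinity();
            }
        }
    }

With "!", ".", ",", "?" and the other added characters in the banned list, the sampler never sees punctuation as a viable token, which is how the patch forces punctuation-free transcripts.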