mirror of
https://github.com/ggerganov/llama.cpp.git
synced 2025-04-20 03:49:02 +08:00
SYCL: using graphs is configurable by environment variable and compile option (#12371)
* alberto changes * enable sycl graphs by env variable * fixed compilation warnings in ggml-sycl.cpp * renamed graph variables * fix markdown in docs/backend/SYCL.md Co-authored-by: Romain Biessy <romain.biessy@codeplay.com> * fix markdown in docs/backend/SYCL.md again * compiling graphs by default, renamed graph_enable to graph_disable --------- Co-authored-by: Romain Biessy <romain.biessy@codeplay.com>
This commit is contained in:
parent
810e0af3f5
commit
35cae5ba05
@ -660,8 +660,9 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||||||
|--------------------|---------------------------------------|---------------------------------------------|
|
|--------------------|---------------------------------------|---------------------------------------------|
|
||||||
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
|
| GGML_SYCL | ON (mandatory) | Enable build with SYCL code path.<br>FP32 path - recommended for better perforemance than FP16 on quantized model|
|
||||||
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
|
| GGML_SYCL_TARGET | INTEL *(default)* \| NVIDIA \| AMD | Set the SYCL target device type. |
|
||||||
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
|
| GGML_SYCL_DEVICE_ARCH | Optional (except for AMD) | Set the SYCL device architecture, optional except for AMD. Setting the device architecture can improve the performance. See the table [--offload-arch](https://github.com/intel/llvm/blob/sycl/sycl/doc/design/OffloadDesign.md#--offload-arch) for a list of valid architectures. |
|
||||||
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
| GGML_SYCL_F16 | OFF *(default)* \|ON *(optional)* | Enable FP16 build with SYCL code path. |
|
||||||
|
| GGML_SYCL_GRAPH | ON *(default)* \|OFF *(Optional)* | Enable build with [SYCL Graph extension](https://github.com/intel/llvm/blob/sycl/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc). |
|
||||||
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
| CMAKE_C_COMPILER | `icx` *(Linux)*, `icx/cl` *(Windows)* | Set `icx` compiler for SYCL code path. |
|
||||||
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
| CMAKE_CXX_COMPILER | `icpx` *(Linux)*, `icx` *(Windows)* | Set `icpx/icx` compiler for SYCL code path. |
|
||||||
|
|
||||||
@ -671,6 +672,7 @@ use 1 SYCL GPUs: [0] with Max compute units:512
|
|||||||
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
|
|-------------------|------------------|---------------------------------------------------------------------------------------------------------------------------|
|
||||||
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
|
| GGML_SYCL_DEBUG | 0 (default) or 1 | Enable log function by macro: GGML_SYCL_DEBUG |
|
||||||
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
|
| GGML_SYCL_DISABLE_OPT | 0 (default) or 1 | Disable optimize features based on Intel GPU type, to compare the performance increase |
|
||||||
|
| GGML_SYCL_DISABLE_GRAPH | 0 or 1 (default) | Disable running computations through SYCL Graphs feature. Disabled by default because graph performance isn't yet better than non-graph performance. |
|
||||||
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
| ZES_ENABLE_SYSMAN | 0 (default) or 1 | Support to get free memory of GPU by sycl::aspect::ext_intel_free_memory.<br>Recommended to use when --split-mode = layer |
|
||||||
|
|
||||||
|
|
||||||
|
@ -186,6 +186,7 @@ option(GGML_OPENMP "ggml: use OpenMP"
|
|||||||
option(GGML_RPC "ggml: use RPC" OFF)
|
option(GGML_RPC "ggml: use RPC" OFF)
|
||||||
option(GGML_SYCL "ggml: use SYCL" OFF)
|
option(GGML_SYCL "ggml: use SYCL" OFF)
|
||||||
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
option(GGML_SYCL_F16 "ggml: use 16 bit floats for sycl calculations" OFF)
|
||||||
|
option(GGML_SYCL_GRAPH "ggml: enable graphs in the SYCL backend" ON)
|
||||||
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
set (GGML_SYCL_TARGET "INTEL" CACHE STRING
|
||||||
"ggml: sycl target device")
|
"ggml: sycl target device")
|
||||||
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
set (GGML_SYCL_DEVICE_ARCH "" CACHE STRING
|
||||||
|
@ -66,6 +66,9 @@ if (WIN32)
|
|||||||
find_package(MKL REQUIRED)
|
find_package(MKL REQUIRED)
|
||||||
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
target_link_libraries(ggml-sycl PRIVATE IntelSYCL::SYCL_CXX MKL::MKL MKL::MKL_SYCL)
|
||||||
else()
|
else()
|
||||||
|
if (GGML_SYCL_GRAPH)
|
||||||
|
add_compile_definitions(GGML_SYCL_GRAPH)
|
||||||
|
endif()
|
||||||
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
if (GGML_SYCL_TARGET STREQUAL "INTEL")
|
||||||
target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
target_link_libraries(ggml-sycl PRIVATE sycl OpenCL mkl_core pthread m dl mkl_sycl_blas mkl_intel_ilp64 mkl_tbb_thread)
|
||||||
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
elseif (GGML_SYCL_TARGET STREQUAL "NVIDIA")
|
||||||
|
@ -301,6 +301,7 @@ inline optimize_feature check_gpu_optimize_feature(syclex::architecture &arch) {
|
|||||||
return opt;
|
return opt;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
namespace sycl_ex = sycl::ext::oneapi::experimental;
|
||||||
struct ggml_backend_sycl_context {
|
struct ggml_backend_sycl_context {
|
||||||
int device;
|
int device;
|
||||||
std::string name;
|
std::string name;
|
||||||
@ -392,6 +393,10 @@ struct ggml_backend_sycl_context {
|
|||||||
return pool(device);
|
return pool(device);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
#ifdef GGML_SYCL_GRAPH
|
||||||
|
std::unique_ptr<sycl_ex::command_graph<sycl_ex::graph_state::executable>> exec_graph = nullptr;
|
||||||
|
#endif
|
||||||
|
|
||||||
ggml_sycl_pool & host_pool(int device) {
|
ggml_sycl_pool & host_pool(int device) {
|
||||||
if (host_pools[device] == nullptr) {
|
if (host_pools[device] == nullptr) {
|
||||||
host_pools[device] = new_pool_for_host(stream(device, 0), device);
|
host_pools[device] = new_pool_for_host(stream(device, 0), device);
|
||||||
|
@ -46,6 +46,7 @@
|
|||||||
static bool g_sycl_loaded = false;
|
static bool g_sycl_loaded = false;
|
||||||
int g_ggml_sycl_debug = 0;
|
int g_ggml_sycl_debug = 0;
|
||||||
int g_ggml_sycl_disable_optimize = 0;
|
int g_ggml_sycl_disable_optimize = 0;
|
||||||
|
int g_ggml_sycl_disable_graph = 0;
|
||||||
|
|
||||||
static ggml_sycl_device_info ggml_sycl_init() {
|
static ggml_sycl_device_info ggml_sycl_init() {
|
||||||
ggml_sycl_device_info info = {};
|
ggml_sycl_device_info info = {};
|
||||||
@ -191,10 +192,12 @@ static void ggml_check_sycl() try {
|
|||||||
if (!initialized) {
|
if (!initialized) {
|
||||||
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
g_ggml_sycl_debug = get_sycl_env("GGML_SYCL_DEBUG", 0);
|
||||||
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
g_ggml_sycl_disable_optimize= get_sycl_env("GGML_SYCL_DISABLE_OPT", 0);
|
||||||
|
g_ggml_sycl_disable_graph = get_sycl_env("GGML_SYCL_DISABLE_GRAPH", 1);
|
||||||
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
GGML_SYCL_DEBUG("[SYCL] call ggml_check_sycl\n");
|
||||||
GGML_LOG_INFO("Running with Environment Variables:\n");
|
GGML_LOG_INFO("Running with Environment Variables:\n");
|
||||||
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
GGML_LOG_INFO(" GGML_SYCL_DEBUG: %d\n", g_ggml_sycl_debug);
|
||||||
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_OPT: %d\n", g_ggml_sycl_disable_optimize);
|
||||||
|
GGML_LOG_INFO(" GGML_SYCL_DISABLE_GRAPH: %d\n", g_ggml_sycl_disable_graph);
|
||||||
GGML_LOG_INFO("Build with Macros:\n");
|
GGML_LOG_INFO("Build with Macros:\n");
|
||||||
#if defined(GGML_SYCL_FORCE_MMQ)
|
#if defined(GGML_SYCL_FORCE_MMQ)
|
||||||
GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
|
GGML_LOG_INFO(" GGML_SYCL_FORCE_MMQ: yes\n");
|
||||||
@ -3699,10 +3702,9 @@ static void optimize_graph_once(ggml_cgraph * cgraph, ggml_backend_sycl_context
|
|||||||
if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
|
if (ctx->opt_feature.reorder) opt_for_reorder(cgraph->nodes[i], stream);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
|
||||||
ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
|
|
||||||
ggml_sycl_set_main_device(sycl_ctx->device);
|
|
||||||
|
|
||||||
|
static void ggml_backend_sycl_graph_compute_impl(ggml_backend_sycl_context * sycl_ctx, ggml_cgraph * cgraph) {
|
||||||
|
ggml_sycl_set_main_device(sycl_ctx->device);
|
||||||
if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
|
if (!g_ggml_sycl_disable_optimize) optimize_graph_once(cgraph, sycl_ctx);
|
||||||
|
|
||||||
for (int i = 0; i < cgraph->n_nodes; i++) {
|
for (int i = 0; i < cgraph->n_nodes; i++) {
|
||||||
@ -3724,7 +3726,46 @@ static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_
|
|||||||
}
|
}
|
||||||
GGML_ASSERT(ok);
|
GGML_ASSERT(ok);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
static ggml_status ggml_backend_sycl_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
|
||||||
|
auto * sycl_ctx = static_cast<ggml_backend_sycl_context *>(backend->context);
|
||||||
|
|
||||||
|
#ifdef GGML_SYCL_GRAPH
|
||||||
|
if (!g_ggml_sycl_disable_graph) {
|
||||||
|
if (!sycl_ctx->exec_graph && !dpct::get_device(sycl_ctx->device).has(sycl::aspect::ext_oneapi_graph)) {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL-GRAPH] can not use graphs on device:%d\n", sycl_ctx->device);
|
||||||
|
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
|
||||||
|
return GGML_STATUS_SUCCESS;
|
||||||
|
}
|
||||||
|
|
||||||
|
sycl_ex::command_graph model_sycl_graph(*(sycl_ctx->stream()));
|
||||||
|
model_sycl_graph.begin_recording(*(sycl_ctx->stream()));
|
||||||
|
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
|
||||||
|
model_sycl_graph.end_recording();
|
||||||
|
|
||||||
|
if (!sycl_ctx->exec_graph) {
|
||||||
|
auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
|
||||||
|
sycl_ctx->exec_graph = std::make_unique<
|
||||||
|
sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
|
||||||
|
} else {
|
||||||
|
try {
|
||||||
|
sycl_ctx->exec_graph->update(model_sycl_graph);
|
||||||
|
GGML_SYCL_DEBUG("[SYCL-GRAPH] update success\n");
|
||||||
|
} catch (sycl::exception const & e) {
|
||||||
|
GGML_SYCL_DEBUG("[SYCL-GRAPH] Exception when updating graph, %s\n", e.what());
|
||||||
|
auto exec_graph = model_sycl_graph.finalize({sycl_ex::property::graph::updatable{}});
|
||||||
|
sycl_ctx->exec_graph = std::make_unique<
|
||||||
|
sycl_ex::command_graph<sycl_ex::graph_state::executable>>(exec_graph);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
sycl_ctx->stream()->ext_oneapi_graph(*(sycl_ctx->exec_graph));
|
||||||
|
} else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
ggml_backend_sycl_graph_compute_impl(sycl_ctx, cgraph);
|
||||||
|
}
|
||||||
return GGML_STATUS_SUCCESS;
|
return GGML_STATUS_SUCCESS;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
x
Reference in New Issue
Block a user