Load ROCm SMI dynamically by default, optionally statically compile and link

This commit is contained in:
romner 2023-06-02 15:34:12 +02:00
parent 093edfe948
commit daaa45324f
2 changed files with 81 additions and 30 deletions

View file

@ -116,10 +116,6 @@ ifeq ($(PLATFORM_LC),linux)
PLATFORM_DIR := linux
THREADS := $(shell getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
SU_GROUP := root
ifneq ($(NO_GPU),true)
GPULDFLAGS += lib/rocm_smi_lib/build/rocm_smi/librocm_smi64.a
override ADDFLAGS += -DGPU_AMD
endif
else ifeq ($(PLATFORM_LC),freebsd)
PLATFORM_DIR := freebsd
THREADS := $(shell getconf NPROCESSORS_ONLN 2>/dev/null || echo 1)
@ -135,6 +131,15 @@ else
$(error $(shell printf "\033[1;91mERROR: \033[97mUnsupported platform ($(PLATFORM))\033[0m"))
endif
#? Static linking of ROCm SMI is opt-in (RSMI_STATIC=true) and only honoured on Linux:
#? there the static archive becomes the GPU linker input and -DRSMI_STATIC is passed to the compiler.
#? On every other platform the request is reset to false.
ifeq ($(RSMI_STATIC),true)
ifeq ($(PLATFORM_LC),linux)
GPULDFLAGS := lib/rocm_smi_lib/build/rocm_smi/librocm_smi64.a
override ADDFLAGS += -DRSMI_STATIC
else
override RSMI_STATIC = false
endif
endif
#? Use all CPU cores (will only be set if using Make 4.3+)
MAKEFLAGS := --jobs=$(THREADS)
ifeq ($(THREADS),1)
@ -306,7 +311,7 @@ uninstall:
-include $(OBJECTS:.$(OBJEXT)=.$(DEPEXT))
#? Compile rocm_smi
ifneq ($(NO_GPU),true)
ifeq ($(RSMI_STATIC),true)
.ONESHELL:
rocm_smi:
@printf "\n\033[1;92mBuilding ROCm SMI static library\033[37m...\033[0m\n"
@ -314,10 +319,10 @@ rocm_smi:
@mkdir -p lib/rocm_smi_lib/build
@cd lib/rocm_smi_lib/build
@$(QUIET) || printf "\033[1;97mRunning CMake...\033[0m\n"
@cmake .. $(SUPPRESS) || exit 1
@cmake .. $(SUPPRESS) || { printf "\033[1;91mCMake failed, continuing build without statically linking ROCm SMI\033[37m...\033[0m\n"; $(override RSMI_STATIC = false) $(LDFLAGS = $(filter-out $(GPULDFLAGS),$(LDFLAGS))) $(ADDFLAGS = $(filter-out -DRSMI_STATIC,$(ADDFLAGS))) exit 0; }
@$(QUIET) || printf "\n\033[1;97mBuilding and linking...\033[0m\n"
@$(MAKE) $(SUPPRESS) || exit 1
@ar -crs rocm_smi/librocm_smi64.a $$(find rocm_smi -name '*.o') $(SUPPRESS) || exit 1
@$(MAKE) $(SUPPRESS) || { printf "\033[1;91mMake failed, continuing build without statically linking ROCm SMI\033[37m...\033[0m\n"; $(override RSMI_STATIC = false) $(LDFLAGS = $(filter-out $(GPULDFLAGS),$(LDFLAGS))) $(ADDFLAGS = $(filter-out -DRSMI_STATIC,$(ADDFLAGS))) exit 0; }
@ar -crs rocm_smi/librocm_smi64.a $$(find rocm_smi -name '*.o') $(SUPPRESS) || { printf "\033[1;91mFailed to pack ROCm SMI into static library, continuing build without statically linking ROCm SMI\033[37m...\033[0m\n"; $(override RSMI_STATIC = false) $(LDFLAGS = $(filter-out $(GPULDFLAGS),$(LDFLAGS))) $(ADDFLAGS = $(filter-out -DRSMI_STATIC,$(ADDFLAGS))) exit 0; }
@printf "\033[1;92m100$(P)\033[10D\033[5C-> \033[1;37mrocm_smi/librocm_smi64.a \033[100D\033[38C\033[1;93m(\033[1;97m$$(du -ah rocm_smi/librocm_smi64.a | cut -f1)iB\033[1;93m)\033[0m\n"
@printf "\033[1;92mROCm SMI build complete in \033[92m(\033[97m$$($(DATE_CMD) -d @$$(expr $$(date +%s 2>/dev/null || echo "0") - $(TIMESTAMP) 2>/dev/null) -u +%Mm:%Ss 2>/dev/null | sed 's/^00m://' || echo "unknown")\033[92m)\033[0m\n"
else

View file

@ -29,9 +29,7 @@ tab-size = 4
#include <net/if.h>
#include <arpa/inet.h> // for inet_ntop()
#include <dlfcn.h>
#if defined(GPU_AMD)
#include <rocm_smi/rocm_smi.h>
#endif
#include <rocm_smi/rocm_smi.h>
#if !(defined(STATIC_BUILD) && defined(__GLIBC__))
@ -106,7 +104,7 @@ namespace Gpu {
//? NVIDIA data collection
namespace Nvml {
// NVML defines, structs & typedefs
//? NVML defines, structs & typedefs
#define NVML_DEVICE_NAME_BUFFER_SIZE 64
#define NVML_SUCCESS 0
#define NVML_TEMPERATURE_THRESHOLD_SHUTDOWN 0
@ -144,7 +142,7 @@ namespace Gpu {
nvmlReturn_t (*nvmlDeviceGetMemoryInfo)(nvmlDevice_t, nvmlMemory_t*);
nvmlReturn_t (*nvmlDeviceGetPcieThroughput)(nvmlDevice_t, nvmlPcieUtilCounter_t, unsigned int*);
//? Actual data
//? Data
void* nvml_dl_handle;
bool initialized = false;
bool init();
@ -156,6 +154,25 @@ namespace Gpu {
//? AMD data collection
namespace Rsmi {
#if !defined(RSMI_STATIC)
//? Function pointers
//? One pointer per ROCm SMI entry point; only declared when ROCm SMI is not statically linked.
//? Each pointer is resolved by name with dlsym() in init() before it is called.
rsmi_status_t (*rsmi_init)(uint64_t);
rsmi_status_t (*rsmi_shut_down)();
rsmi_status_t (*rsmi_num_monitor_devices)(uint32_t*);
rsmi_status_t (*rsmi_dev_name_get)(uint32_t, char*, size_t);
rsmi_status_t (*rsmi_dev_power_cap_get)(uint32_t, uint32_t, uint64_t*);
rsmi_status_t (*rsmi_dev_temp_metric_get)(uint32_t, uint32_t, rsmi_temperature_metric_t, int64_t*);
rsmi_status_t (*rsmi_dev_busy_percent_get)(uint32_t, uint32_t*);
rsmi_status_t (*rsmi_dev_memory_busy_percent_get)(uint32_t, uint32_t*);
rsmi_status_t (*rsmi_dev_gpu_clk_freq_get)(uint32_t, rsmi_clk_type_t, rsmi_frequencies_t*);
rsmi_status_t (*rsmi_dev_power_ave_get)(uint32_t, uint32_t, uint64_t*);
rsmi_status_t (*rsmi_dev_memory_total_get)(uint32_t, rsmi_memory_type_t, uint64_t*);
rsmi_status_t (*rsmi_dev_memory_usage_get)(uint32_t, rsmi_memory_type_t, uint64_t*);
rsmi_status_t (*rsmi_dev_pci_throughput_get)(uint32_t, uint64_t*, uint64_t*, uint64_t*);
//? Data
//? Handle returned by dlopen() for librocm_smi64.so in init(); released with dlclose() in shutdown()
void* rsmi_dl_handle;
#endif
bool initialized = false;
bool init();
bool shutdown();
@ -1161,11 +1178,49 @@ namespace Gpu {
//? AMD
namespace Rsmi {
bool init() {
#if defined(GPU_AMD)
if (initialized) return false;
rsmi_status_t result;
result = rsmi_init(0);
//? Dynamic loading & linking
#if !defined(RSMI_STATIC)
rsmi_dl_handle = dlopen("/opt/rocm/lib/librocm_smi64.so", RTLD_LAZY); // first try /lib and /usr/lib, then /opt/rocm/lib if that fails
if (dlerror() != NULL) {
rsmi_dl_handle = dlopen("librocm_smi64.so", RTLD_LAZY);
if (!rsmi_dl_handle) {
Logger::info(std::string("Failed to load librocm_smi64.so, AMD GPUs will not be detected: ") + dlerror());
return false;
}
}
auto load_rsmi_sym = [&](const char sym_name[]) {
auto sym = dlsym(rsmi_dl_handle, sym_name);
auto err = dlerror();
if (err != NULL) {
Logger::error(string("ROCm SMI: Couldn't find function ") + sym_name + ": " + err);
return (void*)nullptr;
} else return sym;
};
#define LOAD_SYM(NAME) if ((NAME = (decltype(NAME))load_rsmi_sym(#NAME)) == nullptr) return false
LOAD_SYM(rsmi_init);
LOAD_SYM(rsmi_shut_down);
LOAD_SYM(rsmi_num_monitor_devices);
LOAD_SYM(rsmi_dev_name_get);
LOAD_SYM(rsmi_dev_power_cap_get);
LOAD_SYM(rsmi_dev_temp_metric_get);
LOAD_SYM(rsmi_dev_busy_percent_get);
LOAD_SYM(rsmi_dev_memory_busy_percent_get);
LOAD_SYM(rsmi_dev_gpu_clk_freq_get);
LOAD_SYM(rsmi_dev_power_ave_get);
LOAD_SYM(rsmi_dev_memory_total_get);
LOAD_SYM(rsmi_dev_memory_usage_get);
LOAD_SYM(rsmi_dev_pci_throughput_get);
#undef LOAD_SYM
#endif
//? Function calls
rsmi_status_t result = rsmi_init(0);
if (result != RSMI_STATUS_SUCCESS) {
Logger::debug("Failed to initialize ROCm SMI, AMD GPUs will not be detected");
return false;
@ -1189,27 +1244,22 @@ namespace Gpu {
return true;
} else {initialized = true; shutdown(); return false;}
#else
return false;
#endif
}
bool shutdown() {
#if defined(GPU_AMD)
if (!initialized) return false;
if (rsmi_shut_down() == RSMI_STATUS_SUCCESS)
if (rsmi_shut_down() == RSMI_STATUS_SUCCESS) {
initialized = false;
else Logger::warning("Failed to shutdown ROCm SMI");
#if !defined(RSMI_STATIC)
dlclose(rsmi_dl_handle);
#endif
} else Logger::warning("Failed to shutdown ROCm SMI");
return true;
#else
return false;
#endif
}
template <bool is_init>
bool collect(gpu_info* gpus_slice) { // raw pointer to vector data, size == device_count, offset by Nvml::device_count elements
#if defined(GPU_AMD)
if (!initialized) return false;
rsmi_status_t result;
@ -1341,10 +1391,6 @@ namespace Gpu {
}
return true;
#else
(void)gpus_slice;
return false;
#endif
}
}