mirror of
https://github.com/aristocratos/btop.git
synced 2024-05-15 18:03:06 +12:00
Add ROCm SMI backend for AMD GPU support
This commit is contained in:
parent
917d568a77
commit
c352bf2613
4
Makefile
4
Makefile
|
@ -164,8 +164,8 @@ override REQFLAGS := -std=c++20
|
|||
WARNFLAGS := -Wall -Wextra -pedantic
|
||||
OPTFLAGS := -O2 -ftree-vectorize -flto=$(LTO)
|
||||
LDCXXFLAGS := -pthread -D_FORTIFY_SOURCE=2 -D_GLIBCXX_ASSERTIONS -D_FILE_OFFSET_BITS=64 $(GOODFLAGS) $(ADDFLAGS)
|
||||
GPUCXXFLAGS := -I/opt/cuda/include # TODO: there has to be a better way to link NVML than hardcoded dirs
|
||||
GPULDFLAGS := -L/usr/lib64 -lnvidia-ml
|
||||
GPUCXXFLAGS := -I/opt/cuda/include -I/opt/rocm/include # TODO: there has to be a better way to link these libs than hardcoded dirs
|
||||
GPULDFLAGS := -L/opt/cuda/lib -L/opt/rocm/lib -lnvidia-ml -lrocm_smi64
|
||||
override CXXFLAGS += $(REQFLAGS) $(LDCXXFLAGS) $(OPTFLAGS) $(WARNFLAGS) $(GPUCXXFLAGS)
|
||||
override LDFLAGS += $(LDCXXFLAGS) $(OPTFLAGS) $(WARNFLAGS) $(GPULDFLAGS)
|
||||
INC := $(foreach incdir,$(INCDIRS),-isystem $(incdir)) -I$(SRCDIR)
|
||||
|
|
|
@ -262,6 +262,7 @@ void clean_quit(int sig) {
|
|||
}
|
||||
|
||||
Gpu::Nvml::shutdown();
|
||||
Gpu::Rsmi::shutdown();
|
||||
|
||||
Config::write();
|
||||
|
||||
|
@ -537,7 +538,7 @@ namespace Runner {
|
|||
if (Global::debug) debug_timer("gpu", draw_begin);
|
||||
|
||||
//? Draw box
|
||||
if (not pause_output and Gpu::Nvml::initialized) output += Gpu::draw(gpus, conf.force_redraw, conf.no_update);
|
||||
if (not pause_output and not Gpu::gpu_names.empty()) output += Gpu::draw(gpus, conf.force_redraw, conf.no_update);
|
||||
|
||||
if (Global::debug) debug_timer("gpu", draw_done);
|
||||
}
|
||||
|
|
|
@ -757,7 +757,7 @@ namespace Gpu {
|
|||
string out;
|
||||
out.reserve(width * height);
|
||||
|
||||
auto gpu = gpus[0]; // TODO: mutli-gpu support
|
||||
auto gpu = gpus[0]; // TODO: multi-gpu support
|
||||
|
||||
//* Redraw elements not needed to be updated every cycle
|
||||
if (redraw) {
|
||||
|
@ -781,7 +781,7 @@ namespace Gpu {
|
|||
|
||||
pwr_meter = Draw::Meter{b_width - 24, "cached"};
|
||||
|
||||
mem_util_graph = Draw::Graph{b_width/2 - 1, 2, "free", gpu.mem_utilization_percent, graph_symbol, 0, 0, 100, 4}; // offset so the graph isn't empty at 0-5% I/O
|
||||
mem_util_graph = Draw::Graph{b_width/2 - 1, 2, "free", gpu.mem_utilization_percent, graph_symbol, 0, 0, 100, 4}; // offset so the graph isn't empty at 0-5% utilization
|
||||
mem_used_graph = Draw::Graph{b_width/2 - 2, 4, "used", gpu.mem_used_percent, graph_symbol};
|
||||
}
|
||||
|
||||
|
@ -813,8 +813,9 @@ namespace Gpu {
|
|||
|
||||
//? Power usage meter, power state
|
||||
out += Mv::to(b_y + 2, b_x + 1) + Theme::c("main_fg") + Fx::b + "PWR " + pwr_meter(gpu.pwr_percent.back())
|
||||
+ Theme::g("cached").at(gpu.pwr_percent.back()) + rjust(to_string(gpu.pwr_usage/1000), 4) + Theme::c("main_fg") + 'W'
|
||||
+ " P-state: " + (gpu.pwr_state > 9 ? "" : " ") + 'P' + Theme::g("cached").at(gpu.pwr_state) + to_string(gpu.pwr_state);
|
||||
+ Theme::g("cached").at(gpu.pwr_percent.back()) + rjust(to_string(gpu.pwr_usage/1000), 4) + Theme::c("main_fg") + 'W';
|
||||
if (gpu.pwr_state != 32) // NVML_PSTATE_UNKNOWN; unsupported or non-nvidia card
|
||||
out += std::string(" P-state: ") + (gpu.pwr_state > 9 ? "" : " ") + 'P' + Theme::g("cached").at(gpu.pwr_state) + to_string(gpu.pwr_state);
|
||||
|
||||
//? Memory section header & clock speed
|
||||
string used_memory_string = floating_humanizer(gpu.mem_used);
|
||||
|
@ -828,7 +829,7 @@ namespace Gpu {
|
|||
}
|
||||
|
||||
//? Memory usage borders
|
||||
out += Mv::to(b_y + 5, b_x) + Theme::c("div_line") + Symbols::div_left+Symbols::h_line + Theme::c("title") + "I/O:" + Theme::c("div_line") + Symbols::h_line*(b_width/2-6)
|
||||
out += Mv::to(b_y + 5, b_x) + Theme::c("div_line") + Symbols::div_left+Symbols::h_line + Theme::c("title") + "Utilization:" + Theme::c("div_line") + Symbols::h_line*(b_width/2-14)
|
||||
+ Symbols::div_right + Mv::u(1)+Mv::l(1) + Symbols::v_line + Mv::l(1)+Mv::d(2) + (Symbols::v_line + Mv::l(1)+Mv::d(1))*2;
|
||||
|
||||
//? Total memory usage
|
||||
|
@ -1743,7 +1744,7 @@ namespace Draw {
|
|||
Cpu::redraw = Gpu::redraw = Mem::redraw = Net::redraw = Proc::redraw = true;
|
||||
|
||||
Cpu::shown = s_contains(boxes, "cpu");
|
||||
Gpu::shown = s_contains(boxes, "gpu") and Gpu::Nvml::initialized;
|
||||
Gpu::shown = s_contains(boxes, "gpu") and not Gpu::gpu_names.empty();
|
||||
Mem::shown = s_contains(boxes, "mem");
|
||||
Net::shown = s_contains(boxes, "net");
|
||||
Proc::shown = s_contains(boxes, "proc");
|
||||
|
@ -1753,7 +1754,7 @@ namespace Draw {
|
|||
using namespace Cpu;
|
||||
bool show_temp = (Config::getB("check_temp") and got_sensors);
|
||||
width = round((double)Term::width * width_p / 100);
|
||||
if (Gpu::shown and !(Mem::shown or Net::shown or Proc::shown)) {
|
||||
if (Gpu::shown and not (Mem::shown or Net::shown or Proc::shown)) {
|
||||
height = Term::height/2;
|
||||
} else {
|
||||
height = max(8, (int)ceil((double)Term::height * (trim(boxes) == "cpu" ? 100 : height_p/(Gpu::shown+1) + Gpu::shown*5) / 100));
|
||||
|
|
|
@ -330,33 +330,34 @@ namespace Gpu {
|
|||
|
||||
//* Per-device container for GPU info
|
||||
struct gpu_info {
|
||||
deque<long long> gpu_percent = {};
|
||||
deque<long long> gpu_percent = {0};
|
||||
unsigned int gpu_clock_speed = 0; // MHz
|
||||
|
||||
deque<long long> pwr_percent = {};
|
||||
unsigned int pwr_usage = 0; // mW
|
||||
unsigned int pwr_max_usage = 300000;
|
||||
unsigned int pwr_state = 32;
|
||||
deque<long long> pwr_percent = {0};
|
||||
long long pwr_usage = 0; // mW
|
||||
long long pwr_max_usage = 255000;
|
||||
long long pwr_state = 32;
|
||||
|
||||
deque<long long> temp = {};
|
||||
long long temp_max = 100;
|
||||
deque<long long> temp = {0};
|
||||
long long temp_max = 110;
|
||||
|
||||
long long mem_total = 0;
|
||||
long long mem_used = 0;
|
||||
deque<long long> mem_used_percent = {};
|
||||
long long mem_utilization = 0;
|
||||
deque<long long> mem_utilization_percent = {};
|
||||
unsigned int mem_clock_speed = 0; // MHz
|
||||
deque<long long> mem_used_percent = {0};
|
||||
deque<long long> mem_utilization_percent = {0}; // TODO: properly handle GPUs that can't report some stats
|
||||
long long mem_clock_speed = 0; // MHz
|
||||
|
||||
unsigned int pcie_tx = 0; // KB/s
|
||||
unsigned int pcie_rx = 0;
|
||||
long long pcie_tx = 0; // KB/s
|
||||
long long pcie_rx = 0;
|
||||
|
||||
// vector<proc_info> graphics_processes = {}; // TODO
|
||||
// vector<proc_info> compute_processes = {};
|
||||
};
|
||||
|
||||
//? NVIDIA (NVML) backend: symbols shared with the rest of the program
namespace Nvml {
	//? Backend state flag, defined in the collector source
	extern bool initialized;

	//? Shuts the NVML backend down; called from clean_quit()
	bool shutdown();
}
|
||||
//? AMD (ROCm SMI) backend: symbols shared with the rest of the program
namespace Rsmi {
	//? Shuts the ROCm SMI backend down; called from clean_quit()
	bool shutdown();
}
|
||||
|
||||
|
|
|
@ -103,7 +103,7 @@ namespace Term {
|
|||
bool mem = boxes.find("mem") != string::npos;
|
||||
bool net = boxes.find("net") != string::npos;
|
||||
bool proc = boxes.find("proc") != string::npos;
|
||||
bool gpu = boxes.find("gpu") != string::npos and Gpu::Nvml::initialized;
|
||||
bool gpu = boxes.find("gpu") != string::npos and not Gpu::gpu_names.empty();
|
||||
int width = 0;
|
||||
if (mem) width = Mem::min_width;
|
||||
else if (net) width = Mem::min_width;
|
||||
|
|
|
@ -29,6 +29,7 @@ tab-size = 4
|
|||
#include <net/if.h>
|
||||
#include <arpa/inet.h> // for inet_ntop()
|
||||
#include <nvml.h>
|
||||
#include <rocm_smi/rocm_smi.h>
|
||||
|
||||
|
||||
#if !(defined(STATIC_BUILD) && defined(__GLIBC__))
|
||||
|
@ -102,6 +103,14 @@ namespace Gpu {
|
|||
vector<nvmlDevice_t> devices;
|
||||
unsigned int device_count = 0;
|
||||
}
|
||||
|
||||
//? AMD data collection
|
||||
namespace Rsmi {
	//? Forward declarations, bodies are further down in this file
	bool init();
	bool shutdown();

	//? Backend state: set by init(), cleared by a successful shutdown()
	bool initialized{false};

	//? Number of devices reported by rsmi_num_monitor_devices()
	uint32_t device_count{0};
}
|
||||
}
|
||||
|
||||
namespace Mem {
|
||||
|
@ -169,10 +178,8 @@ namespace Shared {
|
|||
|
||||
//? Init for namespace Gpu
|
||||
Gpu::Nvml::init();
|
||||
|
||||
Gpu::Rsmi::init();
|
||||
}
|
||||
|
||||
|
||||
}
|
||||
|
||||
namespace Cpu {
|
||||
|
@ -830,7 +837,7 @@ namespace Gpu {
|
|||
//? NVIDIA
|
||||
namespace Nvml {
|
||||
bool init() {
|
||||
if (initialized) {return false;}
|
||||
if (initialized) return false;
|
||||
|
||||
nvmlReturn_t result = nvmlInit();
|
||||
if (result != NVML_SUCCESS) {
|
||||
|
@ -855,100 +862,97 @@ namespace Gpu {
|
|||
result = nvmlDeviceGetHandleByIndex(i, devices.data() + i*sizeof(nvmlDevice_t));
|
||||
if (result != NVML_SUCCESS) {
|
||||
Logger::error(std::string("NVML: Failed to get device handle: ") + nvmlErrorString(result));
|
||||
return false;
|
||||
continue;
|
||||
}
|
||||
|
||||
initialized = true;
|
||||
|
||||
//? Device name
|
||||
char name[NVML_DEVICE_NAME_BUFFER_SIZE];
|
||||
result = nvmlDeviceGetName(devices[i], name, NVML_DEVICE_NAME_BUFFER_SIZE);
|
||||
if (result != NVML_SUCCESS) {
|
||||
if (result != NVML_SUCCESS)
|
||||
Logger::error(std::string("NVML: Failed to get device name: ") + nvmlErrorString(result));
|
||||
} else {gpu_names[i] = string(name);}
|
||||
else gpu_names[i] = string(name);
|
||||
|
||||
//? Power usage
|
||||
result = nvmlDeviceGetPowerManagementLimit(devices[i], &gpus[i].pwr_max_usage);
|
||||
if (result != NVML_SUCCESS) {
|
||||
Logger::error(std::string("NVML: Failed to get maximum GPU power draw, defaulting to 300W: ") + nvmlErrorString(result));
|
||||
}
|
||||
unsigned int max_power;
|
||||
result = nvmlDeviceGetPowerManagementLimit(devices[i], &max_power);
|
||||
if (result != NVML_SUCCESS)
|
||||
Logger::error(std::string("NVML: Failed to get maximum GPU power draw, defaulting to 225W: ") + nvmlErrorString(result));
|
||||
else gpus[i].pwr_max_usage = max_power;
|
||||
|
||||
//? Get temp_max
|
||||
unsigned int temp_max = 100;
|
||||
unsigned int temp_max;
|
||||
result = nvmlDeviceGetTemperatureThreshold(devices[i], NVML_TEMPERATURE_THRESHOLD_SHUTDOWN, &temp_max);
|
||||
if (result != NVML_SUCCESS) {
|
||||
Logger::error(std::string("NVML: Failed to get maximum GPU temperature, defaulting to 100: ") + nvmlErrorString(result));
|
||||
}
|
||||
gpus[i].temp_max = (long long)temp_max;
|
||||
if (result != NVML_SUCCESS)
|
||||
Logger::error(std::string("NVML: Failed to get maximum GPU temperature, defaulting to 110: ") + nvmlErrorString(result));
|
||||
else gpus[i].temp_max = (long long)temp_max;
|
||||
}
|
||||
initialized = true;
|
||||
|
||||
return true;
|
||||
} else {initialized = true; shutdown(); return false;}
|
||||
}
|
||||
|
||||
bool shutdown() {
|
||||
if (!initialized) {return false;}
|
||||
if (!initialized) return false;
|
||||
|
||||
nvmlReturn_t result = nvmlShutdown();
|
||||
if (NVML_SUCCESS == result) {
|
||||
if (NVML_SUCCESS == result)
|
||||
initialized = false;
|
||||
} else Logger::warning(std::string("Failed to shutdown NVML: ") + nvmlErrorString(result));
|
||||
else Logger::warning(std::string("Failed to shutdown NVML: ") + nvmlErrorString(result));
|
||||
|
||||
return !initialized;
|
||||
}
|
||||
|
||||
bool collect(gpu_info* gpus_slice) { // raw pointer to vector data, size == device_count, defined in init()
|
||||
bool collect(gpu_info* gpus_slice) { // raw pointer to vector data, size == device_count
|
||||
if (!initialized) return false;
|
||||
nvmlReturn_t result;
|
||||
|
||||
for (unsigned int i = 0; i < device_count; ++i) {
|
||||
//? GPU & memory utilization
|
||||
nvmlUtilization_t utilization;
|
||||
nvmlReturn_t result = nvmlDeviceGetUtilizationRates(devices[i], &utilization);
|
||||
result = nvmlDeviceGetUtilizationRates(devices[i], &utilization);
|
||||
if (result != NVML_SUCCESS) {
|
||||
Logger::error(std::string("NVML: Failed to get GPU utilization: ") + nvmlErrorString(result));
|
||||
} else {
|
||||
gpus_slice[i].gpu_percent.push_back((long long)utilization.gpu);
|
||||
gpus_slice[i].mem_utilization_percent.push_back((long long)utilization.memory);
|
||||
//? Reduce size if there are more values than needed for graph
|
||||
while (cmp_greater(gpus_slice[i].gpu_percent.size(), width * 2)) gpus_slice[i].gpu_percent.pop_front();
|
||||
while (cmp_greater(gpus_slice[i].mem_utilization_percent.size(), width)) gpus_slice[i].mem_utilization_percent.pop_front();
|
||||
}
|
||||
|
||||
//? Clock speeds
|
||||
result = nvmlDeviceGetClockInfo(devices[i], NVML_CLOCK_GRAPHICS, &gpus_slice[i].gpu_clock_speed);
|
||||
if (result != NVML_SUCCESS) {
|
||||
unsigned int gpu_clock, mem_clock;
|
||||
result = nvmlDeviceGetClockInfo(devices[i], NVML_CLOCK_GRAPHICS, &gpu_clock);
|
||||
if (result != NVML_SUCCESS)
|
||||
Logger::error(std::string("NVML: Failed to get GPU clock speed: ") + nvmlErrorString(result));
|
||||
}
|
||||
result = nvmlDeviceGetClockInfo(devices[i], NVML_CLOCK_MEM, &gpus_slice[i].mem_clock_speed);
|
||||
if (result != NVML_SUCCESS) {
|
||||
else gpus_slice[i].gpu_clock_speed = (long long)gpu_clock;
|
||||
|
||||
result = nvmlDeviceGetClockInfo(devices[i], NVML_CLOCK_MEM, &mem_clock);
|
||||
if (result != NVML_SUCCESS)
|
||||
Logger::error(std::string("NVML: Failed to get VRAM clock speed: ") + nvmlErrorString(result));
|
||||
}
|
||||
else gpus_slice[i].mem_clock_speed = (long long)mem_clock;
|
||||
|
||||
//? Power usage & state
|
||||
result = nvmlDeviceGetPowerUsage(devices[i], &gpus_slice[i].pwr_usage);
|
||||
unsigned int power;
|
||||
result = nvmlDeviceGetPowerUsage(devices[i], &power);
|
||||
if (result != NVML_SUCCESS) {
|
||||
Logger::error(std::string("NVML: Failed to get GPU power usage: ") + nvmlErrorString(result));
|
||||
} else {
|
||||
gpus_slice[i].pwr_usage = (long long)power;
|
||||
gpus_slice[i].pwr_percent.push_back(clamp((long long)round((double)gpus_slice[i].pwr_usage * 100.0 / (double)gpus_slice[i].pwr_max_usage), 0ll, 100ll));
|
||||
}
|
||||
|
||||
nvmlPstates_t pState;
|
||||
result = nvmlDeviceGetPowerState(devices[i], &pState);
|
||||
if (result != NVML_SUCCESS) {
|
||||
if (result != NVML_SUCCESS)
|
||||
Logger::error(std::string("NVML: Failed to get GPU power state: ") + nvmlErrorString(result));
|
||||
} else {
|
||||
gpus_slice[i].pwr_state = static_cast<int>(pState);
|
||||
}
|
||||
else gpus_slice[i].pwr_state = static_cast<int>(pState);
|
||||
|
||||
//? GPU temperature
|
||||
if (Config::getB("check_temp")) {
|
||||
unsigned int temp;
|
||||
nvmlReturn_t result = nvmlDeviceGetTemperature(devices[i], NVML_TEMPERATURE_GPU, &temp);
|
||||
if (result != NVML_SUCCESS) {
|
||||
if (result != NVML_SUCCESS)
|
||||
Logger::error(std::string("NVML: Failed to get GPU temperature: ") + nvmlErrorString(result));
|
||||
} else {
|
||||
gpus_slice[i].temp.push_back((long long)temp);
|
||||
//? Reduce size if there are more values than needed for graph
|
||||
while (cmp_greater(gpus_slice[i].temp.size(), 18)) gpus_slice[i].temp.pop_front();
|
||||
}
|
||||
else gpus_slice[i].temp.push_back((long long)temp);
|
||||
}
|
||||
|
||||
//? Memory info
|
||||
|
@ -963,20 +967,19 @@ namespace Gpu {
|
|||
|
||||
auto used_percent = (long long)round((double)memory.used * 100.0 / (double)memory.total);
|
||||
gpus_slice[i].mem_used_percent.push_back(used_percent);
|
||||
|
||||
//? Reduce size if there are more values than needed for graphs
|
||||
while (cmp_greater(gpus_slice[i].mem_used_percent.size(), width/2)) gpus_slice[i].mem_used_percent.pop_front();
|
||||
}
|
||||
|
||||
//? PCIe link speeds
|
||||
result = nvmlDeviceGetPcieThroughput(devices[i], NVML_PCIE_UTIL_TX_BYTES, &gpus_slice[i].pcie_tx);
|
||||
if (result != NVML_SUCCESS) {
|
||||
unsigned int tx,rx;
|
||||
result = nvmlDeviceGetPcieThroughput(devices[i], NVML_PCIE_UTIL_TX_BYTES, &tx);
|
||||
if (result != NVML_SUCCESS)
|
||||
Logger::error(std::string("NVML: Failed to get PCIe TX throughput: ") + nvmlErrorString(result));
|
||||
}
|
||||
result = nvmlDeviceGetPcieThroughput(devices[i], NVML_PCIE_UTIL_RX_BYTES, &gpus_slice[i].pcie_rx);
|
||||
if (result != NVML_SUCCESS) {
|
||||
else gpus_slice[i].pcie_tx = (long long)tx;
|
||||
|
||||
result = nvmlDeviceGetPcieThroughput(devices[i], NVML_PCIE_UTIL_RX_BYTES, &rx);
|
||||
if (result != NVML_SUCCESS)
|
||||
Logger::error(std::string("NVML: Failed to get PCIe RX throughput: ") + nvmlErrorString(result));
|
||||
}
|
||||
else gpus_slice[i].pcie_rx = (long long)rx;
|
||||
|
||||
//? TODO: Processes using GPU
|
||||
/*unsigned int proc_info_len;
|
||||
|
@ -993,14 +996,172 @@ namespace Gpu {
|
|||
return true;
|
||||
}
|
||||
}
|
||||
// TODO: AMD
|
||||
|
||||
//? AMD
|
||||
namespace Rsmi {
|
||||
bool init() {
|
||||
if (initialized) return false;
|
||||
rsmi_status_t result;
|
||||
|
||||
result = rsmi_init(0);
|
||||
if (result != RSMI_STATUS_SUCCESS)
|
||||
Logger::error("Failed to initialize ROCm SMI, AMD GPUs will not be detected");
|
||||
|
||||
//? Device count
|
||||
result = rsmi_num_monitor_devices(&device_count);
|
||||
if (result != RSMI_STATUS_SUCCESS)
|
||||
Logger::error("ROCm SMI: Failed to fetch number of devices");
|
||||
|
||||
if (device_count > 0) {
|
||||
gpus.resize(gpus.size() + device_count);
|
||||
gpu_names.resize(gpus.size() + device_count);
|
||||
|
||||
for (unsigned int i = 0; i < device_count; ++i) {
|
||||
auto offset = Nvml::device_count + i;
|
||||
|
||||
//? Device name
|
||||
char name[NVML_DEVICE_NAME_BUFFER_SIZE]; // ROCm SMI does not provide a constant for this as far as I can tell, this should be good enough
|
||||
result = rsmi_dev_name_get(i, name, NVML_DEVICE_NAME_BUFFER_SIZE);
|
||||
if (result != RSMI_STATUS_SUCCESS)
|
||||
Logger::error("ROCm SMI: Failed to get device name");
|
||||
else gpu_names[offset] = string(name);
|
||||
|
||||
//? Power usage
|
||||
uint64_t max_power;
|
||||
result = rsmi_dev_power_cap_get(i, 0, &max_power);
|
||||
if (result != RSMI_STATUS_SUCCESS)
|
||||
Logger::error("ROCm SMI: Failed to get maximum GPU power draw, defaulting to 225W");
|
||||
else gpus[offset].pwr_max_usage = (long long)(max_power/1000); // RSMI reports power in microWatts
|
||||
|
||||
//? Get temp_max
|
||||
int64_t temp_max;
|
||||
result = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_EDGE, RSMI_TEMP_MAX, &temp_max);
|
||||
if (result != RSMI_STATUS_SUCCESS)
|
||||
Logger::error("ROCm SMI: Failed to get maximum GPU temperature, defaulting to 110");
|
||||
else gpus[offset].temp_max = (long long)temp_max;
|
||||
}
|
||||
initialized = true;
|
||||
|
||||
return true;
|
||||
} else {initialized = true; shutdown(); return false;}
|
||||
}
|
||||
|
||||
bool shutdown() {
|
||||
if (!initialized) return false;
|
||||
|
||||
if (rsmi_shut_down() == RSMI_STATUS_SUCCESS)
|
||||
initialized = false;
|
||||
else Logger::warning("Failed to shutdown ROCm SMI");
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
//? Sample every AMD device through ROCm SMI and store the results in gpus_slice[0 .. device_count).
//? gpus_slice is a raw pointer to vector data, size == device_count, offset by Nvml::device_count elements.
//? A failed query is logged and leaves the previously stored value of that stat untouched.
//? Returns false if ROCm SMI is not initialized, true otherwise.
bool collect(gpu_info* gpus_slice) {
	if (!initialized) return false;
	rsmi_status_t result;

	for (uint32_t i = 0; i < device_count; ++i) {
		auto& gpu = gpus_slice[i];

		//? GPU utilization
		uint32_t utilization;
		result = rsmi_dev_busy_percent_get(i, &utilization);
		if (result != RSMI_STATUS_SUCCESS)
			Logger::error("ROCm SMI: Failed to get GPU utilization");
		else
			gpu.gpu_percent.push_back((long long)utilization);

		//? Memory utilization
		result = rsmi_dev_memory_busy_percent_get(i, &utilization);
		if (result != RSMI_STATUS_SUCCESS)
			Logger::error("ROCm SMI: Failed to get VRAM utilization");
		else
			gpu.mem_utilization_percent.push_back((long long)utilization);

		//? Clock speeds
		rsmi_frequencies_t frequencies;
		result = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_SYS, &frequencies);
		if (result != RSMI_STATUS_SUCCESS)
			Logger::error("ROCm SMI: Failed to get GPU clock speed");
		else gpu.gpu_clock_speed = (long long)frequencies.frequency[frequencies.current]/1000000; // Hz to MHz

		result = rsmi_dev_gpu_clk_freq_get(i, RSMI_CLK_TYPE_MEM, &frequencies);
		if (result != RSMI_STATUS_SUCCESS)
			Logger::error("ROCm SMI: Failed to get VRAM clock speed");
		else gpu.mem_clock_speed = (long long)frequencies.frequency[frequencies.current]/1000000; // Hz to MHz

		//? Power usage & state
		uint64_t power;
		result = rsmi_dev_power_ave_get(i, 0, &power);
		if (result != RSMI_STATUS_SUCCESS)
			Logger::error("ROCm SMI: Failed to get GPU power usage");
		else {
			//? Store the fresh reading first, the percentage below is derived from it
			gpu.pwr_usage = (long long)(power/1000); // RSMI reports power in microWatts, gpu_info stores mW
			gpu.pwr_percent.push_back(clamp((long long)round((double)gpu.pwr_usage * 100.0 / (double)gpu.pwr_max_usage), 0ll, 100ll));
		}
		gpu.pwr_state = 32; // NVML_PSTATE_UNKNOWN; won't display in GUI

		//? GPU temperature
		if (Config::getB("check_temp")) {
			int64_t temp;
			result = rsmi_dev_temp_metric_get(i, RSMI_TEMP_TYPE_EDGE, RSMI_TEMP_CURRENT, &temp);
			if (result != RSMI_STATUS_SUCCESS)
				Logger::error("ROCm SMI: Failed to get GPU temperature");
			else gpu.temp.push_back((long long)temp/1000);
		}

		//? Memory info
		uint64_t total, used;
		result = rsmi_dev_memory_total_get(i, RSMI_MEM_TYPE_VRAM, &total);
		if (result != RSMI_STATUS_SUCCESS) {
			Logger::error("ROCm SMI: Failed to get total VRAM");
		} else {
			gpu.mem_total = total;

			//? Usage is only queried when the total is known, the percentage needs both
			result = rsmi_dev_memory_usage_get(i, RSMI_MEM_TYPE_VRAM, &used);
			if (result != RSMI_STATUS_SUCCESS) {
				Logger::error("ROCm SMI: Failed to get VRAM usage");
			} else {
				gpu.mem_used = used;

				//? Guard against a reported total of 0 so the division can't produce inf/NaN
				const auto used_percent = total > 0 ? (long long)round((double)used * 100.0 / (double)total) : 0ll;
				gpu.mem_used_percent.push_back(used_percent);
			}
		}

		//? PCIe link speeds
		//? NOTE(review): gpu_info documents pcie_tx/pcie_rx as KB/s - confirm the unit rsmi_dev_pci_throughput_get reports
		uint64_t tx, rx;
		result = rsmi_dev_pci_throughput_get(i, &tx, &rx, 0);
		if (result != RSMI_STATUS_SUCCESS) {
			Logger::error("ROCm SMI: Failed to get PCIe throughput");
		} else {
			gpu.pcie_tx = (long long)tx;
			gpu.pcie_rx = (long long)rx;
		}
	}

	return true;
}
|
||||
}
|
||||
|
||||
// TODO: Intel
|
||||
|
||||
//? Collect data from GPU-specific libraries
|
||||
//? Entry point for GPU data collection: runs every vendor backend once, then trims the history
//? deques to the lengths the graphs need.
//? Returns the shared gpu_info vector. It is returned without sampling when the runner is stopping,
//? or when no_update is set and data has already been collected.
auto collect(bool no_update) -> vector<gpu_info>& {
	if (Runner::stopping or (no_update and not gpus.empty())) return gpus;

	//* Collect data
	//? Each backend is called exactly once per cycle so that only one sample is appended per device
	Nvml::collect(gpus.data()); // raw pointer to vector data, size == Nvml::device_count
	Rsmi::collect(gpus.data() + Nvml::device_count); // size = Rsmi::device_count

	//* Trim vectors if there are more values than needed for graphs
	//? Drops the oldest samples until at most `limit` remain
	auto trim = [](deque<long long>& history, const auto limit) {
		while (cmp_greater(history.size(), limit)) history.pop_front();
	};

	for (auto& gpu : gpus) {
		//? GPU & memory utilization
		trim(gpu.gpu_percent, width * 2);
		trim(gpu.mem_utilization_percent, width);
		//? Power usage
		trim(gpu.pwr_percent, width);
		//? Temperature
		trim(gpu.temp, 18);
		//? Memory usage
		trim(gpu.mem_used_percent, width/2);
	}

	return gpus;
}
|
||||
|
|
Loading…
Reference in a new issue