推广

Linux源码 | EM能量模型

iseeyu2年前 (2024-08-15)推广114

本文地址：Linux源码 | EM能量模型
博客地址：https://hqber.com
个人原创作品，转载需联系我，必须标注署名、文章出处

本文是基于linux kernel 5.15.41

能量模型（EM）<kernel/power/energy_model.c | 源代码 | v5.15.41>框架是驱动程序与内核子系统之间的一种接口：驱动程序了解设备在不同性能层级下所消耗的功率，而内核子系统希望利用这些信息做出能量感知决策。EM框架管理着系统中各个设备提供的“性能域”，也就是频率与功率的映射表；相关的能量感知算法可以通过接口获取相应设备的“性能域”，进行性能成本估算。

EM能量模型debug节点：/sys/kernel/debug/energy_model

目前在当前内核版本中，仅仅支持CPU device，CPU设备的em_perf_state中power<active_power | 源代码 | v5.15.41>和cost值计算公式如下：

power = capacitance(电容，dtsi配置:dynamic-power-coefficient) * voltage^2 * frequency
cost = max_frequency * power / frequency

1. energy_model结构

/*
 * One performance state of a device: a frequency together with the power
 * drawn at that frequency and a precomputed cost.
 */
struct em_perf_state {
/* Frequency of this state; table entries are strictly increasing. */
unsigned long frequency;
/* Power at this frequency, as returned by the driver's active_power(). */
unsigned long power;
/*
 * fmax * em_scale_power(power) / frequency, where fmax is the frequency of
 * the last table entry; filled in by em_create_perf_table().
 */
unsigned long cost;
};

/*
 * Performance domain: the table of performance states registered for a
 * device (or, for CPU devices, shared by a set of CPUs).
 */
struct em_perf_domain {
/* Array of nr_perf_states entries, ordered by increasing frequency. */
struct em_perf_state *table;
/* Number of entries in 'table'. */
int nr_perf_states;
/* Copy of the 'milliwatts' flag passed at registration time. */
int milliwatts;
/*
 * Flexible array member. For CPU devices the domain is allocated with
 * cpumask_size() extra bytes and this holds a copy of the CPU mask; for
 * other devices no storage is allocated for it.
 */
unsigned long cpus[];
};

API函数接口

/* Return dev->em_pd, or NULL when 'dev' is NULL or an error pointer. */
struct em_perf_domain *em_pd_get(struct device *dev);
/* Look up the CPU device for 'cpu' and return its em_perf_domain (NULL if the CPU has no device). */
struct em_perf_domain *em_cpu_get(int cpu);

/* Register a performance domain for 'dev' with the Energy Model; this interface is meant for device drivers. */
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
struct em_data_callback *cb, cpumask_t *span,
bool milliwatts);
/* Unregister the performance domain of 'dev'; this interface is meant for device drivers. */
void em_dev_unregister_perf_domain(struct device *dev);

2. em_dev_register_perf_domain

将设备注册到EM能量模型，这个接口提供给设备驱动使用。

/*
 * Register the performance domain of a device with the Energy Model.
 *
 * @dev:        device to register
 * @nr_states:  number of performance states to create
 * @cb:         driver callbacks, handed on to em_create_pd()
 * @cpus:       CPUs covered by the domain; required when @dev is a CPU device
 * @milliwatts: stored in the 'milliwatts' field of the new domain
 *
 * Returns 0 on success. Returns -EINVAL when @dev, @nr_states or @cb is
 * missing, when a CPU device comes without a CPU mask, or when the CPUs of
 * the mask do not all report the same capacity. Returns -EEXIST when @dev or
 * one of the CPUs already has a performance domain. Any error from
 * em_create_pd() is returned unchanged.
 */
int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
struct em_data_callback *cb, cpumask_t *cpus,
bool milliwatts)
{
unsigned long cap, prev_cap = 0;
int cpu, ret;

if (!dev || !nr_states || !cb)
return -EINVAL;

/*
* Use a mutex to serialize the registration of performance domains and
* let the driver-defined callback functions sleep.
*/
mutex_lock(&em_pd_mutex);

/* A device can only be registered once. */
if (dev->em_pd) {
ret = -EEXIST;
goto unlock;
}

/*
* CPU devices need extra validation of the CPU mask. (Per the original
* author's note, _is_cpu_device() checks whether dev->bus is the virtual
* cpu_subsys bus.)
*/
if (_is_cpu_device(dev)) {
if (!cpus) {
dev_err(dev, "EM: invalid CPU mask\n");
ret = -EINVAL;
goto unlock;
}

for_each_cpu(cpu, cpus) {
/* Refuse CPUs that already belong to a performance domain. */
if (em_cpu_get(cpu)) {
dev_err(dev, "EM: exists for CPU%d\n", cpu);
ret = -EEXIST;
goto unlock;
}
/*
* All CPUs of a domain must have the same
* micro-architecture since they all share the same
* table.
*/
/*
* Compare this CPU's capacity with the previous CPU's; prev_cap is 0
* for the first CPU, which skips the comparison. cpumask_pr_args()
* supplies the arguments for the %*pbl cpumask format.
*/
cap = arch_scale_cpu_capacity(cpu);
if (prev_cap && prev_cap != cap) {
dev_err(dev, "EM: CPUs of %*pbl must have the same capacity\n",
cpumask_pr_args(cpus));

ret = -EINVAL;
goto unlock;
}
prev_cap = cap;
}
}

/* Allocate the em_perf_domain and build its performance table. */
ret = em_create_pd(dev, nr_states, cb, cpus);
if (ret)
goto unlock;

dev->em_pd->milliwatts = milliwatts;

/* Create the debug entry for this domain (the article lists /sys/kernel/debug/energy_model). */
em_debug_create_pd(dev);
dev_info(dev, "EM: created perf domain\n");

/* Single exit point for every path taken after the mutex was acquired. */
unlock:
mutex_unlock(&em_pd_mutex);
return ret;
}
EXPORT_SYMBOL_GPL(em_dev_register_perf_domain);
/*
 * Allocate an em_perf_domain for 'dev', build its performance table and
 * attach the domain to the device.
 *
 * For a CPU device the allocation carries cpumask_size() extra bytes that
 * receive a copy of 'cpus', and the domain is also attached to the device of
 * every CPU in the mask. Other devices get a bare em_perf_domain.
 *
 * Returns 0 on success, -ENOMEM if the allocation fails, or the error
 * reported by em_create_perf_table() (the domain is freed in that case).
 */
static int em_create_pd(struct device *dev, int nr_states,
			struct em_data_callback *cb, cpumask_t *cpus)
{
	struct em_perf_domain *domain;
	int cpu, err;

	if (_is_cpu_device(dev)) {
		/* Room for the domain plus the trailing CPU mask. */
		domain = kzalloc(sizeof(*domain) + cpumask_size(), GFP_KERNEL);
		if (domain)
			cpumask_copy(em_span_cpus(domain), cpus);
	} else {
		/* Non-CPU device: no CPU mask is stored. */
		domain = kzalloc(sizeof(*domain), GFP_KERNEL);
	}

	if (!domain)
		return -ENOMEM;

	/* Fill in the frequency/power table and the per-state cost. */
	err = em_create_perf_table(dev, domain, nr_states, cb);
	if (err) {
		kfree(domain);
		return err;
	}

	/* Every CPU of the mask shares the same domain. */
	if (_is_cpu_device(dev)) {
		for_each_cpu(cpu, cpus)
			get_cpu_device(cpu)->em_pd = domain;
	}

	dev->em_pd = domain;

	return 0;
}
/*
 * Build the table of performance states for a performance domain and
 * compute the cost of every state.
 *
 * @dev:       device the table is built for; passed to the driver callback
 * @pd:        domain that receives the table and its length on success
 * @nr_states: number of states to query from the driver
 * @cb:        driver callbacks; cb->active_power() is called once per state
 *
 * Returns 0 on success and -ENOMEM if the table cannot be allocated. Returns
 * -EINVAL if the callback fails, if a frequency is not strictly greater than
 * the previous one, or if a power value is zero or above EM_MAX_POWER. @pd
 * is only written on success.
 */
static int em_create_perf_table(struct device *dev, struct em_perf_domain *pd,
int nr_states, struct em_data_callback *cb)
{
unsigned long power, freq, prev_freq = 0, prev_cost = ULONG_MAX;
struct em_perf_state *table;
int i, ret;
u64 fmax;

table = kcalloc(nr_states, sizeof(*table), GFP_KERNEL);
if (!table)
return -ENOMEM;

/*
* Build the list of performance states for this performance domain.
* 'freq' starts at 0 and is incremented after each iteration, so every
* call asks the driver for a state above the frequency returned last.
*/
for (i = 0, freq = 0; i < nr_states; i++, freq++) {
/*
* active_power() is a driver callback which ceils 'freq' to
* lowest performance state of 'dev' above 'freq' and updates
* 'power' and 'freq' accordingly.
*/
ret = cb->active_power(&power, &freq, dev);
if (ret) {
dev_err(dev, "EM: invalid perf. state: %d\n",
ret);
goto free_ps_table;
}

/*
* We expect the driver callback to increase the frequency for
* higher performance states, i.e. each new frequency must be
* strictly greater than the previous one.
*/
if (freq <= prev_freq) {
dev_err(dev, "EM: non-increasing freq: %lu\n",
freq);
goto free_ps_table;
}

/*
* The power returned by active_power() is expected to be
* positive and to fit into 16 bits.
*/
if (!power || power > EM_MAX_POWER) {
dev_err(dev, "EM: invalid power: %lu\n",
power);
goto free_ps_table;
}

table[i].power = power;
table[i].frequency = prev_freq = freq;
}

/*
* Compute the cost of each performance state:
* cost = fmax * em_scale_power(power) / frequency,
* where fmax is the frequency of the last (highest) state. The table is
* walked from the highest frequency down; prev_cost tracks the lowest
* cost seen so far, and a state that does not cost less than it is
* reported as inefficient.
*/
fmax = (u64) table[nr_states - 1].frequency;
for (i = nr_states - 1; i >= 0; i--) {
unsigned long power_res = em_scale_power(table[i].power);

table[i].cost = div64_u64(fmax * power_res,
table[i].frequency);
if (table[i].cost >= prev_cost) {
dev_dbg(dev, "EM: OPP:%lu is inefficient\n",
table[i].frequency);
} else {
prev_cost = table[i].cost;
}
}

pd->table = table;
pd->nr_perf_states = nr_states;

return 0;

/* Every validation failure, including a callback error, ends up as -EINVAL. */
free_ps_table:
kfree(table);
return -EINVAL;
}

3. em_pd_get

获取相应device的em_perf_domain结构，主要是给相关能量感知算法去调用，如：EAS、IPA。

/*
 * Return the performance domain attached to 'dev'.
 *
 * Yields NULL when 'dev' is NULL or an error pointer; otherwise the value of
 * dev->em_pd is returned as is.
 */
struct em_perf_domain *em_pd_get(struct device *dev)
{
	return IS_ERR_OR_NULL(dev) ? NULL : dev->em_pd;
}

4. em_cpu_get

通过cpu id获取相应的cpu device的em_perf_domain结构，主要是给相关能量感知算法去调用，如：EAS、IPA。

/*
 * Return the performance domain of the CPU identified by 'cpu'.
 *
 * The CPU's device is looked up with get_cpu_device(); NULL is returned when
 * there is no such device, otherwise the result of em_pd_get() on it.
 */
struct em_perf_domain *em_cpu_get(int cpu)
{
	struct device *dev = get_cpu_device(cpu);

	return dev ? em_pd_get(dev) : NULL;
}
EXPORT_SYMBOL_GPL(em_cpu_get);

5. em_cpu_energy

估算性能域内所有CPU在当前性能状态下消耗的能量（按代码注释，计算结果的单位实际上是功率，但由于假定它在一个调度周期内保持不变，可以当作能量值使用）。

/*
 * Estimate the energy consumed by the CPUs of a performance domain.
 *
 * @pd:              performance domain whose table is consulted
 * @max_util:        utilization of the most utilized CPU of the domain
 * @sum_util:        sum of the utilization of all CPUs of the domain
 * @allowed_cpu_cap: upper bound applied to @max_util before a frequency is
 *                   chosen (the real frequency may be lower, e.g. because of
 *                   thermal capping)
 *
 * Returns 0 when @sum_util is 0, otherwise the estimate derived below.
 *
 * Derivation. The capacity of a CPU at performance state 'ps' is
 *
 *     ps->cap = scale_cpu * ps->freq / cpu_max_freq                    (1)
 *
 * Ignoring idle-state costs (which the EM does not have), the energy of one
 * CPU at that state is its power scaled by its share of busy time:
 *
 *     cpu_nrg = ps->power * cpu_util / ps->cap                         (2)
 *
 * Strictly speaking this is a power value, but it is assumed constant over a
 * scheduling period and can therefore be handled as an energy. Substituting
 * (1) into (2):
 *
 *     cpu_nrg = (ps->power * cpu_max_freq / ps->freq)
 *               * (cpu_util / scale_cpu)                               (3)
 *
 * The first factor is static and is stored as 'ps->cost'. All CPUs of the
 * domain share the same cost and the same capacity, so the energy of the
 * whole domain is
 *
 *     pd_nrg = ps->cost * \Sum cpu_util / scale_cpu                    (4)
 */
static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
				unsigned long max_util, unsigned long sum_util,
				unsigned long allowed_cpu_cap)
{
	unsigned long boosted_util, capped_util, target_freq, cpu_cap;
	struct em_perf_state *state;
	int first_cpu, idx;

	if (sum_util == 0)
		return 0;

	/* Capacity of the first CPU of the domain stands for all of them. */
	first_cpu = cpumask_first(to_cpumask(pd->cpus));
	cpu_cap = arch_scale_cpu_capacity(first_cpu);

	/* Start from the highest state; its frequency is the domain maximum. */
	state = &pd->table[pd->nr_perf_states - 1];

	/*
	 * Map the utilization of the busiest CPU to a requested frequency,
	 * like schedutil does, after clamping it to the allowed capacity.
	 */
	boosted_util = map_util_perf(max_util);
	capped_util = min(boosted_util, allowed_cpu_cap);
	target_freq = map_util_freq(capped_util, state->frequency, cpu_cap);

	/*
	 * Pick the lowest performance state whose frequency reaches the
	 * requested one; if none does, the last state scanned is kept.
	 */
	idx = 0;
	while (idx < pd->nr_perf_states) {
		state = &pd->table[idx++];
		if (state->frequency >= target_freq)
			break;
	}

	/* Equation (4): domain energy at the selected state. */
	return state->cost * sum_util / cpu_cap;
}