* [Cake] profiling using perf
@ 2019-03-08 20:58 Georgios Amanakis
2019-03-08 21:01 ` Georgios Amanakis
0 siblings, 1 reply; 5+ messages in thread
From: Georgios Amanakis @ 2019-03-08 20:58 UTC (permalink / raw)
To: Cake List
Dear List,
I made an effort to profile the performance of cake with perf in
openwrt. perf was run on a WRT1900ACS router while downloading
archlinux.iso via torrent in a LAN client. You can find the annotated
sch_cake.c in the attachment as well as a performance histogram of
sch_cake (percentages are relative to sch_cake).
Hopefully people can take a look at it, and see if there are
performance concerns. As far as I can tell […message truncated in archive preview; full text below]
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [Cake] profiling using perf
2019-03-08 20:58 [Cake] profiling using perf Georgios Amanakis
@ 2019-03-08 21:01 ` Georgios Amanakis
2019-03-09 16:03 ` Toke Høiland-Jørgensen
0 siblings, 1 reply; 5+ messages in thread
From: Georgios Amanakis @ 2019-03-08 21:01 UTC (permalink / raw)
To: Cake List
[-- Attachment #1: Type: text/plain, Size: 423 bytes --]
Dear List,
I made an effort to profile the performance of cake with perf in
openwrt. perf was run on a WRT1900ACS router while downloading
archlinux.iso via torrent in a LAN client. You can find the annotated
sch_cake.c in the attachment as well as a performance histogram of
sch_cake (percentages are relative to sch_cake). Hopefully people can
take a look at it, and see if there are performance concerns.
Best,
George
[-- Attachment #2: cake_dequeue.annotation --]
[-- Type: application/octet-stream, Size: 85930 bytes --]
cake_dequeue() /lib/modules/4.14.104/sch_cake.ko
Event: cycles:ppp
Percent
Disassembly of section .text:
0000230c <cake_dequeue>:
cake_dequeue():
while (!!(skb = cake_dequeue_one(sch)))
kfree_skb(skb);
}
static struct sk_buff *cake_dequeue(struct Qdisc *sch)
{
0.83 push {r4, r5, r6, r7, r8, r9, sl, fp, lr}
struct cake_sched_data *q = qdisc_priv(sch);
struct cake_tin_data *b = &q->tins[q->cur_tin];
0.09 add r1, r0, #16640 ; 0x4100
{
0.01 sub sp, sp, #100 ; 0x64
struct cake_tin_data *b = &q->tins[q->cur_tin];
0.13 ldr r2, [r0, #264] ; 0x108
movw r3, #22720 ; 0x58c0
0.49 movt r3, #1
0.02 str r1, [sp, #56] ; 0x38
0.10 ldrh r1, [r1, #96] ; 0x60
{
str r0, [sp, #52] ; 0x34
qdisc_priv():
#define QDISC_ALIGNTO 64
#define QDISC_ALIGN(len) (((len) + QDISC_ALIGNTO-1) & ~(QDISC_ALIGNTO-1))
static inline void *qdisc_priv(struct Qdisc *q)
{
return (char *) q + QDISC_ALIGN(sizeof(struct Qdisc));
0.15 add r0, r0, #256 ; 0x100
0.07 str r0, [sp, #72] ; 0x48
cobalt_invsqrt():
vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
0.10 movw r0, #0
cake_dequeue():
struct cake_tin_data *b = &q->tins[q->cur_tin];
0.01 mla r3, r3, r1, r2
cobalt_invsqrt():
vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
0.34 movt r0, #0
0.16 str r0, [sp, #44] ; 0x2c
cake_dequeue():
struct cake_tin_data *b = &q->tins[q->cur_tin];
0.07 str r3, [sp, #60] ; 0x3c
struct cake_host *srchost, *dsthost;
ktime_t now = ktime_get();
0.06 → bl qdisc_peek_dequeued
strd r0, [sp, #24]
struct cake_flow *flow;
struct list_head *head;
bool first_flow = true;
0.35 mov r1, #1
u16 host_load;
u64 delay;
u32 len;
begin:
if (!sch->q.qlen)
0.03 4c: ldr r3, [sp, #52] ; 0x34
0.30 ldr r3, [r3, #76] ; 0x4c
0.11 cmp r3, #0
0.22 ↓ bne 68
return NULL;
0.01 5c: mov r3, #0
str r3, [sp, #48] ; 0x30
↓ b 114c
/* global hard shaper */
if (ktime_after(q->time_next_packet, now) &&
0.21 68: ldr r3, [sp, #72] ; 0x48
0.09 movw r0, #16416 ; 0x4020
ktime_compare():
*/
static inline int ktime_compare(const ktime_t cmp1, const ktime_t cmp2)
{
if (cmp1 < cmp2)
return -1;
if (cmp1 > cmp2)
0.06 ldrd r4, [sp, #24]
cake_dequeue():
0.24 add r3, r3, r0
0.01 ldrd r2, [r3, #-8]
ktime_compare():
0.24 cmp r4, r2
0.02 sbcs ip, r5, r3
0.29 ↓ bge dc
cake_dequeue():
ktime_after(q->failsafe_next_packet, now)) {
0.08 ldr ip, [sp, #72] ; 0x48
ktime_compare():
0.04 ldrd r6, [sp, #24]
cake_dequeue():
ldrd r4, [ip, r0]
ktime_compare():
0.19 cmp r6, r4
sbcs r0, r7, r5
0.09 ↓ bge dc
cake_dequeue():
u64 next = min(ktime_to_ns(q->time_next_packet),
ktime_to_ns(q->failsafe_next_packet));
sch->qstats.overlimits++;
ldr r0, [sp, #52] ; 0x34
u64 next = min(ktime_to_ns(q->time_next_packet),
0.03 cmp r4, r2
sbcs ip, r5, r3
sch->qstats.overlimits++;
0.08 ldr r1, [r0, #116] ; 0x74
u64 next = min(ktime_to_ns(q->time_next_packet),
movlt r2, r4
0.08 movlt r3, r5
sch->qstats.overlimits++;
add r1, r1, #1
0.08 str r1, [r0, #116] ; 0x74
qdisc_watchdog_schedule_ns(&q->watchdog, next);
ldr r1, [sp, #72] ; 0x48
0.07 add r0, r1, #16384 ; 0x4000
return NULL;
mov r1, #0
qdisc_watchdog_schedule_ns(&q->watchdog, next);
0.05 add r0, r0, #104 ; 0x68
return NULL;
str r1, [sp, #48] ; 0x30
qdisc_watchdog_schedule_ns(&q->watchdog, next);
0.10 → bl qdisc_peek_dequeued
return NULL;
0.03 ↓ b 114c
}
/* Choose a class to work on. */
if (!q->rate_ns) {
0.14 dc: ldr r3, [sp, #72] ; 0x48
0.07 add r3, r3, #16384 ; 0x4000
0.05 add r3, r3, #48 ; 0x30
0.03 ldrd r2, [r3, #-8]
0.03 orrs r3, r2, r3
0.35 ↓ beq 18c
0.09 ldr r3, [sp, #52] ; 0x34
/* In shaped mode, choose:
* - Highest-priority tin with queue and meeting schedule, or
* - The earliest-scheduled tin with queue.
*/
ktime_t best_time = ns_to_ktime(KTIME_MAX);
int tin, best_tin = 0;
0.05 mov r6, #0
for (tin = 0; tin < q->tin_cnt; tin++) {
0.05 mov ip, r6
ktime_t best_time = ns_to_ktime(KTIME_MAX);
0.05 mvn r4, #0
0.05 mvn r5, #-2147483648 ; 0x80000000
0.23 ldrd r8, [sp, #24]
0.04 ldr r7, [r3, #264] ; 0x108
for (tin = 0; tin < q->tin_cnt; tin++) {
0.26 ldr r3, [sp, #56] ; 0x38
0.11 add r0, r7, #88064 ; 0x15800
0.05 add r0, r0, #44 ; 0x2c
0.08 ldrh lr, [r3, #14]
0.04 ↓ b 240
if (b->tin_deficit <= 0)
124: cmp r8, #0
↓ bne 144
↓ b 10dc
empty = false;
130: cmp r3, #0
b->tin_deficit += b->tin_quantum_band;
ldrh r3, [r9, r7]
empty = false;
movne r5, #0
b->tin_deficit += b->tin_quantum_band;
add r8, r3, r8
str r8, [r9, #2160] ; 0x870
q->cur_tin++;
144: ldrh r3, [r0, #96] ; 0x60
if (q->cur_tin >= q->tin_cnt) {
ldrh r8, [r0, #14]
q->cur_tin++;
add r3, r3, #1
uxth r3, r3
strh r3, [r0, #96] ; 0x60
if (q->cur_tin >= q->tin_cnt) {
cmp r8, r3
b++;
addhi r2, r2, #88064 ; 0x15800
addhi r2, r2, #192 ; 0xc0
if (q->cur_tin >= q->tin_cnt) {
↓ bhi 1ac
b = q->tins;
ldr r3, [sp, #52] ; 0x34
if (wrapped) {
cmp r4, #0
q->cur_tin = 0;
strh r6, [r0, #96] ; 0x60
b = q->tins;
ldr r2, [r3, #264] ; 0x108
if (wrapped) {
↓ beq 184
if (empty)
cmp r5, #0
↑ bne 5c
wrapped = true;
184: mov r4, #1
↓ b 1ac
b->tin_deficit += b->tin_quantum_band;
18c: ldr r2, [sp, #60] ; 0x3c
bool wrapped = false, empty = true;
mov r4, #0
b->tin_deficit += b->tin_quantum_band;
ldr r0, [sp, #56] ; 0x38
bool wrapped = false, empty = true;
mov r5, #1
q->cur_tin = 0;
mov r6, r4
movw lr, #2094 ; 0x82e
movw ip, #2092 ; 0x82c
b->tin_deficit += b->tin_quantum_band;
movw r7, #2156 ; 0x86c
while (b->tin_deficit < 0 ||
1ac: add r9, r2, #86016 ; 0x15000
ldr r8, [r9, #2160] ; 0x870
ldrh r3, [r9, lr]
ldrh sl, [r9, ip]
cmp r8, #0
add r3, r3, sl
↑ blt 130
cmp r3, #0
↑ beq 124
str r2, [sp, #60] ; 0x3c
↓ b 27c
b = q->tins + tin;
if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
0.07 1d8: ldrh r3, [r0, #2]
0.20 ldrh r2, [r0]
0.09 cmn r3, r2
0.18 ↓ beq 234
ktime_t time_to_pkt = \
0.15 ldrd sl, [r0, #36] ; 0x24
0.07 subs sl, sl, r8
0.01 sbc fp, fp, r9
ktime_compare():
if (cmp1 < cmp2)
0.19 cmp sl, r4
0.05 sbcs r3, fp, r5
0.12 movlt r3, #1
0.04 movge r3, #0
cake_dequeue():
ktime_sub(b->time_next_packet, now);
if (ktime_to_ns(time_to_pkt) <= 0 ||
0.14 cmp sl, #1
0.01 sbcs r2, fp, #0
ktime_compare():
0.28 orrlt r3, r3, #1
if (cmp1 > cmp2)
cmp r4, sl
0.36 sbcs r2, r5, fp
if (cmp1 < cmp2)
0.01 eor r3, r3, #1
if (cmp1 > cmp2)
0.11 andlt r3, r3, #1
0.02 movge r3, #0
0.05 cmp r3, #0
cake_dequeue():
ktime_t time_to_pkt = \
0.01 moveq r4, sl
0.06 moveq r5, fp
ktime_compare():
moveq r6, ip
0.23 234: add r0, r0, #88064 ; 0x15800
cake_dequeue():
for (tin = 0; tin < q->tin_cnt; tin++) {
0.09 add ip, ip, #1
0.08 add r0, r0, #192 ; 0xc0
0.19 240: cmp lr, ip
0.29 ↑ bne 1d8
}
}
}
q->cur_tin = best_tin;
b = q->tins + best_tin;
0.34 movw r3, #22720 ; 0x58c0
0.09 movt r3, #1
q->cur_tin = best_tin;
ldr r2, [sp, #56] ; 0x38
/* No point in going further if no packets to deliver. */
if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
0.30 movw r0, #2094 ; 0x82e
b = q->tins + best_tin;
0.01 mla r3, r3, r6, r7
q->cur_tin = best_tin;
0.07 strh r6, [r2, #96] ; 0x60
if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
0.26 movw r2, #2092 ; 0x82c
0.01 add r9, r3, #86016 ; 0x15000
b = q->tins + best_tin;
0.22 str r3, [sp, #60] ; 0x3c
if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
ldrh r2, [r9, r2]
0.10 ldrh r3, [r9, r0]
0.08 cmn r3, r2
0.06 ↑ beq 5c
return NULL;
}
retry:
/* service this class */
head = &b->decaying_flows;
0.06 27c: ldr r3, [sp, #60] ; 0x3c
if (!first_flow || list_empty(head)) {
0.04 cmp r1, #0
head = &b->decaying_flows;
0.01 add r3, r3, #88064 ; 0x15800
0.11 str r3, [sp, #76] ; 0x4c
0.02 add r3, r3, #72 ; 0x48
0.10 str r3, [sp, #80] ; 0x50
if (!first_flow || list_empty(head)) {
0.13 ↓ beq 2b4
__read_once_size():
})
static __always_inline
void __read_once_size(const volatile void *p, void *res, int size)
{
__READ_ONCE_SIZE;
0.14 ldr r2, [r9, #2120] ; 0x848
cake_dequeue():
0.03 cmp r3, r2
__read_once_size():
0.00 str r2, [sp, #84] ; 0x54
cake_dequeue():
moveq r1, #1
0.02 ↓ beq 2bc
0.01 ↓ b 2fc
0.01 2b0: mov r9, fp
head = &b->decaying_flows;
0.29 2b4: ldr r3, [sp, #80] ; 0x50
str r3, [sp, #84] ; 0x54
head = &b->new_flows;
0.11 2bc: ldr r3, [sp, #76] ; 0x4c
__read_once_size():
0.80 ldr r2, [r9, #2104] ; 0x838
cake_dequeue():
0.09 add r3, r3, #56 ; 0x38
if (list_empty(head)) {
1.12 cmp r3, r2
0.20 ↓ bne 304
head = &b->old_flows;
2.02 ldr r3, [sp, #76] ; 0x4c
__read_once_size():
0.18 ldr r2, [r9, #2112] ; 0x840
cake_dequeue():
1.14 add r3, r3, #64 ; 0x40
if (unlikely(list_empty(head))) {
0.05 cmp r3, r2
1.00 ↓ bne 304
__read_once_size():
ldr r3, [r9, #2120] ; 0x848
cake_dequeue():
head = &b->decaying_flows;
if (unlikely(list_empty(head)))
ldr r2, [sp, #84] ; 0x54
cmp r2, r3
↑ beq 4c
mov r3, r2
↓ b 304
0.12 2fc: ldr r3, [sp, #80] ; 0x50
str r3, [sp, #84] ; 0x54
goto begin;
}
}
}
flow = list_first_entry(head, struct cake_flow, flowchain);
0.26 304: ldr r8, [r3]
q->cur_flow = flow - b->flows;
0.89 ldr r2, [sp, #60] ; 0x3c
flow = list_first_entry(head, struct cake_flow, flowchain);
0.09 sub r3, r8, #8
q->cur_flow = flow - b->flows;
0.84 sub r3, r3, r2
0.24 ldr r2, [sp, #56] ; 0x38
1.82 asr r3, r3, #6
0.10 strh r3, [r2, #98] ; 0x62
srchost = &b->hosts[flow->srchost];
dsthost = &b->hosts[flow->dsthost];
host_load = 1;
/* flow isolation (DRR++) */
if (flow->deficit <= 0) {
1.01 ldr r3, [r8, #8]
srchost = &b->hosts[flow->srchost];
0.13 ldrh r2, [r8, #48] ; 0x30
if (flow->deficit <= 0) {
2.08 cmp r3, #0
dsthost = &b->hosts[flow->dsthost];
0.14 ldrh r3, [r8, #50] ; 0x32
srchost = &b->hosts[flow->srchost];
1.63 str r2, [sp, #88] ; 0x58
0.27 movgt sl, r8
dsthost = &b->hosts[flow->dsthost];
0.98 str r3, [sp, #92] ; 0x5c
0.17 movgt fp, r9
if (flow->deficit <= 0) {
3.57 ↓ bgt 4ec
/* Keep all flows with deficits out of the sparse and decaying
* rotations. No non-empty flow can go into the decaying
* rotation, so they can't get deficits
*/
if (flow->set == CAKE_SET_SPARSE) {
0.24 ldrb r3, [r8, #52] ; 0x34
1.66 cmp r3, #1
0.04 ldrne r3, [sp, #52] ; 0x34
0.97 addne r3, r3, #16384 ; 0x4000
0.30 ↓ bne 408
if (flow->head) {
ldr r3, [r8, #-8]
0.02 cmp r3, #0
↓ beq 3f8
b->sparse_flow_count--;
b->bulk_flow_count++;
if (cake_dsrc(q->flow_mode))
ldr r3, [sp, #52] ; 0x34
b->sparse_flow_count--;
movw ip, #2094 ; 0x82e
b->bulk_flow_count++;
movw r0, #2092 ; 0x82c
b->sparse_flow_count--;
ldrh r1, [r9, ip]
b->bulk_flow_count++;
0.02 ldrh r2, [r9, r0]
if (cake_dsrc(q->flow_mode))
add r3, r3, #16384 ; 0x4000
b->sparse_flow_count--;
sub r1, r1, #1
strh r1, [r9, ip]
b->bulk_flow_count++;
add r2, r2, #1
strh r2, [r9, r0]
if (cake_dsrc(q->flow_mode))
ldrb r2, [r3, #273] ; 0x111
and r1, r2, #5
cmp r1, #5
↓ bne 3c0
srchost->srchost_bulk_flow_count++;
ldr r1, [sp, #88] ; 0x58
mov r2, #12
ldr r0, [sp, #60] ; 0x3c
mla r2, r2, r1, r0
add r2, r2, #75776 ; 0x12800
ldrh r1, [r2, #8]
add r1, r1, #1
strh r1, [r2, #8]
ldrb r2, [r3, #273] ; 0x111
if (cake_ddst(q->flow_mode))
0.02 3c0: and r2, r2, #6
cmp r2, #6
↓ bne 3ec
dsthost->dsthost_bulk_flow_count++;
ldr r1, [sp, #92] ; 0x5c
mov r2, #12
0.01 ldr r0, [sp, #60] ; 0x3c
mla r2, r2, r1, r0
add r2, r2, #75776 ; 0x12800
ldrh r1, [r2, #10]
add r1, r1, #1
strh r1, [r2, #10]
flow->set = CAKE_SET_BULK;
3ec: mov r2, #3
0.01 strb r2, [r8, #52] ; 0x34
0.01 ↓ b 408
0.03 3f8: ldr r3, [sp, #52] ; 0x34
} else {
/* we've moved it to the bulk rotation for
* correct deficit accounting but we still want
* to count it as a sparse flow, not a bulk one.
*/
flow->set = CAKE_SET_SPARSE_WAIT;
mov r2, #2
strb r2, [r8, #52] ; 0x34
add r3, r3, #16384 ; 0x4000
}
}
if (cake_dsrc(q->flow_mode))
1.22 408: ldrb r3, [r3, #273] ; 0x111
0.11 and r2, r3, #5
1.42 and r3, r3, #6
0.09 cmp r2, #5
1.00 ↓ bne 10cc
host_load = max(host_load, srchost->srchost_bulk_flow_count);
ldr r1, [sp, #88] ; 0x58
mov r2, #12
ldr r0, [sp, #60] ; 0x3c
mla r2, r2, r1, r0
add r2, r2, #75776 ; 0x12800
ldrh r2, [r2, #8]
cmp r2, #0
↓ beq 10cc
if (cake_ddst(q->flow_mode))
cmp r3, #6
movne r4, r2
↓ bne 474
↓ b 450
host_load = 1;
0.20 44c: mov r2, #1
host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2.19 450: ldr r1, [sp, #92] ; 0x5c
0.10 mov r3, #12
4.08 ldr r0, [sp, #60] ; 0x3c
0.08 mla r3, r3, r1, r0
3.43 add r3, r3, #75776 ; 0x12800
1.75 ldrh r4, [r3, #10]
cmp r2, r4
1.00 movcs r4, r2
uxth r2, r4
WARN_ON(host_load > CAKE_QUEUES);
0.99 474: cmp r2, #1024 ; 0x400
↓ bls 48c
movw r0, #0
movw r1, #2141 ; 0x85d
movt r0, #0
→ bl qdisc_peek_dequeued
/* The shifted prandom_u32() is a way to apply dithering to
* avoid accumulating roundoff errors
*/
flow->deficit += (b->flow_quantum * quantum_div[host_load] +
1.91 48c: ldr r2, [sp, #44] ; 0x2c
lsl r4, r4, #1
1.11 movw r3, #2052 ; 0x804
ldrh r3, [r9, r3]
2.62 ldrh r4, [r2, r4]
mul r4, r4, r3
(prandom_u32() >> 16)) >> 16;
3.53 → bl qdisc_peek_dequeued
__list_del_entry():
static inline void __list_del_entry(struct list_head *entry)
{
if (!__list_del_entry_valid(entry))
return;
__list_del(entry->prev, entry->next);
2.75 ldr ip, [r8, #4]
0.04 ldr lr, [r8]
cake_dequeue():
first_flow = false;
1.31 mov r1, #0
flow->deficit += (b->flow_quantum * quantum_div[host_load] +
0.10 ldr r2, [r8, #8]
0.90 add r0, r4, r0, lsr #16
list_move_tail(&flow->flowchain, &b->old_flows);
ldr r3, [sp, #76] ; 0x4c
flow->deficit += (b->flow_quantum * quantum_div[host_load] +
1.98 add r2, r2, r0, lsr #16
0.01 str r2, [r8, #8]
__list_del():
next->prev = prev;
1.88 str ip, [lr, #4]
cake_dequeue():
list_move_tail(&flow->flowchain, &b->old_flows);
0.02 add r3, r3, #64 ; 0x40
__write_once_size():
static __always_inline void __write_once_size(volatile void *p, void *res, int size)
{
switch (size) {
case 1: *(volatile __u8 *)p = *(__u8 *)res; break;
case 2: *(volatile __u16 *)p = *(__u16 *)res; break;
case 4: *(volatile __u32 *)p = *(__u32 *)res; break;
0.92 str lr, [ip]
list_add_tail():
__list_add(new, head->prev, head);
0.02 ldr r2, [r9, #2116] ; 0x844
__list_add():
next->prev = new;
0.77 str r8, [r9, #2116] ; 0x844
new->next = next;
0.07 str r3, [r8]
new->prev = prev;
0.86 str r2, [r8, #4]
__write_once_size():
0.02 str r8, [r2]
1.66 ↑ b 2bc
cake_dequeue():
goto retry;
}
/* Retrieve a packet via the AQM */
while (1) {
skb = cake_dequeue_one(sch);
1.11 4ec: ldr r0, [sp, #52] ; 0x34
0.06 → bl cake_dequeue_one
if (!skb) {
0.33 subs r3, r0, #0
0.18 str r3, [sp, #48] ; 0x30
0.08 ↓ bne 840
cobalt_queue_empty():
if (vars->p_drop &&
0.01 ldr ip, [sl, #40] ; 0x28
0.04 mov r8, sl
mov r9, fp
0.06 cmp ip, #0
↓ beq 1118
ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
ldrd r0, [sl, #32]
add r3, fp, #2064 ; 0x810
ldrd r6, [sp, #24]
if (vars->p_drop &&
ldrd r2, [r3]
ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
subs r4, r6, r0
sbc r5, r7, r1
if (vars->p_drop &&
cmp r5, r3
ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
mov r0, r4
if (vars->p_drop &&
cmpeq r4, r2
↓ bls 112c
if (vars->p_drop < p->p_dec)
ldr r3, [fp, #2084] ; 0x824
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
ldr r1, [sl, #16]
if (vars->p_drop < p->p_dec)
cmp ip, r3
↓ bcs 570
vars->p_drop = 0;
ldr r3, [sp, #48] ; 0x30
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
cmp r1, #0
vars->blue_timer = now;
mov r4, r6
mov r5, r7
strd r4, [sl, #32]
vars->p_drop = 0;
str r3, [sl, #40] ; 0x28
vars->dropping = false;
strb r3, [sl, #44] ; 0x2c
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
↓ bne 10e8
↓ b 638
vars->p_drop -= p->p_dec;
570: sub r3, ip, r3
vars->dropping = false;
ldr r0, [sp, #48] ; 0x30
clz ip, r3
vars->p_drop -= p->p_dec;
str r3, [sl, #40] ; 0x28
vars->blue_timer = now;
ldrd r2, [sp, #24]
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
cmp r1, #0
lsr ip, ip, #5
vars->dropping = false;
strb r0, [sl, #44] ; 0x2c
vars->blue_timer = now;
strd r2, [sl, #32]
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
↓ beq 630
ldrd r6, [sl, #24]
subs r2, r2, r6
sbc r3, r3, r7
cmp r2, #0
sbcs r3, r3, #0
↓ blt 630
vars->count--;
0.01 5b0: sub r2, r1, #1
0.02 str r2, [r8, #16]
cobalt_invsqrt():
if (vars->count < REC_INV_SQRT_CACHE)
cmp r2, #15
↓ bhi 5d4
vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
ldr r3, [sp, #44] ; 0x2c
add r2, r3, r2, lsl #2
ldr r1, [r2, #2052] ; 0x804
0.00 str r1, [r8, #20]
↓ b 610
cobalt_newton_step():
invsqrt = vars->rec_inv_sqrt;
5d4: ldr r0, [r8, #20]
invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
umull r4, r5, r0, r0
val = (3LL << 32) - ((u64)vars->count * invsqrt2);
mov r4, #0
umull r2, r3, r5, r2
mov r5, #3
subs r4, r4, r2
sbc r5, r5, r3
val >>= 2; /* avoid overflow in following multiply */
lsr r2, r4, #2
orr r2, r2, r5, lsl #30
lsr r1, r5, #2
val = (val * invsqrt) >> (32 - 2 + 1);
umull r2, r3, r2, r0
mla r3, r0, r1, r3
lsr r1, r2, #31
orr r1, r1, r3, lsl #1
vars->rec_inv_sqrt = val;
str r1, [r8, #20]
reciprocal_scale():
*
* Return: a result based on @val in interval [0, @ep_ro).
*/
static inline u32 reciprocal_scale(u32 val, u32 ep_ro)
{
return (u32)(((u64) val * ep_ro) >> 32);
610: ldr r2, [r9, #2056] ; 0x808
0.02 mov r0, #0
mov fp, r0
umull r2, r3, r2, r1
mla sl, r1, r0, r3
cobalt_control():
return ktime_add_ns(t, reciprocal_scale(interval,
adds r4, sl, r6
adc r5, r0, r7
cobalt_queue_empty():
vars->drop_next = cobalt_control(vars->drop_next,
strd r4, [r8, #24]
cake_dequeue():
/* this queue was actually empty */
if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
630: cmp ip, #0
↓ beq 648
b->unresponsive_flow_count--;
638: movw r2, #2098 ; 0x832
ldrh r3, [r9, r2]
sub r3, r3, #1
strh r3, [r9, r2]
0.04 648: ldr ip, [r8, #40] ; 0x28
if (flow->cvars.p_drop || flow->cvars.count ||
0.04 64c: cmp ip, #0
ldr r0, [r8, #4]
ldr r1, [r8]
↓ bne 67c
0.26 65c: ldr r3, [r8, #16]
cmp r3, #0
0.04 ↓ bne 67c
ktime_compare():
if (cmp1 < cmp2)
0.01 ldrd r2, [r8, #24]
0.10 ldrd r4, [sp, #24]
cmp r4, r2
0.25 sbcs r3, r5, r3
↓ bge 76c
__list_del():
next->prev = prev;
0.27 67c: str r0, [r1, #4]
__list_add():
new->next = next;
ldr r2, [sp, #84] ; 0x54
__write_once_size():
0.03 str r1, [r0]
list_add_tail():
__list_add(new, head->prev, head);
ldr r3, [r9, #2124] ; 0x84c
__list_add():
next->prev = new;
0.21 str r8, [r9, #2124] ; 0x84c
new->prev = prev;
0.01 strd r2, [r8]
__write_once_size():
0.13 str r8, [r3]
cake_dequeue():
/* keep in the flowchain until the state has
* decayed to rest
*/
list_move_tail(&flow->flowchain,
&b->decaying_flows);
if (flow->set == CAKE_SET_BULK) {
ldrb r3, [r8, #52] ; 0x34
0.05 cmp r3, #3
↓ bne 730
b->bulk_flow_count--;
if (cake_dsrc(q->flow_mode))
0.04 ldr r3, [sp, #52] ; 0x34
b->bulk_flow_count--;
movw r1, #2092 ; 0x82c
0.01 ldrh r2, [r9, r1]
if (cake_dsrc(q->flow_mode))
add r3, r3, #16384 ; 0x4000
b->bulk_flow_count--;
0.02 sub r2, r2, #1
strh r2, [r9, r1]
if (cake_dsrc(q->flow_mode))
ldrb r2, [r3, #273] ; 0x111
and r1, r2, #5
0.01 cmp r1, #5
↓ bne 6f0
srchost->srchost_bulk_flow_count--;
ldr r1, [sp, #88] ; 0x58
mov r2, #12
ldr r0, [sp, #60] ; 0x3c
mla r2, r2, r1, r0
add r2, r2, #75776 ; 0x12800
ldrh r1, [r2, #8]
sub r1, r1, #1
strh r1, [r2, #8]
ldrb r2, [r3, #273] ; 0x111
if (cake_ddst(q->flow_mode))
6f0: and r2, r2, #6
cmp r2, #6
↓ bne 71c
dsthost->dsthost_bulk_flow_count--;
0.01 ldr r2, [sp, #92] ; 0x5c
mov r3, #12
ldr r1, [sp, #60] ; 0x3c
mla r3, r3, r2, r1
add r3, r3, #75776 ; 0x12800
ldrh r2, [r3, #10]
sub r2, r2, #1
strh r2, [r3, #10]
b->decaying_flow_count++;
71c: add r2, r9, #2096 ; 0x830
0.01 ldrh r3, [r2]
add r3, r3, #1
0.01 strh r3, [r2]
↓ b 75c
} else if (flow->set == CAKE_SET_SPARSE ||
0.11 730: sub r3, r3, #1
0.22 cmp r3, #1
↓ bhi 75c
flow->set == CAKE_SET_SPARSE_WAIT) {
b->sparse_flow_count--;
0.02 movw r1, #2094 ; 0x82e
b->decaying_flow_count++;
0.01 add r2, r9, #2096 ; 0x830
b->sparse_flow_count--;
0.01 ldrh r3, [r9, r1]
0.04 sub r3, r3, #1
0.00 strh r3, [r9, r1]
b->decaying_flow_count++;
0.01 ldrh r3, [r2]
0.01 add r3, r3, #1
0.01 strh r3, [r2]
}
flow->set = CAKE_SET_DECAYING;
0.37 75c: mov r3, #4
first_flow = false;
0.01 mov r1, #0
flow->set = CAKE_SET_DECAYING;
0.01 strb r3, [r8, #52] ; 0x34
↑ b 4c
__list_del():
next->prev = prev;
0.03 76c: str r0, [r1, #4]
__write_once_size():
str r1, [r0]
cake_dequeue():
} else {
/* remove empty queue from the flowchain */
list_del_init(&flow->flowchain);
if (flow->set == CAKE_SET_SPARSE ||
0.03 ldrb r3, [r8, #52] ; 0x34
__write_once_size():
str r8, [r8]
cake_dequeue():
sub r2, r3, #1
INIT_LIST_HEAD():
list->prev = list;
str r8, [r8, #4]
cake_dequeue():
cmp r2, #1
↓ bhi 7a0
flow->set == CAKE_SET_SPARSE_WAIT)
b->sparse_flow_count--;
movw r2, #2094 ; 0x82e
ldrh r3, [r9, r2]
sub r3, r3, #1
strh r3, [r9, r2]
↓ b 834
else if (flow->set == CAKE_SET_BULK) {
7a0: cmp r3, #3
0.01 ↓ bne 824
b->bulk_flow_count--;
if (cake_dsrc(q->flow_mode))
ldr r3, [sp, #52] ; 0x34
b->bulk_flow_count--;
movw r1, #2092 ; 0x82c
ldrh r2, [r9, r1]
if (cake_dsrc(q->flow_mode))
add r3, r3, #16384 ; 0x4000
b->bulk_flow_count--;
sub r2, r2, #1
strh r2, [r9, r1]
if (cake_dsrc(q->flow_mode))
ldrb r2, [r3, #273] ; 0x111
and r1, r2, #5
cmp r1, #5
↓ bne 7f4
srchost->srchost_bulk_flow_count--;
ldr r1, [sp, #88] ; 0x58
mov r2, #12
ldr r0, [sp, #60] ; 0x3c
mla r2, r2, r1, r0
add r2, r2, #75776 ; 0x12800
ldrh r1, [r2, #8]
sub r1, r1, #1
strh r1, [r2, #8]
ldrb r2, [r3, #273] ; 0x111
if (cake_ddst(q->flow_mode))
7f4: and r2, r2, #6
cmp r2, #6
↓ bne 834
dsthost->dsthost_bulk_flow_count--;
ldr r2, [sp, #92] ; 0x5c
mov r3, #12
ldr r1, [sp, #60] ; 0x3c
mla r3, r3, r2, r1
add r3, r3, #75776 ; 0x12800
ldrh r2, [r3, #10]
sub r2, r2, #1
strh r2, [r3, #10]
↓ b 834
} else
b->decaying_flow_count--;
824: add r2, r9, #2096 ; 0x830
ldrh r3, [r2]
0.01 sub r3, r3, #1
strh r3, [r2]
flow->set = CAKE_SET_NONE;
0.01 834: mov r1, #0
strb r1, [r8, #52] ; 0x34
↑ b 4c
cobalt_should_drop():
sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
0.21 840: ldr r3, [sp, #48] ; 0x30
over_target = sojourn > p->target &&
0.01 add r0, fp, #2064 ; 0x810
schedule = ktime_sub(now, vars->drop_next);
0.03 ldrd r4, [sl, #24]
0.01 ldrd r6, [sp, #24]
sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
0.21 ldrd r2, [r3, #32]
schedule = ktime_sub(now, vars->drop_next);
0.03 subs r8, r6, r4
0.18 ldr r1, [sl, #16]
sbc r9, r7, r5
sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
0.05 subs r6, r6, r2
0.01 sbc r7, r7, r3
schedule = ktime_sub(now, vars->drop_next);
0.04 strd r4, [sp, #32]
cmp r1, #0
sojourn > p->mtu_time * bulk_flows * 2 &&
0.05 ldrd r4, [r0]
0.01 mvn lr, r9
sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
0.07 strd r6, [sp, #16]
lsr lr, lr, #31
0.12 moveq lr, #0
sojourn > p->mtu_time * bulk_flows * 2 &&
0.01 cmp r7, r5
0.06 ldrb ip, [sl, #44] ; 0x2c
cmpeq r6, r4
0.46 ↓ bls 8f8
cake_dequeue():
}
/* Last packet in queue may be marked, shouldn't be dropped */
if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
(b->bulk_flow_count *
!!(q->rate_flags &
0.01 ldr r4, [sp, #56] ; 0x38
(b->bulk_flow_count *
0.01 movw r5, #2092 ; 0x82c
ldrh r5, [fp, r5]
cobalt_should_drop():
sojourn > p->mtu_time * bulk_flows * 2 &&
0.01 ldrd r6, [r0, #8]
cake_dequeue():
!!(q->rate_flags &
ldrh r4, [r4, #56] ; 0x38
cobalt_should_drop():
sojourn > p->mtu_time * bulk_flows * 2 &&
0.04 adds r2, r6, r6
cake_dequeue():
!!(q->rate_flags &
ubfx r4, r4, #2, #1
cobalt_should_drop():
sojourn > p->mtu_time * bulk_flows * 2 &&
0.04 adc r3, r7, r7
0.02 mov r6, r2
cake_dequeue():
(b->bulk_flow_count *
mul r4, r5, r4
cobalt_should_drop():
sojourn > p->mtu_time * bulk_flows * 2 &&
mov r7, r3
0.01 strd r6, [sp, #64] ; 0x40
umull r6, r7, r4, r2
0.04 ldrd r2, [sp, #64] ; 0x40
0.03 mla r7, r4, r3, r7
over_target = sojourn > p->target &&
ldrd r4, [sp, #16]
0.05 cmp r5, r7
cmpeq r4, r6
0.01 ↓ bls 8f8
sojourn > p->mtu_time * 4;
0.01 adds r4, r2, r2
0.01 adc r5, r3, r3
sojourn > p->mtu_time * bulk_flows * 2 &&
ldrd r2, [sp, #16]
0.05 cmp r3, r5
cmpeq r2, r4
0.02 ↓ bhi 90c
} else if (vars->dropping) {
0.21 8f8: cmp ip, #0
vars->ecn_marked = false;
0.04 mov r3, #0
strb r3, [sl, #45] ; 0x2d
} else if (vars->dropping) {
0.03 ↓ bne 964
↓ b 974
if (!vars->dropping) {
0.01 90c: cmp ip, #0
vars->ecn_marked = false;
mov r3, #0
0.03 strb r3, [sl, #45] ; 0x2d
if (!vars->dropping) {
0.01 ↓ bne 954
vars->dropping = true;
0.03 mov r3, #1
strb r3, [sl, #44] ; 0x2c
reciprocal_scale():
ldrd r2, [r0, #-8]
mov r5, #0
ldr r0, [sl, #20]
mov r3, r5
0.07 umull r4, r5, r2, r0
mla r6, r0, r3, r5
cobalt_control():
return ktime_add_ns(t, reciprocal_scale(interval,
ldrd r2, [sp, #24]
0.00 adds r2, r2, r6
adc r3, r3, ip
0.01 mov r6, r2
mov r7, r3
cobalt_should_drop():
vars->drop_next = cobalt_control(now,
strd r6, [sl, #24]
if (!vars->count)
0.02 954: cmp r1, #0
vars->count = 1;
moveq r3, #1
0.03 streq r3, [sl, #16]
0.01 ↓ b 974
if (next_due && vars->dropping) {
0.01 964: cmp lr, #0
vars->dropping = false;
strb r3, [sl, #44] ; 0x2c
if (next_due && vars->dropping) {
↓ bne b98
↓ b cbc
0.07 974: cmp lr, #0
↓ beq cbc
0.02 ldrb r3, [sl, #44] ; 0x2c
cmp r3, #0
ldrdeq r2, [sl, #24]
ldreq r1, [sl, #16]
strdeq r2, [sp, #32]
↓ beq b98
INET_ECN_set_ce():
ipv6_change_dsfield(inner, INET_ECN_MASK, dscp);
}
static inline int INET_ECN_set_ce(struct sk_buff *skb)
{
switch (skb->protocol) {
0.02 ldr r3, [sp, #48] ; 0x30
ldrh r3, [r3, #148] ; 0x94
cmp r3, #8
↓ beq 9b4
movw r2, #56710 ; 0xdd86
cmp r3, r2
↓ beq a3c
↓ b ad0
skb_network_header():
skb->transport_header += offset;
}
static inline unsigned char *skb_network_header(const struct sk_buff *skb)
{
return skb->head + skb->network_header;
0.01 9b4: ldr r0, [sp, #48] ; 0x30
ldrh r2, [r0, #152] ; 0x98
ldr r1, [r0, #164] ; 0xa4
INET_ECN_set_ce():
case cpu_to_be16(ETH_P_IP):
if (skb_network_header(skb) + sizeof(struct iphdr) <=
add r3, r2, #20
ldr r0, [r0, #156] ; 0x9c
add r3, r1, r3
cmp r0, r3
↓ bcc ad0
skb_network_header():
add r2, r1, r2
IP_ECN_set_ce():
u32 ecn = (iph->tos + 1) & INET_ECN_MASK;
ldrb r1, [r2, #1]
add r3, r1, #1
if (!(ecn & 2))
tst r3, #2
u32 ecn = (iph->tos + 1) & INET_ECN_MASK;
0.03 and r3, r3, #3
if (!(ecn & 2))
↓ bne a04
return !ecn;
0.01 cmp r3, #0
moveq r2, #1
movne r2, #0
movne r4, #1
moveq r4, #0
↓ b adc
u32 check = (__force u32)iph->check;
0.01 a04: ldrh r0, [r2, #10]
iph->tos |= INET_ECN_CE;
orr r1, r1, #3
check += (__force u16)htons(0xFFFB) + (__force u16)htons(ecn);
rev16 r3, r3
iph->tos |= INET_ECN_CE;
strb r1, [r2, #1]
add r1, r0, #64256 ; 0xfb00
mov r4, #0
add r1, r1, #255 ; 0xff
iph->check = (__force __sum16)(check + (check>=0xFFFF));
movw r0, #65534 ; 0xfffe
check += (__force u16)htons(0xFFFB) + (__force u16)htons(ecn);
0.04 uxtah r3, r1, r3
iph->check = (__force __sum16)(check + (check>=0xFFFF));
cmp r3, r0
addhi r3, r3, #1
strh r3, [r2, #10]
iph->tos |= INET_ECN_CE;
mov r2, #1
↓ b adc
skb_network_header():
a3c: ldr lr, [sp, #48] ; 0x30
ldrh r1, [lr, #152] ; 0x98
ldr r0, [lr, #164] ; 0xa4
INET_ECN_set_ce():
skb_tail_pointer(skb))
return IP_ECN_set_ce(ip_hdr(skb));
break;
case cpu_to_be16(ETH_P_IPV6):
if (skb_network_header(skb) + sizeof(struct ipv6hdr) <=
add r3, r1, #40 ; 0x28
ldr r2, [lr, #156] ; 0x9c
add r3, r0, r3
cmp r2, r3
↓ bcc ad0
ipv6_get_dsfield():
}
static inline __u8 ipv6_get_dsfield(const struct ipv6hdr *ipv6h)
{
return ntohs(*(const __be16 *)ipv6h) >> 4;
ldrh r2, [r0, r1]
rev16 r2, r2
IP6_ECN_set_ce():
if (INET_ECN_is_not_ect(ipv6_get_dsfield(iph)))
ubfx r2, r2, #4, #2
cmp r2, #0
moveq r4, #1
↓ beq adc
from = *(__be32 *)iph;
ldr r2, [r0, r1]
to = from | htonl(INET_ECN_CE << 20);
orr ip, r2, #12288 ; 0x3000
*(__be32 *)iph = to;
str ip, [r0, r1]
if (skb->ip_summed == CHECKSUM_COMPLETE)
ldrb r3, [lr, #104] ; 0x68
and r3, r3, #96 ; 0x60
cmp r3, #64 ; 0x40
movne r4, #0
movne r2, #1
↓ bne adc
ldr r3, [lr, #112] ; 0x70
csum_sub():
}
#endif
static inline __wsum csum_sub(__wsum csum, __wsum addend)
{
return csum_add(csum, ~addend);
mvn r2, r2
mov r1, #0
IP6_ECN_set_ce():
skb->csum = csum_add(csum_sub(skb->csum, (__force __wsum)from),
mov r4, #0
adds r3, r3, r2
mov r2, #1
csum_add():
res += (__force u32)addend;
add r3, r3, ip
movcs r1, #1
add r3, r3, r1
IP6_ECN_set_ce():
ldr r1, [sp, #48] ; 0x30
csum_add():
return (__force __wsum)(res + (res < (__force u32)addend));
cmp ip, r3
addhi r3, r3, #1
IP6_ECN_set_ce():
str r3, [r1, #112] ; 0x70
↓ b adc
INET_ECN_set_ce():
if (skb_network_header(skb) + sizeof(struct ipv6hdr) <=
ad0: mov r4, #1
mov r2, #0
↓ b adc
cobalt_should_drop():
vars->count++;
0.03 adc: ldr r3, [sl, #16]
drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
strb r2, [sl, #45] ; 0x2d
vars->count++;
add r3, r3, #1
str r3, [sl, #16]
if (!vars->count)
cmp r3, #0
vars->count--;
mvneq r2, #0
moveq ip, r3
streq r2, [sl, #16]
↓ beq b24
cobalt_invsqrt():
if (vars->count < REC_INV_SQRT_CACHE)
cmp r3, #15
↓ bhi b1c
vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
ldr r2, [sp, #44] ; 0x2c
add r3, r2, r3, lsl #2
ldr ip, [r3, #2052] ; 0x804
0.01 str ip, [sl, #20]
↓ b b68
b1c: mov r2, r3
mov ip, #0
cobalt_newton_step():
invsqrt = vars->rec_inv_sqrt;
b24: ldr lr, [sl, #20]
invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
umull r0, r1, lr, lr
val = (3LL << 32) - ((u64)vars->count * invsqrt2);
mov r0, #0
umull r2, r3, r1, r2
subs r0, r0, r2
val >>= 2; /* avoid overflow in following multiply */
lsr r2, r0, #2
val = (3LL << 32) - ((u64)vars->count * invsqrt2);
mla r3, ip, r1, r3
mov r1, #3
sbc r1, r1, r3
mov r7, r1
val >>= 2; /* avoid overflow in following multiply */
lsr r1, r1, #2
orr r2, r2, r7, lsl #30
val = (val * invsqrt) >> (32 - 2 + 1);
umull r2, r3, r2, lr
mla r3, lr, r1, r3
lsr ip, r2, #31
orr ip, ip, r3, lsl #1
vars->rec_inv_sqrt = val;
str ip, [sl, #20]
reciprocal_scale():
b68: ldr r0, [fp, #2056] ; 0x808
mov lr, #0
cobalt_control():
return ktime_add_ns(t, reciprocal_scale(interval,
ldrd r6, [sl, #24]
reciprocal_scale():
umull r0, r1, r0, ip
mla r8, ip, lr, r1
cobalt_control():
adds r2, r8, r6
cobalt_should_drop():
schedule = ktime_sub(now, vars->drop_next);
ldrd r8, [sp, #24]
cobalt_control():
return ktime_add_ns(t, reciprocal_scale(interval,
adc r3, lr, r7
cobalt_should_drop():
vars->drop_next = cobalt_control(vars->drop_next,
strd r2, [sl, #24]
schedule = ktime_sub(now, vars->drop_next);
subs r8, r8, r2
sbc r9, r9, r3
↓ b cc0
vars->drop_next = cobalt_control(vars->drop_next,
0.03 b98: add ip, fp, #2064 ; 0x810
cobalt_newton_step():
invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
mov lr, #0
cobalt_should_drop():
vars->count--;
ba0: sub r1, r1, #1
cobalt_newton_step():
val = (val * invsqrt) >> (32 - 2 + 1);
mov r3, #0
mov r2, #0
cobalt_invsqrt():
if (vars->count < REC_INV_SQRT_CACHE)
cmp r1, #15
cobalt_newton_step():
val = (val * invsqrt) >> (32 - 2 + 1);
strd r2, [sp, #16]
cobalt_invsqrt():
vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
ldr r3, [sp, #44] ; 0x2c
cobalt_should_drop():
vars->count--;
str r1, [sl, #16]
cobalt_invsqrt():
vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
add r0, r3, r1, lsl #2
if (vars->count < REC_INV_SQRT_CACHE)
↓ bhi c1c
vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
ldr r0, [r0, #2052] ; 0x804
reciprocal_scale():
mov r9, #0
cobalt_invsqrt():
str r0, [sl, #20]
reciprocal_scale():
ldrd r2, [ip, #-8]
umull r6, r7, r2, r0
cobalt_control():
return ktime_add_ns(t, reciprocal_scale(interval,
ldrd r2, [sp, #32]
reciprocal_scale():
mla r4, r0, r9, r7
cobalt_should_drop():
schedule = ktime_sub(now, vars->drop_next);
ldrd r8, [sp, #24]
cobalt_control():
return ktime_add_ns(t, reciprocal_scale(interval,
adds r2, r2, r4
adc r3, r3, lr
cobalt_should_drop():
schedule = ktime_sub(now, vars->drop_next);
subs r8, r8, r2
sbc r9, r9, r3
next_due = vars->count && ktime_to_ns(schedule) >= 0;
cmp r1, #0
cobalt_control():
return ktime_add_ns(t, reciprocal_scale(interval,
mov r4, r2
0.01 mov r5, r3
cobalt_should_drop():
vars->drop_next = cobalt_control(vars->drop_next,
strd r4, [sl, #24]
next_due = vars->count && ktime_to_ns(schedule) >= 0;
↓ bne ca0
if (vars->p_drop)
0.01 ldr r3, [sl, #40] ; 0x28
bool next_due, over_target, drop = false;
0.01 mov r4, r1
if (vars->p_drop)
cmp r3, #0
0.01 ↓ bne ccc
↓ b ce8
cobalt_newton_step():
invsqrt = vars->rec_inv_sqrt;
c1c: ldr r0, [sl, #20]
val = (3LL << 32) - ((u64)vars->count * invsqrt2);
mov r2, #0
mov r3, #3
invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
umull r4, r5, r0, r0
val = (3LL << 32) - ((u64)vars->count * invsqrt2);
umull r6, r7, r5, r1
subs r2, r2, r6
sbc r3, r3, r7
reciprocal_scale():
mov r7, #0
cobalt_newton_step():
val >>= 2; /* avoid overflow in following multiply */
lsr r4, r2, #2
orr r4, r4, r3, lsl #30
lsr r6, r3, #2
val = (val * invsqrt) >> (32 - 2 + 1);
umull r4, r5, r4, r0
mla r5, r0, r6, r5
lsr r3, r4, #31
orr r3, r3, r5, lsl #1
str r3, [sp, #16]
lsr r3, r5, #31
str r3, [sp, #20]
vars->rec_inv_sqrt = val;
ldrd r2, [sp, #16]
reciprocal_scale():
mov r3, #0
cobalt_newton_step():
str r2, [sl, #20]
reciprocal_scale():
ldrd r4, [ip, #-8]
mul r0, r4, r3
mla r0, r2, r7, r0
umull r2, r3, r4, r2
cobalt_control():
return ktime_add_ns(t, reciprocal_scale(interval,
ldrd r4, [sp, #32]
reciprocal_scale():
add r8, r0, r3
cobalt_control():
adds r4, r4, r8
cobalt_should_drop():
schedule = ktime_sub(now, vars->drop_next);
ldrd r8, [sp, #24]
cobalt_control():
return ktime_add_ns(t, reciprocal_scale(interval,
adc r5, r5, lr
cobalt_should_drop():
vars->drop_next = cobalt_control(vars->drop_next,
strd r4, [sl, #24]
schedule = ktime_sub(now, vars->drop_next);
subs r8, r8, r4
sbc r9, r9, r5
while (next_due) {
0.01 ca0: cmp r8, #0
sbcs r3, r9, #0
ldrdge r2, [sl, #24]
strdge r2, [sp, #32]
↑ bge ba0
bool next_due, over_target, drop = false;
mov r4, #0
↓ b cc0
0.16 cbc: mov r4, lr
if (vars->p_drop)
0.02 cc0: ldr r3, [sl, #40] ; 0x28
0.02 cmp r3, #0
↓ beq cdc
drop |= (prandom_u32() < vars->p_drop);
ccc: → bl qdisc_peek_dequeued
ldr r3, [sl, #40] ; 0x28
cmp r0, r3
orrcc r4, r4, #1
if (!vars->count)
0.56 cdc: ldr r3, [sl, #16]
cmp r3, #0
0.06 ↓ bne d0c
vars->drop_next = ktime_add_ns(now, p->interval);
0.13 ce8: add r3, fp, #2064 ; 0x810
0.05 ldrd r0, [sp, #24]
ldrd r2, [r3, #-8]
0.10 adds r0, r0, r2
adc r1, r1, r3
0.06 mov r2, r0
mov r3, r1
0.03 strd r2, [sl, #24]
↓ b d3c
else if (ktime_to_ns(schedule) > 0 && !drop)
0.17 d0c: cmp r8, #1
sbcs r3, r9, #0
0.09 eor r3, r4, #1
andge r3, r3, #1
0.03 movlt r3, #0
cmp r3, #0
0.04 ↓ beq d3c
vars->drop_next = now;
ldrd r2, [sp, #24]
mov r9, sl
mov r8, fp
strd r2, [sl, #24]
↓ b dfc
cake_dequeue():
if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
0.65 d3c: cmp r4, #0
↓ beq df4
CAKE_FLAG_INGRESS))) ||
ldr r3, [sl, #-8]
cmp r3, #0
↓ beq df4
!flow->head)
break;
/* drop this packet, get another one */
if (q->rate_flags & CAKE_FLAG_INGRESS) {
ldr r3, [sp, #56] ; 0x38
0.01 ldrh r3, [r3, #56] ; 0x38
tst r3, #4
↓ beq d98
len = cake_advance_shaper(q, b, skb,
ldrd r2, [sp, #24]
ldr r1, [sp, #60] ; 0x3c
ldr r0, [sp, #72] ; 0x48
strd r2, [sp]
mov r3, #1
0.01 ldr r2, [sp, #48] ; 0x30
str r3, [sp, #8]
0.01 → bl cake_advance_shaper
now, true);
flow->deficit -= len;
ldr r3, [sl, #8]
sub r3, r3, r0
str r3, [sl, #8]
b->tin_deficit -= len;
ldr r3, [fp, #2160] ; 0x870
sub r0, r3, r0
str r0, [fp, #2160] ; 0x870
}
flow->dropped++;
d98: ldr r3, [sl, #12]
b->tin_dropped++;
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
mov r1, #1
ldr r4, [sp, #52] ; 0x34
flow->dropped++;
add r3, r3, r1
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
ldr r5, [sp, #48] ; 0x30
flow->dropped++;
str r3, [sl, #12]
b->tin_dropped++;
ldr r3, [fp, #2168] ; 0x878
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
mov r0, r4
b->tin_dropped++;
add r3, r3, r1
str r3, [fp, #2168] ; 0x878
qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
ldr r2, [r5, #24]
→ bl qdisc_peek_dequeued
qstats_drop_inc():
sch->qstats.drops += count;
}
static inline void qstats_drop_inc(struct gnet_stats_queue *qstats)
{
qstats->drops++;
ldr r3, [r4, #108] ; 0x6c
cake_dequeue():
#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 8, 0)
qdisc_drop(skb, sch);
#else
qdisc_qstats_drop(sch);
kfree_skb(skb);
mov r0, r5
qstats_drop_inc():
add r3, r3, #1
str r3, [r4, #108] ; 0x6c
cake_dequeue():
→ bl qdisc_peek_dequeued
first_flow = false;
mov r1, #0
#endif
if (q->rate_flags & CAKE_FLAG_INGRESS)
ldr r3, [sp, #56] ; 0x38
ldrh r3, [r3, #56] ; 0x38
tst r3, #4
↑ beq 4ec
↑ b 2b0
0.30 df4: mov r9, sl
mov r8, fp
goto retry;
}
b->tin_ecn_mark += !!flow->cvars.ecn_marked;
0.02 dfc: ldrb r2, [r9, #45] ; 0x2d
bstats_update():
_bstats_update(bstats,
mov r7, #0
cake_dequeue():
0.08 ldr r3, [r8, #2172] ; 0x87c
_bstats_update():
bstats->bytes += bytes;
ldr lr, [sp, #52] ; 0x34
cake_dequeue():
add r3, r3, r2
skb_end_pointer():
return skb->end;
ldr r2, [sp, #48] ; 0x30
cake_dequeue():
0.33 str r3, [r8, #2172] ; 0x87c
_bstats_update():
ldr r4, [lr, #88] ; 0x58
skb_end_pointer():
0.03 ldr r3, [r2, #160] ; 0xa0
bstats_update():
_bstats_update(bstats,
ldr r6, [r2, #24]
cake_dequeue():
qdisc_bstats_update(sch, skb);
/* collect delay stats */
delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
0.05 ldr sl, [sp, #76] ; 0x4c
bstats_update():
ldrh r2, [r3, #4]
0.02 cmp r2, #0
ldrhne r2, [r3, #6]
0.32 moveq r2, #1
_bstats_update():
bstats->bytes += bytes;
ldr r3, [lr, #84] ; 0x54
0.05 str r3, [sp, #32]
bstats->packets += packets;
ldr r3, [lr, #92] ; 0x5c
bstats->bytes += bytes;
0.11 str r4, [sp, #36] ; 0x24
bstats->packets += packets;
add r3, r3, r2
bstats->bytes += bytes;
0.07 ldrd r4, [sp, #32]
bstats->packets += packets;
str r3, [lr, #92] ; 0x5c
cake_dequeue():
delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
0.20 ldr r3, [sp, #48] ; 0x30
_bstats_update():
bstats->bytes += bytes;
adds r4, r4, r6
0.05 adc r5, r5, r7
str r4, [lr, #84] ; 0x54
0.08 str r5, [lr, #88] ; 0x58
cake_dequeue():
ldrd r4, [r3, #32]
b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
0.03 ldrd r2, [sl, #152] ; 0x98
delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
ldrd r6, [sp, #24]
cake_ewma():
avg -= avg >> shift;
0.64 lsr r0, r2, #8
cake_dequeue():
delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
subs r6, r6, r4
cake_ewma():
avg -= avg >> shift;
0.01 orr r0, r0, r3, lsl #24
cake_dequeue():
delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
sbc r7, r7, r5
cake_ewma():
avg -= avg >> shift;
0.13 lsr r1, r3, #8
subs r2, r2, r0
0.02 sbc r3, r3, r1
avg += sample >> shift;
lsr lr, r6, #8
0.10 orr lr, lr, r7, lsl #24
str lr, [sp, #16]
0.16 lsr lr, r7, #8
str lr, [sp, #20]
avg -= avg >> shift;
0.06 mov r4, r2
mov r5, r3
avg += sample >> shift;
0.04 ldrd r2, [sp, #16]
adds r2, r2, r4
0.10 adc r3, r3, r5
cake_dequeue():
b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
strd r2, [sl, #152] ; 0x98
b->peak_delay = cake_ewma(b->peak_delay, delay,
0.08 ldrd r2, [sl, #160] ; 0xa0
cmp r3, r7
0.09 cmpeq r2, r6
↓ bcs ee8
0.02 mov r1, #2
lsr r0, r6, r1
orr r0, r0, r7, lsl #30
mov sl, r0
lsr r0, r7, r1
mov fp, r0
↓ b ef0
0.06 ee8: ldrd sl, [sp, #16]
0.01 mov r1, #8
delay > b->peak_delay ? 2 : 8);
b->base_delay = cake_ewma(b->base_delay, delay,
0.08 ef0: ldr lr, [sp, #76] ; 0x4c
cake_ewma():
avg -= avg >> shift;
0.09 rsb r0, r1, #32
0.01 lsr r4, r2, r1
0.01 sub ip, r1, #32
0.04 orr r4, r4, r3, lsl r0
avg += sample >> shift;
0.02 adds sl, sl, r2
avg -= avg >> shift;
0.03 orr r4, r4, r3, lsr ip
cake_dequeue():
b->base_delay = cake_ewma(b->base_delay, delay,
0.05 add r0, lr, #176 ; 0xb0
cake_ewma():
avg += sample >> shift;
0.05 adc fp, fp, r3
avg -= avg >> shift;
0.05 lsr r5, r3, r1
avg += sample >> shift;
0.03 subs r2, sl, r4
0.01 sbc r3, fp, r5
cake_dequeue():
b->peak_delay = cake_ewma(b->peak_delay, delay,
0.01 strd r2, [lr, #160] ; 0xa0
b->base_delay = cake_ewma(b->base_delay, delay,
0.01 ldrd r4, [r0, #-8]
0.08 cmp r5, r7
0.03 cmpeq r4, r6
0.02 movhi ip, #2
0.08 lsrhi r3, r6, ip
0.02 orrhi r3, r3, r7, lsl #30
0.03 movls ip, #8
0.02 strhi r3, [sp, #16]
cake_ewma():
avg -= avg >> shift;
0.04 rsb lr, ip, #32
lsrhi r3, r7, ip
0.02 lsr r2, r4, ip
0.04 strhi r3, [sp, #20]
0.08 sub r1, ip, #32
avg += sample >> shift;
0.02 ldrd sl, [sp, #16]
avg -= avg >> shift;
0.02 orr r2, r2, r5, lsl lr
orr r2, r2, r5, lsr r1
0.02 lsr r3, r5, ip
avg += sample >> shift;
0.03 adds sl, sl, r4
cake_dequeue():
delay < b->base_delay ? 2 : 8);
len = cake_advance_shaper(q, b, skb, now, false);
ldr r1, [sp, #60] ; 0x3c
cake_ewma():
avg += sample >> shift;
0.02 adc fp, fp, r5
0.01 subs r6, sl, r2
0.05 sbc r7, fp, r3
cake_dequeue():
len = cake_advance_shaper(q, b, skb, now, false);
0.03 ldrd r4, [sp, #24]
b->base_delay = cake_ewma(b->base_delay, delay,
0.03 strd r6, [r0, #-8]
len = cake_advance_shaper(q, b, skb, now, false);
0.02 mov ip, #0
0.07 ldr r6, [sp, #72] ; 0x48
0.02 ldr r2, [sp, #48] ; 0x30
0.02 strd r4, [sp]
0.09 mov r0, r6
0.03 str ip, [sp, #8]
0.03 → bl cake_advance_shaper
flow->deficit -= len;
0.22 ldr r3, [r9, #8]
b->tin_deficit -= len;
if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
0.01 movw r2, #16416 ; 0x4020
0.05 add r1, r6, r2
flow->deficit -= len;
0.04 sub r3, r3, r0
0.04 str r3, [r9, #8]
b->tin_deficit -= len;
0.22 ldr r3, [r8, #2160] ; 0x870
sub r3, r3, r0
0.02 str r3, [r8, #2160] ; 0x870
if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
ldrd r0, [r1, #-8]
ktime_compare():
if (cmp1 > cmp2)
0.01 cmp r4, r0
sbcs r3, r5, r1
cake_dequeue():
0.04 ldr r3, [sp, #52] ; 0x34
ldr r3, [r3, #76] ; 0x4c
ktime_compare():
0.14 ↓ bge 1010
cake_dequeue():
cmp r3, #0
0.05 ↓ beq 1070
u64 next = min(ktime_to_ns(q->time_next_packet),
0.17 ldr r3, [sp, #72] ; 0x48
0.02 add r2, r3, r2
ldrd r2, [r2]
0.05 cmp r0, r2
sbcs ip, r1, r3
0.15 movlt r3, r1
ktime_to_ns(q->failsafe_next_packet));
qdisc_watchdog_schedule_ns(&q->watchdog, next);
ldr r1, [sp, #72] ; 0x48
u64 next = min(ktime_to_ns(q->time_next_packet),
movlt r2, r0
qdisc_watchdog_schedule_ns(&q->watchdog, next);
0.01 add r0, r1, #16384 ; 0x4000
0.02 add r0, r0, #104 ; 0x68
→ bl qdisc_peek_dequeued
if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
↓ b 1090
} else if (!sch->q.qlen) {
1010: cmp r3, #0
↓ bne 1090
↓ b 1070
int i;
for (i = 0; i < q->tin_cnt; i++) {
if (q->tins[i].decaying_flow_count) {
0.05 101c: ldr r2, [lr, #264] ; 0x108
add r2, r2, r3
0.01 add r3, r3, #88064 ; 0x15800
add r1, r2, #88064 ; 0x15800
0.03 add r3, r3, #192 ; 0xc0
add r1, r1, #48 ; 0x30
0.05 ldrh r1, [r1]
cmp r1, #0
0.03 ↓ beq 1084
ktime_t next = \
ktime_add_ns(now,
q->tins[i].cparams.target);
qdisc_watchdog_schedule_ns(&q->watchdog,
ldr r3, [sp, #72] ; 0x48
ktime_add_ns(now,
0.05 add r2, r2, #88064 ; 0x15800
qdisc_watchdog_schedule_ns(&q->watchdog,
ldrd r4, [sp, #24]
0.05 add r0, r3, #16384 ; 0x4000
ktime_add_ns(now,
ldrd r2, [r2, #16]
qdisc_watchdog_schedule_ns(&q->watchdog,
0.02 add r0, r0, #104 ; 0x68
adds r4, r4, r2
0.01 adc r5, r5, r3
mov r2, r4
0.03 mov r3, r5
→ bl qdisc_peek_dequeued
ktime_to_ns(next));
break;
↓ b 1090
for (i = 0; i < q->tin_cnt; i++) {
0.11 1070: ldr r3, [sp, #56] ; 0x38
0.03 ldr lr, [sp, #52] ; 0x34
ldrh ip, [r3, #14]
0.02 mov r3, #0
mov r0, r3
0.07 1084: cmp ip, r0
add r0, r0, #1
0.04 ↑ bne 101c
}
}
}
if (q->overflow_timeout)
0.63 1090: ldr r2, [sp, #56] ; 0x38
0.07 ldrh r3, [r2, #12]
0.01 cmp r3, #0
q->overflow_timeout--;
0.09 subne r3, r3, #1
0.03 strhne r3, [r2, #12]
0.03 ↓ b 114c
cobalt_queue_empty():
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
0.11 10a8: ldrd r6, [r8, #24]
0.01 ldrd r2, [sp, #24]
subs r2, r2, r6
0.02 sbc r3, r3, r7
cmp r2, #0
0.01 sbcs r3, r3, #0
movge ip, #0
↑ bge 5b0
↑ b 64c
cake_dequeue():
if (cake_ddst(q->flow_mode))
0.15 10cc: cmp r3, #6
2.80 movne r4, #1
0.17 ↑ bne 48c
0.98 ↑ b 44c
b->tin_deficit += b->tin_quantum_band;
10dc: ldrh r3, [r9, r7]
str r3, [r9, #2160] ; 0x870
↑ b 144
cobalt_queue_empty():
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
10e8: ldrd r6, [sl, #24]
ldrd r2, [sp, #24]
subs r2, r2, r6
sbc r3, r3, r7
cmp r2, #0
sbcs r3, r3, #0
movge ip, #1
↑ bge 5b0
↑ b 638
0.25 110c: ldr r0, [r8, #4]
0.02 ldr r1, [r8]
0.02 ↑ b 65c
0.34 1118: ldr r1, [sl, #16]
vars->dropping = false;
0.04 strb ip, [sl, #44] ; 0x2c
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
0.01 cmp r1, #0
0.05 ↑ bne 10a8
0.05 ↑ b 110c
112c: ldr r1, [sl, #16]
vars->dropping = false;
ldr r3, [sp, #48] ; 0x30
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
cmp r1, #0
vars->dropping = false;
strb r3, [sl, #44] ; 0x2c
ldreq r0, [sl, #4]
ldreq r1, [sl]
if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
↑ beq 67c
↑ b 10a8
cake_dequeue():
return skb;
}
0.46 114c: ldr r0, [sp, #48] ; 0x30
0.02 add sp, sp, #100 ; 0x64
0.10 pop {r4, r5, r6, r7, r8, r9, sl, fp, pc}
[-- Attachment #3: perf.hist --]
[-- Type: application/octet-stream, Size: 1683 bytes --]
74.68% swapper 0x2678 K [k] cake_dequeue
10.61% swapper 0x5ad0 K [k] cake_enqueue
6.35% swapper 0x382c K [k] cake_hash
3.17% swapper 0x478 K [k] cake_dequeue_one
1.98% swapper 0x40c K [k] cake_advance_shaper
0.85% swapper 0x3cfc K [k] cake_overhead
0.69% swapper 0x138 K [k] cake_calc_overhead
0.39% swapper 0x49e8 K [k] cake_ack_filter
0.25% sh 0x2630 K [k] cake_dequeue
0.24% sh 0x54b8 K [k] cake_enqueue
0.15% dhcpv6.scri 0x26b4 K [k] cake_dequeue
0.12% fw3 0x5018 K [k] cake_enqueue
0.06% ksoftirqd/1 0x26b8 K [k] cake_dequeue
0.06% ubusd 0x2ba4 K [k] cake_dequeue
0.06% ubusd 0x4f2c K [k] cake_enqueue
0.06% sh 0x478 K [k] cake_dequeue_one
0.06% sh 0x198 K [k] cake_calc_overhead
0.05% sh 0x3bb0 K [k] cake_hash
0.05% perf 0x23b4 K [k] cake_dequeue
0.03% odhcp6c 0x5630 K [k] cake_enqueue
0.03% dnsmasq 0x2418 K [k] cake_dequeue
0.01% ksoftirqd/1 0x198 K [k] cake_calc_overhead
0.01% dnsmasq 0x36cc K [k] cake_hash
0.01% hostapd 0x276c K [k] cake_dequeue
0.01% swapper 0xb0 K [k] cake_get_tcpopt
0.01% hostapd 0x4c8 K [k] cake_dequeue_one
0.00% hostapd 0x3d00 K [k] cake_overhead
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [Cake] profiling using perf
2019-03-08 21:01 ` Georgios Amanakis
@ 2019-03-09 16:03 ` Toke Høiland-Jørgensen
2019-03-11 14:49 ` Adrian Popescu
0 siblings, 1 reply; 5+ messages in thread
From: Toke Høiland-Jørgensen @ 2019-03-09 16:03 UTC (permalink / raw)
To: Georgios Amanakis, Cake List
Georgios Amanakis <gamanakis@gmail.com> writes:
> Dear List,
>
> I made an effort to profile the performance of cake with perf in
> openwrt. perf was run on a WRT1900ACS router while downloading
> archlinux.iso via torrent in a LAN client. You can find the annotated
> sch_cake.c in the attachment as well as a performance histogram of
> sch_cake (percentages are relative to sch_cake). Hopefully people can
> take a look at it, and see if there are performance concerns.
Hmm, nothing immediately jumps out as low-hanging fruit to be harvested.
It's not too surprising the 200+-line cake_dequeue() is where most time
is spent, since that is where the bulk of the algorithm is implemented.
And, well, there's nothing in there that can obviously be removed unless
we want to drop features. I guess one could try to make it possible to
disable features at compile time; but that carries quite a bit of
complexity with it (for one, it needs testing with the combinatorial
explosion of possible configurations), so don't think it's realistic.
The only exception *might* be a compile time option to turn off those
stats that are not needed for the algorithm to run...
-Toke
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [Cake] profiling using perf
2019-03-09 16:03 ` Toke Høiland-Jørgensen
@ 2019-03-11 14:49 ` Adrian Popescu
2019-03-11 15:53 ` Jonathan Morton
0 siblings, 1 reply; 5+ messages in thread
From: Adrian Popescu @ 2019-03-11 14:49 UTC (permalink / raw)
To: Toke Høiland-Jørgensen; +Cc: Georgios Amanakis, Cake List
[-- Attachment #1: Type: text/plain, Size: 2805 bytes --]
Hello,
On Sat, Mar 9, 2019 at 6:03 PM Toke Høiland-Jørgensen <toke@redhat.com>
wrote:
> Georgios Amanakis <gamanakis@gmail.com> writes:
>
> > Dear List,
> >
> > I made an effort to profile the performance of cake with perf in
> > openwrt. perf was run on a WRT1900ACS router while downloading
> > archlinux.iso via torrent in a LAN client. You can find the annotated
> > sch_cake.c in the attachment as well as a performance histogram of
> > sch_cake (percentages are relative to sch_cake). Hopefully people can
> > take a look at it, and see if there are performance concerns.
>
> Hmm, nothing immediately jumps out as low-hanging fruit to be harvested.
> It's not too surprising the 200+-line cake_dequeue() is where most time
> is spent, since that is where the bulk of the algorithm is implemented.
>
> And, well, there's nothing in there that can obviously be removed unless
> we want to drop features. I guess one could try to make it possible to
> disable features at compile time; but that carries quite a bit of
> complexity with it (for one, it needs testing with the combinatorial
> explosion of possible configurations), so don't think it's realistic.
> The only exception *might* be a compile time option to turn off those
> stats that are not needed for the algorithm to run...
>
The algorithm itself has probably been optimized over the years. It
might be a good idea to think of other ways to perform some
operations and simplify the algorithm. The code may not be that
slow on a high-end CPU such as a Core i5 or anything faster.
The problem with the current implementation is that it's not able to
saturate a gigabit connection even on dual core ARM routers with
frequencies above 1.2 GHz. Routers for home users are probably going
to rely on hardware offloads to saturate gigabit connections for a
long time. This doesn't mean cake is poorly optimized or poorly
implemented; it's simply not a good fit for small embedded systems
with small CPU caches.
Different data structures might help improve performance.
This is why I've run a bunch of tests over the last few weeks. My
conclusion is that the current version of cake can't deal with more
than 100 Mbps on ar71xx. mt7621 seems to go up to about 200 Mbps.
I was thinking of a few things to try:
- disable some stats and profile
- lower the number of queues from 1024 to 256
- look into profiling to figure out what's causing cache misses
- disable some features and profile again
- set up a lab for all this testing
It's hard to find the time to do all of this. There's a lot to learn
in the process.
>
> -Toke
> _______________________________________________
> Cake mailing list
> Cake@lists.bufferbloat.net
> https://lists.bufferbloat.net/listinfo/cake
>
[-- Attachment #2: Type: text/html, Size: 3826 bytes --]
^ permalink raw reply [flat|nested] 5+ messages in thread
* Re: [Cake] profiling using perf
2019-03-11 14:49 ` Adrian Popescu
@ 2019-03-11 15:53 ` Jonathan Morton
0 siblings, 0 replies; 5+ messages in thread
From: Jonathan Morton @ 2019-03-11 15:53 UTC (permalink / raw)
To: Adrian Popescu; +Cc: Toke Høiland-Jørgensen, Cake List
> On 11 Mar, 2019, at 4:49 pm, Adrian Popescu <adriannnpopescu@gmail.com> wrote:
>
> This doesn't mean cake is poorly optimized or poorly
> implemented. It's not a good fit for small embedded systems with small
> CPU caches.
More importantly, how much CPU time is spent elsewhere than in sch_cake.c? I suspect there's a lot of overhead that we have only indirect control over.
- Jonathan Morton
^ permalink raw reply [flat|nested] 5+ messages in thread
end of thread, other threads:[~2019-03-11 15:53 UTC | newest]
Thread overview: 5+ messages (download: mbox.gz / follow: Atom feed)
-- links below jump to the message on this page --
2019-03-08 20:58 [Cake] profiling using perf Georgios Amanakis
2019-03-08 21:01 ` Georgios Amanakis
2019-03-09 16:03 ` Toke Høiland-Jørgensen
2019-03-11 14:49 ` Adrian Popescu
2019-03-11 15:53 ` Jonathan Morton
This is a public inbox, see mirroring instructions
for how to clone and mirror all data and code used for this inbox