[笔记]Linux Socket Filtering aka Berkeley Packet Filter (BPF)
本文是阅读linux/Documentation/networking/filter.txt文档的笔记。
学习BPF的过程:
- 阅读相关文献,记录两篇的笔记。另一篇的链接[笔记]BPF and XDP Reference Guide(cilium)
简介
BPF(Berkeley Packet Filter)允许用户空间程序将过滤器(filter)附加到任何套接字(socket)上,来允许或禁止某些类型的数据通过套接字。可以通过SO_DETACH_FILTER
从套接字中卸载filter,但一般使用不到,因为一旦socket关闭,其上的filter也会自动移除。
可以通过SO_LOCK_FILTER
选项为某个filter加锁,加锁后,filter不可再更改或移除,直到socket关闭。
tcpdump
通过libpcap
的内部编译器生成可加载的指令,通过调用SO_ATTACH_FILTER
加载到内核中。
除socket外,BPF可附加到内核其他地方:netfilter的xt_bpf,内核qdisc层的cls_bpf。
结构
// 在<linux/filter.h>中
struct sock_filter { /* Filter block */
__u16 code; /* Actual filter code */
__u8 jt; /* Jump true */
__u8 jf; /* Jump false */
__u32 k; /* Generic multiuse field (一般为code需要使用的value)*/
};
struct sock_fprog { /* Required for SO_ATTACH_FILTER. */
unsigned short len; /* Number of filter blocks */
struct sock_filter __user *filter;
};
例子
#include <sys/socket.h>
#include <sys/types.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
/* ... */
/* From the example above: tcpdump -i em1 port 22 -dd */
struct sock_filter code[] = {
{ 0x28, 0, 0, 0x0000000c },
{ 0x15, 0, 8, 0x000086dd },
{ 0x30, 0, 0, 0x00000014 },
{ 0x15, 2, 0, 0x00000084 },
{ 0x15, 1, 0, 0x00000006 },
{ 0x15, 0, 17, 0x00000011 },
{ 0x28, 0, 0, 0x00000036 },
{ 0x15, 14, 0, 0x00000016 },
{ 0x28, 0, 0, 0x00000038 },
{ 0x15, 12, 13, 0x00000016 },
{ 0x15, 0, 12, 0x00000800 },
{ 0x30, 0, 0, 0x00000017 },
{ 0x15, 2, 0, 0x00000084 },
{ 0x15, 1, 0, 0x00000006 },
{ 0x15, 0, 8, 0x00000011 },
{ 0x28, 0, 0, 0x00000014 },
{ 0x45, 6, 0, 0x00001fff },
{ 0xb1, 0, 0, 0x0000000e },
{ 0x48, 0, 0, 0x0000000e },
{ 0x15, 2, 0, 0x00000016 },
{ 0x48, 0, 0, 0x00000010 },
{ 0x15, 0, 1, 0x00000016 },
{ 0x06, 0, 0, 0x0000ffff },
{ 0x06, 0, 0, 0x00000000 },
};
struct sock_fprog bpf = {
.len = ARRAY_SIZE(code),
.filter = code,
};
/*创建PF_PACKET socket*/
sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
if (sock < 0)
/* ... bail out ... */
ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));
if (ret < 0)
/* ... bail out ... */
/* ... */
close(sock);
setsockopt
系统调用,调用SO_DETACH_FILTER
时无需参数,调用SO_LOCK_FILTER
时,参数为1或0:
* setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &val, sizeof(val));
* setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val));
* setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER, &val, sizeof(val));
libpcap
覆盖了绝大多数socket filter的用例,一般直接使用libpcap
进行开发。
BPF引擎(engine)和指令集
tools/bpf/
目录下的bpf_asm
可以用来编写底层的filter。
BPF包含一个32位累加器A、32位寄存器X、16 x 32位的临时存储:
Element Description
A 32 bit wide accumulator
X 32 bit wide X register
M[] 16 x 32 bit wide misc registers aka "scratch memory
store", addressable from 0 to 15
bpf_asm
转换生成的程序是一个如下结构的数组。
op:16, jt:8, jf:8, k:32
op
: 16位的指令
jt、jf
: jump if true, jump if false
k
:参数
在 linux/filter.h
下的指令以及相应的寻址方式:
Instruction Addressing mode Description
ld 1, 2, 3, 4, 12 Load word into A
ldi 4 Load word into A
ldh 1, 2 Load half-word into A
ldb 1, 2 Load byte into A
ldx 3, 4, 5, 12 Load word into X
ldxi 4 Load word into X
ldxb 5 Load byte into X
st 3 Store A into M[]
stx 3 Store X into M[]
jmp 6 Jump to label
ja 6 Jump to label
jeq 7, 8, 9, 10 Jump on A == <x>
jneq 9, 10 Jump on A != <x>
jne 9, 10 Jump on A != <x>
jlt 9, 10 Jump on A < <x>
jle 9, 10 Jump on A <= <x>
jgt 7, 8, 9, 10 Jump on A > <x>
jge 7, 8, 9, 10 Jump on A >= <x>
jset 7, 8, 9, 10 Jump on A & <x>
add 0, 4 A + <x>
sub 0, 4 A - <x>
mul 0, 4 A * <x>
div 0, 4 A / <x>
mod 0, 4 A % <x>
neg !A
and 0, 4 A & <x>
or 0, 4 A | <x>
xor 0, 4 A ^ <x>
lsh 0, 4 A << <x>
rsh 0, 4 A >> <x>
tax Copy A into X
txa Copy X into A
ret 4, 11 Return
Addressing mode Syntax Description
0 x/%x Register X
1 [k] BHW at byte offset k in the packet(BHW指byte/half-word/word,具体宽度由ldb/ldh/ld指令决定)
2 [x + k] BHW at the offset X + k in the packet
3 M[k] Word at offset k in M[]
4 #k Literal value stored in k
5 4*([k]&0xf) Lower nibble * 4 at byte offset k in the packet
6 L Jump label L
7 #k,Lt,Lf Jump to Lt if true, otherwise jump to Lf
8 x/%x,Lt,Lf Jump to Lt if true, otherwise jump to Lf
9 #k,Lt Jump to Lt if predicate is true
10 x/%x,Lt Jump to Lt if predicate is true
11 a/%a Accumulator A
12 extension BPF extension
BPF extension寻址配合load指令使用,将查询的结果放到累加器A中。可能的BPF extension包括:
Extension Description
len skb->len
proto skb->protocol
type skb->pkt_type
poff Payload start offset
ifidx skb->dev->ifindex
nla Netlink attribute of type X with offset A
nlan Nested Netlink attribute of type X with offset A
mark skb->mark
queue skb->queue_mapping
hatype skb->dev->type
rxhash skb->hash
cpu raw_smp_processor_id()
vlan_tci skb_vlan_tag_get(skb)
vlan_avail skb_vlan_tag_present(skb)
vlan_tpid skb->vlan_proto
rand prandom_u32()
一些例子:
** ARP packets:
ldh [12] /*以太网首部跳过12byte,load half-word,也就是2byte,是以太网的类型字段*/
jne #0x806, drop /* 不等于0x806则跳转 */
ret #-1
drop: ret #0
** IPv4 TCP packets:
ldh [12]
jne #0x800, drop
ldb [23] /*跳过14byte的以太网首部,再跳过IP首部的前9byte(14+9=23),load 1byte,即IP首部中的协议字段*/
jneq #6, drop
ret #-1
drop: ret #0
** (Accelerated) VLAN w/ id 10:
ld vlan_tci /*这里用的是extensions的寻址,skb_vlan_tag_get(skb)*/
jneq #10, drop
ret #-1
drop: ret #0
上面的代码可以由bpf_asm
进行转换,生成xt_bpf
和cls_bpf
可以直接加载的code
$ ./bpf_asm foo
4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0,
In copy and paste C-like output:
$ ./bpf_asm -c foo
{ 0x28, 0, 0, 0x0000000c },
{ 0x15, 0, 1, 0x00000806 },
{ 0x06, 0, 0, 0xffffffff },
{ 0x06, 0, 0, 0000000000 },
tools/bpf/bpf_dbg
可以使用pcap文件来调试bpf程序。
JIT编译器
echo 1 > /proc/sys/net/core/bpf_jit_enable /*启动*/
echo 2 > /proc/sys/net/core/bpf_jit_enable /*编译的opcode会输出到内核日志中*/
当开启CONFIG_BPF_JIT_ALWAYS_ON
时,bpf_jit_enable
始终为1。
tools/bpf/
下的bpf_jit_disasm
可以将内核日志中的十六进制转换成反汇编。
# ./bpf_jit_disasm -o
70 bytes emitted from JIT compiler (pass:3, flen:6)
ffffffffa0069c8f + <x>:
0: push %rbp
55
1: mov %rsp,%rbp
48 89 e5
4: sub $0x60,%rsp
48 83 ec 60
8: mov %rbx,-0x8(%rbp)
48 89 5d f8
c: mov 0x68(%rdi),%r9d
44 8b 4f 68
10: sub 0x6c(%rdi),%r9d
44 2b 4f 6c
14: mov 0xd8(%rdi),%r8
4c 8b 87 d8 00 00 00
1b: mov $0xc,%esi
be 0c 00 00 00
20: callq 0xffffffffe0ff9442
e8 1d 94 ff e0
25: cmp $0x800,%eax
3d 00 08 00 00
2a: jne 0x0000000000000042
75 16
2c: mov $0x17,%esi
be 17 00 00 00
31: callq 0xffffffffe0ff945e
e8 28 94 ff e0
36: cmp $0x1,%eax
83 f8 01
39: jne 0x0000000000000042
75 07
3b: mov $0xffff,%eax
b8 ff ff 00 00
40: jmp 0x0000000000000044
eb 02
42: xor %eax,%eax
31 c0
44: leaveq
c9
45: retq
c3
BPF内核实现
内核中解释器使用的指令和上面描述的BPF指令集不同,更接近底层架构,以便获得更高的性能,被称为eBPF或internal BPF。新指令集可以通过“受限C语言”编写程序,并通过GCC/LLVM编译为eBPF,C -> eBPF -> native code
内核调用bpf_prog_create()
和bpf_prog_destroy()
来创建和销毁filter,调用BPF_PROG_RUN(filter,ctx)
宏进行解释与运行,这些都是透明的。参数filter为bpf_prog
结构体,由bpf_prog_create()
返回,ctx为给定的上下文,例如skb的指针。bpf_check_classic()
的所有约束和限制会在转换之前执行。
eBPF的主要变化:
寄存器数量由2变为10
- R0:保存eBPF或辅助函数的返回值
- R1-R5:存储调用辅助函数需要传递的参数
- R6-R9:用于存储中间值,辅助函数将保持这些寄存器不改变
- R10:只读寄存器,包含访问BPF stack的指针
只有一个主eBPF程序,它只能调用其他辅助函数,而非其他BPF程序(注:现在应该支持BPF之间的调用)
寄存器由32位变为64位
- 仍然保留了32位的ALU操作语义,使用64位寄存器的32位子寄存器保存
条件jt/jf替换为jt/fall-through
引入
bpf_call
和寄存器传递约定。调用辅助函数前,根据辅助函数规定,将参数存储至R1-R5上。R1-R5寄存器映射到CPU的寄存器上,调用无性能损耗。
调用辅助函数后,R1-R5不可读,R0存储返回的值,R6-R9在调用期间不会改变。eBPF程序只有一个参数
ctx
,保存在R1上。
u64 f1() { return (*_f2)(1); }
u64 f2(u64 a) { return f3(a + 1, a); }
u64 f3(u64 a, u64 b) { return a - b; }
// f2的eBPF看起来类似
f2:
bpf_mov R2, R1
bpf_add R1, 1
bpf_call f3
bpf_exit
x86_64中,64位寄存器与HW寄存器的对应:
R0 - rax
R1 - rdi
R2 - rsi
R3 - rdx
R4 - rcx
R5 - r8
R6 - rbx
R7 - r13
R8 - r14
R9 - r15
R10 - rbp
x86_64下,C语言
u64 bpf_filter(u64 ctx)
{
return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9);
}
对应的BPF
bpf_mov R6, R1 /* save ctx */
bpf_mov R2, 2
bpf_mov R3, 3
bpf_mov R4, 4
bpf_mov R5, 5
bpf_call foo
bpf_mov R7, R0 /* save foo() return value */
bpf_mov R1, R6 /* restore ctx for next call */
bpf_mov R2, 6
bpf_mov R3, 7
bpf_mov R4, 8
bpf_mov R5, 9
bpf_call bar
bpf_add R0, R7
bpf_exit
对应的JIT编译结果
push %rbp
mov %rsp,%rbp
sub $0x228,%rsp
mov %rbx,-0x228(%rbp)
mov %r13,-0x220(%rbp)
mov %rdi,%rbx
mov $0x2,%esi
mov $0x3,%edx
mov $0x4,%ecx
mov $0x5,%r8d
callq foo
mov %rax,%r13
mov %rbx,%rdi
mov $0x6,%esi
mov $0x7,%edx
mov $0x8,%ecx
mov $0x9,%r8d
callq bar
add %r13,%rax
mov -0x228(%rbp),%rbx
mov -0x220(%rbp),%r13
leaveq
retq
为了保证eBPF程序迅速停止,eBPF程序限制只能有4096条指令。(注:较新的内核中该限制已放宽到100万条)
根据不同用例,ctx中的内容不同。对于seccomp,R1指向seccomp_data
;对于BPF filters,R1指向skb
。
注:Seccomp(全称:secure computing mode)在2.6.12版本中引入linux内核,作为一种安全机制,主要用于限制用户态程序对系统调用的滥用。由于限制太强,后引入seccomp-bpf,借助bpf规则来过滤系统调用。
指令的转换:
op:16, jt:8, jf:8, k:32 ==> op:8, dst_reg:4, src_reg:4, off:16, imm:32
- 目前内部BPF(eBPF)指令已有87条。
- 内部BPF是一个通用的RISC指令集。复杂的过滤器可能耗尽寄存器,需要使用到堆栈。
- 程序的安全性通过两个步骤确定:深度优先搜索,禁止循环,并进行其他CFG验证;从第一个指令开始,探测所有可能的路径,观察寄存器和堆栈的状态变化。
eBPF opcode
+----------------+--------+--------------------+
| 4 bits | 1 bit | 3 bits |
| operation code | source | instruction class |
+----------------+--------+--------------------+
(MSB) (LSB)
最后3bit的LSB存储指令类别:
Classic BPF classes: eBPF classes:
BPF_LD 0x00 BPF_LD 0x00
BPF_LDX 0x01 BPF_LDX 0x01
BPF_ST 0x02 BPF_ST 0x02
BPF_STX 0x03 BPF_STX 0x03
BPF_ALU 0x04 BPF_ALU 0x04
BPF_JMP 0x05 BPF_JMP 0x05
BPF_RET 0x06 BPF_JMP32 0x06
BPF_MISC 0x07 BPF_ALU64 0x07
对于逻辑运算和跳转指令
- 当BPF_CLASS(code) == BPF_ALU或BPF_JMP时,第四位(source部分)可以为
BPF_K 0x00
BPF_X 0x08
* in classic BPF, this means:
BPF_SRC(code) == BPF_X - use register X as source operand
BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand
* in eBPF, this means:
BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand
BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand
- 当BPF_CLASS(code) == BPF_ALU或BPF_ALU64 ,前四位BPF_OP(code)可以为:
BPF_ADD 0x00
BPF_SUB 0x10
BPF_MUL 0x20
BPF_DIV 0x30
BPF_OR 0x40
BPF_AND 0x50
BPF_LSH 0x60
BPF_RSH 0x70
BPF_NEG 0x80
BPF_MOD 0x90
BPF_XOR 0xa0
BPF_MOV 0xb0 /* eBPF only: mov reg to reg */
BPF_ARSH 0xc0 /* eBPF only: sign extending shift right */
BPF_END 0xd0 /* eBPF only: endianness conversion */
当BPF_CLASS(code) == BPF_JMP或BPF_JMP32,前四位BPF_OP(code)可以为:
BPF_JA 0x00 /* BPF_JMP only */
BPF_JEQ 0x10
BPF_JGT 0x20
BPF_JGE 0x30
BPF_JSET 0x40
BPF_JNE 0x50 /* eBPF only: jump != */
BPF_JSGT 0x60 /* eBPF only: signed '>' */
BPF_JSGE 0x70 /* eBPF only: signed '>=' */
BPF_CALL 0x80 /* eBPF BPF_JMP only: function call */
BPF_EXIT 0x90 /* eBPF BPF_JMP only: function return */
BPF_JLT 0xa0 /* eBPF only: unsigned '<' */
BPF_JLE 0xb0 /* eBPF only: unsigned '<=' */
BPF_JSLT 0xc0 /* eBPF only: signed '<' */
BPF_JSLE 0xd0 /* eBPF only: signed '<=' */
BPF_XOR | BPF_K | BPF_ALU
在cBPF中表示A^=imm32
,在eBPF中文档写作src_reg = (u32) src_reg ^ (u32) imm32
(注:对照内核代码,实际应为dst_reg = (u32) dst_reg ^ (u32) imm32
)。BPF_JMP | BPF_EXIT表示直接退出。
对于加载和存储指令
+--------+--------+-------------------+
| 3 bits | 2 bits | 3 bits |
| mode | size | instruction class |
+--------+--------+-------------------+
(MSB) (LSB)
size部分
BPF_W 0x00 /* word */
BPF_H 0x08 /* half word */
BPF_B 0x10 /* byte */
BPF_DW 0x18 /* eBPF only, double word */
B - 1 byte
H - 2 byte
W - 4 byte
DW - 8 byte (eBPF only)
mode部分
BPF_IMM 0x00 /* used for 32-bit mov in classic BPF and 64-bit in eBPF */
BPF_ABS 0x20
BPF_IND 0x40
BPF_MEM 0x60
BPF_LEN 0x80 /* classic BPF only, reserved in eBPF */
BPF_MSH 0xa0 /* classic BPF only, reserved in eBPF */
BPF_XADD 0xc0 /* eBPF only, exclusive add */
两个非通用指令BPF_ABS | <size> | BPF_LD
和BPF_IND | <size> | BPF_LD
,用于访问数据包中的字段。使用条件:
1)ctx是指向sk_buff
的指针
2)R0-R6规定为:R0保存获取的结果,R6保存sk_buff
指针,R1-R5为暂存器,不可存储需要跨指令的数据(即执行完指令后R1-R5中的数据不可用)
3)访问超出sk_buff->data
范围 ,自动跳出
BPF_IND | BPF_W | BPF_LD means:
R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32))
and R1 - R5 were scratched.
其他的一些例子
BPF_MEM | <size> | BPF_STX: *(size *) (dst_reg + off) = src_reg
BPF_MEM | <size> | BPF_ST: *(size *) (dst_reg + off) = imm32
BPF_MEM | <size> | BPF_LDX: dst_reg = *(size *) (src_reg + off)
BPF_XADD | BPF_W | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg
BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg
注:在linux/filter.h下都有。ST/STX是把立即数/src_reg的值写入dst_reg指向的内存,LDX是把src_reg指向的内存读入dst_reg
BPF_LD | BPF_DW | BPF_IMM(获取一个8字节的立即数)会跨两个指令。
#define BPF_LD_IMM64_RAW(DST, SRC, IMM) \
((struct bpf_insn) { \
.code = BPF_LD | BPF_DW | BPF_IMM, \
.dst_reg = DST, \
.src_reg = SRC, \
.off = 0, \
.imm = (__u32) (IMM) }), \
((struct bpf_insn) { \
.code = 0, /* zero is reserved opcode */ \
.dst_reg = 0, \
.src_reg = 0, \
.off = 0, \
.imm = ((__u64) (IMM)) >> 32 })
eBPF验证器
两步:
1)做DAG(有向无环图)检测,禁止循环,并进行其他CFG(控制流图)验证
2)从第一条指令开始,模拟执行所有可能的指令路径,观察寄存器和堆栈状态
一些检测规则:
1)不允许指针相加
2)不允许读未初始化的寄存器
3)辅助函数调用后R1-R5为不可读状态
4)load/store指令需要相应的寄存器类型为PTR_TO_CTX
、 PTR_TO_MAP
、PTR_TO_STACK
时才有效 (比如不能是标量),并且会有边界和对齐检测
5)程序开始时,R1类型为PTR_TO_CTX
(ctx指针)。可以通过is_valid_access()
回调函数,自定义验证方式,来进行边界与对齐检测。另外类型为PTR_TO_STACK
(堆栈指针)时,访问边界为[-MAX_BPF_STACK, 0)
6)堆栈只有在写入后才能读取(对应PTR_TO_STACK
类型寄存器的操作)
7)bpf_verifier_ops->get_func_proto()
可以用来自定义“函数调用”时的“参数检测”(检测寄存器),返回值保存在R0
8)eBPF对seccomp与socket filter的验证方式是相同的,而cBPF对于seccomp,需要先进行seccomp verifier
,再进行其他的验证。
详细代码kernel/bpf/verifier.c
寄存器值追踪
验证器追踪寄存器中的值,检测值可能的范围。追踪由include/linux/bpf_verifier.h
下的struct bpf_reg_state
完成,寄存器值可能的类型:NOT_INIT
、SCALAR_VALUE
、pointer
。pointer
又分为:
PTR_TO_CTX Pointer to bpf_context.
CONST_PTR_TO_MAP Pointer to struct bpf_map. "Const" because arithmetic
on these pointers is forbidden.
PTR_TO_MAP_VALUE Pointer to the value stored in a map element.
PTR_TO_MAP_VALUE_OR_NULL
Either a pointer to a map value, or NULL; map accesses
(see section 'eBPF maps', below) return this type,
which becomes a PTR_TO_MAP_VALUE when checked != NULL.
Arithmetic on these pointers is forbidden.
PTR_TO_STACK Frame pointer.
PTR_TO_PACKET skb->data.
PTR_TO_PACKET_END skb->data + headlen; arithmetic forbidden.
PTR_TO_SOCKET Pointer to struct bpf_sock_ops, implicitly refcounted.
PTR_TO_SOCKET_OR_NULL
Either a pointer to a socket, or NULL; socket lookup
returns this type, which becomes a PTR_TO_SOCKET when
checked != NULL. PTR_TO_SOCKET is reference-counted,
so programs must release the reference through the
socket release function before the end of the program.
Arithmetic on these pointers is forbidden.
对于PTR_TO_SOCKET和PTR_TO_SOCKET_OR_NULL,在程序结束前需要使用socket release方法,释放引用
对于指针偏移的验证:
1)分fixed offset
和variable offset
2)验证器对variable offset
需要记录的状态:
* minimum and maximum values as unsigned
* minimum and maximum values as signed
* 一个'tnum'值,由两个u64组成,第一个为value,是具体的值;第二个为mask,用1标记未知的位。比如知道寄存器前56位为0,后8位不知道,则表示为tnum(0x0,0xff),表示省略了高位的0
- 状态可以通过逻辑运算更新,例如上面的
tnum(0x0,0xff)
寄存器与0x40
取或,则变为tnum(0x40,0xbf)
- 状态可以根据条件分支判断,例如在
SCALAR_VALUE
(offset)大于8为true的分支,寄存器的umin_value
为9
- 对
PTR_TO_PACKET
类型的寄存器,使用id标识某个偏移量。例如寄存器A拷贝到寄存器B,两者有相同偏移量,因此offset id相同,一次验证即可。 PTR_TO_MAP_VALUE_OR_NULL
类寄存器(存储bpf map中查询返回的指针),也使用id标识,一旦验证某个id标识非NULL,其他副本也相同。除此外,可以做对齐验证。PTR_TO_SOCKET
和PTR_TO_SOCKET_OR_NULL
类寄存器(存储socket中查询返回的指针)也使用id标识,与PTR_TO_MAP_VALUE_OR_NULL
类似。另外,对非NULL的id,可将其副本调用socket release方法。
数据包的直接访问
对于cls_bpf
与act_bpf
,允许通过skb->data
和skb->data_end
直接访问数据包。
1: r4 = *(u32 *)(r1 +80) /* load skb->data_end */
2: r3 = *(u32 *)(r1 +76) /* load skb->data */
3: r5 = r3
4: r5 += 14
5: if r5 > r4 goto pc+16
R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp
6: r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */
根据第5行的判断,验证器可以标记R3为pkt(id=0,off=0,r=14)
,其中id=0表示未向寄存器添加variable offset
,off=0表示未向寄存器添加fixed offset
,r=14表示安全访问范围为[R3,R3+14)。同理R5标记为pkt(id=0,off=14,r=14)
eBPF maps
map
用于内核和用户空间之间共享不同类型的数据。map
存储使用BPF系统调用:
- create a map with given type and attributes
map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
returns process-local file descriptor or negative error
- lookup key in a given map
err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero and stores found elem into value or negative error
- create or update key/value pair in a given map
err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key, attr->value
returns zero or negative error
- find and delete element by key in a given map
err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
using attr->map_fd, attr->key
- to delete map: close(fd)
Exiting process will delete maps automatically
map
类型:hash、array、bloom filter、radix-tree等;map
定义:
. type
. max number of elements
. key size in bytes
. value size in bytes
修剪(pruning)
验证器(verifier)在对新分支进行分析时,会比较当前指令之前检测过的状态,如果当前(寄存器)状态属于之前检测过的状态,这个分支就被认为可信,被修剪。
理解验证器的日志
不可达指令
static struct bpf_insn prog[] = { BPF_EXIT_INSN(), BPF_EXIT_INSN(), }; Error: unreachable insn 1
读取未初始化的寄存器
BPF_MOV64_REG(BPF_REG_0, BPF_REG_2), BPF_EXIT_INSN(), Error: 0: (bf) r0 = r2 R2 !read_ok
退出程序前未初始化R0寄存器
BPF_MOV64_REG(BPF_REG_2, BPF_REG_1), BPF_EXIT_INSN(), Error: 0: (bf) r2 = r1 1: (95) exit R0 !read_ok
越界访问堆栈
BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0), BPF_EXIT_INSN(), Error: 0: (7a) *(u64 *)(r10 +8) = 0 invalid stack off=8 size=8
未初始化堆栈(
map_lookup_elem
调用,R1为文件描述符,R2为key,读取了R2指向的未初始化堆栈)BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), Error: 0: (bf) r2 = r10 1: (07) r2 += -8 2: (b7) r1 = 0x0 3: (85) call 1 invalid indirect read from stack off -8+0 size 8
无效的
map_fd
(R1中的0x0)BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_EXIT_INSN(), Error: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (b7) r1 = 0x0 4: (85) call 1 fd 0 is not pointing to valid bpf_map
map_lookup_elem
的返回值未进行验证(保存在R0)map element: BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), BPF_EXIT_INSN(), Error: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (b7) r1 = 0x0 4: (85) call 1 5: (7a) *(u64 *)(r0 +0) = 0 R0 invalid mem access 'map_value_or_null'
BPF_ST_MEM
以错误的对齐方式进行访问BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0), BPF_EXIT_INSN(), Error: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (b7) r1 = 1 4: (85) call 1 5: (15) if r0 == 0x0 goto pc+1 R0=map_ptr R10=fp 6: (7a) *(u64 *)(r0 +4) = 0 misaligned access off 4 size 8
map_lookup_elem
结果R0为NULL,pc+2,执行指令8,会出现错误BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_LD_MAP_FD(BPF_REG_1, 0), BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem), BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0), BPF_EXIT_INSN(), BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1), BPF_EXIT_INSN(), Error: 0: (7a) *(u64 *)(r10 -8) = 0 1: (bf) r2 = r10 2: (07) r2 += -8 3: (b7) r1 = 1 4: (85) call 1 5: (15) if r0 == 0x0 goto pc+2 R0=map_ptr R10=fp 6: (7a) *(u64 *)(r0 +0) = 0 7: (95) exit from 5 to 8: R0=imm0 R10=fp 8: (7a) *(u64 *)(r0 +0) = 1 R0 invalid mem access 'imm'
bpf_sk_lookup_tcp
三个参数R1=ptr_to_ctx;R2=ptr_to_mem
,将R0置NULL前未检测、未释放引用BPF_MOV64_IMM(BPF_REG_2, 0), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 4), BPF_MOV64_IMM(BPF_REG_4, 0), BPF_MOV64_IMM(BPF_REG_5, 0), BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), BPF_MOV64_IMM(BPF_REG_0, 0), BPF_EXIT_INSN(), Error: 0: (b7) r2 = 0 1: (63) *(u32 *)(r10 -8) = r2 2: (bf) r2 = r10 3: (07) r2 += -8 4: (b7) r3 = 4 5: (b7) r4 = 0 6: (b7) r5 = 0 7: (85) call bpf_sk_lookup_tcp#65 8: (b7) r0 = 0 9: (95) exit Unreleased reference id=1, alloc_insn=7
R0未检测是否为NULL就返回了
BPF_MOV64_IMM(BPF_REG_2, 0), BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8), BPF_MOV64_REG(BPF_REG_2, BPF_REG_10), BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8), BPF_MOV64_IMM(BPF_REG_3, 4), BPF_MOV64_IMM(BPF_REG_4, 0), BPF_MOV64_IMM(BPF_REG_5, 0), BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp), BPF_EXIT_INSN(), Error: 0: (b7) r2 = 0 1: (63) *(u32 *)(r10 -8) = r2 2: (bf) r2 = r10 3: (07) r2 += -8 4: (b7) r3 = 4 5: (b7) r4 = 0 6: (b7) r5 = 0 7: (85) call bpf_sk_lookup_tcp#65 8: (95) exit Unreleased reference id=1, alloc_insn=7
测试
内核附带测试模块,包含用于cBPF和eBPF的各种测试,在lib/test_bpf.c
中 ,通过Kconfig
启动:
CONFIG_TEST_BPF=m
编译安装测试模块后,可以通过insmod
或modprobe
执行测试。结果在内核日志(dmesg)中可以找到。
附
参考(图片来源)
https://cloud.tencent.com/developer/inventory/600/article/1698426