[笔记]Linux Socket Filtering aka Berkeley Packet Filter (BPF)

本文是阅读linux/Documentation/networking/filter.txt文档的笔记。

学习BPF的过程:

简介

BPF(Berkeley Packet Filter)允许用户空间程序将过滤器(filter)附加到任何套接字(socket)上,来允许或禁止某些类型的数据通过套接字。可以通过SO_DETACH_FILTER从套接字中卸载filter,但一般使用不到,因为一旦socket关闭,其上的filter也会自动移除。

可以通过SO_LOCK_FILTER选项为某个filter加锁,加锁后,filter不可再更改或移除,直到socket关闭。

tcpdump通过libpcap的内部编译器生成可加载的指令,通过调用SO_ATTACH_FILTER加载到内核中。

除socket外,BPF可附加到内核其他地方:netfilter的xt_bpf,内核qdisc层的cls_bpf。

结构

// 在<linux/filter.h>中
struct sock_filter {	/* Filter block */
	__u16	code;   /* Actual filter code */
	__u8	jt;	/* Jump true */
	__u8	jf;	/* Jump false */
	__u32	k;      /* Generic multiuse field (一般为code需要使用的value)*/
};

struct sock_fprog {			/* Required for SO_ATTACH_FILTER. */
	unsigned short		   len;	/* Number of filter blocks */
	struct sock_filter __user *filter;
};

例子

#include <sys/socket.h>
#include <sys/types.h>
#include <arpa/inet.h>
#include <linux/if_ether.h>
/* ... */

/* From the example above: tcpdump -i em1 port 22 -dd */
struct sock_filter code[] = {
	{ 0x28,  0,  0, 0x0000000c },
	{ 0x15,  0,  8, 0x000086dd },
	{ 0x30,  0,  0, 0x00000014 },
	{ 0x15,  2,  0, 0x00000084 },
	{ 0x15,  1,  0, 0x00000006 },
	{ 0x15,  0, 17, 0x00000011 },
	{ 0x28,  0,  0, 0x00000036 },
	{ 0x15, 14,  0, 0x00000016 },
	{ 0x28,  0,  0, 0x00000038 },
	{ 0x15, 12, 13, 0x00000016 },
	{ 0x15,  0, 12, 0x00000800 },
	{ 0x30,  0,  0, 0x00000017 },
	{ 0x15,  2,  0, 0x00000084 },
	{ 0x15,  1,  0, 0x00000006 },
	{ 0x15,  0,  8, 0x00000011 },
	{ 0x28,  0,  0, 0x00000014 },
	{ 0x45,  6,  0, 0x00001fff },
	{ 0xb1,  0,  0, 0x0000000e },
	{ 0x48,  0,  0, 0x0000000e },
	{ 0x15,  2,  0, 0x00000016 },
	{ 0x48,  0,  0, 0x00000010 },
	{ 0x15,  0,  1, 0x00000016 },
	{ 0x06,  0,  0, 0x0000ffff },
	{ 0x06,  0,  0, 0x00000000 },
};

struct sock_fprog bpf = {
	.len = ARRAY_SIZE(code),
	.filter = code,
};

  /*创建PF_PACKET socket*/ 
sock = socket(PF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
if (sock < 0)
	/* ... bail out ... */

ret = setsockopt(sock, SOL_SOCKET, SO_ATTACH_FILTER, &bpf, sizeof(bpf));
if (ret < 0)
	/* ... bail out ... */

/* ... */
close(sock);

setsockopt系统调用,调用SO_DETACH_FILTER时无需参数,调用SO_LOCK_FILTER时,参数为1或0:

* setsockopt(sockfd, SOL_SOCKET, SO_ATTACH_FILTER, &val, sizeof(val));
* setsockopt(sockfd, SOL_SOCKET, SO_DETACH_FILTER, &val, sizeof(val)); 
* setsockopt(sockfd, SOL_SOCKET, SO_LOCK_FILTER,   &val, sizeof(val));

libpcap覆盖了绝大多数socket filter的用例,一般直接使用libpcap进行开发。

BPF引擎(engine)和指令集

tools/bpf/目录下的bpf_asm可以用来编写底层的filter。

BPF包含一个32位累加器A、32位寄存器X、16 x 32位的临时存储:

Element          Description

A                32 bit wide accumulator
X                32 bit wide X register
M[]              16 x 32 bit wide misc registers aka "scratch memory
                 store", addressable from 0 to 15

bpf_asm转换生成的程序是一个如下结构的数组。

op:16, jt:8, jf:8, k:32
  • op : 16位的指令
  • jt jf: jump if true, jump if false
  • k:参数

linux/filter.h下的指令以及相应的寻址方式:

Instruction      Addressing mode      Description

  ld               1, 2, 3, 4, 12       Load word into A
  ldi              4                    Load word into A
  ldh              1, 2                 Load half-word into A
  ldb              1, 2                 Load byte into A
  ldx              3, 4, 5, 12          Load word into X
  ldxi             4                    Load word into X
  ldxb             5                    Load byte into X

  st               3                    Store A into M[]
  stx              3                    Store X into M[]

  jmp              6                    Jump to label
  ja               6                    Jump to label
  jeq              7, 8, 9, 10          Jump on A == <x>
  jneq             9, 10                Jump on A != <x>
  jne              9, 10                Jump on A != <x>
  jlt              9, 10                Jump on A <  <x>
  jle              9, 10                Jump on A <= <x>
  jgt              7, 8, 9, 10          Jump on A >  <x>
  jge              7, 8, 9, 10          Jump on A >= <x>
  jset             7, 8, 9, 10          Jump on A &  <x>

  add              0, 4                 A + <x>
  sub              0, 4                 A - <x>
  mul              0, 4                 A * <x>
  div              0, 4                 A / <x>
  mod              0, 4                 A % <x>
  neg                                   !A
  and              0, 4                 A & <x>
  or               0, 4                 A | <x>
  xor              0, 4                 A ^ <x>
  lsh              0, 4                 A << <x>
  rsh              0, 4                 A >> <x>

  tax                                   Copy A into X
  txa                                   Copy X into A

  ret              4, 11                Return

Addressing mode  Syntax               Description

 0               x/%x                 Register X
 1               [k]                  BHW at byte offset k in the packet(BHW指byte/half-word/word,即按指令分别读取1、2或4字节)
 2               [x + k]              BHW at the offset X + k in the packet
 3               M[k]                 Word at offset k in M[]
 4               #k                   Literal value stored in k
 5               4*([k]&0xf)          Lower nibble * 4 at byte offset k in the packet
 6               L                    Jump label L
 7               #k,Lt,Lf             Jump to Lt if true, otherwise jump to Lf
 8               x/%x,Lt,Lf           Jump to Lt if true, otherwise jump to Lf
 9               #k,Lt                Jump to Lt if predicate is true
10               x/%x,Lt              Jump to Lt if predicate is true
11               a/%a                 Accumulator A
12               extension            BPF extension

BPF extension寻址配合load指令使用,将查询的结果放到累加器A中。可能的BPF extension包括:

Extension                             Description

len                                   skb->len
proto                                 skb->protocol
type                                  skb->pkt_type
poff                                  Payload start offset
ifidx                                 skb->dev->ifindex
nla                                   Netlink attribute of type X with offset A
nlan                                  Nested Netlink attribute of type X with offset A
mark                                  skb->mark
queue                                 skb->queue_mapping
hatype                                skb->dev->type
rxhash                                skb->hash
cpu                                   raw_smp_processor_id()
vlan_tci                              skb_vlan_tag_get(skb)
vlan_avail                            skb_vlan_tag_present(skb)
vlan_tpid                             skb->vlan_proto
rand                                  prandom_u32()

一些例子:

** ARP packets:

  ldh [12]          /*以太网首部跳过12byte,load half-word,也就是2byte,是以太网的类型字段*/
  jne #0x806, drop  /* 不等于0x806则跳转 */
  ret #-1
  drop: ret #0

** IPv4 TCP packets:

  ldh [12]
  jne #0x800, drop
  ldb [23] /*跳过14byte的以太网首部,再跳过IP首部的前9byte(1byte是8bit),load 1byte,是IP数据包的协议类型*/
  jneq #6, drop
  ret #-1
  drop: ret #0

** (Accelerated) VLAN w/ id 10:

  ld vlan_tci  /*这里用的是extensions的寻址,skb_vlan_tag_get(skb)*/
  jneq #10, drop
  ret #-1
  drop: ret #0

上面的代码可以由bpf_asm进行转换,生成xt_bpf和cls_bpf可以直接加载的code

$ ./bpf_asm foo
4,40 0 0 12,21 0 1 2054,6 0 0 4294967295,6 0 0 0,

In copy and paste C-like output:

$ ./bpf_asm -c foo
{ 0x28,  0,  0, 0x0000000c },
{ 0x15,  0,  1, 0x00000806 },
{ 0x06,  0,  0, 0xffffffff },
{ 0x06,  0,  0, 0000000000 },

tools/bpf/bpf_dbg可以使用pcap文件来调试bpf程序。

JIT编译器

echo 1 > /proc/sys/net/core/bpf_jit_enable  /*启动*/
echo 2 > /proc/sys/net/core/bpf_jit_enable  /*编译的opcode会输出到内核日志中*/

当开启CONFIG_BPF_JIT_ALWAYS_ON时,bpf_jit_enable始终为1。

tools/bpf/下的bpf_jit_disasm可以将内核日志中的十六进制转换成反汇编。

# ./bpf_jit_disasm -o
70 bytes emitted from JIT compiler (pass:3, flen:6)
ffffffffa0069c8f + <x>:
   0:	push   %rbp
	55
   1:	mov    %rsp,%rbp
	48 89 e5
   4:	sub    $0x60,%rsp
	48 83 ec 60
   8:	mov    %rbx,-0x8(%rbp)
	48 89 5d f8
   c:	mov    0x68(%rdi),%r9d
	44 8b 4f 68
  10:	sub    0x6c(%rdi),%r9d
	44 2b 4f 6c
  14:	mov    0xd8(%rdi),%r8
	4c 8b 87 d8 00 00 00
  1b:	mov    $0xc,%esi
	be 0c 00 00 00
  20:	callq  0xffffffffe0ff9442
	e8 1d 94 ff e0
  25:	cmp    $0x800,%eax
	3d 00 08 00 00
  2a:	jne    0x0000000000000042
	75 16
  2c:	mov    $0x17,%esi
	be 17 00 00 00
  31:	callq  0xffffffffe0ff945e
	e8 28 94 ff e0
  36:	cmp    $0x1,%eax
	83 f8 01
  39:	jne    0x0000000000000042
	75 07
  3b:	mov    $0xffff,%eax
	b8 ff ff 00 00
  40:	jmp    0x0000000000000044
	eb 02
  42:	xor    %eax,%eax
	31 c0
  44:	leaveq
	c9
  45:	retq
	c3

BPF内核实现

内核中解释器使用的指令和上面描述的BPF指令集不同,更接近底层架构,以便获得更高的性能,被称为eBPF或internal BPF。新指令集可以通过“受限C语言”编写程序,并通过GCC/LLVM编译为eBPF,C -> eBPF -> native code

内核调用bpf_prog_create()和bpf_prog_destroy()来创建和销毁filter,调用BPF_PROG_RUN(filter,ctx)宏进行解释与运行,这些都是透明的。参数filter为bpf_prog结构体,由bpf_prog_create()返回,ctx为给定的上下文,例如skb的指针。bpf_check_classic()的所有约束和限制会在转换之前执行。

eBPF的主要变化:

  • 寄存器数量由2变为10

    • R0:保存eBPF或辅助函数的返回值
    • R1-R5:存储调用辅助函数需要传递的参数
    • R6-R9:用于存储中间值,辅助函数将保持这些寄存器不改变
    • R10:只读寄存器,包含访问BPF stack的指针

    只有一个主eBPF程序,它只能调用其他辅助函数,而非其他BPF程序(注:现在应该支持BPF之间的调用)

  • 寄存器由32位变为64位

    • 仍然保留了32位的ALU操作语义,使用64位寄存器的32位子寄存器保存
  • 条件jt/jf替换为jt/fall-through

  • 引入bpf_call和寄存器传递约定

    • 调用辅助函数前,根据辅助函数规定,将参数存储至R1-R5上。R1-R5寄存器映射到CPU的寄存器上,调用无性能损耗。

    • 调用辅助函数后,R1-R5不可读,R0存储返回的值,R6-R9在调用期间不会改变。eBPF程序只有一个ctx,保存在R1上。

    u64 f1() { return (*_f2)(1); }
    u64 f2(u64 a) { return f3(a + 1, a); }
    u64 f3(u64 a, u64 b) { return a - b; }
    // f2的eBPF看起来类似
     f2:
      bpf_mov R2, R1
      bpf_add R1, 1
      bpf_call f3
      bpf_exit

x86_64中,64位寄存器与HW寄存器的对应:

R0 - rax
R1 - rdi
R2 - rsi
R3 - rdx
R4 - rcx
R5 - r8
R6 - rbx
R7 - r13
R8 - r14
R9 - r15
R10 - rbp

x86_64下,C语言

u64 bpf_filter(u64 ctx)
{
    return foo(ctx, 2, 3, 4, 5) + bar(ctx, 6, 7, 8, 9);
}

对应的BPF

bpf_mov R6, R1 /* save ctx */
bpf_mov R2, 2
bpf_mov R3, 3
bpf_mov R4, 4
bpf_mov R5, 5
bpf_call foo
bpf_mov R7, R0 /* save foo() return value */
bpf_mov R1, R6 /* restore ctx for next call */
bpf_mov R2, 6
bpf_mov R3, 7
bpf_mov R4, 8
bpf_mov R5, 9
bpf_call bar
bpf_add R0, R7
bpf_exit

对应的JIT编译结果

push %rbp
mov %rsp,%rbp
sub $0x228,%rsp
mov %rbx,-0x228(%rbp)
mov %r13,-0x220(%rbp)
mov %rdi,%rbx
mov $0x2,%esi
mov $0x3,%edx
mov $0x4,%ecx
mov $0x5,%r8d
callq foo
mov %rax,%r13
mov %rbx,%rdi
mov $0x6,%esi
mov $0x7,%edx
mov $0x8,%ecx
mov $0x9,%r8d
callq bar
add %r13,%rax
mov -0x228(%rbp),%rbx
mov -0x220(%rbp),%r13
leaveq
retq

为了保证eBPF程序迅速停止,eBPF程序限制只能有4096条指令。(注:现在限制扩大到了100万条)

根据不同用例,ctx中的内容不同。对于seccomp,R1指向seccomp_data;对于BPF filters,R1指向skb

注:Seccomp(全称:secure computing mode)在2.6.12版本中引入linux内核,作为一种安全机制,主要用于限制用户态程序对系统调用的滥用。由于限制太强,后引入seccomp-bpf,借助bpf规则来过滤系统调用。

指令的转换:

op:16, jt:8, jf:8, k:32    ==>    op:8, dst_reg:4, src_reg:4, off:16, imm:32
  • 目前内部BPF(eBPF)指令已有87条。
  • 内部BPF是一个通用的RISC指令集。复杂的过滤器可能耗尽寄存器,需要使用到堆栈。
  • 程序的安全性通过两个步骤确定:深度优先搜索,禁止循环,并进行其他CFG验证;从第一个指令开始,探测所有可能的路径,观察寄存器和堆栈的状态变化。

eBPF opcode

+----------------+--------+--------------------+
|   4 bits       |  1 bit |   3 bits           |
| operation code | source | instruction class  |
+----------------+--------+--------------------+
(MSB)                                      (LSB)

最低3位(LSB)存储指令类别:

Classic BPF classes:    eBPF classes:

 BPF_LD    0x00          BPF_LD    0x00
 BPF_LDX   0x01          BPF_LDX   0x01
 BPF_ST    0x02          BPF_ST    0x02
 BPF_STX   0x03          BPF_STX   0x03
 BPF_ALU   0x04          BPF_ALU   0x04
 BPF_JMP   0x05          BPF_JMP   0x05
 BPF_RET   0x06          BPF_JMP32 0x06
 BPF_MISC  0x07          BPF_ALU64 0x07

对于逻辑运算和跳转指令

  • 当BPF_CLASS(code) == BPF_ALU或BPF_JMP时,第四位(source部分)可以为
BPF_K     0x00
 BPF_X     0x08

* in classic BPF, this means:

 BPF_SRC(code) == BPF_X - use register X as source operand
 BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand

* in eBPF, this means:

 BPF_SRC(code) == BPF_X - use 'src_reg' register as source operand
 BPF_SRC(code) == BPF_K - use 32-bit immediate as source operand
  • 当BPF_CLASS(code) == BPF_ALU或BPF_ALU64 ,前四位BPF_OP(code)可以为:
BPF_ADD   0x00
BPF_SUB   0x10
BPF_MUL   0x20
BPF_DIV   0x30
BPF_OR    0x40
BPF_AND   0x50
BPF_LSH   0x60
BPF_RSH   0x70
BPF_NEG   0x80
BPF_MOD   0x90
BPF_XOR   0xa0
BPF_MOV   0xb0  /* eBPF only: mov reg to reg */
BPF_ARSH  0xc0  /* eBPF only: sign extending shift right */
BPF_END   0xd0  /* eBPF only: endianness conversion */

当BPF_CLASS(code) == BPF_JMP或BPF_JMP32,前四位BPF_OP(code)可以为:

BPF_JA    0x00  /* BPF_JMP only */
BPF_JEQ   0x10
BPF_JGT   0x20
BPF_JGE   0x30
BPF_JSET  0x40
BPF_JNE   0x50  /* eBPF only: jump != */
BPF_JSGT  0x60  /* eBPF only: signed '>' */
BPF_JSGE  0x70  /* eBPF only: signed '>=' */
BPF_CALL  0x80  /* eBPF BPF_JMP only: function call */
BPF_EXIT  0x90  /* eBPF BPF_JMP only: function return */
BPF_JLT   0xa0  /* eBPF only: unsigned '<' */
BPF_JLE   0xb0  /* eBPF only: unsigned '<=' */
BPF_JSLT  0xc0  /* eBPF only: signed '<' */
BPF_JSLE  0xd0  /* eBPF only: signed '<=' */
  • BPF_XOR | BPF_K | BPF_ALU 在cBPF中表示A ^= imm32;在eBPF中,原文档写作src_reg = (u32) src_reg ^ (u32) imm32,但从内核代码看,实际应为dst_reg = (u32) dst_reg ^ (u32) imm32。BPF_JMP | BPF_EXIT表示直接退出。

对于加载和存储指令

+--------+--------+-------------------+
| 3 bits | 2 bits |   3 bits          |
|  mode  |  size  | instruction class |
+--------+--------+-------------------+
(MSB)                             (LSB)

size部分

 BPF_W   0x00    /* word */
 BPF_H   0x08    /* half word */
 BPF_B   0x10    /* byte */
 BPF_DW  0x18    /* eBPF only, double word */
 
B  - 1 byte
H  - 2 byte
W  - 4 byte
DW - 8 byte (eBPF only)

mode部分

BPF_IMM  0x00  /* used for 32-bit mov in classic BPF and 64-bit in eBPF */
BPF_ABS  0x20
BPF_IND  0x40
BPF_MEM  0x60
BPF_LEN  0x80  /* classic BPF only, reserved in eBPF */
BPF_MSH  0xa0  /* classic BPF only, reserved in eBPF */
BPF_XADD 0xc0  /* eBPF only, exclusive add */

两个非通用指令(BPF_ABS | <size> | BPF_LD)和(BPF_IND | <size> | BPF_LD),用于访问数据包中的字段。使用条件:

1)ctx是指向sk_buff的指针

2)R0-R6规定为:R0保存获取的结果,R6保存sk_buff指针,R1-R5为暂存器,不可存储需要跨指令的数据(即执行完指令后R1-R5中的数据不可用)

3)访问超出sk_buff->data范围 ,自动跳出

BPF_IND | BPF_W | BPF_LD means:

  R0 = ntohl(*(u32 *) (((struct sk_buff *) R6)->data + src_reg + imm32))
  and R1 - R5 were scratched.

其他的一些例子

BPF_MEM | <size> | BPF_STX:  *(size *) (dst_reg + off) = src_reg
BPF_MEM | <size> | BPF_ST:   *(size *) (dst_reg + off) = imm32
BPF_MEM | <size> | BPF_LDX:  dst_reg = *(size *) (src_reg + off)
BPF_XADD | BPF_W  | BPF_STX: lock xadd *(u32 *)(dst_reg + off16) += src_reg
BPF_XADD | BPF_DW | BPF_STX: lock xadd *(u64 *)(dst_reg + off16) += src_reg

注:在linux/filter.h下都有。ST/STX是把立即数/src_reg的值写入dst_reg + off指向的内存,LDX是把src_reg + off指向的内存读入dst_reg

BPF_LD | BPF_DW | BPF_IMM(获取一个8字节的立即数)会跨两个指令。

#define BPF_LD_IMM64_RAW(DST, SRC, IMM)				\
	((struct bpf_insn) {					\
		.code  = BPF_LD | BPF_DW | BPF_IMM,		\
		.dst_reg = DST,					\
		.src_reg = SRC,					\
		.off   = 0,					\
		.imm   = (__u32) (IMM) }),			\
	((struct bpf_insn) {					\
		.code  = 0, /* zero is reserved opcode */	\
		.dst_reg = 0,					\
		.src_reg = 0,					\
		.off   = 0,					\
		.imm   = ((__u64) (IMM)) >> 32 })

eBPF验证器

两步:

1)做DAG(有向无环图)检测,确保无循环,并进行其他CFG(控制流图)验证

2)从第一条指令开始,模拟执行所有可能的指令路径,观察寄存器和堆栈状态

一些检测规则:

1)不允许指针相加

2)不允许读未初始化的寄存器

3)辅助函数调用后R1-R5为不可读状态

4)load/store指令只有在相应的寄存器类型为PTR_TO_CTX、PTR_TO_MAP或PTR_TO_STACK时才有效(比如不能是标量),并且会有边界和对齐检测

5)程序开始时,R1类型为PTR_TO_CTX(ctx指针)。可以通过is_valid_access()回调函数,自定义验证方式,来进行边界与对齐检测。另外类型为PTR_TO_STACK(堆栈指针)时,访问边界为[-MAX_BPF_STACK, 0)

6)堆栈只有在写入后才能读取(对应PTR_TO_STACK类型寄存器的操作)

7)bpf_verifier_ops->get_func_proto()可以用来自定义“函数调用”时的“参数检测”(检测寄存器),返回值保存在R0

8)eBPF对seccomp与socket filter的验证方式是相同的,而cBPF对于seccomp,需要先进行seccomp verifier,再进行其他的验证。

详细代码见kernel/bpf/verifier.c

寄存器值追踪

验证器追踪寄存器中的值,检测值可能的范围。追踪由include/linux/bpf_verifier.h下的struct bpf_reg_state完成,寄存器值可能的类型:NOT_INIT、SCALAR_VALUE和指针(pointer),pointer又分为:

PTR_TO_CTX          Pointer to bpf_context.
 CONST_PTR_TO_MAP    Pointer to struct bpf_map.  "Const" because arithmetic
                     on these pointers is forbidden.
 PTR_TO_MAP_VALUE    Pointer to the value stored in a map element.
 PTR_TO_MAP_VALUE_OR_NULL
                     Either a pointer to a map value, or NULL; map accesses
                     (see section 'eBPF maps', below) return this type,
                     which becomes a PTR_TO_MAP_VALUE when checked != NULL.
                     Arithmetic on these pointers is forbidden.
 PTR_TO_STACK        Frame pointer.
 PTR_TO_PACKET       skb->data.
 PTR_TO_PACKET_END   skb->data + headlen; arithmetic forbidden.
 PTR_TO_SOCKET       Pointer to struct bpf_sock_ops, implicitly refcounted.
 PTR_TO_SOCKET_OR_NULL
                     Either a pointer to a socket, or NULL; socket lookup
                     returns this type, which becomes a PTR_TO_SOCKET when
                     checked != NULL. PTR_TO_SOCKET is reference-counted,
                     so programs must release the reference through the
                     socket release function before the end of the program.
                     Arithmetic on these pointers is forbidden.

对于PTR_TO_SOCKET和PTR_TO_SOCKET_OR_NULL,在程序结束前需要使用socket release方法,释放引用

对于指针偏移的验证:

1)分为fixed offset和variable offset

2)验证器对variable offset需要记录的状态:

* minimum and maximum values as unsigned
* minimum and maximum values as signed
* 一个'tnum'值,由两个u64组成,第一个为value,是具体的值;第二个为mask,用1标记未知的位。比如知道寄存器前56位为0,后8位不知道,则表示为tnum(0x0,0xff),表示省略了高位的0
  • 状态可以通过逻辑运算更新,例如上面的tnum(0x0,0xff)寄存器与0x40取或,则变为tnum(0x40,0xbf)
  • 状态可以根据条件分支判断,例如在SCALAR_VALUE(offset)大于8为true的分支,寄存器的umin_value为9
  • PTR_TO_PACKET类型的寄存器,使用id标识某个偏移量。例如寄存器A拷贝到寄存器B,两者有相同偏移量,因此offset id相同,一次验证即可。
  • PTR_TO_MAP_VALUE_OR_NULL类寄存器(存储bpf map中查询返回的指针),也使用id标识,一旦验证某个id标识非NULL,其他副本也相同。除此外,可以做对齐验证。
  • PTR_TO_SOCKET和PTR_TO_SOCKET_OR_NULL类寄存器(存储socket中查询返回的指针)也使用id标识,与PTR_TO_MAP_VALUE_OR_NULL类似。另外,对非NULL的id,可将其副本调用socket release方法。

数据包的直接访问

对于cls_bpf和act_bpf,允许通过skb->data和skb->data_end直接访问数据包。

1:  r4 = *(u32 *)(r1 +80)  /* load skb->data_end */
2:  r3 = *(u32 *)(r1 +76)  /* load skb->data */
3:  r5 = r3
4:  r5 += 14
5:  if r5 > r4 goto pc+16
R1=ctx R3=pkt(id=0,off=0,r=14) R4=pkt_end R5=pkt(id=0,off=14,r=14) R10=fp
6:  r0 = *(u16 *)(r3 +12) /* access 12 and 13 bytes of the packet */

根据第5行的判断,验证器可以标记R3为pkt(id=0,off=0,r=14),其中id=0表示未向寄存器添加variable offset,off=0表示未向寄存器添加fixed offset,r=14表示安全访问范围为[R3,R3+14)。同理R5标记为pkt(id=0,off=14,r=14)

eBPF maps

map用于内核和用户空间之间共享不同类型的数据。map存储使用BPF系统调用:

- create a map with given type and attributes
  map_fd = bpf(BPF_MAP_CREATE, union bpf_attr *attr, u32 size)
  using attr->map_type, attr->key_size, attr->value_size, attr->max_entries
  returns process-local file descriptor or negative error

- lookup key in a given map
  err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero and stores found elem into value or negative error

- create or update key/value pair in a given map
  err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key, attr->value
  returns zero or negative error

- find and delete element by key in a given map
  err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
  using attr->map_fd, attr->key

- to delete map: close(fd)
  Exiting process will delete maps automatically

map类型:hash、array、bloom filter、radix-tree等;map定义:

. type
. max number of elements
. key size in bytes
. value size in bytes

修剪(pruning)

验证器(verifier)在对新分支进行分析时,会比较当前指令之前检测过的状态,如果当前(寄存器)状态属于之前检测过的状态,这个分支就被认为可信,被修剪。

理解验证器的日志

  • 不可达指令

    static struct bpf_insn prog[] = {
       BPF_EXIT_INSN(),
       BPF_EXIT_INSN(),
     };
     Error:
       unreachable insn 1
  • 读取未初始化的寄存器

      BPF_MOV64_REG(BPF_REG_0, BPF_REG_2),
      BPF_EXIT_INSN(),
    Error:
      0: (bf) r0 = r2
      R2 !read_ok
  • 程序退出前未初始化R0寄存器

      BPF_MOV64_REG(BPF_REG_2, BPF_REG_1),
      BPF_EXIT_INSN(),
    Error:
      0: (bf) r2 = r1
      1: (95) exit
      R0 !read_ok
  • 越界访问堆栈

      BPF_ST_MEM(BPF_DW, BPF_REG_10, 8, 0),
      BPF_EXIT_INSN(),
    Error:
      0: (7a) *(u64 *)(r10 +8) = 0
      invalid stack off=8 size=8
  • 未初始化堆栈(map_lookup_elem调用,R1为文件描述符,R2为key,读取了R2指向的未初始化堆栈)

    BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
    BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
    BPF_LD_MAP_FD(BPF_REG_1, 0),
    BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
    BPF_EXIT_INSN(),
    Error:
    0: (bf) r2 = r10
    1: (07) r2 += -8
    2: (b7) r1 = 0x0
    3: (85) call 1
    invalid indirect read from stack off -8+0 size 8
  • 无效的map_fd(R1中的0x0)

     BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
      BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
      BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
      BPF_LD_MAP_FD(BPF_REG_1, 0),
      BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
      BPF_EXIT_INSN(),
    Error:
      0: (7a) *(u64 *)(r10 -8) = 0
      1: (bf) r2 = r10
      2: (07) r2 += -8
      3: (b7) r1 = 0x0
      4: (85) call 1
      fd 0 is not pointing to valid bpf_map
  • map_lookup_elem的返回值未进行验证(保存在R0)

    map element:
      BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
      BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
      BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
      BPF_LD_MAP_FD(BPF_REG_1, 0),
      BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
      BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
      BPF_EXIT_INSN(),
    Error:
      0: (7a) *(u64 *)(r10 -8) = 0
      1: (bf) r2 = r10
      2: (07) r2 += -8
      3: (b7) r1 = 0x0
      4: (85) call 1
      5: (7a) *(u64 *)(r0 +0) = 0
      R0 invalid mem access 'map_value_or_null'
  • BPF_ST_MEM以错误的对齐方式进行访问

    BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
      BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
      BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
      BPF_LD_MAP_FD(BPF_REG_1, 0),
      BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
      BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1),
      BPF_ST_MEM(BPF_DW, BPF_REG_0, 4, 0),
      BPF_EXIT_INSN(),
    Error:
      0: (7a) *(u64 *)(r10 -8) = 0
      1: (bf) r2 = r10
      2: (07) r2 += -8
      3: (b7) r1 = 1
      4: (85) call 1
      5: (15) if r0 == 0x0 goto pc+1
       R0=map_ptr R10=fp
      6: (7a) *(u64 *)(r0 +4) = 0
      misaligned access off 4 size 8
  • map_lookup_elem结果R0为NULL,pc+2,执行指令8,会出现错误

     BPF_ST_MEM(BPF_DW, BPF_REG_10, -8, 0),
      BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
      BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
      BPF_LD_MAP_FD(BPF_REG_1, 0),
      BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
      BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 2),
      BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 0),
      BPF_EXIT_INSN(),
      BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),
      BPF_EXIT_INSN(),
    Error:
      0: (7a) *(u64 *)(r10 -8) = 0
      1: (bf) r2 = r10
      2: (07) r2 += -8
      3: (b7) r1 = 1
      4: (85) call 1
      5: (15) if r0 == 0x0 goto pc+2
       R0=map_ptr R10=fp
      6: (7a) *(u64 *)(r0 +0) = 0
      7: (95) exit
    
      from 5 to 8: R0=imm0 R10=fp
      8: (7a) *(u64 *)(r0 +0) = 1
      R0 invalid mem access 'imm'
  • bpf_sk_lookup_tcp三个参数R1=ptr_to_ctx;R2=ptr_to_mem,将R0置NULL前未检测、未释放引用

      BPF_MOV64_IMM(BPF_REG_2, 0),
      BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
      BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
      BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
      BPF_MOV64_IMM(BPF_REG_3, 4),
      BPF_MOV64_IMM(BPF_REG_4, 0),
      BPF_MOV64_IMM(BPF_REG_5, 0),
      BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
      BPF_MOV64_IMM(BPF_REG_0, 0),
      BPF_EXIT_INSN(),
    Error:
      0: (b7) r2 = 0
      1: (63) *(u32 *)(r10 -8) = r2
      2: (bf) r2 = r10
      3: (07) r2 += -8
      4: (b7) r3 = 4
      5: (b7) r4 = 0
      6: (b7) r5 = 0
      7: (85) call bpf_sk_lookup_tcp#65
      8: (b7) r0 = 0
      9: (95) exit
      Unreleased reference id=1, alloc_insn=7
  • R0未检测是否为NULL就返回了

      BPF_MOV64_IMM(BPF_REG_2, 0),
      BPF_STX_MEM(BPF_W, BPF_REG_10, BPF_REG_2, -8),
      BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),
      BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -8),
      BPF_MOV64_IMM(BPF_REG_3, 4),
      BPF_MOV64_IMM(BPF_REG_4, 0),
      BPF_MOV64_IMM(BPF_REG_5, 0),
      BPF_EMIT_CALL(BPF_FUNC_sk_lookup_tcp),
      BPF_EXIT_INSN(),
    Error:
      0: (b7) r2 = 0
      1: (63) *(u32 *)(r10 -8) = r2
      2: (bf) r2 = r10
      3: (07) r2 += -8
      4: (b7) r3 = 4
      5: (b7) r4 = 0
      6: (b7) r5 = 0
      7: (85) call bpf_sk_lookup_tcp#65
      8: (95) exit
      Unreleased reference id=1, alloc_insn=7

测试

内核附带测试模块,包含用于cBPF和eBPF的各种测试,在lib/test_bpf.c中 ,通过Kconfig启动:

CONFIG_TEST_BPF=m

编译安装测试模块后,可以通过insmod或modprobe执行测试。结果在内核日志(dmesg)中可以找到。

参考(图片来源)

https://cloud.tencent.com/developer/inventory/600/article/1698426

https://damonyi.cc/2021/01/26/Linux-eBPF%E4%BB%8B%E7%BB%8D/