Featured image

How to use bpftrace

So. What’s bpftrace?

bpftrace is a high-level tracing language for Linux enhanced Berkeley Packet Filter (eBPF) available in recent Linux kernels (4.x). bpftrace uses LLVM as a backend to compile scripts to BPF-bytecode and makes use of BCC for interacting with the Linux BPF system, as well as existing Linux tracing capabilities: kernel dynamic tracing (kprobes), user-level dynamic tracing (uprobes), and tracepoints. The bpftrace language is inspired by awk and C, and predecessor tracers such as DTrace and SystemTap. bpftrace was created by Alastair Robertson.

Oha. So we can trace arbitrary kernel and user space functions. That sounds nice.

Which kernel functions can we trace?

kmille@linbox:~ sudo bpftrace -l | grep ipv4
...
kprobe:nf_nat_ipv4_in
kprobe:nf_nat_ipv4_out
kprobe:nf_nat_ipv4_local_fn
kprobe:nf_nat_redirect_ipv4
kprobe:nf_nat_masquerade_ipv4
kfunc:selinux_ipv4_postroute
kfunc:selinux_ipv4_output
kfunc:selinux_ipv4_forward
kfunc:smk_ipv4_check
kfunc:smack_ipv4_output
kfunc:ipv4_skb_to_auditdata
kfunc:apparmor_ipv4_postroute

There are different probe types (kfunc = kernel function, kprobe = kernel probe, uprobe = probe in user space). To list all functions the kernel currently knows about:

kmille@linbox:~ sudo cat /sys/kernel/debug/tracing/available_filter_functions | head
__traceiter_initcall_level
__traceiter_initcall_start
__traceiter_initcall_finish
trace_initcall_finish_cb
initcall_blacklisted
do_one_initcall
do_one_initcall
match_dev_by_label
match_dev_by_uuid
rootfs_init_fs_context
kmille@linbox:~

Let’s list the currently loaded iptables modules:

kmille@linbox:~ sudo lsmod  | grep iptable 
iptable_nat            16384  1
nf_nat                 53248  2 iptable_nat,xt_MASQUERADE
iptable_filter         16384  1
ip_tables              32768  2 iptable_filter,iptable_nat
x_tables               53248  11 ip6table_filter,xt_conntrack,iptable_filter,xt_state,xt_tcpudp,xt_addrtype,ip6_tables,ipt_REJECT,ip_tables,xt_MASQUERADE,ip6t_REJECT

To list the functions of the kernel module iptable_nat:

kmille@linbox:~ sudo cat /sys/kernel/debug/tracing/available_filter_functions | grep iptable_nat
iptable_nat_net_exit [iptable_nat]
iptable_nat_net_pre_exit [iptable_nat]
iptable_nat_do_chain [iptable_nat]
iptable_nat_table_init.part.0 [iptable_nat]
iptable_nat_table_init [iptable_nat]
kmille@linbox:~

Example

  1. Let’s use strace to find some interesting system calls.
root@buster:/home/vagrant# strace -e network iptables-legacy -nL
socket(AF_INET, SOCK_RAW, IPPROTO_RAW)  = 4
getsockopt(4, SOL_IP, IPT_SO_GET_INFO, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [84]) = 0
getsockopt(4, SOL_IP, IPT_SO_GET_ENTRIES, "filter\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., [672]) = 0
Chain INPUT (policy ACCEPT)
target     prot opt source               destination         

Chain FORWARD (policy ACCEPT)
target     prot opt source               destination         

Chain OUTPUT (policy ACCEPT)
target     prot opt source               destination         
+++ exited with 0 +++
  1. Let’s find out which kernel we use. Get the kernel source code and check out the right version.
root@buster:~/mod/linux# uname -a
Linux buster 4.19.0-8-amd64 #1 SMP Debian 4.19.98-1 (2020-01-26) x86_64 GNU/Linux

root@buster:~/mod/linux# git status
HEAD detached at v4.19-rc8
nothing to commit, working tree clean
  1. grep for some function names, constants or variable names
root@buster:~/mod/linux# rg IPT_SO_GET_INFO
net/ipv4/ip_sockglue.c
1563:   if (optname >= BPFILTER_IPT_SO_GET_INFO &&
1600:   if (optname >= BPFILTER_IPT_SO_GET_INFO &&

net/ipv4/netfilter/ip_tables.c
1652:   case IPT_SO_GET_INFO:
1698:   case IPT_SO_GET_INFO:

include/uapi/linux/bpfilter.h
14:     BPFILTER_IPT_SO_GET_INFO = 64,

include/uapi/linux/netfilter_ipv4/ip_tables.h
140:#define IPT_SO_GET_INFO                     (IPT_BASE_CTL)
156:/* The argument to IPT_SO_GET_INFO */

net/ipv4/netfilter/ip_tables.c looks promising. There are two interesting functions: compat_do_ipt_get_ctl and do_ipt_get_ctl. Let’s use bpftrace to find out whether do_ipt_get_ctl is called.

root@buster:~# bpftrace -e 'kprobe:do_ipt_get_ctl { printf("function was called!\n"); }'
Attaching 1 probe...
function was called!
function was called!

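To make it clear: I ran iptables in another terminal window while bpftrace was attached. Let’s get some more details. This is what the signature of the compat_do_ipt_get_ctl function looks like (do_ipt_get_ctl, the function we actually probe, takes the same parameters):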

static int compat_do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)

Let’s write some code.

#include <net/sock.h>

kprobe:do_ipt_get_ctl
{
    printf("called by %s (pid: %d). and: %d\n", comm, pid, ((sock *)arg0)->__sk_common.skc_family);
}

And execute it.

root@buster:~# bpftrace mypbraceprogram.bpf
/bpftrace/include/stdarg.h:52:1: warning: null character ignored [-Wnull-character]       
/lib/modules/4.19.0-8-amd64/source/arch/x86/include/asm/bitops.h:209:2: error: 'asm goto' constructs are not supported yet
/lib/modules/4.19.0-8-amd64/source/arch/x86/include/asm/bitops.h:256:2: error: 'asm goto' constructs are not supported yet
/lib/modules/4.19.0-8-amd64/source/arch/x86/include/asm/bitops.h:310:2: error: 'asm goto' constructs are not supported yet
/lib/modules/4.19.0-8-amd64/source/arch/x86/include/asm/jump_label.h:23:2: error: 'asm goto' constructs are not supported yet
/lib/modules/4.19.0-8-amd64/source/arch/x86/include/asm/signal.h:24:2: note: array 'sig' declared here
Attaching 1 probe...
called by iptables-legacy (pid: 2981). and: 2
called by iptables-legacy (pid: 2981). and: 2

What does the 2 mean?

/usr/src/linux-headers-4.19.0-8-common/include/linux/socket.h
160 /* Supported address families. */
161 #define AF_UNSPEC   0
162 #define AF_UNIX     1   /* Unix domain sockets      */
163 #define AF_LOCAL    1   /* POSIX name for AF_UNIX   */
164 #define AF_INET     2   /* Internet IP Protocol     */
165 #define AF_AX25     3   /* Amateur Radio AX.25      */
166 #define AF_IPX      4   /* Novell IPX           */

So it’s just the constant for IPv4 (AF_INET). I think ((sock *)arg0)->__sk_common.skc_family needs some further explanation. The first parameter of compat_do_ipt_get_ctl (arg0 in bpftrace terms) is a pointer to a struct sock. A struct sock has a member of type struct sock_common called __sk_common, and __sk_common in turn has the member skc_family. You can (have to) find this out by grepping through the code.

root@buster:/usr/src/linux-headers-4.19.0-8-common/include# rg '^struct sock \{'
net/sock.h
327:struct sock {
    ....
239 /**
 240   * struct sock - network layer representation of sockets
 241   * @__sk_common: shared layout with inet_timewait_sock
 242   * @sk_shutdown: mask of %SEND_SHUTDOWN and/or %RCV_SHUTDOWN
 243   * @sk_userlocks: %SO_SNDBUF and %SO_RCVBUF settings
 244   * @sk_lock:   synchronizer
 245   * @sk_kern_sock: True if sock is using kernel lock classes
 ...
 322   * @sk_rcu: used during RCU grace period
 323   * @sk_clockid: clockid used by time-based scheduling (SO_TXTIME)
 324   * @sk_txtime_deadline_mode: set deadline mode for SO_TXTIME
 325   * @sk_txtime_unused: unused txtime flags
 326   */
 327 struct sock {
 328     /*
 329      * Now struct inet_timewait_sock also uses sock_common, so please just
 330      * don't add nothing before this first member (__sk_common) --acme
 331      */
 332     struct sock_common  __sk_common;
 333 #define sk_node         __sk_common.skc_node
 
     ...
 
 123  *  struct sock_common - minimal network layer representation of sockets
 124  *  @skc_daddr: Foreign IPv4 addr
 125  *  @skc_rcv_saddr: Bound local IPv4 addr
 126  *  @skc_hash: hash value used with various protocol lookup tables
 127  *  @skc_u16hashes: two u16 hash values used by UDP lookup tables
 128  *  @skc_dport: placeholder for inet_dport/tw_dport
 129  *  @skc_num: placeholder for inet_num/tw_num
 130  *  @skc_family: network address family                          <----- here it is
 131  *  @skc_state: Connection state
...
 143  *  @skc_flags: place holder for sk_flags
 144  *      %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
 145  *      %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
 146  *  @skc_incoming_cpu: record/match cpu processing incoming packets
 147  *  @skc_refcnt: reference count
 148  *
 149  *  This is the minimal network layer representation of sockets, the header
 150  *  for struct sock and struct inet_timewait_sock.
 151  */
 152 struct sock_common {

Another example you can try is this one:

#include <linux/path.h>
#include <linux/dcache.h>

kprobe:vfs_open
{
    printf("open path: %s\n", str(((struct path *)arg0)->dentry->d_name.name));
}

You can run it in a single command too:

sudo bpftrace -I /usr/lib/modules/5.11.2-arch1-1/build/include/linux/ -e 'kprobe:vfs_open { printf("open path: %s\n", str(((struct path *)arg0)->dentry->d_name.name)); }' -v