Redirect port using TC BPF

I want to use TC BPF to redirect incoming traffic from port 80 to port 8080.
Below is my own code, but I’ve also tried the example from man 8 tc-bpf (search for 8080) and I get the same result.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#include <linux/pkt_cls.h>
#include <linux/if_ether.h>
#include <linux/tcp.h>
#include <linux/in.h>
#include <linux/ip.h>

#include <linux/filter.h>

static inline void set_tcp_dport(struct __sk_buff *skb, int nh_off,
                                            __u16 old_port, __u16 new_port)
{
    bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check),
                        old_port, new_port, sizeof(new_port));
    bpf_skb_store_bytes(skb, nh_off + offsetof(struct tcphdr, dest),
                        &new_port, sizeof(new_port), 0);
}

/*
 * tc program: rewrites the TCP destination port 80 -> 8080 on IPv4 packets
 * and always lets the packet continue (TC_ACT_OK).
 *
 * Only the destination port is touched; nothing here rewrites the source
 * port of packets travelling in the opposite direction.
 */
SEC("tc_my")
int tc_bpf_my(struct __sk_buff *skb)
{
    struct iphdr ip;
    struct tcphdr tcp;

    /* The parsing below assumes an IPv4 header right after the Ethernet header. */
    if (skb->protocol != bpf_htons(ETH_P_IP))
        return TC_ACT_OK;

    if (0 != bpf_skb_load_bytes(skb, ETH_HLEN, &ip, sizeof(ip))) {
        bpf_printk("bpf_skb_load_bytes iph failed");
        return TC_ACT_OK;
    }

    /* Ports only exist for TCP here; ihl below 5 words is not a valid header. */
    if (ip.protocol != IPPROTO_TCP || ip.ihl < 5)
        return TC_ACT_OK;

    /* Non-first fragments carry no TCP header (offset = low 13 bits of frag_off). */
    if (ip.frag_off & bpf_htons(0x1fff))
        return TC_ACT_OK;

    /* Honour IP options: the TCP header starts ihl 32-bit words into the IP header. */
    const int tcp_off = ETH_HLEN + (ip.ihl << 2);

    if (0 != bpf_skb_load_bytes(skb, tcp_off, &tcp, sizeof(tcp))) {
        bpf_printk("bpf_skb_load_bytes tcph failed");
        return TC_ACT_OK;
    }

    unsigned int src_port = bpf_ntohs(tcp.source);
    unsigned int dst_port = bpf_ntohs(tcp.dest);

    /* Debug trace for the ports of interest only. */
    if (src_port == 80 || dst_port == 80 || src_port == 8080 || dst_port == 8080)
        bpf_printk("%pI4:%u -> %pI4:%u", &ip.saddr, src_port, &ip.daddr, dst_port);

    if (dst_port != 80)
        return TC_ACT_OK;

    /* Write at the same offset the header was read from. */
    set_tcp_dport(skb, tcp_off, __constant_htons(80), __constant_htons(8080));

    return TC_ACT_OK;
}

/* License string for this BPF object, emitted into the "license" section. */
char LICENSE[] SEC("license") = "GPL";

On machine A, I am running:

clang -g -O2 -Wall -target bpf -c tc_my.c -o tc_my.o
tc qdisc add dev ens160 clsact
tc filter add dev ens160 ingress bpf da obj tc_my.o sec tc_my
nc -l 8080

On machine B:

nc $IP_A 80

On machine B, nc seems connected, but ss shows:

SYN-SENT   0      1       $IP_B:53442   $IP_A:80    users:(("nc",pid=30180,fd=3))

On machine A, connection remains in SYN-RECV before being dropped.

I was expecting my program to behave as if I added this iptables rule:

iptables -t nat -A PREROUTING -p tcp -m tcp --dport 80 -j REDIRECT --to-port 8080

Maybe my expectations are wrong, but I would like to understand why. How can I get my TC BPF redirect to work?

SOLUTION

Following the explanation in the accepted answer, here is example code that works for TCP: it does ingress NAT 90->8080 and egress de-NAT 8080->90.

#include <linux/bpf.h>
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_endian.h>
#include <linux/pkt_cls.h>
#include <linux/if_ether.h>
#include <linux/tcp.h>
#include <linux/in.h>
#include <linux/ip.h>

#include <linux/filter.h>

static inline void set_tcp_dport(struct __sk_buff *skb, int nh_off,
                                 __u16 old_port, __u16 new_port)
{
    bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check),
                        old_port, new_port, sizeof(new_port));
    bpf_skb_store_bytes(skb, nh_off + offsetof(struct tcphdr, dest),
                        &new_port, sizeof(new_port), 0);
}

static inline void set_tcp_sport(struct __sk_buff *skb, int nh_off,
                                 __u16 old_port, __u16 new_port)
{
    bpf_l4_csum_replace(skb, nh_off + offsetof(struct tcphdr, check),
                        old_port, new_port, sizeof(new_port));
    bpf_skb_store_bytes(skb, nh_off + offsetof(struct tcphdr, source),
                        &new_port, sizeof(new_port), 0);
}

/*
 * Ingress half of the stateless port NAT: rewrites the TCP destination
 * port 90 -> 8080 on IPv4 packets and always lets the packet continue
 * (TC_ACT_OK).
 */
SEC("tc_ingress")
int tc_ingress_(struct __sk_buff *skb)
{
    struct iphdr ip;
    struct tcphdr tcp;

    /* The parsing below assumes an IPv4 header right after the Ethernet header. */
    if (skb->protocol != bpf_htons(ETH_P_IP))
        return TC_ACT_OK;

    if (0 != bpf_skb_load_bytes(skb, ETH_HLEN, &ip, sizeof(ip)))
    {
        bpf_printk("bpf_skb_load_bytes iph failed");
        return TC_ACT_OK;
    }

    /* Ports only exist for TCP here; ihl below 5 words is not a valid header. */
    if (ip.protocol != IPPROTO_TCP || ip.ihl < 5)
        return TC_ACT_OK;

    /* Non-first fragments carry no TCP header (offset = low 13 bits of frag_off). */
    if (ip.frag_off & bpf_htons(0x1fff))
        return TC_ACT_OK;

    /* Honour IP options: the TCP header starts ihl 32-bit words into the IP header. */
    const int tcp_off = ETH_HLEN + (ip.ihl << 2);

    if (0 != bpf_skb_load_bytes(skb, tcp_off, &tcp, sizeof(tcp)))
    {
        bpf_printk("bpf_skb_load_bytes tcph failed");
        return TC_ACT_OK;
    }

    unsigned int src_port = bpf_ntohs(tcp.source);
    unsigned int dst_port = bpf_ntohs(tcp.dest);

    /* Debug trace for the ports of interest only. */
    if (src_port == 90 || dst_port == 90 || src_port == 8080 || dst_port == 8080)
        bpf_printk("INGRESS %pI4:%u -> %pI4:%u", &ip.saddr, src_port, &ip.daddr, dst_port);

    if (dst_port != 90)
        return TC_ACT_OK;

    /* Write at the same offset the header was read from. */
    set_tcp_dport(skb, tcp_off, __constant_htons(90), __constant_htons(8080));

    return TC_ACT_OK;
}

/*
 * Egress half of the stateless port NAT: rewrites the TCP source port
 * 8080 -> 90 on IPv4 packets and always lets the packet continue
 * (TC_ACT_OK).
 */
SEC("tc_egress")
int tc_egress_(struct __sk_buff *skb)
{
    struct iphdr ip;
    struct tcphdr tcp;

    /* The parsing below assumes an IPv4 header right after the Ethernet header. */
    if (skb->protocol != bpf_htons(ETH_P_IP))
        return TC_ACT_OK;

    if (0 != bpf_skb_load_bytes(skb, ETH_HLEN, &ip, sizeof(ip)))
    {
        bpf_printk("bpf_skb_load_bytes iph failed");
        return TC_ACT_OK;
    }

    /* Ports only exist for TCP here; ihl below 5 words is not a valid header. */
    if (ip.protocol != IPPROTO_TCP || ip.ihl < 5)
        return TC_ACT_OK;

    /* Non-first fragments carry no TCP header (offset = low 13 bits of frag_off). */
    if (ip.frag_off & bpf_htons(0x1fff))
        return TC_ACT_OK;

    /* Honour IP options: the TCP header starts ihl 32-bit words into the IP header. */
    const int tcp_off = ETH_HLEN + (ip.ihl << 2);

    if (0 != bpf_skb_load_bytes(skb, tcp_off, &tcp, sizeof(tcp)))
    {
        bpf_printk("bpf_skb_load_bytes tcph failed");
        return TC_ACT_OK;
    }

    unsigned int src_port = bpf_ntohs(tcp.source);
    unsigned int dst_port = bpf_ntohs(tcp.dest);

    /* Debug trace for the ports of interest only. */
    if (src_port == 90 || dst_port == 90 || src_port == 8080 || dst_port == 8080)
        bpf_printk("EGRESS %pI4:%u -> %pI4:%u", &ip.saddr, src_port, &ip.daddr, dst_port);

    if (src_port != 8080)
        return TC_ACT_OK;

    /* Write at the same offset the header was read from. */
    set_tcp_sport(skb, tcp_off, __constant_htons(8080), __constant_htons(90));

    return TC_ACT_OK;
}

/* License string for this BPF object, emitted into the "license" section. */
char LICENSE[] SEC("license") = "GPL";

Here is how I built and loaded the different sections of my program:

clang -g -O2 -Wall -target bpf -c tc_my.c -o tc_my.o
tc filter add dev ens32 ingress bpf da obj /tc_my.o sec tc_ingress
tc filter add dev ens32 egress bpf da obj /tc_my.o sec tc_egress
Asked By: greenro

||

Unlike Netfilter, which includes a stateful NAT engine (using conntrack lookup entries) and automatically de-NATs reply traffic without an explicit rule telling it to do so, NAT implemented elsewhere is stateless and requires handling both directions. For incoming connections, that means handling NAT at ingress but also explicitly handling de-NAT at egress.

As witnessed by running tcpdump on the client:

# tcpdump -ttt -l -n -s0 -p -i lxcbr0 tcp
tcpdump: verbose output suppressed, use -v[v]... for full protocol decode
listening on lxcbr0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
 00:00:00.000000 IP 10.0.3.1.52542 > 10.0.3.214.80: Flags [S], seq 3033230443, win 64240, options [mss 1460,sackOK,TS val 2154801903 ecr 0,nop,wscale 7], length 0
 00:00:00.000058 IP 10.0.3.214.8080 > 10.0.3.1.52542: Flags [S.], seq 1400064141, ack 3033230444, win 65160, options [mss 1460,sackOK,TS val 3949758745 ecr 2154801903,nop,wscale 7], length 0
 00:00:00.000013 IP 10.0.3.1.52542 > 10.0.3.214.8080: Flags [R], seq 3033230444, win 0, length 0

the current eBPF code does only the first part. Incoming TCP packets to port 80 are indeed switched to port 8080 before any other part of the network stack can know about it, but the reply traffic is then simply sent from port 8080 (knowledge of port 80 is lost after the eBPF code), while the client expects replies from port 80. The client’s kernel therefore replies with a TCP RST, and the client tries again with the same outcome: no connectivity.

An equivalent inverse transformation has to be done on egress. Because all of this is stateless, once it is in place it will no longer be possible to connect directly to port 8080, for the same reason: replies to connections made to port 8080 will now be sent from port 80.

By contrast, applying an equivalent setup to UDP would have worked for incoming traffic only, because UDP doesn’t need to emit back anything when receiving traffic. But sending back ICMP errors (for example to signal to the client there is no longer a server listening) would fail. Even if eBPF code was done for the other direction for UDP, an ICMP error would still include the wrong UDP port in its partial UDP payload. Netfilter’s NAT also takes care of this.

Answered By: A.B
Categories: Answers Tags: , , ,
Answers are sorted by their score. The answer accepted by the question owner as the best is marked with
at the top-right corner.