An Analysis of the eBPF Code in Suricata

The eBPF code in Suricata lives in the ebpf directory and comprises six files: bypass_filter.c, filter.c, lb.c, vlan_filter.c, xdp_filter.c, and xdp_lb.c. Each program is walked through below.

Filter (filter.c)

#define LINUX_VERSION_CODE 263682

struct bpf_map_def SEC("maps") ipv4_drop = {
    .type = BPF_MAP_TYPE_PERCPU_HASH,
    .key_size = sizeof(__u32),
    .value_size = sizeof(__u32),
    .max_entries = 32768,
};

// VLAN header, four bytes in total
struct vlan_hdr {
    // TCI: 3-bit priority, 1-bit CFI, 12-bit VLAN ID
    __u16   h_vlan_TCI;
    // encapsulated (upper-layer) protocol
    __u16   h_vlan_encapsulated_proto;
};

static __always_inline int ipv4_filter(struct __sk_buff *skb)
{
    __u32 nhoff;
    __u32 *value;
    __u32 ip = 0;

    // network-header offset stored by hashfilter()
    nhoff = skb->cb[0];

    // source IP
    ip = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
    // look the source IP up in the drop map
    value = bpf_map_lookup_elem(&ipv4_drop, &ip);
    if (value) {
        // hit: bump the counter and drop the packet (return 0)
#if DEBUG
        char fmt[] = "Found value for saddr: %u\n";
        bpf_trace_printk(fmt, sizeof(fmt), value);
#endif
        *value = *value + 1;
        return 0;
    }

    // destination IP
    ip = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
    // look the destination IP up in the drop map
    value = bpf_map_lookup_elem(&ipv4_drop, &ip);
    if (value) {
        // hit: bump the counter and drop the packet (return 0)
#if DEBUG
        char fmt[] = "Found value for daddr: %u\n";
        bpf_trace_printk(fmt, sizeof(fmt), value);
#endif
        *value = *value + 1;
        return 0;
    }

#if DEBUG
    char fmt[] = "Nothing so ok\n";
    bpf_trace_printk(fmt, sizeof(fmt));
#endif
    // no match: accept the packet
    return -1;
}

// not implemented yet
static __always_inline int ipv6_filter(struct __sk_buff *skb)
{
    return -1;
}

int SEC("filter") hashfilter(struct __sk_buff *skb)
{
    // start parsing right after the Ethernet header (14 bytes)
    __u32 nhoff = ETH_HLEN;

    // EtherType of the payload
    __u16 proto = load_half(skb, offsetof(struct ethhdr, h_proto));

    // VLAN or QinQ frame
    if (proto == ETH_P_8021AD || proto == ETH_P_8021Q) {
        // peel one VLAN layer to get the encapsulated protocol
        proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
                          h_vlan_encapsulated_proto));
        // advance past the VLAN header
        nhoff += sizeof(struct vlan_hdr);
    }

    // stash the network-header offset for the per-protocol filters
    skb->cb[0] = nhoff;
    switch (proto) {
        // run the IPv4 filter
        case ETH_P_IP:
            return ipv4_filter(skb);
        // run the IPv6 filter
        case ETH_P_IPV6:
            return ipv6_filter(skb);
        default:
            break;
    }
    return -1;
}

// license
char __license[] SEC("license") = "GPL";

// kernel version 4.6.2 (263682 == 0x040602)
__u32 __version SEC("version") = LINUX_VERSION_CODE;
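
User space has to seed ipv4_drop before this filter drops anything. Below is a minimal, hypothetical loader sketch using libbpf; the object path and the address are placeholders, and Suricata drives this map from its own bypass code rather than from a standalone tool. Two details follow from the filter above: keys must be in host byte order, because load_word() converts from network order, and since the map is a BPF_MAP_TYPE_PERCPU_HASH, user space must supply one value slot per possible CPU.

#include <arpa/inet.h>
#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <stdlib.h>

int main(void)
{
    /* placeholder path to the compiled eBPF object */
    struct bpf_object *obj = bpf_object__open_file("filter.bpf", NULL);
    if (!obj || bpf_object__load(obj))
        return 1;

    int map_fd = bpf_object__find_map_fd_by_name(obj, "ipv4_drop");
    if (map_fd < 0)
        return 1;

    /* host byte order, matching what load_word() produces in the filter */
    __u32 key = ntohl(inet_addr("192.0.2.1"));

    /* PERCPU_HASH: one zero-initialized counter per possible CPU */
    int ncpus = libbpf_num_possible_cpus();
    __u32 *values = calloc(ncpus, sizeof(__u32));
    if (!values || bpf_map_update_elem(map_fd, &key, values, BPF_ANY))
        return 1;

    free(values);
    bpf_object__close(obj);
    return 0;
}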

Bypass Filter (bypass_filter.c)

#define LINUX_VERSION_CODE 263682

// IPv4 flow key
struct flowv4_keys {
    __u32 src;
    __u32 dst;
    union {
        __u32 ports;
        __u16 port16[2];
    };
    __u8 ip_proto:1;
    __u16 vlan0:15;
    __u16 vlan1;
};

// IPv6 flow key
struct flowv6_keys {
    __u32 src[4];
    __u32 dst[4];
    union {
        __u32 ports;
        __u16 port16[2];
    };
    __u8 ip_proto:1;
    __u16 vlan0:15;
    __u16 vlan1;
};

// per-flow packet and byte counters
struct pair {
    __u64 packets;
    __u64 bytes;
};

struct bpf_map_def SEC("maps") flow_table_v4 = {
    .type = BPF_MAP_TYPE_PERCPU_HASH,
    .key_size = sizeof(struct flowv4_keys),
    .value_size = sizeof(struct pair),
    .max_entries = 32768,
};

struct bpf_map_def SEC("maps") flow_table_v6 = {
    .type = BPF_MAP_TYPE_PERCPU_HASH,
    .key_size = sizeof(struct flowv6_keys),
    .value_size = sizeof(struct pair),
    .max_entries = 32768,
};

struct vlan_hdr {
    __u16	h_vlan_TCI;
    __u16	h_vlan_encapsulated_proto;
};

/**
 * IPv4 filter
 *
 * \return 0 to drop packet out and -1 to accept it
 */
static __always_inline int ipv4_filter(struct __sk_buff *skb, __u16 vlan0, __u16 vlan1)
{
    __u32 nhoff, verlen;
    struct flowv4_keys tuple;
    struct pair *value;
    __u16 port;
    __u8 ip_proto;

    nhoff = skb->cb[0];

    // transport-layer protocol
    ip_proto = load_byte(skb, nhoff + offsetof(struct iphdr, protocol));
    /* only support TCP and UDP for now */
    switch (ip_proto) {
        // TCP
        case IPPROTO_TCP:
            tuple.ip_proto = 1;
            break;
        // UDP
        case IPPROTO_UDP:
            tuple.ip_proto = 0;
            break;
        default:
            return -1;
    }
    
    // build the lookup tuple (addresses, ports, protocol, VLANs)
    tuple.src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
    tuple.dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));

    verlen = load_byte(skb, nhoff + 0/*offsetof(struct iphdr, ihl)*/);
    // IHL counts 32-bit words: skip the variable-length IP header
    nhoff += (verlen & 0xF) << 2;
    tuple.ports = load_word(skb, nhoff);
    // swap the two halves so the port order matches the key layout
    // that user space writes into the table
    port = tuple.port16[1];
    tuple.port16[1] = tuple.port16[0];
    tuple.port16[0] = port;
    tuple.vlan0 = vlan0;
    tuple.vlan1 = vlan1;

#if 0
    if ((tuple.port16[0] == 22) || (tuple.port16[1] == 22))
    {
        __u16 sp = tuple.port16[0];
        //__u16 dp = tuple.port16[1];
        char fmt[] = "Parsed SSH flow: %u %d -> %u\n";
        bpf_trace_printk(fmt, sizeof(fmt), tuple.src, sp, tuple.dst);
    }
#endif
    /* look the flow up in the bypass table */
    value = bpf_map_lookup_elem(&flow_table_v4, &tuple);
    if (value) {
#if 0
        {
            __u16 sp = tuple.port16[0];
            //__u16 dp = tuple.port16[1];
            char bfmt[] = "Found flow: %u %d -> %u\n";
            bpf_trace_printk(bfmt, sizeof(bfmt), tuple.src, sp, tuple.dst);
        }
#endif
        // hit: update the per-flow packet and byte counters
        value->packets++;
        value->bytes += skb->len;
        return 0;
    }
    // no match: let Suricata see the packet
    return -1;
}

/**
 * IPv6 filter
 *
 * \return 0 to drop packet out and -1 to accept it
 */
static __always_inline int ipv6_filter(struct __sk_buff *skb, __u16 vlan0, __u16 vlan1)
{
    __u32 nhoff;
    __u8 nhdr;
    struct flowv6_keys tuple;
    struct pair *value;
    __u16 port;

    nhoff = skb->cb[0];

    /* get next header */
    nhdr = load_byte(skb, nhoff + offsetof(struct ipv6hdr, nexthdr));

    /* only support direct TCP and UDP for now */
    switch (nhdr) {
        case IPPROTO_TCP:
            tuple.ip_proto = 1;
            break;
        case IPPROTO_UDP:
            tuple.ip_proto = 0;
            break;
        default:
            return -1;
    }

    tuple.src[0] = load_word(skb, nhoff + offsetof(struct ipv6hdr, saddr));
    tuple.src[1] = load_word(skb, nhoff + offsetof(struct ipv6hdr, saddr) + 4);
    tuple.src[2] = load_word(skb, nhoff + offsetof(struct ipv6hdr, saddr) + 8);
    tuple.src[3] = load_word(skb, nhoff + offsetof(struct ipv6hdr, saddr) + 12);
    tuple.dst[0] = load_word(skb, nhoff + offsetof(struct ipv6hdr, daddr));
    tuple.dst[1] = load_word(skb, nhoff + offsetof(struct ipv6hdr, daddr) + 4);
    tuple.dst[2] = load_word(skb, nhoff + offsetof(struct ipv6hdr, daddr) + 8);
    tuple.dst[3] = load_word(skb, nhoff + offsetof(struct ipv6hdr, daddr) + 12);

    /* Parse TCP */
    tuple.ports = load_word(skb, nhoff + 40 /* IPV6_HEADER_LEN */);
    port = tuple.port16[1];
    tuple.port16[1] = tuple.port16[0];
    tuple.port16[0] = port;

    tuple.vlan0 = vlan0;
    tuple.vlan1 = vlan1;

    //char fmt[] = "Now Got IPv6 port %u and %u\n";
    //bpf_trace_printk(fmt, sizeof(fmt), tuple.port16[0], tuple.port16[1]);
    /* Test if src is in hash */
    value = bpf_map_lookup_elem(&flow_table_v6, &tuple);
    if (value) {
        //char fmt[] = "Got a match IPv6: %u and %u\n";
        //bpf_trace_printk(fmt, sizeof(fmt), tuple.port16[0], tuple.port16[1]);
        value->packets++;
        value->bytes += skb->len;
        return 0;
    }
    return -1;
}

/**
 * filter function
 *
 * It is loaded in kernel by Suricata that uses the section name specified
 * by the SEC call to find it in the Elf binary object and load it.
 *
 * \return 0 to drop packet out and -1 to accept it
 */
int SEC("filter") hashfilter(struct __sk_buff *skb) {
    __u32 nhoff = ETH_HLEN;

    __u16 proto = load_half(skb, offsetof(struct ethhdr, h_proto));
    // first (outer) VLAN ID, taken from the tag the kernel stripped
    __u16 vlan0 = skb->vlan_tci & 0x0fff;
    __u16 vlan1 = 0;

    // VLAN or QinQ frame
    if (proto == ETH_P_8021AD || proto == ETH_P_8021Q) {
        // encapsulated protocol
        proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
                          h_vlan_encapsulated_proto));
#if VLAN_TRACKING
        /* one vlan layer is stripped by OS so get vlan 1 at first pass */
        // second VLAN ID, still present in the packet data
        vlan1 = load_half(skb, nhoff + offsetof(struct vlan_hdr,
                          h_vlan_TCI)) & 0x0fff;
#endif
        nhoff += sizeof(struct vlan_hdr);
    }

    skb->cb[0] = nhoff;
    switch (proto) {
        case ETH_P_IP:
            // run the IPv4 filter
            return ipv4_filter(skb, vlan0, vlan1);
        case ETH_P_IPV6:
            // run the IPv6 filter
            return ipv6_filter(skb, vlan0, vlan1);
        default:
#if 0
            {
                char fmt[] = "Got proto %u\n";
                bpf_trace_printk(fmt, sizeof(fmt), proto);
                break;
            }
#else
            break;
#endif
    }
    return -1;
}

char __license[] SEC("license") = "GPL";

__u32 __version SEC("version") = LINUX_VERSION_CODE;
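
Suricata inserts the flows it wants bypassed into flow_table_v4/flow_table_v6 and periodically reads the counters back to decide when a bypassed flow has gone idle. The sketch below shows the reading side; it is a hypothetical helper, not Suricata's code. Because the maps are per-CPU hashes, one lookup returns a struct pair per possible CPU and the caller has to sum them; the caller is also assumed to have built a flowv4_keys key exactly the way the filter builds its tuple.

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/types.h>

/* same layout as the maps' value type above */
struct pair {
    __u64 packets;
    __u64 bytes;
};

/* key must point to a fully initialized struct flowv4_keys (see the
 * listing above), built exactly the way the filter builds its tuple */
static int flow_stats_v4(int map_fd, const void *key, struct pair *out)
{
    int ncpus = libbpf_num_possible_cpus();
    if (ncpus <= 0)
        return -1;

    struct pair values[ncpus];      /* one slot per possible CPU */

    if (bpf_map_lookup_elem(map_fd, key, values))
        return -1;                  /* flow not in the table */

    out->packets = 0;
    out->bytes = 0;
    for (int i = 0; i < ncpus; i++) {
        out->packets += values[i].packets;
        out->bytes += values[i].bytes;
    }
    return 0;
}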

Load Balancer (lb.c)

#define LINUX_VERSION_CODE 263682

#ifndef __section
# define __section(x)  __attribute__((section(x), used))
#endif

struct vlan_hdr {
    __u16 h_vlan_TCI;
    __u16 h_vlan_encapsulated_proto;
};

static __always_inline int ipv4_hash(struct __sk_buff *skb)
{
    __u32 nhoff;
    __u32 src, dst;

    nhoff = skb->cb[0];
    // hash over source and destination addresses
    src = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
    dst = load_word(skb, nhoff + offsetof(struct iphdr, daddr));

#if 0
    char fmt[] = "Got addr: %x -> %x at %d\n";
    bpf_trace_printk(fmt, sizeof(fmt), src, dst, nhoff);
    //char fmt2[] = "Got hash %u\n";
    //bpf_trace_printk(fmt2, sizeof(fmt2), src + dst);
#endif
    return src + dst;
}

static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
{
    __u64 w0 = load_word(ctx, off);
    __u64 w1 = load_word(ctx, off + 4);
    __u64 w2 = load_word(ctx, off + 8);
    __u64 w3 = load_word(ctx, off + 12);

    return (__u32)(w0 ^ w1 ^ w2 ^ w3);
}

static __always_inline int ipv6_hash(struct __sk_buff *skb)
{
    __u32 nhoff;
    __u32 src_hash, dst_hash;

    // hash over source and destination addresses
    nhoff = skb->cb[0];
    src_hash = ipv6_addr_hash(skb,
                              nhoff + offsetof(struct ipv6hdr, saddr));
    dst_hash = ipv6_addr_hash(skb,
                              nhoff + offsetof(struct ipv6hdr, daddr));

    return src_hash + dst_hash;
}

int  __section("loadbalancer") lb(struct __sk_buff *skb) {
    __u64 nhoff = ETH_HLEN;
    // EtherType: ETH_HLEN - ETH_TLEN is the offset of h_proto (12)
    __u16 proto = load_half(skb, ETH_HLEN - ETH_TLEN);
    __u16 ret = proto;
    switch (proto) {
        // handle VLAN and QinQ
        case ETH_P_8021Q:
        case ETH_P_8021AD:
            {
                // check whether a second VLAN layer follows (QinQ)
                __u16 vproto = load_half(skb, nhoff +  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
                switch(vproto) {
                    case ETH_P_8021AD:
                    case ETH_P_8021Q:
                        nhoff += sizeof(struct vlan_hdr);
                        proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
                        break;
                    default:
                        proto = vproto;
                }

                nhoff += sizeof(struct vlan_hdr);
                skb->cb[0] = nhoff;

                switch (proto) {
                    // IPv4 hash
                    case ETH_P_IP:
#if 0
                        { char fmt[] = "ipv4\n"; bpf_trace_printk(fmt, sizeof(fmt));}
#endif
                        ret = ipv4_hash(skb);
                        break;
                    // IPv6 hash
                    case ETH_P_IPV6:
                        ret = ipv6_hash(skb);
                        break;
                    default:
#if 0
                        {
                            char fmt[] = "Dflt VLAN proto %u\n";
                            bpf_trace_printk(fmt, sizeof(fmt), proto);
                            break;
                        }
#else
                        break;
#endif
                }
            }
            break;
        case ETH_P_IP:
            // IPv4 hash
            ret = ipv4_hash(skb);
            break;
        case ETH_P_IPV6:
            // IPv6 hash
            ret = ipv6_hash(skb);
            break;
        default:
#if 0
            {
                char fmt[] = "Got proto %x\n";
                bpf_trace_printk(fmt, sizeof(fmt), proto);
                break;
            }
#else
            break;
#endif
    }
    return ret;
}

char __license[] __section("license") = "GPL";

/* libbpf needs the version section to check that the eBPF code and the
 * kernel are in sync, but a socket filter doesn't need it */
__u32 __version __section("version") = LINUX_VERSION_CODE;
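
Unlike the other programs, lb is not attached with tc or XDP: its return value is a flow hash, and the kernel's AF_PACKET fanout code takes that value modulo the number of sockets in the fanout group to pick the receiving socket, which is how Suricata keeps both directions of a flow on the same worker thread. A hypothetical attach sketch follows, assuming the program has already been loaded as BPF_PROG_TYPE_SOCKET_FILTER and prog_fd is its descriptor:

#include <arpa/inet.h>
#include <linux/if_ether.h>
#include <linux/if_packet.h>
#include <sys/socket.h>

static int attach_lb(int prog_fd, int group_id)
{
    int fd = socket(AF_PACKET, SOCK_RAW, htons(ETH_P_ALL));
    if (fd < 0)
        return -1;

    /* join a fanout group whose demux policy is an eBPF program;
     * bind(fd, ...) to the capture interface is omitted for brevity */
    int fanout = group_id | (PACKET_FANOUT_EBPF << 16);
    if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT, &fanout, sizeof(fanout)) < 0)
        return -1;

    /* hand the loaded socket-filter program to the fanout group */
    if (setsockopt(fd, SOL_PACKET, PACKET_FANOUT_DATA, &prog_fd,
                   sizeof(prog_fd)) < 0)
        return -1;

    return fd;
}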

VLAN Filter (vlan_filter.c)

#define LINUX_VERSION_CODE 263682

int SEC("filter") hashfilter(struct __sk_buff *skb) {
    __u16 vlan_id = skb->vlan_tci & 0x0fff;
    /* accept VLAN 2 and 4 and drop the rest */
    switch (vlan_id) {
        case 2:
        case 4:
            return -1;
        default:
            return 0;
    }
    return 0;
}

char __license[] SEC("license") = "GPL";

__u32 __version SEC("version") = LINUX_VERSION_CODE;

XDP Filter (xdp_filter.c)

#define LINUX_VERSION_CODE 263682

/* Hashing initval */
#define INITVAL 15485863

/* Set BUILD_CPUMAP to 0 if you want to run XDP bypass on kernels
 * older than 4.15 */
#define BUILD_CPUMAP        1
/* Increase CPUMAP_MAX_CPUS if ever you have more than 64 CPUs */
#define CPUMAP_MAX_CPUS     64

/* Set to 1 to bypass encrypted packets of TLS sessions. Suricata will
 * be blind to these packets or forged packets looking alike. */
#define ENCRYPTED_TLS_BYPASS    0

/* Set it to 0 if, for example, you plan to use the XDP filter on a
 * network card that doesn't support per-CPU values (like Netronome) */
#define USE_PERCPU_HASH     1
/* Set it to 0 if your XDP subsystem doesn't handle XDP_REDIRECT (like Netronome) */
#define GOT_TX_PEER         1

/* Set to non-zero to load balance in hardware mode over RSS_QUEUE_NUMBERS
 * queues, and unset BUILD_CPUMAP (the number must be a power of 2 for
 * Netronome) */
#define RSS_QUEUE_NUMBERS   32

/* VLAN tracking: set it to 0 if you don't use VLANs for flow tracking.
 * Can also be used as a workaround for some hardware offload issues */
#define VLAN_TRACKING    1

struct vlan_hdr {
    __u16	h_vlan_TCI;
    __u16	h_vlan_encapsulated_proto;
};

struct flowv4_keys {
    __u32 src;
    __u32 dst;
    union {
        __u32 ports;
        __u16 port16[2];
    };
    __u8 ip_proto:1;
    __u16 vlan0:15;
    __u16 vlan1;
};

struct flowv6_keys {
    __u32 src[4];
    __u32 dst[4];
    union {
        __u32 ports;
        __u16 port16[2];
    };
    __u8 ip_proto:1;
    __u16 vlan0:15;
    __u16 vlan1;
};

struct pair {
    __u64 packets;
    __u64 bytes;
};

struct bpf_map_def SEC("maps") flow_table_v4 = {
#if USE_PERCPU_HASH
    .type = BPF_MAP_TYPE_PERCPU_HASH,
#else
    .type = BPF_MAP_TYPE_HASH,
#endif
    .key_size = sizeof(struct flowv4_keys),
    .value_size = sizeof(struct pair),
    .max_entries = 32768,
};

struct bpf_map_def SEC("maps") flow_table_v6 = {
#if USE_PERCPU_HASH
    .type = BPF_MAP_TYPE_PERCPU_HASH,
#else
    .type = BPF_MAP_TYPE_HASH,
#endif
    .key_size = sizeof(struct flowv6_keys),
    .value_size = sizeof(struct pair),
    .max_entries = 32768,
};


#if ENCRYPTED_TLS_BYPASS
struct bpf_map_def SEC("maps") tls_bypass_count = {
#if USE_PERCPU_HASH
    .type		= BPF_MAP_TYPE_PERCPU_ARRAY,
#else
    .type		= BPF_MAP_TYPE_ARRAY,
#endif
    .key_size	= sizeof(__u32),
    .value_size	= sizeof(__u64),
    .max_entries	= 1,
};
#endif

#if BUILD_CPUMAP
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
    .type		= BPF_MAP_TYPE_CPUMAP,
    .key_size	= sizeof(__u32),
    .value_size	= sizeof(__u32),
    .max_entries	= CPUMAP_MAX_CPUS,
};

struct bpf_map_def SEC("maps") cpus_available = {
    .type		= BPF_MAP_TYPE_ARRAY,
    .key_size	= sizeof(__u32),
    .value_size	= sizeof(__u32),
    .max_entries	= CPUMAP_MAX_CPUS,
};

struct bpf_map_def SEC("maps") cpus_count = {
    .type		= BPF_MAP_TYPE_ARRAY,
    .key_size	= sizeof(__u32),
    .value_size	= sizeof(__u32),
    .max_entries	= 1,
};
#endif

#if GOT_TX_PEER
/* Map has only one element as we don't handle any sort of
 * routing for now. Key value set by user space is 0 and
 * value is the peer interface. */
struct bpf_map_def SEC("maps") tx_peer = {
    .type = BPF_MAP_TYPE_DEVMAP,
    .key_size = sizeof(int),
    .value_size = sizeof(int),
    .max_entries = 1,
};

/* single entry to indicate if we have peer, key value
 * set in user space is 0. It is only used to see if
 * a interface has a peer we need to send the information to */
struct bpf_map_def SEC("maps") tx_peer_int = {
    .type = BPF_MAP_TYPE_ARRAY,
    .key_size = sizeof(int),
    .value_size = sizeof(int),
    .max_entries = 1,
};
#endif

#define USE_GLOBAL_BYPASS   0
#if USE_GLOBAL_BYPASS
/* single entry to indicate if global bypass switch is on */
struct bpf_map_def SEC("maps") global_bypass = {
    .type = BPF_MAP_TYPE_ARRAY,
    .key_size = sizeof(char),
    .value_size = sizeof(char),
    .max_entries = 1,
};
#endif

// fetch the TCP/UDP source port (left in network byte order)
static __always_inline int get_sport(void *trans_data, void *data_end,
        __u8 protocol)
{
    struct tcphdr *th;
    struct udphdr *uh;

    switch (protocol) {
        case IPPROTO_TCP:
            th = (struct tcphdr *)trans_data;
            if ((void *)(th + 1) > data_end)
                return -1;
            return th->source;
        case IPPROTO_UDP:
            uh = (struct udphdr *)trans_data;
            if ((void *)(uh + 1) > data_end)
                return -1;
            return uh->source;
        default:
            return 0;
    }
}

// fetch the TCP/UDP destination port (left in network byte order)
static __always_inline int get_dport(void *trans_data, void *data_end,
        __u8 protocol)
{
    struct tcphdr *th;
    struct udphdr *uh;

    switch (protocol) {
        case IPPROTO_TCP:
            th = (struct tcphdr *)trans_data;
            if ((void *)(th + 1) > data_end)
                return -1;
            return th->dest;
        case IPPROTO_UDP:
            uh = (struct udphdr *)trans_data;
            if ((void *)(uh + 1) > data_end)
                return -1;
            return uh->dest;
        default:
            return 0;
    }
}
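
/* Note: get_sport()/get_dport() return th->source and th->dest untouched,
 * so the flow-table keys built below carry ports in network byte order,
 * unlike the socket-filter variant above where load_word() converts to
 * host order. User space must build its lookup keys the same way. */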

static int __always_inline filter_ipv4(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end, __u16 vlan0, __u16 vlan1)
{
    struct iphdr *iph = data + nh_off;
    int dport;
    int sport;
    struct flowv4_keys tuple;
    struct pair *value;
#if BUILD_CPUMAP || GOT_TX_PEER
    __u32 key0 = 0;
#endif
#if ENCRYPTED_TLS_BYPASS
    __u32 key1 = 0;
    __u32 *tls_count = NULL;
#endif
#if BUILD_CPUMAP
    __u32 cpu_dest;
    __u32 *cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
    __u32 *cpu_selected;
    __u32 cpu_hash;
#endif
#if GOT_TX_PEER
    int *iface_peer;
    int tx_port = 0;
#endif

    if ((void *)(iph + 1) > data_end)
        return XDP_PASS;

    // build the lookup tuple
    if (iph->protocol == IPPROTO_TCP) {
        tuple.ip_proto = 1;
    } else {
        tuple.ip_proto = 0;
    }
    tuple.src = iph->saddr;
    tuple.dst = iph->daddr;

    dport = get_dport(iph + 1, data_end, iph->protocol);
    if (dport == -1)
        return XDP_PASS;

    sport = get_sport(iph + 1, data_end, iph->protocol);
    if (sport == -1)
        return XDP_PASS;

    tuple.port16[0] = (__u16)sport;
    tuple.port16[1] = (__u16)dport;

    tuple.vlan0 = vlan0;
    tuple.vlan1 = vlan1;

    // flow-table lookup
    value = bpf_map_lookup_elem(&flow_table_v4, &tuple);
#if 0
    {
        char fmt[] = "Current flow src: %u:%d\n";
        char fmt1[] = "Current flow dst: %u:%d\n";
        bpf_trace_printk(fmt, sizeof(fmt), tuple.src, tuple.port16[0]);
        bpf_trace_printk(fmt1, sizeof(fmt1), tuple.dst, tuple.port16[1]);
    }
#endif
    if (value) {
#if 0
        char fmt[] = "Found flow v4: %u %d -> %d\n";
        bpf_trace_printk(fmt, sizeof(fmt), tuple.src, sport, dport);
        char fmt[] = "Data: t:%lu p:%lu n:%lu\n";
        bpf_trace_printk(fmt, sizeof(fmt), value->time, value->packets, value->bytes);
#endif
#if USE_PERCPU_HASH
        // hit: update the counters (per-CPU map, no atomics needed)
        value->packets++;
        value->bytes += data_end - data;
#else
        // hit: update the counters atomically (map shared across CPUs)
        __sync_fetch_and_add(&value->packets, 1);
        __sync_fetch_and_add(&value->bytes, data_end - data);
#endif

#if GOT_TX_PEER
        iface_peer = bpf_map_lookup_elem(&tx_peer_int, &key0);
        if (!iface_peer) {
            return XDP_DROP;
        } else {
            return bpf_redirect_map(&tx_peer, tx_port, 0);
        }
#else
        // no peer configured: just drop the bypassed packet
        return XDP_DROP;
#endif
    }

#if ENCRYPTED_TLS_BYPASS
    if ((dport == __constant_ntohs(443)) || (sport == __constant_ntohs(443))) {
        __u8 *app_data;
        /* drop application data for tls 1.2 */
        /* FIXME better parsing */
        nh_off += sizeof(struct iphdr) + sizeof(struct tcphdr);
        if (data_end > data + nh_off + 4) {
            app_data = data + nh_off;
            // TLS application-data record: type 0x17, version 0x0303 (TLS 1.2)
            if (app_data[0] == 0x17 && app_data[1] == 0x3 && app_data[2] == 0x3) {
                // look up and bump the TLS-bypass counter
                tls_count = bpf_map_lookup_elem(&tls_bypass_count, &key1);
                if (tls_count) {
#if USE_PERCPU_HASH
                    (*tls_count)++;
#else
                    __sync_fetch_and_add(tls_count, 1);
#endif
                }
#if GOT_TX_PEER
                iface_peer = bpf_map_lookup_elem(&tx_peer_int, &key0);
                if (!iface_peer) {
                    return XDP_DROP;
                } else {
                    return bpf_redirect_map(&tx_peer, tx_port, 0);
                }
#else
                // drop
                return XDP_DROP;
#endif
            }
        }
    }
#endif

#if BUILD_CPUMAP
    /* IP-pairs + protocol (UDP/TCP/ICMP) hit same CPU */
    cpu_hash = tuple.src + tuple.dst;
    cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol);

    if (cpu_max && *cpu_max) {
        cpu_dest = cpu_hash % *cpu_max;
        cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_dest);
        if (!cpu_selected)
            return XDP_ABORTED;
        cpu_dest = *cpu_selected;
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
    } else {
        return XDP_PASS;
    }
#else
#if RSS_QUEUE_NUMBERS
    /* IP-pairs + protocol (UDP/TCP/ICMP) hit same CPU */
    __u32 xdp_hash = tuple.src + tuple.dst;
    xdp_hash = SuperFastHash((char *)&xdp_hash, 4, INITVAL + iph->protocol);
    ctx->rx_queue_index = xdp_hash % RSS_QUEUE_NUMBERS;
#endif
    return XDP_PASS;
#endif
}

static int __always_inline filter_ipv6(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end, __u16 vlan0, __u16 vlan1)
{
    struct ipv6hdr *ip6h = data + nh_off;
    int dport;
    int sport;
    struct flowv6_keys tuple;
    struct pair *value;
#if BUILD_CPUMAP || GOT_TX_PEER
    __u32 key0 = 0;
#endif
#if BUILD_CPUMAP
    __u32 cpu_dest;
    int *cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
    __u32 *cpu_selected;
    __u32 cpu_hash;
#endif
#if GOT_TX_PEER
    int tx_port = 0;
    int *iface_peer;
#endif

    if ((void *)(ip6h + 1) > data_end)
        return XDP_PASS;
    if (!((ip6h->nexthdr == IPPROTO_UDP) || (ip6h->nexthdr == IPPROTO_TCP)))
        return XDP_PASS;

    dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
    if (dport == -1)
        return XDP_PASS;

    sport = get_sport(ip6h + 1, data_end, ip6h->nexthdr);
    if (sport == -1)
        return XDP_PASS;

    if (ip6h->nexthdr == IPPROTO_TCP) {
        tuple.ip_proto = 1;
    } else {
        tuple.ip_proto = 0;
    }
    __builtin_memcpy(tuple.src, ip6h->saddr.s6_addr32, sizeof(tuple.src));
    __builtin_memcpy(tuple.dst, ip6h->daddr.s6_addr32, sizeof(tuple.dst));
    tuple.port16[0] = sport;
    tuple.port16[1] = dport;

    tuple.vlan0 = vlan0;
    tuple.vlan1 = vlan1;

    value = bpf_map_lookup_elem(&flow_table_v6, &tuple);
    if (value) {
#if 0
        char fmt6[] = "Found IPv6 flow: %d -> %d\n";
        bpf_trace_printk(fmt6, sizeof(fmt6), sport, dport);
#endif
#if USE_PERCPU_HASH
        value->packets++;
        value->bytes += data_end - data;
#else
        __sync_fetch_and_add(&value->packets, 1);
        __sync_fetch_and_add(&value->bytes, data_end - data);
#endif

#if GOT_TX_PEER
        iface_peer = bpf_map_lookup_elem(&tx_peer_int, &key0);
        if (!iface_peer) {
            return XDP_DROP;
        } else {
            return bpf_redirect_map(&tx_peer, tx_port, 0);
        }
#else
        return XDP_DROP;
#endif
    }

#if BUILD_CPUMAP
    /* IP-pairs + protocol (UDP/TCP/ICMP) hit same CPU */
    cpu_hash  = tuple.src[0] + tuple.dst[0];
    cpu_hash += tuple.src[1] + tuple.dst[1];
    cpu_hash += tuple.src[2] + tuple.dst[2];
    cpu_hash += tuple.src[3] + tuple.dst[3];
    cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL);

    if (cpu_max && *cpu_max) {
        cpu_dest = cpu_hash % *cpu_max;
        cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_dest);
        if (!cpu_selected)
            return XDP_ABORTED;
        cpu_dest = *cpu_selected;
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
    } else {
        return XDP_PASS;
    }
#else
#if RSS_QUEUE_NUMBERS
    /* IP-pairs + protocol (UDP/TCP/ICMP) hit same CPU */
    __u32 xdp_hash  = tuple.src[0] + tuple.dst[0];
    xdp_hash += tuple.src[1] + tuple.dst[1];
    xdp_hash += tuple.src[2] + tuple.dst[2];
    xdp_hash += tuple.src[3] + tuple.dst[3];
    xdp_hash = SuperFastHash((char *)&xdp_hash, 4, INITVAL);
    ctx->rx_queue_index = xdp_hash % RSS_QUEUE_NUMBERS;
#endif

    return XDP_PASS;
#endif
}

int SEC("xdp") xdp_hashfilter(struct xdp_md *ctx)
{
    void *data_end = (void *)(long)ctx->data_end;
    void *data = (void *)(long)ctx->data;
    struct ethhdr *eth = data;
    __u16 h_proto;
    __u64 nh_off;
    __u16 vlan0 = 0;
    __u16 vlan1 = 0;
#if USE_GLOBAL_BYPASS
    int *iface_peer;
    char *g_switch = 0;
    char key0 = 0;
    int tx_port = 0;

    g_switch = bpf_map_lookup_elem(&global_bypass, &key0);
    if (g_switch && *g_switch) {
        iface_peer = bpf_map_lookup_elem(&tx_peer_int, &key0);
        if (!iface_peer) {
            return XDP_DROP;
        } else {
            return bpf_redirect_map(&tx_peer, tx_port, 0);
        }
    }
#endif

    nh_off = sizeof(*eth);
    if (data + nh_off > data_end)
        return XDP_PASS;

    h_proto = eth->h_proto;

    // VLAN and QinQ handling: peel up to two tags
    if (h_proto == __constant_htons(ETH_P_8021Q) || h_proto == __constant_htons(ETH_P_8021AD)) {
        struct vlan_hdr *vhdr;

        vhdr = data + nh_off;
        nh_off += sizeof(struct vlan_hdr);
        if (data + nh_off > data_end)
            return XDP_PASS;
        h_proto = vhdr->h_vlan_encapsulated_proto;
#if VLAN_TRACKING
        vlan0 = vhdr->h_vlan_TCI & 0x0fff;
#else
        vlan0 = 0;
#endif
    }
    if (h_proto == __constant_htons(ETH_P_8021Q) || h_proto == __constant_htons(ETH_P_8021AD)) {
        struct vlan_hdr *vhdr;

        vhdr = data + nh_off;
        nh_off += sizeof(struct vlan_hdr);
        if (data + nh_off > data_end)
            return XDP_PASS;
        h_proto = vhdr->h_vlan_encapsulated_proto;
#if VLAN_TRACKING
        vlan1 = vhdr->h_vlan_TCI & 0x0fff;
#else
        vlan1 = 0;
#endif
    }

    // run the IPv4 or IPv6 filter
    if (h_proto == __constant_htons(ETH_P_IP))
        return filter_ipv4(ctx, data, nh_off, data_end, vlan0, vlan1);
    else if (h_proto == __constant_htons(ETH_P_IPV6))
        return filter_ipv6(ctx, data, nh_off, data_end, vlan0, vlan1);

    return XDP_PASS;
}

char __license[] SEC("license") = "GPL";

__u32 __version SEC("version") = LINUX_VERSION_CODE;
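
For the GOT_TX_PEER path above to forward instead of drop, user space must fill the two single-entry maps: tx_peer maps key 0 to the peer's ifindex, and tx_peer_int merely signals that a peer exists. A hypothetical setup sketch follows, assuming libbpf; the helper and interface names are invented:

#include <bpf/bpf.h>
#include <net/if.h>

static int setup_tx_peer(int tx_peer_fd, int tx_peer_int_fd, const char *peer)
{
    int key0 = 0;
    int peer_ifindex = if_nametoindex(peer);   /* e.g. "eth1" */
    if (!peer_ifindex)
        return -1;

    /* key 0 -> peer interface, as the map comments above describe */
    if (bpf_map_update_elem(tx_peer_fd, &key0, &peer_ifindex, BPF_ANY))
        return -1;
    /* mark that a peer exists; the filter only tests for presence */
    if (bpf_map_update_elem(tx_peer_int_fd, &key0, &peer_ifindex, BPF_ANY))
        return -1;
    return 0;
}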

XDP Load Balancer (xdp_lb.c)

#define LINUX_VERSION_CODE 263682

/* Hashing initval */
#define INITVAL 15485863

/* Increase CPUMAP_MAX_CPUS if ever you have more than 128 CPUs */
#define CPUMAP_MAX_CPUS 128

struct vlan_hdr {
    __u16	h_vlan_TCI;
    __u16	h_vlan_encapsulated_proto;
};

/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
    .type		= BPF_MAP_TYPE_CPUMAP,
    .key_size	= sizeof(__u32),
    .value_size	= sizeof(__u32),
    .max_entries	= CPUMAP_MAX_CPUS,
};

struct bpf_map_def SEC("maps") cpus_available = {
    .type		= BPF_MAP_TYPE_ARRAY,
    .key_size	= sizeof(__u32),
    .value_size	= sizeof(__u32),
    .max_entries	= CPUMAP_MAX_CPUS,
};

struct bpf_map_def SEC("maps") cpus_count = {
    .type		= BPF_MAP_TYPE_ARRAY,
    .key_size	= sizeof(__u32),
    .value_size	= sizeof(__u32),
    .max_entries	= 1,
};

static int __always_inline hash_ipv4(void *data, void *data_end)
{
    struct iphdr *iph = data;
    if ((void *)(iph + 1) > data_end)
        return XDP_PASS;

    __u32 key0 = 0;
    __u32 cpu_dest;
    __u32 *cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
    __u32 *cpu_selected;
    __u32 cpu_hash;

    /* IP-pairs hit same CPU */
    cpu_hash = iph->saddr + iph->daddr;
    cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL);

    if (cpu_max && *cpu_max) {
        cpu_dest = cpu_hash % *cpu_max;
        cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_dest);
        if (!cpu_selected)
            return XDP_ABORTED;
        cpu_dest = *cpu_selected;
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
    } else {
        return XDP_PASS;
    }
}

static int __always_inline hash_ipv6(void *data, void *data_end)
{
    struct ipv6hdr *ip6h = data;
    if ((void *)(ip6h + 1) > data_end)
        return XDP_PASS;

    __u32 key0 = 0;
    __u32 cpu_dest;
    __u32 *cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
    __u32 *cpu_selected;
    __u32 cpu_hash;

    /* IP-pairs hit same CPU */
    cpu_hash  = ip6h->saddr.s6_addr32[0] + ip6h->daddr.s6_addr32[0];
    cpu_hash += ip6h->saddr.s6_addr32[1] + ip6h->daddr.s6_addr32[1];
    cpu_hash += ip6h->saddr.s6_addr32[2] + ip6h->daddr.s6_addr32[2];
    cpu_hash += ip6h->saddr.s6_addr32[3] + ip6h->daddr.s6_addr32[3];
    cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL);

    if (cpu_max && *cpu_max) {
        cpu_dest = cpu_hash % *cpu_max;
        cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_dest);
        if (!cpu_selected)
            return XDP_ABORTED;
        cpu_dest = *cpu_selected;
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
    } else {
        return XDP_PASS;
    }
}

static int __always_inline filter_gre(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end)
{
    struct iphdr *iph = data + nh_off;
    __u16 proto;
    struct gre_hdr {
        __be16 flags;
        __be16 proto;
    };

    nh_off += sizeof(struct iphdr);
    struct gre_hdr *grhdr = (struct gre_hdr *)(iph + 1);

    if ((void *)(grhdr + 1) > data_end)
        return XDP_PASS;

    if (grhdr->flags & (GRE_VERSION|GRE_ROUTING))
        return XDP_PASS;

    nh_off += 4;
    proto = grhdr->proto;
    if (grhdr->flags & GRE_CSUM)
        nh_off += 4;
    if (grhdr->flags & GRE_KEY)
        nh_off += 4;
    if (grhdr->flags & GRE_SEQ)
        nh_off += 4;

    /* Update offset to skip the ERSPAN header if there is one */
    if (proto == __constant_htons(ETH_P_ERSPAN)) {
        nh_off += 8;
    }

    if (data + nh_off > data_end)
        return XDP_PASS;
    if (bpf_xdp_adjust_head(ctx, 0 + nh_off))
        return XDP_PASS;

    data = (void *)(long)ctx->data;
    data_end = (void *)(long)ctx->data_end;

    /* we have now data starting at Ethernet header */
    struct ethhdr *eth = data;
    proto = eth->h_proto;
    /* we want to hash on IP so we need to get to ip hdr */
    nh_off = sizeof(*eth);

    if (data + nh_off > data_end)
        return XDP_PASS;

    /* we need to increase offset and update protocol
     * in the case we have VLANs */
    if (proto == __constant_htons(ETH_P_8021Q)) {
        struct vlan_hdr *vhdr = (struct vlan_hdr *)(data + nh_off);
        if ((void *)(vhdr + 1) > data_end)
            return XDP_PASS;
        proto = vhdr->h_vlan_encapsulated_proto;
        nh_off += sizeof(struct vlan_hdr);
    }

    if (data + nh_off > data_end)
        return XDP_PASS;
    /* proto should now be IP style */
    if (proto == __constant_htons(ETH_P_IP)) {
        return hash_ipv4(data + nh_off, data_end);
    } else if (proto == __constant_htons(ETH_P_IPV6)) {
        return hash_ipv6(data + nh_off, data_end);
    } else
        return XDP_PASS;
}

static int __always_inline filter_ipv4(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end)
{
    struct iphdr *iph = data + nh_off;
    if ((void *)(iph + 1) > data_end)
        return XDP_PASS;

    // GRE tunnel: decapsulate and hash on the inner packet
    if (iph->protocol == IPPROTO_GRE) {
        return filter_gre(ctx, data, nh_off, data_end);
    }
    // IPv4 hash
    return hash_ipv4(data + nh_off, data_end);
}

static int __always_inline filter_ipv6(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end)
{
    struct ipv6hdr *ip6h = data + nh_off;
    return hash_ipv6((void *)ip6h, data_end);
}

int SEC("xdp") xdp_loadfilter(struct xdp_md *ctx)
{
    void *data_end = (void *)(long)ctx->data_end;
    void *data = (void *)(long)ctx->data;
    struct ethhdr *eth = data;
    __u16 h_proto;
    __u64 nh_off;

    nh_off = sizeof(*eth);
    if (data + nh_off > data_end)
        return XDP_PASS;

    h_proto = eth->h_proto;

#if 0
    if (h_proto != __constant_htons(ETH_P_IP)) {
        char fmt[] = "Current proto: %u\n";
        bpf_trace_printk(fmt, sizeof(fmt), h_proto);
    }
#endif
    // peel up to two VLAN tags (QinQ)
    if (h_proto == __constant_htons(ETH_P_8021Q) || h_proto == __constant_htons(ETH_P_8021AD)) {
        struct vlan_hdr *vhdr;

        vhdr = data + nh_off;
        nh_off += sizeof(struct vlan_hdr);
        if (data + nh_off > data_end)
            return XDP_PASS;
        h_proto = vhdr->h_vlan_encapsulated_proto;
    }
    if (h_proto == __constant_htons(ETH_P_8021Q) || h_proto == __constant_htons(ETH_P_8021AD)) {
        struct vlan_hdr *vhdr;

        vhdr = data + nh_off;
        nh_off += sizeof(struct vlan_hdr);
        if (data + nh_off > data_end)
            return XDP_PASS;
        h_proto = vhdr->h_vlan_encapsulated_proto;
    }

    if (h_proto == __constant_htons(ETH_P_IP))
        return filter_ipv4(ctx, data, nh_off, data_end);
    else if (h_proto == __constant_htons(ETH_P_IPV6))
        return filter_ipv6(ctx, data, nh_off, data_end);

    return XDP_PASS;
}

char __license[] SEC("license") = "GPL";

__u32 __version SEC("version") = LINUX_VERSION_CODE;
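
The load balancer can only redirect to CPUs that user space has published: each cpu_map entry has to be created with a ring size, cpus_available is a dense 0..n-1 array of CPU ids, and cpus_count holds n. Below is a hypothetical setup-and-attach sketch assuming a recent libbpf (bpf_xdp_attach); the queue size of 192 and the helper name are arbitrary choices:

#include <bpf/bpf.h>
#include <bpf/libbpf.h>
#include <linux/if_link.h>
#include <linux/types.h>

static int setup_and_attach(int ifindex, int prog_fd, int cpu_map_fd,
                            int cpus_available_fd, int cpus_count_fd,
                            const __u32 *cpus, __u32 n)
{
    __u32 qsize = 192;   /* ring size of the per-CPU cpumap queue */

    for (__u32 i = 0; i < n; i++) {
        /* creating a cpu_map entry spawns the kthread that reinjects
         * frames redirected to that CPU */
        if (bpf_map_update_elem(cpu_map_fd, &cpus[i], &qsize, BPF_ANY))
            return -1;
        /* dense index -> CPU id, as the modulo in hash_ipv4/6 expects */
        if (bpf_map_update_elem(cpus_available_fd, &i, &cpus[i], BPF_ANY))
            return -1;
    }
    __u32 key0 = 0;
    if (bpf_map_update_elem(cpus_count_fd, &key0, &n, BPF_ANY))
        return -1;

    /* driver-mode attach; XDP_FLAGS_SKB_MODE is the generic fallback */
    return bpf_xdp_attach(ifindex, prog_fd, XDP_FLAGS_DRV_MODE, NULL);
}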