Suricata中的eBPF解析
Suricata 中的 eBPF 代码位于 ebpf 目录中,包括 bypass_filter.c、filter.c、lb.c、vlan_filter.c、xdp_filter.c、xdp_lb.c 文件。
Filter
// Kernel version code stored in the "version" section below;
// 263682 == (4 << 16) + (6 << 8) + 2, i.e. 4.6.2.
#define LINUX_VERSION_CODE 263682
// Hash map of IPv4 addresses to drop. The key is a 32-bit address and the
// value is a 32-bit counter that ipv4_filter() increments on every match.
struct bpf_map_def SEC("maps") ipv4_drop = {
.type = BPF_MAP_TYPE_PERCPU_HASH,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = 32768,
};
// VLAN tag header, four bytes in total.
struct vlan_hdr {
// TCI: 3-bit priority, 1-bit CFI, 12-bit VLAN id.
__u16 h_vlan_TCI;
// EtherType of the encapsulated (upper-layer) protocol.
__u16 h_vlan_encapsulated_proto;
};
/**
 * IPv4 drop filter.
 *
 * Reads the network-header offset stored in skb->cb[0], then looks up the
 * source address and, if that misses, the destination address in the
 * ipv4_drop map. On a hit the matching counter is incremented.
 *
 * \param skb packet being filtered; cb[0] must hold the IP header offset
 * \return 0 when either address is present in ipv4_drop (drop),
 *         -1 when neither is (accept)
 */
static __always_inline int ipv4_filter(struct __sk_buff *skb)
{
    __u32 nhoff;
    __u32 *value;
    __u32 ip = 0;

    /* offset of the IP header, stored by the caller */
    nhoff = skb->cb[0];

    /* source address first */
    ip = load_word(skb, nhoff + offsetof(struct iphdr, saddr));
    value = bpf_map_lookup_elem(&ipv4_drop, &ip);
    if (value) {
        /* found: drop the packet and update the counter */
#if DEBUG
        char fmt[] = "Found value for saddr: %u\n";
        /* print the counter itself, not the pointer returned by the lookup */
        bpf_trace_printk(fmt, sizeof(fmt), *value);
#endif
        *value = *value + 1;
        return 0;
    }

    /* then the destination address */
    ip = load_word(skb, nhoff + offsetof(struct iphdr, daddr));
    value = bpf_map_lookup_elem(&ipv4_drop, &ip);
    if (value) {
        /* found: drop the packet and update the counter */
#if DEBUG
        char fmt[] = "Found value for daddr: %u\n";
        /* print the counter itself, not the pointer returned by the lookup */
        bpf_trace_printk(fmt, sizeof(fmt), *value);
#endif
        *value = *value + 1;
        return 0;
    }

#if DEBUG
    char fmt[] = "Nothing so ok\n";
    bpf_trace_printk(fmt, sizeof(fmt));
#endif
    return -1;
}
/* IPv6 filtering is not implemented yet: every packet gets the -1 verdict. */
static __always_inline int ipv6_filter(struct __sk_buff *skb)
{
    const int verdict = -1;

    return verdict;
}
/**
 * Socket filter entry point.
 *
 * Reads the EtherType, skips one VLAN/QinQ tag if present, stores the
 * resulting network-header offset in skb->cb[0] and hands the packet to the
 * IPv4 or IPv6 filter.
 *
 * \return the verdict of ipv4_filter()/ipv6_filter(), or -1 for any other
 *         protocol
 */
int SEC("filter") hashfilter(struct __sk_buff *skb)
{
    /* the network header normally starts right after the Ethernet header */
    __u32 l3_off = ETH_HLEN;
    __u16 ether_type = load_half(skb, offsetof(struct ethhdr, h_proto));

    /* VLAN or QinQ frame: unwrap one tag and pick up the inner EtherType */
    if (ether_type == ETH_P_8021Q || ether_type == ETH_P_8021AD) {
        ether_type = load_half(skb, l3_off + offsetof(struct vlan_hdr,
                                                      h_vlan_encapsulated_proto));
        l3_off += sizeof(struct vlan_hdr);
    }

    /* publish the offset for the per-protocol filters */
    skb->cb[0] = l3_off;

    if (ether_type == ETH_P_IP)
        return ipv4_filter(skb);
    if (ether_type == ETH_P_IPV6)
        return ipv6_filter(skb);
    return -1;
}
// License string placed in the "license" section.
char __license[] SEC("license") = "GPL";
// Kernel version code (4.6.2) placed in the "version" section.
__u32 __version SEC("version") = LINUX_VERSION_CODE;
Bypass Filter
// Kernel version code stored in the "version" section of this program.
#define LINUX_VERSION_CODE 263682
// Key identifying an IPv4 flow in flow_table_v4.
struct flowv4_keys {
// Source and destination addresses.
__u32 src;
__u32 dst;
// The two 16-bit ports, also addressable as one 32-bit word.
union {
__u32 ports;
__u16 port16[2];
};
// Transport protocol flag: ipv4_filter() stores 1 for TCP and 0 for UDP.
__u8 ip_proto:1;
// VLAN ids supplied by hashfilter(); vlan0 is limited to 15 bits.
__u16 vlan0:15;
__u16 vlan1;
};
// Key identifying an IPv6 flow in flow_table_v6.
struct flowv6_keys {
// Source and destination addresses as four 32-bit words each.
__u32 src[4];
__u32 dst[4];
// The two 16-bit ports, also addressable as one 32-bit word.
union {
__u32 ports;
__u16 port16[2];
};
// Transport protocol flag: ipv6_filter() stores 1 for TCP and 0 for UDP.
__u8 ip_proto:1;
// VLAN ids supplied by hashfilter(); vlan0 is limited to 15 bits.
__u16 vlan0:15;
__u16 vlan1;
};
// Per-flow statistics: packet count and byte count.
struct pair {
__u64 packets;
__u64 bytes;
};
// IPv4 flow table: flowv4_keys -> pair (packet/byte counters).
// ipv4_filter() drops packets whose key is present and updates the counters.
struct bpf_map_def SEC("maps") flow_table_v4 = {
.type = BPF_MAP_TYPE_PERCPU_HASH,
.key_size = sizeof(struct flowv4_keys),
.value_size = sizeof(struct pair),
.max_entries = 32768,
};
// IPv6 flow table: flowv6_keys -> pair (packet/byte counters).
// ipv6_filter() drops packets whose key is present and updates the counters.
struct bpf_map_def SEC("maps") flow_table_v6 = {
.type = BPF_MAP_TYPE_PERCPU_HASH,
.key_size = sizeof(struct flowv6_keys),
.value_size = sizeof(struct pair),
.max_entries = 32768,
};
// VLAN tag header as it appears after the Ethernet header.
struct vlan_hdr {
// Tag control information; hashfilter() masks it with 0x0fff to get the VLAN id.
__u16 h_vlan_TCI;
// EtherType of the encapsulated protocol.
__u16 h_vlan_encapsulated_proto;
};
/**
 * IPv4 bypass filter
 *
 * Builds a flowv4_keys lookup key from the packet (addresses, ports,
 * TCP/UDP flag, VLAN ids) and looks it up in flow_table_v4. On a hit the
 * flow's packet and byte counters are updated.
 *
 * \param skb   packet; cb[0] must hold the IP header offset
 * \param vlan0 first VLAN id to store in the key
 * \param vlan1 second VLAN id to store in the key
 * \return 0 to drop packet out and -1 to accept it
 */
static __always_inline int ipv4_filter(struct __sk_buff *skb, __u16 vlan0, __u16 vlan1)
{
    struct flowv4_keys key;
    struct pair *counters;
    __u32 l3_off = skb->cb[0];
    __u32 l4_off;
    __u32 first_byte;
    __u16 tmp_port;
    __u8 l4_proto = load_byte(skb, l3_off + offsetof(struct iphdr, protocol));

    /* only support TCP and UDP for now */
    if (l4_proto == IPPROTO_TCP)
        key.ip_proto = 1;
    else if (l4_proto == IPPROTO_UDP)
        key.ip_proto = 0;
    else
        return -1;

    /* flow key: addresses, ports, protocol flag and VLAN ids */
    key.src = load_word(skb, l3_off + offsetof(struct iphdr, saddr));
    key.dst = load_word(skb, l3_off + offsetof(struct iphdr, daddr));

    /* low nibble of the first header byte is the header length in 32-bit words */
    first_byte = load_byte(skb, l3_off + 0/*offsetof(struct iphdr, ihl)*/);
    l4_off = l3_off + ((first_byte & 0xF) << 2);

    /* both ports are read as one word, then the two halves are exchanged */
    key.ports = load_word(skb, l4_off);
    tmp_port = key.port16[1];
    key.port16[1] = key.port16[0];
    key.port16[0] = tmp_port;

    key.vlan0 = vlan0;
    key.vlan1 = vlan1;
#if 0
    if ((key.port16[0] == 22) || (key.port16[1] == 22))
    {
        __u16 sp = key.port16[0];
        //__u16 dp = key.port16[1];
        char fmt[] = "Parsed SSH flow: %u %d -> %u\n";
        bpf_trace_printk(fmt, sizeof(fmt), key.src, sp, key.dst);
    }
#endif
    /* Test if the flow is in the hash */
    counters = bpf_map_lookup_elem(&flow_table_v4, &key);
    if (!counters)
        return -1;

#if 0
    {
        __u16 sp = key.port16[0];
        //__u16 dp = key.port16[1];
        char bfmt[] = "Found flow: %u %d -> %u\n";
        bpf_trace_printk(bfmt, sizeof(bfmt), key.src, sp, key.dst);
    }
#endif
    /* known flow: account for the packet and its length */
    counters->packets++;
    counters->bytes += skb->len;
    return 0;
}
/**
 * IPv6 bypass filter
 *
 * Builds a flowv6_keys lookup key from the packet (addresses, ports,
 * TCP/UDP flag, VLAN ids) and looks it up in flow_table_v6. On a hit the
 * flow's packet and byte counters are updated. Only packets whose next
 * header is directly TCP or UDP are considered.
 *
 * \param skb   packet; cb[0] must hold the IPv6 header offset
 * \param vlan0 first VLAN id to store in the key
 * \param vlan1 second VLAN id to store in the key
 * \return 0 to drop packet out and -1 to accept it
 */
static __always_inline int ipv6_filter(struct __sk_buff *skb, __u16 vlan0, __u16 vlan1)
{
    struct flowv6_keys key;
    struct pair *counters;
    __u32 l3_off = skb->cb[0];
    __u32 src_off;
    __u32 dst_off;
    __u16 tmp_port;
    /* get next header */
    __u8 next = load_byte(skb, l3_off + offsetof(struct ipv6hdr, nexthdr));

    /* only support direct TCP and UDP for now */
    if (next == IPPROTO_TCP)
        key.ip_proto = 1;
    else if (next == IPPROTO_UDP)
        key.ip_proto = 0;
    else
        return -1;

    /* copy both 128-bit addresses, one 32-bit word at a time */
    src_off = l3_off + offsetof(struct ipv6hdr, saddr);
    key.src[0] = load_word(skb, src_off);
    key.src[1] = load_word(skb, src_off + 4);
    key.src[2] = load_word(skb, src_off + 8);
    key.src[3] = load_word(skb, src_off + 12);

    dst_off = l3_off + offsetof(struct ipv6hdr, daddr);
    key.dst[0] = load_word(skb, dst_off);
    key.dst[1] = load_word(skb, dst_off + 4);
    key.dst[2] = load_word(skb, dst_off + 8);
    key.dst[3] = load_word(skb, dst_off + 12);

    /* ports sit right after the fixed 40-byte header; exchange the halves */
    key.ports = load_word(skb, l3_off + 40 /* IPV6_HEADER_LEN */);
    tmp_port = key.port16[1];
    key.port16[1] = key.port16[0];
    key.port16[0] = tmp_port;

    key.vlan0 = vlan0;
    key.vlan1 = vlan1;
    //char fmt[] = "Now Got IPv6 port %u and %u\n";
    //bpf_trace_printk(fmt, sizeof(fmt), key.port16[0], key.port16[1]);

    /* Test if the flow is in the hash */
    counters = bpf_map_lookup_elem(&flow_table_v6, &key);
    if (!counters)
        return -1;

    //char fmt[] = "Got a match IPv6: %u and %u\n";
    //bpf_trace_printk(fmt, sizeof(fmt), key.port16[0], key.port16[1]);
    counters->packets++;
    counters->bytes += skb->len;
    return 0;
}
/**
 * filter function
 *
 * It is loaded in kernel by Suricata that uses the section name specified
 * by the SEC call to find it in the Elf binary object and load it.
 *
 * Takes the first VLAN id from skb->vlan_tci, skips one in-frame VLAN/QinQ
 * tag if present (recording its id as the second VLAN when VLAN_TRACKING is
 * enabled), stores the network-header offset in skb->cb[0] and dispatches
 * to the IPv4 or IPv6 filter.
 *
 * \return 0 to drop packet out and -1 to accept it
 */
int SEC("filter") hashfilter(struct __sk_buff *skb) {
    __u32 nhoff = ETH_HLEN;
    __u16 proto = load_half(skb, offsetof(struct ethhdr, h_proto));
    /* first VLAN id, taken from the skb metadata */
    __u16 vlan0 = skb->vlan_tci & 0x0fff;
    __u16 vlan1 = 0;

    /* VLAN or QinQ tag present in the frame */
    if (proto == ETH_P_8021AD || proto == ETH_P_8021Q) {
        /* EtherType of the encapsulated protocol */
        proto = load_half(skb, nhoff + offsetof(struct vlan_hdr,
                                                h_vlan_encapsulated_proto));
#if VLAN_TRACKING
        /* one vlan layer is stripped by OS so get vlan 1 at first pass */
        vlan1 = load_half(skb, nhoff + offsetof(struct vlan_hdr,
                                                h_vlan_TCI)) & 0x0fff;
#endif
        nhoff += sizeof(struct vlan_hdr);
    }

    /* publish the offset for the per-protocol filters */
    skb->cb[0] = nhoff;
    switch (proto) {
        case ETH_P_IP:
            return ipv4_filter(skb, vlan0, vlan1);
        case ETH_P_IPV6:
            return ipv6_filter(skb, vlan0, vlan1);
        default:
#if 0
            {
                char fmt[] = "Got proto %u\n";
                /* the EtherType lives in 'proto'; there is no 'h_proto' here */
                bpf_trace_printk(fmt, sizeof(fmt), proto);
                break;
            }
#else
            break;
#endif
    }
    return -1;
}
// License string placed in the "license" section.
char __license[] SEC("license") = "GPL";
// Kernel version code placed in the "version" section.
__u32 __version SEC("version") = LINUX_VERSION_CODE;
Load Balancer
// Kernel version code stored in the "version" section of this program.
#define LINUX_VERSION_CODE 263682
// Fallback definition: place a symbol in the named ELF section and keep it
// even if it looks unused.
#ifndef __section
# define __section(x) __attribute__((section(x), used))
#endif
// VLAN tag header as it appears after the Ethernet header.
struct vlan_hdr {
// Tag control information.
__u16 h_vlan_TCI;
// EtherType of the encapsulated protocol.
__u16 h_vlan_encapsulated_proto;
};
/**
 * Hash an IPv4 packet on its address pair.
 *
 * The IP header offset is read from skb->cb[0]. The hash is the plain sum of
 * the source and destination addresses.
 *
 * \return source address + destination address
 */
static __always_inline int ipv4_hash(struct __sk_buff *skb)
{
    __u32 l3_off = skb->cb[0];
    __u32 saddr = load_word(skb, l3_off + offsetof(struct iphdr, saddr));
    __u32 daddr = load_word(skb, l3_off + offsetof(struct iphdr, daddr));

#if 0
    char fmt[] = "Got addr: %x -> %x at %d\n";
    bpf_trace_printk(fmt, sizeof(fmt), saddr, daddr, l3_off);
    //char fmt2[] = "Got hash %u\n";
    //bpf_trace_printk(fmt2, sizeof(fmt2), saddr + daddr);
#endif
    return saddr + daddr;
}
/**
 * Fold one 16-byte IPv6 address into 32 bits.
 *
 * \param ctx packet to read from
 * \param off offset of the address inside the packet
 * \return XOR of the four 32-bit words of the address
 */
static inline __u32 ipv6_addr_hash(struct __sk_buff *ctx, __u64 off)
{
    __u64 folded = load_word(ctx, off);

    folded ^= load_word(ctx, off + 4);
    folded ^= load_word(ctx, off + 8);
    folded ^= load_word(ctx, off + 12);
    return (__u32)folded;
}
/**
 * Hash an IPv6 packet on its address pair.
 *
 * The IPv6 header offset is read from skb->cb[0]. Each address is folded
 * with ipv6_addr_hash() and the two results are added.
 *
 * \return folded source address + folded destination address
 */
static __always_inline int ipv6_hash(struct __sk_buff *skb)
{
    __u32 l3_off = skb->cb[0];
    __u32 from = ipv6_addr_hash(skb, l3_off + offsetof(struct ipv6hdr, saddr));
    __u32 to = ipv6_addr_hash(skb, l3_off + offsetof(struct ipv6hdr, daddr));

    return from + to;
}
/**
 * Load-balancing entry point.
 *
 * Reads the EtherType, unwraps up to two VLAN/QinQ tags, stores the
 * network-header offset in skb->cb[0] and returns a hash of the IP address
 * pair. For any other protocol the EtherType itself is returned.
 *
 * \return value computed by ipv4_hash()/ipv6_hash() (narrowed to 16 bits by
 *         the local result variable), or the EtherType for non-IP frames
 */
int __section("loadbalancer") lb(struct __sk_buff *skb) {
    __u64 nhoff = ETH_HLEN;
    /* EtherType of the frame */
    __u16 proto = load_half(skb, ETH_HLEN - ETH_TLEN);
    __u16 ret = proto;

    /* ipv4_hash()/ipv6_hash() take their offset from skb->cb[0]. Store it
     * for untagged frames as well; previously only the VLAN branch wrote it. */
    skb->cb[0] = nhoff;

    switch (proto) {
        /* VLAN and QinQ handling */
        case ETH_P_8021Q:
        case ETH_P_8021AD:
            {
                /* look at what the first tag encapsulates */
                __u16 vproto = load_half(skb, nhoff + offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
                switch(vproto) {
                    /* second VLAN layer: skip it and read the inner EtherType */
                    case ETH_P_8021AD:
                    case ETH_P_8021Q:
                        nhoff += sizeof(struct vlan_hdr);
                        proto = load_half(skb, nhoff + offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
                        break;
                    default:
                        proto = vproto;
                }
                nhoff += sizeof(struct vlan_hdr);
                skb->cb[0] = nhoff;
                switch (proto) {
                    /* IPv4 hash */
                    case ETH_P_IP:
#if 0
                        { char fmt[] = "ipv4\n"; bpf_trace_printk(fmt, sizeof(fmt));}
#endif
                        ret = ipv4_hash(skb);
                        break;
                    /* IPv6 hash */
                    case ETH_P_IPV6:
                        ret = ipv6_hash(skb);
                        break;
                    default:
#if 0
                        {
                            char fmt[] = "Dflt VLAN proto %u\n";
                            bpf_trace_printk(fmt, sizeof(fmt), proto);
                            break;
                        }
#else
                        break;
#endif
                }
            }
            break;
        case ETH_P_IP:
            /* IPv4 hash */
            ret = ipv4_hash(skb);
            break;
        case ETH_P_IPV6:
            /* IPv6 hash */
            ret = ipv6_hash(skb);
            break;
        default:
#if 0
            {
                char fmt[] = "Got proto %x\n";
                bpf_trace_printk(fmt, sizeof(fmt), proto);
                break;
            }
#else
            break;
#endif
    }
    return ret;
}
/* License string placed in the "license" section. */
char __license[] __section("license") = "GPL";
/* libbpf needs the version section to check that the eBPF code and the
 * kernel are in sync, but socket filters don't need it */
__u32 __version __section("version") = LINUX_VERSION_CODE;
Vlan Filter
/* Kernel version code stored in the "version" section of this program. */
#define LINUX_VERSION_CODE 263682
/**
 * VLAN filter: decides on the 12-bit VLAN id found in skb->vlan_tci.
 *
 * \return -1 for VLAN 2 and VLAN 4, 0 for every other VLAN id
 */
int SEC("filter") hashfilter(struct __sk_buff *skb) {
    __u16 vid = skb->vlan_tci & 0x0fff;

    /* accept VLAN 2 and 4 and drop the rest */
    if (vid == 2 || vid == 4)
        return -1;
    return 0;
}
/* License string placed in the "license" section. */
char __license[] SEC("license") = "GPL";
/* Kernel version code placed in the "version" section. */
__u32 __version SEC("version") = LINUX_VERSION_CODE;
XDP Filter
#define LINUX_VERSION_CODE 263682
/* Hashing initval */
#define INITVAL 15485863
/* Set BUILD_CPUMAP to 0 if you want to run XDP bypass on kernel
* older than 4.15 */
#define BUILD_CPUMAP 1
/* Increase CPUMAP_MAX_CPUS if ever you have more than 64 CPUs */
#define CPUMAP_MAX_CPUS 64
/* Set to 1 to bypass encrypted packets of TLS sessions. Suricata will
* be blind to these packets or forged packets looking alike. */
#define ENCRYPTED_TLS_BYPASS 0
/* Set it to 0 if for example you plan to use the XDP filter in a
* network card that don't support per CPU value (like netronome) */
#define USE_PERCPU_HASH 1
/* Set it to 0 if your XDP subsystem don't handle XDP_REDIRECT (like netronome) */
#define GOT_TX_PEER 1
/* set to non 0 to load balance in hardware mode on RSS_QUEUE_NUMBERS queues
* and unset BUILD_CPUMAP (number must be a power of 2 for netronome) */
#define RSS_QUEUE_NUMBERS 32
/* no vlan tracking: set it to 0 if you don't use VLAN for tracking. Can
* also be used as workaround of some hardware offload issue */
#define VLAN_TRACKING 1
/* VLAN tag header as it appears after the Ethernet header. */
struct vlan_hdr {
/* Tag control information; xdp_hashfilter() masks it with 0x0fff. */
__u16 h_vlan_TCI;
/* EtherType of the encapsulated protocol. */
__u16 h_vlan_encapsulated_proto;
};
/* Key identifying an IPv4 flow in flow_table_v4. */
struct flowv4_keys {
/* Source and destination addresses, copied straight from the IP header. */
__u32 src;
__u32 dst;
/* The two 16-bit ports (index 0: source, index 1: destination as filled
 * by filter_ipv4()), also addressable as one 32-bit word. */
union {
__u32 ports;
__u16 port16[2];
};
/* Transport protocol flag: filter_ipv4() stores 1 for TCP, 0 otherwise. */
__u8 ip_proto:1;
/* VLAN ids supplied by xdp_hashfilter(); vlan0 is limited to 15 bits. */
__u16 vlan0:15;
__u16 vlan1;
};
/* Key identifying an IPv6 flow in flow_table_v6. */
struct flowv6_keys {
/* Source and destination addresses as four 32-bit words each. */
__u32 src[4];
__u32 dst[4];
/* The two 16-bit ports (index 0: source, index 1: destination as filled
 * by filter_ipv6()), also addressable as one 32-bit word. */
union {
__u32 ports;
__u16 port16[2];
};
/* Transport protocol flag: filter_ipv6() stores 1 for TCP, 0 for UDP. */
__u8 ip_proto:1;
/* VLAN ids supplied by xdp_hashfilter(); vlan0 is limited to 15 bits. */
__u16 vlan0:15;
__u16 vlan1;
};
/* Per-flow statistics: packet count and byte count. */
struct pair {
__u64 packets;
__u64 bytes;
};
/* IPv4 flow table: flowv4_keys -> pair. The map type follows
 * USE_PERCPU_HASH (per-CPU hash when set, plain hash otherwise). */
struct bpf_map_def SEC("maps") flow_table_v4 = {
#if USE_PERCPU_HASH
.type = BPF_MAP_TYPE_PERCPU_HASH,
#else
.type = BPF_MAP_TYPE_HASH,
#endif
.key_size = sizeof(struct flowv4_keys),
.value_size = sizeof(struct pair),
.max_entries = 32768,
};
/* IPv6 flow table: flowv6_keys -> pair. The map type follows
 * USE_PERCPU_HASH (per-CPU hash when set, plain hash otherwise). */
struct bpf_map_def SEC("maps") flow_table_v6 = {
#if USE_PERCPU_HASH
.type = BPF_MAP_TYPE_PERCPU_HASH,
#else
.type = BPF_MAP_TYPE_HASH,
#endif
.key_size = sizeof(struct flowv6_keys),
.value_size = sizeof(struct pair),
.max_entries = 32768,
};
#if ENCRYPTED_TLS_BYPASS
/* Single-entry array holding a 64-bit count of bypassed encrypted TLS
 * packets; only built when ENCRYPTED_TLS_BYPASS is set. */
struct bpf_map_def SEC("maps") tls_bypass_count = {
#if USE_PERCPU_HASH
.type = BPF_MAP_TYPE_PERCPU_ARRAY,
#else
.type = BPF_MAP_TYPE_ARRAY,
#endif
.key_size = sizeof(__u32),
.value_size = sizeof(__u64),
.max_entries = 1,
};
#endif
#if BUILD_CPUMAP
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
.type = BPF_MAP_TYPE_CPUMAP,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = CPUMAP_MAX_CPUS,
};
/* Array indexed by (hash % CPU count); each value is the cpu_map key the
 * packet is redirected to. */
struct bpf_map_def SEC("maps") cpus_available = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = CPUMAP_MAX_CPUS,
};
/* Single entry holding the number of usable slots in cpus_available; the
 * filters use it as the modulus for the flow hash. */
struct bpf_map_def SEC("maps") cpus_count = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = 1,
};
#endif
#if GOT_TX_PEER
/* Map has only one element as we don't handle any sort of
 * routing for now. The key set by user space is 0 and the
 * value is the peer interface. */
struct bpf_map_def SEC("maps") tx_peer = {
.type = BPF_MAP_TYPE_DEVMAP,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 1,
};
/* Single entry indicating whether we have a peer; the key
 * set in user space is 0. It is only used to see if
 * an interface has a peer we need to send the packets to. */
struct bpf_map_def SEC("maps") tx_peer_int = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(int),
.value_size = sizeof(int),
.max_entries = 1,
};
#endif
/* Set to 1 to build the global bypass switch checked at the top of
 * xdp_hashfilter(). */
#define USE_GLOBAL_BYPASS 0
#if USE_GLOBAL_BYPASS
/* single entry to indicate if global bypass switch is on */
/* NOTE(review): key_size is sizeof(char) here while every other array map
 * in this program uses a 4-byte key - confirm before enabling. */
struct bpf_map_def SEC("maps") global_bypass = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(char),
.value_size = sizeof(char),
.max_entries = 1,
};
#endif
/**
 * Read the TCP or UDP source port.
 *
 * \param trans_data start of the transport header
 * \param data_end   end of the packet data
 * \param protocol   transport protocol number
 * \return the source port field as stored in the header, 0 for protocols
 *         other than TCP/UDP, -1 when the header does not fit in the packet
 */
static __always_inline int get_sport(void *trans_data, void *data_end,
                                     __u8 protocol)
{
    if (protocol == IPPROTO_TCP) {
        struct tcphdr *tcp = (struct tcphdr *)trans_data;

        if ((void *)(tcp + 1) > data_end)
            return -1;
        return tcp->source;
    }
    if (protocol == IPPROTO_UDP) {
        struct udphdr *udp = (struct udphdr *)trans_data;

        if ((void *)(udp + 1) > data_end)
            return -1;
        return udp->source;
    }
    return 0;
}
/**
 * Read the TCP or UDP destination port.
 *
 * \param trans_data start of the transport header
 * \param data_end   end of the packet data
 * \param protocol   transport protocol number
 * \return the destination port field as stored in the header, 0 for
 *         protocols other than TCP/UDP, -1 when the header does not fit in
 *         the packet
 */
static __always_inline int get_dport(void *trans_data, void *data_end,
                                     __u8 protocol)
{
    if (protocol == IPPROTO_TCP) {
        struct tcphdr *tcp = (struct tcphdr *)trans_data;

        if ((void *)(tcp + 1) > data_end)
            return -1;
        return tcp->dest;
    }
    if (protocol == IPPROTO_UDP) {
        struct udphdr *udp = (struct udphdr *)trans_data;

        if ((void *)(udp + 1) > data_end)
            return -1;
        return udp->dest;
    }
    return 0;
}
/**
 * XDP IPv4 handler.
 *
 * Builds a flowv4_keys key from the packet and looks it up in flow_table_v4.
 * A known flow has its counters updated and is dropped, or redirected to the
 * peer interface when GOT_TX_PEER is set. With ENCRYPTED_TLS_BYPASS, TLS
 * application-data records on port 443 are counted and bypassed the same way.
 * Every other packet is spread over CPUs (BUILD_CPUMAP) or RX queues
 * (RSS_QUEUE_NUMBERS) using a hash of the address pair.
 *
 * \param ctx      XDP context
 * \param data     start of packet data
 * \param nh_off   offset of the IPv4 header inside the packet
 * \param data_end end of packet data
 * \param vlan0    first VLAN id stored in the flow key
 * \param vlan1    second VLAN id stored in the flow key
 * \return an XDP action code, or the result of bpf_redirect_map()
 */
static int __always_inline filter_ipv4(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end, __u16 vlan0, __u16 vlan1)
{
    struct iphdr *iph = data + nh_off;
    int dport;
    int sport;
    struct flowv4_keys tuple;
    struct pair *value;
#if BUILD_CPUMAP || GOT_TX_PEER
    __u32 key0 = 0;
#endif
#if ENCRYPTED_TLS_BYPASS
    __u32 key1 = 0;
    /* tls_bypass_count stores 64-bit values, so point at a __u64 */
    __u64 *tls_count = NULL;
#endif
#if BUILD_CPUMAP
    __u32 cpu_dest;
    __u32 *cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
    __u32 *cpu_selected;
    __u32 cpu_hash;
#endif
#if GOT_TX_PEER
    int *iface_peer;
    int tx_port = 0;
#endif

    if ((void *)(iph + 1) > data_end)
        return XDP_PASS;

    /* build the flow key */
    if (iph->protocol == IPPROTO_TCP) {
        tuple.ip_proto = 1;
    } else {
        tuple.ip_proto = 0;
    }
    tuple.src = iph->saddr;
    tuple.dst = iph->daddr;

    dport = get_dport(iph + 1, data_end, iph->protocol);
    if (dport == -1)
        return XDP_PASS;

    sport = get_sport(iph + 1, data_end, iph->protocol);
    if (sport == -1)
        return XDP_PASS;

    tuple.port16[0] = (__u16)sport;
    tuple.port16[1] = (__u16)dport;

    tuple.vlan0 = vlan0;
    tuple.vlan1 = vlan1;

    /* flow table lookup */
    value = bpf_map_lookup_elem(&flow_table_v4, &tuple);
#if 0
    {
        char fmt[] = "Current flow src: %u:%d\n";
        char fmt1[] = "Current flow dst: %u:%d\n";
        bpf_trace_printk(fmt, sizeof(fmt), tuple.src, tuple.port16[0]);
        bpf_trace_printk(fmt1, sizeof(fmt1), tuple.dst, tuple.port16[1]);
    }
#endif
    if (value) {
#if 0
        char fmt[] = "Found flow v4: %u %d -> %d\n";
        bpf_trace_printk(fmt, sizeof(fmt), tuple.src, sport, dport);
        /* struct pair only has packets and bytes; use a distinct buffer name */
        char fmt2[] = "Data: p:%lu n:%lu\n";
        bpf_trace_printk(fmt2, sizeof(fmt2), value->packets, value->bytes);
#endif
#if USE_PERCPU_HASH
        /* known flow: update packet and byte counters */
        value->packets++;
        value->bytes += data_end - data;
#else
        /* known flow: update packet and byte counters atomically */
        __sync_fetch_and_add(&value->packets, 1);
        __sync_fetch_and_add(&value->bytes, data_end - data);
#endif

#if GOT_TX_PEER
        iface_peer = bpf_map_lookup_elem(&tx_peer_int, &key0);
        if (!iface_peer) {
            return XDP_DROP;
        } else {
            return bpf_redirect_map(&tx_peer, tx_port, 0);
        }
#else
        return XDP_DROP;
#endif
    }

#if ENCRYPTED_TLS_BYPASS
    if ((dport == __constant_ntohs(443)) || (sport == __constant_ntohs(443))) {
        __u8 *app_data;
        /* drop application data for tls 1.2 */
        /* FIXME better parsing */
        nh_off += sizeof(struct iphdr) + sizeof(struct tcphdr);
        if (data_end > data + nh_off + 4) {
            app_data = data + nh_off;
            /* record bytes 0x17 0x03 0x03: encrypted TLS application data */
            if (app_data[0] == 0x17 && app_data[1] == 0x3 && app_data[2] == 0x3) {
                /* bump the bypass counter */
                tls_count = bpf_map_lookup_elem(&tls_bypass_count, &key1);
                if (tls_count) {
#if USE_PERCPU_HASH
                    /* increment the counter, not the pointer */
                    (*tls_count)++;
#else
                    __sync_fetch_and_add(tls_count, 1);
#endif
                }
#if GOT_TX_PEER
                iface_peer = bpf_map_lookup_elem(&tx_peer_int, &key0);
                if (!iface_peer) {
                    return XDP_DROP;
                } else {
                    return bpf_redirect_map(&tx_peer, tx_port, 0);
                }
#else
                return XDP_DROP;
#endif
            }
        }
    }
#endif

#if BUILD_CPUMAP
    /* IP-pairs + protocol (UDP/TCP/ICMP) hit same CPU */
    cpu_hash = tuple.src + tuple.dst;
    cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL + iph->protocol);

    if (cpu_max && *cpu_max) {
        cpu_dest = cpu_hash % *cpu_max;
        cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_dest);
        if (!cpu_selected)
            return XDP_ABORTED;
        cpu_dest = *cpu_selected;
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
    } else {
        return XDP_PASS;
    }
#else
#if RSS_QUEUE_NUMBERS
    /* IP-pairs + protocol (UDP/TCP/ICMP) hit same CPU */
    __u32 xdp_hash = tuple.src + tuple.dst;
    xdp_hash = SuperFastHash((char *)&xdp_hash, 4, INITVAL + iph->protocol);
    ctx->rx_queue_index = xdp_hash % RSS_QUEUE_NUMBERS;
#endif
    return XDP_PASS;
#endif
}
/**
 * XDP IPv6 handler.
 *
 * Only packets whose next header is directly TCP or UDP are inspected.
 * Builds a flowv6_keys key and looks it up in flow_table_v6. A known flow
 * has its counters updated and is dropped, or redirected to the peer
 * interface when GOT_TX_PEER is set. Every other packet is spread over CPUs
 * (BUILD_CPUMAP) or RX queues (RSS_QUEUE_NUMBERS) using a hash of the
 * address pair.
 *
 * \param ctx      XDP context
 * \param data     start of packet data
 * \param nh_off   offset of the IPv6 header inside the packet
 * \param data_end end of packet data
 * \param vlan0    first VLAN id stored in the flow key
 * \param vlan1    second VLAN id stored in the flow key
 * \return an XDP action code, or the result of bpf_redirect_map()
 */
static int __always_inline filter_ipv6(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end, __u16 vlan0, __u16 vlan1)
{
    struct ipv6hdr *ip6h = data + nh_off;
    int dport;
    int sport;
    struct flowv6_keys tuple;
    struct pair *value;
#if BUILD_CPUMAP || GOT_TX_PEER
    __u32 key0 = 0;
#endif
#if BUILD_CPUMAP
    __u32 cpu_dest;
    /* cpus_count stores __u32 values, same type as used in filter_ipv4() */
    __u32 *cpu_max = bpf_map_lookup_elem(&cpus_count, &key0);
    __u32 *cpu_selected;
    __u32 cpu_hash;
#endif
#if GOT_TX_PEER
    int tx_port = 0;
    int *iface_peer;
#endif

    /* truncated header: pass the packet on by name rather than returning a
     * bare 0, matching what filter_ipv4() does in the same situation */
    if ((void *)(ip6h + 1) > data_end)
        return XDP_PASS;
    if (!((ip6h->nexthdr == IPPROTO_UDP) || (ip6h->nexthdr == IPPROTO_TCP)))
        return XDP_PASS;

    dport = get_dport(ip6h + 1, data_end, ip6h->nexthdr);
    if (dport == -1)
        return XDP_PASS;

    sport = get_sport(ip6h + 1, data_end, ip6h->nexthdr);
    if (sport == -1)
        return XDP_PASS;

    /* build the flow key */
    if (ip6h->nexthdr == IPPROTO_TCP) {
        tuple.ip_proto = 1;
    } else {
        tuple.ip_proto = 0;
    }
    __builtin_memcpy(tuple.src, ip6h->saddr.s6_addr32, sizeof(tuple.src));
    __builtin_memcpy(tuple.dst, ip6h->daddr.s6_addr32, sizeof(tuple.dst));
    tuple.port16[0] = sport;
    tuple.port16[1] = dport;

    tuple.vlan0 = vlan0;
    tuple.vlan1 = vlan1;

    /* flow table lookup */
    value = bpf_map_lookup_elem(&flow_table_v6, &tuple);
    if (value) {
#if 0
        char fmt6[] = "Found IPv6 flow: %d -> %d\n";
        bpf_trace_printk(fmt6, sizeof(fmt6), sport, dport);
#endif
#if USE_PERCPU_HASH
        /* known flow: update packet and byte counters */
        value->packets++;
        value->bytes += data_end - data;
#else
        /* known flow: update packet and byte counters atomically */
        __sync_fetch_and_add(&value->packets, 1);
        __sync_fetch_and_add(&value->bytes, data_end - data);
#endif

#if GOT_TX_PEER
        iface_peer = bpf_map_lookup_elem(&tx_peer_int, &key0);
        if (!iface_peer) {
            return XDP_DROP;
        } else {
            return bpf_redirect_map(&tx_peer, tx_port, 0);
        }
#else
        return XDP_DROP;
#endif
    }

#if BUILD_CPUMAP
    /* IP-pairs + protocol (UDP/TCP/ICMP) hit same CPU */
    cpu_hash  = tuple.src[0] + tuple.dst[0];
    cpu_hash += tuple.src[1] + tuple.dst[1];
    cpu_hash += tuple.src[2] + tuple.dst[2];
    cpu_hash += tuple.src[3] + tuple.dst[3];
    cpu_hash = SuperFastHash((char *)&cpu_hash, 4, INITVAL);

    if (cpu_max && *cpu_max) {
        cpu_dest = cpu_hash % *cpu_max;
        cpu_selected = bpf_map_lookup_elem(&cpus_available, &cpu_dest);
        if (!cpu_selected)
            return XDP_ABORTED;
        cpu_dest = *cpu_selected;
        return bpf_redirect_map(&cpu_map, cpu_dest, 0);
    } else {
        return XDP_PASS;
    }
#else
#if RSS_QUEUE_NUMBERS
    /* IP-pairs + protocol (UDP/TCP/ICMP) hit same CPU */
    __u32 xdp_hash = tuple.src[0] + tuple.dst[0];
    xdp_hash += tuple.src[1] + tuple.dst[1];
    xdp_hash += tuple.src[2] + tuple.dst[2];
    xdp_hash += tuple.src[3] + tuple.dst[3];
    xdp_hash = SuperFastHash((char *)&xdp_hash, 4, INITVAL);
    ctx->rx_queue_index = xdp_hash % RSS_QUEUE_NUMBERS;
#endif
    return XDP_PASS;
#endif
}
/**
 * XDP entry point of the bypass filter.
 *
 * Optionally honours the global bypass switch (USE_GLOBAL_BYPASS), then
 * parses the Ethernet header, unwraps up to two VLAN/QinQ tags (recording
 * their masked TCI values when VLAN_TRACKING is set) and dispatches to
 * filter_ipv4() or filter_ipv6().
 *
 * \return the verdict of the per-protocol filter, or XDP_PASS for truncated
 *         frames and non-IP protocols
 */
int SEC("xdp") xdp_hashfilter(struct xdp_md *ctx)
{
    void *data_end = (void *)(long)ctx->data_end;
    void *data = (void *)(long)ctx->data;
    struct ethhdr *eth = data;
    __u16 h_proto;
    __u64 nh_off;
    __u16 vlan0 = 0;
    __u16 vlan1 = 0;
#if USE_GLOBAL_BYPASS
    int *iface_peer;
    char *g_switch = 0;
    /* Lookup key for entry 0. It must be initialised, and it is also handed
     * to tx_peer_int whose key size is sizeof(int), so a 4-byte zero is used
     * instead of an uninitialised char. */
    __u32 key0 = 0;
    int tx_port = 0;

    g_switch = bpf_map_lookup_elem(&global_bypass, &key0);
    if (g_switch && *g_switch) {
        iface_peer = bpf_map_lookup_elem(&tx_peer_int, &key0);
        if (!iface_peer) {
            return XDP_DROP;
        } else {
            return bpf_redirect_map(&tx_peer, tx_port, 0);
        }
    }
#endif

    nh_off = sizeof(*eth);
    if (data + nh_off > data_end)
        return XDP_PASS;

    h_proto = eth->h_proto;

    /* first VLAN / QinQ tag */
    if (h_proto == __constant_htons(ETH_P_8021Q) || h_proto == __constant_htons(ETH_P_8021AD)) {
        struct vlan_hdr *vhdr;

        vhdr = data + nh_off;
        nh_off += sizeof(struct vlan_hdr);
        if (data + nh_off > data_end)
            return XDP_PASS;
        h_proto = vhdr->h_vlan_encapsulated_proto;
#if VLAN_TRACKING
        vlan0 = vhdr->h_vlan_TCI & 0x0fff;
#else
        vlan0 = 0;
#endif
    }
    /* second VLAN / QinQ tag */
    if (h_proto == __constant_htons(ETH_P_8021Q) || h_proto == __constant_htons(ETH_P_8021AD)) {
        struct vlan_hdr *vhdr;

        vhdr = data + nh_off;
        nh_off += sizeof(struct vlan_hdr);
        if (data + nh_off > data_end)
            return XDP_PASS;
        h_proto = vhdr->h_vlan_encapsulated_proto;
#if VLAN_TRACKING
        vlan1 = vhdr->h_vlan_TCI & 0x0fff;
#else
        vlan1 = 0;
#endif
    }

    /* run the IPv4 or IPv6 filter */
    if (h_proto == __constant_htons(ETH_P_IP))
        return filter_ipv4(ctx, data, nh_off, data_end, vlan0, vlan1);
    else if (h_proto == __constant_htons(ETH_P_IPV6))
        return filter_ipv6(ctx, data, nh_off, data_end, vlan0, vlan1);

    return XDP_PASS;
}
/* License string placed in the "license" section. */
char __license[] SEC("license") = "GPL";
/* Kernel version code placed in the "version" section. */
__u32 __version SEC("version") = LINUX_VERSION_CODE;
XDP Load Balancer
/* Kernel version code stored in the "version" section of this program. */
#define LINUX_VERSION_CODE 263682
/* Initial value passed to SuperFastHash() when hashing address pairs */
#define INITVAL 15485863
/* Increase CPUMAP_MAX_CPUS if you ever have more than 128 CPUs */
#define CPUMAP_MAX_CPUS 128
/* VLAN tag header as it appears after the Ethernet header. */
struct vlan_hdr {
/* Tag control information. */
__u16 h_vlan_TCI;
/* EtherType of the encapsulated protocol. */
__u16 h_vlan_encapsulated_proto;
};
/* Special map type that can XDP_REDIRECT frames to another CPU */
struct bpf_map_def SEC("maps") cpu_map = {
.type = BPF_MAP_TYPE_CPUMAP,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = CPUMAP_MAX_CPUS,
};
/* Array indexed by (hash % CPU count); each value is the cpu_map key the
 * packet is redirected to. */
struct bpf_map_def SEC("maps") cpus_available = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = CPUMAP_MAX_CPUS,
};
/* Single entry holding the number of usable slots in cpus_available; the
 * hash functions use it as the modulus. */
struct bpf_map_def SEC("maps") cpus_count = {
.type = BPF_MAP_TYPE_ARRAY,
.key_size = sizeof(__u32),
.value_size = sizeof(__u32),
.max_entries = 1,
};
/**
 * Pick a CPU for an IPv4 packet from its address pair and redirect to it.
 *
 * \param data     start of the IPv4 header
 * \param data_end end of packet data
 * \return XDP_PASS when the header is truncated or no CPU count is
 *         configured, XDP_ABORTED when the selected slot is missing from
 *         cpus_available, otherwise the result of bpf_redirect_map()
 */
static int __always_inline hash_ipv4(void *data, void *data_end)
{
    struct iphdr *ip4 = data;
    __u32 zero = 0;
    __u32 *nr_cpus;
    __u32 *target;
    __u32 slot;
    __u32 flow_hash;

    if ((void *)(ip4 + 1) > data_end)
        return XDP_PASS;

    nr_cpus = bpf_map_lookup_elem(&cpus_count, &zero);

    /* IP-pairs hit same CPU */
    flow_hash = ip4->saddr + ip4->daddr;
    flow_hash = SuperFastHash((char *)&flow_hash, 4, INITVAL);

    if (!nr_cpus || !*nr_cpus)
        return XDP_PASS;

    slot = flow_hash % *nr_cpus;
    target = bpf_map_lookup_elem(&cpus_available, &slot);
    if (!target)
        return XDP_ABORTED;

    return bpf_redirect_map(&cpu_map, *target, 0);
}
/**
 * Pick a CPU for an IPv6 packet from its address pair and redirect to it.
 *
 * \param data     start of the IPv6 header
 * \param data_end end of packet data
 * \return XDP_PASS when the header is truncated or no CPU count is
 *         configured, XDP_ABORTED when the selected slot is missing from
 *         cpus_available, otherwise the result of bpf_redirect_map()
 */
static int __always_inline hash_ipv6(void *data, void *data_end)
{
    struct ipv6hdr *ip6 = data;
    __u32 zero = 0;
    __u32 *nr_cpus;
    __u32 *target;
    __u32 slot;
    __u32 flow_hash;

    if ((void *)(ip6 + 1) > data_end)
        return XDP_PASS;

    nr_cpus = bpf_map_lookup_elem(&cpus_count, &zero);

    /* IP-pairs hit same CPU: sum all eight 32-bit address words */
    flow_hash  = ip6->saddr.s6_addr32[0] + ip6->daddr.s6_addr32[0];
    flow_hash += ip6->saddr.s6_addr32[1] + ip6->daddr.s6_addr32[1];
    flow_hash += ip6->saddr.s6_addr32[2] + ip6->daddr.s6_addr32[2];
    flow_hash += ip6->saddr.s6_addr32[3] + ip6->daddr.s6_addr32[3];
    flow_hash = SuperFastHash((char *)&flow_hash, 4, INITVAL);

    if (!nr_cpus || !*nr_cpus)
        return XDP_PASS;

    slot = flow_hash % *nr_cpus;
    target = bpf_map_lookup_elem(&cpus_available, &slot);
    if (!target)
        return XDP_ABORTED;

    return bpf_redirect_map(&cpu_map, *target, 0);
}
/**
 * Strip a GRE (optionally ERSPAN) encapsulation and hash on the inner packet.
 *
 * Computes the length of the outer IPv4 + GRE headers (plus 8 bytes when
 * the GRE protocol is ERSPAN), removes them with bpf_xdp_adjust_head(), then
 * parses the data that follows as an Ethernet frame with at most one 802.1Q
 * tag and hashes the inner IPv4/IPv6 header.
 *
 * \param ctx      XDP context (its head is moved forward)
 * \param data     start of packet data
 * \param nh_off   offset of the outer IPv4 header
 * \param data_end end of packet data
 * \return XDP_PASS for anything that cannot be parsed or is not IP,
 *         otherwise the result of hash_ipv4()/hash_ipv6()
 */
static int __always_inline filter_gre(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end)
{
    struct iphdr *iph = data + nh_off;
    __u16 proto;
    struct gre_hdr {
        __be16 flags;
        __be16 proto;
    };

    nh_off += sizeof(struct iphdr);
    struct gre_hdr *grhdr = (struct gre_hdr *)(iph + 1);

    if ((void *)(grhdr + 1) > data_end)
        return XDP_PASS;

    /* only plain GRE without routing is handled */
    if (grhdr->flags & (GRE_VERSION|GRE_ROUTING))
        return XDP_PASS;

    /* fixed GRE header, then one extra word per optional field */
    nh_off += 4;
    proto = grhdr->proto;
    if (grhdr->flags & GRE_CSUM)
        nh_off += 4;
    if (grhdr->flags & GRE_KEY)
        nh_off += 4;
    if (grhdr->flags & GRE_SEQ)
        nh_off += 4;

    /* Update offset to skip ERSPAN header if we have one */
    if (proto == __constant_htons(ETH_P_ERSPAN)) {
        nh_off += 8;
    }

    if (data + nh_off > data_end)
        return XDP_PASS;
    if (bpf_xdp_adjust_head(ctx, 0 + nh_off))
        return XDP_PASS;

    /* the packet pointers changed, reload them */
    data = (void *)(long)ctx->data;
    data_end = (void *)(long)ctx->data_end;

    /* we have now data starting at Ethernet header */
    struct ethhdr *eth = data;
    /* we want to hash on IP so we need to get to ip hdr */
    nh_off = sizeof(*eth);

    /* check the new Ethernet header is fully inside the packet before
     * reading any of its fields */
    if (data + nh_off > data_end)
        return XDP_PASS;
    proto = eth->h_proto;

    /* we need to increase offset and update protocol
     * in the case we have VLANs */
    if (proto == __constant_htons(ETH_P_8021Q)) {
        struct vlan_hdr *vhdr = (struct vlan_hdr *)(data + nh_off);
        if ((void *)(vhdr + 1) > data_end)
            return XDP_PASS;
        proto = vhdr->h_vlan_encapsulated_proto;
        nh_off += sizeof(struct vlan_hdr);
    }

    if (data + nh_off > data_end)
        return XDP_PASS;
    /* proto should now be IP style */
    if (proto == __constant_htons(ETH_P_IP)) {
        return hash_ipv4(data + nh_off, data_end);
    } else if (proto == __constant_htons(ETH_P_IPV6)) {
        return hash_ipv6(data + nh_off, data_end);
    } else
        return XDP_PASS;
}
/**
 * IPv4 dispatch for the XDP load balancer.
 *
 * GRE-encapsulated packets go through filter_gre(); everything else is
 * hashed directly on the IPv4 header.
 *
 * \return XDP_PASS when the IPv4 header is truncated, otherwise the result
 *         of filter_gre() or hash_ipv4()
 */
static int __always_inline filter_ipv4(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end)
{
    struct iphdr *ip4 = data + nh_off;

    if ((void *)(ip4 + 1) > data_end)
        return XDP_PASS;

    /* GRE tunnel: hash on the encapsulated packet instead */
    return (ip4->protocol == IPPROTO_GRE)
        ? filter_gre(ctx, data, nh_off, data_end)
        : hash_ipv4(data + nh_off, data_end);
}
/**
 * IPv6 dispatch for the XDP load balancer: hash directly on the IPv6 header
 * located nh_off bytes into the packet.
 *
 * \return the result of hash_ipv6()
 */
static int __always_inline filter_ipv6(struct xdp_md *ctx, void *data, __u64 nh_off, void *data_end)
{
    return hash_ipv6(data + nh_off, data_end);
}
/**
 * XDP entry point of the load balancer.
 *
 * Parses the Ethernet header, unwraps up to two VLAN/QinQ tags and
 * dispatches to filter_ipv4() or filter_ipv6().
 *
 * \return the verdict of the per-protocol handler, or XDP_PASS for
 *         truncated frames and non-IP protocols
 */
int SEC("xdp") xdp_loadfilter(struct xdp_md *ctx)
{
    void *pkt_end = (void *)(long)ctx->data_end;
    void *pkt = (void *)(long)ctx->data;
    struct ethhdr *mac = pkt;
    __u64 l3_off = sizeof(*mac);
    __u16 ether_type;

    if (pkt + l3_off > pkt_end)
        return XDP_PASS;
    ether_type = mac->h_proto;

#if 0
    if (ether_type != __constant_htons(ETH_P_IP)) {
        char fmt[] = "Current proto: %u\n";
        bpf_trace_printk(fmt, sizeof(fmt), ether_type);
    }
#endif

    /* outer VLAN / QinQ tag */
    if (ether_type == __constant_htons(ETH_P_8021Q) || ether_type == __constant_htons(ETH_P_8021AD)) {
        struct vlan_hdr *tag = pkt + l3_off;

        l3_off += sizeof(struct vlan_hdr);
        if (pkt + l3_off > pkt_end)
            return XDP_PASS;
        ether_type = tag->h_vlan_encapsulated_proto;
    }
    /* inner VLAN / QinQ tag */
    if (ether_type == __constant_htons(ETH_P_8021Q) || ether_type == __constant_htons(ETH_P_8021AD)) {
        struct vlan_hdr *tag = pkt + l3_off;

        l3_off += sizeof(struct vlan_hdr);
        if (pkt + l3_off > pkt_end)
            return XDP_PASS;
        ether_type = tag->h_vlan_encapsulated_proto;
    }

    if (ether_type == __constant_htons(ETH_P_IP))
        return filter_ipv4(ctx, pkt, l3_off, pkt_end);
    if (ether_type == __constant_htons(ETH_P_IPV6))
        return filter_ipv6(ctx, pkt, l3_off, pkt_end);
    return XDP_PASS;
}
/* License string placed in the "license" section. */
char __license[] SEC("license") = "GPL";
/* Kernel version code placed in the "version" section. */
__u32 __version SEC("version") = LINUX_VERSION_CODE;