From b87b4734c6e56fa45ec612350e2aa480ed2d8dd6 Mon Sep 17 00:00:00 2001
From: Pavel Kubelun
Date: Fri, 14 Jul 2017 15:38:47 -0400
Subject: [PATCH] package: add Shortcut Forwarding Engine driver

Add the open-source Qualcomm Shortcut FE driver. The driver consists of:

- sfe (core modules): shortcut-fe (IPv4) and shortcut-fe-ipv6 - the modules
  that actually do the offloading
- shortcut-fe-cm - a simple shortcut-fe connection manager - a module that
  creates offloading rules and passes them to sfe
- fast-classifier - an advanced connection manager - a module that creates
  offloading rules and passes them to sfe. Compared to sfe-cm it performs
  additional checks, keeps connection statistics, and creates an offloading
  rule only after a connection has passed 128 packets (adjustable through
  /sys/fast_classifier/offload_at_pkts)
- userspace-fast-classifier - a fast-classifier user-space tool to manually
  add connections one wants to offload

Both sfe-cm and fast-classifier learn about connections through netfilter
and, after some checks, create an offloading rule that is passed to sfe.
sfe then marks the connection as fast-forwarded and routes the flow itself.
The kernel recognizes this mark and bypasses its network stack for that
specific connection in favor of sfe.

There are 3 patches added with this commit:

950 - adds a hook into the network stack and sets "no TCP window check" to 0
(per the sfe README).
951 - because fast-classifier also supports bridge offloading
(echo 1 > /sys/fast_classifier/skip_to_bridge_ingress) and offloaded
connections do not naturally update bridge statistics, fast-classifier
updates them itself.
952 - allows multiple listeners for conntrack events; this is required for
netlink and sfe to coexist (for example).

All of these patches are guarded by ifdefs, so the sfe code path is engaged
only when these modules are compiled. This keeps the kernel code untouched
when sfe is not compiled in.

Because sfe-cm and fast-classifier are competing modules, it is recommended
to keep only one of them loaded. If both are loaded, sfe-cm always wins the
race and handles connections first.

The driver is a merger of
https://source.codeaurora.org/quic/qsdk/oss/lklm/shortcut-fe/log/?h=release/endive_cc
with
https://chromium.googlesource.com/chromiumos/third_party/kernel/+/chromeos-3.18/drivers/net/ethernet/qualcomm/sfe/

I have also added a missing function that lets fast-classifier update UDP
statistics, and a few more small fixes. The patches are taken from QSDK and
that chromeos-3.18 repo, slightly adjusted.
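A minimal usage sketch with the fast-classifier path (module names and sysfs
paths are the ones added by this commit; on OpenWrt the kmod packages
normally autoload them, so the insmod lines are illustrative only):

  insmod shortcut-fe.ko
  insmod shortcut-fe-ipv6.ko
  insmod fast-classifier.ko
  # offload after 64 packets instead of the default 128
  echo 64 > /sys/fast_classifier/offload_at_pkts
  # let traffic destined for a bridge skip straight to bridge ingress
  echo 1 > /sys/fast_classifier/skip_to_bridge_ingress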
Signed-off-by: Pavel Kubelun --- package/kernel/shortcut-fe/Makefile | 121 + package/kernel/shortcut-fe/src/Kconfig | 14 + package/kernel/shortcut-fe/src/Makefile | 14 + package/kernel/shortcut-fe/src/README | 122 + .../kernel/shortcut-fe/src/fast-classifier.c | 1892 +++++++++ .../kernel/shortcut-fe/src/fast-classifier.h | 57 + .../shortcut-fe/src/nl_classifier_test.c | 281 ++ package/kernel/shortcut-fe/src/sfe.h | 61 + package/kernel/shortcut-fe/src/sfe_backport.h | 138 + package/kernel/shortcut-fe/src/sfe_cm.c | 1203 ++++++ package/kernel/shortcut-fe/src/sfe_cm.h | 222 ++ package/kernel/shortcut-fe/src/sfe_ipv4.c | 3369 +++++++++++++++++ package/kernel/shortcut-fe/src/sfe_ipv6.c | 3361 ++++++++++++++++ .../shortcut-fe/src/userspace_example.c | 232 ++ target/linux/generic/config-4.4 | 2 + target/linux/generic/config-4.9 | 2 + ...-linux-kernel-to-support-shortcut-fe.patch | 145 + ...ridge-APIs-needed-for-network-HW-acc.patch | 74 + ...k-events-support-multiple-registrant.patch | 318 ++ ...-linux-kernel-to-support-shortcut-fe.patch | 140 + ...ridge-APIs-needed-for-network-HW-acc.patch | 74 + ...k-events-support-multiple-registrant.patch | 345 ++ 22 files changed, 12187 insertions(+) create mode 100644 package/kernel/shortcut-fe/Makefile create mode 100644 package/kernel/shortcut-fe/src/Kconfig create mode 100644 package/kernel/shortcut-fe/src/Makefile create mode 100644 package/kernel/shortcut-fe/src/README create mode 100644 package/kernel/shortcut-fe/src/fast-classifier.c create mode 100644 package/kernel/shortcut-fe/src/fast-classifier.h create mode 100644 package/kernel/shortcut-fe/src/nl_classifier_test.c create mode 100644 package/kernel/shortcut-fe/src/sfe.h create mode 100644 package/kernel/shortcut-fe/src/sfe_backport.h create mode 100644 package/kernel/shortcut-fe/src/sfe_cm.c create mode 100644 package/kernel/shortcut-fe/src/sfe_cm.h create mode 100644 package/kernel/shortcut-fe/src/sfe_ipv4.c create mode 100644 package/kernel/shortcut-fe/src/sfe_ipv6.c create mode 100644 package/kernel/shortcut-fe/src/userspace_example.c create mode 100644 target/linux/generic/hack-4.4/950-net-patch-linux-kernel-to-support-shortcut-fe.patch create mode 100644 target/linux/generic/hack-4.4/951-bridge-Add-new-bridge-APIs-needed-for-network-HW-acc.patch create mode 100644 target/linux/generic/hack-4.4/952-net-conntrack-events-support-multiple-registrant.patch create mode 100644 target/linux/generic/hack-4.9/950-net-patch-linux-kernel-to-support-shortcut-fe.patch create mode 100644 target/linux/generic/hack-4.9/951-bridge-Add-new-bridge-APIs-needed-for-network-HW-acc.patch create mode 100644 target/linux/generic/hack-4.9/952-net-conntrack-events-support-multiple-registrant.patch diff --git a/package/kernel/shortcut-fe/Makefile b/package/kernel/shortcut-fe/Makefile new file mode 100644 index 000000000000..2849e58f6d8f --- /dev/null +++ b/package/kernel/shortcut-fe/Makefile @@ -0,0 +1,121 @@ +# +# Copyright (c) 2014 The Linux Foundation. All rights reserved. +# Permission to use, copy, modify, and/or distribute this software for +# any purpose with or without fee is hereby granted, provided that the +# above copyright notice and this permission notice appear in all copies. +# THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES +# WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF +# MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR +# ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES +# WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN +# ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT +# OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. +# +include $(TOPDIR)/rules.mk +include $(INCLUDE_DIR)/kernel.mk + +PKG_NAME:=shortcut-fe +PKG_RELEASE:=1 + +include $(INCLUDE_DIR)/package.mk + +define KernelPackage/shortcut-fe + SECTION:=kernel + CATEGORY:=Kernel modules + SUBMENU:=Network Support + DEPENDS:=@IPV6 + TITLE:=Kernel driver for SFE + FILES:=$(PKG_BUILD_DIR)/shortcut-fe.ko $(PKG_BUILD_DIR)/shortcut-fe-ipv6.ko + KCONFIG:=CONFIG_NF_CONNTRACK_EVENTS=y CONFIG_SHORTCUT_FE=y + AUTOLOAD:=$(call AutoProbe,shortcut-fe shortcut-fe-ipv6) +endef + +define KernelPackage/shortcut-fe/Description +Shortcut is an in-Linux-kernel IP packet forwarding engine. +endef + +define KernelPackage/shortcut-fe-cm + SECTION:=kernel + CATEGORY:=Kernel modules + SUBMENU:=Network Support + DEPENDS:=+kmod-ipt-conntrack +kmod-shortcut-fe + TITLE:=Kernel driver for SFE + FILES:=$(PKG_BUILD_DIR)/shortcut-fe-cm.ko + KCONFIG:=CONFIG_NF_CONNTRACK_CHAIN_EVENTS=y CONFIG_NF_CONNTRACK_MARK=y + AUTOLOAD:=$(call AutoProbe,shortcut-fe-cm) +endef + +define KernelPackage/shortcut-fe-cm/Description +Simple connection manager for the Shortcut forwarding engine. +endef + +define KernelPackage/fast-classifier + SECTION:=kernel + CATEGORY:=Kernel modules + SUBMENU:=Network Support + DEPENDS:=+kmod-ipt-conntrack +kmod-shortcut-fe + TITLE:=Kernel driver for FAST Classifier + FILES:=$(PKG_BUILD_DIR)/fast-classifier.ko + KCONFIG:=CONFIG_NF_CONNTRACK_CHAIN_EVENTS=y CONFIG_NF_CONNTRACK_MARK=y + AUTOLOAD:=$(call AutoLoad,z,fast-classifier) + PROVIDES:=$(PKG_NAME) +endef + +define KernelPackage/fast-classifier/description +FAST Classifier connection manager for Shortcut forwarding engine. +It talks to SFE to make decisions about offloading connections. 
+endef + +define Package/fast-classifier-example + TITLE:=Example user space program for fast-classifier + DEPENDS:=+libnl +kmod-fast-classifier +endef + +define Package/fast-classifier-example/description +Example user space program that communicates with fast +classifier kernel module +endef + +MAKE_OPTS:= \ + ARCH="$(LINUX_KARCH)" \ + CROSS_COMPILE="$(TARGET_CROSS)" \ + SUBDIRS="$(PKG_BUILD_DIR)" \ + EXTRA_CFLAGS="$(EXTRA_CFLAGS)" + +define Build/Compile + $(MAKE) -C "$(LINUX_DIR)" \ + $(MAKE_OPTS) \ + modules + $(if $(CONFIG_PACKAGE_fast-classifier-example),$(Build/Compile/fast-classifier-example)) +endef + +define Build/Compile/fast-classifier-example + $(TARGET_CC) -o $(PKG_BUILD_DIR)/userspace_fast_classifier \ + -I $(PKG_BUILD_DIR) \ + -I$(STAGING_DIR)/usr/include/libnl \ + -I$(STAGING_DIR)/usr/include/libnl3 \ + -lnl-genl-3 -lnl-3 \ + $(PKG_BUILD_DIR)/nl_classifier_test.c +endef + +ifneq ($(CONFIG_PACKAGE_kmod-shortcut-fe)$(CONFIG_PACKAGE_kmod-shortcut-fe-cm)$(CONFIG_PACKAGE_kmod-fast-classifier),) +define Build/InstallDev + $(INSTALL_DIR) $(1)/usr/include/shortcut-fe + $(CP) -rf $(PKG_BUILD_DIR)/sfe.h $(1)/usr/include/shortcut-fe +ifneq ($(CONFIG_PACKAGE_kmod-fast-classifier),) + $(INSTALL_DIR) $(1)/usr/include + $(CP) $(PKG_BUILD_DIR)/fast-classifier.h $(1)/usr/include/ +endif +endef +endif + +define Package/fast-classifier-example/install + $(INSTALL_DIR) $(1)/sbin + $(CP) $(PKG_BUILD_DIR)/userspace_fast_classifier $(1)/sbin/ +endef + +$(eval $(call KernelPackage,shortcut-fe)) +$(eval $(call KernelPackage,shortcut-fe-cm)) +$(eval $(call KernelPackage,fast-classifier)) +$(eval $(call BuildPackage,fast-classifier-example)) + diff --git a/package/kernel/shortcut-fe/src/Kconfig b/package/kernel/shortcut-fe/src/Kconfig new file mode 100644 index 000000000000..487f1e065e0c --- /dev/null +++ b/package/kernel/shortcut-fe/src/Kconfig @@ -0,0 +1,14 @@ +# +# Shortcut forwarding engine +# + +config SHORTCUT_FE + tristate "Shortcut Forwarding Engine" + depends on NF_CONNTRACK + ---help--- + Shortcut is a fast in-kernel packet forwarding engine. + + To compile this code as a module, choose M here: the module will be + called shortcut-fe. + + If unsure, say N. diff --git a/package/kernel/shortcut-fe/src/Makefile b/package/kernel/shortcut-fe/src/Makefile new file mode 100644 index 000000000000..88d95d232e2c --- /dev/null +++ b/package/kernel/shortcut-fe/src/Makefile @@ -0,0 +1,14 @@ +# +# Makefile for Shortcut FE. +# + +obj-m += shortcut-fe.o shortcut-fe-ipv6.o shortcut-fe-cm.o fast-classifier.o + +shortcut-fe-objs := \ + sfe_ipv4.o + +shortcut-fe-ipv6-objs := \ + sfe_ipv6.o + +shortcut-fe-cm-objs := \ + sfe_cm.o diff --git a/package/kernel/shortcut-fe/src/README b/package/kernel/shortcut-fe/src/README new file mode 100644 index 000000000000..1bf1cc255581 --- /dev/null +++ b/package/kernel/shortcut-fe/src/README @@ -0,0 +1,122 @@ +Shortcut Forwarding Engine +-------------------------- + +Welcome to "Shortcut" :-) + +Here's a quick FAQ: + + +Q) What is Shortcut? + +A) Shortcut is an in-Linux-kernel IP packet forwarding engine. It's designed +to offer very high speed IP packet forwarding based on IP connection tracking. +It's dramatically faster than the standard netfilter-based NAT forwarding path +but is designed to synchronise state back to netfilter/conntrack so that it +doesn't need to deal with all of the complexities of special cases. + + +Q) What versions of IP does it support? + +A) The current version only supports IPv4 but will be extended to support IPv6 in +the future. 
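+(In this tree, IPv6 support is already included via shortcut-fe-ipv6 /
+sfe_ipv6.c.)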
+ + +Q) What transport protocols does it support? + +A) TCP and UDP. It also knows enough about ICMP to spot ICMP error messages +related to TCP and UDP and handle things accordingly. + + +Q) Is there a design spec for this software? + +A) Not at the moment. I'll write one when I get more time. The code is +intended to be a good tutorial though - it's very heavily commented. If you +find yourself reading something and not understanding it then I take that to +mean I've probably not done a sufficently good job of explaining what it's +doing in the comments. Let me know - I will try to fix it :-) + + +Q) Why was it written? + +A) It was written as a demonstration of what can be done to provide high +performance forwarding inside the kernel. There were two initial motivations: + +1) To provide a platform to enable research into how QoS analysis systems can +offload work and avoid huge Linux overheads. + +2) To provide a tool to investigate the behaviour of various processors, SoCs +and software sets so that we can characterize and design new network processor +SoCs. + + +Q) How much faster is it than the Linux kernel forwarding path? + +A) At the time of pushing this to github it's been tested on a QCA AP135. +This has a Scorpion (QCA Scopion, not the QMC one :-)) SoC, QCA9550. The +SoC's processor is a MIPS74K running at 720 MHz and with a DDR2 memory +subsystem that offers a peak of 600 MT/s (16-bit transfers). + +Running IPv4 NAT forwarding of UDP between the board's 2 GMAC ports and +using a SmartBits 200 as a traffic generator Linux is able to forward 70k PPS. +Once the SFE code is invoked this will increase to 350k PPS! + +There's also a slightly hacky mode which causes SFE to bypass the Linux +bridge layer, but this isn't really ready for use because it doesn't have +sufficient MAC address checks or integration of statistics back to the +Ethernet bridge, but that runs at 436k PPS. + + +Q) Are there any diagnostics? + +A) Yes, this is a research tool after all! There's a complex way to do this +that's more general purpose and a simple one - here's the simple one: + + mknod /dev/sfe c 253 0 + +The file /dev/sfe is an XML-ish output and provides details of all the +network connections currently being offloaded. It also reports the numbers +of packets that took various "exception" paths within the code. In addition +it provides a summary of the number of connections, attempts to accelerate +connections, cancel accelerations, etc. It also reports the numbers of +packets that were forwarded and not forwarded by the engine and has some +stats on the effectiveness of the hashing algorithm it uses. + + +Q) How does the code interact with Linux? + +A) There are four minor patches required to make this software run with +Linux. These are currently against a 3.3.8 or 3.4.0 kernel: + +* (net/core/dev.c) adds a hook to allow packets to be extracted out. + +* (net/netfilter/nf_conntrack_proto_tcp.c) exposes a state variable inside + netfilter that's necessary to enable TCP sequence and ACK checking within + the offload path. Note that this specific patch is against the QCA QSDK + patched version of 3.3.8 - there's a slightly braindead "performance" + patch in that kernel, courtesy of the OpenWrt community that makes the + Linux forwarding path slightly faster at the expense of losing + functionality :-( + +* (net/Kconfig) adds the shortcut-fe option. + +* (net/Makefile) adds the shortcut-fe build support. 
+ +Once these are applied and the module is loaded then everything else +is automatic :-) The patches are in this git repo. + + +Q) Are any of the pieces reused from other projects? + +A) Yes! Some of the forwarding concepts are reused from the Ubicom Network +Accelerator that morphed into part of the Akronite NSS. This code has all +been substantially changed though to accomodate Linux's needs. + +There are also some pieces that I borrowed from the QCA "FastNAT" software +written by Xiaoping Fan . Xiaoping's code was the +first actual demonstration within QCA that this in-kernel concept could yield +signficant performance gains. + + +Enjoy! +Dave Hudson + diff --git a/package/kernel/shortcut-fe/src/fast-classifier.c b/package/kernel/shortcut-fe/src/fast-classifier.c new file mode 100644 index 000000000000..48a2d27f4d57 --- /dev/null +++ b/package/kernel/shortcut-fe/src/fast-classifier.c @@ -0,0 +1,1892 @@ +/* + * fast-classifier.c + * Shortcut forwarding engine connection manager. + * fast-classifier + * + * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sfe_backport.h" +#include "sfe.h" +#include "sfe_cm.h" +#include "fast-classifier.h" + +typedef enum fast_classifier_exception { + FAST_CL_EXCEPTION_PACKET_BROADCAST, + FAST_CL_EXCEPTION_PACKET_MULTICAST, + FAST_CL_EXCEPTION_NO_IIF, + FAST_CL_EXCEPTION_NO_CT, + FAST_CL_EXCEPTION_CT_NO_TRACK, + FAST_CL_EXCEPTION_CT_NO_CONFIRM, + FAST_CL_EXCEPTION_CT_IS_ALG, + FAST_CL_EXCEPTION_IS_IPV4_MCAST, + FAST_CL_EXCEPTION_IS_IPV6_MCAST, + FAST_CL_EXCEPTION_TCP_NOT_ASSURED, + FAST_CL_EXCEPTION_TCP_NOT_ESTABLISHED, + FAST_CL_EXCEPTION_UNKNOW_PROTOCOL, + FAST_CL_EXCEPTION_NO_SRC_DEV, + FAST_CL_EXCEPTION_NO_SRC_XLATE_DEV, + FAST_CL_EXCEPTION_NO_DEST_DEV, + FAST_CL_EXCEPTION_NO_DEST_XLATE_DEV, + FAST_CL_EXCEPTION_NO_BRIDGE, + FAST_CL_EXCEPTION_LOCAL_OUT, + FAST_CL_EXCEPTION_WAIT_FOR_ACCELERATION, + FAST_CL_EXCEPTION_UPDATE_PROTOCOL_FAIL, + FAST_CL_EXCEPTION_CT_DESTROY_MISS, + FAST_CL_EXCEPTION_MAX +} fast_classifier_exception_t; + +static char *fast_classifier_exception_events_string[FAST_CL_EXCEPTION_MAX] = { + "PACKET_BROADCAST", + "PACKET_MULTICAST", + "NO_IIF", + "NO_CT", + "CT_NO_TRACK", + "CT_NO_CONFIRM", + "CT_IS_ALG", + "IS_IPV4_MCAST", + "IS_IPV6_MCAST", + "TCP_NOT_ASSURED", + "TCP_NOT_ESTABLISHED", + "UNKNOW_PROTOCOL", + "NO_SRC_DEV", + "NO_SRC_XLATE_DEV", + "NO_DEST_DEV", + "NO_DEST_XLATE_DEV", + "NO_BRIDGE", + "LOCAL_OUT", + "WAIT_FOR_ACCELERATION", + "UPDATE_PROTOCOL_FAIL", + "CT_DESTROY_MISS", +}; + +/* + * Per-module structure. 
+ */ +struct fast_classifier { + spinlock_t lock; /* Lock for SMP correctness */ + + /* + * Control state. + */ + struct kobject *sys_fast_classifier; /* sysfs linkage */ + + /* + * Callback notifiers. + */ + struct notifier_block dev_notifier; /* Device notifier */ + struct notifier_block inet_notifier; /* IPv4 notifier */ + struct notifier_block inet6_notifier; /* IPv6 notifier */ + u32 exceptions[FAST_CL_EXCEPTION_MAX]; +}; + +static struct fast_classifier __fsc; + +static struct nla_policy fast_classifier_genl_policy[FAST_CLASSIFIER_A_MAX + 1] = { + [FAST_CLASSIFIER_A_TUPLE] = { + .type = NLA_UNSPEC, + .len = sizeof(struct fast_classifier_tuple) + }, +}; + +static struct genl_multicast_group fast_classifier_genl_mcgrp[] = { + { + .name = FAST_CLASSIFIER_GENL_MCGRP, + }, +}; + +static struct genl_family fast_classifier_gnl_family = { + .id = GENL_ID_GENERATE, + .hdrsize = FAST_CLASSIFIER_GENL_HDRSIZE, + .name = FAST_CLASSIFIER_GENL_NAME, + .version = FAST_CLASSIFIER_GENL_VERSION, + .maxattr = FAST_CLASSIFIER_A_MAX, +}; + +static int fast_classifier_offload_genl_msg(struct sk_buff *skb, struct genl_info *info); +static int fast_classifier_nl_genl_msg_DUMP(struct sk_buff *skb, struct netlink_callback *cb); + +static struct genl_ops fast_classifier_gnl_ops[] = { + { + .cmd = FAST_CLASSIFIER_C_OFFLOAD, + .flags = 0, + .policy = fast_classifier_genl_policy, + .doit = fast_classifier_offload_genl_msg, + .dumpit = NULL, + }, + { + .cmd = FAST_CLASSIFIER_C_OFFLOADED, + .flags = 0, + .policy = fast_classifier_genl_policy, + .doit = NULL, + .dumpit = fast_classifier_nl_genl_msg_DUMP, + }, + { + .cmd = FAST_CLASSIFIER_C_DONE, + .flags = 0, + .policy = fast_classifier_genl_policy, + .doit = NULL, + .dumpit = fast_classifier_nl_genl_msg_DUMP, + }, +}; + +static atomic_t offload_msgs = ATOMIC_INIT(0); +static atomic_t offload_no_match_msgs = ATOMIC_INIT(0); +static atomic_t offloaded_msgs = ATOMIC_INIT(0); +static atomic_t done_msgs = ATOMIC_INIT(0); + +static atomic_t offloaded_fail_msgs = ATOMIC_INIT(0); +static atomic_t done_fail_msgs = ATOMIC_INIT(0); + +/* + * Accelerate incoming packets destined for bridge device + * If a incoming packet is ultimatly destined for + * a bridge device we will first see the packet coming + * from the phyiscal device, we can skip straight to + * processing the packet like it came from the bridge + * for some more performance gains + * + * This only works when the hook is above the bridge. We + * only implement ingress for now, because for egress we + * want to have the bridge devices qdiscs be used. + */ +static bool skip_to_bridge_ingress; + +/* + * fast_classifier_incr_exceptions() + * increase an exception counter. + */ +static inline void fast_classifier_incr_exceptions(fast_classifier_exception_t except) +{ + struct fast_classifier *sc = &__fsc; + + spin_lock_bh(&sc->lock); + sc->exceptions[except]++; + spin_unlock_bh(&sc->lock); +} + +/* + * fast_classifier_recv() + * Handle packet receives. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +static int fast_classifier_recv(struct sk_buff *skb) +{ + struct net_device *dev; + struct net_device *master_dev = NULL; + int ret = 0; + + /* + * We know that for the vast majority of packets we need the transport + * layer header so we may as well start to fetch it now! 
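+ * (The prefetch below warms the cache line 32 bytes into the packet; for a
+ * typical IPv4 header without options that covers the start of the L4 header.)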
+ */ + prefetch(skb->data + 32); + barrier(); + + dev = skb->dev; + + /* + * Process packet like it arrived on the bridge device + */ + if (skip_to_bridge_ingress && + (dev->priv_flags & IFF_BRIDGE_PORT)) { + master_dev = sfe_dev_get_master(dev); + if (!master_dev) { + DEBUG_WARN("master dev is NULL %s\n"); + goto rx_exit; + } + dev = master_dev; + } + +#ifdef CONFIG_NET_CLS_ACT + /* + * If ingress Qdisc configured, and packet not processed by ingress Qdisc yet + * We cannot accelerate this packet. + */ + if (dev->ingress_queue && !(skb->tc_verd & TC_NCLS)) { + goto rx_exit; + } +#endif + + /* + * We're only interested in IPv4 and IPv6 packets. + */ + if (likely(htons(ETH_P_IP) == skb->protocol)) { + struct in_device *in_dev; + + /* + * Does our input device support IP processing? + */ + in_dev = (struct in_device *)dev->ip_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IP processing for device: %s\n", dev->name); + goto rx_exit; + } + + /* + * Does it have an IP address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(!in_dev->ifa_list)) { + DEBUG_TRACE("no IP address for device: %s\n", dev->name); + goto rx_exit; + } + + ret = sfe_ipv4_recv(dev, skb); + + } else if (likely(htons(ETH_P_IPV6) == skb->protocol)) { + struct inet6_dev *in_dev; + + /* + * Does our input device support IPv6 processing? + */ + in_dev = (struct inet6_dev *)dev->ip6_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name); + goto rx_exit; + } + + /* + * Does it have an IPv6 address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(list_empty(&in_dev->addr_list))) { + DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name); + goto rx_exit; + } + + ret = sfe_ipv6_recv(dev, skb); + + } else { + DEBUG_TRACE("not IP packet\n"); + } + +rx_exit: + if (master_dev) { + dev_put(master_dev); + } + + return ret; +} + +/* + * fast_classifier_find_dev_and_mac_addr() + * Find the device and MAC address for a given IPv4 address. + * + * Returns true if we find the device and MAC address, otherwise false. + * + * We look up the rtable entry for the address and, from its neighbour + * structure, obtain the hardware address. This means this function also + * works if the neighbours are routers too. + */ +static bool fast_classifier_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, bool is_v4) +{ + struct neighbour *neigh; + struct rtable *rt; + struct rt6_info *rt6; + struct dst_entry *dst; + struct net_device *mac_dev; + + /* + * Look up the rtable entry for the IP address then get the hardware + * address from its neighbour structure. This means this works when the + * neighbours are routers too. 
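+ * Note that both the IPv4 and IPv6 lookups below are done in init_net, so
+ * only the initial network namespace is handled here.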
+ */ + if (is_v4) { + rt = ip_route_output(&init_net, addr->ip, 0, 0, 0); + if (unlikely(IS_ERR(rt))) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt; + } else { + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0); + if (!rt6) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt6; + } + + rcu_read_lock(); + neigh = dst_neigh_lookup(dst, addr); + if (unlikely(!neigh)) { + rcu_read_unlock(); + dst_release(dst); + goto ret_fail; + } + + if (unlikely(!(neigh->nud_state & NUD_VALID))) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + mac_dev = neigh->dev; + if (!mac_dev) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len); + + dev_hold(mac_dev); + *dev = mac_dev; + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + + return true; + +ret_fail: + DEBUG_TRACE("failed to find MAC address for IP: %pIS\n", addr); + + return false; +} + +static DEFINE_SPINLOCK(sfe_connections_lock); + +struct sfe_connection { + struct hlist_node hl; + struct sfe_connection_create *sic; + struct nf_conn *ct; + int hits; + int offload_permit; + int offloaded; + bool is_v4; + unsigned char smac[ETH_ALEN]; + unsigned char dmac[ETH_ALEN]; +}; + +static int sfe_connections_size; + +#define FC_CONN_HASH_ORDER 13 +static DEFINE_HASHTABLE(fc_conn_ht, FC_CONN_HASH_ORDER); + +static u32 fc_conn_hash(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, bool is_v4) +{ + u32 idx, cnt = ((is_v4 ? sizeof(saddr->ip) : sizeof(saddr->ip6))/sizeof(u32)); + u32 hash = 0; + + for (idx = 0; idx < cnt; idx++) { + hash ^= ((u32 *)saddr)[idx] ^ ((u32 *)daddr)[idx]; + } + + return hash ^ (sport | (dport << 16)); +} + +/* + * fast_classifier_update_protocol() + * Update sfe_ipv4_create struct with new protocol information before we offload + */ +static int fast_classifier_update_protocol(struct sfe_connection_create *p_sic, struct nf_conn *ct) +{ + switch (p_sic->protocol) { + case IPPROTO_TCP: + p_sic->src_td_window_scale = ct->proto.tcp.seen[0].td_scale; + p_sic->src_td_max_window = ct->proto.tcp.seen[0].td_maxwin; + p_sic->src_td_end = ct->proto.tcp.seen[0].td_end; + p_sic->src_td_max_end = ct->proto.tcp.seen[0].td_maxend; + p_sic->dest_td_window_scale = ct->proto.tcp.seen[1].td_scale; + p_sic->dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin; + p_sic->dest_td_end = ct->proto.tcp.seen[1].td_end; + p_sic->dest_td_max_end = ct->proto.tcp.seen[1].td_maxend; + + if (nf_ct_tcp_no_window_check + || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL) + || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) { + p_sic->flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + } + + /* + * If the connection is shutting down do not manage it. + * state can not be SYN_SENT, SYN_RECV because connection is assured + * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE. 
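+ * Returning 0 below makes the caller count an UPDATE_PROTOCOL_FAIL
+ * exception and skip rule creation for this connection.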
+ */ + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) { + spin_unlock_bh(&ct->lock); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_TCP_NOT_ESTABLISHED); + DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n", + ct->proto.tcp.state, &p_sic->src_ip, ntohs(p_sic->src_port), + &p_sic->dest_ip, ntohs(p_sic->dest_port)); + return 0; + } + spin_unlock_bh(&ct->lock); + break; + + case IPPROTO_UDP: + break; + + default: + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", p_sic->protocol); + return 0; + } + + return 1; +} + +/* fast_classifier_send_genl_msg() + * Function to send a generic netlink message + */ +static void fast_classifier_send_genl_msg(int msg, struct fast_classifier_tuple *fc_msg) +{ + struct sk_buff *skb; + int rc; + int buf_len; + int total_len; + void *msg_head; + + /* + * Calculate our packet payload size. + * Start with our family header. + */ + buf_len = fast_classifier_gnl_family.hdrsize; + + /* + * Add the nla_total_size of each attribute we're going to nla_put(). + */ + buf_len += nla_total_size(sizeof(*fc_msg)); + + /* + * Lastly we need to add space for the NL message header since + * genlmsg_new only accounts for the GENL header and not the + * outer NL header. To do this, we use a NL helper function which + * calculates the total size of a netlink message given a payload size. + * Note this value does not include the GENL header, but that's + * added automatically by genlmsg_new. + */ + total_len = nlmsg_total_size(buf_len); + skb = genlmsg_new(total_len, GFP_ATOMIC); + if (!skb) + return; + + msg_head = genlmsg_put(skb, 0, 0, &fast_classifier_gnl_family, 0, msg); + if (!msg_head) { + nlmsg_free(skb); + return; + } + + rc = nla_put(skb, FAST_CLASSIFIER_A_TUPLE, sizeof(struct fast_classifier_tuple), fc_msg); + if (rc != 0) { + genlmsg_cancel(skb, msg_head); + nlmsg_free(skb); + return; + } + + genlmsg_end(skb, msg_head); + if (rc < 0) { + genlmsg_cancel(skb, msg_head); + nlmsg_free(skb); + return; + } + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) + rc = genlmsg_multicast(&fast_classifier_gnl_family, skb, 0, 0, GFP_ATOMIC); +#else + rc = genlmsg_multicast(skb, 0, fast_classifier_genl_mcgrp[0].id, GFP_ATOMIC); +#endif + switch (msg) { + case FAST_CLASSIFIER_C_OFFLOADED: + atomic_inc(&offloaded_msgs); + if (rc != 0) + atomic_inc(&offloaded_fail_msgs); + break; + case FAST_CLASSIFIER_C_DONE: + atomic_inc(&done_msgs); + if (rc != 0) + atomic_inc(&done_fail_msgs); + break; + default: + DEBUG_ERROR("fast-classifer: Unknown message type sent!\n"); + break; + } + + DEBUG_TRACE("Notify NL message %d ", msg); + DEBUG_TRACE("sip=%pIS dip=%pIS ", &fc_msg->src_saddr, &fc_msg->dst_saddr); + DEBUG_TRACE("protocol=%d sport=%d dport=%d smac=%pM dmac=%pM\n", + fc_msg->proto, fc_msg->sport, fc_msg->dport, fc_msg->smac, fc_msg->dmac); +} + +/* + * fast_classifier_find_conn() + * find a connection object in the hash table + * @pre the sfe_connection_lock must be held before calling this function + */ +static struct sfe_connection * +fast_classifier_find_conn(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, + unsigned char proto, bool is_v4) +{ + struct sfe_connection_create *p_sic; + struct sfe_connection *conn; + u32 key; + + key = fc_conn_hash(saddr, daddr, sport, dport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if 
(p_sic->protocol == proto && + p_sic->src_port == sport && + p_sic->dest_port == dport && + sfe_addr_equal(&p_sic->src_ip, saddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip, daddr, is_v4)) { + return conn; + } + } + + DEBUG_TRACE("connection not found\n"); + return NULL; +} + +/* + * fast_classifier_sb_find_conn() + * find a connection object in the hash table according to information of packet + * if not found, reverse the tuple and try again. + * @pre the sfe_connection_lock must be held before calling this function + */ +static struct sfe_connection * +fast_classifier_sb_find_conn(sfe_ip_addr_t *saddr, sfe_ip_addr_t *daddr, + unsigned short sport, unsigned short dport, + unsigned char proto, bool is_v4) +{ + struct sfe_connection_create *p_sic; + struct sfe_connection *conn; + u32 key; + + key = fc_conn_hash(saddr, daddr, sport, dport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == sport && + p_sic->dest_port_xlate == dport && + sfe_addr_equal(&p_sic->src_ip, saddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip_xlate, daddr, is_v4)) { + return conn; + } + } + + /* + * Reverse the tuple and try again + */ + key = fc_conn_hash(daddr, saddr, dport, sport, is_v4); + + sfe_hash_for_each_possible(fc_conn_ht, conn, hl, key) { + if (conn->is_v4 != is_v4) { + continue; + } + + p_sic = conn->sic; + + if (p_sic->protocol == proto && + p_sic->src_port == dport && + p_sic->dest_port_xlate == sport && + sfe_addr_equal(&p_sic->src_ip, daddr, is_v4) && + sfe_addr_equal(&p_sic->dest_ip_xlate, saddr, is_v4)) { + return conn; + } + } + + DEBUG_TRACE("connection not found\n"); + return NULL; +} + +/* + * fast_classifier_add_conn() + * add a connection object in the hash table if no duplicate + * @conn connection to add + * @return conn if successful, NULL if duplicate + */ +static struct sfe_connection * +fast_classifier_add_conn(struct sfe_connection *conn) +{ + struct sfe_connection_create *sic = conn->sic; + u32 key; + + spin_lock_bh(&sfe_connections_lock); + if (fast_classifier_find_conn(&sic->src_ip, &sic->dest_ip, sic->src_port, + sic->dest_port, sic->protocol, conn->is_v4)) { + spin_unlock_bh(&sfe_connections_lock); + return NULL; + } + + key = fc_conn_hash(&sic->src_ip, &sic->dest_ip, + sic->src_port, sic->dest_port, conn->is_v4); + + hash_add(fc_conn_ht, &conn->hl, key); + sfe_connections_size++; + spin_unlock_bh(&sfe_connections_lock); + + DEBUG_TRACE(" -> adding item to sfe_connections, new size: %d\n", sfe_connections_size); + + DEBUG_TRACE("new offloadable: key: %u proto: %d src_ip: %pIS dst_ip: %pIS, src_port: %d, dst_port: %d\n", + key, sic->protocol, &(sic->src_ip), &(sic->dest_ip), sic->src_port, sic->dest_port); + + return conn; +} + +/* + * fast_classifier_offload_genl_msg() + * Called from user space to offload a connection + */ +static int +fast_classifier_offload_genl_msg(struct sk_buff *skb, struct genl_info *info) +{ + struct nlattr *na; + struct fast_classifier_tuple *fc_msg; + struct sfe_connection *conn; + + na = info->attrs[FAST_CLASSIFIER_A_TUPLE]; + fc_msg = nla_data(na); + + DEBUG_TRACE("want to offload: %d-%d, %pIS, %pIS, %d, %d SMAC=%pM DMAC=%pM\n", + fc_msg->ethertype, + fc_msg->proto, + &fc_msg->src_saddr, + &fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->smac, + fc_msg->dmac); + + spin_lock_bh(&sfe_connections_lock); + conn = fast_classifier_sb_find_conn((sfe_ip_addr_t *)&fc_msg->src_saddr, + (sfe_ip_addr_t 
*)&fc_msg->dst_saddr, + fc_msg->sport, + fc_msg->dport, + fc_msg->proto, + (fc_msg->ethertype == AF_INET)); + if (!conn) { + spin_unlock_bh(&sfe_connections_lock); + DEBUG_TRACE("REQUEST OFFLOAD NO MATCH\n"); + atomic_inc(&offload_no_match_msgs); + return 0; + } + + conn->offload_permit = 1; + spin_unlock_bh(&sfe_connections_lock); + atomic_inc(&offload_msgs); + + DEBUG_TRACE("INFO: calling sfe rule creation!\n"); + return 0; +} + +/* + * fast_classifier_nl_genl_msg_DUMP() + * ignore fast_classifier_messages OFFLOADED and DONE + */ +static int fast_classifier_nl_genl_msg_DUMP(struct sk_buff *skb, + struct netlink_callback *cb) +{ + return 0; +} + +/* auto offload connection once we have this many packets*/ +static int offload_at_pkts = 128; + +/* + * fast_classifier_post_routing() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +static unsigned int fast_classifier_post_routing(struct sk_buff *skb, bool is_v4) +{ + int ret; + struct sfe_connection_create sic; + struct sfe_connection_create *p_sic; + struct net_device *in; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct net_device *dev; + struct net_device *src_dev; + struct net_device *dest_dev; + struct net_device *src_br_dev = NULL; + struct net_device *dest_br_dev = NULL; + struct nf_conntrack_tuple orig_tuple; + struct nf_conntrack_tuple reply_tuple; + struct sfe_connection *conn; + SFE_NF_CONN_ACCT(acct); + + /* + * Don't process broadcast or multicast packets. + */ + if (unlikely(skb->pkt_type == PACKET_BROADCAST)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_PACKET_BROADCAST); + DEBUG_TRACE("broadcast, ignoring\n"); + return NF_ACCEPT; + } + if (unlikely(skb->pkt_type == PACKET_MULTICAST)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_PACKET_MULTICAST); + DEBUG_TRACE("multicast, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process packets that are not being forwarded. + */ + in = dev_get_by_index(&init_net, skb->skb_iif); + if (!in) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_IIF); + DEBUG_TRACE("packet not forwarding\n"); + return NF_ACCEPT; + } + + dev_put(in); + + /* + * Don't process packets that aren't being tracked by conntrack. + */ + ct = nf_ct_get(skb, &ctinfo); + if (unlikely(!ct)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_CT); + DEBUG_TRACE("no conntrack connection, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process untracked connections. + */ + if (unlikely(nf_ct_is_untracked(ct))) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_NO_TRACK); + DEBUG_TRACE("untracked connection\n"); + return NF_ACCEPT; + } + + /* + * Unconfirmed connection may be dropped by Linux at the final step, + * So we don't process unconfirmed connections. + */ + if (!nf_ct_is_confirmed(ct)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_NO_CONFIRM); + DEBUG_TRACE("unconfirmed connection\n"); + return NF_ACCEPT; + } + + /* + * Don't process connections that require support from a 'helper' (typically a NAT ALG). + */ + if (unlikely(nfct_help(ct))) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_IS_ALG); + DEBUG_TRACE("connection has helper\n"); + return NF_ACCEPT; + } + + /* + * Check if the acceleration of a flow could be rejected quickly. 
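+ * (The check below returns NF_ACCEPT straight away unless the flow is
+ * still under 256 packets or its packet count is an exact multiple of 256.)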
+ */ + acct = nf_conn_acct_find(ct); + if (acct) { + long long packets = atomic64_read(&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets); + if ((packets > 0xff) && (packets & 0xff)) { + /* + * Connection hits slow path at least 256 times, so it must be not able to accelerate. + * But we also give it a chance to walk through ECM every 256 packets + */ + return NF_ACCEPT; + } + } + + memset(&sic, 0, sizeof(sic)); + + /* + * Look up the details of our connection in conntrack. + * + * Note that the data we get from conntrack is for the "ORIGINAL" direction + * but our packet may actually be in the "REPLY" direction. + */ + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + sic.protocol = (s32)orig_tuple.dst.protonum; + + sic.flags = 0; + + /* + * Get addressing information, non-NAT first + */ + if (is_v4) { + u32 dscp; + + sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_IS_IPV4_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip; + sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip; + + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } else { + u32 dscp; + + sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) || + ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_IS_IPV6_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6); + sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6); + + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } + + switch (sic.protocol) { + case IPPROTO_TCP: + sic.src_port = orig_tuple.src.u.tcp.port; + sic.dest_port = orig_tuple.dst.u.tcp.port; + sic.src_port_xlate = reply_tuple.dst.u.tcp.port; + sic.dest_port_xlate = reply_tuple.src.u.tcp.port; + + /* + * Don't try to manage a non-established connection. 
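+ * (conntrack only sets IPS_ASSURED on a TCP flow once it is fully
+ * established, so half-open connections are never offloaded.)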
+ */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_TCP_NOT_ASSURED); + DEBUG_TRACE("non-established connection\n"); + return NF_ACCEPT; + } + + break; + + case IPPROTO_UDP: + sic.src_port = orig_tuple.src.u.udp.port; + sic.dest_port = orig_tuple.dst.u.udp.port; + sic.src_port_xlate = reply_tuple.dst.u.udp.port; + sic.dest_port_xlate = reply_tuple.src.u.udp.port; + break; + + default: + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", sic.protocol); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + sic.original_accel = 1; + sic.reply_accel = 1; +#endif + + /* + * Get QoS information + */ + if (skb->priority) { + sic.dest_priority = skb->priority; + sic.src_priority = sic.dest_priority; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + DEBUG_TRACE("POST_ROUTE: checking new connection: %d src_ip: %pIS dst_ip: %pIS, src_port: %d, dst_port: %d\n", + sic.protocol, &sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port); + + /* + * If we already have this connection in our list, skip it + * XXX: this may need to be optimized + */ + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&sic.src_ip, &sic.dest_ip, sic.src_port, sic.dest_port, sic.protocol, is_v4); + if (conn) { + conn->hits++; + + if (!conn->offloaded) { + if (conn->offload_permit || conn->hits >= offload_at_pkts) { + DEBUG_TRACE("OFFLOADING CONNECTION, TOO MANY HITS\n"); + + if (fast_classifier_update_protocol(conn->sic, conn->ct) == 0) { + spin_unlock_bh(&sfe_connections_lock); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_UPDATE_PROTOCOL_FAIL); + DEBUG_TRACE("UNKNOWN PROTOCOL OR CONNECTION CLOSING, SKIPPING\n"); + return NF_ACCEPT; + } + + DEBUG_TRACE("INFO: calling sfe rule creation!\n"); + spin_unlock_bh(&sfe_connections_lock); + + ret = is_v4 ? sfe_ipv4_create_rule(conn->sic) : sfe_ipv6_create_rule(conn->sic); + if ((ret == 0) || (ret == -EADDRINUSE)) { + struct fast_classifier_tuple fc_msg; + + if (is_v4) { + fc_msg.ethertype = AF_INET; + fc_msg.src_saddr.in = *((struct in_addr *)&sic.src_ip); + fc_msg.dst_saddr.in = *((struct in_addr *)&sic.dest_ip_xlate); + } else { + fc_msg.ethertype = AF_INET6; + fc_msg.src_saddr.in6 = *((struct in6_addr *)&sic.src_ip); + fc_msg.dst_saddr.in6 = *((struct in6_addr *)&sic.dest_ip_xlate); + } + + fc_msg.proto = sic.protocol; + fc_msg.sport = sic.src_port; + fc_msg.dport = sic.dest_port_xlate; + memcpy(fc_msg.smac, conn->smac, ETH_ALEN); + memcpy(fc_msg.dmac, conn->dmac, ETH_ALEN); + fast_classifier_send_genl_msg(FAST_CLASSIFIER_C_OFFLOADED, &fc_msg); + conn->offloaded = 1; + } + + return NF_ACCEPT; + } + } + + spin_unlock_bh(&sfe_connections_lock); + if (conn->offloaded) { + is_v4 ? sfe_ipv4_update_rule(conn->sic) : sfe_ipv6_update_rule(conn->sic); + } + + DEBUG_TRACE("FOUND, SKIPPING\n"); + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_WAIT_FOR_ACCELERATION); + return NF_ACCEPT; + } + + spin_unlock_bh(&sfe_connections_lock); + + /* + * Get the net device and MAC addresses that correspond to the various source and + * destination host addresses. 
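+ * Four lookups follow: source, translated source, destination and
+ * translated destination. Only the references for the source (src_dev)
+ * and translated destination (dest_dev) are kept; the two intermediate
+ * lookups are released immediately.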
+ */ + if (!fast_classifier_find_dev_and_mac_addr(&sic.src_ip, &src_dev, sic.src_mac, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_SRC_DEV); + return NF_ACCEPT; + } + + if (!fast_classifier_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_SRC_XLATE_DEV); + goto done1; + } + + dev_put(dev); + + if (!fast_classifier_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_DEST_DEV); + goto done1; + } + + dev_put(dev); + + if (!fast_classifier_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev, sic.dest_mac_xlate, is_v4)) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_DEST_XLATE_DEV); + goto done1; + } + + /* + * Our devices may actually be part of a bridge interface. If that's + * the case then find the bridge interface instead. + */ + if (src_dev->priv_flags & IFF_BRIDGE_PORT) { + src_br_dev = sfe_dev_get_master(src_dev); + if (!src_br_dev) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", src_dev->name); + goto done2; + } + + src_dev = src_br_dev; + } + + if (dest_dev->priv_flags & IFF_BRIDGE_PORT) { + dest_br_dev = sfe_dev_get_master(dest_dev); + if (!dest_br_dev) { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name); + goto done3; + } + + dest_dev = dest_br_dev; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = src_dev->mtu; + sic.dest_mtu = dest_dev->mtu; + + if (skb->mark) { + DEBUG_TRACE("SKB MARK NON ZERO %x\n", skb->mark); + } + sic.mark = skb->mark; + + conn = kmalloc(sizeof(*conn), GFP_ATOMIC); + if (!conn) { + printk(KERN_CRIT "ERROR: no memory for sfe\n"); + goto done3; + } + conn->hits = 0; + conn->offload_permit = 0; + conn->offloaded = 0; + conn->is_v4 = is_v4; + DEBUG_TRACE("Source MAC=%pM\n", sic.src_mac); + memcpy(conn->smac, sic.src_mac, ETH_ALEN); + memcpy(conn->dmac, sic.dest_mac_xlate, ETH_ALEN); + + p_sic = kmalloc(sizeof(*p_sic), GFP_ATOMIC); + if (!p_sic) { + printk(KERN_CRIT "ERROR: no memory for sfe\n"); + kfree(conn); + goto done3; + } + + memcpy(p_sic, &sic, sizeof(sic)); + conn->sic = p_sic; + conn->ct = ct; + + if (!fast_classifier_add_conn(conn)) { + kfree(conn->sic); + kfree(conn); + } + + /* + * If we had bridge ports then release them too. 
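+ * The done labels below fall through, so every device reference taken
+ * above is released exactly once on each exit path.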
+ */ + if (dest_br_dev) { + dev_put(dest_br_dev); + } + +done3: + if (src_br_dev) { + dev_put(src_br_dev); + } + +done2: + dev_put(dest_dev); + +done1: + dev_put(src_dev); + + return NF_ACCEPT; +} + +/* + * fast_classifier_ipv4_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +fast_classifier_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return fast_classifier_post_routing(skb, true); +} + +/* + * fast_classifier_ipv6_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +fast_classifier_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return fast_classifier_post_routing(skb, false); +} + +/* + * fast_classifier_update_mark() + * updates the mark for a fast-classifier connection + */ +static void fast_classifier_update_mark(struct sfe_connection_mark *mark, bool is_v4) +{ + struct sfe_connection *conn; + + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&mark->src_ip, &mark->dest_ip, + mark->src_port, mark->dest_port, + mark->protocol, is_v4); + if (conn) { + conn->sic->mark = mark->mark; + } + + spin_unlock_bh(&sfe_connections_lock); +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +/* + * fast_classifier_conntrack_event() + * Callback event invoked when a conntrack connection's state changes. + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int fast_classifier_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +#else +static int fast_classifier_conntrack_event(unsigned int events, struct nf_ct_event *item) +#endif +{ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + struct nf_ct_event *item = ptr; +#endif + struct sfe_connection_destroy sid; + struct nf_conn *ct = item->ct; + struct nf_conntrack_tuple orig_tuple; + struct sfe_connection *conn; + struct fast_classifier_tuple fc_msg; + int offloaded = 0; + bool is_v4; + + /* + * If we don't have a conntrack entry then we're done. + */ + if (unlikely(!ct)) { + DEBUG_WARN("no ct in conntrack event callback\n"); + return NOTIFY_DONE; + } + + /* + * If this is an untracked connection then we can't have any state either. + */ + if (unlikely(nf_ct_is_untracked(ct))) { + DEBUG_TRACE("ignoring untracked conn\n"); + return NOTIFY_DONE; + } + + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + sid.protocol = (s32)orig_tuple.dst.protonum; + + /* + * Extract information from the conntrack connection. We're only interested + * in nominal connection information (i.e. we're ignoring any NAT information). 
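+ * Everything below therefore keys off the ORIGINAL-direction tuple only.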
+ */ + if (likely(nf_ct_l3num(ct) == AF_INET)) { + sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + is_v4 = true; + } else if (likely(nf_ct_l3num(ct) == AF_INET6)) { + sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + is_v4 = false; + } else { + DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n"); + return NOTIFY_DONE; + } + + switch (sid.protocol) { + case IPPROTO_TCP: + sid.src_port = orig_tuple.src.u.tcp.port; + sid.dest_port = orig_tuple.dst.u.tcp.port; + break; + + case IPPROTO_UDP: + sid.src_port = orig_tuple.src.u.udp.port; + sid.dest_port = orig_tuple.dst.u.udp.port; + break; + + default: + DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol); + return NOTIFY_DONE; + } + + /* + * Check for an updated mark + */ + if ((events & (1 << IPCT_MARK)) && (ct->mark != 0)) { + struct sfe_connection_mark mark; + + mark.protocol = sid.protocol; + mark.src_ip = sid.src_ip; + mark.dest_ip = sid.dest_ip; + mark.src_port = sid.src_port; + mark.dest_port = sid.dest_port; + mark.mark = ct->mark; + + is_v4 ? sfe_ipv4_mark_rule(&mark) : sfe_ipv6_mark_rule(&mark); + fast_classifier_update_mark(&mark, is_v4); + } + + /* + * We're only interested in destroy events at this point + */ + if (unlikely(!(events & (1 << IPCT_DESTROY)))) { + DEBUG_TRACE("ignoring non-destroy event\n"); + return NOTIFY_DONE; + } + + DEBUG_TRACE("Try to clean up: proto: %d src_ip: %pIS dst_ip: %pIS, src_port: %d, dst_port: %d\n", + sid.protocol, &sid.src_ip, &sid.dest_ip, sid.src_port, sid.dest_port); + + spin_lock_bh(&sfe_connections_lock); + + conn = fast_classifier_find_conn(&sid.src_ip, &sid.dest_ip, sid.src_port, sid.dest_port, sid.protocol, is_v4); + if (conn && conn->offloaded) { + if (is_v4) { + fc_msg.ethertype = AF_INET; + fc_msg.src_saddr.in = *((struct in_addr *)&conn->sic->src_ip); + fc_msg.dst_saddr.in = *((struct in_addr *)&conn->sic->dest_ip_xlate); + } else { + fc_msg.ethertype = AF_INET6; + fc_msg.src_saddr.in6 = *((struct in6_addr *)&conn->sic->src_ip); + fc_msg.dst_saddr.in6 = *((struct in6_addr *)&conn->sic->dest_ip_xlate); + } + + fc_msg.proto = conn->sic->protocol; + fc_msg.sport = conn->sic->src_port; + fc_msg.dport = conn->sic->dest_port_xlate; + memcpy(fc_msg.smac, conn->smac, ETH_ALEN); + memcpy(fc_msg.dmac, conn->dmac, ETH_ALEN); + offloaded = 1; + } + + if (conn) { + DEBUG_TRACE("Free connection\n"); + + hash_del(&conn->hl); + sfe_connections_size--; + kfree(conn->sic); + kfree(conn); + } else { + fast_classifier_incr_exceptions(FAST_CL_EXCEPTION_CT_DESTROY_MISS); + } + + spin_unlock_bh(&sfe_connections_lock); + + is_v4 ? sfe_ipv4_destroy_rule(&sid) : sfe_ipv6_destroy_rule(&sid); + + if (offloaded) { + fast_classifier_send_genl_msg(FAST_CLASSIFIER_C_DONE, &fc_msg); + } + + return NOTIFY_DONE; +} + +/* + * Netfilter conntrack event system to monitor connection tracking changes + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static struct notifier_block fast_classifier_conntrack_notifier = { + .notifier_call = fast_classifier_conntrack_event, +}; +#else +static struct nf_ct_event_notifier fast_classifier_conntrack_notifier = { + .fcn = fast_classifier_conntrack_event, +}; +#endif +#endif + +/* + * Structure to establish a hook into the post routing netfilter point - this + * will pick up local outbound and packets going from one interface to another. + * + * Note: see include/linux/netfilter_ipv4.h for info related to priority levels. 
+ * We want to examine packets after NAT translation and any ALG processing. + */ +static struct nf_hook_ops fast_classifier_ops_post_routing[] __read_mostly = { + SFE_IPV4_NF_POST_ROUTING_HOOK(__fast_classifier_ipv4_post_routing_hook), + SFE_IPV6_NF_POST_ROUTING_HOOK(__fast_classifier_ipv6_post_routing_hook), +}; + +/* + * fast_classifier_sync_rule() + * Synchronize a connection's state. + */ +static void fast_classifier_sync_rule(struct sfe_connection_sync *sis) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + SFE_NF_CONN_ACCT(acct); + + /* + * Create a tuple so as to be able to look up a connection + */ + memset(&tuple, 0, sizeof(tuple)); + tuple.src.u.all = (__be16)sis->src_port; + tuple.dst.dir = IP_CT_DIR_ORIGINAL; + tuple.dst.protonum = (u8)sis->protocol; + tuple.dst.u.all = (__be16)sis->dest_port; + + if (sis->is_v6) { + tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6); + tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6); + tuple.src.l3num = AF_INET6; + + DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all)); + } else { + tuple.src.u3.ip = sis->src_ip.ip; + tuple.dst.u3.ip = sis->dest_ip.ip; + tuple.src.l3num = AF_INET; + + DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all)); + } + + /* + * Update packet count for ingress on bridge device + */ + if (skip_to_bridge_ingress) { + struct rtnl_link_stats64 nlstats; + nlstats.tx_packets = 0; + nlstats.tx_bytes = 0; + + if (sis->src_dev && IFF_EBRIDGE && + (sis->src_new_packet_count || sis->src_new_byte_count)) { + nlstats.rx_packets = sis->src_new_packet_count; + nlstats.rx_bytes = sis->src_new_byte_count; + spin_lock_bh(&sfe_connections_lock); + br_dev_update_stats(sis->src_dev, &nlstats); + spin_unlock_bh(&sfe_connections_lock); + } + if (sis->dest_dev && IFF_EBRIDGE && + (sis->dest_new_packet_count || sis->dest_new_byte_count)) { + nlstats.rx_packets = sis->dest_new_packet_count; + nlstats.rx_bytes = sis->dest_new_byte_count; + spin_lock_bh(&sfe_connections_lock); + br_dev_update_stats(sis->dest_dev, &nlstats); + spin_unlock_bh(&sfe_connections_lock); + } + } + + /* + * Look up conntrack connection + */ + h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple); + if (unlikely(!h)) { + DEBUG_TRACE("no connection found\n"); + return; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); + + /* + * Only update if this is not a fixed timeout + */ + if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_lock_bh(&ct->lock); +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)) + ct->timeout += sis->delta_jiffies; +#else + ct->timeout.expires += sis->delta_jiffies; +#endif + spin_unlock_bh(&ct->lock); + } + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets); + atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes); + atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes); + spin_unlock_bh(&ct->lock); + } + + switch (sis->protocol) { + 
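+	/*
+	 * For TCP, fold the engine's window tracking state back into conntrack
+	 * so the connection stays valid if it later drops back to the slow path.
+	 */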
case IPPROTO_TCP: + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) { + ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) { + ct->proto.tcp.seen[0].td_end = sis->src_td_end; + } + if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) { + ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end; + } + if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) { + ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) { + ct->proto.tcp.seen[1].td_end = sis->dest_td_end; + } + if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) { + ct->proto.tcp.seen[1].td_maxend = sis->dest_td_max_end; + } + spin_unlock_bh(&ct->lock); + break; + + case IPPROTO_UDP: + /* + * In Linux connection track, UDP flow has two timeout values: + * /proc/sys/net/netfilter/nf_conntrack_udp_timeout: + * this is for uni-direction UDP flow, normally its value is 60 seconds + * /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream: + * this is for bi-direction UDP flow, normally its value is 180 seconds + * + * Linux will update timer of UDP flow to stream timeout once it seen packets + * in reply direction. But if flow is accelerated by NSS or SFE, Linux won't + * see any packets. So we have to do the same thing in our stats sync message. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status) && acct) { + u_int64_t reply_pkts = atomic64_read(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + + if (reply_pkts != 0) { + struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts; + + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + set_bit(IPS_ASSURED_BIT, &ct->status); + + l4proto = __nf_ct_l4proto_find((sis->is_v6 ? 
+				timeouts = nf_ct_timeout_lookup(&init_net, ct, l4proto);
+
+				spin_lock_bh(&ct->lock);
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0))
+				ct->timeout = nfct_time_stamp + timeouts[UDP_CT_REPLIED];
+#else
+				ct->timeout.expires = jiffies + timeouts[UDP_CT_REPLIED];
+#endif
+				spin_unlock_bh(&ct->lock);
+			}
+		}
+		break;
+	}
+
+	/*
+	 * Release connection
+	 */
+	nf_ct_put(ct);
+}
+
+/*
+ * fast_classifier_device_event()
+ */
+static int fast_classifier_device_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = SFE_DEV_EVENT_PTR(ptr);
+
+	if (dev && (event == NETDEV_DOWN)) {
+		sfe_ipv4_destroy_all_rules_for_dev(dev);
+		sfe_ipv6_destroy_all_rules_for_dev(dev);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * fast_classifier_inet_event()
+ */
+static int fast_classifier_inet_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+
+	if (dev && (event == NETDEV_DOWN)) {
+		sfe_ipv4_destroy_all_rules_for_dev(dev);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * fast_classifier_inet6_event()
+ */
+static int fast_classifier_inet6_event(struct notifier_block *this, unsigned long event, void *ptr)
+{
+	struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev;
+
+	if (dev && (event == NETDEV_DOWN)) {
+		sfe_ipv6_destroy_all_rules_for_dev(dev);
+	}
+
+	return NOTIFY_DONE;
+}
+
+/*
+ * fast_classifier_get_offload_at_pkts()
+ */
+static ssize_t fast_classifier_get_offload_at_pkts(struct device *dev,
+						   struct device_attribute *attr,
+						   char *buf)
+{
+	return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", offload_at_pkts);
+}
+
+/*
+ * fast_classifier_set_offload_at_pkts()
+ */
+static ssize_t fast_classifier_set_offload_at_pkts(struct device *dev,
+						   struct device_attribute *attr,
+						   const char *buf, size_t size)
+{
+	long new;
+	int ret;
+
+	ret = kstrtol(buf, 0, &new);
+	if (ret || ((int)new != new))
+		return -EINVAL;
+
+	offload_at_pkts = new;
+
+	return size;
+}
+
+/*
+ * fast_classifier_get_debug_info()
+ */
+static ssize_t fast_classifier_get_debug_info(struct device *dev,
+					      struct device_attribute *attr,
+					      char *buf)
+{
+	size_t len = 0;
+	struct sfe_connection *conn;
+	u32 i;
+
+	spin_lock_bh(&sfe_connections_lock);
+	len += scnprintf(buf, PAGE_SIZE - len, "size=%d offload=%d offload_no_match=%d"
+			" offloaded=%d done=%d offl_dbg_msg_fail=%d done_dbg_msg_fail=%d\n",
+			sfe_connections_size,
+			atomic_read(&offload_msgs),
+			atomic_read(&offload_no_match_msgs),
+			atomic_read(&offloaded_msgs),
+			atomic_read(&done_msgs),
+			atomic_read(&offloaded_fail_msgs),
+			atomic_read(&done_fail_msgs));
+	sfe_hash_for_each(fc_conn_ht, i, conn, hl) {
+		len += scnprintf(buf + len, PAGE_SIZE - len,
+				(conn->is_v4 ? "o=%d, p=%d [%pM]:%pI4:%u %pI4:%u:[%pM] m=%08x h=%d\n" : "o=%d, p=%d [%pM]:%pI6:%u %pI6:%u:[%pM] m=%08x h=%d\n"),
"o=%d, p=%d [%pM]:%pI4:%u %pI4:%u:[%pM] m=%08x h=%d\n" : "o=%d, p=%d [%pM]:%pI6:%u %pI6:%u:[%pM] m=%08x h=%d\n"), + conn->offloaded, + conn->sic->protocol, + conn->sic->src_mac, + &conn->sic->src_ip, + conn->sic->src_port, + &conn->sic->dest_ip, + conn->sic->dest_port, + conn->sic->dest_mac_xlate, + conn->sic->mark, + conn->hits); + } + spin_unlock_bh(&sfe_connections_lock); + + return len; +} + +/* + * fast_classifier_get_skip_bridge_ingress() + */ +static ssize_t fast_classifier_get_skip_bridge_ingress(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", skip_to_bridge_ingress); +} + +/* + * fast_classifier_set_skip_bridge_ingress() + */ +static ssize_t fast_classifier_set_skip_bridge_ingress(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t size) +{ + long new; + int ret; + + ret = kstrtol(buf, 0, &new); + if (ret == -EINVAL || ((int)new != new)) + return -EINVAL; + + skip_to_bridge_ingress = new ? 1 : 0; + + return size; +} + +/* + * fast_classifier_get_exceptions + * dump exception counters + */ +static ssize_t fast_classifier_get_exceptions(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int idx, len; + struct fast_classifier *sc = &__fsc; + + spin_lock_bh(&sc->lock); + for (len = 0, idx = 0; idx < FAST_CL_EXCEPTION_MAX; idx++) { + if (sc->exceptions[idx]) { + len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", fast_classifier_exception_events_string[idx], sc->exceptions[idx]); + } + } + spin_unlock_bh(&sc->lock); + + return len; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute fast_classifier_attrs[] = { + __ATTR(offload_at_pkts, S_IWUSR | S_IRUGO, fast_classifier_get_offload_at_pkts, fast_classifier_set_offload_at_pkts), + __ATTR(debug_info, S_IRUGO, fast_classifier_get_debug_info, NULL), + __ATTR(skip_to_bridge_ingress, S_IWUSR | S_IRUGO, fast_classifier_get_skip_bridge_ingress, fast_classifier_set_skip_bridge_ingress), + __ATTR(exceptions, S_IRUGO, fast_classifier_get_exceptions, NULL), +}; + +/* + * fast_classifier_init() + */ +static int __init fast_classifier_init(void) +{ + struct fast_classifier *sc = &__fsc; + int result = -1; + size_t i, j; + + printk(KERN_ALERT "fast-classifier: starting up\n"); + DEBUG_INFO("SFE CM init\n"); + + hash_init(fc_conn_ht); + + /* + * Create sys/fast_classifier + */ + sc->sys_fast_classifier = kobject_create_and_add("fast_classifier", NULL); + if (!sc->sys_fast_classifier) { + DEBUG_ERROR("failed to register fast_classifier\n"); + goto exit1; + } + + for (i = 0; i < ARRAY_SIZE(fast_classifier_attrs); i++) { + result = sysfs_create_file(sc->sys_fast_classifier, &fast_classifier_attrs[i].attr); + if (result) { + DEBUG_ERROR("failed to register %s : %d\n", + fast_classifier_attrs[i].attr.name, result); + goto exit2; + } + } + + sc->dev_notifier.notifier_call = fast_classifier_device_event; + sc->dev_notifier.priority = 1; + register_netdevice_notifier(&sc->dev_notifier); + + sc->inet_notifier.notifier_call = fast_classifier_inet_event; + sc->inet_notifier.priority = 1; + register_inetaddr_notifier(&sc->inet_notifier); + + sc->inet6_notifier.notifier_call = fast_classifier_inet6_event; + sc->inet6_notifier.priority = 1; + register_inet6addr_notifier(&sc->inet6_notifier); + + /* + * Register our netfilter hooks. 
+ */ + result = nf_register_hooks(fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + if (result < 0) { + DEBUG_ERROR("can't register nf post routing hook: %d\n", result); + goto exit3; + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + /* + * Register a notifier hook to get fast notifications of expired connections. + * Note: In CONFIG_NF_CONNTRACK_CHAIN_EVENTS enabled case, nf_conntrack_register_notifier() + * function always returns 0. + */ + +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + (void)nf_conntrack_register_notifier(&init_net, &fast_classifier_conntrack_notifier); +#else + result = nf_conntrack_register_notifier(&init_net, &fast_classifier_conntrack_notifier); + if (result < 0) { + DEBUG_ERROR("can't register nf notifier hook: %d\n", result); + goto exit4; + } +#endif +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) + result = genl_register_family_with_ops_groups(&fast_classifier_gnl_family, + fast_classifier_gnl_ops, + fast_classifier_genl_mcgrp); + if (result) { + DEBUG_ERROR("failed to register genl ops: %d\n", result); + goto exit5; + } +#else + result = genl_register_family(&fast_classifier_gnl_family); + if (result) { + printk(KERN_CRIT "unable to register genl family\n"); + goto exit5; + } + + result = genl_register_ops(&fast_classifier_gnl_family, fast_classifier_gnl_ops); + if (result) { + printk(KERN_CRIT "unable to register ops\n"); + goto exit6; + } + + result = genl_register_mc_group(&fast_classifier_gnl_family, + fast_classifier_genl_mcgrp); + if (result) { + printk(KERN_CRIT "unable to register multicast group\n"); + goto exit6; + } +#endif + + printk(KERN_ALERT "fast-classifier: registered\n"); + + spin_lock_init(&sc->lock); + + /* + * Hook the receive path in the network stack. + */ + BUG_ON(fast_nat_recv); + RCU_INIT_POINTER(fast_nat_recv, fast_classifier_recv); + + /* + * Hook the shortcut sync callback. + */ + sfe_ipv4_register_sync_rule_callback(fast_classifier_sync_rule); + sfe_ipv6_register_sync_rule_callback(fast_classifier_sync_rule); + return 0; + +#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0)) +exit6: + genl_unregister_family(&fast_classifier_gnl_family); +#endif + +exit5: +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&init_net, &fast_classifier_conntrack_notifier); + +exit4: +#endif + nf_unregister_hooks(fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing)); + +exit3: + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + +exit2: + for (j = 0; j < i; j++) { + sysfs_remove_file(sc->sys_fast_classifier, &fast_classifier_attrs[j].attr); + } + kobject_put(sc->sys_fast_classifier); + +exit1: + return result; +} + +/* + * fast_classifier_exit() + */ +static void __exit fast_classifier_exit(void) +{ + struct fast_classifier *sc = &__fsc; + int result = -1; + + DEBUG_INFO("SFE CM exit\n"); + printk(KERN_ALERT "fast-classifier: shutting down\n"); + + /* + * Unregister our sync callback. + */ + sfe_ipv4_register_sync_rule_callback(NULL); + sfe_ipv6_register_sync_rule_callback(NULL); + + /* + * Unregister our receive callback. + */ + RCU_INIT_POINTER(fast_nat_recv, NULL); + + /* + * Wait for all callbacks to complete. + */ + rcu_barrier(); + + /* + * Destroy all connections. 
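+	 * Note: passing a NULL dev to these helpers flushes every offloaded
+	 * rule, not just the rules for one device (the netdevice notifier
+	 * calls the same helpers with a specific dev on NETDEV_DOWN).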
+	 */
+	sfe_ipv4_destroy_all_rules_for_dev(NULL);
+	sfe_ipv6_destroy_all_rules_for_dev(NULL);
+
+#if (LINUX_VERSION_CODE < KERNEL_VERSION(3, 13, 0))
+	result = genl_unregister_ops(&fast_classifier_gnl_family, fast_classifier_gnl_ops);
+	if (result != 0) {
+		printk(KERN_CRIT "Unable to unregister genl_ops\n");
+	}
+#endif
+
+	result = genl_unregister_family(&fast_classifier_gnl_family);
+	if (result != 0) {
+		printk(KERN_CRIT "Unable to unregister genl_family\n");
+	}
+
+#ifdef CONFIG_NF_CONNTRACK_EVENTS
+	nf_conntrack_unregister_notifier(&init_net, &fast_classifier_conntrack_notifier);
+
+#endif
+	nf_unregister_hooks(fast_classifier_ops_post_routing, ARRAY_SIZE(fast_classifier_ops_post_routing));
+
+	unregister_inet6addr_notifier(&sc->inet6_notifier);
+	unregister_inetaddr_notifier(&sc->inet_notifier);
+	unregister_netdevice_notifier(&sc->dev_notifier);
+
+	kobject_put(sc->sys_fast_classifier);
+}
+
+module_init(fast_classifier_init)
+module_exit(fast_classifier_exit)
+
+MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager");
+MODULE_LICENSE("Dual BSD/GPL");
+
diff --git a/package/kernel/shortcut-fe/src/fast-classifier.h b/package/kernel/shortcut-fe/src/fast-classifier.h
new file mode 100644
index 000000000000..6b7a18cf6a5d
--- /dev/null
+++ b/package/kernel/shortcut-fe/src/fast-classifier.h
@@ -0,0 +1,57 @@
+/*
+ * User space header to send message to the fast classifier
+ *
+ * Copyright (c) 2013,2016 The Linux Foundation. All rights reserved.
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/if_ether.h>
+
+#define FAST_CLASSIFIER_GENL_VERSION (1)
+#define FAST_CLASSIFIER_GENL_NAME "FC"
+#define FAST_CLASSIFIER_GENL_MCGRP "FC_MCGRP"
+#define FAST_CLASSIFIER_GENL_HDRSIZE (0)
+
+enum {
+	FAST_CLASSIFIER_A_UNSPEC,
+	FAST_CLASSIFIER_A_TUPLE,
+	__FAST_CLASSIFIER_A_MAX,
+};
+
+#define FAST_CLASSIFIER_A_MAX (__FAST_CLASSIFIER_A_MAX - 1)
+
+enum {
+	FAST_CLASSIFIER_C_UNSPEC,
+	FAST_CLASSIFIER_C_OFFLOAD,
+	FAST_CLASSIFIER_C_OFFLOADED,
+	FAST_CLASSIFIER_C_DONE,
+	__FAST_CLASSIFIER_C_MAX,
+};
+
+#define FAST_CLASSIFIER_C_MAX (__FAST_CLASSIFIER_C_MAX - 1)
+
+struct fast_classifier_tuple {
+	unsigned short ethertype;
+	unsigned char proto;
+	union {
+		struct in_addr in;
+		struct in6_addr in6;
+	} src_saddr;
+	union {
+		struct in_addr in;
+		struct in6_addr in6;
+	} dst_saddr;
+	unsigned short sport;
+	unsigned short dport;
+	unsigned char smac[ETH_ALEN];
+	unsigned char dmac[ETH_ALEN];
+};
diff --git a/package/kernel/shortcut-fe/src/nl_classifier_test.c b/package/kernel/shortcut-fe/src/nl_classifier_test.c
new file mode 100644
index 000000000000..639417964db5
--- /dev/null
+++ b/package/kernel/shortcut-fe/src/nl_classifier_test.c
@@ -0,0 +1,281 @@
+/*
+ * Copyright (c) 2016 The Linux Foundation. All rights reserved.
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <arpa/inet.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/ctrl.h>
+
+#define NL_CLASSIFIER_GENL_VERSION 1
+#define NL_CLASSIFIER_GENL_FAMILY "FC"
+#define NL_CLASSIFIER_GENL_GROUP "FC_MCGRP"
+#define NL_CLASSIFIER_GENL_HDRSIZE 0
+
+enum NL_CLASSIFIER_CMD {
+	NL_CLASSIFIER_CMD_UNSPEC,
+	NL_CLASSIFIER_CMD_ACCEL,
+	NL_CLASSIFIER_CMD_ACCEL_OK,
+	NL_CLASSIFIER_CMD_CONNECTION_CLOSED,
+	NL_CLASSIFIER_CMD_MAX,
+};
+
+enum NL_CLASSIFIER_ATTR {
+	NL_CLASSIFIER_ATTR_UNSPEC,
+	NL_CLASSIFIER_ATTR_TUPLE,
+	NL_CLASSIFIER_ATTR_MAX,
+};
+
+union nl_classifier_tuple_ip {
+	struct in_addr in;
+	struct in6_addr in6;
+};
+
+struct nl_classifier_tuple {
+	unsigned short af;
+	unsigned char proto;
+	union nl_classifier_tuple_ip src_ip;
+	union nl_classifier_tuple_ip dst_ip;
+	unsigned short sport;
+	unsigned short dport;
+	unsigned char smac[6];
+	unsigned char dmac[6];
+};
+
+struct nl_classifier_instance {
+	struct nl_sock *sock;
+	int family_id;
+	int group_id;
+	int stop;
+};
+
+struct nl_classifier_instance nl_cls_inst;
+
+static struct nla_policy nl_classifier_genl_policy[(NL_CLASSIFIER_ATTR_MAX+1)] = {
+	[NL_CLASSIFIER_ATTR_TUPLE] = { .type = NLA_UNSPEC },
+};
+
+void nl_classifier_dump_nl_tuple(struct nl_classifier_tuple *tuple)
+{
+	char ip_str[64];
+
+	printf("protocol = %s\n", (tuple->proto == IPPROTO_UDP) ? "udp" : ((tuple->proto == IPPROTO_TCP) ? "tcp" : "unknown"));
"tcp" : "unknown")); + printf("source ip = %s\n", inet_ntop(tuple->af, &tuple->src_ip, ip_str, sizeof(ip_str))); + printf("destination ip = %s\n", inet_ntop(tuple->af, &tuple->dst_ip, ip_str, sizeof(ip_str))); + printf("source port = %d\n", ntohs(tuple->sport)); + printf("destination port = %d\n", ntohs(tuple->dport)); +} + +int nl_classifier_msg_recv(struct nl_msg *msg, void *arg) +{ + struct nlmsghdr *nlh = nlmsg_hdr(msg); + struct genlmsghdr *gnlh = nlmsg_data(nlh); + struct nlattr *attrs[(NL_CLASSIFIER_ATTR_MAX+1)]; + + genlmsg_parse(nlh, NL_CLASSIFIER_GENL_HDRSIZE, attrs, NL_CLASSIFIER_ATTR_MAX, nl_classifier_genl_policy); + + switch (gnlh->cmd) { + case NL_CLASSIFIER_CMD_ACCEL_OK: + printf("Acceleration successful:\n"); + nl_classifier_dump_nl_tuple(nla_data(attrs[NL_CLASSIFIER_ATTR_TUPLE])); + return NL_OK; + case NL_CLASSIFIER_CMD_CONNECTION_CLOSED: + printf("Connection is closed:\n"); + nl_classifier_dump_nl_tuple(nla_data(attrs[NL_CLASSIFIER_ATTR_TUPLE])); + return NL_OK; + default: + printf("nl classifier received unknow message %d\n", gnlh->cmd); + } + + return NL_SKIP; +} + +void nl_classifier_offload(struct nl_classifier_instance *inst, + unsigned char proto, unsigned long *src_saddr, + unsigned long *dst_saddr, unsigned short sport, + unsigned short dport, int af) +{ + struct nl_msg *msg; + int ret; + struct nl_classifier_tuple classifier_msg; + + memset(&classifier_msg, 0, sizeof(classifier_msg)); + classifier_msg.af = af; + classifier_msg.proto = proto; + memcpy(&classifier_msg.src_ip, src_saddr, (af == AF_INET ? 4 : 16)); + memcpy(&classifier_msg.dst_ip, dst_saddr, (af == AF_INET ? 4 : 16)); + classifier_msg.sport = sport; + classifier_msg.dport = dport; + + msg = nlmsg_alloc(); + if (!msg) { + printf("Unable to allocate message\n"); + return; + } + + genlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, inst->family_id, + NL_CLASSIFIER_GENL_HDRSIZE, NLM_F_REQUEST, + NL_CLASSIFIER_CMD_ACCEL, NL_CLASSIFIER_GENL_VERSION); + nla_put(msg, NL_CLASSIFIER_ATTR_TUPLE, sizeof(classifier_msg), &classifier_msg); + + ret = nl_send_auto(inst->sock, msg); + if (ret < 0) { + printf("send netlink message failed.\n"); + nlmsg_free(msg); + return; + } + + nlmsg_free(msg); + printf("nl classifier offload connection successful\n"); +} + +int nl_classifier_init(struct nl_classifier_instance *inst) +{ + int ret; + + inst->sock = nl_socket_alloc(); + if (!inst->sock) { + printf("Unable to allocation socket.\n"); + return -1; + } + genl_connect(inst->sock); + + inst->family_id = genl_ctrl_resolve(inst->sock, NL_CLASSIFIER_GENL_FAMILY); + if (inst->family_id < 0) { + printf("Unable to resolve family %s\n", NL_CLASSIFIER_GENL_FAMILY); + goto init_failed; + } + + inst->group_id = genl_ctrl_resolve_grp(inst->sock, NL_CLASSIFIER_GENL_FAMILY, NL_CLASSIFIER_GENL_GROUP); + if (inst->group_id < 0) { + printf("Unable to resolve mcast group %s\n", NL_CLASSIFIER_GENL_GROUP); + goto init_failed; + } + + ret = nl_socket_add_membership(inst->sock, inst->group_id); + if (ret < 0) { + printf("Unable to add membership\n"); + goto init_failed; + } + + nl_socket_disable_seq_check(inst->sock); + nl_socket_modify_cb(inst->sock, NL_CB_VALID, NL_CB_CUSTOM, nl_classifier_msg_recv, NULL); + + printf("nl classifier init successful\n"); + return 0; + +init_failed: + if (inst->sock) { + nl_close(inst->sock); + nl_socket_free(inst->sock); + inst->sock = NULL; + } + return -1; +} + +void nl_classifier_exit(struct nl_classifier_instance *inst) +{ + if (inst->sock) { + nl_close(inst->sock); + nl_socket_free(inst->sock); + inst->sock = 
+	}
+	printf("nl classifier exit successful\n");
+}
+
+int nl_classifier_parse_arg(int argc, char *argv[], unsigned char *proto, unsigned long *src_saddr,
+			    unsigned long *dst_saddr, unsigned short *sport, unsigned short *dport, int *af)
+{
+	int ret;
+	unsigned short port;
+
+	if (argc < 7) {
+		printf("usage: nl_classifier <v4|v6> <udp|tcp> <src ip> <dst ip> <src port> <dst port>\n");
+		return -1;
+	}
+
+	if (0 == strncmp(argv[1], "v4", 2)) {
+		*af = AF_INET;
+	} else if (0 == strncmp(argv[1], "v6", 2)) {
+		*af = AF_INET6;
+	} else {
+		printf("Address family is not supported\n");
+		return -1;
+	}
+
+	if (0 == strncmp(argv[2], "udp", 3)) {
+		*proto = IPPROTO_UDP;
+	} else if (0 == strncmp(argv[2], "tcp", 3)) {
+		*proto = IPPROTO_TCP;
+	} else {
+		printf("Protocol is not supported\n");
+		return -1;
+	}
+
+	ret = inet_pton(*af, argv[3], src_saddr);
+	if (ret <= 0) {
+		printf("source ip has wrong format\n");
+		return -1;
+	}
+
+	ret = inet_pton(*af, argv[4], dst_saddr);
+	if (ret <= 0) {
+		printf("destination ip has wrong format\n");
+		return -1;
+	}
+
+	port = strtol(argv[5], NULL, 0);
+	*sport = htons(port);
+	port = strtol(argv[6], NULL, 0);
+	*dport = htons(port);
+
+	printf("nl classifier parse arguments successful\n");
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	struct nl_classifier_instance *inst = &nl_cls_inst;
+	unsigned char proto;
+	unsigned long src_addr[4];
+	unsigned long dst_addr[4];
+	unsigned short sport;
+	unsigned short dport;
+	int af;
+	int ret;
+
+	ret = nl_classifier_parse_arg(argc, argv, &proto, src_addr, dst_addr, &sport, &dport, &af);
+	if (ret < 0) {
+		printf("Failed to parse arguments\n");
+		return ret;
+	}
+
+	ret = nl_classifier_init(inst);
+	if (ret < 0) {
+		printf("Unable to init generic netlink\n");
+		return ret;
+	}
+
+	nl_classifier_offload(inst, proto, src_addr, dst_addr, sport, dport, af);
+
+	/* main loop to listen on message */
+	while (!inst->stop) {
+		nl_recvmsgs_default(inst->sock);
+	}
+
+	nl_classifier_exit(inst);
+
+	return 0;
+}
diff --git a/package/kernel/shortcut-fe/src/sfe.h b/package/kernel/shortcut-fe/src/sfe.h
new file mode 100644
index 000000000000..21a127556014
--- /dev/null
+++ b/package/kernel/shortcut-fe/src/sfe.h
@@ -0,0 +1,61 @@
+/*
+ * sfe.h
+ *	Shortcut forwarding engine.
+ *
+ * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved.
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+/*
+ * Debug output verbosity level.
+ */
+#define DEBUG_LEVEL 0
+
+#if (DEBUG_LEVEL < 1)
+#define DEBUG_ERROR(s, ...)
+#else
+#define DEBUG_ERROR(s, ...) \
+do { \
+	printk("%s[%u]: ERROR:", __FILE__, __LINE__); \
+	printk(s, ##__VA_ARGS__); \
+} while (0)
+#endif
+
+#if (DEBUG_LEVEL < 2)
+#define DEBUG_WARN(s, ...)
+#else
+#define DEBUG_WARN(s, ...) \
+do { \
+	printk("%s[%u]: WARN:", __FILE__, __LINE__); \
+	printk(s, ##__VA_ARGS__); \
+} while (0)
+#endif
+
+#if (DEBUG_LEVEL < 3)
+#define DEBUG_INFO(s, ...)
+#else
+#define DEBUG_INFO(s, ...) \
+do { \
+	printk("%s[%u]: INFO:", __FILE__, __LINE__); \
+	printk(s, ##__VA_ARGS__); \
+} while (0)
+#endif
+
+#if (DEBUG_LEVEL < 4)
+#define DEBUG_TRACE(s, ...)
+#else
+#define DEBUG_TRACE(s, ...) \
+do { \
+	printk("%s[%u]: TRACE:", __FILE__, __LINE__); \
+	printk(s, ##__VA_ARGS__); \
+} while (0)
+#endif
diff --git a/package/kernel/shortcut-fe/src/sfe_backport.h b/package/kernel/shortcut-fe/src/sfe_backport.h
new file mode 100644
index 000000000000..b24f6e67e6c7
--- /dev/null
+++ b/package/kernel/shortcut-fe/src/sfe_backport.h
@@ -0,0 +1,138 @@
+/*
+ * sfe_backport.h
+ *	Shortcut forwarding engine compatible header file.
+ *
+ * Copyright (c) 2014-2016 The Linux Foundation. All rights reserved.
+ * Permission to use, copy, modify, and/or distribute this software for
+ * any purpose with or without fee is hereby granted, provided that the
+ * above copyright notice and this permission notice appear in all copies.
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+
+#include <linux/version.h>
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0))
+#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \
+static unsigned int FN_NAME(void *priv, \
+			    struct sk_buff *SKB, \
+			    const struct nf_hook_state *state)
+#elif (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0))
+#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \
+static unsigned int FN_NAME(const struct nf_hook_ops *OPS, \
+			    struct sk_buff *SKB, \
+			    const struct net_device *UNUSED, \
+			    const struct net_device *OUT, \
+			    int (*OKFN)(struct sk_buff *))
+#else
+#define sfe_define_post_routing_hook(FN_NAME, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \
+static unsigned int FN_NAME(unsigned int HOOKNUM, \
+			    struct sk_buff *SKB, \
+			    const struct net_device *UNUSED, \
+			    const struct net_device *OUT, \
+			    int (*OKFN)(struct sk_buff *))
+#endif
+
+#define sfe_cm_ipv4_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \
+	sfe_define_post_routing_hook(__sfe_cm_ipv4_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN)
+#define sfe_cm_ipv6_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \
+	sfe_define_post_routing_hook(__sfe_cm_ipv6_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN)
+#define fast_classifier_ipv4_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \
+	sfe_define_post_routing_hook(__fast_classifier_ipv4_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN)
+#define fast_classifier_ipv6_post_routing_hook(HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN) \
+	sfe_define_post_routing_hook(__fast_classifier_ipv6_post_routing_hook, HOOKNUM, OPS, SKB, UNUSED, OUT, OKFN)
+
+#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0))
+#define SFE_IPV4_NF_POST_ROUTING_HOOK(fn) \
+	{ \
+		.hook = fn, \
+		.pf = NFPROTO_IPV4, \
+		.hooknum = NF_INET_POST_ROUTING, \
+		.priority = NF_IP_PRI_NAT_SRC + 1, \
+	}
+#else
+#define
SFE_IPV4_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .owner = THIS_MODULE, \ + .pf = NFPROTO_IPV4, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 4, 0)) +#define SFE_IPV6_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .pf = NFPROTO_IPV6, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP_PRI_NAT_SRC + 1, \ + } +#else +#define SFE_IPV6_NF_POST_ROUTING_HOOK(fn) \ + { \ + .hook = fn, \ + .owner = THIS_MODULE, \ + .pf = NFPROTO_IPV6, \ + .hooknum = NF_INET_POST_ROUTING, \ + .priority = NF_IP6_PRI_NAT_SRC + 1, \ + } +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 3, 0)) +#define SFE_NF_CT_DEFAULT_ZONE (&nf_ct_zone_dflt) +#else +#define SFE_NF_CT_DEFAULT_ZONE NF_CT_DEFAULT_ZONE +#endif + +/* + * sfe_dev_get_master + * get master of bridge port, and hold it + */ +static inline struct net_device *sfe_dev_get_master(struct net_device *dev) +{ + struct net_device *master; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 9, 0)) + rcu_read_lock(); + master = netdev_master_upper_dev_get_rcu(dev); + if (master) + dev_hold(master); + + rcu_read_unlock(); +#else + master = dev->master; + if (master) + dev_hold(master); +#endif + return master; +} + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 11, 0)) +#define SFE_DEV_EVENT_PTR(PTR) netdev_notifier_info_to_dev(PTR) +#else +#define SFE_DEV_EVENT_PTR(PTR) (struct net_device *)(PTR) +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define SFE_NF_CONN_ACCT(NM) struct nf_conn_acct *NM +#else +#define SFE_NF_CONN_ACCT(NM) struct nf_conn_counter *NM +#endif + +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(3, 13, 0)) +#define SFE_ACCT_COUNTER(NM) ((NM)->counter) +#else +#define SFE_ACCT_COUNTER(NM) (NM) +#endif + +#define sfe_hash_for_each_possible(name, obj, member, key) \ + hash_for_each_possible(name, obj, member, key) + +#define sfe_hash_for_each(name, bkt, obj, member) \ + hash_for_each(name, bkt, obj, member) diff --git a/package/kernel/shortcut-fe/src/sfe_cm.c b/package/kernel/shortcut-fe/src/sfe_cm.c new file mode 100644 index 000000000000..6d3e21085aba --- /dev/null +++ b/package/kernel/shortcut-fe/src/sfe_cm.c @@ -0,0 +1,1203 @@ +/* + * sfe-cm.c + * Shortcut forwarding engine connection manager. + * + * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" +#include "sfe_backport.h" + +typedef enum sfe_cm_exception { + SFE_CM_EXCEPTION_PACKET_BROADCAST, + SFE_CM_EXCEPTION_PACKET_MULTICAST, + SFE_CM_EXCEPTION_NO_IIF, + SFE_CM_EXCEPTION_NO_CT, + SFE_CM_EXCEPTION_CT_NO_TRACK, + SFE_CM_EXCEPTION_CT_NO_CONFIRM, + SFE_CM_EXCEPTION_CT_IS_ALG, + SFE_CM_EXCEPTION_IS_IPV4_MCAST, + SFE_CM_EXCEPTION_IS_IPV6_MCAST, + SFE_CM_EXCEPTION_TCP_NOT_ASSURED, + SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED, + SFE_CM_EXCEPTION_UNKNOW_PROTOCOL, + SFE_CM_EXCEPTION_NO_SRC_DEV, + SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV, + SFE_CM_EXCEPTION_NO_DEST_DEV, + SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV, + SFE_CM_EXCEPTION_NO_BRIDGE, + SFE_CM_EXCEPTION_LOCAL_OUT, + SFE_CM_EXCEPTION_MAX +} sfe_cm_exception_t; + +static char *sfe_cm_exception_events_string[SFE_CM_EXCEPTION_MAX] = { + "PACKET_BROADCAST", + "PACKET_MULTICAST", + "NO_IIF", + "NO_CT", + "CT_NO_TRACK", + "CT_NO_CONFIRM", + "CT_IS_ALG", + "IS_IPV4_MCAST", + "IS_IPV6_MCAST", + "TCP_NOT_ASSURED", + "TCP_NOT_ESTABLISHED", + "UNKNOW_PROTOCOL", + "NO_SRC_DEV", + "NO_SRC_XLATE_DEV", + "NO_DEST_DEV", + "NO_DEST_XLATE_DEV", + "NO_BRIDGE", + "LOCAL_OUT" +}; + +/* + * Per-module structure. + */ +struct sfe_cm { + spinlock_t lock; /* Lock for SMP correctness */ + + /* + * Control state. + */ + struct kobject *sys_sfe_cm; /* sysfs linkage */ + + /* + * Callback notifiers. + */ + struct notifier_block dev_notifier; /* Device notifier */ + struct notifier_block inet_notifier; /* IPv4 notifier */ + struct notifier_block inet6_notifier; /* IPv6 notifier */ + u32 exceptions[SFE_CM_EXCEPTION_MAX]; +}; + +static struct sfe_cm __sc; + + +/* + * sfe_cm_incr_exceptions() + * increase an exception counter. + */ +static inline void sfe_cm_incr_exceptions(sfe_cm_exception_t except) +{ + struct sfe_cm *sc = &__sc; + + spin_lock_bh(&sc->lock); + sc->exceptions[except]++; + spin_unlock_bh(&sc->lock); +} + +/* + * sfe_cm_recv() + * Handle packet receives. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +static int sfe_cm_recv(struct sk_buff *skb) +{ + struct net_device *dev; + + /* + * We know that for the vast majority of packets we need the transport + * layer header so we may as well start to fetch it now! + */ + prefetch(skb->data + 32); + barrier(); + + dev = skb->dev; + +#ifdef CONFIG_NET_CLS_ACT + /* + * If ingress Qdisc configured, and packet not processed by ingress Qdisc yet + * We cannot accelerate this packet. + */ + if (dev->ingress_queue && !(skb->tc_verd & TC_NCLS)) { + return 0; + } +#endif + + /* + * We're only interested in IPv4 and IPv6 packets. + */ + if (likely(htons(ETH_P_IP) == skb->protocol)) { + struct in_device *in_dev; + + /* + * Does our input device support IP processing? + */ + in_dev = (struct in_device *)dev->ip_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IP processing for device: %s\n", dev->name); + return 0; + } + + /* + * Does it have an IP address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(!in_dev->ifa_list)) { + DEBUG_TRACE("no IP address for device: %s\n", dev->name); + return 0; + } + + return sfe_ipv4_recv(dev, skb); + } + + if (likely(htons(ETH_P_IPV6) == skb->protocol)) { + struct inet6_dev *in_dev; + + /* + * Does our input device support IPv6 processing? 
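+	 * (Mirrors the IPv4 check above: without an inet6_dev and at least
+	 * one IPv6 address there is nothing SFE could usefully accelerate
+	 * on this interface.)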
+ */ + in_dev = (struct inet6_dev *)dev->ip6_ptr; + if (unlikely(!in_dev)) { + DEBUG_TRACE("no IPv6 processing for device: %s\n", dev->name); + return 0; + } + + /* + * Does it have an IPv6 address? If it doesn't then we can't do anything + * interesting here! + */ + if (unlikely(list_empty(&in_dev->addr_list))) { + DEBUG_TRACE("no IPv6 address for device: %s\n", dev->name); + return 0; + } + + return sfe_ipv6_recv(dev, skb); + } + + DEBUG_TRACE("not IP packet\n"); + return 0; +} + +/* + * sfe_cm_find_dev_and_mac_addr() + * Find the device and MAC address for a given IPv4/IPv6 address. + * + * Returns true if we find the device and MAC address, otherwise false. + * + * We look up the rtable entry for the address and, from its neighbour + * structure, obtain the hardware address. This means this function also + * works if the neighbours are routers too. + */ +static bool sfe_cm_find_dev_and_mac_addr(sfe_ip_addr_t *addr, struct net_device **dev, u8 *mac_addr, int is_v4) +{ + struct neighbour *neigh; + struct rtable *rt; + struct rt6_info *rt6; + struct dst_entry *dst; + struct net_device *mac_dev; + + /* + * Look up the rtable entry for the IP address then get the hardware + * address from its neighbour structure. This means this work when the + * neighbours are routers too. + */ + if (likely(is_v4)) { + rt = ip_route_output(&init_net, addr->ip, 0, 0, 0); + if (unlikely(IS_ERR(rt))) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt; + } else { + rt6 = rt6_lookup(&init_net, (struct in6_addr *)addr->ip6, 0, 0, 0); + if (!rt6) { + goto ret_fail; + } + + dst = (struct dst_entry *)rt6; + } + + rcu_read_lock(); + neigh = dst_neigh_lookup(dst, addr); + if (unlikely(!neigh)) { + rcu_read_unlock(); + dst_release(dst); + goto ret_fail; + } + + if (unlikely(!(neigh->nud_state & NUD_VALID))) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + mac_dev = neigh->dev; + if (!mac_dev) { + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + goto ret_fail; + } + + memcpy(mac_addr, neigh->ha, (size_t)mac_dev->addr_len); + + dev_hold(mac_dev); + *dev = mac_dev; + rcu_read_unlock(); + neigh_release(neigh); + dst_release(dst); + + return true; + +ret_fail: + if (is_v4) { + DEBUG_TRACE("failed to find MAC address for IP: %pI4\n", &addr->ip); + + } else { + DEBUG_TRACE("failed to find MAC address for IP: %pI6\n", addr->ip6); + } + + return false; +} + +/* + * sfe_cm_post_routing() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +static unsigned int sfe_cm_post_routing(struct sk_buff *skb, int is_v4) +{ + struct sfe_connection_create sic; + struct net_device *in; + struct nf_conn *ct; + enum ip_conntrack_info ctinfo; + struct net_device *dev; + struct net_device *src_dev; + struct net_device *dest_dev; + struct net_device *src_br_dev = NULL; + struct net_device *dest_br_dev = NULL; + struct nf_conntrack_tuple orig_tuple; + struct nf_conntrack_tuple reply_tuple; + SFE_NF_CONN_ACCT(acct); + + /* + * Don't process broadcast or multicast packets. 
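+	 * (skb->pkt_type was classified from the destination MAC by
+	 * eth_type_trans() on receive, so this is a cheap check that does
+	 * not require looking at the IP header.)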
+ */ + if (unlikely(skb->pkt_type == PACKET_BROADCAST)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_BROADCAST); + DEBUG_TRACE("broadcast, ignoring\n"); + return NF_ACCEPT; + } + if (unlikely(skb->pkt_type == PACKET_MULTICAST)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_PACKET_MULTICAST); + DEBUG_TRACE("multicast, ignoring\n"); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + /* + * Packet to xfrm for encapsulation, we can't process it + */ + if (unlikely(skb_dst(skb)->xfrm)) { + DEBUG_TRACE("packet to xfrm, ignoring\n"); + return NF_ACCEPT; + } +#endif + + /* + * Don't process locally generated packets. + */ + if (skb->sk) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_LOCAL_OUT); + DEBUG_TRACE("skip local out packet\n"); + return NF_ACCEPT; + } + + /* + * Don't process packets that are not being forwarded. + */ + in = dev_get_by_index(&init_net, skb->skb_iif); + if (!in) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_IIF); + DEBUG_TRACE("packet not forwarding\n"); + return NF_ACCEPT; + } + + dev_put(in); + + /* + * Don't process packets that aren't being tracked by conntrack. + */ + ct = nf_ct_get(skb, &ctinfo); + if (unlikely(!ct)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_CT); + DEBUG_TRACE("no conntrack connection, ignoring\n"); + return NF_ACCEPT; + } + + /* + * Don't process untracked connections. + */ + if (unlikely(nf_ct_is_untracked(ct))) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_TRACK); + DEBUG_TRACE("untracked connection\n"); + return NF_ACCEPT; + } + + /* + * Unconfirmed connection may be dropped by Linux at the final step, + * So we don't process unconfirmed connections. + */ + if (!nf_ct_is_confirmed(ct)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_NO_CONFIRM); + DEBUG_TRACE("unconfirmed connection\n"); + return NF_ACCEPT; + } + + /* + * Don't process connections that require support from a 'helper' (typically a NAT ALG). + */ + if (unlikely(nfct_help(ct))) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_CT_IS_ALG); + DEBUG_TRACE("connection has helper\n"); + return NF_ACCEPT; + } + + /* + * Check if the acceleration of a flow could be rejected quickly. + */ + acct = nf_conn_acct_find(ct); + if (acct) { + long long packets = atomic64_read(&SFE_ACCT_COUNTER(acct)[CTINFO2DIR(ctinfo)].packets); + if ((packets > 0xff) && (packets & 0xff)) { + /* + * Connection hits slow path at least 256 times, so it must be not able to accelerate. + * But we also give it a chance to walk through ECM every 256 packets + */ + return NF_ACCEPT; + } + } + + /* + * Look up the details of our connection in conntrack. + * + * Note that the data we get from conntrack is for the "ORIGINAL" direction + * but our packet may actually be in the "REPLY" direction. + */ + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + reply_tuple = ct->tuplehash[IP_CT_DIR_REPLY].tuple; + sic.protocol = (s32)orig_tuple.dst.protonum; + + sic.flags = 0; + + /* + * Get addressing information, non-NAT first + */ + if (likely(is_v4)) { + u32 dscp; + + sic.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sic.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + if (ipv4_is_multicast(sic.src_ip.ip) || ipv4_is_multicast(sic.dest_ip.ip)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV4_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. 
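+		 *
+		 * For example (addresses illustrative), a LAN client
+		 * 192.168.1.10:40000 talking to 8.8.8.8:53 through a router
+		 * that masquerades as 1.2.3.4 has:
+		 *	orig:  src 192.168.1.10:40000, dst 8.8.8.8:53
+		 *	reply: src 8.8.8.8:53, dst 1.2.3.4:40000
+		 * so src_ip_xlate = reply.dst = 1.2.3.4 and
+		 * dest_ip_xlate = reply.src = 8.8.8.8.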
+ */ + sic.src_ip_xlate.ip = (__be32)reply_tuple.dst.u3.ip; + sic.dest_ip_xlate.ip = (__be32)reply_tuple.src.u3.ip; + + dscp = ipv4_get_dsfield(ip_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } else { + u32 dscp; + + sic.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sic.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + if (ipv6_addr_is_multicast((struct in6_addr *)sic.src_ip.ip6) || + ipv6_addr_is_multicast((struct in6_addr *)sic.dest_ip.ip6)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_IS_IPV6_MCAST); + DEBUG_TRACE("multicast address\n"); + return NF_ACCEPT; + } + + /* + * NAT'ed addresses - note these are as seen from the 'reply' direction + * When NAT does not apply to this connection these will be identical to the above. + */ + sic.src_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.dst.u3.in6); + sic.dest_ip_xlate.ip6[0] = *((struct sfe_ipv6_addr *)&reply_tuple.src.u3.in6); + + dscp = ipv6_get_dsfield(ipv6_hdr(skb)) >> XT_DSCP_SHIFT; + if (dscp) { + sic.dest_dscp = dscp; + sic.src_dscp = sic.dest_dscp; + sic.flags |= SFE_CREATE_FLAG_REMARK_DSCP; + } + } + + switch (sic.protocol) { + case IPPROTO_TCP: + sic.src_port = orig_tuple.src.u.tcp.port; + sic.dest_port = orig_tuple.dst.u.tcp.port; + sic.src_port_xlate = reply_tuple.dst.u.tcp.port; + sic.dest_port_xlate = reply_tuple.src.u.tcp.port; + sic.src_td_window_scale = ct->proto.tcp.seen[0].td_scale; + sic.src_td_max_window = ct->proto.tcp.seen[0].td_maxwin; + sic.src_td_end = ct->proto.tcp.seen[0].td_end; + sic.src_td_max_end = ct->proto.tcp.seen[0].td_maxend; + sic.dest_td_window_scale = ct->proto.tcp.seen[1].td_scale; + sic.dest_td_max_window = ct->proto.tcp.seen[1].td_maxwin; + sic.dest_td_end = ct->proto.tcp.seen[1].td_end; + sic.dest_td_max_end = ct->proto.tcp.seen[1].td_maxend; + + if (nf_ct_tcp_no_window_check + || (ct->proto.tcp.seen[0].flags & IP_CT_TCP_FLAG_BE_LIBERAL) + || (ct->proto.tcp.seen[1].flags & IP_CT_TCP_FLAG_BE_LIBERAL)) { + sic.flags |= SFE_CREATE_FLAG_NO_SEQ_CHECK; + } + + /* + * Don't try to manage a non-established connection. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ASSURED); + DEBUG_TRACE("non-established connection\n"); + return NF_ACCEPT; + } + + /* + * If the connection is shutting down do not manage it. + * state can not be SYN_SENT, SYN_RECV because connection is assured + * Not managed states: FIN_WAIT, CLOSE_WAIT, LAST_ACK, TIME_WAIT, CLOSE. 
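+		 * In other words, only flows in TCP_CONNTRACK_ESTABLISHED are
+		 * offloaded; the state is checked under ct->lock below because
+		 * conntrack may move it concurrently.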
+ */ + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.state != TCP_CONNTRACK_ESTABLISHED) { + spin_unlock_bh(&ct->lock); + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_TCP_NOT_ESTABLISHED); + DEBUG_TRACE("connection in termination state: %#x, s: %pI4:%u, d: %pI4:%u\n", + ct->proto.tcp.state, &sic.src_ip, ntohs(sic.src_port), + &sic.dest_ip, ntohs(sic.dest_port)); + return NF_ACCEPT; + } + spin_unlock_bh(&ct->lock); + break; + + case IPPROTO_UDP: + sic.src_port = orig_tuple.src.u.udp.port; + sic.dest_port = orig_tuple.dst.u.udp.port; + sic.src_port_xlate = reply_tuple.dst.u.udp.port; + sic.dest_port_xlate = reply_tuple.src.u.udp.port; + break; + + default: + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_UNKNOW_PROTOCOL); + DEBUG_TRACE("unhandled protocol %d\n", sic.protocol); + return NF_ACCEPT; + } + +#ifdef CONFIG_XFRM + sic.original_accel = 1; + sic.reply_accel = 1; + + /* + * For packets de-capsulated from xfrm, we still can accelerate it + * on the direction we just received the packet. + */ + if (unlikely(skb->sp)) { + if (sic.protocol == IPPROTO_TCP && + !(sic.flags & SFE_CREATE_FLAG_NO_SEQ_CHECK)) { + return NF_ACCEPT; + } + + if (CTINFO2DIR(ctinfo) == IP_CT_DIR_ORIGINAL) { + sic.reply_accel = 0; + } else { + sic.original_accel = 0; + } + } +#endif + + /* + * Get QoS information + */ + if (skb->priority) { + sic.dest_priority = skb->priority; + sic.src_priority = sic.dest_priority; + sic.flags |= SFE_CREATE_FLAG_REMARK_PRIORITY; + } + + /* + * Get the net device and MAC addresses that correspond to the various source and + * destination host addresses. + */ + if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip, &src_dev, sic.src_mac, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_DEV); + return NF_ACCEPT; + } + + if (!sfe_cm_find_dev_and_mac_addr(&sic.src_ip_xlate, &dev, sic.src_mac_xlate, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_SRC_XLATE_DEV); + goto done1; + } + + dev_put(dev); + + if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip, &dev, sic.dest_mac, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_DEV); + goto done1; + } + + dev_put(dev); + + if (!sfe_cm_find_dev_and_mac_addr(&sic.dest_ip_xlate, &dest_dev, sic.dest_mac_xlate, is_v4)) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_DEST_XLATE_DEV); + goto done1; + } + + /* + * Our devices may actually be part of a bridge interface. If that's + * the case then find the bridge interface instead. + */ + if (src_dev->priv_flags & IFF_BRIDGE_PORT) { + src_br_dev = sfe_dev_get_master(src_dev); + if (!src_br_dev) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", src_dev->name); + goto done2; + } + + src_dev = src_br_dev; + } + + if (dest_dev->priv_flags & IFF_BRIDGE_PORT) { + dest_br_dev = sfe_dev_get_master(dest_dev); + if (!dest_br_dev) { + sfe_cm_incr_exceptions(SFE_CM_EXCEPTION_NO_BRIDGE); + DEBUG_TRACE("no bridge found for: %s\n", dest_dev->name); + goto done3; + } + + dest_dev = dest_br_dev; + } + + sic.src_dev = src_dev; + sic.dest_dev = dest_dev; + + sic.src_mtu = src_dev->mtu; + sic.dest_mtu = dest_dev->mtu; + sic.mark = skb->mark; + if (likely(is_v4)) { + sfe_ipv4_create_rule(&sic); + } else { + sfe_ipv6_create_rule(&sic); + } + + /* + * If we had bridge ports then release them too. 
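+	 * (sfe_dev_get_master() took a reference on each bridge device via
+	 * dev_hold(), so each one must be released with dev_put() exactly
+	 * once.)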
+ */ + if (dest_br_dev) { + dev_put(dest_br_dev); + } + +done3: + if (src_br_dev) { + dev_put(src_br_dev); + } + +done2: + dev_put(dest_dev); + +done1: + dev_put(src_dev); + + return NF_ACCEPT; +} + +/* + * sfe_cm_ipv4_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +sfe_cm_ipv4_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return sfe_cm_post_routing(skb, true); +} + +/* + * sfe_cm_ipv6_post_routing_hook() + * Called for packets about to leave the box - either locally generated or forwarded from another interface + */ +sfe_cm_ipv6_post_routing_hook(hooknum, ops, skb, in_unused, out, okfn) +{ + return sfe_cm_post_routing(skb, false); +} + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +/* + * sfe_cm_conntrack_event() + * Callback event invoked when a conntrack connection's state changes. + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static int sfe_cm_conntrack_event(struct notifier_block *this, + unsigned long events, void *ptr) +#else +static int sfe_cm_conntrack_event(unsigned int events, struct nf_ct_event *item) +#endif +{ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + struct nf_ct_event *item = ptr; +#endif + struct sfe_connection_destroy sid; + struct nf_conn *ct = item->ct; + struct nf_conntrack_tuple orig_tuple; + + /* + * If we don't have a conntrack entry then we're done. + */ + if (unlikely(!ct)) { + DEBUG_WARN("no ct in conntrack event callback\n"); + return NOTIFY_DONE; + } + + /* + * If this is an untracked connection then we can't have any state either. + */ + if (unlikely(nf_ct_is_untracked(ct))) { + DEBUG_TRACE("ignoring untracked conn\n"); + return NOTIFY_DONE; + } + + /* + * We're only interested in destroy events. + */ + if (unlikely(!(events & (1 << IPCT_DESTROY)))) { + DEBUG_TRACE("ignoring non-destroy event\n"); + return NOTIFY_DONE; + } + + orig_tuple = ct->tuplehash[IP_CT_DIR_ORIGINAL].tuple; + sid.protocol = (s32)orig_tuple.dst.protonum; + + /* + * Extract information from the conntrack connection. We're only interested + * in nominal connection information (i.e. we're ignoring any NAT information). + */ + switch (sid.protocol) { + case IPPROTO_TCP: + sid.src_port = orig_tuple.src.u.tcp.port; + sid.dest_port = orig_tuple.dst.u.tcp.port; + break; + + case IPPROTO_UDP: + sid.src_port = orig_tuple.src.u.udp.port; + sid.dest_port = orig_tuple.dst.u.udp.port; + break; + + default: + DEBUG_TRACE("unhandled protocol: %d\n", sid.protocol); + return NOTIFY_DONE; + } + + if (likely(nf_ct_l3num(ct) == AF_INET)) { + sid.src_ip.ip = (__be32)orig_tuple.src.u3.ip; + sid.dest_ip.ip = (__be32)orig_tuple.dst.u3.ip; + + sfe_ipv4_destroy_rule(&sid); + } else if (likely(nf_ct_l3num(ct) == AF_INET6)) { + sid.src_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.src.u3.in6); + sid.dest_ip.ip6[0] = *((struct sfe_ipv6_addr *)&orig_tuple.dst.u3.in6); + + sfe_ipv6_destroy_rule(&sid); + } else { + DEBUG_TRACE("ignoring non-IPv4 and non-IPv6 connection\n"); + } + + return NOTIFY_DONE; +} + +/* + * Netfilter conntrack event system to monitor connection tracking changes + */ +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +static struct notifier_block sfe_cm_conntrack_notifier = { + .notifier_call = sfe_cm_conntrack_event, +}; +#else +static struct nf_ct_event_notifier sfe_cm_conntrack_notifier = { + .fcn = sfe_cm_conntrack_event, +}; +#endif +#endif + +/* + * Structure to establish a hook into the post routing netfilter point - this + * will pick up local outbound and packets going from one interface to another. 
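+ * (Locally generated packets are subsequently discarded in
+ * sfe_cm_post_routing() by the skb->sk check, so in practice only
+ * forwarded traffic is offloaded.)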
+ * + * Note: see include/linux/netfilter_ipv4.h for info related to priority levels. + * We want to examine packets after NAT translation and any ALG processing. + */ +static struct nf_hook_ops sfe_cm_ops_post_routing[] __read_mostly = { + SFE_IPV4_NF_POST_ROUTING_HOOK(__sfe_cm_ipv4_post_routing_hook), + SFE_IPV6_NF_POST_ROUTING_HOOK(__sfe_cm_ipv6_post_routing_hook), +}; + +/* + * sfe_cm_sync_rule() + * Synchronize a connection's state. + */ +static void sfe_cm_sync_rule(struct sfe_connection_sync *sis) +{ + struct nf_conntrack_tuple_hash *h; + struct nf_conntrack_tuple tuple; + struct nf_conn *ct; + SFE_NF_CONN_ACCT(acct); + + /* + * Create a tuple so as to be able to look up a connection + */ + memset(&tuple, 0, sizeof(tuple)); + tuple.src.u.all = (__be16)sis->src_port; + tuple.dst.dir = IP_CT_DIR_ORIGINAL; + tuple.dst.protonum = (u8)sis->protocol; + tuple.dst.u.all = (__be16)sis->dest_port; + + if (sis->is_v6) { + tuple.src.u3.in6 = *((struct in6_addr *)sis->src_ip.ip6); + tuple.dst.u3.in6 = *((struct in6_addr *)sis->dest_ip.ip6); + tuple.src.l3num = AF_INET6; + + DEBUG_TRACE("update connection - p: %d, s: %pI6:%u, d: %pI6:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.in6, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.in6, (unsigned int)ntohs(tuple.dst.u.all)); + } else { + tuple.src.u3.ip = sis->src_ip.ip; + tuple.dst.u3.ip = sis->dest_ip.ip; + tuple.src.l3num = AF_INET; + + DEBUG_TRACE("update connection - p: %d, s: %pI4:%u, d: %pI4:%u\n", + (int)tuple.dst.protonum, + &tuple.src.u3.ip, (unsigned int)ntohs(tuple.src.u.all), + &tuple.dst.u3.ip, (unsigned int)ntohs(tuple.dst.u.all)); + } + + /* + * Look up conntrack connection + */ + h = nf_conntrack_find_get(&init_net, SFE_NF_CT_DEFAULT_ZONE, &tuple); + if (unlikely(!h)) { + DEBUG_TRACE("no connection found\n"); + return; + } + + ct = nf_ct_tuplehash_to_ctrack(h); + NF_CT_ASSERT(ct->timeout.data == (unsigned long)ct); + + /* + * Only update if this is not a fixed timeout + */ + if (!test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) { + spin_lock_bh(&ct->lock); +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)) + ct->timeout += sis->delta_jiffies; +#else + ct->timeout.expires += sis->delta_jiffies; +#endif + spin_unlock_bh(&ct->lock); + } + + acct = nf_conn_acct_find(ct); + if (acct) { + spin_lock_bh(&ct->lock); + atomic64_add(sis->src_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].packets); + atomic64_add(sis->src_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_ORIGINAL].bytes); + atomic64_add(sis->dest_new_packet_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + atomic64_add(sis->dest_new_byte_count, &SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].bytes); + spin_unlock_bh(&ct->lock); + } + + switch (sis->protocol) { + case IPPROTO_TCP: + spin_lock_bh(&ct->lock); + if (ct->proto.tcp.seen[0].td_maxwin < sis->src_td_max_window) { + ct->proto.tcp.seen[0].td_maxwin = sis->src_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[0].td_end - sis->src_td_end) < 0) { + ct->proto.tcp.seen[0].td_end = sis->src_td_end; + } + if ((s32)(ct->proto.tcp.seen[0].td_maxend - sis->src_td_max_end) < 0) { + ct->proto.tcp.seen[0].td_maxend = sis->src_td_max_end; + } + if (ct->proto.tcp.seen[1].td_maxwin < sis->dest_td_max_window) { + ct->proto.tcp.seen[1].td_maxwin = sis->dest_td_max_window; + } + if ((s32)(ct->proto.tcp.seen[1].td_end - sis->dest_td_end) < 0) { + ct->proto.tcp.seen[1].td_end = sis->dest_td_end; + } + if ((s32)(ct->proto.tcp.seen[1].td_maxend - sis->dest_td_max_end) < 0) { + ct->proto.tcp.seen[1].td_maxend = 
sis->dest_td_max_end; + } + spin_unlock_bh(&ct->lock); + break; + case IPPROTO_UDP: + /* + * In Linux connection track, UDP flow has two timeout values: + * /proc/sys/net/netfilter/nf_conntrack_udp_timeout: + * this is for uni-direction UDP flow, normally its value is 60 seconds + * /proc/sys/net/netfilter/nf_conntrack_udp_timeout_stream: + * this is for bi-direction UDP flow, normally its value is 180 seconds + * + * Linux will update timer of UDP flow to stream timeout once it seen packets + * in reply direction. But if flow is accelerated by NSS or SFE, Linux won't + * see any packets. So we have to do the same thing in our stats sync message. + */ + if (!test_bit(IPS_ASSURED_BIT, &ct->status) && acct) { + u_int64_t reply_pkts = atomic64_read(&SFE_ACCT_COUNTER(acct)[IP_CT_DIR_REPLY].packets); + + if (reply_pkts != 0) { + struct nf_conntrack_l4proto *l4proto; + unsigned int *timeouts; + + set_bit(IPS_SEEN_REPLY_BIT, &ct->status); + set_bit(IPS_ASSURED_BIT, &ct->status); + + l4proto = __nf_ct_l4proto_find((sis->is_v6 ? AF_INET6 : AF_INET), IPPROTO_UDP); + timeouts = nf_ct_timeout_lookup(&init_net, ct, l4proto); + + spin_lock_bh(&ct->lock); +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(4, 9, 0)) + ct->timeout = nfct_time_stamp + timeouts[UDP_CT_REPLIED]; +#else + ct->timeout.expires = jiffies + timeouts[UDP_CT_REPLIED]; +#endif + spin_unlock_bh(&ct->lock); + } + } + break; + } + + /* + * Release connection + */ + nf_ct_put(ct); +} + +/* + * sfe_cm_device_event() + */ +static int sfe_cm_device_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = SFE_DEV_EVENT_PTR(ptr); + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_inet_event() + */ +static int sfe_cm_inet_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv4_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_inet6_event() + */ +static int sfe_cm_inet6_event(struct notifier_block *this, unsigned long event, void *ptr) +{ + struct net_device *dev = ((struct inet6_ifaddr *)ptr)->idev->dev; + + if (dev && (event == NETDEV_DOWN)) { + sfe_ipv6_destroy_all_rules_for_dev(dev); + } + + return NOTIFY_DONE; +} + +/* + * sfe_cm_get_exceptions + * dump exception counters + */ +static ssize_t sfe_cm_get_exceptions(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int idx, len; + struct sfe_cm *sc = &__sc; + + spin_lock_bh(&sc->lock); + for (len = 0, idx = 0; idx < SFE_CM_EXCEPTION_MAX; idx++) { + if (sc->exceptions[idx]) { + len += snprintf(buf + len, (ssize_t)(PAGE_SIZE - len), "%s = %d\n", sfe_cm_exception_events_string[idx], sc->exceptions[idx]); + } + } + spin_unlock_bh(&sc->lock); + + return len; +} + +/* + * sfe_cm_get_stop + * dump stop + */ +static ssize_t sfe_cm_get_stop(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + int (*fast_recv)(struct sk_buff *skb); + rcu_read_lock(); + fast_recv = rcu_dereference(fast_nat_recv); + rcu_read_unlock(); + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", fast_recv ? 
0 : 1); +} + +static ssize_t sfe_cm_set_stop(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + int ret; + u32 num; + int (*fast_recv)(struct sk_buff *skb); + + ret = kstrtou32(buf, 0, &num); + if (ret) + return ret; + + /* + * Hook/Unhook the receive path in the network stack. + */ + if (num) { + RCU_INIT_POINTER(fast_nat_recv, NULL); + } else { + rcu_read_lock(); + fast_recv = rcu_dereference(fast_nat_recv); + rcu_read_unlock(); + if (!fast_recv) { + BUG_ON(fast_nat_recv); + RCU_INIT_POINTER(fast_nat_recv, sfe_cm_recv); + } + } + + DEBUG_TRACE("sfe_cm_stop = %d\n", num); + return count; +} + +/* + * sfe_cm_get_defunct_all + * dump state of SFE + */ +static ssize_t sfe_cm_get_defunct_all(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + return snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", 0); +} + +static ssize_t sfe_cm_set_defunct_all(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + sfe_ipv6_destroy_all_rules_for_dev(NULL); + return count; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_attrs[] = { + __ATTR(exceptions, S_IRUGO, sfe_cm_get_exceptions, NULL), + __ATTR(stop, S_IWUSR | S_IRUGO, sfe_cm_get_stop, sfe_cm_set_stop), + __ATTR(defunct_all, S_IWUSR | S_IRUGO, sfe_cm_get_defunct_all, sfe_cm_set_defunct_all), +}; + +/* + * sfe_cm_init() + */ +static int __init sfe_cm_init(void) +{ + struct sfe_cm *sc = &__sc; + int result = -1; + size_t i, j; + + DEBUG_INFO("SFE CM init\n"); + + /* + * Create sys/sfe_cm + */ + sc->sys_sfe_cm = kobject_create_and_add("sfe_cm", NULL); + if (!sc->sys_sfe_cm) { + DEBUG_ERROR("failed to register sfe_cm\n"); + goto exit1; + } + + for (i = 0; i < ARRAY_SIZE(sfe_attrs); i++) { + result = sysfs_create_file(sc->sys_sfe_cm, &sfe_attrs[i].attr); + if (result) { + DEBUG_ERROR("failed to register %s : %d\n", + sfe_attrs[i].attr.name, result); + goto exit2; + } + } + + sc->dev_notifier.notifier_call = sfe_cm_device_event; + sc->dev_notifier.priority = 1; + register_netdevice_notifier(&sc->dev_notifier); + + sc->inet_notifier.notifier_call = sfe_cm_inet_event; + sc->inet_notifier.priority = 1; + register_inetaddr_notifier(&sc->inet_notifier); + + sc->inet6_notifier.notifier_call = sfe_cm_inet6_event; + sc->inet6_notifier.priority = 1; + register_inet6addr_notifier(&sc->inet6_notifier); + /* + * Register our netfilter hooks. + */ + result = nf_register_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); + if (result < 0) { + DEBUG_ERROR("can't register nf post routing hook: %d\n", result); + goto exit3; + } + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + /* + * Register a notifier hook to get fast notifications of expired connections. + * Note: In CONFIG_NF_CONNTRACK_CHAIN_EVENTS enabled case, nf_conntrack_register_notifier() + * function always returns 0. + */ + +#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + (void)nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier); +#else + result = nf_conntrack_register_notifier(&init_net, &sfe_cm_conntrack_notifier); + if (result < 0) { + DEBUG_ERROR("can't register nf notifier hook: %d\n", result); + goto exit4; + } +#endif +#endif + + spin_lock_init(&sc->lock); + + /* + * Hook the shortcut sync callback. 
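+	 *
+	 * Note that, unlike fast-classifier, sfe-cm does not hook
+	 * fast_nat_recv here; the receive path stays disabled until
+	 * userspace enables it via the sysfs node created above, e.g.:
+	 *
+	 *	echo 0 > /sys/sfe_cm/stop
+	 *
+	 * which makes sfe_cm_set_stop() install sfe_cm_recv().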
+ */ + sfe_ipv4_register_sync_rule_callback(sfe_cm_sync_rule); + sfe_ipv6_register_sync_rule_callback(sfe_cm_sync_rule); + return 0; + +#ifdef CONFIG_NF_CONNTRACK_EVENTS +#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS +exit4: + nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); +#endif +#endif +exit3: + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); +exit2: + for (j = 0; j < i; j++) { + sysfs_remove_file(sc->sys_sfe_cm, &sfe_attrs[j].attr); + } + kobject_put(sc->sys_sfe_cm); + +exit1: + return result; +} + +/* + * sfe_cm_exit() + */ +static void __exit sfe_cm_exit(void) +{ + struct sfe_cm *sc = &__sc; + + DEBUG_INFO("SFE CM exit\n"); + + /* + * Unregister our sync callback. + */ + sfe_ipv4_register_sync_rule_callback(NULL); + sfe_ipv6_register_sync_rule_callback(NULL); + + /* + * Unregister our receive callback. + */ + RCU_INIT_POINTER(fast_nat_recv, NULL); + + /* + * Wait for all callbacks to complete. + */ + rcu_barrier(); + + /* + * Destroy all connections. + */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + sfe_ipv6_destroy_all_rules_for_dev(NULL); + +#ifdef CONFIG_NF_CONNTRACK_EVENTS + nf_conntrack_unregister_notifier(&init_net, &sfe_cm_conntrack_notifier); + +#endif + nf_unregister_hooks(sfe_cm_ops_post_routing, ARRAY_SIZE(sfe_cm_ops_post_routing)); + + unregister_inet6addr_notifier(&sc->inet6_notifier); + unregister_inetaddr_notifier(&sc->inet_notifier); + unregister_netdevice_notifier(&sc->dev_notifier); + + kobject_put(sc->sys_sfe_cm); +} + +module_init(sfe_cm_init) +module_exit(sfe_cm_exit) + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - Connection Manager"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/package/kernel/shortcut-fe/src/sfe_cm.h b/package/kernel/shortcut-fe/src/sfe_cm.h new file mode 100644 index 000000000000..6f2a819d125f --- /dev/null +++ b/package/kernel/shortcut-fe/src/sfe_cm.h @@ -0,0 +1,222 @@ +/* + * sfe_cm.h + * Shortcut forwarding engine. + * + * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +/* + * connection flags. + */ +#define SFE_CREATE_FLAG_NO_SEQ_CHECK BIT(0) + /* Indicates that we should not check sequence numbers */ +#define SFE_CREATE_FLAG_REMARK_PRIORITY BIT(1) + /* Indicates that we should remark priority of skb */ +#define SFE_CREATE_FLAG_REMARK_DSCP BIT(2) + /* Indicates that we should remark DSCP of packet */ + +/* + * IPv6 address structure + */ +struct sfe_ipv6_addr { + __be32 addr[4]; +}; + +typedef union { + __be32 ip; + struct sfe_ipv6_addr ip6[1]; +} sfe_ip_addr_t; + +/* + * connection creation structure. 
+ */ +struct sfe_connection_create { + int protocol; + struct net_device *src_dev; + struct net_device *dest_dev; + u32 flags; + u32 src_mtu; + u32 dest_mtu; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t src_ip_xlate; + sfe_ip_addr_t dest_ip; + sfe_ip_addr_t dest_ip_xlate; + __be16 src_port; + __be16 src_port_xlate; + __be16 dest_port; + __be16 dest_port_xlate; + u8 src_mac[ETH_ALEN]; + u8 src_mac_xlate[ETH_ALEN]; + u8 dest_mac[ETH_ALEN]; + u8 dest_mac_xlate[ETH_ALEN]; + u8 src_td_window_scale; + u32 src_td_max_window; + u32 src_td_end; + u32 src_td_max_end; + u8 dest_td_window_scale; + u32 dest_td_max_window; + u32 dest_td_end; + u32 dest_td_max_end; + u32 mark; +#ifdef CONFIG_XFRM + u32 original_accel; + u32 reply_accel; +#endif + u32 src_priority; + u32 dest_priority; + u32 src_dscp; + u32 dest_dscp; +}; + +/* + * connection destruction structure. + */ +struct sfe_connection_destroy { + int protocol; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t dest_ip; + __be16 src_port; + __be16 dest_port; +}; + +typedef enum sfe_sync_reason { + SFE_SYNC_REASON_STATS, /* Sync is to synchronize stats */ + SFE_SYNC_REASON_FLUSH, /* Sync is to flush a entry */ + SFE_SYNC_REASON_DESTROY /* Sync is to destroy a entry(requested by connection manager) */ +} sfe_sync_reason_t; + +/* + * Structure used to sync connection stats/state back within the system. + * + * NOTE: The addresses here are NON-NAT addresses, i.e. the true endpoint addressing. + * 'src' is the creator of the connection. + */ +struct sfe_connection_sync { + struct net_device *src_dev; + struct net_device *dest_dev; + int is_v6; /* Is it for ipv6? */ + int protocol; /* IP protocol number (IPPROTO_...) */ + sfe_ip_addr_t src_ip; /* Non-NAT source address, i.e. the creator of the connection */ + sfe_ip_addr_t src_ip_xlate; /* NATed source address */ + __be16 src_port; /* Non-NAT source port */ + __be16 src_port_xlate; /* NATed source port */ + sfe_ip_addr_t dest_ip; /* Non-NAT destination address, i.e. to whom the connection was created */ + sfe_ip_addr_t dest_ip_xlate; /* NATed destination address */ + __be16 dest_port; /* Non-NAT destination port */ + __be16 dest_port_xlate; /* NATed destination port */ + u32 src_td_max_window; + u32 src_td_end; + u32 src_td_max_end; + u64 src_packet_count; + u64 src_byte_count; + u32 src_new_packet_count; + u32 src_new_byte_count; + u32 dest_td_max_window; + u32 dest_td_end; + u32 dest_td_max_end; + u64 dest_packet_count; + u64 dest_byte_count; + u32 dest_new_packet_count; + u32 dest_new_byte_count; + u32 reason; /* reason for stats sync message, i.e. destroy, flush, period sync */ + u64 delta_jiffies; /* Time to be added to the current timeout to keep the connection alive */ +}; + +/* + * connection mark structure + */ +struct sfe_connection_mark { + int protocol; + sfe_ip_addr_t src_ip; + sfe_ip_addr_t dest_ip; + __be16 src_port; + __be16 dest_port; + u32 mark; +}; + +/* + * Expose the hook for the receive processing. + */ +extern int (*fast_nat_recv)(struct sk_buff *skb); + +/* + * Expose what should be a static flag in the TCP connection tracker. + */ +extern int nf_ct_tcp_no_window_check; + +/* + * This callback will be called in a timer + * at 100 times per second to sync stats back to + * Linux connection track. + * + * A RCU lock is taken to prevent this callback + * from unregistering. 
+ */ +typedef void (*sfe_sync_rule_callback_t)(struct sfe_connection_sync *); + +/* + * IPv4 APIs used by connection manager + */ +int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb); +int sfe_ipv4_create_rule(struct sfe_connection_create *sic); +void sfe_ipv4_destroy_rule(struct sfe_connection_destroy *sid); +void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev); +void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t callback); +void sfe_ipv4_update_rule(struct sfe_connection_create *sic); +void sfe_ipv4_mark_rule(struct sfe_connection_mark *mark); + +/* + * IPv6 APIs used by connection manager + */ +int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb); +int sfe_ipv6_create_rule(struct sfe_connection_create *sic); +void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid); +void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev); +void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t callback); +void sfe_ipv6_update_rule(struct sfe_connection_create *sic); +void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark); + +/* + * sfe_ipv6_addr_equal() + * compare ipv6 address + * + * return: 1, equal; 0, no equal + */ +static inline int sfe_ipv6_addr_equal(struct sfe_ipv6_addr *a, + struct sfe_ipv6_addr *b) +{ + return a->addr[0] == b->addr[0] && + a->addr[1] == b->addr[1] && + a->addr[2] == b->addr[2] && + a->addr[3] == b->addr[3]; +} + +/* + * sfe_ipv4_addr_equal() + * compare ipv4 address + * + * return: 1, equal; 0, no equal + */ +#define sfe_ipv4_addr_equal(a, b) ((u32)(a) == (u32)(b)) + +/* + * sfe_addr_equal() + * compare ipv4 or ipv6 address + * + * return: 1, equal; 0, no equal + */ +static inline int sfe_addr_equal(sfe_ip_addr_t *a, + sfe_ip_addr_t *b, int is_v4) +{ + return is_v4 ? sfe_ipv4_addr_equal(a->ip, b->ip) : sfe_ipv6_addr_equal(a->ip6, b->ip6); +} diff --git a/package/kernel/shortcut-fe/src/sfe_ipv4.c b/package/kernel/shortcut-fe/src/sfe_ipv4.c new file mode 100644 index 000000000000..34ed09b58a01 --- /dev/null +++ b/package/kernel/shortcut-fe/src/sfe_ipv4.c @@ -0,0 +1,3369 @@ +/* + * sfe_ipv4.c + * Shortcut forwarding engine - IPv4 edition. + * + * Copyright (c) 2013-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" + +/* + * By default Linux IP header and transport layer header structures are + * unpacked, assuming that such headers should be 32-bit aligned. + * Unfortunately some wireless adaptors can't cope with this requirement and + * some CPUs can't handle misaligned accesses. For those platforms we + * define SFE_IPV4_UNALIGNED_IP_HEADER and mark the structures as packed. 
+ * When we do this the compiler will generate slightly worse code than for the + * aligned case (on most platforms) but will be much quicker than fixing + * things up in an unaligned trap handler. + */ +#define SFE_IPV4_UNALIGNED_IP_HEADER 1 +#if SFE_IPV4_UNALIGNED_IP_HEADER +#define SFE_IPV4_UNALIGNED_STRUCT __attribute__((aligned(4))) +#else +#define SFE_IPV4_UNALIGNED_STRUCT +#endif + +/* + * An Ethernet header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_eth_hdr { + __be16 h_dest[ETH_ALEN / 2]; + __be16 h_source[ETH_ALEN / 2]; + __be16 h_proto; +} SFE_IPV4_UNALIGNED_STRUCT; + +#define SFE_IPV4_DSCP_MASK 0x3 +#define SFE_IPV4_DSCP_SHIFT 2 + +/* + * An IPv4 header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_ip_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 ihl:4, + version:4; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u8 version:4, + ihl:4; +#else +#error "Please fix " +#endif + __u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + __u8 ttl; + __u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; + + /* + * The options start here. + */ +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * A UDP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_udp_hdr { + __be16 source; + __be16 dest; + __be16 len; + __sum16 check; +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * A TCP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV4_UNALIGNED_STRUCT) + */ +struct sfe_ipv4_tcp_hdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +} SFE_IPV4_UNALIGNED_STRUCT; + +/* + * Specifies the lower bound on ACK numbers carried in the TCP header + */ +#define SFE_IPV4_TCP_MAX_ACK_WINDOW 65520 + +/* + * IPv4 TCP connection match additional data. + */ +struct sfe_ipv4_tcp_connection_match { + u8 win_scale; /* Window scale */ + u32 max_win; /* Maximum window size seen */ + u32 end; /* Sequence number of the next byte to send (seq + segment length) */ + u32 max_end; /* Sequence number of the last byte to ack */ +}; + +/* + * Bit flags for IPv4 connection matching entry. + */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) + /* Perform source translation */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) + /* Perform destination translation */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) + /* Ignore TCP sequence numbers */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) + /* Fast Ethernet header write */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) + /* Fast Ethernet header write */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) + /* remark priority of SKB */ +#define SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) + /* remark DSCP of packet */ + +/* + * IPv4 connection matching structure. 
+ */ +struct sfe_ipv4_connection_match { + /* + * References to other objects. + */ + struct sfe_ipv4_connection_match *next; + struct sfe_ipv4_connection_match *prev; + struct sfe_ipv4_connection *connection; + struct sfe_ipv4_connection_match *counter_match; + /* Matches the flow in the opposite direction as the one in *connection */ + struct sfe_ipv4_connection_match *active_next; + struct sfe_ipv4_connection_match *active_prev; + bool active; /* Flag to indicate if we're on the active list */ + + /* + * Characteristics that identify flows that match this rule. + */ + struct net_device *match_dev; /* Network device */ + u8 match_protocol; /* Protocol */ + __be32 match_src_ip; /* Source IP address */ + __be32 match_dest_ip; /* Destination IP address */ + __be16 match_src_port; /* Source port/connection ident */ + __be16 match_dest_port; /* Destination port/connection ident */ + + /* + * Control the operations of the match. + */ + u32 flags; /* Bit flags */ +#ifdef CONFIG_XFRM + u32 flow_accel; /* The flow accelerated or not */ +#endif + + /* + * Connection state that we track once we match. + */ + union { /* Protocol-specific state */ + struct sfe_ipv4_tcp_connection_match tcp; + } protocol_state; + /* + * Stats recorded in a sync period. These stats will be added to + * rx_packet_count64/rx_byte_count64 after a sync period. + */ + u32 rx_packet_count; + u32 rx_byte_count; + + /* + * Packet translation information. + */ + __be32 xlate_src_ip; /* Address after source translation */ + __be16 xlate_src_port; /* Port/connection ident after source translation */ + u16 xlate_src_csum_adjustment; + /* Transport layer checksum adjustment after source translation */ + u16 xlate_src_partial_csum_adjustment; + /* Transport layer pseudo header checksum adjustment after source translation */ + + __be32 xlate_dest_ip; /* Address after destination translation */ + __be16 xlate_dest_port; /* Port/connection ident after destination translation */ + u16 xlate_dest_csum_adjustment; + /* Transport layer checksum adjustment after destination translation */ + u16 xlate_dest_partial_csum_adjustment; + /* Transport layer pseudo header checksum adjustment after destination translation */ + + /* + * QoS information + */ + u32 priority; + u32 dscp; + + /* + * Packet transmit information. + */ + struct net_device *xmit_dev; /* Network device on which to transmit */ + unsigned short int xmit_dev_mtu; + /* Interface MTU */ + u16 xmit_dest_mac[ETH_ALEN / 2]; + /* Destination MAC address to use when forwarding */ + u16 xmit_src_mac[ETH_ALEN / 2]; + /* Source MAC address to use when forwarding */ + + /* + * Summary stats. + */ + u64 rx_packet_count64; + u64 rx_byte_count64; +}; + +/* + * Per-connection data structure. 
+ */ +struct sfe_ipv4_connection { + struct sfe_ipv4_connection *next; + /* Pointer to the next entry in a hash chain */ + struct sfe_ipv4_connection *prev; + /* Pointer to the previous entry in a hash chain */ + int protocol; /* IP protocol number */ + __be32 src_ip; /* Src IP addr pre-translation */ + __be32 src_ip_xlate; /* Src IP addr post-translation */ + __be32 dest_ip; /* Dest IP addr pre-translation */ + __be32 dest_ip_xlate; /* Dest IP addr post-translation */ + __be16 src_port; /* Src port pre-translation */ + __be16 src_port_xlate; /* Src port post-translation */ + __be16 dest_port; /* Dest port pre-translation */ + __be16 dest_port_xlate; /* Dest port post-translation */ + struct sfe_ipv4_connection_match *original_match; + /* Original direction matching structure */ + struct net_device *original_dev; + /* Original direction source device */ + struct sfe_ipv4_connection_match *reply_match; + /* Reply direction matching structure */ + struct net_device *reply_dev; /* Reply direction source device */ + u64 last_sync_jiffies; /* Jiffies count for the last sync */ + struct sfe_ipv4_connection *all_connections_next; + /* Pointer to the next entry in the list of all connections */ + struct sfe_ipv4_connection *all_connections_prev; + /* Pointer to the previous entry in the list of all connections */ + u32 mark; /* mark for outgoing packet */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * IPv4 connections and hash table size information. + */ +#define SFE_IPV4_CONNECTION_HASH_SHIFT 12 +#define SFE_IPV4_CONNECTION_HASH_SIZE (1 << SFE_IPV4_CONNECTION_HASH_SHIFT) +#define SFE_IPV4_CONNECTION_HASH_MASK (SFE_IPV4_CONNECTION_HASH_SIZE - 1) + +enum sfe_ipv4_exception_events { + SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL, + SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, + SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL, + SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, + SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS, + SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, + SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK, + SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, + SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL, + SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, + SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, + SFE_IPV4_EXCEPTION_EVENT_NON_V4, + SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, + SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE, + SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL, + 
SFE_IPV4_EXCEPTION_EVENT_CSUM_ERROR, + SFE_IPV4_EXCEPTION_EVENT_LAST +}; + +static char *sfe_ipv4_exception_events_string[SFE_IPV4_EXCEPTION_EVENT_LAST] = { + "UDP_HEADER_INCOMPLETE", + "UDP_NO_CONNECTION", + "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "UDP_SMALL_TTL", + "UDP_NEEDS_FRAGMENTATION", + "TCP_HEADER_INCOMPLETE", + "TCP_NO_CONNECTION_SLOW_FLAGS", + "TCP_NO_CONNECTION_FAST_FLAGS", + "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "TCP_SMALL_TTL", + "TCP_NEEDS_FRAGMENTATION", + "TCP_FLAGS", + "TCP_SEQ_EXCEEDS_RIGHT_EDGE", + "TCP_SMALL_DATA_OFFS", + "TCP_BAD_SACK", + "TCP_BIG_DATA_OFFS", + "TCP_SEQ_BEFORE_LEFT_EDGE", + "TCP_ACK_EXCEEDS_RIGHT_EDGE", + "TCP_ACK_BEFORE_LEFT_EDGE", + "ICMP_HEADER_INCOMPLETE", + "ICMP_UNHANDLED_TYPE", + "ICMP_IPV4_HEADER_INCOMPLETE", + "ICMP_IPV4_NON_V4", + "ICMP_IPV4_IP_OPTIONS_INCOMPLETE", + "ICMP_IPV4_UDP_HEADER_INCOMPLETE", + "ICMP_IPV4_TCP_HEADER_INCOMPLETE", + "ICMP_IPV4_UNHANDLED_PROTOCOL", + "ICMP_NO_CONNECTION", + "ICMP_FLUSHED_CONNECTION", + "HEADER_INCOMPLETE", + "BAD_TOTAL_LENGTH", + "NON_V4", + "NON_INITIAL_FRAGMENT", + "DATAGRAM_INCOMPLETE", + "IP_OPTIONS_INCOMPLETE", + "UNHANDLED_PROTOCOL", + "CSUM_ERROR" +}; + +/* + * Per-module structure. + */ +struct sfe_ipv4 { + spinlock_t lock; /* Lock for SMP correctness */ + struct sfe_ipv4_connection_match *active_head; + /* Head of the list of recently active connections */ + struct sfe_ipv4_connection_match *active_tail; + /* Tail of the list of recently active connections */ + struct sfe_ipv4_connection *all_connections_head; + /* Head of the list of all connections */ + struct sfe_ipv4_connection *all_connections_tail; + /* Tail of the list of all connections */ + unsigned int num_connections; /* Number of connections */ + struct timer_list timer; /* Timer used for periodic sync ops */ + sfe_sync_rule_callback_t __rcu sync_rule_callback; + /* Callback function registered by a connection manager for stats syncing */ + struct sfe_ipv4_connection *conn_hash[SFE_IPV4_CONNECTION_HASH_SIZE]; + /* Connection hash table */ + struct sfe_ipv4_connection_match *conn_match_hash[SFE_IPV4_CONNECTION_HASH_SIZE]; + /* Connection match hash table */ + + /* + * Stats recorded in a sync period. These stats will be added to + * connection_xxx64 after a sync period. + */ + u32 connection_create_requests; + /* Number of IPv4 connection create requests */ + u32 connection_create_collisions; + /* Number of IPv4 connection create requests that collided with existing hash table entries */ + u32 connection_destroy_requests; + /* Number of IPv4 connection destroy requests */ + u32 connection_destroy_misses; + /* Number of IPv4 connection destroy requests that missed our hash table */ + u32 connection_match_hash_hits; + /* Number of IPv4 connection match hash hits */ + u32 connection_match_hash_reorders; + /* Number of IPv4 connection match hash reorders */ + u32 connection_flushes; /* Number of IPv4 connection flushes */ + u32 packets_forwarded; /* Number of IPv4 packets forwarded */ + u32 packets_not_forwarded; /* Number of IPv4 packets not forwarded */ + u32 exception_events[SFE_IPV4_EXCEPTION_EVENT_LAST]; + + /* + * Summary statistics. 
+ */ + u64 connection_create_requests64; + /* Number of IPv4 connection create requests */ + u64 connection_create_collisions64; + /* Number of IPv4 connection create requests that collided with existing hash table entries */ + u64 connection_destroy_requests64; + /* Number of IPv4 connection destroy requests */ + u64 connection_destroy_misses64; + /* Number of IPv4 connection destroy requests that missed our hash table */ + u64 connection_match_hash_hits64; + /* Number of IPv4 connection match hash hits */ + u64 connection_match_hash_reorders64; + /* Number of IPv4 connection match hash reorders */ + u64 connection_flushes64; /* Number of IPv4 connection flushes */ + u64 packets_forwarded64; /* Number of IPv4 packets forwarded */ + u64 packets_not_forwarded64; + /* Number of IPv4 packets not forwarded */ + u64 exception_events64[SFE_IPV4_EXCEPTION_EVENT_LAST]; + + /* + * Control state. + */ + struct kobject *sys_sfe_ipv4; /* sysfs linkage */ + int debug_dev; /* Major number of the debug char device */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * Enumeration of the XML output. + */ +enum sfe_ipv4_debug_xml_states { + SFE_IPV4_DEBUG_XML_STATE_START, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_START, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, + SFE_IPV4_DEBUG_XML_STATE_CONNECTIONS_END, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_START, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, + SFE_IPV4_DEBUG_XML_STATE_EXCEPTIONS_END, + SFE_IPV4_DEBUG_XML_STATE_STATS, + SFE_IPV4_DEBUG_XML_STATE_END, + SFE_IPV4_DEBUG_XML_STATE_DONE +}; + +/* + * XML write state. + */ +struct sfe_ipv4_debug_xml_write_state { + enum sfe_ipv4_debug_xml_states state; + /* XML output file state machine state */ + int iter_exception; /* Next exception iterator */ +}; + +typedef bool (*sfe_ipv4_debug_xml_write_method_t)(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws); + +static struct sfe_ipv4 __si; + +/* + * sfe_ipv4_gen_ip_csum() + * Generate the IP checksum for an IPv4 header. + * + * Note that this function assumes that we have only 20 bytes of IP header. + */ +static inline u16 sfe_ipv4_gen_ip_csum(struct sfe_ipv4_ip_hdr *iph) +{ + u32 sum; + u16 *i = (u16 *)iph; + + iph->check = 0; + + /* + * Generate the sum. + */ + sum = i[0] + i[1] + i[2] + i[3] + i[4] + i[5] + i[6] + i[7] + i[8] + i[9]; + + /* + * Fold it to ones-complement form. + */ + sum = (sum & 0xffff) + (sum >> 16); + sum = (sum & 0xffff) + (sum >> 16); + + return (u16)sum ^ 0xffff; +} + +/* + * sfe_ipv4_get_connection_match_hash() + * Generate the hash used in connection match lookups. + */ +static inline unsigned int sfe_ipv4_get_connection_match_hash(struct net_device *dev, u8 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + size_t dev_addr = (size_t)dev; + u32 hash = ((u32)dev_addr) ^ ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv4_find_sfe_ipv4_connection_match() + * Get the IPv4 flow match info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. 
+ */ +static struct sfe_ipv4_connection_match * +sfe_ipv4_find_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct net_device *dev, u8 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *head; + unsigned int conn_match_idx; + + conn_match_idx = sfe_ipv4_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); + cm = si->conn_match_hash[conn_match_idx]; + + /* + * If we don't have anything in this chain then bail. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((cm->match_src_port == src_port) + && (cm->match_dest_port == dest_port) + && (cm->match_src_ip == src_ip) + && (cm->match_dest_ip == dest_ip) + && (cm->match_protocol == protocol) + && (cm->match_dev == dev)) { + si->connection_match_hash_hits++; + return cm; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain and + * move matching entry to the top of the hash chain. We presume that this + * will be reused again very quickly. + */ + head = cm; + do { + cm = cm->next; + } while (cm && (cm->match_src_port != src_port + || cm->match_dest_port != dest_port + || cm->match_src_ip != src_ip + || cm->match_dest_ip != dest_ip + || cm->match_protocol != protocol + || cm->match_dev != dev)); + + /* + * Not found then we're done. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * We found a match so move it. + */ + if (cm->next) { + cm->next->prev = cm->prev; + } + cm->prev->next = cm->next; + cm->prev = NULL; + cm->next = head; + head->prev = cm; + si->conn_match_hash[conn_match_idx] = cm; + si->connection_match_hash_reorders++; + + return cm; +} + +/* + * sfe_ipv4_connection_match_update_summary_stats() + * Update the summary stats for a connection match entry. + */ +static inline void sfe_ipv4_connection_match_update_summary_stats(struct sfe_ipv4_connection_match *cm) +{ + cm->rx_packet_count64 += cm->rx_packet_count; + cm->rx_packet_count = 0; + cm->rx_byte_count64 += cm->rx_byte_count; + cm->rx_byte_count = 0; +} + +/* + * sfe_ipv4_connection_match_compute_translations() + * Compute port and address translations for a connection match entry. + */ +static void sfe_ipv4_connection_match_compute_translations(struct sfe_ipv4_connection_match *cm) +{ + /* + * Before we insert the entry look to see if this is tagged as doing address + * translations. If it is then work out the adjustment that we need to apply + * to the transport checksum. + */ + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + u16 src_ip_hi = cm->match_src_ip >> 16; + u16 src_ip_lo = cm->match_src_ip & 0xffff; + u32 xlate_src_ip = ~cm->xlate_src_ip; + u16 xlate_src_ip_hi = xlate_src_ip >> 16; + u16 xlate_src_ip_lo = xlate_src_ip & 0xffff; + u16 xlate_src_port = ~cm->xlate_src_port; + u32 adj; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! 
+ */ + adj = src_ip_hi + src_ip_lo + cm->match_src_port + + xlate_src_ip_hi + xlate_src_ip_lo + xlate_src_port; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_csum_adjustment = (u16)adj; + + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { + /* + * Precompute an incremental checksum adjustment so we can + * edit packets in this stream very quickly. The algorithm is from RFC1624. + */ + u16 dest_ip_hi = cm->match_dest_ip >> 16; + u16 dest_ip_lo = cm->match_dest_ip & 0xffff; + u32 xlate_dest_ip = ~cm->xlate_dest_ip; + u16 xlate_dest_ip_hi = xlate_dest_ip >> 16; + u16 xlate_dest_ip_lo = xlate_dest_ip & 0xffff; + u16 xlate_dest_port = ~cm->xlate_dest_port; + u32 adj; + + /* + * When we compute this fold it down to a 16-bit offset + * as that way we can avoid having to do a double + * folding of the twos-complement result because the + * addition of 2 16-bit values cannot cause a double + * wrap-around! + */ + adj = dest_ip_hi + dest_ip_lo + cm->match_dest_port + + xlate_dest_ip_hi + xlate_dest_ip_lo + xlate_dest_port; + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC) { + u32 adj = ~cm->match_src_ip + cm->xlate_src_ip; + if (adj < cm->xlate_src_ip) { + adj++; + } + + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_src_partial_csum_adjustment = (u16)adj; + } + + if (cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST) { + u32 adj = ~cm->match_dest_ip + cm->xlate_dest_ip; + if (adj < cm->xlate_dest_ip) { + adj++; + } + + adj = (adj & 0xffff) + (adj >> 16); + adj = (adj & 0xffff) + (adj >> 16); + cm->xlate_dest_partial_csum_adjustment = (u16)adj; + } + +} + +/* + * sfe_ipv4_update_summary_stats() + * Update the summary stats. + */ +static void sfe_ipv4_update_summary_stats(struct sfe_ipv4 *si) +{ + int i; + + si->connection_create_requests64 += si->connection_create_requests; + si->connection_create_requests = 0; + si->connection_create_collisions64 += si->connection_create_collisions; + si->connection_create_collisions = 0; + si->connection_destroy_requests64 += si->connection_destroy_requests; + si->connection_destroy_requests = 0; + si->connection_destroy_misses64 += si->connection_destroy_misses; + si->connection_destroy_misses = 0; + si->connection_match_hash_hits64 += si->connection_match_hash_hits; + si->connection_match_hash_hits = 0; + si->connection_match_hash_reorders64 += si->connection_match_hash_reorders; + si->connection_match_hash_reorders = 0; + si->connection_flushes64 += si->connection_flushes; + si->connection_flushes = 0; + si->packets_forwarded64 += si->packets_forwarded; + si->packets_forwarded = 0; + si->packets_not_forwarded64 += si->packets_not_forwarded; + si->packets_not_forwarded = 0; + + for (i = 0; i < SFE_IPV4_EXCEPTION_EVENT_LAST; i++) { + si->exception_events64[i] += si->exception_events[i]; + si->exception_events[i] = 0; + } +} + +/* + * sfe_ipv4_insert_sfe_ipv4_connection_match() + * Insert a connection match into the hash. + * + * On entry we must be holding the lock that protects the hash table. 
+ */ +static inline void sfe_ipv4_insert_sfe_ipv4_connection_match(struct sfe_ipv4 *si, + struct sfe_ipv4_connection_match *cm) +{ + struct sfe_ipv4_connection_match **hash_head; + struct sfe_ipv4_connection_match *prev_head; + unsigned int conn_match_idx + = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + + hash_head = &si->conn_match_hash[conn_match_idx]; + prev_head = *hash_head; + cm->prev = NULL; + if (prev_head) { + prev_head->prev = cm; + } + + cm->next = prev_head; + *hash_head = cm; +} + +/* + * sfe_ipv4_remove_sfe_ipv4_connection_match() + * Remove a connection match object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline void sfe_ipv4_remove_sfe_ipv4_connection_match(struct sfe_ipv4 *si, struct sfe_ipv4_connection_match *cm) +{ + /* + * Unlink the connection match entry from the hash. + */ + if (cm->prev) { + cm->prev->next = cm->next; + } else { + unsigned int conn_match_idx + = sfe_ipv4_get_connection_match_hash(cm->match_dev, cm->match_protocol, + cm->match_src_ip, cm->match_src_port, + cm->match_dest_ip, cm->match_dest_port); + si->conn_match_hash[conn_match_idx] = cm->next; + } + + if (cm->next) { + cm->next->prev = cm->prev; + } + + /* + * If the connection match entry is in the active list remove it. + */ + if (cm->active) { + if (likely(cm->active_prev)) { + cm->active_prev->active_next = cm->active_next; + } else { + si->active_head = cm->active_next; + } + + if (likely(cm->active_next)) { + cm->active_next->active_prev = cm->active_prev; + } else { + si->active_tail = cm->active_prev; + } + } +} + +/* + * sfe_ipv4_get_connection_hash() + * Generate the hash used in connection lookups. + */ +static inline unsigned int sfe_ipv4_get_connection_hash(u8 protocol, __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + u32 hash = ntohl(src_ip ^ dest_ip) ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV4_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV4_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv4_find_sfe_ipv4_connection() + * Get the IPv4 connection info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static inline struct sfe_ipv4_connection *sfe_ipv4_find_sfe_ipv4_connection(struct sfe_ipv4 *si, u32 protocol, + __be32 src_ip, __be16 src_port, + __be32 dest_ip, __be16 dest_port) +{ + struct sfe_ipv4_connection *c; + unsigned int conn_idx = sfe_ipv4_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port); + c = si->conn_hash[conn_idx]; + + /* + * If we don't have anything in this chain then bale. + */ + if (unlikely(!c)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. + */ + if ((c->src_port == src_port) + && (c->dest_port == dest_port) + && (c->src_ip == src_ip) + && (c->dest_ip == dest_ip) + && (c->protocol == protocol)) { + return c; + } + + /* + * Unfortunately we didn't find it at head, so we search it in chain. 
+ */ + do { + c = c->next; + } while (c && (c->src_port != src_port + || c->dest_port != dest_port + || c->src_ip != src_ip + || c->dest_ip != dest_ip + || c->protocol != protocol)); + + /* + * Will need connection entry for next create/destroy metadata, + * So no need to re-order entry for these requests + */ + return c; +} + +/* + * sfe_ipv4_mark_rule() + * Updates the mark for a current offloaded connection + * + * Will take hash lock upon entry + */ +void sfe_ipv4_mark_rule(struct sfe_connection_mark *mark) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + + spin_lock_bh(&si->lock); + c = sfe_ipv4_find_sfe_ipv4_connection(si, mark->protocol, + mark->src_ip.ip, mark->src_port, + mark->dest_ip.ip, mark->dest_port); + if (c) { + WARN_ON((0 != c->mark) && (0 == mark->mark)); + c->mark = mark->mark; + } + spin_unlock_bh(&si->lock); + + if (c) { + DEBUG_TRACE("Matching connection found for mark, " + "setting from %08x to %08x\n", + c->mark, mark->mark); + } +} + +/* + * sfe_ipv4_insert_sfe_ipv4_connection() + * Insert a connection into the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv4_insert_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) +{ + struct sfe_ipv4_connection **hash_head; + struct sfe_ipv4_connection *prev_head; + unsigned int conn_idx; + + /* + * Insert entry into the connection hash. + */ + conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + hash_head = &si->conn_hash[conn_idx]; + prev_head = *hash_head; + c->prev = NULL; + if (prev_head) { + prev_head->prev = c; + } + + c->next = prev_head; + *hash_head = c; + + /* + * Insert entry into the "all connections" list. + */ + if (si->all_connections_tail) { + c->all_connections_prev = si->all_connections_tail; + si->all_connections_tail->all_connections_next = c; + } else { + c->all_connections_prev = NULL; + si->all_connections_head = c; + } + + si->all_connections_tail = c; + c->all_connections_next = NULL; + si->num_connections++; + + /* + * Insert the connection match objects too. + */ + sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->original_match); + sfe_ipv4_insert_sfe_ipv4_connection_match(si, c->reply_match); +} + +/* + * sfe_ipv4_remove_sfe_ipv4_connection() + * Remove a sfe_ipv4_connection object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv4_remove_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c) +{ + /* + * Remove the connection match objects. + */ + sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->reply_match); + sfe_ipv4_remove_sfe_ipv4_connection_match(si, c->original_match); + + /* + * Unlink the connection. 
+ */ + if (c->prev) { + c->prev->next = c->next; + } else { + unsigned int conn_idx = sfe_ipv4_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + si->conn_hash[conn_idx] = c->next; + } + + if (c->next) { + c->next->prev = c->prev; + } + + /* + * Unlink connection from all_connections list + */ + if (c->all_connections_prev) { + c->all_connections_prev->all_connections_next = c->all_connections_next; + } else { + si->all_connections_head = c->all_connections_next; + } + + if (c->all_connections_next) { + c->all_connections_next->all_connections_prev = c->all_connections_prev; + } else { + si->all_connections_tail = c->all_connections_prev; + } + + si->num_connections--; +} + +/* + * sfe_ipv4_sync_sfe_ipv4_connection() + * Sync a connection. + * + * On entry to this function we expect that the lock for the connection is either + * already held or isn't required. + */ +static void sfe_ipv4_gen_sync_sfe_ipv4_connection(struct sfe_ipv4 *si, struct sfe_ipv4_connection *c, + struct sfe_connection_sync *sis, sfe_sync_reason_t reason, + u64 now_jiffies) +{ + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + + /* + * Fill in the update message. + */ + sis->is_v6 = 0; + sis->protocol = c->protocol; + sis->src_ip.ip = c->src_ip; + sis->src_ip_xlate.ip = c->src_ip_xlate; + sis->dest_ip.ip = c->dest_ip; + sis->dest_ip_xlate.ip = c->dest_ip_xlate; + sis->src_port = c->src_port; + sis->src_port_xlate = c->src_port_xlate; + sis->dest_port = c->dest_port; + sis->dest_port_xlate = c->dest_port_xlate; + + original_cm = c->original_match; + reply_cm = c->reply_match; + sis->src_td_max_window = original_cm->protocol_state.tcp.max_win; + sis->src_td_end = original_cm->protocol_state.tcp.end; + sis->src_td_max_end = original_cm->protocol_state.tcp.max_end; + sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win; + sis->dest_td_end = reply_cm->protocol_state.tcp.end; + sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end; + + sis->src_new_packet_count = original_cm->rx_packet_count; + sis->src_new_byte_count = original_cm->rx_byte_count; + sis->dest_new_packet_count = reply_cm->rx_packet_count; + sis->dest_new_byte_count = reply_cm->rx_byte_count; + + sfe_ipv4_connection_match_update_summary_stats(original_cm); + sfe_ipv4_connection_match_update_summary_stats(reply_cm); + + sis->src_dev = original_cm->match_dev; + sis->src_packet_count = original_cm->rx_packet_count64; + sis->src_byte_count = original_cm->rx_byte_count64; + + sis->dest_dev = reply_cm->match_dev; + sis->dest_packet_count = reply_cm->rx_packet_count64; + sis->dest_byte_count = reply_cm->rx_byte_count64; + + sis->reason = reason; + + /* + * Get the time increment since our last sync. + */ + sis->delta_jiffies = now_jiffies - c->last_sync_jiffies; + c->last_sync_jiffies = now_jiffies; +} + +/* + * sfe_ipv4_flush_sfe_ipv4_connection() + * Flush a connection and free all associated resources. + * + * We need to be called with bottom halves disabled locally as we need to acquire + * the connection hash lock and release it again. In general we're actually called + * from within a BH and so we're fine, but we're also called when connections are + * torn down. 
+ */ +static void sfe_ipv4_flush_sfe_ipv4_connection(struct sfe_ipv4 *si, + struct sfe_ipv4_connection *c, + sfe_sync_reason_t reason) +{ + struct sfe_connection_sync sis; + u64 now_jiffies; + sfe_sync_rule_callback_t sync_rule_callback; + + rcu_read_lock(); + spin_lock_bh(&si->lock); + si->connection_flushes++; + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + spin_unlock_bh(&si->lock); + + if (sync_rule_callback) { + /* + * Generate a sync message and then sync. + */ + now_jiffies = get_jiffies_64(); + sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, reason, now_jiffies); + sync_rule_callback(&sis); + } + + rcu_read_unlock(); + + /* + * Release our hold of the source and dest devices and free the memory + * for our connection objects. + */ + dev_put(c->original_dev); + dev_put(c->reply_dev); + kfree(c->original_match); + kfree(c->reply_match); + kfree(c); +} + +/* + * sfe_ipv4_recv_udp() + * Handle UDP packet receives and forwarding. + */ +static int sfe_ipv4_recv_udp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv4_udp_hdr *udph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + u8 ttl; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? + */ + if (unlikely(!pskb_may_pull(skb, (sizeof(struct sfe_ipv4_udp_hdr) + ihl)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for UDP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the UDP header cached though so allow more time for any prefetching. + */ + src_ip = iph->saddr; + dest_ip = iph->daddr; + + udph = (struct sfe_ipv4_udp_hdr *)(skb->data + ihl); + src_port = udph->source; + dest_port = udph->dest; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port); + if (unlikely(!cm)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. + */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + + /* + * Does our TTL allow forwarding? 
+ */ + ttl = iph->ttl; + if (unlikely(ttl < 2)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ttl too low\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. + */ + if (unlikely(len > cm->xmit_dev_mtu)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; + } + + /* + * Decrement our TTL. + */ + iph->ttl = ttl - 1; + + /* + * Do we have to perform translations of the source address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 udp_csum; + + iph->saddr = cm->xlate_src_ip; + udph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum; + + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = udp_csum + cm->xlate_src_partial_csum_adjustment; + } else { + sum = udp_csum + cm->xlate_src_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 udp_csum; + + iph->daddr = cm->xlate_dest_ip; + udph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + udp_csum = udph->check; + if (likely(udp_csum)) { + u32 sum; + + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = udp_csum + cm->xlate_dest_partial_csum_adjustment; + } else { + sum = udp_csum + cm->xlate_dest_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + udph->check = (u16)sum; + } + } + + /* + * Replace the IP checksum. + */ + iph->check = sfe_ipv4_gen_ip_csum(iph); + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IP, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. 
+ */ + struct sfe_ipv4_eth_hdr *eth = (struct sfe_ipv4_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IP); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet. + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv4_process_tcp_option_sack() + * Parse TCP SACK option and update ack according + */ +static bool sfe_ipv4_process_tcp_option_sack(const struct sfe_ipv4_tcp_hdr *th, const u32 data_offs, + u32 *ack) +{ + u32 length = sizeof(struct sfe_ipv4_tcp_hdr); + u8 *ptr = (u8 *)th + length; + + /* + * Ignore processing if TCP packet has only TIMESTAMP option. + */ + if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1) + && likely(ptr[0] == TCPOPT_NOP) + && likely(ptr[1] == TCPOPT_NOP) + && likely(ptr[2] == TCPOPT_TIMESTAMP) + && likely(ptr[3] == TCPOLEN_TIMESTAMP)) { + return true; + } + + /* + * TCP options. Parse SACK option. + */ + while (length < data_offs) { + u8 size; + u8 kind; + + ptr = (u8 *)th + length; + kind = *ptr; + + /* + * NOP, for padding + * Not in the switch because to fast escape and to not calculate size + */ + if (kind == TCPOPT_NOP) { + length++; + continue; + } + + if (kind == TCPOPT_SACK) { + u32 sack = 0; + u8 re = 1 + 1; + + size = *(ptr + 1); + if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK)) + || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK)) + || (size > (data_offs - length))) { + return false; + } + + re += 4; + while (re < size) { + u32 sack_re; + u8 *sptr = ptr + re; + sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3]; + if (sack_re > sack) { + sack = sack_re; + } + re += TCPOLEN_SACK_PERBLOCK; + } + if (sack > *ack) { + *ack = sack; + } + length += size; + continue; + } + if (kind == TCPOPT_EOL) { + return true; + } + size = *(ptr + 1); + if (size < 2) { + return false; + } + length += size; + } + + return true; +} + +/* + * sfe_ipv4_recv_tcp() + * Handle TCP packet receives and forwarding. + */ +static int sfe_ipv4_recv_tcp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv4_tcp_hdr *tcph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *counter_cm; + u8 ttl; + u32 flags; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? 
+ */ + if (unlikely(!pskb_may_pull(skb, (sizeof(struct sfe_ipv4_tcp_hdr) + ihl)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for TCP header\n"); + return 0; + } + + /* + * Read the IP address and port information. Read the IP header data first + * because we've almost certainly got that in the cache. We may not yet have + * the TCP header cached though so allow more time for any prefetching. + */ + src_ip = iph->saddr; + dest_ip = iph->daddr; + + tcph = (struct sfe_ipv4_tcp_hdr *)(skb->data + ihl); + src_port = tcph->source; + dest_port = tcph->dest; + flags = tcp_flag_word(tcph); + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. + */ + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port); + if (unlikely(!cm)) { + /* + * We didn't get a connection but as TCP is connection-oriented that + * may be because this is a non-fast connection (not running established). + * For diagnostic purposes we differentiate this here. + */ + if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - fast flags\n"); + return 0; + } + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found - slow flags: 0x%x\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + return 0; + } + + /* + * If our packet has beern marked as "flush on find" we can't actually + * forward it in the fast path, but now that we've found an associated + * connection we can flush that out before we process the packet. + */ + if (unlikely(flush_on_find)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. + */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + /* + * Does our TTL allow forwarding? + */ + ttl = iph->ttl; + if (unlikely(ttl < 2)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ttl too low\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. 
+ */ + if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN + * set is not a fast path packet. + */ + if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP flags: 0x%x are not fast\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + counter_cm = cm->counter_match; + + /* + * Are we doing sequence number checking? + */ + if (likely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) { + u32 seq; + u32 ack; + u32 sack; + u32 data_offs; + u32 end; + u32 left_edge; + u32 scaled_win; + u32 max_end; + + /* + * Is our sequence fully past the right hand edge of the window? + */ + seq = ntohl(tcph->seq); + if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u exceeds right edge: %u\n", + seq, cm->protocol_state.tcp.max_end + 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't too short. + */ + data_offs = tcph->doff << 2; + if (unlikely(data_offs < sizeof(struct sfe_ipv4_tcp_hdr))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Update ACK according to any SACK option. + */ + ack = ntohl(tcph->ack_seq); + sack = ack; + if (unlikely(!sfe_ipv4_process_tcp_option_sack(tcph, data_offs, &sack))) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BAD_SACK]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP option SACK size is wrong\n"); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't past the end of the packet. 
+ */ + data_offs += sizeof(struct sfe_ipv4_ip_hdr); + if (unlikely(len < data_offs)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", + data_offs, len); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + end = seq + len - data_offs; + + /* + * Is our sequence fully before the left hand edge of the window? + */ + if (unlikely((s32)(end - (cm->protocol_state.tcp.end + - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u before left edge: %u\n", + end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Are we acking data that is to the right of what has been sent? + */ + if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u exceeds right edge: %u\n", + sack, counter_cm->protocol_state.tcp.end + 1); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Is our ack too far before the left hand edge of the window? + */ + left_edge = counter_cm->protocol_state.tcp.end + - cm->protocol_state.tcp.max_win + - SFE_IPV4_TCP_MAX_ACK_WINDOW + - 1; + if (unlikely((s32)(sack - left_edge) < 0)) { + struct sfe_ipv4_connection *c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge); + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Have we just seen the largest window size yet for this connection? If yes + * then we need to record the new value. + */ + scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale; + scaled_win += (sack - ack); + if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) { + cm->protocol_state.tcp.max_win = scaled_win; + } + + /* + * If our sequence and/or ack numbers have advanced then record the new state. + */ + if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) { + cm->protocol_state.tcp.end = end; + } + + max_end = sack + scaled_win; + if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) { + counter_cm->protocol_state.tcp.max_end = max_end; + } + } + + /* + * From this point on we're good to modify the packet. + */ + + /* + * Update DSCP + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK)) { + iph->tos = (iph->tos & SFE_IPV4_DSCP_MASK) | cm->dscp; + } + + /* + * Decrement our TTL. + */ + iph->ttl = ttl - 1; + + /* + * Do we have to perform translations of the source address/port? 
+ */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC)) { + u16 tcp_csum; + u32 sum; + + iph->saddr = cm->xlate_src_ip; + tcph->source = cm->xlate_src_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + tcp_csum = tcph->check; + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = tcp_csum + cm->xlate_src_partial_csum_adjustment; + } else { + sum = tcp_csum + cm->xlate_src_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + tcph->check = (u16)sum; + } + + /* + * Do we have to perform translations of the destination address/port? + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST)) { + u16 tcp_csum; + u32 sum; + + iph->daddr = cm->xlate_dest_ip; + tcph->dest = cm->xlate_dest_port; + + /* + * Do we have a non-zero UDP checksum? If we do then we need + * to update it. + */ + tcp_csum = tcph->check; + if (unlikely(skb->ip_summed == CHECKSUM_PARTIAL)) { + sum = tcp_csum + cm->xlate_dest_partial_csum_adjustment; + } else { + sum = tcp_csum + cm->xlate_dest_csum_adjustment; + } + + sum = (sum & 0xffff) + (sum >> 16); + tcph->check = (u16)sum; + } + + /* + * Replace the IP checksum. + */ + iph->check = sfe_ipv4_gen_ip_csum(iph); + + /* + * Update traffic stats. + */ + cm->rx_packet_count++; + cm->rx_byte_count += len; + + /* + * If we're not already on the active list then insert ourselves at the tail + * of the current list. + */ + if (unlikely(!cm->active)) { + cm->active = true; + cm->active_prev = si->active_tail; + if (likely(si->active_tail)) { + si->active_tail->active_next = cm; + } else { + si->active_head = cm; + } + si->active_tail = cm; + } + + xmit_dev = cm->xmit_dev; + skb->dev = xmit_dev; + + /* + * Check to see if we need to write a header. + */ + if (likely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) { + if (unlikely(!(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) { + dev_hard_header(skb, xmit_dev, ETH_P_IP, + cm->xmit_dest_mac, cm->xmit_src_mac, len); + } else { + /* + * For the simple case we write this really fast. + */ + struct sfe_ipv4_eth_hdr *eth = (struct sfe_ipv4_eth_hdr *)__skb_push(skb, ETH_HLEN); + eth->h_proto = htons(ETH_P_IP); + eth->h_dest[0] = cm->xmit_dest_mac[0]; + eth->h_dest[1] = cm->xmit_dest_mac[1]; + eth->h_dest[2] = cm->xmit_dest_mac[2]; + eth->h_source[0] = cm->xmit_src_mac[0]; + eth->h_source[1] = cm->xmit_src_mac[1]; + eth->h_source[2] = cm->xmit_src_mac[2]; + } + } + + /* + * Update priority of skb. + */ + if (unlikely(cm->flags & SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv4_recv_icmp() + * Handle ICMP packet receives. + * + * ICMP packets aren't handled as a "fast path" and always have us process them + * through the default Linux stack. What we do need to do is look for any errors + * about connections we are handling in the fast path. 
If we find any such + * connections then we want to flush their state so that the ICMP error path + * within Linux has all of the correct state should it need it. + */ +static int sfe_ipv4_recv_icmp(struct sfe_ipv4 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv4_ip_hdr *iph, unsigned int ihl) +{ + struct icmphdr *icmph; + struct sfe_ipv4_ip_hdr *icmp_iph; + unsigned int icmp_ihl_words; + unsigned int icmp_ihl; + u32 *icmp_trans_h; + struct sfe_ipv4_udp_hdr *icmp_udph; + struct sfe_ipv4_tcp_hdr *icmp_tcph; + __be32 src_ip; + __be32 dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection *c; + u32 pull_len = sizeof(struct icmphdr) + ihl; + + /* + * Is our packet too short to contain a valid ICMP header? + */ + len -= ihl; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for ICMP header\n"); + return 0; + } + + /* + * We only handle "destination unreachable" and "time exceeded" messages. + */ + icmph = (struct icmphdr *)(skb->data + ihl); + if ((icmph->type != ICMP_DEST_UNREACH) + && (icmph->type != ICMP_TIME_EXCEEDED)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->type); + return 0; + } + + /* + * Do we have the full embedded IP header? + */ + len -= sizeof(struct icmphdr); + pull_len += sizeof(struct sfe_ipv4_ip_hdr); + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded IP header not complete\n"); + return 0; + } + + /* + * Is our embedded IP version wrong? + */ + icmp_iph = (struct sfe_ipv4_ip_hdr *)(icmph + 1); + if (unlikely(icmp_iph->version != 4)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_NON_V4]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", icmp_iph->version); + return 0; + } + + /* + * Do we have the full embedded IP header, including any options? + */ + icmp_ihl_words = icmp_iph->ihl; + icmp_ihl = icmp_ihl_words << 2; + pull_len += icmp_ihl - sizeof(struct sfe_ipv4_ip_hdr); + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_IP_OPTIONS_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded header not large enough for IP options\n"); + return 0; + } + + len -= icmp_ihl; + icmp_trans_h = ((u32 *)icmp_iph) + icmp_ihl_words; + + /* + * Handle the embedded transport layer header. + */ + switch (icmp_iph->protocol) { + case IPPROTO_UDP: + /* + * We should have 8 bytes of UDP header - that's enough to identify + * the connection. 
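The fixed pull of 8 bytes in the branches below leans on RFC 792: an ICMP error is only guaranteed to quote the offending datagram's IP header plus the first 8 bytes of its payload. For both UDP and TCP those 8 bytes start with the two port fields, which is exactly what's needed (together with the embedded addresses) to identify the flow. A sketch of the layout those 8 bytes always contain:

	#include <stdint.h>

	/* First 8 bytes of any UDP or TCP header, as quoted by an ICMP
	 * error: enough to recover the ports that identify the flow. */
	struct transport_prefix {
		uint16_t source;	/* bytes 0-1, network order */
		uint16_t dest;		/* bytes 2-3, network order */
		uint32_t rest;		/* UDP: len + check; TCP: sequence number */
	};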
+ */ + pull_len += 8; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UDP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Incomplete embedded UDP header\n"); + return 0; + } + + icmp_udph = (struct sfe_ipv4_udp_hdr *)icmp_trans_h; + src_port = icmp_udph->source; + dest_port = icmp_udph->dest; + break; + + case IPPROTO_TCP: + /* + * We should have 8 bytes of TCP header - that's enough to identify + * the connection. + */ + pull_len += 8; + if (!pskb_may_pull(skb, pull_len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_TCP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Incomplete embedded TCP header\n"); + return 0; + } + + icmp_tcph = (struct sfe_ipv4_tcp_hdr *)icmp_trans_h; + src_port = icmp_tcph->source; + dest_port = icmp_tcph->dest; + break; + + default: + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_IPV4_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", icmp_iph->protocol); + return 0; + } + + src_ip = icmp_iph->saddr; + dest_ip = icmp_iph->daddr; + + spin_lock_bh(&si->lock); + + /* + * Look for a connection match. Note that we reverse the source and destination + * here because our embedded message contains a packet that was sent in the + * opposite direction to the one in which we just received it. It will have + * been sent on the interface from which we received it though so that's still + * ok to use. + */ + cm = sfe_ipv4_find_sfe_ipv4_connection_match(si, dev, icmp_iph->protocol, dest_ip, dest_port, src_ip, src_port); + if (unlikely(!cm)) { + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("no connection found\n"); + return 0; + } + + /* + * We found a connection so now remove it from the connection list and flush + * its state. + */ + c = cm->connection; + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; +} + +/* + * sfe_ipv4_recv() + * Handle packet receives and forwaring. + * + * Returns 1 if the packet is forwarded or 0 if it isn't. + */ +int sfe_ipv4_recv(struct net_device *dev, struct sk_buff *skb) +{ + struct sfe_ipv4 *si = &__si; + unsigned int len; + unsigned int tot_len; + unsigned int frag_off; + unsigned int ihl; + bool flush_on_find; + bool ip_options; + struct sfe_ipv4_ip_hdr *iph; + u32 protocol; + + /* + * Check that we have space for an IP header here. + */ + len = skb->len; + if (unlikely(!pskb_may_pull(skb, sizeof(struct sfe_ipv4_ip_hdr)))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("len: %u is too short\n", len); + return 0; + } + + /* + * Check that our "total length" is large enough for an IP header. 
+ */ + iph = (struct sfe_ipv4_ip_hdr *)skb->data; + tot_len = ntohs(iph->tot_len); + if (unlikely(tot_len < sizeof(struct sfe_ipv4_ip_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_BAD_TOTAL_LENGTH]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("tot_len: %u is too short\n", tot_len); + return 0; + } + + /* + * Is our IP version wrong? + */ + if (unlikely(iph->version != 4)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_V4]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("IP version: %u\n", iph->version); + return 0; + } + + /* + * Does our datagram fit inside the skb? + */ + if (unlikely(tot_len > len)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("tot_len: %u, exceeds len: %u\n", tot_len, len); + return 0; + } + + /* + * Do we have a non-initial fragment? + */ + frag_off = ntohs(iph->frag_off); + if (unlikely(frag_off & IP_OFFSET)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("non-initial fragment\n"); + return 0; + } + + /* + * If we have a (first) fragment then mark it to cause any connection to flush. + */ + flush_on_find = unlikely(frag_off & IP_MF) ? true : false; + + /* + * Do we have any IP options? That's definite a slow path! If we do have IP + * options we need to recheck our header size. + */ + ihl = iph->ihl << 2; + ip_options = unlikely(ihl != sizeof(struct sfe_ipv4_ip_hdr)) ? true : false; + if (unlikely(ip_options)) { + if (unlikely(len < ihl)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("len: %u is too short for header of size: %u\n", len, ihl); + return 0; + } + + flush_on_find = true; + } + + if (unlikely(ip_fast_csum((u8 *)iph, iph->ihl))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_CSUM_ERROR]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("checksum of ipv4 header is invalid\n"); + return 0; + } + + protocol = iph->protocol; + if (IPPROTO_UDP == protocol) { + return sfe_ipv4_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_TCP == protocol) { + return sfe_ipv4_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find); + } + + if (IPPROTO_ICMP == protocol) { + return sfe_ipv4_recv_icmp(si, skb, dev, len, iph, ihl); + } + + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV4_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", protocol); + return 0; +} + +static void +sfe_ipv4_update_tcp_state(struct sfe_ipv4_connection *c, + struct sfe_connection_create *sic) +{ + struct sfe_ipv4_connection_match *orig_cm; + struct sfe_ipv4_connection_match *repl_cm; + struct sfe_ipv4_tcp_connection_match *orig_tcp; + struct sfe_ipv4_tcp_connection_match *repl_tcp; + + orig_cm = c->original_match; + repl_cm = c->reply_match; + orig_tcp = &orig_cm->protocol_state.tcp; + repl_tcp = &repl_cm->protocol_state.tcp; + + /* update orig */ + if (orig_tcp->max_win < sic->src_td_max_window) { + orig_tcp->max_win = sic->src_td_max_window; + } + if ((s32)(orig_tcp->end - sic->src_td_end) < 0) { + 
orig_tcp->end = sic->src_td_end; + } + if ((s32)(orig_tcp->max_end - sic->src_td_max_end) < 0) { + orig_tcp->max_end = sic->src_td_max_end; + } + + /* update reply */ + if (repl_tcp->max_win < sic->dest_td_max_window) { + repl_tcp->max_win = sic->dest_td_max_window; + } + if ((s32)(repl_tcp->end - sic->dest_td_end) < 0) { + repl_tcp->end = sic->dest_td_end; + } + if ((s32)(repl_tcp->max_end - sic->dest_td_max_end) < 0) { + repl_tcp->max_end = sic->dest_td_max_end; + } + + /* update match flags */ + orig_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags &= ~SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + orig_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + repl_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } +} + +static void +sfe_ipv4_update_protocol_state(struct sfe_ipv4_connection *c, + struct sfe_connection_create *sic) +{ + switch (sic->protocol) { + case IPPROTO_TCP: + sfe_ipv4_update_tcp_state(c, sic); + break; + } +} + +void sfe_ipv4_update_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv4_connection *c; + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + + c = sfe_ipv4_find_sfe_ipv4_connection(si, + sic->protocol, + sic->src_ip.ip, + sic->src_port, + sic->dest_ip.ip, + sic->dest_port); + if (c != NULL) { + sfe_ipv4_update_protocol_state(c, sic); + } + + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv4_create_rule() + * Create a forwarding rule. + */ +int sfe_ipv4_create_rule(struct sfe_connection_create *sic) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + struct net_device *dest_dev; + struct net_device *src_dev; + + dest_dev = sic->dest_dev; + src_dev = sic->src_dev; + + if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) || + (src_dev->reg_state != NETREG_REGISTERED))) { + return -EINVAL; + } + + spin_lock_bh(&si->lock); + si->connection_create_requests++; + + /* + * Check to see if there is already a flow that matches the rule we're + * trying to create. If there is then we can't create a new one. + */ + c = sfe_ipv4_find_sfe_ipv4_connection(si, + sic->protocol, + sic->src_ip.ip, + sic->src_port, + sic->dest_ip.ip, + sic->dest_port); + if (c != NULL) { + si->connection_create_collisions++; + + /* + * If we already have the flow then it's likely that this + * request to create the connection rule contains more + * up-to-date information. Check and update accordingly. + */ + sfe_ipv4_update_protocol_state(c, sic); + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection already exists - mark: %08x, p: %d\n" + " s: %s:%pM:%pI4:%u, d: %s:%pM:%pI4:%u\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, &sic->src_ip.ip, ntohs(sic->src_port), + sic->dest_dev->name, sic->dest_mac, &sic->dest_ip.ip, ntohs(sic->dest_port)); + return -EADDRINUSE; + } + + /* + * Allocate the various connection tracking objects. 
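The three allocations that follow run with si->lock held, which is why they use GFP_ATOMIC (no sleeping under a spinlock) and why every failure branch has to free whatever was already allocated before unlocking. The unwind pattern, reduced to a sketch with hypothetical labels (the driver open-codes the frees inline instead):

	c = kmalloc(sizeof(*c), GFP_ATOMIC);
	if (!c)
		goto fail_unlock;

	original_cm = kmalloc(sizeof(*original_cm), GFP_ATOMIC);
	if (!original_cm)
		goto fail_free_c;

	reply_cm = kmalloc(sizeof(*reply_cm), GFP_ATOMIC);
	if (!reply_cm)
		goto fail_free_original;

	/* success path continues here */

fail_free_original:
	kfree(original_cm);
fail_free_c:
	kfree(c);
fail_unlock:
	spin_unlock_bh(&si->lock);
	/* report -ENOMEM to the connection manager */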
+ */ + c = (struct sfe_ipv4_connection *)kmalloc(sizeof(struct sfe_ipv4_connection), GFP_ATOMIC); + if (unlikely(!c)) { + spin_unlock_bh(&si->lock); + return -ENOMEM; + } + + original_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); + if (unlikely(!original_cm)) { + spin_unlock_bh(&si->lock); + kfree(c); + return -ENOMEM; + } + + reply_cm = (struct sfe_ipv4_connection_match *)kmalloc(sizeof(struct sfe_ipv4_connection_match), GFP_ATOMIC); + if (unlikely(!reply_cm)) { + spin_unlock_bh(&si->lock); + kfree(original_cm); + kfree(c); + return -ENOMEM; + } + + /* + * Fill in the "original" direction connection matching object. + * Note that the transmit MAC address is "dest_mac_xlate" because + * we always know both ends of a connection by their translated + * addresses and not their public addresses. + */ + original_cm->match_dev = src_dev; + original_cm->match_protocol = sic->protocol; + original_cm->match_src_ip = sic->src_ip.ip; + original_cm->match_src_port = sic->src_port; + original_cm->match_dest_ip = sic->dest_ip.ip; + original_cm->match_dest_port = sic->dest_port; + original_cm->xlate_src_ip = sic->src_ip_xlate.ip; + original_cm->xlate_src_port = sic->src_port_xlate; + original_cm->xlate_dest_ip = sic->dest_ip_xlate.ip; + original_cm->xlate_dest_port = sic->dest_port_xlate; + original_cm->rx_packet_count = 0; + original_cm->rx_packet_count64 = 0; + original_cm->rx_byte_count = 0; + original_cm->rx_byte_count64 = 0; + original_cm->xmit_dev = dest_dev; + original_cm->xmit_dev_mtu = sic->dest_mtu; + memcpy(original_cm->xmit_src_mac, dest_dev->dev_addr, ETH_ALEN); + memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN); + original_cm->connection = c; + original_cm->counter_match = reply_cm; + original_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + original_cm->priority = sic->src_priority; + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + original_cm->dscp = sic->src_dscp << SFE_IPV4_DSCP_SHIFT; + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_XFRM + original_cm->flow_accel = sic->original_accel; +#endif + original_cm->active_next = NULL; + original_cm->active_prev = NULL; + original_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(dest_dev->flags & IFF_POINTOPOINT)) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (dest_dev->header_ops) { + if (dest_dev->header_ops->create == eth_header) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + /* + * Fill in the "reply" direction connection matching object. 
+ */ + reply_cm->match_dev = dest_dev; + reply_cm->match_protocol = sic->protocol; + reply_cm->match_src_ip = sic->dest_ip_xlate.ip; + reply_cm->match_src_port = sic->dest_port_xlate; + reply_cm->match_dest_ip = sic->src_ip_xlate.ip; + reply_cm->match_dest_port = sic->src_port_xlate; + reply_cm->xlate_src_ip = sic->dest_ip.ip; + reply_cm->xlate_src_port = sic->dest_port; + reply_cm->xlate_dest_ip = sic->src_ip.ip; + reply_cm->xlate_dest_port = sic->src_port; + reply_cm->rx_packet_count = 0; + reply_cm->rx_packet_count64 = 0; + reply_cm->rx_byte_count = 0; + reply_cm->rx_byte_count64 = 0; + reply_cm->xmit_dev = src_dev; + reply_cm->xmit_dev_mtu = sic->src_mtu; + memcpy(reply_cm->xmit_src_mac, src_dev->dev_addr, ETH_ALEN); + memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN); + reply_cm->connection = c; + reply_cm->counter_match = original_cm; + reply_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + reply_cm->priority = sic->dest_priority; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + reply_cm->dscp = sic->dest_dscp << SFE_IPV4_DSCP_SHIFT; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_XFRM + reply_cm->flow_accel = sic->reply_accel; +#endif + reply_cm->active_next = NULL; + reply_cm->active_prev = NULL; + reply_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(src_dev->flags & IFF_POINTOPOINT)) { + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (src_dev->header_ops) { + if (src_dev->header_ops->create == eth_header) { + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + + if (sic->dest_ip.ip != sic->dest_ip_xlate.ip || sic->dest_port != sic->dest_port_xlate) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; + } + + if (sic->src_ip.ip != sic->src_ip_xlate.ip || sic->src_port != sic->src_port_xlate) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_SRC; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_XLATE_DEST; + } + + c->protocol = sic->protocol; + c->src_ip = sic->src_ip.ip; + c->src_ip_xlate = sic->src_ip_xlate.ip; + c->src_port = sic->src_port; + c->src_port_xlate = sic->src_port_xlate; + c->original_dev = src_dev; + c->original_match = original_cm; + c->dest_ip = sic->dest_ip.ip; + c->dest_ip_xlate = sic->dest_ip_xlate.ip; + c->dest_port = sic->dest_port; + c->dest_port_xlate = sic->dest_port_xlate; + c->reply_dev = dest_dev; + c->reply_match = reply_cm; + c->mark = sic->mark; + c->debug_read_seq = 0; + c->last_sync_jiffies = get_jiffies_64(); + + /* + * Take hold of our source and dest devices for the duration of the connection. + */ + dev_hold(c->original_dev); + dev_hold(c->reply_dev); + + /* + * Initialize the protocol-specific information that we track. + */ + switch (sic->protocol) { + case IPPROTO_TCP: + original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale; + original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? sic->src_td_max_window : 1; + original_cm->protocol_state.tcp.end = sic->src_td_end; + original_cm->protocol_state.tcp.max_end = sic->src_td_max_end; + reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale; + reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? 
sic->dest_td_max_window : 1; + reply_cm->protocol_state.tcp.end = sic->dest_td_end; + reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + original_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + reply_cm->flags |= SFE_IPV4_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } + break; + } + + sfe_ipv4_connection_match_compute_translations(original_cm); + sfe_ipv4_connection_match_compute_translations(reply_cm); + sfe_ipv4_insert_sfe_ipv4_connection(si, c); + + spin_unlock_bh(&si->lock); + + /* + * We have everything we need! + */ + DEBUG_INFO("new connection - mark: %08x, p: %d\n" + " s: %s:%pM(%pM):%pI4(%pI4):%u(%u)\n" + " d: %s:%pM(%pM):%pI4(%pI4):%u(%u)\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_mac_xlate, + &sic->src_ip.ip, &sic->src_ip_xlate.ip, ntohs(sic->src_port), ntohs(sic->src_port_xlate), + dest_dev->name, sic->dest_mac, sic->dest_mac_xlate, + &sic->dest_ip.ip, &sic->dest_ip_xlate.ip, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate)); + + return 0; +} + +/* + * sfe_ipv4_destroy_rule() + * Destroy a forwarding rule. + */ +void sfe_ipv4_destroy_rule(struct sfe_connection_destroy *sid) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + + spin_lock_bh(&si->lock); + si->connection_destroy_requests++; + + /* + * Check to see if we have a flow that matches the rule we're trying + * to destroy. If there isn't then we can't destroy it. + */ + c = sfe_ipv4_find_sfe_ipv4_connection(si, sid->protocol, sid->src_ip.ip, sid->src_port, + sid->dest_ip.ip, sid->dest_port); + if (!c) { + si->connection_destroy_misses++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection does not exist - p: %d, s: %pI4:%u, d: %pI4:%u\n", + sid->protocol, &sid->src_ip, ntohs(sid->src_port), + &sid->dest_ip, ntohs(sid->dest_port)); + return; + } + + /* + * Remove our connection details from the hash tables. + */ + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + spin_unlock_bh(&si->lock); + + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_DESTROY); + + DEBUG_INFO("connection destroyed - p: %d, s: %pI4:%u, d: %pI4:%u\n", + sid->protocol, &sid->src_ip.ip, ntohs(sid->src_port), + &sid->dest_ip.ip, ntohs(sid->dest_port)); +} + +/* + * sfe_ipv4_register_sync_rule_callback() + * Register a callback for rule synchronization. + */ +void sfe_ipv4_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback) +{ + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback); + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv4_get_debug_dev() + */ +static ssize_t sfe_ipv4_get_debug_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv4 *si = &__si; + ssize_t count; + int num; + + spin_lock_bh(&si->lock); + num = si->debug_dev; + spin_unlock_bh(&si->lock); + + count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num); + return count; +} + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_ipv4_debug_dev_attr = + __ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv4_get_debug_dev, NULL); + +/* + * sfe_ipv4_destroy_all_rules_for_dev() + * Destroy all connections that match a particular device. + * + * If we pass dev as NULL then this destroys all connections. 
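sfe_ipv4_register_sync_rule_callback() above publishes the connection manager's callback with rcu_assign_pointer(); the periodic sync path that follows consumes it under an RCU read-side section so the callback can be swapped (or cleared) without tearing down readers. The pairing, shown in isolation as a sketch of the pattern rather than additional driver code:

	/* Writer side: publish (or clear) the callback under the lock. */
	spin_lock_bh(&si->lock);
	rcu_assign_pointer(si->sync_rule_callback, cb);
	spin_unlock_bh(&si->lock);

	/* Reader side: dereference only inside rcu_read_lock(). */
	rcu_read_lock();
	sync = rcu_dereference(si->sync_rule_callback);
	if (sync)
		sync(&sis);
	rcu_read_unlock();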
+ */ +void sfe_ipv4_destroy_all_rules_for_dev(struct net_device *dev) +{ + struct sfe_ipv4 *si = &__si; + struct sfe_ipv4_connection *c; + +another_round: + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + /* + * Does this connection relate to the device we are destroying? + */ + if (!dev + || (dev == c->original_dev) + || (dev == c->reply_dev)) { + break; + } + } + + if (c) { + sfe_ipv4_remove_sfe_ipv4_connection(si, c); + } + + spin_unlock_bh(&si->lock); + + if (c) { + sfe_ipv4_flush_sfe_ipv4_connection(si, c, SFE_SYNC_REASON_DESTROY); + goto another_round; + } +} + +/* + * sfe_ipv4_periodic_sync() + */ +static void sfe_ipv4_periodic_sync(unsigned long arg) +{ + struct sfe_ipv4 *si = (struct sfe_ipv4 *)arg; + u64 now_jiffies; + int quota; + sfe_sync_rule_callback_t sync_rule_callback; + + now_jiffies = get_jiffies_64(); + + rcu_read_lock(); + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + if (!sync_rule_callback) { + rcu_read_unlock(); + goto done; + } + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + /* + * Get an estimate of the number of connections to parse in this sync. + */ + quota = (si->num_connections + 63) / 64; + + /* + * Walk the "active" list and sync the connection state. + */ + while (quota--) { + struct sfe_ipv4_connection_match *cm; + struct sfe_ipv4_connection_match *counter_cm; + struct sfe_ipv4_connection *c; + struct sfe_connection_sync sis; + + cm = si->active_head; + if (!cm) { + break; + } + + /* + * There's a possibility that our counter match is in the active list too. + * If it is then remove it. + */ + counter_cm = cm->counter_match; + if (counter_cm->active) { + counter_cm->active = false; + + /* + * We must have a connection preceding this counter match + * because that's the one that got us to this point, so we don't have + * to worry about removing the head of the list. + */ + counter_cm->active_prev->active_next = counter_cm->active_next; + + if (likely(counter_cm->active_next)) { + counter_cm->active_next->active_prev = counter_cm->active_prev; + } else { + si->active_tail = counter_cm->active_prev; + } + + counter_cm->active_next = NULL; + counter_cm->active_prev = NULL; + } + + /* + * Now remove the head of the active scan list. + */ + cm->active = false; + si->active_head = cm->active_next; + if (likely(cm->active_next)) { + cm->active_next->active_prev = NULL; + } else { + si->active_tail = NULL; + } + cm->active_next = NULL; + + /* + * Sync the connection state. + */ + c = cm->connection; + sfe_ipv4_gen_sync_sfe_ipv4_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies); + + /* + * We don't want to be holding the lock when we sync! + */ + spin_unlock_bh(&si->lock); + sync_rule_callback(&sis); + spin_lock_bh(&si->lock); + } + + spin_unlock_bh(&si->lock); + rcu_read_unlock(); + +done: + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); +} + +#define CHAR_DEV_MSG_SIZE 768 + +/* + * sfe_ipv4_debug_dev_read_start() + * Generate part of the XML output. 
+ */ +static bool sfe_ipv4_debug_dev_read_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + si->debug_read_seq++; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_connections_start() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_connections_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_connections_connection() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_connections_connection(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + struct sfe_ipv4_connection *c; + struct sfe_ipv4_connection_match *original_cm; + struct sfe_ipv4_connection_match *reply_cm; + int bytes_read; + int protocol; + struct net_device *src_dev; + __be32 src_ip; + __be32 src_ip_xlate; + __be16 src_port; + __be16 src_port_xlate; + u64 src_rx_packets; + u64 src_rx_bytes; + struct net_device *dest_dev; + __be32 dest_ip; + __be32 dest_ip_xlate; + __be16 dest_port; + __be16 dest_port_xlate; + u64 dest_rx_packets; + u64 dest_rx_bytes; + u64 last_sync_jiffies; + u32 mark, src_priority, dest_priority, src_dscp, dest_dscp; + + spin_lock_bh(&si->lock); + + for (c = si->all_connections_head; c; c = c->all_connections_next) { + if (c->debug_read_seq < si->debug_read_seq) { + c->debug_read_seq = si->debug_read_seq; + break; + } + } + + /* + * If there were no connections then move to the next state. 
+ */ + if (!c) { + spin_unlock_bh(&si->lock); + ws->state++; + return true; + } + + original_cm = c->original_match; + reply_cm = c->reply_match; + + protocol = c->protocol; + src_dev = c->original_dev; + src_ip = c->src_ip; + src_ip_xlate = c->src_ip_xlate; + src_port = c->src_port; + src_port_xlate = c->src_port_xlate; + src_priority = original_cm->priority; + src_dscp = original_cm->dscp >> SFE_IPV4_DSCP_SHIFT; + + sfe_ipv4_connection_match_update_summary_stats(original_cm); + sfe_ipv4_connection_match_update_summary_stats(reply_cm); + + src_rx_packets = original_cm->rx_packet_count64; + src_rx_bytes = original_cm->rx_byte_count64; + dest_dev = c->reply_dev; + dest_ip = c->dest_ip; + dest_ip_xlate = c->dest_ip_xlate; + dest_port = c->dest_port; + dest_port_xlate = c->dest_port_xlate; + dest_priority = reply_cm->priority; + dest_dscp = reply_cm->dscp >> SFE_IPV4_DSCP_SHIFT; + dest_rx_packets = reply_cm->rx_packet_count64; + dest_rx_bytes = reply_cm->rx_byte_count64; + last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies; + mark = c->mark; + + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t\n", + protocol, + src_dev->name, + &src_ip, &src_ip_xlate, + ntohs(src_port), ntohs(src_port_xlate), + src_priority, src_dscp, + src_rx_packets, src_rx_bytes, + dest_dev->name, + &dest_ip, &dest_ip_xlate, + ntohs(dest_port), ntohs(dest_port_xlate), + dest_priority, dest_dscp, + dest_rx_packets, dest_rx_bytes, + last_sync_jiffies, mark); + + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + return true; +} + +/* + * sfe_ipv4_debug_dev_read_connections_end() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_connections_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_exceptions_start() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_exceptions_start(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_exceptions_exception() + * Generate part of the XML output. 
+ */ +static bool sfe_ipv4_debug_dev_read_exceptions_exception(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + u64 ct; + + spin_lock_bh(&si->lock); + ct = si->exception_events64[ws->iter_exception]; + spin_unlock_bh(&si->lock); + + if (ct) { + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, + "\t\t\n", + sfe_ipv4_exception_events_string[ws->iter_exception], + ct); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + } + + ws->iter_exception++; + if (ws->iter_exception >= SFE_IPV4_EXCEPTION_EVENT_LAST) { + ws->iter_exception = 0; + ws->state++; + } + + return true; +} + +/* + * sfe_ipv4_debug_dev_read_exceptions_end() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_exceptions_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_stats() + * Generate part of the XML output. + */ +static bool sfe_ipv4_debug_dev_read_stats(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + unsigned int num_connections; + u64 packets_forwarded; + u64 packets_not_forwarded; + u64 connection_create_requests; + u64 connection_create_collisions; + u64 connection_destroy_requests; + u64 connection_destroy_misses; + u64 connection_flushes; + u64 connection_match_hash_hits; + u64 connection_match_hash_reorders; + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + num_connections = si->num_connections; + packets_forwarded = si->packets_forwarded64; + packets_not_forwarded = si->packets_not_forwarded64; + connection_create_requests = si->connection_create_requests64; + connection_create_collisions = si->connection_create_collisions64; + connection_destroy_requests = si->connection_destroy_requests64; + connection_destroy_misses = si->connection_destroy_misses64; + connection_flushes = si->connection_flushes64; + connection_match_hash_hits = si->connection_match_hash_hits64; + connection_match_hash_reorders = si->connection_match_hash_reorders64; + spin_unlock_bh(&si->lock); + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\n", + num_connections, + packets_forwarded, + packets_not_forwarded, + connection_create_requests, + connection_create_collisions, + connection_destroy_requests, + connection_destroy_misses, + connection_flushes, + connection_match_hash_hits, + connection_match_hash_reorders); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * sfe_ipv4_debug_dev_read_end() + * Generate part of the XML output. 
+ */ +static bool sfe_ipv4_debug_dev_read_end(struct sfe_ipv4 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv4_debug_xml_write_state *ws) +{ + int bytes_read; + + bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\n"); + if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) { + return false; + } + + *length -= bytes_read; + *total_read += bytes_read; + + ws->state++; + return true; +} + +/* + * Array of write functions that write various XML elements that correspond to + * our XML output state machine. + */ +static sfe_ipv4_debug_xml_write_method_t sfe_ipv4_debug_xml_write_methods[SFE_IPV4_DEBUG_XML_STATE_DONE] = { + sfe_ipv4_debug_dev_read_start, + sfe_ipv4_debug_dev_read_connections_start, + sfe_ipv4_debug_dev_read_connections_connection, + sfe_ipv4_debug_dev_read_connections_end, + sfe_ipv4_debug_dev_read_exceptions_start, + sfe_ipv4_debug_dev_read_exceptions_exception, + sfe_ipv4_debug_dev_read_exceptions_end, + sfe_ipv4_debug_dev_read_stats, + sfe_ipv4_debug_dev_read_end, +}; + +/* + * sfe_ipv4_debug_dev_read() + * Send info to userspace upon read request from user + */ +static ssize_t sfe_ipv4_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset) +{ + char msg[CHAR_DEV_MSG_SIZE]; + int total_read = 0; + struct sfe_ipv4_debug_xml_write_state *ws; + struct sfe_ipv4 *si = &__si; + + ws = (struct sfe_ipv4_debug_xml_write_state *)filp->private_data; + while ((ws->state != SFE_IPV4_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) { + if ((sfe_ipv4_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) { + continue; + } + } + + return total_read; +} + +/* + * sfe_ipv4_debug_dev_write() + * Write to char device resets some stats + */ +static ssize_t sfe_ipv4_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset) +{ + struct sfe_ipv4 *si = &__si; + + spin_lock_bh(&si->lock); + sfe_ipv4_update_summary_stats(si); + + si->packets_forwarded64 = 0; + si->packets_not_forwarded64 = 0; + si->connection_create_requests64 = 0; + si->connection_create_collisions64 = 0; + si->connection_destroy_requests64 = 0; + si->connection_destroy_misses64 = 0; + si->connection_flushes64 = 0; + si->connection_match_hash_hits64 = 0; + si->connection_match_hash_reorders64 = 0; + spin_unlock_bh(&si->lock); + + return length; +} + +/* + * sfe_ipv4_debug_dev_open() + */ +static int sfe_ipv4_debug_dev_open(struct inode *inode, struct file *file) +{ + struct sfe_ipv4_debug_xml_write_state *ws; + + ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data; + if (!ws) { + ws = kzalloc(sizeof(struct sfe_ipv4_debug_xml_write_state), GFP_KERNEL); + if (!ws) { + return -ENOMEM; + } + + ws->state = SFE_IPV4_DEBUG_XML_STATE_START; + file->private_data = ws; + } + + return 0; +} + +/* + * sfe_ipv4_debug_dev_release() + */ +static int sfe_ipv4_debug_dev_release(struct inode *inode, struct file *file) +{ + struct sfe_ipv4_debug_xml_write_state *ws; + + ws = (struct sfe_ipv4_debug_xml_write_state *)file->private_data; + if (ws) { + /* + * We've finished with our output so free the write state. 
+ */ + kfree(ws); + } + + return 0; +} + +/* + * File operations used in the debug char device + */ +static struct file_operations sfe_ipv4_debug_dev_fops = { + .read = sfe_ipv4_debug_dev_read, + .write = sfe_ipv4_debug_dev_write, + .open = sfe_ipv4_debug_dev_open, + .release = sfe_ipv4_debug_dev_release +}; + +/* + * sfe_ipv4_init() + */ +static int __init sfe_ipv4_init(void) +{ + struct sfe_ipv4 *si = &__si; + int result = -1; + + DEBUG_INFO("SFE IPv4 init\n"); + + /* + * Create sys/sfe_ipv4 + */ + si->sys_sfe_ipv4 = kobject_create_and_add("sfe_ipv4", NULL); + if (!si->sys_sfe_ipv4) { + DEBUG_ERROR("failed to register sfe_ipv4\n"); + goto exit1; + } + + /* + * Create files, one for each parameter supported by this module. + */ + result = sysfs_create_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr); + if (result) { + DEBUG_ERROR("failed to register debug dev file: %d\n", result); + goto exit2; + } + + /* + * Register our debug char device. + */ + result = register_chrdev(0, "sfe_ipv4", &sfe_ipv4_debug_dev_fops); + if (result < 0) { + DEBUG_ERROR("Failed to register chrdev: %d\n", result); + goto exit3; + } + + si->debug_dev = result; + + /* + * Create a timer to handle periodic statistics. + */ + setup_timer(&si->timer, sfe_ipv4_periodic_sync, (unsigned long)si); + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); + + spin_lock_init(&si->lock); + + return 0; + +exit3: + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr); + +exit2: + kobject_put(si->sys_sfe_ipv4); + +exit1: + return result; +} + +/* + * sfe_ipv4_exit() + */ +static void __exit sfe_ipv4_exit(void) +{ + struct sfe_ipv4 *si = &__si; + + DEBUG_INFO("SFE IPv4 exit\n"); + + /* + * Destroy all connections. + */ + sfe_ipv4_destroy_all_rules_for_dev(NULL); + + del_timer_sync(&si->timer); + + unregister_chrdev(si->debug_dev, "sfe_ipv4"); + + sysfs_remove_file(si->sys_sfe_ipv4, &sfe_ipv4_debug_dev_attr.attr); + + kobject_put(si->sys_sfe_ipv4); + +} + +module_init(sfe_ipv4_init) +module_exit(sfe_ipv4_exit) + +EXPORT_SYMBOL(sfe_ipv4_recv); +EXPORT_SYMBOL(sfe_ipv4_create_rule); +EXPORT_SYMBOL(sfe_ipv4_destroy_rule); +EXPORT_SYMBOL(sfe_ipv4_destroy_all_rules_for_dev); +EXPORT_SYMBOL(sfe_ipv4_register_sync_rule_callback); +EXPORT_SYMBOL(sfe_ipv4_mark_rule); +EXPORT_SYMBOL(sfe_ipv4_update_rule); + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv4 edition"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/package/kernel/shortcut-fe/src/sfe_ipv6.c b/package/kernel/shortcut-fe/src/sfe_ipv6.c new file mode 100644 index 000000000000..16f460723aa6 --- /dev/null +++ b/package/kernel/shortcut-fe/src/sfe_ipv6.c @@ -0,0 +1,3361 @@ +/* + * sfe_ipv6.c + * Shortcut forwarding engine - IPv6 support. + * + * Copyright (c) 2015-2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include + +#include "sfe.h" +#include "sfe_cm.h" + +/* + * By default Linux IP header and transport layer header structures are + * unpacked, assuming that such headers should be 32-bit aligned. + * Unfortunately some wireless adaptors can't cope with this requirement and + * some CPUs can't handle misaligned accesses. For those platforms we + * define SFE_IPV6_UNALIGNED_IP_HEADER and mark the structures as packed. + * When we do this the compiler will generate slightly worse code than for the + * aligned case (on most platforms) but will be much quicker than fixing + * things up in an unaligned trap handler. + */ +#define SFE_IPV6_UNALIGNED_IP_HEADER 1 +#if SFE_IPV6_UNALIGNED_IP_HEADER +#define SFE_IPV6_UNALIGNED_STRUCT __attribute__((aligned(4))) +#else +#define SFE_IPV6_UNALIGNED_STRUCT +#endif + +#define CHAR_DEV_MSG_SIZE 768 + +/* + * An Ethernet header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_eth_hdr { + __be16 h_dest[ETH_ALEN / 2]; + __be16 h_source[ETH_ALEN / 2]; + __be16 h_proto; +} SFE_IPV6_UNALIGNED_STRUCT; + +#define SFE_IPV6_DSCP_MASK 0xf03f +#define SFE_IPV6_DSCP_SHIFT 2 + +/* + * An IPv6 header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_ip_hdr { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 priority:4, + version:4; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 version:4, + priority:4; +#else +#error "Please fix " +#endif + __u8 flow_lbl[3]; + __be16 payload_len; + __u8 nexthdr; + __u8 hop_limit; + struct sfe_ipv6_addr saddr; + struct sfe_ipv6_addr daddr; + + /* + * The extension header start here. 
+ */ +} SFE_IPV6_UNALIGNED_STRUCT; + +#define SFE_IPV6_EXT_HDR_HOP 0 +#define SFE_IPV6_EXT_HDR_ROUTING 43 +#define SFE_IPV6_EXT_HDR_FRAG 44 +#define SFE_IPV6_EXT_HDR_ESP 50 +#define SFE_IPV6_EXT_HDR_AH 51 +#define SFE_IPV6_EXT_HDR_NONE 59 +#define SFE_IPV6_EXT_HDR_DST 60 +#define SFE_IPV6_EXT_HDR_MH 135 + +/* + * fragmentation header + */ + +struct sfe_ipv6_frag_hdr { + __u8 nexthdr; + __u8 reserved; + __be16 frag_off; + __be32 identification; +}; + +#define SFE_IPV6_FRAG_OFFSET 0xfff8 + +/* + * generic IPv6 extension header + */ +struct sfe_ipv6_ext_hdr { + __u8 next_hdr; + __u8 hdr_len; + __u8 padding[6]; +} SFE_IPV6_UNALIGNED_STRUCT; + +/* + * A UDP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_udp_hdr { + __be16 source; + __be16 dest; + __be16 len; + __sum16 check; +} SFE_IPV6_UNALIGNED_STRUCT; + +/* + * A TCP header, but with an optional "packed" attribute to + * help with performance on some platforms (see the definition of + * SFE_IPV6_UNALIGNED_STRUCT) + */ +struct sfe_ipv6_tcp_hdr { + __be16 source; + __be16 dest; + __be32 seq; + __be32 ack_seq; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u16 res1:4, + doff:4, + fin:1, + syn:1, + rst:1, + psh:1, + ack:1, + urg:1, + ece:1, + cwr:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u16 doff:4, + res1:4, + cwr:1, + ece:1, + urg:1, + ack:1, + psh:1, + rst:1, + syn:1, + fin:1; +#else +#error "Adjust your defines" +#endif + __be16 window; + __sum16 check; + __be16 urg_ptr; +} SFE_IPV6_UNALIGNED_STRUCT; + +/* + * Specifies the lower bound on ACK numbers carried in the TCP header + */ +#define SFE_IPV6_TCP_MAX_ACK_WINDOW 65520 + +/* + * IPv6 TCP connection match additional data. + */ +struct sfe_ipv6_tcp_connection_match { + u8 win_scale; /* Window scale */ + u32 max_win; /* Maximum window size seen */ + u32 end; /* Sequence number of the next byte to send (seq + segment length) */ + u32 max_end; /* Sequence number of the last byte to ack */ +}; + +/* + * Bit flags for IPv6 connection matching entry. + */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC (1<<0) + /* Perform source translation */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST (1<<1) + /* Perform destination translation */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK (1<<2) + /* Ignore TCP sequence numbers */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR (1<<3) + /* Fast Ethernet header write */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR (1<<4) + /* Fast Ethernet header write */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK (1<<5) + /* remark priority of SKB */ +#define SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK (1<<6) + /* remark DSCP of packet */ + +/* + * IPv6 connection matching structure. + */ +struct sfe_ipv6_connection_match { + /* + * References to other objects. + */ + struct sfe_ipv6_connection_match *next; + struct sfe_ipv6_connection_match *prev; + struct sfe_ipv6_connection *connection; + struct sfe_ipv6_connection_match *counter_match; + /* Matches the flow in the opposite direction as the one in connection */ + struct sfe_ipv6_connection_match *active_next; + struct sfe_ipv6_connection_match *active_prev; + bool active; /* Flag to indicate if we're on the active list */ + + /* + * Characteristics that identify flows that match this rule. 
+ */ + struct net_device *match_dev; /* Network device */ + u8 match_protocol; /* Protocol */ + struct sfe_ipv6_addr match_src_ip[1]; /* Source IP address */ + struct sfe_ipv6_addr match_dest_ip[1]; /* Destination IP address */ + __be16 match_src_port; /* Source port/connection ident */ + __be16 match_dest_port; /* Destination port/connection ident */ + + /* + * Control the operations of the match. + */ + u32 flags; /* Bit flags */ +#ifdef CONFIG_XFRM + u32 flow_accel; /* The flow accelerated or not */ +#endif + + /* + * Connection state that we track once we match. + */ + union { /* Protocol-specific state */ + struct sfe_ipv6_tcp_connection_match tcp; + } protocol_state; + /* + * Stats recorded in a sync period. These stats will be added to + * rx_packet_count64/rx_byte_count64 after a sync period. + */ + u32 rx_packet_count; + u32 rx_byte_count; + + /* + * Packet translation information. + */ + struct sfe_ipv6_addr xlate_src_ip[1]; /* Address after source translation */ + __be16 xlate_src_port; /* Port/connection ident after source translation */ + u16 xlate_src_csum_adjustment; + /* Transport layer checksum adjustment after source translation */ + struct sfe_ipv6_addr xlate_dest_ip[1]; /* Address after destination translation */ + __be16 xlate_dest_port; /* Port/connection ident after destination translation */ + u16 xlate_dest_csum_adjustment; + /* Transport layer checksum adjustment after destination translation */ + + /* + * QoS information + */ + u32 priority; + u32 dscp; + + /* + * Packet transmit information. + */ + struct net_device *xmit_dev; /* Network device on which to transmit */ + unsigned short int xmit_dev_mtu; + /* Interface MTU */ + u16 xmit_dest_mac[ETH_ALEN / 2]; + /* Destination MAC address to use when forwarding */ + u16 xmit_src_mac[ETH_ALEN / 2]; + /* Source MAC address to use when forwarding */ + + /* + * Summary stats. + */ + u64 rx_packet_count64; + u64 rx_byte_count64; +}; + +/* + * Per-connection data structure. + */ +struct sfe_ipv6_connection { + struct sfe_ipv6_connection *next; + /* Pointer to the next entry in a hash chain */ + struct sfe_ipv6_connection *prev; + /* Pointer to the previous entry in a hash chain */ + int protocol; /* IP protocol number */ + struct sfe_ipv6_addr src_ip[1]; /* Src IP addr pre-translation */ + struct sfe_ipv6_addr src_ip_xlate[1]; /* Src IP addr post-translation */ + struct sfe_ipv6_addr dest_ip[1]; /* Dest IP addr pre-translation */ + struct sfe_ipv6_addr dest_ip_xlate[1]; /* Dest IP addr post-translation */ + __be16 src_port; /* Src port pre-translation */ + __be16 src_port_xlate; /* Src port post-translation */ + __be16 dest_port; /* Dest port pre-translation */ + __be16 dest_port_xlate; /* Dest port post-translation */ + struct sfe_ipv6_connection_match *original_match; + /* Original direction matching structure */ + struct net_device *original_dev; + /* Original direction source device */ + struct sfe_ipv6_connection_match *reply_match; + /* Reply direction matching structure */ + struct net_device *reply_dev; /* Reply direction source device */ + u64 last_sync_jiffies; /* Jiffies count for the last sync */ + struct sfe_ipv6_connection *all_connections_next; + /* Pointer to the next entry in the list of all connections */ + struct sfe_ipv6_connection *all_connections_prev; + /* Pointer to the previous entry in the list of all connections */ + u32 mark; /* mark for outgoing packet */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * IPv6 connections and hash table size information. 
+ */ +#define SFE_IPV6_CONNECTION_HASH_SHIFT 12 +#define SFE_IPV6_CONNECTION_HASH_SIZE (1 << SFE_IPV6_CONNECTION_HASH_SHIFT) +#define SFE_IPV6_CONNECTION_HASH_MASK (SFE_IPV6_CONNECTION_HASH_SIZE - 1) + +enum sfe_ipv6_exception_events { + SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION, + SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL, + SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION, + SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL, + SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION, + SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS, + SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS, + SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK, + SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS, + SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_IP_OPTIONS_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UDP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_TCP_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL, + SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION, + SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION, + SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_BAD_TOTAL_LENGTH, + SFE_IPV6_EXCEPTION_EVENT_NON_V6, + SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT, + SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_IP_OPTIONS_INCOMPLETE, + SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL, + SFE_IPV6_EXCEPTION_EVENT_FLOW_COOKIE_ADD_FAIL, + SFE_IPV6_EXCEPTION_EVENT_LAST +}; + +static char *sfe_ipv6_exception_events_string[SFE_IPV6_EXCEPTION_EVENT_LAST] = { + "UDP_HEADER_INCOMPLETE", + "UDP_NO_CONNECTION", + "UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "UDP_SMALL_TTL", + "UDP_NEEDS_FRAGMENTATION", + "TCP_HEADER_INCOMPLETE", + "TCP_NO_CONNECTION_SLOW_FLAGS", + "TCP_NO_CONNECTION_FAST_FLAGS", + "TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT", + "TCP_SMALL_TTL", + "TCP_NEEDS_FRAGMENTATION", + "TCP_FLAGS", + "TCP_SEQ_EXCEEDS_RIGHT_EDGE", + "TCP_SMALL_DATA_OFFS", + "TCP_BAD_SACK", + "TCP_BIG_DATA_OFFS", + "TCP_SEQ_BEFORE_LEFT_EDGE", + "TCP_ACK_EXCEEDS_RIGHT_EDGE", + "TCP_ACK_BEFORE_LEFT_EDGE", + "ICMP_HEADER_INCOMPLETE", + "ICMP_UNHANDLED_TYPE", + "ICMP_IPV6_HEADER_INCOMPLETE", + "ICMP_IPV6_NON_V6", + "ICMP_IPV6_IP_OPTIONS_INCOMPLETE", + "ICMP_IPV6_UDP_HEADER_INCOMPLETE", + "ICMP_IPV6_TCP_HEADER_INCOMPLETE", + "ICMP_IPV6_UNHANDLED_PROTOCOL", + "ICMP_NO_CONNECTION", + "ICMP_FLUSHED_CONNECTION", + "HEADER_INCOMPLETE", + "BAD_TOTAL_LENGTH", + "NON_V6", + "NON_INITIAL_FRAGMENT", + "DATAGRAM_INCOMPLETE", + "IP_OPTIONS_INCOMPLETE", + "UNHANDLED_PROTOCOL", + "FLOW_COOKIE_ADD_FAIL" +}; + +/* + * Per-module structure. 
+ */ +struct sfe_ipv6 { + spinlock_t lock; /* Lock for SMP correctness */ + struct sfe_ipv6_connection_match *active_head; + /* Head of the list of recently active connections */ + struct sfe_ipv6_connection_match *active_tail; + /* Tail of the list of recently active connections */ + struct sfe_ipv6_connection *all_connections_head; + /* Head of the list of all connections */ + struct sfe_ipv6_connection *all_connections_tail; + /* Tail of the list of all connections */ + unsigned int num_connections; /* Number of connections */ + struct timer_list timer; /* Timer used for periodic sync ops */ + sfe_sync_rule_callback_t __rcu sync_rule_callback; + /* Callback function registered by a connection manager for stats syncing */ + struct sfe_ipv6_connection *conn_hash[SFE_IPV6_CONNECTION_HASH_SIZE]; + /* Connection hash table */ + struct sfe_ipv6_connection_match *conn_match_hash[SFE_IPV6_CONNECTION_HASH_SIZE]; + /* Connection match hash table */ + + /* + * Stats recorded in a sync period. These stats will be added to + * connection_xxx64 after a sync period. + */ + u32 connection_create_requests; + /* Number of IPv6 connection create requests */ + u32 connection_create_collisions; + /* Number of IPv6 connection create requests that collided with existing hash table entries */ + u32 connection_destroy_requests; + /* Number of IPv6 connection destroy requests */ + u32 connection_destroy_misses; + /* Number of IPv6 connection destroy requests that missed our hash table */ + u32 connection_match_hash_hits; + /* Number of IPv6 connection match hash hits */ + u32 connection_match_hash_reorders; + /* Number of IPv6 connection match hash reorders */ + u32 connection_flushes; /* Number of IPv6 connection flushes */ + u32 packets_forwarded; /* Number of IPv6 packets forwarded */ + u32 packets_not_forwarded; /* Number of IPv6 packets not forwarded */ + u32 exception_events[SFE_IPV6_EXCEPTION_EVENT_LAST]; + + /* + * Summary statistics. + */ + u64 connection_create_requests64; + /* Number of IPv6 connection create requests */ + u64 connection_create_collisions64; + /* Number of IPv6 connection create requests that collided with existing hash table entries */ + u64 connection_destroy_requests64; + /* Number of IPv6 connection destroy requests */ + u64 connection_destroy_misses64; + /* Number of IPv6 connection destroy requests that missed our hash table */ + u64 connection_match_hash_hits64; + /* Number of IPv6 connection match hash hits */ + u64 connection_match_hash_reorders64; + /* Number of IPv6 connection match hash reorders */ + u64 connection_flushes64; /* Number of IPv6 connection flushes */ + u64 packets_forwarded64; /* Number of IPv6 packets forwarded */ + u64 packets_not_forwarded64; + /* Number of IPv6 packets not forwarded */ + u64 exception_events64[SFE_IPV6_EXCEPTION_EVENT_LAST]; + + /* + * Control state. + */ + struct kobject *sys_sfe_ipv6; /* sysfs linkage */ + int debug_dev; /* Major number of the debug char device */ + u32 debug_read_seq; /* sequence number for debug dump */ +}; + +/* + * Enumeration of the XML output. 
+ */ +enum sfe_ipv6_debug_xml_states { + SFE_IPV6_DEBUG_XML_STATE_START, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_START, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_CONNECTION, + SFE_IPV6_DEBUG_XML_STATE_CONNECTIONS_END, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_START, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_EXCEPTION, + SFE_IPV6_DEBUG_XML_STATE_EXCEPTIONS_END, + SFE_IPV6_DEBUG_XML_STATE_STATS, + SFE_IPV6_DEBUG_XML_STATE_END, + SFE_IPV6_DEBUG_XML_STATE_DONE +}; + +/* + * XML write state. + */ +struct sfe_ipv6_debug_xml_write_state { + enum sfe_ipv6_debug_xml_states state; + /* XML output file state machine state */ + int iter_exception; /* Next exception iterator */ +}; + +typedef bool (*sfe_ipv6_debug_xml_write_method_t)(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length, + int *total_read, struct sfe_ipv6_debug_xml_write_state *ws); + +static struct sfe_ipv6 __si6; + +/* + * sfe_ipv6_get_debug_dev() + */ +static ssize_t sfe_ipv6_get_debug_dev(struct device *dev, struct device_attribute *attr, char *buf); + +/* + * sysfs attributes. + */ +static const struct device_attribute sfe_ipv6_debug_dev_attr = + __ATTR(debug_dev, S_IWUSR | S_IRUGO, sfe_ipv6_get_debug_dev, NULL); + +/* + * sfe_ipv6_is_ext_hdr() + * check if we recognize ipv6 extension header + */ +static inline bool sfe_ipv6_is_ext_hdr(u8 hdr) +{ + return (hdr == SFE_IPV6_EXT_HDR_HOP) || + (hdr == SFE_IPV6_EXT_HDR_ROUTING) || + (hdr == SFE_IPV6_EXT_HDR_FRAG) || + (hdr == SFE_IPV6_EXT_HDR_AH) || + (hdr == SFE_IPV6_EXT_HDR_DST) || + (hdr == SFE_IPV6_EXT_HDR_MH); +} + +/* + * sfe_ipv6_change_dsfield() + * change dscp field in IPv6 packet + */ +static inline void sfe_ipv6_change_dsfield(struct sfe_ipv6_ip_hdr *iph, u8 dscp) +{ + __be16 *p = (__be16 *)iph; + + *p = ((*p & htons(SFE_IPV6_DSCP_MASK)) | htons((u16)dscp << 4)); +} + +/* + * sfe_ipv6_get_connection_match_hash() + * Generate the hash used in connection match lookups. + */ +static inline unsigned int sfe_ipv6_get_connection_match_hash(struct net_device *dev, u8 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + u32 idx, hash = 0; + size_t dev_addr = (size_t)dev; + + for (idx = 0; idx < 4; idx++) { + hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx]; + } + hash = ((u32)dev_addr) ^ hash ^ protocol ^ ntohs(src_port ^ dest_port); + return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK; +} + +/* + * sfe_ipv6_find_connection_match() + * Get the IPv6 flow match info that corresponds to a particular 5-tuple. + * + * On entry we must be holding the lock that protects the hash table. + */ +static struct sfe_ipv6_connection_match * +sfe_ipv6_find_connection_match(struct sfe_ipv6 *si, struct net_device *dev, u8 protocol, + struct sfe_ipv6_addr *src_ip, __be16 src_port, + struct sfe_ipv6_addr *dest_ip, __be16 dest_port) +{ + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection_match *head; + unsigned int conn_match_idx; + + conn_match_idx = sfe_ipv6_get_connection_match_hash(dev, protocol, src_ip, src_port, dest_ip, dest_port); + cm = si->conn_match_hash[conn_match_idx]; + + /* + * If we don't have anything in this chain then bail. + */ + if (unlikely(!cm)) { + return NULL; + } + + /* + * Hopefully the first entry is the one we want. 
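+	 * If it isn't we walk the chain below and move the matching entry to
+	 * the head, so a chain A -> B -> C becomes B -> A -> C after a hit on
+	 * B, and the next packet of that flow matches on the first compare.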
+	 */
+	if ((cm->match_src_port == src_port)
+	    && (cm->match_dest_port == dest_port)
+	    && (sfe_ipv6_addr_equal(cm->match_src_ip, src_ip))
+	    && (sfe_ipv6_addr_equal(cm->match_dest_ip, dest_ip))
+	    && (cm->match_protocol == protocol)
+	    && (cm->match_dev == dev)) {
+		si->connection_match_hash_hits++;
+		return cm;
+	}
+
+	/*
+	 * Unfortunately we didn't find it at the head, so we search the rest of
+	 * the chain and move the matching entry to the head of the hash chain.
+	 * We presume that it will be needed again very soon.
+	 */
+	head = cm;
+	do {
+		cm = cm->next;
+	} while (cm && (cm->match_src_port != src_port
+			|| cm->match_dest_port != dest_port
+			|| !sfe_ipv6_addr_equal(cm->match_src_ip, src_ip)
+			|| !sfe_ipv6_addr_equal(cm->match_dest_ip, dest_ip)
+			|| cm->match_protocol != protocol
+			|| cm->match_dev != dev));
+
+	/*
+	 * Not found then we're done.
+	 */
+	if (unlikely(!cm)) {
+		return NULL;
+	}
+
+	/*
+	 * We found a match so move it.
+	 */
+	if (cm->next) {
+		cm->next->prev = cm->prev;
+	}
+	cm->prev->next = cm->next;
+	cm->prev = NULL;
+	cm->next = head;
+	head->prev = cm;
+	si->conn_match_hash[conn_match_idx] = cm;
+	si->connection_match_hash_reorders++;
+
+	return cm;
+}
+
+/*
+ * sfe_ipv6_connection_match_update_summary_stats()
+ * Update the summary stats for a connection match entry.
+ */
+static inline void sfe_ipv6_connection_match_update_summary_stats(struct sfe_ipv6_connection_match *cm)
+{
+	cm->rx_packet_count64 += cm->rx_packet_count;
+	cm->rx_packet_count = 0;
+	cm->rx_byte_count64 += cm->rx_byte_count;
+	cm->rx_byte_count = 0;
+}
+
+/*
+ * sfe_ipv6_connection_match_compute_translations()
+ * Compute port and address translations for a connection match entry.
+ */
+static void sfe_ipv6_connection_match_compute_translations(struct sfe_ipv6_connection_match *cm)
+{
+	u32 diff[9];
+	u32 *idx_32;
+	u16 *idx_16;
+
+	/*
+	 * Before we insert the entry look to see if this is tagged as doing address
+	 * translations. If it is then work out the adjustment that we need to apply
+	 * to the transport checksum.
+	 */
+	if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC) {
+		u32 adj = 0;
+		u32 carry = 0;
+
+		/*
+		 * Precompute an incremental checksum adjustment so we can
+		 * edit packets in this stream very quickly. The algorithm is from RFC1624.
+		 */
+		idx_32 = diff;
+		*(idx_32++) = cm->match_src_ip->addr[0];
+		*(idx_32++) = cm->match_src_ip->addr[1];
+		*(idx_32++) = cm->match_src_ip->addr[2];
+		*(idx_32++) = cm->match_src_ip->addr[3];
+
+		idx_16 = (u16 *)idx_32;
+		*(idx_16++) = cm->match_src_port;
+		*(idx_16++) = ~cm->xlate_src_port;
+		idx_32 = (u32 *)idx_16;
+
+		*(idx_32++) = ~cm->xlate_src_ip->addr[0];
+		*(idx_32++) = ~cm->xlate_src_ip->addr[1];
+		*(idx_32++) = ~cm->xlate_src_ip->addr[2];
+		*(idx_32++) = ~cm->xlate_src_ip->addr[3];
+
+		/*
+		 * When we compute this, fold it down to a 16-bit offset. That way
+		 * we avoid double-folding the twos-complement result, because
+		 * adding two 16-bit values cannot wrap around more than once.
+		 */
+		for (idx_32 = diff; idx_32 < diff + 9; idx_32++) {
+			u32 w = *idx_32;
+			adj += carry;
+			adj += w;
+			carry = (w > adj);
+		}
+		adj += carry;
+		adj = (adj & 0xffff) + (adj >> 16);
+		adj = (adj & 0xffff) + (adj >> 16);
+		cm->xlate_src_csum_adjustment = (u16)adj;
+	}
+
+	if (cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST) {
+		u32 adj = 0;
+		u32 carry = 0;
+
+		/*
+		 * Precompute an incremental checksum adjustment so we can
+		 * edit packets in this stream very quickly. The algorithm is from RFC1624.
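+		 *
+		 * For illustration: an unchanged field contributes nothing,
+		 * because x + ~x sums to the one's-complement identity. So if
+		 * only the destination port changes from 0x1234 to 0xabcd, the
+		 * effective adjustment is 0x1234 + ~0xabcd = 0x1234 + 0x5432
+		 * = 0x6666, and per-packet translation becomes one add-and-fold
+		 * on the existing checksum instead of a full recompute.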
+		 */
+		idx_32 = diff;
+		*(idx_32++) = cm->match_dest_ip->addr[0];
+		*(idx_32++) = cm->match_dest_ip->addr[1];
+		*(idx_32++) = cm->match_dest_ip->addr[2];
+		*(idx_32++) = cm->match_dest_ip->addr[3];
+
+		idx_16 = (u16 *)idx_32;
+		*(idx_16++) = cm->match_dest_port;
+		*(idx_16++) = ~cm->xlate_dest_port;
+		idx_32 = (u32 *)idx_16;
+
+		*(idx_32++) = ~cm->xlate_dest_ip->addr[0];
+		*(idx_32++) = ~cm->xlate_dest_ip->addr[1];
+		*(idx_32++) = ~cm->xlate_dest_ip->addr[2];
+		*(idx_32++) = ~cm->xlate_dest_ip->addr[3];
+
+		/*
+		 * When we compute this, fold it down to a 16-bit offset. That way
+		 * we avoid double-folding the twos-complement result, because
+		 * adding two 16-bit values cannot wrap around more than once.
+		 */
+		for (idx_32 = diff; idx_32 < diff + 9; idx_32++) {
+			u32 w = *idx_32;
+			adj += carry;
+			adj += w;
+			carry = (w > adj);
+		}
+		adj += carry;
+		adj = (adj & 0xffff) + (adj >> 16);
+		adj = (adj & 0xffff) + (adj >> 16);
+		cm->xlate_dest_csum_adjustment = (u16)adj;
+	}
+}
+
+/*
+ * sfe_ipv6_update_summary_stats()
+ * Update the summary stats.
+ */
+static void sfe_ipv6_update_summary_stats(struct sfe_ipv6 *si)
+{
+	int i;
+
+	si->connection_create_requests64 += si->connection_create_requests;
+	si->connection_create_requests = 0;
+	si->connection_create_collisions64 += si->connection_create_collisions;
+	si->connection_create_collisions = 0;
+	si->connection_destroy_requests64 += si->connection_destroy_requests;
+	si->connection_destroy_requests = 0;
+	si->connection_destroy_misses64 += si->connection_destroy_misses;
+	si->connection_destroy_misses = 0;
+	si->connection_match_hash_hits64 += si->connection_match_hash_hits;
+	si->connection_match_hash_hits = 0;
+	si->connection_match_hash_reorders64 += si->connection_match_hash_reorders;
+	si->connection_match_hash_reorders = 0;
+	si->connection_flushes64 += si->connection_flushes;
+	si->connection_flushes = 0;
+	si->packets_forwarded64 += si->packets_forwarded;
+	si->packets_forwarded = 0;
+	si->packets_not_forwarded64 += si->packets_not_forwarded;
+	si->packets_not_forwarded = 0;
+
+	for (i = 0; i < SFE_IPV6_EXCEPTION_EVENT_LAST; i++) {
+		si->exception_events64[i] += si->exception_events[i];
+		si->exception_events[i] = 0;
+	}
+}
+
+/*
+ * sfe_ipv6_insert_connection_match()
+ * Insert a connection match into the hash.
+ *
+ * On entry we must be holding the lock that protects the hash table.
+ */
+static inline void sfe_ipv6_insert_connection_match(struct sfe_ipv6 *si,
+						    struct sfe_ipv6_connection_match *cm)
+{
+	struct sfe_ipv6_connection_match **hash_head;
+	struct sfe_ipv6_connection_match *prev_head;
+	unsigned int conn_match_idx
+		= sfe_ipv6_get_connection_match_hash(cm->match_dev, cm->match_protocol,
+						     cm->match_src_ip, cm->match_src_port,
+						     cm->match_dest_ip, cm->match_dest_port);
+
+	hash_head = &si->conn_match_hash[conn_match_idx];
+	prev_head = *hash_head;
+	cm->prev = NULL;
+	if (prev_head) {
+		prev_head->prev = cm;
+	}
+
+	cm->next = prev_head;
+	*hash_head = cm;
+}
+
+/*
+ * sfe_ipv6_remove_connection_match()
+ * Remove a connection match object from the hash.
+ *
+ * On entry we must be holding the lock that protects the hash table.
+ */
+static inline void sfe_ipv6_remove_connection_match(struct sfe_ipv6 *si, struct sfe_ipv6_connection_match *cm)
+{
+	/*
+	 * Unlink the connection match entry from the hash.
+	 */
+	if (cm->prev) {
+		cm->prev->next = cm->next;
+	} else {
+		unsigned int conn_match_idx
+			= sfe_ipv6_get_connection_match_hash(cm->match_dev, cm->match_protocol,
+							     cm->match_src_ip, cm->match_src_port,
+							     cm->match_dest_ip, cm->match_dest_port);
+		si->conn_match_hash[conn_match_idx] = cm->next;
+	}
+
+	if (cm->next) {
+		cm->next->prev = cm->prev;
+	}
+
+	/*
+	 * If the connection match entry is in the active list remove it.
+	 */
+	if (cm->active) {
+		if (likely(cm->active_prev)) {
+			cm->active_prev->active_next = cm->active_next;
+		} else {
+			si->active_head = cm->active_next;
+		}
+
+		if (likely(cm->active_next)) {
+			cm->active_next->active_prev = cm->active_prev;
+		} else {
+			si->active_tail = cm->active_prev;
+		}
+	}
+}
+
+/*
+ * sfe_ipv6_get_connection_hash()
+ * Generate the hash used in connection lookups.
+ */
+static inline unsigned int sfe_ipv6_get_connection_hash(u8 protocol, struct sfe_ipv6_addr *src_ip, __be16 src_port,
+							struct sfe_ipv6_addr *dest_ip, __be16 dest_port)
+{
+	u32 idx, hash = 0;
+
+	for (idx = 0; idx < 4; idx++) {
+		hash ^= src_ip->addr[idx] ^ dest_ip->addr[idx];
+	}
+	hash = hash ^ protocol ^ ntohs(src_port ^ dest_port);
+	return ((hash >> SFE_IPV6_CONNECTION_HASH_SHIFT) ^ hash) & SFE_IPV6_CONNECTION_HASH_MASK;
+}
+
+/*
+ * sfe_ipv6_find_connection()
+ * Get the IPv6 connection info that corresponds to a particular 5-tuple.
+ *
+ * On entry we must be holding the lock that protects the hash table.
+ */
+static inline struct sfe_ipv6_connection *sfe_ipv6_find_connection(struct sfe_ipv6 *si, u32 protocol,
+								   struct sfe_ipv6_addr *src_ip, __be16 src_port,
+								   struct sfe_ipv6_addr *dest_ip, __be16 dest_port)
+{
+	struct sfe_ipv6_connection *c;
+	unsigned int conn_idx = sfe_ipv6_get_connection_hash(protocol, src_ip, src_port, dest_ip, dest_port);
+	c = si->conn_hash[conn_idx];
+
+	/*
+	 * If we don't have anything in this chain then bail.
+	 */
+	if (unlikely(!c)) {
+		return NULL;
+	}
+
+	/*
+	 * Hopefully the first entry is the one we want.
+	 */
+	if ((c->src_port == src_port)
+	    && (c->dest_port == dest_port)
+	    && (sfe_ipv6_addr_equal(c->src_ip, src_ip))
+	    && (sfe_ipv6_addr_equal(c->dest_ip, dest_ip))
+	    && (c->protocol == protocol)) {
+		return c;
+	}
+
+	/*
+	 * Unfortunately we didn't find it at the head, so we search the rest of the chain.
+	 */
+	do {
+		c = c->next;
+	} while (c && (c->src_port != src_port
+		       || c->dest_port != dest_port
+		       || !sfe_ipv6_addr_equal(c->src_ip, src_ip)
+		       || !sfe_ipv6_addr_equal(c->dest_ip, dest_ip)
+		       || c->protocol != protocol));
+
+	/*
+	 * The connection entry will be needed again soon for create/destroy
+	 * requests on the same 5-tuple, so there is no need to re-order the
+	 * chain for these lookups.
+	 */
+	return c;
+}
+
+/*
+ * sfe_ipv6_mark_rule()
+ * Update the mark for a currently offloaded connection.
+ *
+ * Takes the hash lock upon entry.
+ */
+void sfe_ipv6_mark_rule(struct sfe_connection_mark *mark)
+{
+	struct sfe_ipv6 *si = &__si6;
+	struct sfe_ipv6_connection *c;
+
+	spin_lock_bh(&si->lock);
+	c = sfe_ipv6_find_connection(si, mark->protocol,
+				     mark->src_ip.ip6, mark->src_port,
+				     mark->dest_ip.ip6, mark->dest_port);
+	if (c) {
+		WARN_ON((0 != c->mark) && (0 == mark->mark));
+		c->mark = mark->mark;
+	}
+	spin_unlock_bh(&si->lock);
+
+	if (c) {
+		DEBUG_TRACE("Matching connection found for mark, "
+			    "setting from %08x to %08x\n",
+			    c->mark, mark->mark);
+	}
+}
+
+/*
+ * sfe_ipv6_insert_connection()
+ * Insert a connection into the hash.
+ *
+ * On entry we must be holding the lock that protects the hash table.
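+ *
+ * Callers take si->lock with spin_lock_bh(), as sfe_ipv6_create_rule()
+ * does below.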
+ */ +static void sfe_ipv6_insert_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) +{ + struct sfe_ipv6_connection **hash_head; + struct sfe_ipv6_connection *prev_head; + unsigned int conn_idx; + + /* + * Insert entry into the connection hash. + */ + conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + hash_head = &si->conn_hash[conn_idx]; + prev_head = *hash_head; + c->prev = NULL; + if (prev_head) { + prev_head->prev = c; + } + + c->next = prev_head; + *hash_head = c; + + /* + * Insert entry into the "all connections" list. + */ + if (si->all_connections_tail) { + c->all_connections_prev = si->all_connections_tail; + si->all_connections_tail->all_connections_next = c; + } else { + c->all_connections_prev = NULL; + si->all_connections_head = c; + } + + si->all_connections_tail = c; + c->all_connections_next = NULL; + si->num_connections++; + + /* + * Insert the connection match objects too. + */ + sfe_ipv6_insert_connection_match(si, c->original_match); + sfe_ipv6_insert_connection_match(si, c->reply_match); +} + +/* + * sfe_ipv6_remove_connection() + * Remove a sfe_ipv6_connection object from the hash. + * + * On entry we must be holding the lock that protects the hash table. + */ +static void sfe_ipv6_remove_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c) +{ + /* + * Remove the connection match objects. + */ + sfe_ipv6_remove_connection_match(si, c->reply_match); + sfe_ipv6_remove_connection_match(si, c->original_match); + + /* + * Unlink the connection. + */ + if (c->prev) { + c->prev->next = c->next; + } else { + unsigned int conn_idx = sfe_ipv6_get_connection_hash(c->protocol, c->src_ip, c->src_port, + c->dest_ip, c->dest_port); + si->conn_hash[conn_idx] = c->next; + } + + if (c->next) { + c->next->prev = c->prev; + } + + /* + * Unlink connection from all_connections list + */ + if (c->all_connections_prev) { + c->all_connections_prev->all_connections_next = c->all_connections_next; + } else { + si->all_connections_head = c->all_connections_next; + } + + if (c->all_connections_next) { + c->all_connections_next->all_connections_prev = c->all_connections_prev; + } else { + si->all_connections_tail = c->all_connections_prev; + } + + si->num_connections--; +} + +/* + * sfe_ipv6_gen_sync_connection() + * Sync a connection. + * + * On entry to this function we expect that the lock for the connection is either + * already held or isn't required. + */ +static void sfe_ipv6_gen_sync_connection(struct sfe_ipv6 *si, struct sfe_ipv6_connection *c, + struct sfe_connection_sync *sis, sfe_sync_reason_t reason, + u64 now_jiffies) +{ + struct sfe_ipv6_connection_match *original_cm; + struct sfe_ipv6_connection_match *reply_cm; + + /* + * Fill in the update message. 
+ */ + sis->is_v6 = 1; + sis->protocol = c->protocol; + sis->src_ip.ip6[0] = c->src_ip[0]; + sis->src_ip_xlate.ip6[0] = c->src_ip_xlate[0]; + sis->dest_ip.ip6[0] = c->dest_ip[0]; + sis->dest_ip_xlate.ip6[0] = c->dest_ip_xlate[0]; + sis->src_port = c->src_port; + sis->src_port_xlate = c->src_port_xlate; + sis->dest_port = c->dest_port; + sis->dest_port_xlate = c->dest_port_xlate; + + original_cm = c->original_match; + reply_cm = c->reply_match; + sis->src_td_max_window = original_cm->protocol_state.tcp.max_win; + sis->src_td_end = original_cm->protocol_state.tcp.end; + sis->src_td_max_end = original_cm->protocol_state.tcp.max_end; + sis->dest_td_max_window = reply_cm->protocol_state.tcp.max_win; + sis->dest_td_end = reply_cm->protocol_state.tcp.end; + sis->dest_td_max_end = reply_cm->protocol_state.tcp.max_end; + + sis->src_new_packet_count = original_cm->rx_packet_count; + sis->src_new_byte_count = original_cm->rx_byte_count; + sis->dest_new_packet_count = reply_cm->rx_packet_count; + sis->dest_new_byte_count = reply_cm->rx_byte_count; + + sfe_ipv6_connection_match_update_summary_stats(original_cm); + sfe_ipv6_connection_match_update_summary_stats(reply_cm); + + sis->src_dev = original_cm->match_dev; + sis->src_packet_count = original_cm->rx_packet_count64; + sis->src_byte_count = original_cm->rx_byte_count64; + + sis->dest_dev = reply_cm->match_dev; + sis->dest_packet_count = reply_cm->rx_packet_count64; + sis->dest_byte_count = reply_cm->rx_byte_count64; + + sis->reason = reason; + + /* + * Get the time increment since our last sync. + */ + sis->delta_jiffies = now_jiffies - c->last_sync_jiffies; + c->last_sync_jiffies = now_jiffies; +} + +/* + * sfe_ipv6_flush_connection() + * Flush a connection and free all associated resources. + * + * We need to be called with bottom halves disabled locally as we need to acquire + * the connection hash lock and release it again. In general we're actually called + * from within a BH and so we're fine, but we're also called when connections are + * torn down. + */ +static void sfe_ipv6_flush_connection(struct sfe_ipv6 *si, + struct sfe_ipv6_connection *c, + sfe_sync_reason_t reason) +{ + struct sfe_connection_sync sis; + u64 now_jiffies; + sfe_sync_rule_callback_t sync_rule_callback; + + rcu_read_lock(); + spin_lock_bh(&si->lock); + si->connection_flushes++; + sync_rule_callback = rcu_dereference(si->sync_rule_callback); + spin_unlock_bh(&si->lock); + + if (sync_rule_callback) { + /* + * Generate a sync message and then sync. + */ + now_jiffies = get_jiffies_64(); + sfe_ipv6_gen_sync_connection(si, c, &sis, reason, now_jiffies); + sync_rule_callback(&sis); + } + + rcu_read_unlock(); + + /* + * Release our hold of the source and dest devices and free the memory + * for our connection objects. + */ + dev_put(c->original_dev); + dev_put(c->reply_dev); + kfree(c->original_match); + kfree(c->reply_match); + kfree(c); +} + +/* + * sfe_ipv6_recv_udp() + * Handle UDP packet receives and forwarding. + */ +static int sfe_ipv6_recv_udp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl, bool flush_on_find) +{ + struct sfe_ipv6_udp_hdr *udph; + struct sfe_ipv6_addr *src_ip; + struct sfe_ipv6_addr *dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv6_connection_match *cm; + struct net_device *xmit_dev; + + /* + * Is our packet too short to contain a valid UDP header? 
+	 */
+	if (!pskb_may_pull(skb, (sizeof(struct sfe_ipv6_udp_hdr) + ihl))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_HEADER_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("packet too short for UDP header\n");
+		return 0;
+	}
+
+	/*
+	 * Read the IP address and port information. Read the IP header data first
+	 * because we've almost certainly got that in the cache. We may not yet have
+	 * the UDP header cached though so allow more time for any prefetching.
+	 */
+	src_ip = &iph->saddr;
+	dest_ip = &iph->daddr;
+
+	udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl);
+	src_port = udph->source;
+	dest_port = udph->dest;
+
+	spin_lock_bh(&si->lock);
+
+	/*
+	 * Look for a connection match.
+	 */
+	cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_UDP, src_ip, src_port, dest_ip, dest_port);
+	if (unlikely(!cm)) {
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_NO_CONNECTION]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("no connection found\n");
+		return 0;
+	}
+
+	/*
+	 * If our packet has been marked as "flush on find" we can't actually
+	 * forward it in the fast path, but now that we've found an associated
+	 * connection we can flush that out before we process the packet.
+	 */
+	if (unlikely(flush_on_find)) {
+		struct sfe_ipv6_connection *c = cm->connection;
+		sfe_ipv6_remove_connection(si, c);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("flush on find\n");
+		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+#ifdef CONFIG_XFRM
+	/*
+	 * We can't accelerate the flow on this direction, just let it go
+	 * through the slow path.
+	 */
+	if (unlikely(!cm->flow_accel)) {
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+		return 0;
+	}
+#endif
+
+	/*
+	 * Does our hop_limit allow forwarding?
+	 */
+	if (unlikely(iph->hop_limit < 2)) {
+		struct sfe_ipv6_connection *c = cm->connection;
+		sfe_ipv6_remove_connection(si, c);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_SMALL_TTL]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("hop_limit too low\n");
+		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+	/*
+	 * If our packet is larger than the MTU of the transmit interface then
+	 * we can't forward it easily.
+	 */
+	if (unlikely(len > cm->xmit_dev_mtu)) {
+		struct sfe_ipv6_connection *c = cm->connection;
+		sfe_ipv6_remove_connection(si, c);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UDP_NEEDS_FRAGMENTATION]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("larger than mtu\n");
+		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
+		return 0;
+	}
+
+	/*
+	 * From this point on we're good to modify the packet.
+	 */
+
+	/*
+	 * Update DSCP
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) {
+		sfe_ipv6_change_dsfield(iph, cm->dscp);
+	}
+
+	/*
+	 * Decrement our hop_limit.
+	 */
+	iph->hop_limit -= 1;
+
+	/*
+	 * Do we have to perform translations of the source address/port?
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
+		u16 udp_csum;
+
+		iph->saddr = cm->xlate_src_ip[0];
+		udph->source = cm->xlate_src_port;
+
+		/*
+		 * Do we have a non-zero UDP checksum? If we do then we need
+		 * to update it.
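+		 * A zero value means the sender supplied no checksum (the
+		 * IPv4 UDP convention; RFC 2460 makes the checksum mandatory
+		 * over IPv6, but the guard is kept here), so we only adjust
+		 * a non-zero value.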
+		 */
+		udp_csum = udph->check;
+		if (likely(udp_csum)) {
+			u32 sum = udp_csum + cm->xlate_src_csum_adjustment;
+			sum = (sum & 0xffff) + (sum >> 16);
+			udph->check = (u16)sum;
+		}
+	}
+
+	/*
+	 * Do we have to perform translations of the destination address/port?
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
+		u16 udp_csum;
+
+		iph->daddr = cm->xlate_dest_ip[0];
+		udph->dest = cm->xlate_dest_port;
+
+		/*
+		 * Do we have a non-zero UDP checksum? If we do then we need
+		 * to update it.
+		 */
+		udp_csum = udph->check;
+		if (likely(udp_csum)) {
+			u32 sum = udp_csum + cm->xlate_dest_csum_adjustment;
+			sum = (sum & 0xffff) + (sum >> 16);
+			udph->check = (u16)sum;
+		}
+	}
+
+	/*
+	 * Update traffic stats.
+	 */
+	cm->rx_packet_count++;
+	cm->rx_byte_count += len;
+
+	/*
+	 * If we're not already on the active list then insert ourselves at the tail
+	 * of the current list.
+	 */
+	if (unlikely(!cm->active)) {
+		cm->active = true;
+		cm->active_prev = si->active_tail;
+		if (likely(si->active_tail)) {
+			si->active_tail->active_next = cm;
+		} else {
+			si->active_head = cm;
+		}
+		si->active_tail = cm;
+	}
+
+	xmit_dev = cm->xmit_dev;
+	skb->dev = xmit_dev;
+
+	/*
+	 * Check to see if we need to write a header.
+	 */
+	if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) {
+		if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) {
+			dev_hard_header(skb, xmit_dev, ETH_P_IPV6,
+					cm->xmit_dest_mac, cm->xmit_src_mac, len);
+		} else {
+			/*
+			 * For the simple case we write this really fast.
+			 */
+			struct sfe_ipv6_eth_hdr *eth = (struct sfe_ipv6_eth_hdr *)__skb_push(skb, ETH_HLEN);
+			eth->h_proto = htons(ETH_P_IPV6);
+			eth->h_dest[0] = cm->xmit_dest_mac[0];
+			eth->h_dest[1] = cm->xmit_dest_mac[1];
+			eth->h_dest[2] = cm->xmit_dest_mac[2];
+			eth->h_source[0] = cm->xmit_src_mac[0];
+			eth->h_source[1] = cm->xmit_src_mac[1];
+			eth->h_source[2] = cm->xmit_src_mac[2];
+		}
+	}
+
+	/*
+	 * Update priority of skb.
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) {
+		skb->priority = cm->priority;
+	}
+
+	/*
+	 * Mark outgoing packet.
+	 */
+	skb->mark = cm->connection->mark;
+	if (skb->mark) {
+		DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark);
+	}
+
+	si->packets_forwarded++;
+	spin_unlock_bh(&si->lock);
+
+	/*
+	 * We're going to check for GSO flags when we transmit the packet so
+	 * start fetching the necessary cache line now.
+	 */
+	prefetch(skb_shinfo(skb));
+
+	/*
+	 * Mark that this packet has been fast forwarded.
+	 */
+	skb->fast_forwarded = 1;
+
+	/*
+	 * Send the packet on its way.
+	 */
+	dev_queue_xmit(skb);
+
+	return 1;
+}
+
+/*
+ * sfe_ipv6_process_tcp_option_sack()
+ * Parse TCP SACK option and update ack accordingly.
+ */
+static bool sfe_ipv6_process_tcp_option_sack(const struct sfe_ipv6_tcp_hdr *th, const u32 data_offs,
+					     u32 *ack)
+{
+	u32 length = sizeof(struct sfe_ipv6_tcp_hdr);
+	u8 *ptr = (u8 *)th + length;
+
+	/*
+	 * Skip processing if the TCP packet has only the TIMESTAMP option.
+	 */
+	if (likely(data_offs == length + TCPOLEN_TIMESTAMP + 1 + 1)
+	    && likely(ptr[0] == TCPOPT_NOP)
+	    && likely(ptr[1] == TCPOPT_NOP)
+	    && likely(ptr[2] == TCPOPT_TIMESTAMP)
+	    && likely(ptr[3] == TCPOLEN_TIMESTAMP)) {
+		return true;
+	}
+
+	/*
+	 * TCP options. Parse SACK option.
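+	 *
+	 * A SACK option is kind (5), length (2 + 8n), then n blocks of
+	 * <left edge><right edge>, so a one-block option is 10 bytes:
+	 *
+	 *	05 0a <4-byte left edge> <4-byte right edge>
+	 *
+	 * Only the largest right edge matters here, so the loop below reads
+	 * 4 bytes starting at offset 6 within the option and then steps
+	 * TCPOLEN_SACK_PERBLOCK (8) bytes at a time.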
+	 */
+	while (length < data_offs) {
+		u8 size;
+		u8 kind;
+
+		ptr = (u8 *)th + length;
+		kind = *ptr;
+
+		/*
+		 * NOP, for padding.
+		 * Handled outside the switch so we can skip it quickly,
+		 * without reading a size byte.
+		 */
+		if (kind == TCPOPT_NOP) {
+			length++;
+			continue;
+		}
+
+		if (kind == TCPOPT_SACK) {
+			u32 sack = 0;
+			u8 re = 1 + 1;
+
+			size = *(ptr + 1);
+			if ((size < (1 + 1 + TCPOLEN_SACK_PERBLOCK))
+			    || ((size - (1 + 1)) % (TCPOLEN_SACK_PERBLOCK))
+			    || (size > (data_offs - length))) {
+				return false;
+			}
+
+			re += 4;
+			while (re < size) {
+				u32 sack_re;
+				u8 *sptr = ptr + re;
+				sack_re = (sptr[0] << 24) | (sptr[1] << 16) | (sptr[2] << 8) | sptr[3];
+				if (sack_re > sack) {
+					sack = sack_re;
+				}
+				re += TCPOLEN_SACK_PERBLOCK;
+			}
+			if (sack > *ack) {
+				*ack = sack;
+			}
+			length += size;
+			continue;
+		}
+		if (kind == TCPOPT_EOL) {
+			return true;
+		}
+		size = *(ptr + 1);
+		if (size < 2) {
+			return false;
+		}
+		length += size;
+	}
+
+	return true;
+}
+
+/*
+ * sfe_ipv6_recv_tcp()
+ * Handle TCP packet receives and forwarding.
+ */
+static int sfe_ipv6_recv_tcp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev,
+			     unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl, bool flush_on_find)
+{
+	struct sfe_ipv6_tcp_hdr *tcph;
+	struct sfe_ipv6_addr *src_ip;
+	struct sfe_ipv6_addr *dest_ip;
+	__be16 src_port;
+	__be16 dest_port;
+	struct sfe_ipv6_connection_match *cm;
+	struct sfe_ipv6_connection_match *counter_cm;
+	u32 flags;
+	struct net_device *xmit_dev;
+
+	/*
+	 * Is our packet too short to contain a valid TCP header?
+	 */
+	if (!pskb_may_pull(skb, (sizeof(struct sfe_ipv6_tcp_hdr) + ihl))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_HEADER_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("packet too short for TCP header\n");
+		return 0;
+	}
+
+	/*
+	 * Read the IP address and port information. Read the IP header data first
+	 * because we've almost certainly got that in the cache. We may not yet have
+	 * the TCP header cached though so allow more time for any prefetching.
+	 */
+	src_ip = &iph->saddr;
+	dest_ip = &iph->daddr;
+
+	tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl);
+	src_port = tcph->source;
+	dest_port = tcph->dest;
+	flags = tcp_flag_word(tcph);
+
+	spin_lock_bh(&si->lock);
+
+	/*
+	 * Look for a connection match.
+	 */
+	cm = sfe_ipv6_find_connection_match(si, dev, IPPROTO_TCP, src_ip, src_port, dest_ip, dest_port);
+	if (unlikely(!cm)) {
+		/*
+		 * We didn't get a connection but as TCP is connection-oriented that
+		 * may be because this is a non-fast connection (not yet in the
+		 * established state). For diagnostic purposes we differentiate this here.
+		 */
+		if (likely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) == TCP_FLAG_ACK)) {
+			si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_FAST_FLAGS]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+
+			DEBUG_TRACE("no connection found - fast flags\n");
+			return 0;
+		}
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NO_CONNECTION_SLOW_FLAGS]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("no connection found - slow flags: 0x%x\n",
+			    flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK));
+		return 0;
+	}
+
+	/*
+	 * If our packet has been marked as "flush on find" we can't actually
+	 * forward it in the fast path, but now that we've found an associated
+	 * connection we can flush that out before we process the packet.
+ */ + if (unlikely(flush_on_find)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_IP_OPTIONS_OR_INITIAL_FRAGMENT]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("flush on find\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + +#ifdef CONFIG_XFRM + /* + * We can't accelerate the flow on this direction, just let it go + * through the slow path. + */ + if (unlikely(!cm->flow_accel)) { + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + return 0; + } +#endif + + /* + * Does our hop_limit allow forwarding? + */ + if (unlikely(iph->hop_limit < 2)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_TTL]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("hop_limit too low\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * If our packet is larger than the MTU of the transmit interface then + * we can't forward it easily. + */ + if (unlikely((len > cm->xmit_dev_mtu) && !skb_is_gso(skb))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_NEEDS_FRAGMENTATION]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("larger than mtu\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Look at our TCP flags. Anything missing an ACK or that has RST, SYN or FIN + * set is not a fast path packet. + */ + if (unlikely((flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)) != TCP_FLAG_ACK)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_FLAGS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP flags: 0x%x are not fast\n", + flags & (TCP_FLAG_SYN | TCP_FLAG_RST | TCP_FLAG_FIN | TCP_FLAG_ACK)); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + counter_cm = cm->counter_match; + + /* + * Are we doing sequence number checking? + */ + if (likely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK))) { + u32 seq; + u32 ack; + u32 sack; + u32 data_offs; + u32 end; + u32 left_edge; + u32 scaled_win; + u32 max_end; + + /* + * Is our sequence fully past the right hand edge of the window? + */ + seq = ntohl(tcph->seq); + if (unlikely((s32)(seq - (cm->protocol_state.tcp.max_end + 1)) > 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u exceeds right edge: %u\n", + seq, cm->protocol_state.tcp.max_end + 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't too short. 
+ */ + data_offs = tcph->doff << 2; + if (unlikely(data_offs < sizeof(struct sfe_ipv6_tcp_hdr))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SMALL_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, too small\n", data_offs); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Update ACK according to any SACK option. + */ + ack = ntohl(tcph->ack_seq); + sack = ack; + if (unlikely(!sfe_ipv6_process_tcp_option_sack(tcph, data_offs, &sack))) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_BAD_SACK]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP option SACK size is wrong\n"); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Check that our TCP data offset isn't past the end of the packet. + */ + data_offs += sizeof(struct sfe_ipv6_ip_hdr); + if (unlikely(len < data_offs)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_BIG_DATA_OFFS]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("TCP data offset: %u, past end of packet: %u\n", + data_offs, len); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + end = seq + len - data_offs; + + /* + * Is our sequence fully before the left hand edge of the window? + */ + if (unlikely((s32)(end - (cm->protocol_state.tcp.end + - counter_cm->protocol_state.tcp.max_win - 1)) < 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_SEQ_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("seq: %u before left edge: %u\n", + end, cm->protocol_state.tcp.end - counter_cm->protocol_state.tcp.max_win - 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Are we acking data that is to the right of what has been sent? + */ + if (unlikely((s32)(sack - (counter_cm->protocol_state.tcp.end + 1)) > 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_EXCEEDS_RIGHT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u exceeds right edge: %u\n", + sack, counter_cm->protocol_state.tcp.end + 1); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Is our ack too far before the left hand edge of the window? + */ + left_edge = counter_cm->protocol_state.tcp.end + - cm->protocol_state.tcp.max_win + - SFE_IPV6_TCP_MAX_ACK_WINDOW + - 1; + if (unlikely((s32)(sack - left_edge) < 0)) { + struct sfe_ipv6_connection *c = cm->connection; + sfe_ipv6_remove_connection(si, c); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_TCP_ACK_BEFORE_LEFT_EDGE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("ack: %u before left edge: %u\n", sack, left_edge); + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH); + return 0; + } + + /* + * Have we just seen the largest window size yet for this connection? If yes + * then we need to record the new value. 
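+		 *
+		 * The advertised window is scaled by the negotiated shift and
+		 * then credited with any data already selectively acked past
+		 * the cumulative ack, e.g. a raw window of 0x1000 with
+		 * win_scale 7 gives 0x80000 before the (sack - ack) credit.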
+		 */
+		scaled_win = ntohs(tcph->window) << cm->protocol_state.tcp.win_scale;
+		scaled_win += (sack - ack);
+		if (unlikely(cm->protocol_state.tcp.max_win < scaled_win)) {
+			cm->protocol_state.tcp.max_win = scaled_win;
+		}
+
+		/*
+		 * If our sequence and/or ack numbers have advanced then record the new state.
+		 */
+		if (likely((s32)(end - cm->protocol_state.tcp.end) >= 0)) {
+			cm->protocol_state.tcp.end = end;
+		}
+
+		max_end = sack + scaled_win;
+		if (likely((s32)(max_end - counter_cm->protocol_state.tcp.max_end) >= 0)) {
+			counter_cm->protocol_state.tcp.max_end = max_end;
+		}
+	}
+
+	/*
+	 * From this point on we're good to modify the packet.
+	 */
+
+	/*
+	 * Update DSCP
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK)) {
+		sfe_ipv6_change_dsfield(iph, cm->dscp);
+	}
+
+	/*
+	 * Decrement our hop_limit.
+	 */
+	iph->hop_limit -= 1;
+
+	/*
+	 * Do we have to perform translations of the source address/port?
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC)) {
+		u16 tcp_csum;
+		u32 sum;
+
+		iph->saddr = cm->xlate_src_ip[0];
+		tcph->source = cm->xlate_src_port;
+
+		/*
+		 * Update the TCP checksum. Unlike UDP there is no "no checksum"
+		 * case, so we always apply the precomputed adjustment.
+		 */
+		tcp_csum = tcph->check;
+		sum = tcp_csum + cm->xlate_src_csum_adjustment;
+		sum = (sum & 0xffff) + (sum >> 16);
+		tcph->check = (u16)sum;
+	}
+
+	/*
+	 * Do we have to perform translations of the destination address/port?
+	 */
+	if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST)) {
+		u16 tcp_csum;
+		u32 sum;
+
+		iph->daddr = cm->xlate_dest_ip[0];
+		tcph->dest = cm->xlate_dest_port;
+
+		/*
+		 * Update the TCP checksum. Unlike UDP there is no "no checksum"
+		 * case, so we always apply the precomputed adjustment.
+		 */
+		tcp_csum = tcph->check;
+		sum = tcp_csum + cm->xlate_dest_csum_adjustment;
+		sum = (sum & 0xffff) + (sum >> 16);
+		tcph->check = (u16)sum;
+	}
+
+	/*
+	 * Update traffic stats.
+	 */
+	cm->rx_packet_count++;
+	cm->rx_byte_count += len;
+
+	/*
+	 * If we're not already on the active list then insert ourselves at the tail
+	 * of the current list.
+	 */
+	if (unlikely(!cm->active)) {
+		cm->active = true;
+		cm->active_prev = si->active_tail;
+		if (likely(si->active_tail)) {
+			si->active_tail->active_next = cm;
+		} else {
+			si->active_head = cm;
+		}
+		si->active_tail = cm;
+	}
+
+	xmit_dev = cm->xmit_dev;
+	skb->dev = xmit_dev;
+
+	/*
+	 * Check to see if we need to write a header.
+	 */
+	if (likely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR)) {
+		if (unlikely(!(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR))) {
+			dev_hard_header(skb, xmit_dev, ETH_P_IPV6,
+					cm->xmit_dest_mac, cm->xmit_src_mac, len);
+		} else {
+			/*
+			 * For the simple case we write this really fast.
+			 */
+			struct sfe_ipv6_eth_hdr *eth = (struct sfe_ipv6_eth_hdr *)__skb_push(skb, ETH_HLEN);
+			eth->h_proto = htons(ETH_P_IPV6);
+			eth->h_dest[0] = cm->xmit_dest_mac[0];
+			eth->h_dest[1] = cm->xmit_dest_mac[1];
+			eth->h_dest[2] = cm->xmit_dest_mac[2];
+			eth->h_source[0] = cm->xmit_src_mac[0];
+			eth->h_source[1] = cm->xmit_src_mac[1];
+			eth->h_source[2] = cm->xmit_src_mac[2];
+		}
+	}
+
+	/*
+	 * Update priority of skb.
+ */ + if (unlikely(cm->flags & SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK)) { + skb->priority = cm->priority; + } + + /* + * Mark outgoing packet + */ + skb->mark = cm->connection->mark; + if (skb->mark) { + DEBUG_TRACE("SKB MARK is NON ZERO %x\n", skb->mark); + } + + si->packets_forwarded++; + spin_unlock_bh(&si->lock); + + /* + * We're going to check for GSO flags when we transmit the packet so + * start fetching the necessary cache line now. + */ + prefetch(skb_shinfo(skb)); + + /* + * Mark that this packet has been fast forwarded. + */ + skb->fast_forwarded = 1; + + /* + * Send the packet on its way. + */ + dev_queue_xmit(skb); + + return 1; +} + +/* + * sfe_ipv6_recv_icmp() + * Handle ICMP packet receives. + * + * ICMP packets aren't handled as a "fast path" and always have us process them + * through the default Linux stack. What we do need to do is look for any errors + * about connections we are handling in the fast path. If we find any such + * connections then we want to flush their state so that the ICMP error path + * within Linux has all of the correct state should it need it. + */ +static int sfe_ipv6_recv_icmp(struct sfe_ipv6 *si, struct sk_buff *skb, struct net_device *dev, + unsigned int len, struct sfe_ipv6_ip_hdr *iph, unsigned int ihl) +{ + struct icmp6hdr *icmph; + struct sfe_ipv6_ip_hdr *icmp_iph; + struct sfe_ipv6_udp_hdr *icmp_udph; + struct sfe_ipv6_tcp_hdr *icmp_tcph; + struct sfe_ipv6_addr *src_ip; + struct sfe_ipv6_addr *dest_ip; + __be16 src_port; + __be16 dest_port; + struct sfe_ipv6_connection_match *cm; + struct sfe_ipv6_connection *c; + u8 next_hdr; + + /* + * Is our packet too short to contain a valid ICMP header? + */ + len -= ihl; + if (!pskb_may_pull(skb, ihl + sizeof(struct icmp6hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("packet too short for ICMP header\n"); + return 0; + } + + /* + * We only handle "destination unreachable" and "time exceeded" messages. + */ + icmph = (struct icmp6hdr *)(skb->data + ihl); + if ((icmph->icmp6_type != ICMPV6_DEST_UNREACH) + && (icmph->icmp6_type != ICMPV6_TIME_EXCEED)) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_UNHANDLED_TYPE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("unhandled ICMP type: 0x%x\n", icmph->icmp6_type); + return 0; + } + + /* + * Do we have the full embedded IP header? + * We should have 8 bytes of next L4 header - that's enough to identify + * the connection. + */ + len -= sizeof(struct icmp6hdr); + ihl += sizeof(struct icmp6hdr); + if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ip_hdr) + sizeof(struct sfe_ipv6_ext_hdr))) { + spin_lock_bh(&si->lock); + si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_HEADER_INCOMPLETE]++; + si->packets_not_forwarded++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("Embedded IP header not complete\n"); + return 0; + } + + /* + * Is our embedded IP version wrong? 
+	 */
+	icmp_iph = (struct sfe_ipv6_ip_hdr *)(icmph + 1);
+	if (unlikely(icmp_iph->version != 6)) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_NON_V6]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("IP version: %u\n", icmp_iph->version);
+		return 0;
+	}
+
+	len -= sizeof(struct sfe_ipv6_ip_hdr);
+	ihl += sizeof(struct sfe_ipv6_ip_hdr);
+	next_hdr = icmp_iph->nexthdr;
+	while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) {
+		struct sfe_ipv6_ext_hdr *ext_hdr;
+		unsigned int ext_hdr_len;
+
+		ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl);
+		if (next_hdr == SFE_IPV6_EXT_HDR_FRAG) {
+			struct sfe_ipv6_frag_hdr *frag_hdr = (struct sfe_ipv6_frag_hdr *)ext_hdr;
+			unsigned int frag_off = ntohs(frag_hdr->frag_off);
+
+			if (frag_off & SFE_IPV6_FRAG_OFFSET) {
+				spin_lock_bh(&si->lock);
+				si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++;
+				si->packets_not_forwarded++;
+				spin_unlock_bh(&si->lock);
+
+				DEBUG_TRACE("non-initial fragment\n");
+				return 0;
+			}
+		}
+
+		ext_hdr_len = ext_hdr->hdr_len;
+		ext_hdr_len <<= 3;
+		ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr);
+		len -= ext_hdr_len;
+		ihl += ext_hdr_len;
+		/*
+		 * We should have 8 bytes of next header - that's enough to identify
+		 * the connection.
+		 */
+		if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) {
+			spin_lock_bh(&si->lock);
+			si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+
+			DEBUG_TRACE("extension header %d not completed\n", next_hdr);
+			return 0;
+		}
+
+		next_hdr = ext_hdr->next_hdr;
+	}
+
+	/*
+	 * Handle the embedded transport layer header.
+	 */
+	switch (next_hdr) {
+	case IPPROTO_UDP:
+		icmp_udph = (struct sfe_ipv6_udp_hdr *)(skb->data + ihl);
+		src_port = icmp_udph->source;
+		dest_port = icmp_udph->dest;
+		break;
+
+	case IPPROTO_TCP:
+		icmp_tcph = (struct sfe_ipv6_tcp_hdr *)(skb->data + ihl);
+		src_port = icmp_tcph->source;
+		dest_port = icmp_tcph->dest;
+		break;
+
+	default:
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_IPV6_UNHANDLED_PROTOCOL]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("Unhandled embedded IP protocol: %u\n", next_hdr);
+		return 0;
+	}
+
+	src_ip = &icmp_iph->saddr;
+	dest_ip = &icmp_iph->daddr;
+
+	spin_lock_bh(&si->lock);
+
+	/*
+	 * Look for a connection match. Note that we reverse the source and destination
+	 * here because our embedded message contains a packet that was sent in the
+	 * opposite direction to the one in which we just received it. It will have
+	 * been sent on the interface from which we received it though so that's still
+	 * ok to use.
+	 */
+	cm = sfe_ipv6_find_connection_match(si, dev, icmp_iph->nexthdr, dest_ip, dest_port, src_ip, src_port);
+	if (unlikely(!cm)) {
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_NO_CONNECTION]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("no connection found\n");
+		return 0;
+	}
+
+	/*
+	 * We found a connection so now remove it from the connection list and flush
+	 * its state.
+	 */
+	c = cm->connection;
+	sfe_ipv6_remove_connection(si, c);
+	si->exception_events[SFE_IPV6_EXCEPTION_EVENT_ICMP_FLUSHED_CONNECTION]++;
+	si->packets_not_forwarded++;
+	spin_unlock_bh(&si->lock);
+
+	sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_FLUSH);
+	return 0;
+}
+
+/*
+ * sfe_ipv6_recv()
+ * Handle packet receives and forwarding.
+ *
+ * Returns 1 if the packet is forwarded or 0 if it isn't.
+ */
+int sfe_ipv6_recv(struct net_device *dev, struct sk_buff *skb)
+{
+	struct sfe_ipv6 *si = &__si6;
+	unsigned int len;
+	unsigned int payload_len;
+	unsigned int ihl = sizeof(struct sfe_ipv6_ip_hdr);
+	bool flush_on_find = false;
+	struct sfe_ipv6_ip_hdr *iph;
+	u8 next_hdr;
+
+	/*
+	 * Check that we have space for an IP header and an upper-layer header here.
+	 */
+	len = skb->len;
+	if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("len: %u is too short\n", len);
+		return 0;
+	}
+
+	/*
+	 * Is our IP version wrong?
+	 */
+	iph = (struct sfe_ipv6_ip_hdr *)skb->data;
+	if (unlikely(iph->version != 6)) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_V6]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("IP version: %u\n", iph->version);
+		return 0;
+	}
+
+	/*
+	 * Does our datagram fit inside the skb?
+	 */
+	payload_len = ntohs(iph->payload_len);
+	if (unlikely(payload_len > (len - ihl))) {
+		spin_lock_bh(&si->lock);
+		si->exception_events[SFE_IPV6_EXCEPTION_EVENT_DATAGRAM_INCOMPLETE]++;
+		si->packets_not_forwarded++;
+		spin_unlock_bh(&si->lock);
+
+		DEBUG_TRACE("payload_len: %u, exceeds len: %u\n", payload_len, (len - sizeof(struct sfe_ipv6_ip_hdr)));
+		return 0;
+	}
+
+	next_hdr = iph->nexthdr;
+	while (unlikely(sfe_ipv6_is_ext_hdr(next_hdr))) {
+		struct sfe_ipv6_ext_hdr *ext_hdr;
+		unsigned int ext_hdr_len;
+
+		ext_hdr = (struct sfe_ipv6_ext_hdr *)(skb->data + ihl);
+		if (next_hdr == SFE_IPV6_EXT_HDR_FRAG) {
+			struct sfe_ipv6_frag_hdr *frag_hdr = (struct sfe_ipv6_frag_hdr *)ext_hdr;
+			unsigned int frag_off = ntohs(frag_hdr->frag_off);
+
+			if (frag_off & SFE_IPV6_FRAG_OFFSET) {
+				spin_lock_bh(&si->lock);
+				si->exception_events[SFE_IPV6_EXCEPTION_EVENT_NON_INITIAL_FRAGMENT]++;
+				si->packets_not_forwarded++;
+				spin_unlock_bh(&si->lock);
+
+				DEBUG_TRACE("non-initial fragment\n");
+				return 0;
+			}
+		}
+
+		ext_hdr_len = ext_hdr->hdr_len;
+		ext_hdr_len <<= 3;
+		ext_hdr_len += sizeof(struct sfe_ipv6_ext_hdr);
+		ihl += ext_hdr_len;
+		if (!pskb_may_pull(skb, ihl + sizeof(struct sfe_ipv6_ext_hdr))) {
+			spin_lock_bh(&si->lock);
+			si->exception_events[SFE_IPV6_EXCEPTION_EVENT_HEADER_INCOMPLETE]++;
+			si->packets_not_forwarded++;
+			spin_unlock_bh(&si->lock);
+
+			DEBUG_TRACE("extension header %d not completed\n", next_hdr);
+			return 0;
+		}
+
+		flush_on_find = true;
+		next_hdr = ext_hdr->next_hdr;
+	}
+
+	if (IPPROTO_UDP == next_hdr) {
+		return sfe_ipv6_recv_udp(si, skb, dev, len, iph, ihl, flush_on_find);
+	}
+
+	if (IPPROTO_TCP == next_hdr) {
+		return sfe_ipv6_recv_tcp(si, skb, dev, len, iph, ihl, flush_on_find);
+	}
+
+	if (IPPROTO_ICMPV6 == next_hdr) {
+		return sfe_ipv6_recv_icmp(si, skb, dev, len, iph, ihl);
+	}
+
+	spin_lock_bh(&si->lock);
+	si->exception_events[SFE_IPV6_EXCEPTION_EVENT_UNHANDLED_PROTOCOL]++;
+	si->packets_not_forwarded++;
+	spin_unlock_bh(&si->lock);
+
+	DEBUG_TRACE("not UDP, TCP or ICMP: %u\n", next_hdr);
+	return 0;
+}
+
+/*
+ * sfe_ipv6_update_tcp_state()
+ * Update TCP window variables.
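+ *
+ * Comparisons below are made on signed 32-bit deltas so that sequence
+ * number wraparound is handled correctly, e.g. (s32)(0x00000010 -
+ * 0xfffffff0) is positive even though the raw values compare the other
+ * way.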
+ */
+static void
+sfe_ipv6_update_tcp_state(struct sfe_ipv6_connection *c,
+			  struct sfe_connection_create *sic)
+{
+	struct sfe_ipv6_connection_match *orig_cm;
+	struct sfe_ipv6_connection_match *repl_cm;
+	struct sfe_ipv6_tcp_connection_match *orig_tcp;
+	struct sfe_ipv6_tcp_connection_match *repl_tcp;
+
+	orig_cm = c->original_match;
+	repl_cm = c->reply_match;
+	orig_tcp = &orig_cm->protocol_state.tcp;
+	repl_tcp = &repl_cm->protocol_state.tcp;
+
+	/* update orig */
+	if (orig_tcp->max_win < sic->src_td_max_window) {
+		orig_tcp->max_win = sic->src_td_max_window;
+	}
+	if ((s32)(orig_tcp->end - sic->src_td_end) < 0) {
+		orig_tcp->end = sic->src_td_end;
+	}
+	if ((s32)(orig_tcp->max_end - sic->src_td_max_end) < 0) {
+		orig_tcp->max_end = sic->src_td_max_end;
+	}
+
+	/* update reply */
+	if (repl_tcp->max_win < sic->dest_td_max_window) {
+		repl_tcp->max_win = sic->dest_td_max_window;
+	}
+	if ((s32)(repl_tcp->end - sic->dest_td_end) < 0) {
+		repl_tcp->end = sic->dest_td_end;
+	}
+	if ((s32)(repl_tcp->max_end - sic->dest_td_max_end) < 0) {
+		repl_tcp->max_end = sic->dest_td_max_end;
+	}
+
+	/* update match flags */
+	orig_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
+	repl_cm->flags &= ~SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
+	if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) {
+		orig_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
+		repl_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK;
+	}
+}
+
+/*
+ * sfe_ipv6_update_protocol_state()
+ * Update the protocol-specific state machine.
+ */
+static void
+sfe_ipv6_update_protocol_state(struct sfe_ipv6_connection *c,
+			       struct sfe_connection_create *sic)
+{
+	switch (sic->protocol) {
+	case IPPROTO_TCP:
+		sfe_ipv6_update_tcp_state(c, sic);
+		break;
+	}
+}
+
+/*
+ * sfe_ipv6_update_rule()
+ * Update a forwarding rule after it has been created.
+ */
+void sfe_ipv6_update_rule(struct sfe_connection_create *sic)
+{
+	struct sfe_ipv6_connection *c;
+	struct sfe_ipv6 *si = &__si6;
+
+	spin_lock_bh(&si->lock);
+
+	c = sfe_ipv6_find_connection(si,
+				     sic->protocol,
+				     sic->src_ip.ip6,
+				     sic->src_port,
+				     sic->dest_ip.ip6,
+				     sic->dest_port);
+	if (c != NULL) {
+		sfe_ipv6_update_protocol_state(c, sic);
+	}
+
+	spin_unlock_bh(&si->lock);
+}
+
+/*
+ * sfe_ipv6_create_rule()
+ * Create a forwarding rule.
+ */
+int sfe_ipv6_create_rule(struct sfe_connection_create *sic)
+{
+	struct sfe_ipv6 *si = &__si6;
+	struct sfe_ipv6_connection *c;
+	struct sfe_ipv6_connection_match *original_cm;
+	struct sfe_ipv6_connection_match *reply_cm;
+	struct net_device *dest_dev;
+	struct net_device *src_dev;
+
+	dest_dev = sic->dest_dev;
+	src_dev = sic->src_dev;
+
+	if (unlikely((dest_dev->reg_state != NETREG_REGISTERED) ||
+		     (src_dev->reg_state != NETREG_REGISTERED))) {
+		return -EINVAL;
+	}
+
+	spin_lock_bh(&si->lock);
+	si->connection_create_requests++;
+
+	/*
+	 * Check to see if there is already a flow that matches the rule we're
+	 * trying to create. If there is then we can't create a new one.
+	 */
+	c = sfe_ipv6_find_connection(si,
+				     sic->protocol,
+				     sic->src_ip.ip6,
+				     sic->src_port,
+				     sic->dest_ip.ip6,
+				     sic->dest_port);
+	if (c != NULL) {
+		si->connection_create_collisions++;
+
+		/*
+		 * If we already have the flow then it's likely that this
+		 * request to create the connection rule contains more
+		 * up-to-date information. Check and update accordingly.
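+		 * We then return -EADDRINUSE so that a connection manager can
+		 * treat the collision as benign rather than as a hard failure.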
+ */ + sfe_ipv6_update_protocol_state(c, sic); + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection already exists - mark: %08x, p: %d\n" + " s: %s:%pM:%pI6:%u, d: %s:%pM:%pI6:%u\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_ip.ip6, ntohs(sic->src_port), + sic->dest_dev->name, sic->dest_mac, sic->dest_ip.ip6, ntohs(sic->dest_port)); + return -EADDRINUSE; + } + + /* + * Allocate the various connection tracking objects. + */ + c = (struct sfe_ipv6_connection *)kmalloc(sizeof(struct sfe_ipv6_connection), GFP_ATOMIC); + if (unlikely(!c)) { + spin_unlock_bh(&si->lock); + return -ENOMEM; + } + + original_cm = (struct sfe_ipv6_connection_match *)kmalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); + if (unlikely(!original_cm)) { + spin_unlock_bh(&si->lock); + kfree(c); + return -ENOMEM; + } + + reply_cm = (struct sfe_ipv6_connection_match *)kmalloc(sizeof(struct sfe_ipv6_connection_match), GFP_ATOMIC); + if (unlikely(!reply_cm)) { + spin_unlock_bh(&si->lock); + kfree(original_cm); + kfree(c); + return -ENOMEM; + } + + /* + * Fill in the "original" direction connection matching object. + * Note that the transmit MAC address is "dest_mac_xlate" because + * we always know both ends of a connection by their translated + * addresses and not their public addresses. + */ + original_cm->match_dev = src_dev; + original_cm->match_protocol = sic->protocol; + original_cm->match_src_ip[0] = sic->src_ip.ip6[0]; + original_cm->match_src_port = sic->src_port; + original_cm->match_dest_ip[0] = sic->dest_ip.ip6[0]; + original_cm->match_dest_port = sic->dest_port; + original_cm->xlate_src_ip[0] = sic->src_ip_xlate.ip6[0]; + original_cm->xlate_src_port = sic->src_port_xlate; + original_cm->xlate_dest_ip[0] = sic->dest_ip_xlate.ip6[0]; + original_cm->xlate_dest_port = sic->dest_port_xlate; + original_cm->rx_packet_count = 0; + original_cm->rx_packet_count64 = 0; + original_cm->rx_byte_count = 0; + original_cm->rx_byte_count64 = 0; + original_cm->xmit_dev = dest_dev; + original_cm->xmit_dev_mtu = sic->dest_mtu; + memcpy(original_cm->xmit_src_mac, dest_dev->dev_addr, ETH_ALEN); + memcpy(original_cm->xmit_dest_mac, sic->dest_mac_xlate, ETH_ALEN); + original_cm->connection = c; + original_cm->counter_match = reply_cm; + original_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + original_cm->priority = sic->src_priority; + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + original_cm->dscp = sic->src_dscp << SFE_IPV6_DSCP_SHIFT; + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_XFRM + original_cm->flow_accel = sic->original_accel; +#endif + original_cm->active_next = NULL; + original_cm->active_prev = NULL; + original_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(dest_dev->flags & IFF_POINTOPOINT)) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (dest_dev->header_ops) { + if (dest_dev->header_ops->create == eth_header) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + /* + * Fill in the "reply" direction connection matching object. 
+ */ + reply_cm->match_dev = dest_dev; + reply_cm->match_protocol = sic->protocol; + reply_cm->match_src_ip[0] = sic->dest_ip_xlate.ip6[0]; + reply_cm->match_src_port = sic->dest_port_xlate; + reply_cm->match_dest_ip[0] = sic->src_ip_xlate.ip6[0]; + reply_cm->match_dest_port = sic->src_port_xlate; + reply_cm->xlate_src_ip[0] = sic->dest_ip.ip6[0]; + reply_cm->xlate_src_port = sic->dest_port; + reply_cm->xlate_dest_ip[0] = sic->src_ip.ip6[0]; + reply_cm->xlate_dest_port = sic->src_port; + reply_cm->rx_packet_count = 0; + reply_cm->rx_packet_count64 = 0; + reply_cm->rx_byte_count = 0; + reply_cm->rx_byte_count64 = 0; + reply_cm->xmit_dev = src_dev; + reply_cm->xmit_dev_mtu = sic->src_mtu; + memcpy(reply_cm->xmit_src_mac, src_dev->dev_addr, ETH_ALEN); + memcpy(reply_cm->xmit_dest_mac, sic->src_mac, ETH_ALEN); + reply_cm->connection = c; + reply_cm->counter_match = original_cm; + reply_cm->flags = 0; + if (sic->flags & SFE_CREATE_FLAG_REMARK_PRIORITY) { + reply_cm->priority = sic->dest_priority; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_PRIORITY_REMARK; + } + if (sic->flags & SFE_CREATE_FLAG_REMARK_DSCP) { + reply_cm->dscp = sic->dest_dscp << SFE_IPV6_DSCP_SHIFT; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_DSCP_REMARK; + } +#ifdef CONFIG_XFRM + reply_cm->flow_accel = sic->reply_accel; +#endif + reply_cm->active_next = NULL; + reply_cm->active_prev = NULL; + reply_cm->active = false; + + /* + * For PPP links we don't write an L2 header. For everything else we do. + */ + if (!(src_dev->flags & IFF_POINTOPOINT)) { + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_L2_HDR; + + /* + * If our dev writes Ethernet headers then we can write a really fast + * version. + */ + if (src_dev->header_ops) { + if (src_dev->header_ops->create == eth_header) { + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_WRITE_FAST_ETH_HDR; + } + } + } + + + if (!sfe_ipv6_addr_equal(sic->dest_ip.ip6, sic->dest_ip_xlate.ip6) || sic->dest_port != sic->dest_port_xlate) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC; + } + + if (!sfe_ipv6_addr_equal(sic->src_ip.ip6, sic->src_ip_xlate.ip6) || sic->src_port != sic->src_port_xlate) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_SRC; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_XLATE_DEST; + } + + c->protocol = sic->protocol; + c->src_ip[0] = sic->src_ip.ip6[0]; + c->src_ip_xlate[0] = sic->src_ip_xlate.ip6[0]; + c->src_port = sic->src_port; + c->src_port_xlate = sic->src_port_xlate; + c->original_dev = src_dev; + c->original_match = original_cm; + c->dest_ip[0] = sic->dest_ip.ip6[0]; + c->dest_ip_xlate[0] = sic->dest_ip_xlate.ip6[0]; + c->dest_port = sic->dest_port; + c->dest_port_xlate = sic->dest_port_xlate; + c->reply_dev = dest_dev; + c->reply_match = reply_cm; + c->mark = sic->mark; + c->debug_read_seq = 0; + c->last_sync_jiffies = get_jiffies_64(); + + /* + * Take hold of our source and dest devices for the duration of the connection. + */ + dev_hold(c->original_dev); + dev_hold(c->reply_dev); + + /* + * Initialize the protocol-specific information that we track. + */ + switch (sic->protocol) { + case IPPROTO_TCP: + original_cm->protocol_state.tcp.win_scale = sic->src_td_window_scale; + original_cm->protocol_state.tcp.max_win = sic->src_td_max_window ? 
sic->src_td_max_window : 1; + original_cm->protocol_state.tcp.end = sic->src_td_end; + original_cm->protocol_state.tcp.max_end = sic->src_td_max_end; + reply_cm->protocol_state.tcp.win_scale = sic->dest_td_window_scale; + reply_cm->protocol_state.tcp.max_win = sic->dest_td_max_window ? sic->dest_td_max_window : 1; + reply_cm->protocol_state.tcp.end = sic->dest_td_end; + reply_cm->protocol_state.tcp.max_end = sic->dest_td_max_end; + if (sic->flags & SFE_CREATE_FLAG_NO_SEQ_CHECK) { + original_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + reply_cm->flags |= SFE_IPV6_CONNECTION_MATCH_FLAG_NO_SEQ_CHECK; + } + break; + } + + sfe_ipv6_connection_match_compute_translations(original_cm); + sfe_ipv6_connection_match_compute_translations(reply_cm); + sfe_ipv6_insert_connection(si, c); + + spin_unlock_bh(&si->lock); + + /* + * We have everything we need! + */ + DEBUG_INFO("new connection - mark: %08x, p: %d\n" + " s: %s:%pM(%pM):%pI6(%pI6):%u(%u)\n" + " d: %s:%pM(%pM):%pI6(%pI6):%u(%u)\n", + sic->mark, sic->protocol, + sic->src_dev->name, sic->src_mac, sic->src_mac_xlate, + sic->src_ip.ip6, sic->src_ip_xlate.ip6, ntohs(sic->src_port), ntohs(sic->src_port_xlate), + dest_dev->name, sic->dest_mac, sic->dest_mac_xlate, + sic->dest_ip.ip6, sic->dest_ip_xlate.ip6, ntohs(sic->dest_port), ntohs(sic->dest_port_xlate)); + + return 0; +} + +/* + * sfe_ipv6_destroy_rule() + * Destroy a forwarding rule. + */ +void sfe_ipv6_destroy_rule(struct sfe_connection_destroy *sid) +{ + struct sfe_ipv6 *si = &__si6; + struct sfe_ipv6_connection *c; + + spin_lock_bh(&si->lock); + si->connection_destroy_requests++; + + /* + * Check to see if we have a flow that matches the rule we're trying + * to destroy. If there isn't then we can't destroy it. + */ + c = sfe_ipv6_find_connection(si, sid->protocol, sid->src_ip.ip6, sid->src_port, + sid->dest_ip.ip6, sid->dest_port); + if (!c) { + si->connection_destroy_misses++; + spin_unlock_bh(&si->lock); + + DEBUG_TRACE("connection does not exist - p: %d, s: %pI6:%u, d: %pI6:%u\n", + sid->protocol, sid->src_ip.ip6, ntohs(sid->src_port), + sid->dest_ip.ip6, ntohs(sid->dest_port)); + return; + } + + /* + * Remove our connection details from the hash tables. + */ + sfe_ipv6_remove_connection(si, c); + spin_unlock_bh(&si->lock); + + sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY); + + DEBUG_INFO("connection destroyed - p: %d, s: %pI6:%u, d: %pI6:%u\n", + sid->protocol, sid->src_ip.ip6, ntohs(sid->src_port), + sid->dest_ip.ip6, ntohs(sid->dest_port)); +} + +/* + * sfe_ipv6_register_sync_rule_callback() + * Register a callback for rule synchronization. + */ +void sfe_ipv6_register_sync_rule_callback(sfe_sync_rule_callback_t sync_rule_callback) +{ + struct sfe_ipv6 *si = &__si6; + + spin_lock_bh(&si->lock); + rcu_assign_pointer(si->sync_rule_callback, sync_rule_callback); + spin_unlock_bh(&si->lock); +} + +/* + * sfe_ipv6_get_debug_dev() + */ +static ssize_t sfe_ipv6_get_debug_dev(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct sfe_ipv6 *si = &__si6; + ssize_t count; + int num; + + spin_lock_bh(&si->lock); + num = si->debug_dev; + spin_unlock_bh(&si->lock); + + count = snprintf(buf, (ssize_t)PAGE_SIZE, "%d\n", num); + return count; +} + +/* + * sfe_ipv6_destroy_all_rules_for_dev() + * Destroy all connections that match a particular device. + * + * If we pass dev as NULL then this destroys all connections. 
+ */
+void sfe_ipv6_destroy_all_rules_for_dev(struct net_device *dev)
+{
+	struct sfe_ipv6 *si = &__si6;
+	struct sfe_ipv6_connection *c;
+
+another_round:
+	spin_lock_bh(&si->lock);
+
+	for (c = si->all_connections_head; c; c = c->all_connections_next) {
+		/*
+		 * Does this connection relate to the device we are destroying?
+		 */
+		if (!dev
+		    || (dev == c->original_dev)
+		    || (dev == c->reply_dev)) {
+			break;
+		}
+	}
+
+	if (c) {
+		sfe_ipv6_remove_connection(si, c);
+	}
+
+	spin_unlock_bh(&si->lock);
+
+	if (c) {
+		sfe_ipv6_flush_connection(si, c, SFE_SYNC_REASON_DESTROY);
+		goto another_round;
+	}
+}
+
+/*
+ * sfe_ipv6_periodic_sync()
+ */
+static void sfe_ipv6_periodic_sync(unsigned long arg)
+{
+	struct sfe_ipv6 *si = (struct sfe_ipv6 *)arg;
+	u64 now_jiffies;
+	int quota;
+	sfe_sync_rule_callback_t sync_rule_callback;
+
+	now_jiffies = get_jiffies_64();
+
+	rcu_read_lock();
+	sync_rule_callback = rcu_dereference(si->sync_rule_callback);
+	if (!sync_rule_callback) {
+		rcu_read_unlock();
+		goto done;
+	}
+
+	spin_lock_bh(&si->lock);
+	sfe_ipv6_update_summary_stats(si);
+
+	/*
+	 * Get an estimate of the number of connections to parse in this sync.
+	 */
+	quota = (si->num_connections + 63) / 64;
+
+	/*
+	 * Walk the "active" list and sync the connection state.
+	 */
+	while (quota--) {
+		struct sfe_ipv6_connection_match *cm;
+		struct sfe_ipv6_connection_match *counter_cm;
+		struct sfe_ipv6_connection *c;
+		struct sfe_connection_sync sis;
+
+		cm = si->active_head;
+		if (!cm) {
+			break;
+		}
+
+		/*
+		 * There's a possibility that our counter match is in the active list too.
+		 * If it is then remove it.
+		 */
+		counter_cm = cm->counter_match;
+		if (counter_cm->active) {
+			counter_cm->active = false;
+
+			/*
+			 * We must have a connection preceding this counter match
+			 * because that's the one that got us to this point, so we don't have
+			 * to worry about removing the head of the list.
+			 */
+			counter_cm->active_prev->active_next = counter_cm->active_next;
+
+			if (likely(counter_cm->active_next)) {
+				counter_cm->active_next->active_prev = counter_cm->active_prev;
+			} else {
+				si->active_tail = counter_cm->active_prev;
+			}
+
+			counter_cm->active_next = NULL;
+			counter_cm->active_prev = NULL;
+		}
+
+		/*
+		 * Now remove the head of the active scan list.
+		 */
+		cm->active = false;
+		si->active_head = cm->active_next;
+		if (likely(cm->active_next)) {
+			cm->active_next->active_prev = NULL;
+		} else {
+			si->active_tail = NULL;
+		}
+		cm->active_next = NULL;
+
+		/*
+		 * Sync the connection state.
+		 */
+		c = cm->connection;
+		sfe_ipv6_gen_sync_connection(si, c, &sis, SFE_SYNC_REASON_STATS, now_jiffies);
+
+		/*
+		 * We don't want to be holding the lock when we sync!
+		 */
+		spin_unlock_bh(&si->lock);
+		sync_rule_callback(&sis);
+		spin_lock_bh(&si->lock);
+	}
+
+	spin_unlock_bh(&si->lock);
+	rcu_read_unlock();
+
+done:
+	mod_timer(&si->timer, jiffies + ((HZ + 99) / 100));
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_start()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+					  int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	si->debug_read_seq++;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "<sfe_ipv6>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_connections_start()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_connections_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+						      int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<connections>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_connections_connection()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_connections_connection(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+							   int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	struct sfe_ipv6_connection *c;
+	struct sfe_ipv6_connection_match *original_cm;
+	struct sfe_ipv6_connection_match *reply_cm;
+	int bytes_read;
+	int protocol;
+	struct net_device *src_dev;
+	struct sfe_ipv6_addr src_ip;
+	struct sfe_ipv6_addr src_ip_xlate;
+	__be16 src_port;
+	__be16 src_port_xlate;
+	u64 src_rx_packets;
+	u64 src_rx_bytes;
+	struct net_device *dest_dev;
+	struct sfe_ipv6_addr dest_ip;
+	struct sfe_ipv6_addr dest_ip_xlate;
+	__be16 dest_port;
+	__be16 dest_port_xlate;
+	u64 dest_rx_packets;
+	u64 dest_rx_bytes;
+	u64 last_sync_jiffies;
+	u32 mark, src_priority, dest_priority, src_dscp, dest_dscp;
+
+	spin_lock_bh(&si->lock);
+
+	for (c = si->all_connections_head; c; c = c->all_connections_next) {
+		if (c->debug_read_seq < si->debug_read_seq) {
+			c->debug_read_seq = si->debug_read_seq;
+			break;
+		}
+	}
+
+	/*
+	 * If there were no connections then move to the next state.
+	 */
+	if (!c) {
+		spin_unlock_bh(&si->lock);
+		ws->state++;
+		return true;
+	}
+
+	original_cm = c->original_match;
+	reply_cm = c->reply_match;
+
+	protocol = c->protocol;
+	src_dev = c->original_dev;
+	src_ip = c->src_ip[0];
+	src_ip_xlate = c->src_ip_xlate[0];
+	src_port = c->src_port;
+	src_port_xlate = c->src_port_xlate;
+	src_priority = original_cm->priority;
+	src_dscp = original_cm->dscp >> SFE_IPV6_DSCP_SHIFT;
+
+	sfe_ipv6_connection_match_update_summary_stats(original_cm);
+	sfe_ipv6_connection_match_update_summary_stats(reply_cm);
+
+	src_rx_packets = original_cm->rx_packet_count64;
+	src_rx_bytes = original_cm->rx_byte_count64;
+	dest_dev = c->reply_dev;
+	dest_ip = c->dest_ip[0];
+	dest_ip_xlate = c->dest_ip_xlate[0];
+	dest_port = c->dest_port;
+	dest_port_xlate = c->dest_port_xlate;
+	dest_priority = reply_cm->priority;
+	dest_dscp = reply_cm->dscp >> SFE_IPV6_DSCP_SHIFT;
+	dest_rx_packets = reply_cm->rx_packet_count64;
+	dest_rx_bytes = reply_cm->rx_byte_count64;
+	last_sync_jiffies = get_jiffies_64() - c->last_sync_jiffies;
+	mark = c->mark;
+
+	spin_unlock_bh(&si->lock);
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t\t<connection "
+			      "protocol=\"%u\" src_dev=\"%s\" "
+			      "src_ip=\"%pI6\" src_ip_xlate=\"%pI6\" "
+			      "src_port=\"%u\" src_port_xlate=\"%u\" "
+			      "src_priority=\"%u\" src_dscp=\"%u\" "
+			      "src_rx_pkts=\"%llu\" src_rx_bytes=\"%llu\" "
+			      "dest_dev=\"%s\" "
+			      "dest_ip=\"%pI6\" dest_ip_xlate=\"%pI6\" "
+			      "dest_port=\"%u\" dest_port_xlate=\"%u\" "
+			      "dest_priority=\"%u\" dest_dscp=\"%u\" "
+			      "dest_rx_pkts=\"%llu\" dest_rx_bytes=\"%llu\" "
+			      "last_sync=\"%llu\" mark=\"%08x\" />\n",
+			      protocol,
+			      src_dev->name,
+			      &src_ip, &src_ip_xlate,
+			      ntohs(src_port), ntohs(src_port_xlate),
+			      src_priority, src_dscp,
+			      src_rx_packets, src_rx_bytes,
+			      dest_dev->name,
+			      &dest_ip, &dest_ip_xlate,
+			      ntohs(dest_port), ntohs(dest_port_xlate),
+			      dest_priority, dest_dscp,
+			      dest_rx_packets, dest_rx_bytes,
+			      last_sync_jiffies, mark);
+
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_connections_end()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_connections_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+						    int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</connections>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_exceptions_start()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_exceptions_start(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+						     int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<exceptions>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_exceptions_exception()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_exceptions_exception(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+							 int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	u64 ct;
+
+	spin_lock_bh(&si->lock);
+	ct = si->exception_events64[ws->iter_exception];
+	spin_unlock_bh(&si->lock);
+
+	if (ct) {
+		int bytes_read;
+
+		bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE,
+				      "\t\t<exception name=\"%s\" count=\"%llu\" />\n",
+				      sfe_ipv6_exception_events_string[ws->iter_exception],
+				      ct);
+		if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+			return false;
+		}
+
+		*length -= bytes_read;
+		*total_read += bytes_read;
+	}
+
+	ws->iter_exception++;
+	if (ws->iter_exception >= SFE_IPV6_EXCEPTION_EVENT_LAST) {
+		ws->iter_exception = 0;
+		ws->state++;
+	}
+
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_exceptions_end()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_exceptions_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+						   int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t</exceptions>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_stats()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_stats(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+					  int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+	unsigned int num_connections;
+	u64 packets_forwarded;
+	u64 packets_not_forwarded;
+	u64 connection_create_requests;
+	u64 connection_create_collisions;
+	u64 connection_destroy_requests;
+	u64 connection_destroy_misses;
+	u64 connection_flushes;
+	u64 connection_match_hash_hits;
+	u64 connection_match_hash_reorders;
+
+	spin_lock_bh(&si->lock);
+	sfe_ipv6_update_summary_stats(si);
+
+	num_connections = si->num_connections;
+	packets_forwarded = si->packets_forwarded64;
+	packets_not_forwarded = si->packets_not_forwarded64;
+	connection_create_requests = si->connection_create_requests64;
+	connection_create_collisions = si->connection_create_collisions64;
+	connection_destroy_requests = si->connection_destroy_requests64;
+	connection_destroy_misses = si->connection_destroy_misses64;
+	connection_flushes = si->connection_flushes64;
+	connection_match_hash_hits = si->connection_match_hash_hits64;
+	connection_match_hash_reorders = si->connection_match_hash_reorders64;
+	spin_unlock_bh(&si->lock);
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "\t<stats "
+			      "num_connections=\"%u\" "
+			      "pkts_forwarded=\"%llu\" pkts_not_forwarded=\"%llu\" "
+			      "create_requests=\"%llu\" create_collisions=\"%llu\" "
+			      "destroy_requests=\"%llu\" destroy_misses=\"%llu\" "
+			      "flushes=\"%llu\" "
+			      "hash_hits=\"%llu\" hash_reorders=\"%llu\" />\n",
+			      num_connections,
+			      packets_forwarded,
+			      packets_not_forwarded,
+			      connection_create_requests,
+			      connection_create_collisions,
+			      connection_destroy_requests,
+			      connection_destroy_misses,
+			      connection_flushes,
+			      connection_match_hash_hits,
+			      connection_match_hash_reorders);
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * sfe_ipv6_debug_dev_read_end()
+ *	Generate part of the XML output.
+ */
+static bool sfe_ipv6_debug_dev_read_end(struct sfe_ipv6 *si, char *buffer, char *msg, size_t *length,
+					int *total_read, struct sfe_ipv6_debug_xml_write_state *ws)
+{
+	int bytes_read;
+
+	bytes_read = snprintf(msg, CHAR_DEV_MSG_SIZE, "</sfe_ipv6>\n");
+	if (copy_to_user(buffer + *total_read, msg, CHAR_DEV_MSG_SIZE)) {
+		return false;
+	}
+
+	*length -= bytes_read;
+	*total_read += bytes_read;
+
+	ws->state++;
+	return true;
+}
+
+/*
+ * Array of write functions that write various XML elements that correspond to
+ * our XML output state machine.
+ */ +static sfe_ipv6_debug_xml_write_method_t sfe_ipv6_debug_xml_write_methods[SFE_IPV6_DEBUG_XML_STATE_DONE] = { + sfe_ipv6_debug_dev_read_start, + sfe_ipv6_debug_dev_read_connections_start, + sfe_ipv6_debug_dev_read_connections_connection, + sfe_ipv6_debug_dev_read_connections_end, + sfe_ipv6_debug_dev_read_exceptions_start, + sfe_ipv6_debug_dev_read_exceptions_exception, + sfe_ipv6_debug_dev_read_exceptions_end, + sfe_ipv6_debug_dev_read_stats, + sfe_ipv6_debug_dev_read_end, +}; + +/* + * sfe_ipv6_debug_dev_read() + * Send info to userspace upon read request from user + */ +static ssize_t sfe_ipv6_debug_dev_read(struct file *filp, char *buffer, size_t length, loff_t *offset) +{ + char msg[CHAR_DEV_MSG_SIZE]; + int total_read = 0; + struct sfe_ipv6_debug_xml_write_state *ws; + struct sfe_ipv6 *si = &__si6; + + ws = (struct sfe_ipv6_debug_xml_write_state *)filp->private_data; + while ((ws->state != SFE_IPV6_DEBUG_XML_STATE_DONE) && (length > CHAR_DEV_MSG_SIZE)) { + if ((sfe_ipv6_debug_xml_write_methods[ws->state])(si, buffer, msg, &length, &total_read, ws)) { + continue; + } + } + + return total_read; +} + +/* + * sfe_ipv6_debug_dev_write() + * Write to char device resets some stats + */ +static ssize_t sfe_ipv6_debug_dev_write(struct file *filp, const char *buffer, size_t length, loff_t *offset) +{ + struct sfe_ipv6 *si = &__si6; + + spin_lock_bh(&si->lock); + sfe_ipv6_update_summary_stats(si); + + si->packets_forwarded64 = 0; + si->packets_not_forwarded64 = 0; + si->connection_create_requests64 = 0; + si->connection_create_collisions64 = 0; + si->connection_destroy_requests64 = 0; + si->connection_destroy_misses64 = 0; + si->connection_flushes64 = 0; + si->connection_match_hash_hits64 = 0; + si->connection_match_hash_reorders64 = 0; + spin_unlock_bh(&si->lock); + + return length; +} + +/* + * sfe_ipv6_debug_dev_open() + */ +static int sfe_ipv6_debug_dev_open(struct inode *inode, struct file *file) +{ + struct sfe_ipv6_debug_xml_write_state *ws; + + ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data; + if (ws) { + return 0; + } + + ws = kzalloc(sizeof(struct sfe_ipv6_debug_xml_write_state), GFP_KERNEL); + if (!ws) { + return -ENOMEM; + } + + ws->state = SFE_IPV6_DEBUG_XML_STATE_START; + file->private_data = ws; + + return 0; +} + +/* + * sfe_ipv6_debug_dev_release() + */ +static int sfe_ipv6_debug_dev_release(struct inode *inode, struct file *file) +{ + struct sfe_ipv6_debug_xml_write_state *ws; + + ws = (struct sfe_ipv6_debug_xml_write_state *)file->private_data; + if (ws) { + /* + * We've finished with our output so free the write state. + */ + kfree(ws); + } + + return 0; +} + +/* + * File operations used in the debug char device + */ +static struct file_operations sfe_ipv6_debug_dev_fops = { + .read = sfe_ipv6_debug_dev_read, + .write = sfe_ipv6_debug_dev_write, + .open = sfe_ipv6_debug_dev_open, + .release = sfe_ipv6_debug_dev_release +}; + +/* + * sfe_ipv6_init() + */ +static int __init sfe_ipv6_init(void) +{ + struct sfe_ipv6 *si = &__si6; + int result = -1; + + DEBUG_INFO("SFE IPv6 init\n"); + + /* + * Create sys/sfe_ipv6 + */ + si->sys_sfe_ipv6 = kobject_create_and_add("sfe_ipv6", NULL); + if (!si->sys_sfe_ipv6) { + DEBUG_ERROR("failed to register sfe_ipv6\n"); + goto exit1; + } + + /* + * Create files, one for each parameter supported by this module. 
+ */ + result = sysfs_create_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr); + if (result) { + DEBUG_ERROR("failed to register debug dev file: %d\n", result); + goto exit2; + } + + /* + * Register our debug char device. + */ + result = register_chrdev(0, "sfe_ipv6", &sfe_ipv6_debug_dev_fops); + if (result < 0) { + DEBUG_ERROR("Failed to register chrdev: %d\n", result); + goto exit3; + } + + si->debug_dev = result; + + /* + * Create a timer to handle periodic statistics. + */ + setup_timer(&si->timer, sfe_ipv6_periodic_sync, (unsigned long)si); + mod_timer(&si->timer, jiffies + ((HZ + 99) / 100)); + + spin_lock_init(&si->lock); + + return 0; + +exit3: + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr); + +exit2: + kobject_put(si->sys_sfe_ipv6); + +exit1: + return result; +} + +/* + * sfe_ipv6_exit() + */ +static void __exit sfe_ipv6_exit(void) +{ + struct sfe_ipv6 *si = &__si6; + + DEBUG_INFO("SFE IPv6 exit\n"); + + /* + * Destroy all connections. + */ + sfe_ipv6_destroy_all_rules_for_dev(NULL); + + del_timer_sync(&si->timer); + + unregister_chrdev(si->debug_dev, "sfe_ipv6"); + + sysfs_remove_file(si->sys_sfe_ipv6, &sfe_ipv6_debug_dev_attr.attr); + + kobject_put(si->sys_sfe_ipv6); +} + +module_init(sfe_ipv6_init) +module_exit(sfe_ipv6_exit) + +EXPORT_SYMBOL(sfe_ipv6_recv); +EXPORT_SYMBOL(sfe_ipv6_create_rule); +EXPORT_SYMBOL(sfe_ipv6_destroy_rule); +EXPORT_SYMBOL(sfe_ipv6_destroy_all_rules_for_dev); +EXPORT_SYMBOL(sfe_ipv6_register_sync_rule_callback); +EXPORT_SYMBOL(sfe_ipv6_mark_rule); +EXPORT_SYMBOL(sfe_ipv6_update_rule); + +MODULE_DESCRIPTION("Shortcut Forwarding Engine - IPv6 support"); +MODULE_LICENSE("Dual BSD/GPL"); + diff --git a/package/kernel/shortcut-fe/src/userspace_example.c b/package/kernel/shortcut-fe/src/userspace_example.c new file mode 100644 index 000000000000..8bdd4670cd8c --- /dev/null +++ b/package/kernel/shortcut-fe/src/userspace_example.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2013,2016 The Linux Foundation. All rights reserved. + * Permission to use, copy, modify, and/or distribute this software for + * any purpose with or without fee is hereby granted, provided that the + * above copyright notice and this permission notice appear in all copies. + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT + * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. 
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <arpa/inet.h>
+#include <netlink/genl/genl.h>
+#include <netlink/genl/ctrl.h>
+
+#include "fast-classifier.h"
+
+static struct nl_sock *sock;
+static struct nl_sock *sock_event;
+static int family;
+static int grp_id;
+
+static struct nla_policy fast_classifier_genl_policy[FAST_CLASSIFIER_A_MAX + 1] = {
+	[FAST_CLASSIFIER_A_TUPLE] = { .type = NLA_UNSPEC },
+};
+
+void dump_fc_tuple(struct fast_classifier_tuple *fc_msg)
+{
+	char src_str[INET_ADDRSTRLEN];
+	char dst_str[INET_ADDRSTRLEN];
+
+	printf("TUPLE: %d, %s, %s, %d, %d"
+	       " SMAC=%02x:%02x:%02x:%02x:%02x:%02x"
+	       " DMAC=%02x:%02x:%02x:%02x:%02x:%02x\n",
+	       fc_msg->proto,
+	       inet_ntop(AF_INET,
+			 &fc_msg->src_saddr.in.s_addr,
+			 src_str,
+			 INET_ADDRSTRLEN),
+	       inet_ntop(AF_INET,
+			 &fc_msg->dst_saddr.in.s_addr,
+			 dst_str,
+			 INET_ADDRSTRLEN),
+	       fc_msg->sport, fc_msg->dport,
+	       fc_msg->smac[0], fc_msg->smac[1], fc_msg->smac[2],
+	       fc_msg->smac[3], fc_msg->smac[4], fc_msg->smac[5],
+	       fc_msg->dmac[0], fc_msg->dmac[1], fc_msg->dmac[2],
+	       fc_msg->dmac[3], fc_msg->dmac[4], fc_msg->dmac[5]);
+}
+
+static int parse_cb(struct nl_msg *msg, void *arg)
+{
+	struct nlmsghdr *nlh = nlmsg_hdr(msg);
+	struct genlmsghdr *gnlh = nlmsg_data(nlh);
+	struct nlattr *attrs[FAST_CLASSIFIER_A_MAX + 1];
+
+	genlmsg_parse(nlh, 0, attrs, FAST_CLASSIFIER_A_MAX, fast_classifier_genl_policy);
+
+	switch (gnlh->cmd) {
+	case FAST_CLASSIFIER_C_OFFLOADED:
+		printf("Got an offloaded message\n");
+		dump_fc_tuple(nla_data(attrs[FAST_CLASSIFIER_A_TUPLE]));
+		return NL_OK;
+	case FAST_CLASSIFIER_C_DONE:
+		printf("Got a done message\n");
+		dump_fc_tuple(nla_data(attrs[FAST_CLASSIFIER_A_TUPLE]));
+		return NL_OK;
+	}
+
+	return NL_SKIP;
+}
+
+int fast_classifier_init(void)
+{
+	int err;
+
+	sock = nl_socket_alloc();
+	if (!sock) {
+		printf("Unable to allocate socket.\n");
+		return -1;
+	}
+	genl_connect(sock);
+
+	sock_event = nl_socket_alloc();
+	if (!sock_event) {
+		nl_close(sock);
+		nl_socket_free(sock);
+		printf("Unable to allocate socket.\n");
+		return -1;
+	}
+	genl_connect(sock_event);
+
+	family = genl_ctrl_resolve(sock, FAST_CLASSIFIER_GENL_NAME);
+	if (family < 0) {
+		nl_close(sock_event);
+		nl_close(sock);
+		nl_socket_free(sock);
+		nl_socket_free(sock_event);
+		printf("Unable to resolve family\n");
+		return -1;
+	}
+
+	grp_id = genl_ctrl_resolve_grp(sock, FAST_CLASSIFIER_GENL_NAME,
+				       FAST_CLASSIFIER_GENL_MCGRP);
+	if (grp_id < 0) {
+		printf("Unable to resolve mcast group\n");
+		return -1;
+	}
+
+	err = nl_socket_add_membership(sock_event, grp_id);
+	if (err < 0) {
+		printf("Unable to add membership\n");
+		return -1;
+	}
+
+	nl_socket_disable_seq_check(sock_event);
+	nl_socket_modify_cb(sock_event, NL_CB_VALID, NL_CB_CUSTOM, parse_cb, NULL);
+
+	return 0;
+}
+
+void fast_classifier_close(void)
+{
+	nl_close(sock_event);
+	nl_close(sock);
+	nl_socket_free(sock_event);
+	nl_socket_free(sock);
+}
+
+void fast_classifier_ipv4_offload(unsigned char proto, unsigned long src_saddr,
+				  unsigned long dst_saddr, unsigned short sport,
+				  unsigned short dport)
+{
+	struct nl_msg *msg;
+	int ret;
+#ifdef DEBUG
+	char src_str[INET_ADDRSTRLEN];
+	char dst_str[INET_ADDRSTRLEN];
+#endif
+	struct fast_classifier_tuple fc_msg;
+
+#ifdef DEBUG
+	printf("DEBUG: would offload: %d, %s, %s, %d, %d\n", proto,
+	       inet_ntop(AF_INET, &src_saddr, src_str, INET_ADDRSTRLEN),
+	       inet_ntop(AF_INET, &dst_saddr, dst_str, INET_ADDRSTRLEN),
+	       sport, dport);
+#endif
+
+	fc_msg.proto = proto;
+	fc_msg.src_saddr.in.s_addr = src_saddr;
+	fc_msg.dst_saddr.in.s_addr = dst_saddr;
+	fc_msg.sport = sport;
+	fc_msg.dport = dport;
+	fc_msg.smac[0] = 'a';
+	fc_msg.smac[1] = 'b';
+	fc_msg.smac[2] = 'c';
+	fc_msg.smac[3] = 'd';
+	fc_msg.smac[4] = 'e';
+	fc_msg.smac[5] = 'f';
+	fc_msg.dmac[0] = 'f';
+	fc_msg.dmac[1] = 'e';
+	fc_msg.dmac[2] = 'd';
+	fc_msg.dmac[3] = 'c';
+	fc_msg.dmac[4] = 'b';
+	fc_msg.dmac[5] = 'a';
+
+	if (fast_classifier_init() < 0) {
+		printf("Unable to init generic netlink\n");
+		exit(1);
+	}
+
+	msg = nlmsg_alloc();
+	if (!msg) {
+		nl_socket_free(sock);
+		printf("Unable to allocate message\n");
+		return;
+	}
+
+	genlmsg_put(msg, NL_AUTO_PID, NL_AUTO_SEQ, family,
+		    FAST_CLASSIFIER_GENL_HDRSIZE, NLM_F_REQUEST,
+		    FAST_CLASSIFIER_C_OFFLOAD, FAST_CLASSIFIER_GENL_VERSION);
+	nla_put(msg, 1, sizeof(fc_msg), &fc_msg);
+
+	ret = nl_send_auto_complete(sock, msg);
+
+	nlmsg_free(msg);
+	if (ret < 0) {
+		printf("nl_send_auto_complete failed\n");
+		nl_close(sock);
+		nl_socket_free(sock);
+		return;
+	}
+
+	ret = nl_wait_for_ack(sock);
+	if (ret < 0) {
+		printf("wait for ack failed\n");
+		nl_close(sock);
+		nl_socket_free(sock);
+		return;
+	}
+}
+
+void fast_classifier_listen_for_messages(void)
+{
+	printf("waiting for netlink events\n");
+
+	while (1) {
+		nl_recvmsgs_default(sock_event);
+	}
+}
+
+int main(int argc, char *argv[])
+{
+	if (fast_classifier_init() < 0) {
+		printf("Unable to init generic netlink\n");
+		exit(1);
+	}
+
+	fast_classifier_ipv4_offload('a', 0, 0, 0, 0);
+
+	/* this never returns */
+	fast_classifier_listen_for_messages();
+
+	fast_classifier_close();
+
+	return 0;
+}
diff --git a/target/linux/generic/config-4.4 b/target/linux/generic/config-4.4
index 1c0af9597fd5..980a7871e8d1 100644
--- a/target/linux/generic/config-4.4
+++ b/target/linux/generic/config-4.4
@@ -2682,6 +2682,7 @@ CONFIG_NFS_V3=y
 # CONFIG_NFT_DUP_IPV6 is not set
 # CONFIG_NF_CONNTRACK is not set
 # CONFIG_NF_CONNTRACK_AMANDA is not set
+# CONFIG_NF_CONNTRACK_CHAIN_EVENTS is not set
 # CONFIG_NF_CONNTRACK_EVENTS is not set
 # CONFIG_NF_CONNTRACK_FTP is not set
 # CONFIG_NF_CONNTRACK_H323 is not set
@@ -3650,6 +3651,7 @@ CONFIG_SHMEM=y
 # CONFIG_SH_ETH is not set
 # CONFIG_SH_TIMER_CMT is not set
 # CONFIG_SH_TIMER_MTU2 is not set
+# CONFIG_SHORTCUT_FE is not set
 # CONFIG_SH_TIMER_TMU is not set
 # CONFIG_SI7005 is not set
 # CONFIG_SI7020 is not set
diff --git a/target/linux/generic/config-4.9 b/target/linux/generic/config-4.9
index 24bbbc05878a..1b1df923ee35 100644
--- a/target/linux/generic/config-4.9
+++ b/target/linux/generic/config-4.9
@@ -2951,6 +2951,7 @@ CONFIG_NFS_V3=y
 # CONFIG_NFT_DUP_IPV6 is not set
 # CONFIG_NF_CONNTRACK is not set
 # CONFIG_NF_CONNTRACK_AMANDA is not set
+# CONFIG_NF_CONNTRACK_CHAIN_EVENTS is not set
 # CONFIG_NF_CONNTRACK_EVENTS is not set
 # CONFIG_NF_CONNTRACK_FTP is not set
 # CONFIG_NF_CONNTRACK_H323 is not set
@@ -3981,6 +3982,7 @@ CONFIG_SHMEM=y
 # CONFIG_SH_TIMER_CMT is not set
 # CONFIG_SH_TIMER_MTU2 is not set
 # CONFIG_SH_TIMER_TMU is not set
+# CONFIG_SHORTCUT_FE is not set
 # CONFIG_SI1145 is not set
 # CONFIG_SI7005 is not set
 # CONFIG_SI7020 is not set
diff --git a/target/linux/generic/hack-4.4/950-net-patch-linux-kernel-to-support-shortcut-fe.patch b/target/linux/generic/hack-4.4/950-net-patch-linux-kernel-to-support-shortcut-fe.patch
new file mode 100644
index 000000000000..a410015e2e98
--- /dev/null
+++ b/target/linux/generic/hack-4.4/950-net-patch-linux-kernel-to-support-shortcut-fe.patch
@@ -0,0 +1,145 @@
+From ee0b636607a8685dcac16b217ee49432de171cb0 Mon Sep 17 00:00:00 2001
+From: Xiaoping Fan
+Date: Fri, 26 Feb 2016 15:01:53 -0800
+Subject: [PATCH 1/3] net: patch linux kernel to support shortcut-fe
+
+1, add a new flag
'fast_forwarded' in skb structure. +2, put a hook in '__netif_receive_skb_core' to + deliver packet to shortcut-fe. + +Change-Id: Icaa7c172a06df1c3bc89ff89814d1136772fe217 +Signed-off-by: Xiaoping Fan + +msm: ipq806x: exporting TCP sequence check parameters + +This is for use in NSS connection manager. + +Change-Id: I01d30c0ab552308c439353c0d51d3d0ab3aa7699 +Signed-off-by: Pamidipati, Vijay +Signed-off-by: Murat Sezgin +--- + include/linux/skbuff.h | 5 +++++ + net/Kconfig | 3 +++ + net/core/dev.c | 27 +++++++++++++++++++++++++++ + net/netfilter/nf_conntrack_proto_tcp.c | 10 ++++++++++ + 4 files changed, 45 insertions(+) + +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index a4b1aaa..f72eea2 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -643,7 +643,12 @@ struct sk_buff { + __u8 inner_protocol_type:1; + __u8 remcsum_offload:1; + __u8 gro_skip:1; ++#ifdef CONFIG_SHORTCUT_FE ++ __u8 fast_forwarded:1; ++ /* 1 or 3 bit hole */ ++#else + /* 2 or 4 bit hole */ ++#endif + + #ifdef CONFIG_NET_SCHED + __u16 tc_index; /* traffic control index */ +diff --git a/net/Kconfig b/net/Kconfig +index 6dedbb5..e8c6e6c 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -397,3 +397,6 @@ endif # if NET + # Used by archs to tell that they support BPF_JIT + config HAVE_BPF_JIT + bool ++ ++config SHORTCUT_FE ++ bool "Enables kernel network stack path for Shortcut Forwarding Engine" +diff --git a/net/core/dev.c b/net/core/dev.c +index 11b9cda..3761691 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -2732,8 +2732,17 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, + unsigned int len; + int rc; + ++#ifdef CONFIG_SHORTCUT_FE ++ /* If this skb has been fast forwarded then we don't want it to ++ * go to any taps (by definition we're trying to bypass them). ++ */ ++ if (!skb->fast_forwarded) { ++#endif + if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) + dev_queue_xmit_nit(skb, dev); ++#ifdef CONFIG_SHORTCUT_FE ++ } ++#endif + + #ifdef CONFIG_ETHERNET_PACKET_MANGLE + if (!dev->eth_mangle_tx || +@@ -3825,6 +3834,11 @@ void netdev_rx_handler_unregister(struct net_device *dev) + } + EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + ++#ifdef CONFIG_SHORTCUT_FE ++int (*fast_nat_recv)(struct sk_buff *skb) __rcu __read_mostly; ++EXPORT_SYMBOL_GPL(fast_nat_recv); ++#endif ++ + /* + * Limit the use of PFMEMALLOC reserves to those protocols that implement + * the special handling of PFMEMALLOC skbs. 
+@@ -3867,6 +3881,9 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) + bool deliver_exact = false; + int ret = NET_RX_DROP; + __be16 type; ++#ifdef CONFIG_SHORTCUT_FE ++ int (*fast_recv)(struct sk_buff *skb); ++#endif + + net_timestamp_check(!netdev_tstamp_prequeue, skb); + +@@ -3893,6 +3910,16 @@ another_round: + goto out; + } + ++#ifdef CONFIG_SHORTCUT_FE ++ fast_recv = rcu_dereference(fast_nat_recv); ++ if (fast_recv) { ++ if (fast_recv(skb)) { ++ ret = NET_RX_SUCCESS; ++ goto out; ++ } ++ } ++#endif ++ + #ifdef CONFIG_NET_CLS_ACT + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); +diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c +index d41fdab..6f3cdfd 100644 +--- a/net/netfilter/nf_conntrack_proto_tcp.c ++++ b/net/netfilter/nf_conntrack_proto_tcp.c +@@ -34,12 +34,22 @@ + #include + + /* Do not check the TCP window for incoming packets */ ++#ifdef CONFIG_SHORTCUT_FE ++int nf_ct_tcp_no_window_check __read_mostly = 0; ++EXPORT_SYMBOL_GPL(nf_ct_tcp_no_window_check); ++#else + static int nf_ct_tcp_no_window_check __read_mostly = 1; ++#endif + + /* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ ++#ifdef CONFIG_SHORTCUT_FE ++int nf_ct_tcp_be_liberal __read_mostly; ++EXPORT_SYMBOL_GPL(nf_ct_tcp_be_liberal); ++#else + static int nf_ct_tcp_be_liberal __read_mostly = 0; ++#endif + + /* If it is set to zero, we disable picking up already established + connections. */ +-- +2.7.4 + diff --git a/target/linux/generic/hack-4.4/951-bridge-Add-new-bridge-APIs-needed-for-network-HW-acc.patch b/target/linux/generic/hack-4.4/951-bridge-Add-new-bridge-APIs-needed-for-network-HW-acc.patch new file mode 100644 index 000000000000..315314327990 --- /dev/null +++ b/target/linux/generic/hack-4.4/951-bridge-Add-new-bridge-APIs-needed-for-network-HW-acc.patch @@ -0,0 +1,74 @@ +From b24526cad8642f974d3c3c440824ecca68ebd9f0 Mon Sep 17 00:00:00 2001 +From: Murat Sezgin +Date: Tue, 25 Nov 2014 17:22:24 -0800 +Subject: [PATCH 2/3] bridge: Add new bridge APIs needed for network HW + acceleration + +Bridge acceleration hardware needs to perform certain operations, + currently unsupported by the existing bridge code: + + ** cut ** + + *update bridge interface statistics from outside the bridge code: + once acceleration is enabled on a connection, packets will not flow + through the host CPU, so we need the hardware accelerator driver to + maintain the statistics on the host and update them and add whatever + flows through the hardware. + These change adds the corresponding functions, and make it available + to other through EXPORT_SYMBOLS(). 
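The statistics path described above looks like this from the offload driver's side. A minimal sketch, assuming only the br_dev_update_stats() API this patch introduces; the device pointer and counter values are placeholders for whatever the engine tracks per flow:

    #include <linux/if_bridge.h>
    #include <linux/netdevice.h>

    /* Fold offload-engine counters for an accelerated flow back into the
     * bridge device statistics. Safe to call with any device: the helper
     * itself checks IFF_EBRIDGE and ignores non-bridge devices. */
    static void example_sync_bridge_stats(struct net_device *br_dev,
                                          u64 rx_pkts, u64 rx_bytes,
                                          u64 tx_pkts, u64 tx_bytes)
    {
            struct rtnl_link_stats64 nlstats = { 0 };

            nlstats.rx_packets = rx_pkts;
            nlstats.rx_bytes = rx_bytes;
            nlstats.tx_packets = tx_pkts;
            nlstats.tx_bytes = tx_bytes;

            br_dev_update_stats(br_dev, &nlstats);
    }

fast-classifier calls this helper from its statistics sync path when skip_to_bridge_ingress is enabled, which is why the export below is needed on an otherwise stock kernel.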
+ +Change-Id: I67afb325796004053897d9916e2df91827b65139 +Signed-off-by: Murat Sezgin +--- + include/linux/if_bridge.h | 2 ++ + net/bridge/br_if.c | 24 ++++++++++++++++++++++++ + 2 files changed, 26 insertions(+) + +diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h +index ef5661b..de79c00 100644 +--- a/include/linux/if_bridge.h ++++ b/include/linux/if_bridge.h +@@ -51,6 +51,8 @@ struct br_ip_list { + #define BR_DEFAULT_AGEING_TIME (300 * HZ) + + extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); ++extern void br_dev_update_stats(struct net_device *dev, ++ struct rtnl_link_stats64 *nlstats); + + typedef int br_should_route_hook_t(struct sk_buff *skb); + extern br_should_route_hook_t __rcu *br_should_route_hook; +diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c +index ec02f58..977415a 100644 +--- a/net/bridge/br_if.c ++++ b/net/bridge/br_if.c +@@ -588,3 +588,27 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) + if (mask & BR_AUTO_MASK) + nbp_update_port_count(br); + } ++ ++/* Update bridge statistics for bridge packets processed by offload engines */ ++void br_dev_update_stats(struct net_device *dev, ++ struct rtnl_link_stats64 *nlstats) ++{ ++ struct net_bridge *br; ++ struct pcpu_sw_netstats *stats; ++ ++ /* Is this a bridge? */ ++ if (!(dev->priv_flags & IFF_EBRIDGE)) ++ return; ++ ++ br = netdev_priv(dev); ++ stats = this_cpu_ptr(br->stats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ stats->rx_packets += nlstats->rx_packets; ++ stats->rx_bytes += nlstats->rx_bytes; ++ stats->tx_packets += nlstats->tx_packets; ++ stats->tx_bytes += nlstats->tx_bytes; ++ u64_stats_update_end(&stats->syncp); ++} ++EXPORT_SYMBOL_GPL(br_dev_update_stats); ++ +-- +2.7.4 + diff --git a/target/linux/generic/hack-4.4/952-net-conntrack-events-support-multiple-registrant.patch b/target/linux/generic/hack-4.4/952-net-conntrack-events-support-multiple-registrant.patch new file mode 100644 index 000000000000..54e7c07b8136 --- /dev/null +++ b/target/linux/generic/hack-4.4/952-net-conntrack-events-support-multiple-registrant.patch @@ -0,0 +1,318 @@ +From 42824d4b753f84ccf885eca602c5037338b546c8 Mon Sep 17 00:00:00 2001 +From: Zhi Chen +Date: Tue, 13 Jan 2015 14:28:18 -0800 +Subject: [PATCH 3/3] net: conntrack events, support multiple registrant + +Merging this patch from kernel 3.4: +This was supported by old (.28) kernel versions but removed +because of it's overhead. +But we need this feature for NA connection manager. Both ipv4 +and ipv6 modules needs to register themselves to ct events. 
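With CONFIG_NF_CONNTRACK_CHAIN_EVENTS the registration API added below takes a plain notifier_block on an atomic notifier chain instead of the kernel's single nf_ct_event_notifier slot, so ctnetlink and an sfe connection manager can listen at the same time. A minimal sketch of an extra listener, assuming the chain-based prototypes exactly as this patch declares them; the callback body is illustrative:

    #include <linux/notifier.h>
    #include <net/netfilter/nf_conntrack_ecache.h>

    /* events is the delivered event mask; ptr points at a struct nf_ct_event. */
    static int example_ct_event_cb(struct notifier_block *nb,
                                   unsigned long events, void *ptr)
    {
            struct nf_ct_event *item = ptr;

            if (events & (1 << IPCT_DESTROY))
                    pr_debug("conntrack %p destroyed\n", item->ct);

            return NOTIFY_DONE;
    }

    static struct notifier_block example_ct_nb = {
            .notifier_call = example_ct_event_cb,
    };

    /* module init/exit would then call:
     *   nf_conntrack_register_notifier(&init_net, &example_ct_nb);
     *   nf_conntrack_unregister_notifier(&init_net, &example_ct_nb);
     */

This is the same registration shape the sfe connection managers use, which is what allows them to coexist with netlink conntrack listeners.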
+ +Change-Id: Iebfb254590fb594f5baf232f849d1b7ae45ef757 +Signed-off-by: Zhi Chen +--- + include/net/netfilter/nf_conntrack_ecache.h | 42 ++++++++++++++++++- + include/net/netns/conntrack.h | 4 ++ + net/netfilter/Kconfig | 8 ++++ + net/netfilter/nf_conntrack_core.c | 4 ++ + net/netfilter/nf_conntrack_ecache.c | 63 +++++++++++++++++++++++++++++ + net/netfilter/nf_conntrack_netlink.c | 17 ++++++++ + 6 files changed, 137 insertions(+), 1 deletion(-) + +diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h +index 57c8803..f921354 100644 +--- a/include/net/netfilter/nf_conntrack_ecache.h ++++ b/include/net/netfilter/nf_conntrack_ecache.h +@@ -63,6 +63,10 @@ struct nf_ct_event { + int report; + }; + ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++extern int nf_conntrack_register_notifier(struct net *net, struct notifier_block *nb); ++extern int nf_conntrack_unregister_notifier(struct net *net, struct notifier_block *nb); ++#else + struct nf_ct_event_notifier { + int (*fcn)(unsigned int events, struct nf_ct_event *item); + }; +@@ -71,17 +75,20 @@ int nf_conntrack_register_notifier(struct net *net, + struct nf_ct_event_notifier *nb); + void nf_conntrack_unregister_notifier(struct net *net, + struct nf_ct_event_notifier *nb); ++#endif + + void nf_ct_deliver_cached_events(struct nf_conn *ct); + + static inline void + nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) + { +- struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *e; ++#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ struct net *net = nf_ct_net(ct); + + if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) + return; ++#endif + + e = nf_ct_ecache_find(ct); + if (e == NULL) +@@ -90,6 +97,38 @@ nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) + set_bit(event, &e->cache); + } + ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++static inline int ++nf_conntrack_eventmask_report(unsigned int eventmask, ++ struct nf_conn *ct, ++ u32 portid, ++ int report) ++{ ++ struct nf_conntrack_ecache *e; ++ struct net *net = nf_ct_net(ct); ++ ++ e = nf_ct_ecache_find(ct); ++ if (e == NULL) ++ return 0; ++ ++ if (nf_ct_is_confirmed(ct) && !nf_ct_is_dying(ct)) { ++ struct nf_ct_event item = { ++ .ct = ct, ++ .portid = e->portid ? e->portid : portid, ++ .report = report ++ }; ++ /* This is a resent of a destroy event? If so, skip missed */ ++ unsigned long missed = e->portid ? 
0 : e->missed; ++ ++ if (!((eventmask | missed) & e->ctmask)) ++ return 0; ++ ++ atomic_notifier_call_chain(&net->ct.nf_conntrack_chain, eventmask | missed, &item); ++ } ++ ++ return 0; ++} ++#else + static inline int + nf_conntrack_eventmask_report(unsigned int eventmask, + struct nf_conn *ct, +@@ -143,6 +182,7 @@ out_unlock: + rcu_read_unlock(); + return ret; + } ++#endif + + static inline int + nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct, +diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h +index 723b61c..fbb9c16 100644 +--- a/include/net/netns/conntrack.h ++++ b/include/net/netns/conntrack.h +@@ -100,7 +100,11 @@ struct netns_ct { + struct hlist_head *expect_hash; + struct ct_pcpu __percpu *pcpu_lists; + struct ip_conntrack_stat __percpu *stat; ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ struct atomic_notifier_head nf_conntrack_chain; ++#else + struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; ++#endif + struct nf_exp_event_notifier __rcu *nf_expect_event_cb; + struct nf_ip_net nf_ct_proto; + #if defined(CONFIG_NF_CONNTRACK_LABELS) +diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig +index 7da6cef..c374447 100644 +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -136,6 +136,14 @@ config NF_CONNTRACK_TIMEOUT + + If unsure, say `N'. + ++config NF_CONNTRACK_CHAIN_EVENTS ++ bool "Register multiple callbacks to ct events" ++ depends on NF_CONNTRACK_EVENTS ++ help ++ Support multiple registrations. ++ ++ If unsure, say `N'. ++ + config NF_CONNTRACK_TIMESTAMP + bool 'Connection tracking timestamping' + depends on NETFILTER_ADVANCED +diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c +index 86a3c6f..027c203 100644 +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@ -1817,6 +1817,10 @@ int nf_conntrack_init_net(struct net *net) + ret = nf_conntrack_proto_pernet_init(net); + if (ret < 0) + goto err_proto; ++ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ ATOMIC_INIT_NOTIFIER_HEAD(&net->ct.nf_conntrack_chain); ++#endif + return 0; + + err_proto: +diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c +index 4e78c57..d3e6773 100644 +--- a/net/netfilter/nf_conntrack_ecache.c ++++ b/net/netfilter/nf_conntrack_ecache.c +@@ -18,6 +18,9 @@ + #include + #include + #include ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++#include ++#endif + #include + #include + #include +@@ -115,6 +118,51 @@ static void ecache_work(struct work_struct *work) + + /* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++void nf_ct_deliver_cached_events(struct nf_conn *ct) ++{ ++ unsigned long events, missed; ++ struct nf_conntrack_ecache *e; ++ struct nf_ct_event item; ++ struct net *net = nf_ct_net(ct); ++ int ret = 0; ++ ++ e = nf_ct_ecache_find(ct); ++ if (e == NULL) ++ return; ++ ++ events = xchg(&e->cache, 0); ++ ++ if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events) ++ return; ++ ++ /* We make a copy of the missed event cache without taking ++ * the lock, thus we may send missed events twice. However, ++ * this does not harm and it happens very rarely. 
*/ ++ missed = e->missed; ++ ++ if (!((events | missed) & e->ctmask)) ++ return; ++ ++ item.ct = ct; ++ item.portid = 0; ++ item.report = 0; ++ ++ atomic_notifier_call_chain(&net->ct.nf_conntrack_chain, ++ events | missed, ++ &item); ++ ++ if (likely(ret >= 0 && !missed)) ++ return; ++ ++ spin_lock_bh(&ct->lock); ++ if (ret < 0) ++ e->missed |= events; ++ else ++ e->missed &= ~missed; ++ spin_unlock_bh(&ct->lock); ++} ++#else + void nf_ct_deliver_cached_events(struct nf_conn *ct) + { + struct net *net = nf_ct_net(ct); +@@ -165,8 +213,15 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) + out_unlock: + rcu_read_unlock(); + } ++#endif + EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); + ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++int nf_conntrack_register_notifier(struct net *net, struct notifier_block *nb) ++{ ++ return atomic_notifier_chain_register(&net->ct.nf_conntrack_chain, nb); ++} ++#else + int nf_conntrack_register_notifier(struct net *net, + struct nf_ct_event_notifier *new) + { +@@ -187,8 +242,15 @@ out_unlock: + mutex_unlock(&nf_ct_ecache_mutex); + return ret; + } ++#endif + EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); + ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++int nf_conntrack_unregister_notifier(struct net *net, struct notifier_block *nb) ++{ ++ return atomic_notifier_chain_unregister(&net->ct.nf_conntrack_chain, nb); ++} ++#else + void nf_conntrack_unregister_notifier(struct net *net, + struct nf_ct_event_notifier *new) + { +@@ -201,6 +263,7 @@ void nf_conntrack_unregister_notifier(struct net *net, + RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); + } ++#endif + EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); + + int nf_ct_expect_register_notifier(struct net *net, +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 9f52729..8d8f355 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -28,6 +28,9 @@ + #include + #include + #include ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++#include ++#endif + #include + + #include +@@ -629,14 +632,22 @@ ctnetlink_nlmsg_size(const struct nf_conn *ct) + } + + #ifdef CONFIG_NF_CONNTRACK_EVENTS ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++static int ctnetlink_conntrack_event(struct notifier_block *this, ++ unsigned long events, void *ptr) ++#else + static int + ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) ++#endif + { + const struct nf_conntrack_zone *zone; + struct net *net; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nlattr *nest_parms; ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ struct nf_ct_event *item = ptr; ++#endif + struct nf_conn *ct = item->ct; + struct sk_buff *skb; + unsigned int type; +@@ -3258,9 +3269,15 @@ ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb, + } + + #ifdef CONFIG_NF_CONNTRACK_EVENTS ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++static struct notifier_block ctnl_notifier = { ++ .notifier_call = ctnetlink_conntrack_event, ++}; ++#else + static struct nf_ct_event_notifier ctnl_notifier = { + .fcn = ctnetlink_conntrack_event, + }; ++#endif + + static struct nf_exp_event_notifier ctnl_notifier_exp = { + .fcn = ctnetlink_expect_event, +-- +2.7.4 + diff --git a/target/linux/generic/hack-4.9/950-net-patch-linux-kernel-to-support-shortcut-fe.patch b/target/linux/generic/hack-4.9/950-net-patch-linux-kernel-to-support-shortcut-fe.patch new file mode 100644 index 000000000000..e7844effdf9e --- /dev/null +++ 
b/target/linux/generic/hack-4.9/950-net-patch-linux-kernel-to-support-shortcut-fe.patch @@ -0,0 +1,140 @@ +From 5c43b9811dd88dd2d18d88a253212474bb665896 Mon Sep 17 00:00:00 2001 +From: Xiaoping Fan +Date: Fri, 26 Feb 2016 15:01:53 -0800 +Subject: [PATCH 1/3] net: patch linux kernel to support shortcut-fe + +1, add a new flag 'fast_forwarded' in skb structure. +2, put a hook in '__netif_receive_skb_core' to + deliver packet to shortcut-fe. + +Change-Id: Icaa7c172a06df1c3bc89ff89814d1136772fe217 +Signed-off-by: Xiaoping Fan + +msm: ipq806x: exporting TCP sequence check parameters + +This is for use in NSS connection manager. + +Change-Id: I01d30c0ab552308c439353c0d51d3d0ab3aa7699 +Signed-off-by: Pamidipati, Vijay +Signed-off-by: Murat Sezgin +--- + include/linux/skbuff.h | 3 +++ + net/Kconfig | 3 +++ + net/core/dev.c | 25 +++++++++++++++++++++++++ + net/netfilter/nf_conntrack_proto_tcp.c | 10 ++++++++++ + 4 files changed, 41 insertions(+) + +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index 9a0c945..56f4007 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -738,6 +738,9 @@ struct sk_buff { + #endif + __u8 ipvs_property:1; + __u8 inner_protocol_type:1; ++#ifdef CONFIG_SHORTCUT_FE ++ __u8 fast_forwarded:1; ++#endif + __u8 remcsum_offload:1; + #ifdef CONFIG_NET_SWITCHDEV + __u8 offload_fwd_mark:1; +diff --git a/net/Kconfig b/net/Kconfig +index b4621e1..6ba72cf 100644 +--- a/net/Kconfig ++++ b/net/Kconfig +@@ -444,3 +444,6 @@ config HAVE_CBPF_JIT + # Extended BPF JIT (eBPF) + config HAVE_EBPF_JIT + bool ++ ++config SHORTCUT_FE ++ bool "Enables kernel network stack path for Shortcut Forwarding Engine" +diff --git a/net/core/dev.c b/net/core/dev.c +index ca39cd2..74cd937 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -2929,8 +2929,17 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev, + unsigned int len; + int rc; + ++#ifdef CONFIG_SHORTCUT_FE ++ /* If this skb has been fast forwarded then we don't want it to ++ * go to any taps (by definition we're trying to bypass them). ++ */ ++ if (!skb->fast_forwarded) { ++#endif + if (!list_empty(&ptype_all) || !list_empty(&dev->ptype_all)) + dev_queue_xmit_nit(skb, dev); ++#ifdef CONFIG_SHORTCUT_FE ++ } ++#endif + + #ifdef CONFIG_ETHERNET_PACKET_MANGLE + if (!dev->eth_mangle_tx || +@@ -4061,6 +4070,11 @@ void netdev_rx_handler_unregister(struct net_device *dev) + } + EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister); + ++#ifdef CONFIG_SHORTCUT_FE ++int (*fast_nat_recv)(struct sk_buff *skb) __rcu __read_mostly; ++EXPORT_SYMBOL_GPL(fast_nat_recv); ++#endif ++ + /* + * Limit the use of PFMEMALLOC reserves to those protocols that implement + * the special handling of PFMEMALLOC skbs. 
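The fast_nat_recv pointer exported above is the single hand-off point between the kernel and the sfe modules: __netif_receive_skb_core() (next hunk) dereferences it under RCU and, if the callback consumes the skb, short-circuits taps, classifiers, and the IP stack. A sketch of how a connection manager attaches to the hook; sfe-cm and fast-classifier do the equivalent, and the function names here are illustrative:

    #include <linux/module.h>
    #include <linux/rcupdate.h>
    #include <linux/skbuff.h>

    /* Declared in net/core/dev.c by this patch. */
    extern int (*fast_nat_recv)(struct sk_buff *skb) __rcu;

    /* Return 1 if the skb was consumed (forwarded by the engine),
     * 0 to let it continue through the normal network stack. */
    static int example_fast_recv(struct sk_buff *skb)
    {
            /* look up the flow, rewrite headers, transmit ... */
            return 0;
    }

    static int __init example_cm_init(void)
    {
            RCU_INIT_POINTER(fast_nat_recv, example_fast_recv);
            return 0;
    }

    static void __exit example_cm_exit(void)
    {
            RCU_INIT_POINTER(fast_nat_recv, NULL);
            rcu_barrier();  /* let in-flight readers drain before unload */
    }

    module_init(example_cm_init);
    module_exit(example_cm_exit);
    MODULE_LICENSE("GPL");

On the transmit side the engine sets skb->fast_forwarded before xmit, so xmit_one() (patched earlier) skips the taps for packets that already bypassed the receive path.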
+@@ -4108,6 +4122,9 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) + bool deliver_exact = false; + int ret = NET_RX_DROP; + __be16 type; ++#ifdef CONFIG_SHORTCUT_FE ++ int (*fast_recv)(struct sk_buff *skb); ++#endif + + net_timestamp_check(!netdev_tstamp_prequeue, skb); + +@@ -4134,6 +4151,14 @@ static int __netif_receive_skb_core(struct sk_buff *skb, bool pfmemalloc) + goto out; + } + ++#ifdef CONFIG_SHORTCUT_FE ++ fast_recv = rcu_dereference(fast_nat_recv); ++ if (fast_recv && fast_recv(skb)) { ++ ret = NET_RX_SUCCESS; ++ goto out; ++ } ++#endif ++ + #ifdef CONFIG_NET_CLS_ACT + if (skb->tc_verd & TC_NCLS) { + skb->tc_verd = CLR_TC_NCLS(skb->tc_verd); +diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c +index f24b626..4581481 100644 +--- a/net/netfilter/nf_conntrack_proto_tcp.c ++++ b/net/netfilter/nf_conntrack_proto_tcp.c +@@ -34,12 +34,22 @@ + #include + + /* Do not check the TCP window for incoming packets */ ++#ifdef CONFIG_SHORTCUT_FE ++int nf_ct_tcp_no_window_check __read_mostly = 0; ++EXPORT_SYMBOL_GPL(nf_ct_tcp_no_window_check); ++#else + static int nf_ct_tcp_no_window_check __read_mostly = 1; ++#endif + + /* "Be conservative in what you do, + be liberal in what you accept from others." + If it's non-zero, we mark only out of window RST segments as INVALID. */ ++#ifdef CONFIG_SHORTCUT_FE ++int nf_ct_tcp_be_liberal __read_mostly = 0; ++EXPORT_SYMBOL_GPL(nf_ct_tcp_be_liberal); ++#else + static int nf_ct_tcp_be_liberal __read_mostly = 0; ++#endif + + /* If it is set to zero, we disable picking up already established + connections. */ +-- +2.7.4 + diff --git a/target/linux/generic/hack-4.9/951-bridge-Add-new-bridge-APIs-needed-for-network-HW-acc.patch b/target/linux/generic/hack-4.9/951-bridge-Add-new-bridge-APIs-needed-for-network-HW-acc.patch new file mode 100644 index 000000000000..5cc7dabad7b1 --- /dev/null +++ b/target/linux/generic/hack-4.9/951-bridge-Add-new-bridge-APIs-needed-for-network-HW-acc.patch @@ -0,0 +1,74 @@ +From cdabe46cd62d22266aed7fdae36f72a36e3558e7 Mon Sep 17 00:00:00 2001 +From: Murat Sezgin +Date: Tue, 25 Nov 2014 17:22:24 -0800 +Subject: [PATCH 2/3] bridge: Add new bridge APIs needed for network HW + acceleration + +Bridge acceleration hardware needs to perform certain operations, + currently unsupported by the existing bridge code: + + ** cut ** + + *update bridge interface statistics from outside the bridge code: + once acceleration is enabled on a connection, packets will not flow + through the host CPU, so we need the hardware accelerator driver to + maintain the statistics on the host and update them and add whatever + flows through the hardware. + These change adds the corresponding functions, and make it available + to other through EXPORT_SYMBOLS(). 
+ +Change-Id: I67afb325796004053897d9916e2df91827b65139 +Signed-off-by: Murat Sezgin +--- + include/linux/if_bridge.h | 1 + + net/bridge/br_if.c | 25 +++++++++++++++++++++++++ + 2 files changed, 26 insertions(+) + +diff --git a/include/linux/if_bridge.h b/include/linux/if_bridge.h +index 0911c8c..4473f62 100644 +--- a/include/linux/if_bridge.h ++++ b/include/linux/if_bridge.h +@@ -52,6 +52,7 @@ struct br_ip_list { + #define BR_DEFAULT_AGEING_TIME (300 * HZ) + + extern void brioctl_set(int (*ioctl_hook)(struct net *, unsigned int, void __user *)); ++extern void br_dev_update_stats(struct net_device *dev, struct rtnl_link_stats64 *nlstats); + + typedef int br_should_route_hook_t(struct sk_buff *skb); + extern br_should_route_hook_t __rcu *br_should_route_hook; +diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c +index ed0dd33..342b2d9 100644 +--- a/net/bridge/br_if.c ++++ b/net/bridge/br_if.c +@@ -655,3 +655,28 @@ void br_port_flags_change(struct net_bridge_port *p, unsigned long mask) + if (mask & BR_AUTO_MASK) + nbp_update_port_count(br); + } ++ ++/* Update bridge statistics for bridge packets processed by offload engines */ ++void br_dev_update_stats(struct net_device *dev, struct rtnl_link_stats64 *nlstats) ++{ ++ struct net_bridge *br; ++ struct pcpu_sw_netstats *stats; ++ ++ /* ++ * Is this a bridge? ++ */ ++ if (!(dev->priv_flags & IFF_EBRIDGE)) { ++ return; ++ } ++ ++ br = netdev_priv(dev); ++ stats = this_cpu_ptr(br->stats); ++ ++ u64_stats_update_begin(&stats->syncp); ++ stats->rx_packets += nlstats->rx_packets; ++ stats->rx_bytes += nlstats->rx_bytes; ++ stats->tx_packets += nlstats->tx_packets; ++ stats->tx_bytes += nlstats->tx_bytes; ++ u64_stats_update_end(&stats->syncp); ++} ++EXPORT_SYMBOL_GPL(br_dev_update_stats); +-- +2.7.4 + diff --git a/target/linux/generic/hack-4.9/952-net-conntrack-events-support-multiple-registrant.patch b/target/linux/generic/hack-4.9/952-net-conntrack-events-support-multiple-registrant.patch new file mode 100644 index 000000000000..a898c4a695da --- /dev/null +++ b/target/linux/generic/hack-4.9/952-net-conntrack-events-support-multiple-registrant.patch @@ -0,0 +1,345 @@ +From 56294a39f14712f868a495f28be6b2e92f0a48c6 Mon Sep 17 00:00:00 2001 +From: Zhi Chen +Date: Tue, 13 Jan 2015 14:28:18 -0800 +Subject: [PATCH 3/3] net: conntrack events, support multiple registrant + +Merging this patch from kernel 3.4: +This was supported by old (.28) kernel versions but removed +because of it's overhead. +But we need this feature for NA connection manager. Both ipv4 +and ipv6 modules needs to register themselves to ct events. 
+ +Change-Id: Iebfb254590fb594f5baf232f849d1b7ae45ef757 +Signed-off-by: Zhi Chen +--- + include/net/netfilter/nf_conntrack_ecache.h | 13 +++- + include/net/netns/conntrack.h | 4 ++ + net/netfilter/Kconfig | 8 +++ + net/netfilter/nf_conntrack_core.c | 4 ++ + net/netfilter/nf_conntrack_ecache.c | 92 +++++++++++++++++++++++++++++ + net/netfilter/nf_conntrack_netlink.c | 19 ++++++ + 6 files changed, 139 insertions(+), 1 deletion(-) + +diff --git a/include/net/netfilter/nf_conntrack_ecache.h b/include/net/netfilter/nf_conntrack_ecache.h +index 12d967b..c2b98b6 100644 +--- a/include/net/netfilter/nf_conntrack_ecache.h ++++ b/include/net/netfilter/nf_conntrack_ecache.h +@@ -70,6 +70,10 @@ struct nf_ct_event { + int report; + }; + ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++extern int nf_conntrack_register_notifier(struct net *net, struct notifier_block *nb); ++extern int nf_conntrack_unregister_notifier(struct net *net, struct notifier_block *nb); ++#else + struct nf_ct_event_notifier { + int (*fcn)(unsigned int events, struct nf_ct_event *item); + }; +@@ -78,6 +82,7 @@ int nf_conntrack_register_notifier(struct net *net, + struct nf_ct_event_notifier *nb); + void nf_conntrack_unregister_notifier(struct net *net, + struct nf_ct_event_notifier *nb); ++#endif + + void nf_ct_deliver_cached_events(struct nf_conn *ct); + int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, +@@ -86,11 +91,13 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, + static inline void + nf_conntrack_event_cache(enum ip_conntrack_events event, struct nf_conn *ct) + { +- struct net *net = nf_ct_net(ct); + struct nf_conntrack_ecache *e; ++#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ struct net *net = nf_ct_net(ct); + + if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) + return; ++#endif + + e = nf_ct_ecache_find(ct); + if (e == NULL) +@@ -103,10 +110,12 @@ static inline int + nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct, + u32 portid, int report) + { ++#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + const struct net *net = nf_ct_net(ct); + + if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) + return 0; ++#endif + + return nf_conntrack_eventmask_report(1 << event, ct, portid, report); + } +@@ -114,10 +123,12 @@ nf_conntrack_event_report(enum ip_conntrack_events event, struct nf_conn *ct, + static inline int + nf_conntrack_event(enum ip_conntrack_events event, struct nf_conn *ct) + { ++#ifndef CONFIG_NF_CONNTRACK_CHAIN_EVENTS + const struct net *net = nf_ct_net(ct); + + if (!rcu_access_pointer(net->ct.nf_conntrack_event_cb)) + return 0; ++#endif + + return nf_conntrack_eventmask_report(1 << event, ct, 0, 0); + } +diff --git a/include/net/netns/conntrack.h b/include/net/netns/conntrack.h +index e469e85..1d31db8 100644 +--- a/include/net/netns/conntrack.h ++++ b/include/net/netns/conntrack.h +@@ -86,7 +86,11 @@ struct netns_ct { + + struct ct_pcpu __percpu *pcpu_lists; + struct ip_conntrack_stat __percpu *stat; ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ struct atomic_notifier_head nf_conntrack_chain; ++#else + struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb; ++#endif + struct nf_exp_event_notifier __rcu *nf_expect_event_cb; + struct nf_ip_net nf_ct_proto; + #if defined(CONFIG_NF_CONNTRACK_LABELS) +diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig +index 63073be..08d7aab 100644 +--- a/net/netfilter/Kconfig ++++ b/net/netfilter/Kconfig +@@ -136,6 +136,14 @@ config NF_CONNTRACK_TIMEOUT + + If unsure, say `N'. 
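The hunk below adds the NF_CONNTRACK_CHAIN_EVENTS entry, pairing with CONFIG_SHORTCUT_FE from patch 950. A build that actually runs sfe with sfe-cm or fast-classifier flips the defaults this commit leaves disabled; a sketch of the relevant target config lines, not a complete config:

    CONFIG_SHORTCUT_FE=y
    CONFIG_NF_CONNTRACK=y
    CONFIG_NF_CONNTRACK_EVENTS=y
    CONFIG_NF_CONNTRACK_CHAIN_EVENTS=y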
+ ++config NF_CONNTRACK_CHAIN_EVENTS ++ bool "Register multiple callbacks to ct events" ++ depends on NF_CONNTRACK_EVENTS ++ help ++ Support multiple registrations. ++ ++ If unsure, say `N'. ++ + config NF_CONNTRACK_TIMESTAMP + bool 'Connection tracking timestamping' + depends on NETFILTER_ADVANCED +diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c +index 6bd1508..9b81c7c 100644 +--- a/net/netfilter/nf_conntrack_core.c ++++ b/net/netfilter/nf_conntrack_core.c +@@ -1998,6 +1998,10 @@ int nf_conntrack_init_net(struct net *net) + ret = nf_conntrack_proto_pernet_init(net); + if (ret < 0) + goto err_proto; ++ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ ATOMIC_INIT_NOTIFIER_HEAD(&net->ct.nf_conntrack_chain); ++#endif + return 0; + + err_proto: +diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c +index da9df2d..e0e2a8f 100644 +--- a/net/netfilter/nf_conntrack_ecache.c ++++ b/net/netfilter/nf_conntrack_ecache.c +@@ -18,6 +18,9 @@ + #include + #include + #include ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++#include ++#endif + #include + #include + #include +@@ -117,6 +120,38 @@ static void ecache_work(struct work_struct *work) + schedule_delayed_work(&ctnet->ecache_dwork, delay); + } + ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++int ++nf_conntrack_eventmask_report(unsigned int eventmask, ++ struct nf_conn *ct, ++ u32 portid, ++ int report) ++{ ++ struct nf_conntrack_ecache *e; ++ struct net *net = nf_ct_net(ct); ++ ++ e = nf_ct_ecache_find(ct); ++ if (e == NULL) ++ return 0; ++ ++ if (nf_ct_is_confirmed(ct)) { ++ struct nf_ct_event item = { ++ .ct = ct, ++ .portid = e->portid ? e->portid : portid, ++ .report = report ++ }; ++ /* This is a resent of a destroy event? If so, skip missed */ ++ unsigned long missed = e->portid ? 0 : e->missed; ++ ++ if (!((eventmask | missed) & e->ctmask)) ++ return 0; ++ ++ atomic_notifier_call_chain(&net->ct.nf_conntrack_chain, eventmask | missed, &item); ++ } ++ ++ return 0; ++} ++#else + int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, + u32 portid, int report) + { +@@ -171,10 +206,52 @@ int nf_conntrack_eventmask_report(unsigned int eventmask, struct nf_conn *ct, + rcu_read_unlock(); + return ret; + } ++#endif + EXPORT_SYMBOL_GPL(nf_conntrack_eventmask_report); + + /* deliver cached events and clear cache entry - must be called with locally + * disabled softirqs */ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++void nf_ct_deliver_cached_events(struct nf_conn *ct) ++{ ++ unsigned long events, missed; ++ struct nf_conntrack_ecache *e; ++ struct nf_ct_event item; ++ struct net *net = nf_ct_net(ct); ++ ++ e = nf_ct_ecache_find(ct); ++ if (e == NULL) ++ return; ++ ++ events = xchg(&e->cache, 0); ++ ++ if (!nf_ct_is_confirmed(ct) || nf_ct_is_dying(ct) || !events) ++ return; ++ ++ /* We make a copy of the missed event cache without taking ++ * the lock, thus we may send missed events twice. However, ++ * this does not harm and it happens very rarely. 
*/ ++ missed = e->missed; ++ ++ if (!((events | missed) & e->ctmask)) ++ return; ++ ++ item.ct = ct; ++ item.portid = 0; ++ item.report = 0; ++ ++ atomic_notifier_call_chain(&net->ct.nf_conntrack_chain, ++ events | missed, ++ &item); ++ ++ if (likely(!missed)) ++ return; ++ ++ spin_lock_bh(&ct->lock); ++ e->missed &= ~missed; ++ spin_unlock_bh(&ct->lock); ++} ++#else + void nf_ct_deliver_cached_events(struct nf_conn *ct) + { + struct net *net = nf_ct_net(ct); +@@ -225,6 +302,7 @@ void nf_ct_deliver_cached_events(struct nf_conn *ct) + out_unlock: + rcu_read_unlock(); + } ++#endif + EXPORT_SYMBOL_GPL(nf_ct_deliver_cached_events); + + void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, +@@ -257,6 +335,12 @@ void nf_ct_expect_event_report(enum ip_conntrack_expect_events event, + rcu_read_unlock(); + } + ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++int nf_conntrack_register_notifier(struct net *net, struct notifier_block *nb) ++{ ++ return atomic_notifier_chain_register(&net->ct.nf_conntrack_chain, nb); ++} ++#else + int nf_conntrack_register_notifier(struct net *net, + struct nf_ct_event_notifier *new) + { +@@ -277,8 +361,15 @@ int nf_conntrack_register_notifier(struct net *net, + mutex_unlock(&nf_ct_ecache_mutex); + return ret; + } ++#endif + EXPORT_SYMBOL_GPL(nf_conntrack_register_notifier); + ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++int nf_conntrack_unregister_notifier(struct net *net, struct notifier_block *nb) ++{ ++ return atomic_notifier_chain_unregister(&net->ct.nf_conntrack_chain, nb); ++} ++#else + void nf_conntrack_unregister_notifier(struct net *net, + struct nf_ct_event_notifier *new) + { +@@ -291,6 +382,7 @@ void nf_conntrack_unregister_notifier(struct net *net, + RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); + mutex_unlock(&nf_ct_ecache_mutex); + } ++#endif + EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); + + int nf_ct_expect_register_notifier(struct net *net, +diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c +index 04111c1..8c741f7 100644 +--- a/net/netfilter/nf_conntrack_netlink.c ++++ b/net/netfilter/nf_conntrack_netlink.c +@@ -28,6 +28,11 @@ + #include + #include + #include ++ ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++#include ++#endif ++ + #include + + #include +@@ -615,14 +620,22 @@ static size_t ctnetlink_nlmsg_size(const struct nf_conn *ct) + ; + } + ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++static int ctnetlink_conntrack_event(struct notifier_block *this, ++ unsigned long events, void *ptr) ++#else + static int + ctnetlink_conntrack_event(unsigned int events, struct nf_ct_event *item) ++#endif + { + const struct nf_conntrack_zone *zone; + struct net *net; + struct nlmsghdr *nlh; + struct nfgenmsg *nfmsg; + struct nlattr *nest_parms; ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++ struct nf_ct_event *item = (struct nf_ct_event *)ptr; ++#endif + struct nf_conn *ct = item->ct; + struct sk_buff *skb; + unsigned int type; +@@ -3260,9 +3273,15 @@ static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl, + } + + #ifdef CONFIG_NF_CONNTRACK_EVENTS ++#ifdef CONFIG_NF_CONNTRACK_CHAIN_EVENTS ++static struct notifier_block ctnl_notifier = { ++ .notifier_call = ctnetlink_conntrack_event, ++}; ++#else + static struct nf_ct_event_notifier ctnl_notifier = { + .fcn = ctnetlink_conntrack_event, + }; ++#endif + + static struct nf_exp_event_notifier ctnl_notifier_exp = { + .fcn = ctnetlink_expect_event, +-- +2.7.4 +
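
For reviewers, a minimal consumer sketch (not part of any patch above) of the two
APIs these 4.9 hacks export: the chained conntrack event notifier from patch 952
and br_dev_update_stats() from patch 951. This is roughly the pattern sfe_cm and
fast-classifier follow internally; everything prefixed "example_" is hypothetical,
and the module assumes a kernel built with CONFIG_NF_CONNTRACK_CHAIN_EVENTS=y.

/* example_offload.c - hypothetical consumer of the SFE kernel hooks */
#include <linux/module.h>
#include <linux/notifier.h>
#include <linux/if_bridge.h>
#include <net/net_namespace.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_ecache.h>

/* Called for every conntrack event; other listeners (e.g. ctnetlink)
 * keep working because the atomic notifier chain allows multiple
 * registrants. */
static int example_ct_event(struct notifier_block *this,
			    unsigned long events, void *ptr)
{
	struct nf_ct_event *item = ptr;
	struct nf_conn *ct = item->ct;

	if (events & (1 << IPCT_DESTROY))
		pr_debug("example: conntrack %p destroyed, flush offload rule\n", ct);

	return NOTIFY_DONE;
}

static struct notifier_block example_ct_nb = {
	.notifier_call = example_ct_event,
};

/* Push the counters of an offloaded bridged flow back into the bridge
 * device, since offloaded packets never traverse the bridge code and
 * would otherwise leave its statistics stale. */
static void example_sync_bridge_stats(struct net_device *br_dev,
				      u64 rx_packets, u64 rx_bytes)
{
	struct rtnl_link_stats64 nlstats = {
		.rx_packets = rx_packets,
		.rx_bytes   = rx_bytes,
	};

	br_dev_update_stats(br_dev, &nlstats);
}

static int __init example_init(void)
{
	/* With CONFIG_NF_CONNTRACK_CHAIN_EVENTS this succeeds even if
	 * ctnetlink has already registered its own notifier. */
	return nf_conntrack_register_notifier(&init_net, &example_ct_nb);
}

static void __exit example_exit(void)
{
	nf_conntrack_unregister_notifier(&init_net, &example_ct_nb);
}

module_init(example_init);
module_exit(example_exit);
MODULE_LICENSE("GPL");

Loading such a module alongside ctnetlink is exactly the coexistence case patch
952 enables: without CONFIG_NF_CONNTRACK_CHAIN_EVENTS, the second
nf_conntrack_register_notifier() call would fail with -EBUSY because the stock
kernel allows only a single event callback per netns.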