瀏覽代碼

Merge branch 'XDP-redirect-memory-return-API'

Jesper Dangaard Brouer says:

====================
XDP redirect memory return API

Submitted against net-next, as it contains NIC driver changes.

This patchset works towards supporting different XDP RX-ring memory
allocators.  As this will be needed by the AF_XDP zero-copy mode.

The patchset uses mlx5 as the sample driver, which gets implemented
XDP_REDIRECT RX-mode, but not ndo_xdp_xmit (as this API is subject to
change thought the patchset).

A new struct xdp_frame is introduced (modeled after cpumap xdp_pkt).
And both ndo_xdp_xmit and the new xdp_return_frame end-up using this.

Support for a driver supplied allocator is implemented, and a
refurbished version of page_pool is the first return allocator type
introduced.  This will be a integration point for AF_XDP zero-copy.

The mlx5 driver evolve into using the page_pool, and see a performance
increase (with ndo_xdp_xmit out ixgbe driver) from 6Mpps to 12Mpps.

The patchset stop at 16 patches (one over limit), but more API changes
are planned.  Specifically extending ndo_xdp_xmit and xdp_return_frame
APIs to support bulking.  As this will address some known limits.

V2: Updated according to Tariq's feedback
V3: Updated based on feedback from Jason Wang and Alex Duyck
V4: Updated based on feedback from Tariq and Jason
V5: Fix SPDX license, add Tariq's reviews, improve patch desc for perf test
V6: Updated based on feedback from Eric Dumazet and Alex Duyck
V7: Adapt to i40e that got XDP_REDIRECT support in-between
V8:
 Updated based on feedback kbuild test robot, and adjust for mlx5 changes
 page_pool only compiled into kernel when drivers Kconfig 'select' feature
V9:
 Remove some inline statements, let compiler decide what to inline
 Fix return value in virtio_net driver
 Adjust for mlx5 changes in-between submissions
V10:
 Minor adjust for mlx5 requested by Tariq
 Resubmit against net-next
V11: avoid leaking info stored in frame data on page reuse
====================

Acked-by: Alexei Starovoitov <ast@kernel.org>
Signed-off-by: David S. Miller <davem@davemloft.net>
David S. Miller 7 年之前
父節點
當前提交
684009d4fd

+ 23 - 10
drivers/net/ethernet/intel/i40e/i40e_txrx.c

@@ -638,7 +638,7 @@ static void i40e_unmap_and_free_tx_resource(struct i40e_ring *ring,
 		if (tx_buffer->tx_flags & I40E_TX_FLAGS_FD_SB)
 			kfree(tx_buffer->raw_buf);
 		else if (ring_is_xdp(ring))
-			page_frag_free(tx_buffer->raw_buf);
+			xdp_return_frame(tx_buffer->xdpf);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 		if (dma_unmap_len(tx_buffer, len))
@@ -841,7 +841,7 @@ static bool i40e_clean_tx_irq(struct i40e_vsi *vsi,
 
 		/* free the skb/XDP data */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buf->raw_buf);
+			xdp_return_frame(tx_buf->xdpf);
 		else
 			napi_consume_skb(tx_buf->skb, napi_budget);
 
@@ -2203,9 +2203,20 @@ static bool i40e_is_non_eop(struct i40e_ring *rx_ring,
 #define I40E_XDP_CONSUMED 1
 #define I40E_XDP_TX 2
 
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
 			      struct i40e_ring *xdp_ring);
 
+static int i40e_xmit_xdp_tx_ring(struct xdp_buff *xdp,
+				 struct i40e_ring *xdp_ring)
+{
+	struct xdp_frame *xdpf = convert_to_xdp_frame(xdp);
+
+	if (unlikely(!xdpf))
+		return I40E_XDP_CONSUMED;
+
+	return i40e_xmit_xdp_ring(xdpf, xdp_ring);
+}
+
 /**
  * i40e_run_xdp - run an XDP program
  * @rx_ring: Rx ring being processed
@@ -2225,13 +2236,15 @@ static struct sk_buff *i40e_run_xdp(struct i40e_ring *rx_ring,
 	if (!xdp_prog)
 		goto xdp_out;
 
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
 		break;
 	case XDP_TX:
 		xdp_ring = rx_ring->vsi->xdp_rings[rx_ring->queue_index];
-		result = i40e_xmit_xdp_ring(xdp, xdp_ring);
+		result = i40e_xmit_xdp_tx_ring(xdp, xdp_ring);
 		break;
 	case XDP_REDIRECT:
 		err = xdp_do_redirect(rx_ring->netdev, xdp, xdp_prog);
@@ -3478,13 +3491,13 @@ dma_error:
  * @xdp: data to transmit
  * @xdp_ring: XDP Tx ring
  **/
-static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
+static int i40e_xmit_xdp_ring(struct xdp_frame *xdpf,
 			      struct i40e_ring *xdp_ring)
 {
-	u32 size = xdp->data_end - xdp->data;
 	u16 i = xdp_ring->next_to_use;
 	struct i40e_tx_buffer *tx_bi;
 	struct i40e_tx_desc *tx_desc;
+	u32 size = xdpf->len;
 	dma_addr_t dma;
 
 	if (!unlikely(I40E_DESC_UNUSED(xdp_ring))) {
@@ -3492,14 +3505,14 @@ static int i40e_xmit_xdp_ring(struct xdp_buff *xdp,
 		return I40E_XDP_CONSUMED;
 	}
 
-	dma = dma_map_single(xdp_ring->dev, xdp->data, size, DMA_TO_DEVICE);
+	dma = dma_map_single(xdp_ring->dev, xdpf->data, size, DMA_TO_DEVICE);
 	if (dma_mapping_error(xdp_ring->dev, dma))
 		return I40E_XDP_CONSUMED;
 
 	tx_bi = &xdp_ring->tx_bi[i];
 	tx_bi->bytecount = size;
 	tx_bi->gso_segs = 1;
-	tx_bi->raw_buf = xdp->data;
+	tx_bi->xdpf = xdpf;
 
 	/* record length, and DMA address */
 	dma_unmap_len_set(tx_bi, len, size);
@@ -3675,7 +3688,7 @@ netdev_tx_t i40e_lan_xmit_frame(struct sk_buff *skb, struct net_device *netdev)
  *
  * Returns Zero if sent, else an error code
  **/
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
 	struct i40e_netdev_priv *np = netdev_priv(dev);
 	unsigned int queue_index = smp_processor_id();
@@ -3688,7 +3701,7 @@ int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (!i40e_enabled_xdp_vsi(vsi) || queue_index >= vsi->num_queue_pairs)
 		return -ENXIO;
 
-	err = i40e_xmit_xdp_ring(xdp, vsi->xdp_rings[queue_index]);
+	err = i40e_xmit_xdp_ring(xdpf, vsi->xdp_rings[queue_index]);
 	if (err != I40E_XDP_TX)
 		return -ENOSPC;
 

+ 2 - 1
drivers/net/ethernet/intel/i40e/i40e_txrx.h

@@ -306,6 +306,7 @@ static inline unsigned int i40e_txd_use_count(unsigned int size)
 struct i40e_tx_buffer {
 	struct i40e_tx_desc *next_to_watch;
 	union {
+		struct xdp_frame *xdpf;
 		struct sk_buff *skb;
 		void *raw_buf;
 	};
@@ -510,7 +511,7 @@ u32 i40e_get_tx_pending(struct i40e_ring *ring, bool in_sw);
 void i40e_detect_recover_hung(struct i40e_vsi *vsi);
 int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size);
 bool __i40e_chk_linearize(struct sk_buff *skb);
-int i40e_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp);
+int i40e_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf);
 void i40e_xdp_flush(struct net_device *dev);
 
 /**

+ 1 - 2
drivers/net/ethernet/intel/ixgbe/ixgbe.h

@@ -241,8 +241,7 @@ struct ixgbe_tx_buffer {
 	unsigned long time_stamp;
 	union {
 		struct sk_buff *skb;
-		/* XDP uses address ptr on irq_clean */
-		void *data;
+		struct xdp_frame *xdpf;
 	};
 	unsigned int bytecount;
 	unsigned short gso_segs;

+ 27 - 11
drivers/net/ethernet/intel/ixgbe/ixgbe_main.c

@@ -1216,7 +1216,7 @@ static bool ixgbe_clean_tx_irq(struct ixgbe_q_vector *q_vector,
 
 		/* free the skb */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->xdpf);
 		else
 			napi_consume_skb(tx_buffer->skb, napi_budget);
 
@@ -2262,7 +2262,7 @@ static struct sk_buff *ixgbe_build_skb(struct ixgbe_ring *rx_ring,
 #define IXGBE_XDP_TX 2
 
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
-			       struct xdp_buff *xdp);
+			       struct xdp_frame *xdpf);
 
 static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 				     struct ixgbe_ring *rx_ring,
@@ -2270,6 +2270,7 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 {
 	int err, result = IXGBE_XDP_PASS;
 	struct bpf_prog *xdp_prog;
+	struct xdp_frame *xdpf;
 	u32 act;
 
 	rcu_read_lock();
@@ -2278,12 +2279,19 @@ static struct sk_buff *ixgbe_run_xdp(struct ixgbe_adapter *adapter,
 	if (!xdp_prog)
 		goto xdp_out;
 
+	prefetchw(xdp->data_hard_start); /* xdp_frame write */
+
 	act = bpf_prog_run_xdp(xdp_prog, xdp);
 	switch (act) {
 	case XDP_PASS:
 		break;
 	case XDP_TX:
-		result = ixgbe_xmit_xdp_ring(adapter, xdp);
+		xdpf = convert_to_xdp_frame(xdp);
+		if (unlikely(!xdpf)) {
+			result = IXGBE_XDP_CONSUMED;
+			break;
+		}
+		result = ixgbe_xmit_xdp_ring(adapter, xdpf);
 		break;
 	case XDP_REDIRECT:
 		err = xdp_do_redirect(adapter->netdev, xdp, xdp_prog);
@@ -5797,7 +5805,7 @@ static void ixgbe_clean_tx_ring(struct ixgbe_ring *tx_ring)
 
 		/* Free all the Tx ring sk_buffs */
 		if (ring_is_xdp(tx_ring))
-			page_frag_free(tx_buffer->data);
+			xdp_return_frame(tx_buffer->xdpf);
 		else
 			dev_kfree_skb_any(tx_buffer->skb);
 
@@ -6370,7 +6378,7 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 	struct device *dev = rx_ring->dev;
 	int orig_node = dev_to_node(dev);
 	int ring_node = -1;
-	int size;
+	int size, err;
 
 	size = sizeof(struct ixgbe_rx_buffer) * rx_ring->count;
 
@@ -6407,6 +6415,13 @@ int ixgbe_setup_rx_resources(struct ixgbe_adapter *adapter,
 			     rx_ring->queue_index) < 0)
 		goto err;
 
+	err = xdp_rxq_info_reg_mem_model(&rx_ring->xdp_rxq,
+					 MEM_TYPE_PAGE_SHARED, NULL);
+	if (err) {
+		xdp_rxq_info_unreg(&rx_ring->xdp_rxq);
+		goto err;
+	}
+
 	rx_ring->xdp_prog = adapter->xdp_prog;
 
 	return 0;
@@ -8336,7 +8351,7 @@ static u16 ixgbe_select_queue(struct net_device *dev, struct sk_buff *skb,
 }
 
 static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
-			       struct xdp_buff *xdp)
+			       struct xdp_frame *xdpf)
 {
 	struct ixgbe_ring *ring = adapter->xdp_ring[smp_processor_id()];
 	struct ixgbe_tx_buffer *tx_buffer;
@@ -8345,12 +8360,12 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 	dma_addr_t dma;
 	u16 i;
 
-	len = xdp->data_end - xdp->data;
+	len = xdpf->len;
 
 	if (unlikely(!ixgbe_desc_unused(ring)))
 		return IXGBE_XDP_CONSUMED;
 
-	dma = dma_map_single(ring->dev, xdp->data, len, DMA_TO_DEVICE);
+	dma = dma_map_single(ring->dev, xdpf->data, len, DMA_TO_DEVICE);
 	if (dma_mapping_error(ring->dev, dma))
 		return IXGBE_XDP_CONSUMED;
 
@@ -8365,7 +8380,8 @@ static int ixgbe_xmit_xdp_ring(struct ixgbe_adapter *adapter,
 
 	dma_unmap_len_set(tx_buffer, len, len);
 	dma_unmap_addr_set(tx_buffer, dma, dma);
-	tx_buffer->data = xdp->data;
+	tx_buffer->xdpf = xdpf;
+
 	tx_desc->read.buffer_addr = cpu_to_le64(dma);
 
 	/* put descriptor type bits */
@@ -9996,7 +10012,7 @@ static int ixgbe_xdp(struct net_device *dev, struct netdev_bpf *xdp)
 	}
 }
 
-static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
 	struct ixgbe_adapter *adapter = netdev_priv(dev);
 	struct ixgbe_ring *ring;
@@ -10012,7 +10028,7 @@ static int ixgbe_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (unlikely(!ring))
 		return -ENXIO;
 
-	err = ixgbe_xmit_xdp_ring(adapter, xdp);
+	err = ixgbe_xmit_xdp_ring(adapter, xdpf);
 	if (err != IXGBE_XDP_TX)
 		return -ENOSPC;
 

+ 1 - 0
drivers/net/ethernet/mellanox/mlx5/core/Kconfig

@@ -30,6 +30,7 @@ config MLX5_CORE_EN
 	bool "Mellanox Technologies ConnectX-4 Ethernet support"
 	depends on NETDEVICES && ETHERNET && INET && PCI && MLX5_CORE
 	depends on IPV6=y || IPV6=n || MLX5_CORE=m
+	select PAGE_POOL
 	default n
 	---help---
 	  Ethernet support in Mellanox Technologies ConnectX-4 NIC.

+ 4 - 0
drivers/net/ethernet/mellanox/mlx5/core/en.h

@@ -53,6 +53,8 @@
 #include "mlx5_core.h"
 #include "en_stats.h"
 
+struct page_pool;
+
 #define MLX5_SET_CFG(p, f, v) MLX5_SET(create_flow_group_in, p, f, v)
 
 #define MLX5E_ETH_HARD_MTU (ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN)
@@ -392,6 +394,7 @@ struct mlx5e_xdpsq {
 	struct {
 		struct mlx5e_dma_info     *di;
 		bool                       doorbell;
+		bool                       redirect_flush;
 	} db;
 
 	/* read only */
@@ -533,6 +536,7 @@ struct mlx5e_rq {
 	unsigned int           hw_mtu;
 	struct mlx5e_xdpsq     xdpsq;
 	DECLARE_BITMAP(flags, 8);
+	struct page_pool      *page_pool;
 
 	/* control */
 	struct mlx5_wq_ctrl    wq_ctrl;

+ 36 - 1
drivers/net/ethernet/mellanox/mlx5/core/en_main.c

@@ -35,6 +35,7 @@
 #include <linux/mlx5/fs.h>
 #include <net/vxlan.h>
 #include <linux/bpf.h>
+#include <net/page_pool.h>
 #include "eswitch.h"
 #include "en.h"
 #include "en_tc.h"
@@ -389,10 +390,11 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 			  struct mlx5e_rq_param *rqp,
 			  struct mlx5e_rq *rq)
 {
+	struct page_pool_params pp_params = { 0 };
 	struct mlx5_core_dev *mdev = c->mdev;
 	void *rqc = rqp->rqc;
 	void *rqc_wq = MLX5_ADDR_OF(rqc, rqc, wq);
-	u32 byte_count;
+	u32 byte_count, pool_size;
 	int npages;
 	int wq_sz;
 	int err;
@@ -432,9 +434,12 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 
 	rq->buff.map_dir = rq->xdp_prog ? DMA_BIDIRECTIONAL : DMA_FROM_DEVICE;
 	rq->buff.headroom = mlx5e_get_rq_headroom(mdev, params);
+	pool_size = 1 << params->log_rq_mtu_frames;
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:
+
+		pool_size = MLX5_MPWRQ_PAGES_PER_WQE << mlx5e_mpwqe_get_log_rq_size(params);
 		rq->post_wqes = mlx5e_post_rx_mpwqes;
 		rq->dealloc_wqe = mlx5e_dealloc_rx_mpwqe;
 
@@ -512,6 +517,32 @@ static int mlx5e_alloc_rq(struct mlx5e_channel *c,
 		rq->mkey_be = c->mkey_be;
 	}
 
+	/* Create a page_pool and register it with rxq */
+	pp_params.order     = rq->buff.page_order;
+	pp_params.flags     = 0; /* No-internal DMA mapping in page_pool */
+	pp_params.pool_size = pool_size;
+	pp_params.nid       = cpu_to_node(c->cpu);
+	pp_params.dev       = c->pdev;
+	pp_params.dma_dir   = rq->buff.map_dir;
+
+	/* page_pool can be used even when there is no rq->xdp_prog,
+	 * given page_pool does not handle DMA mapping there is no
+	 * required state to clear. And page_pool gracefully handle
+	 * elevated refcnt.
+	 */
+	rq->page_pool = page_pool_create(&pp_params);
+	if (IS_ERR(rq->page_pool)) {
+		if (rq->wq_type != MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ)
+			kfree(rq->wqe.frag_info);
+		err = PTR_ERR(rq->page_pool);
+		rq->page_pool = NULL;
+		goto err_rq_wq_destroy;
+	}
+	err = xdp_rxq_info_reg_mem_model(&rq->xdp_rxq,
+					 MEM_TYPE_PAGE_POOL, rq->page_pool);
+	if (err)
+		goto err_rq_wq_destroy;
+
 	for (i = 0; i < wq_sz; i++) {
 		struct mlx5e_rx_wqe *wqe = mlx5_wq_ll_get_wqe(&rq->wq, i);
 
@@ -548,6 +579,8 @@ err_rq_wq_destroy:
 	if (rq->xdp_prog)
 		bpf_prog_put(rq->xdp_prog);
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy(rq->page_pool);
 	mlx5_wq_destroy(&rq->wq_ctrl);
 
 	return err;
@@ -561,6 +594,8 @@ static void mlx5e_free_rq(struct mlx5e_rq *rq)
 		bpf_prog_put(rq->xdp_prog);
 
 	xdp_rxq_info_unreg(&rq->xdp_rxq);
+	if (rq->page_pool)
+		page_pool_destroy(rq->page_pool);
 
 	switch (rq->wq_type) {
 	case MLX5_WQ_TYPE_LINKED_LIST_STRIDING_RQ:

+ 35 - 7
drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

@@ -37,6 +37,7 @@
 #include <linux/bpf_trace.h>
 #include <net/busy_poll.h>
 #include <net/ip6_checksum.h>
+#include <net/page_pool.h>
 #include "en.h"
 #include "en_tc.h"
 #include "eswitch.h"
@@ -221,7 +222,7 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 	if (mlx5e_rx_cache_get(rq, dma_info))
 		return 0;
 
-	dma_info->page = dev_alloc_pages(rq->buff.page_order);
+	dma_info->page = page_pool_dev_alloc_pages(rq->page_pool);
 	if (unlikely(!dma_info->page))
 		return -ENOMEM;
 
@@ -236,15 +237,26 @@ static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
 	return 0;
 }
 
+static void mlx5e_page_dma_unmap(struct mlx5e_rq *rq,
+					struct mlx5e_dma_info *dma_info)
+{
+	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
+		       rq->buff.map_dir);
+}
+
 void mlx5e_page_release(struct mlx5e_rq *rq, struct mlx5e_dma_info *dma_info,
 			bool recycle)
 {
-	if (likely(recycle) && mlx5e_rx_cache_put(rq, dma_info))
-		return;
+	if (likely(recycle)) {
+		if (mlx5e_rx_cache_put(rq, dma_info))
+			return;
 
-	dma_unmap_page(rq->pdev, dma_info->addr, RQ_PAGE_SIZE(rq),
-		       rq->buff.map_dir);
-	put_page(dma_info->page);
+		mlx5e_page_dma_unmap(rq, dma_info);
+		page_pool_recycle_direct(rq->page_pool, dma_info->page);
+	} else {
+		mlx5e_page_dma_unmap(rq, dma_info);
+		put_page(dma_info->page);
+	}
 }
 
 static inline bool mlx5e_page_reuse(struct mlx5e_rq *rq,
@@ -800,9 +812,10 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 				   struct mlx5e_dma_info *di,
 				   void *va, u16 *rx_headroom, u32 *len)
 {
-	const struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
+	struct bpf_prog *prog = READ_ONCE(rq->xdp_prog);
 	struct xdp_buff xdp;
 	u32 act;
+	int err;
 
 	if (!prog)
 		return false;
@@ -823,6 +836,15 @@ static inline int mlx5e_xdp_handle(struct mlx5e_rq *rq,
 		if (unlikely(!mlx5e_xmit_xdp_frame(rq, di, &xdp)))
 			trace_xdp_exception(rq->netdev, prog, act);
 		return true;
+	case XDP_REDIRECT:
+		/* When XDP enabled then page-refcnt==1 here */
+		err = xdp_do_redirect(rq->netdev, &xdp, prog);
+		if (!err) {
+			__set_bit(MLX5E_RQ_FLAG_XDP_XMIT, rq->flags);
+			rq->xdpsq.db.redirect_flush = true;
+			mlx5e_page_dma_unmap(rq, di);
+		}
+		return true;
 	default:
 		bpf_warn_invalid_xdp_action(act);
 	case XDP_ABORTED:
@@ -868,6 +890,7 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 *cqe,
 
 	dma_sync_single_range_for_cpu(rq->pdev, di->addr, wi->offset,
 				      frag_size, DMA_FROM_DEVICE);
+	prefetchw(va); /* xdp_frame data area */
 	prefetch(data);
 	wi->offset += frag_size;
 
@@ -1140,6 +1163,11 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
 		xdpsq->db.doorbell = false;
 	}
 
+	if (xdpsq->db.redirect_flush) {
+		xdp_do_flush_map();
+		xdpsq->db.redirect_flush = false;
+	}
+
 	mlx5_cqwq_update_db_record(&cq->wq);
 
 	/* ensure cq space is freed before enabling more cqes */

+ 34 - 26
drivers/net/tun.c

@@ -248,11 +248,11 @@ struct veth {
 	__be16 h_vlan_TCI;
 };
 
-bool tun_is_xdp_buff(void *ptr)
+bool tun_is_xdp_frame(void *ptr)
 {
 	return (unsigned long)ptr & TUN_XDP_FLAG;
 }
-EXPORT_SYMBOL(tun_is_xdp_buff);
+EXPORT_SYMBOL(tun_is_xdp_frame);
 
 void *tun_xdp_to_ptr(void *ptr)
 {
@@ -660,10 +660,10 @@ void tun_ptr_free(void *ptr)
 {
 	if (!ptr)
 		return;
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		put_page(virt_to_head_page(xdp->data));
+		xdp_return_frame(xdpf);
 	} else {
 		__skb_array_destroy_skb(ptr);
 	}
@@ -854,6 +854,12 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
 				       tun->dev, tfile->queue_index);
 		if (err < 0)
 			goto out;
+		err = xdp_rxq_info_reg_mem_model(&tfile->xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED, NULL);
+		if (err < 0) {
+			xdp_rxq_info_unreg(&tfile->xdp_rxq);
+			goto out;
+		}
 		err = 0;
 	}
 
@@ -1295,21 +1301,13 @@ static const struct net_device_ops tun_netdev_ops = {
 	.ndo_get_stats64	= tun_net_get_stats64,
 };
 
-static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+static int tun_xdp_xmit(struct net_device *dev, struct xdp_frame *frame)
 {
 	struct tun_struct *tun = netdev_priv(dev);
-	struct xdp_buff *buff = xdp->data_hard_start;
-	int headroom = xdp->data - xdp->data_hard_start;
 	struct tun_file *tfile;
 	u32 numqueues;
 	int ret = 0;
 
-	/* Assure headroom is available and buff is properly aligned */
-	if (unlikely(headroom < sizeof(*xdp) || tun_is_xdp_buff(xdp)))
-		return -ENOSPC;
-
-	*buff = *xdp;
-
 	rcu_read_lock();
 
 	numqueues = READ_ONCE(tun->numqueues);
@@ -1323,7 +1321,7 @@ static int tun_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	/* Encode the XDP flag into lowest bit for consumer to differ
 	 * XDP buffer from sk_buff.
 	 */
-	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(buff))) {
+	if (ptr_ring_produce(&tfile->tx_ring, tun_xdp_to_ptr(frame))) {
 		this_cpu_inc(tun->pcpu_stats->tx_dropped);
 		ret = -ENOSPC;
 	}
@@ -1333,6 +1331,16 @@ out:
 	return ret;
 }
 
+static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
+{
+	struct xdp_frame *frame = convert_to_xdp_frame(xdp);
+
+	if (unlikely(!frame))
+		return -EOVERFLOW;
+
+	return tun_xdp_xmit(dev, frame);
+}
+
 static void tun_xdp_flush(struct net_device *dev)
 {
 	struct tun_struct *tun = netdev_priv(dev);
@@ -1680,7 +1688,7 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
 		case XDP_TX:
 			get_page(alloc_frag->page);
 			alloc_frag->offset += buflen;
-			if (tun_xdp_xmit(tun->dev, &xdp))
+			if (tun_xdp_tx(tun->dev, &xdp))
 				goto err_redirect;
 			tun_xdp_flush(tun->dev);
 			rcu_read_unlock();
@@ -2001,11 +2009,11 @@ static ssize_t tun_chr_write_iter(struct kiocb *iocb, struct iov_iter *from)
 
 static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 				struct tun_file *tfile,
-				struct xdp_buff *xdp,
+				struct xdp_frame *xdp_frame,
 				struct iov_iter *iter)
 {
 	int vnet_hdr_sz = 0;
-	size_t size = xdp->data_end - xdp->data;
+	size_t size = xdp_frame->len;
 	struct tun_pcpu_stats *stats;
 	size_t ret;
 
@@ -2021,7 +2029,7 @@ static ssize_t tun_put_user_xdp(struct tun_struct *tun,
 		iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
 	}
 
-	ret = copy_to_iter(xdp->data, size, iter) + vnet_hdr_sz;
+	ret = copy_to_iter(xdp_frame->data, size, iter) + vnet_hdr_sz;
 
 	stats = get_cpu_ptr(tun->pcpu_stats);
 	u64_stats_update_begin(&stats->syncp);
@@ -2189,11 +2197,11 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
 			return err;
 	}
 
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		ret = tun_put_user_xdp(tun, tfile, xdp, to);
-		put_page(virt_to_head_page(xdp->data));
+		ret = tun_put_user_xdp(tun, tfile, xdpf, to);
+		xdp_return_frame(xdpf);
 	} else {
 		struct sk_buff *skb = ptr;
 
@@ -2432,10 +2440,10 @@ out_free:
 static int tun_ptr_peek_len(void *ptr)
 {
 	if (likely(ptr)) {
-		if (tun_is_xdp_buff(ptr)) {
-			struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+		if (tun_is_xdp_frame(ptr)) {
+			struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-			return xdp->data_end - xdp->data;
+			return xdpf->len;
 		}
 		return __skb_array_len_with_tag(ptr);
 	} else {

+ 41 - 26
drivers/net/virtio_net.c

@@ -415,46 +415,51 @@ static void virtnet_xdp_flush(struct net_device *dev)
 	virtqueue_kick(sq->vq);
 }
 
-static bool __virtnet_xdp_xmit(struct virtnet_info *vi,
-			       struct xdp_buff *xdp)
+static int __virtnet_xdp_xmit(struct virtnet_info *vi,
+			       struct xdp_frame *xdpf)
 {
 	struct virtio_net_hdr_mrg_rxbuf *hdr;
-	unsigned int len;
+	struct xdp_frame *xdpf_sent;
 	struct send_queue *sq;
+	unsigned int len;
 	unsigned int qp;
-	void *xdp_sent;
 	int err;
 
 	qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id();
 	sq = &vi->sq[qp];
 
 	/* Free up any pending old buffers before queueing new ones. */
-	while ((xdp_sent = virtqueue_get_buf(sq->vq, &len)) != NULL) {
-		struct page *sent_page = virt_to_head_page(xdp_sent);
+	while ((xdpf_sent = virtqueue_get_buf(sq->vq, &len)) != NULL)
+		xdp_return_frame(xdpf_sent);
 
-		put_page(sent_page);
-	}
+	/* virtqueue want to use data area in-front of packet */
+	if (unlikely(xdpf->metasize > 0))
+		return -EOPNOTSUPP;
 
-	xdp->data -= vi->hdr_len;
+	if (unlikely(xdpf->headroom < vi->hdr_len))
+		return -EOVERFLOW;
+
+	/* Make room for virtqueue hdr (also change xdpf->headroom?) */
+	xdpf->data -= vi->hdr_len;
 	/* Zero header and leave csum up to XDP layers */
-	hdr = xdp->data;
+	hdr = xdpf->data;
 	memset(hdr, 0, vi->hdr_len);
+	xdpf->len   += vi->hdr_len;
 
-	sg_init_one(sq->sg, xdp->data, xdp->data_end - xdp->data);
+	sg_init_one(sq->sg, xdpf->data, xdpf->len);
 
-	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdp->data, GFP_ATOMIC);
+	err = virtqueue_add_outbuf(sq->vq, sq->sg, 1, xdpf, GFP_ATOMIC);
 	if (unlikely(err))
-		return false; /* Caller handle free/refcnt */
+		return -ENOSPC; /* Caller handle free/refcnt */
 
-	return true;
+	return 0;
 }
 
-static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
+static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_frame *xdpf)
 {
 	struct virtnet_info *vi = netdev_priv(dev);
 	struct receive_queue *rq = vi->rq;
 	struct bpf_prog *xdp_prog;
-	bool sent;
 
 	/* Only allow ndo_xdp_xmit if XDP is loaded on dev, as this
 	 * indicate XDP resources have been successfully allocated.
@@ -463,10 +468,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, struct xdp_buff *xdp)
 	if (!xdp_prog)
 		return -ENXIO;
 
-	sent = __virtnet_xdp_xmit(vi, xdp);
-	if (!sent)
-		return -ENOSPC;
-	return 0;
+	return __virtnet_xdp_xmit(vi, xdpf);
 }
 
 static unsigned int virtnet_get_headroom(struct virtnet_info *vi)
@@ -555,7 +557,6 @@ static struct sk_buff *receive_small(struct net_device *dev,
 	struct page *page = virt_to_head_page(buf);
 	unsigned int delta = 0;
 	struct page *xdp_page;
-	bool sent;
 	int err;
 
 	len -= vi->hdr_len;
@@ -564,6 +565,7 @@ static struct sk_buff *receive_small(struct net_device *dev,
 	xdp_prog = rcu_dereference(rq->xdp_prog);
 	if (xdp_prog) {
 		struct virtio_net_hdr_mrg_rxbuf *hdr = buf + header_offset;
+		struct xdp_frame *xdpf;
 		struct xdp_buff xdp;
 		void *orig_data;
 		u32 act;
@@ -606,8 +608,11 @@ static struct sk_buff *receive_small(struct net_device *dev,
 			delta = orig_data - xdp.data;
 			break;
 		case XDP_TX:
-			sent = __virtnet_xdp_xmit(vi, &xdp);
-			if (unlikely(!sent)) {
+			xdpf = convert_to_xdp_frame(&xdp);
+			if (unlikely(!xdpf))
+				goto err_xdp;
+			err = __virtnet_xdp_xmit(vi, xdpf);
+			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				goto err_xdp;
 			}
@@ -690,7 +695,6 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	struct bpf_prog *xdp_prog;
 	unsigned int truesize;
 	unsigned int headroom = mergeable_ctx_to_headroom(ctx);
-	bool sent;
 	int err;
 
 	head_skb = NULL;
@@ -698,6 +702,7 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 	rcu_read_lock();
 	xdp_prog = rcu_dereference(rq->xdp_prog);
 	if (xdp_prog) {
+		struct xdp_frame *xdpf;
 		struct page *xdp_page;
 		struct xdp_buff xdp;
 		void *data;
@@ -762,8 +767,11 @@ static struct sk_buff *receive_mergeable(struct net_device *dev,
 			}
 			break;
 		case XDP_TX:
-			sent = __virtnet_xdp_xmit(vi, &xdp);
-			if (unlikely(!sent)) {
+			xdpf = convert_to_xdp_frame(&xdp);
+			if (unlikely(!xdpf))
+				goto err_xdp;
+			err = __virtnet_xdp_xmit(vi, xdpf);
+			if (unlikely(err)) {
 				trace_xdp_exception(vi->dev, xdp_prog, act);
 				if (unlikely(xdp_page != page))
 					put_page(xdp_page);
@@ -1301,6 +1309,13 @@ static int virtnet_open(struct net_device *dev)
 		if (err < 0)
 			return err;
 
+		err = xdp_rxq_info_reg_mem_model(&vi->rq[i].xdp_rxq,
+						 MEM_TYPE_PAGE_SHARED, NULL);
+		if (err < 0) {
+			xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq);
+			return err;
+		}
+
 		virtnet_napi_enable(vi->rq[i].vq, &vi->rq[i].napi);
 		virtnet_napi_tx_enable(vi, vi->sq[i].vq, &vi->sq[i].napi);
 	}

+ 4 - 3
drivers/vhost/net.c

@@ -32,6 +32,7 @@
 #include <linux/skbuff.h>
 
 #include <net/sock.h>
+#include <net/xdp.h>
 
 #include "vhost.h"
 
@@ -181,10 +182,10 @@ static void vhost_net_buf_unproduce(struct vhost_net_virtqueue *nvq)
 
 static int vhost_net_buf_peek_len(void *ptr)
 {
-	if (tun_is_xdp_buff(ptr)) {
-		struct xdp_buff *xdp = tun_ptr_to_xdp(ptr);
+	if (tun_is_xdp_frame(ptr)) {
+		struct xdp_frame *xdpf = tun_ptr_to_xdp(ptr);
 
-		return xdp->data_end - xdp->data;
+		return xdpf->len;
 	}
 
 	return __skb_array_len_with_tag(ptr);

+ 1 - 23
include/linux/filter.h

@@ -30,6 +30,7 @@ struct sock;
 struct seccomp_data;
 struct bpf_prog_aux;
 struct xdp_rxq_info;
+struct xdp_buff;
 
 /* ArgX, context and stack frame pointer register positions. Note,
  * Arg1, Arg2, Arg3, etc are used as argument mappings of function
@@ -500,14 +501,6 @@ struct bpf_skb_data_end {
 	void *data_end;
 };
 
-struct xdp_buff {
-	void *data;
-	void *data_end;
-	void *data_meta;
-	void *data_hard_start;
-	struct xdp_rxq_info *rxq;
-};
-
 struct sk_msg_buff {
 	void *data;
 	void *data_end;
@@ -772,21 +765,6 @@ int xdp_do_redirect(struct net_device *dev,
 		    struct bpf_prog *prog);
 void xdp_do_flush_map(void);
 
-/* Drivers not supporting XDP metadata can use this helper, which
- * rejects any room expansion for metadata as a result.
- */
-static __always_inline void
-xdp_set_data_meta_invalid(struct xdp_buff *xdp)
-{
-	xdp->data_meta = xdp->data + 1;
-}
-
-static __always_inline bool
-xdp_data_meta_unsupported(const struct xdp_buff *xdp)
-{
-	return unlikely(xdp->data_meta > xdp->data);
-}
-
 void bpf_warn_invalid_xdp_action(u32 act);
 
 struct sock *do_sk_redirect_map(struct sk_buff *skb);

+ 2 - 2
include/linux/if_tun.h

@@ -22,7 +22,7 @@
 #if defined(CONFIG_TUN) || defined(CONFIG_TUN_MODULE)
 struct socket *tun_get_socket(struct file *);
 struct ptr_ring *tun_get_tx_ring(struct file *file);
-bool tun_is_xdp_buff(void *ptr);
+bool tun_is_xdp_frame(void *ptr);
 void *tun_xdp_to_ptr(void *ptr);
 void *tun_ptr_to_xdp(void *ptr);
 void tun_ptr_free(void *ptr);
@@ -39,7 +39,7 @@ static inline struct ptr_ring *tun_get_tx_ring(struct file *f)
 {
 	return ERR_PTR(-EINVAL);
 }
-static inline bool tun_is_xdp_buff(void *ptr)
+static inline bool tun_is_xdp_frame(void *ptr)
 {
 	return false;
 }

+ 2 - 2
include/linux/netdevice.h

@@ -1165,7 +1165,7 @@ struct dev_ifalias {
  *	This function is used to set or query state related to XDP on the
  *	netdevice and manage BPF offload. See definition of
  *	enum bpf_netdev_command for details.
- * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_buff *xdp);
+ * int (*ndo_xdp_xmit)(struct net_device *dev, struct xdp_frame *xdp);
  *	This function is used to submit a XDP packet for transmit on a
  *	netdevice.
  * void (*ndo_xdp_flush)(struct net_device *dev);
@@ -1356,7 +1356,7 @@ struct net_device_ops {
 	int			(*ndo_bpf)(struct net_device *dev,
 					   struct netdev_bpf *bpf);
 	int			(*ndo_xdp_xmit)(struct net_device *dev,
-						struct xdp_buff *xdp);
+						struct xdp_frame *xdp);
 	void			(*ndo_xdp_flush)(struct net_device *dev);
 };
 

+ 143 - 0
include/net/page_pool.h

@@ -0,0 +1,143 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.h
+ *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ *	Copyright (C) 2016 Red Hat, Inc.
+ */
+
+/**
+ * DOC: page_pool allocator
+ *
+ * This page_pool allocator is optimized for the XDP mode that
+ * uses one-frame-per-page, but have fallbacks that act like the
+ * regular page allocator APIs.
+ *
+ * Basic use involve replacing alloc_pages() calls with the
+ * page_pool_alloc_pages() call.  Drivers should likely use
+ * page_pool_dev_alloc_pages() replacing dev_alloc_pages().
+ *
+ * If page_pool handles DMA mapping (use page->private), then API user
+ * is responsible for invoking page_pool_put_page() once.  In-case of
+ * elevated refcnt, the DMA state is released, assuming other users of
+ * the page will eventually call put_page().
+ *
+ * If no DMA mapping is done, then it can act as shim-layer that
+ * fall-through to alloc_page.  As no state is kept on the page, the
+ * regular put_page() call is sufficient.
+ */
+#ifndef _NET_PAGE_POOL_H
+#define _NET_PAGE_POOL_H
+
+#include <linux/mm.h> /* Needed by ptr_ring */
+#include <linux/ptr_ring.h>
+#include <linux/dma-direction.h>
+
+#define PP_FLAG_DMA_MAP 1 /* Should page_pool do the DMA map/unmap */
+#define PP_FLAG_ALL	PP_FLAG_DMA_MAP
+
+/*
+ * Fast allocation side cache array/stack
+ *
+ * The cache size and refill watermark is related to the network
+ * use-case.  The NAPI budget is 64 packets.  After a NAPI poll the RX
+ * ring is usually refilled and the max consumed elements will be 64,
+ * thus a natural max size of objects needed in the cache.
+ *
+ * Keeping room for more objects, is due to XDP_DROP use-case.  As
+ * XDP_DROP allows the opportunity to recycle objects directly into
+ * this array, as it shares the same softirq/NAPI protection.  If
+ * cache is already full (or partly full) then the XDP_DROP recycles
+ * would have to take a slower code path.
+ */
+#define PP_ALLOC_CACHE_SIZE	128
+#define PP_ALLOC_CACHE_REFILL	64
+struct pp_alloc_cache {
+	u32 count;
+	void *cache[PP_ALLOC_CACHE_SIZE];
+};
+
+struct page_pool_params {
+	unsigned int	flags;
+	unsigned int	order;
+	unsigned int	pool_size;
+	int		nid;  /* Numa node id to allocate from pages from */
+	struct device	*dev; /* device, for DMA pre-mapping purposes */
+	enum dma_data_direction dma_dir; /* DMA mapping direction */
+};
+
+struct page_pool {
+	struct rcu_head rcu;
+	struct page_pool_params p;
+
+	/*
+	 * Data structure for allocation side
+	 *
+	 * Drivers allocation side usually already perform some kind
+	 * of resource protection.  Piggyback on this protection, and
+	 * require driver to protect allocation side.
+	 *
+	 * For NIC drivers this means, allocate a page_pool per
+	 * RX-queue. As the RX-queue is already protected by
+	 * Softirq/BH scheduling and napi_schedule. NAPI schedule
+	 * guarantee that a single napi_struct will only be scheduled
+	 * on a single CPU (see napi_schedule).
+	 */
+	struct pp_alloc_cache alloc ____cacheline_aligned_in_smp;
+
+	/* Data structure for storing recycled pages.
+	 *
+	 * Returning/freeing pages is more complicated synchronization
+	 * wise, because free's can happen on remote CPUs, with no
+	 * association with allocation resource.
+	 *
+	 * Use ptr_ring, as it separates consumer and producer
+	 * effeciently, it a way that doesn't bounce cache-lines.
+	 *
+	 * TODO: Implement bulk return pages into this structure.
+	 */
+	struct ptr_ring ring;
+};
+
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp);
+
+static inline struct page *page_pool_dev_alloc_pages(struct page_pool *pool)
+{
+	gfp_t gfp = (GFP_ATOMIC | __GFP_NOWARN);
+
+	return page_pool_alloc_pages(pool, gfp);
+}
+
+struct page_pool *page_pool_create(const struct page_pool_params *params);
+
+void page_pool_destroy(struct page_pool *pool);
+
+/* Never call this directly, use helpers below */
+void __page_pool_put_page(struct page_pool *pool,
+			  struct page *page, bool allow_direct);
+
+static inline void page_pool_put_page(struct page_pool *pool, struct page *page)
+{
+	/* When page_pool isn't compiled-in, net/core/xdp.c doesn't
+	 * allow registering MEM_TYPE_PAGE_POOL, but shield linker.
+	 */
+#ifdef CONFIG_PAGE_POOL
+	__page_pool_put_page(pool, page, false);
+#endif
+}
+/* Very limited use-cases allow recycle direct */
+static inline void page_pool_recycle_direct(struct page_pool *pool,
+					    struct page *page)
+{
+	__page_pool_put_page(pool, page, true);
+}
+
+static inline bool is_page_pool_compiled_in(void)
+{
+#ifdef CONFIG_PAGE_POOL
+	return true;
+#else
+	return false;
+#endif
+}
+
+#endif /* _NET_PAGE_POOL_H */

+ 83 - 0
include/net/xdp.h

@@ -33,16 +33,99 @@
  * also mandatory during RX-ring setup.
  */
 
+enum xdp_mem_type {
+	MEM_TYPE_PAGE_SHARED = 0, /* Split-page refcnt based model */
+	MEM_TYPE_PAGE_ORDER0,     /* Orig XDP full page model */
+	MEM_TYPE_PAGE_POOL,
+	MEM_TYPE_MAX,
+};
+
+struct xdp_mem_info {
+	u32 type; /* enum xdp_mem_type, but known size type */
+	u32 id;
+};
+
+struct page_pool;
+
 struct xdp_rxq_info {
 	struct net_device *dev;
 	u32 queue_index;
 	u32 reg_state;
+	struct xdp_mem_info mem;
 } ____cacheline_aligned; /* perf critical, avoid false-sharing */
 
+struct xdp_buff {
+	void *data;
+	void *data_end;
+	void *data_meta;
+	void *data_hard_start;
+	struct xdp_rxq_info *rxq;
+};
+
+struct xdp_frame {
+	void *data;
+	u16 len;
+	u16 headroom;
+	u16 metasize;
+	/* Lifetime of xdp_rxq_info is limited to NAPI/enqueue time,
+	 * while mem info is valid on remote CPU.
+	 */
+	struct xdp_mem_info mem;
+	struct net_device *dev_rx; /* used by cpumap */
+};
+
+/* Convert xdp_buff to xdp_frame */
+static inline
+struct xdp_frame *convert_to_xdp_frame(struct xdp_buff *xdp)
+{
+	struct xdp_frame *xdp_frame;
+	int metasize;
+	int headroom;
+
+	/* Assure headroom is available for storing info */
+	headroom = xdp->data - xdp->data_hard_start;
+	metasize = xdp->data - xdp->data_meta;
+	metasize = metasize > 0 ? metasize : 0;
+	if (unlikely((headroom - metasize) < sizeof(*xdp_frame)))
+		return NULL;
+
+	/* Store info in top of packet */
+	xdp_frame = xdp->data_hard_start;
+
+	xdp_frame->data = xdp->data;
+	xdp_frame->len  = xdp->data_end - xdp->data;
+	xdp_frame->headroom = headroom - sizeof(*xdp_frame);
+	xdp_frame->metasize = metasize;
+
+	/* rxq only valid until napi_schedule ends, convert to xdp_mem_info */
+	xdp_frame->mem = xdp->rxq->mem;
+
+	return xdp_frame;
+}
+
+void xdp_return_frame(struct xdp_frame *xdpf);
+
 int xdp_rxq_info_reg(struct xdp_rxq_info *xdp_rxq,
 		     struct net_device *dev, u32 queue_index);
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq);
 void xdp_rxq_info_unused(struct xdp_rxq_info *xdp_rxq);
 bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq);
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum xdp_mem_type type, void *allocator);
+
+/* Drivers not supporting XDP metadata can use this helper, which
+ * rejects any room expansion for metadata as a result.
+ */
+static __always_inline void
+xdp_set_data_meta_invalid(struct xdp_buff *xdp)
+{
+	xdp->data_meta = xdp->data + 1;
+}
+
+static __always_inline bool
+xdp_data_meta_unsupported(const struct xdp_buff *xdp)
+{
+	return unlikely(xdp->data_meta > xdp->data);
+}
 
 #endif /* __LINUX_NET_XDP_H__ */

+ 50 - 82
kernel/bpf/cpumap.c

@@ -19,6 +19,7 @@
 #include <linux/bpf.h>
 #include <linux/filter.h>
 #include <linux/ptr_ring.h>
+#include <net/xdp.h>
 
 #include <linux/sched.h>
 #include <linux/workqueue.h>
@@ -137,27 +138,6 @@ free_cmap:
 	return ERR_PTR(err);
 }
 
-static void __cpu_map_queue_destructor(void *ptr)
-{
-	/* The tear-down procedure should have made sure that queue is
-	 * empty.  See __cpu_map_entry_replace() and work-queue
-	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
-	 * gracefully and warn once.
-	 */
-	if (WARN_ON_ONCE(ptr))
-		page_frag_free(ptr);
-}
-
-static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
-{
-	if (atomic_dec_and_test(&rcpu->refcnt)) {
-		/* The queue should be empty at this point */
-		ptr_ring_cleanup(rcpu->queue, __cpu_map_queue_destructor);
-		kfree(rcpu->queue);
-		kfree(rcpu);
-	}
-}
-
 static void get_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
 {
 	atomic_inc(&rcpu->refcnt);
@@ -179,45 +159,8 @@ static void cpu_map_kthread_stop(struct work_struct *work)
 	kthread_stop(rcpu->kthread);
 }
 
-/* For now, xdp_pkt is a cpumap internal data structure, with info
- * carried between enqueue to dequeue. It is mapped into the top
- * headroom of the packet, to avoid allocating separate mem.
- */
-struct xdp_pkt {
-	void *data;
-	u16 len;
-	u16 headroom;
-	u16 metasize;
-	struct net_device *dev_rx;
-};
-
-/* Convert xdp_buff to xdp_pkt */
-static struct xdp_pkt *convert_to_xdp_pkt(struct xdp_buff *xdp)
-{
-	struct xdp_pkt *xdp_pkt;
-	int metasize;
-	int headroom;
-
-	/* Assure headroom is available for storing info */
-	headroom = xdp->data - xdp->data_hard_start;
-	metasize = xdp->data - xdp->data_meta;
-	metasize = metasize > 0 ? metasize : 0;
-	if (unlikely((headroom - metasize) < sizeof(*xdp_pkt)))
-		return NULL;
-
-	/* Store info in top of packet */
-	xdp_pkt = xdp->data_hard_start;
-
-	xdp_pkt->data = xdp->data;
-	xdp_pkt->len  = xdp->data_end - xdp->data;
-	xdp_pkt->headroom = headroom - sizeof(*xdp_pkt);
-	xdp_pkt->metasize = metasize;
-
-	return xdp_pkt;
-}
-
 static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
-					 struct xdp_pkt *xdp_pkt)
+					 struct xdp_frame *xdpf)
 {
 	unsigned int frame_size;
 	void *pkt_data_start;
@@ -232,7 +175,7 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * would be preferred to set frame_size to 2048 or 4096
 	 * depending on the driver.
 	 *   frame_size = 2048;
-	 *   frame_len  = frame_size - sizeof(*xdp_pkt);
+	 *   frame_len  = frame_size - sizeof(*xdp_frame);
 	 *
 	 * Instead, with info avail, skb_shared_info in placed after
 	 * packet len.  This, unfortunately fakes the truesize.
@@ -240,21 +183,21 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	 * is not at a fixed memory location, with mixed length
 	 * packets, which is bad for cache-line hotness.
 	 */
-	frame_size = SKB_DATA_ALIGN(xdp_pkt->len) + xdp_pkt->headroom +
+	frame_size = SKB_DATA_ALIGN(xdpf->len) + xdpf->headroom +
 		SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
 
-	pkt_data_start = xdp_pkt->data - xdp_pkt->headroom;
+	pkt_data_start = xdpf->data - xdpf->headroom;
 	skb = build_skb(pkt_data_start, frame_size);
 	if (!skb)
 		return NULL;
 
-	skb_reserve(skb, xdp_pkt->headroom);
-	__skb_put(skb, xdp_pkt->len);
-	if (xdp_pkt->metasize)
-		skb_metadata_set(skb, xdp_pkt->metasize);
+	skb_reserve(skb, xdpf->headroom);
+	__skb_put(skb, xdpf->len);
+	if (xdpf->metasize)
+		skb_metadata_set(skb, xdpf->metasize);
 
 	/* Essential SKB info: protocol and skb->dev */
-	skb->protocol = eth_type_trans(skb, xdp_pkt->dev_rx);
+	skb->protocol = eth_type_trans(skb, xdpf->dev_rx);
 
 	/* Optional SKB info, currently missing:
 	 * - HW checksum info		(skb->ip_summed)
@@ -265,6 +208,31 @@ static struct sk_buff *cpu_map_build_skb(struct bpf_cpu_map_entry *rcpu,
 	return skb;
 }
 
+static void __cpu_map_ring_cleanup(struct ptr_ring *ring)
+{
+	/* The tear-down procedure should have made sure that queue is
+	 * empty.  See __cpu_map_entry_replace() and work-queue
+	 * invoked cpu_map_kthread_stop(). Catch any broken behaviour
+	 * gracefully and warn once.
+	 */
+	struct xdp_frame *xdpf;
+
+	while ((xdpf = ptr_ring_consume(ring)))
+		if (WARN_ON_ONCE(xdpf))
+			xdp_return_frame(xdpf);
+}
+
+static void put_cpu_map_entry(struct bpf_cpu_map_entry *rcpu)
+{
+	if (atomic_dec_and_test(&rcpu->refcnt)) {
+		/* The queue should be empty at this point */
+		__cpu_map_ring_cleanup(rcpu->queue);
+		ptr_ring_cleanup(rcpu->queue, NULL);
+		kfree(rcpu->queue);
+		kfree(rcpu);
+	}
+}
+
 static int cpu_map_kthread_run(void *data)
 {
 	struct bpf_cpu_map_entry *rcpu = data;
@@ -278,7 +246,7 @@ static int cpu_map_kthread_run(void *data)
 	 */
 	while (!kthread_should_stop() || !__ptr_ring_empty(rcpu->queue)) {
 		unsigned int processed = 0, drops = 0, sched = 0;
-		struct xdp_pkt *xdp_pkt;
+		struct xdp_frame *xdpf;
 
 		/* Release CPU reschedule checks */
 		if (__ptr_ring_empty(rcpu->queue)) {
@@ -301,13 +269,13 @@ static int cpu_map_kthread_run(void *data)
 		 * kthread CPU pinned. Lockless access to ptr_ring
 		 * consume side valid as no-resize allowed of queue.
 		 */
-		while ((xdp_pkt = __ptr_ring_consume(rcpu->queue))) {
+		while ((xdpf = __ptr_ring_consume(rcpu->queue))) {
 			struct sk_buff *skb;
 			int ret;
 
-			skb = cpu_map_build_skb(rcpu, xdp_pkt);
+			skb = cpu_map_build_skb(rcpu, xdpf);
 			if (!skb) {
-				page_frag_free(xdp_pkt);
+				xdp_return_frame(xdpf);
 				continue;
 			}
 
@@ -604,13 +572,13 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 	spin_lock(&q->producer_lock);
 
 	for (i = 0; i < bq->count; i++) {
-		void *xdp_pkt = bq->q[i];
+		struct xdp_frame *xdpf = bq->q[i];
 		int err;
 
-		err = __ptr_ring_produce(q, xdp_pkt);
+		err = __ptr_ring_produce(q, xdpf);
 		if (err) {
 			drops++;
-			page_frag_free(xdp_pkt); /* Free xdp_pkt */
+			xdp_return_frame(xdpf);
 		}
 		processed++;
 	}
@@ -625,7 +593,7 @@ static int bq_flush_to_queue(struct bpf_cpu_map_entry *rcpu,
 /* Runs under RCU-read-side, plus in softirq under NAPI protection.
  * Thus, safe percpu variable access.
  */
-static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
+static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_frame *xdpf)
 {
 	struct xdp_bulk_queue *bq = this_cpu_ptr(rcpu->bulkq);
 
@@ -636,28 +604,28 @@ static int bq_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_pkt *xdp_pkt)
 	 * driver to code invoking us to finished, due to driver
 	 * (e.g. ixgbe) recycle tricks based on page-refcnt.
 	 *
-	 * Thus, incoming xdp_pkt is always queued here (else we race
+	 * Thus, incoming xdp_frame is always queued here (else we race
 	 * with another CPU on page-refcnt and remaining driver code).
 	 * Queue time is very short, as driver will invoke flush
 	 * operation, when completing napi->poll call.
 	 */
-	bq->q[bq->count++] = xdp_pkt;
+	bq->q[bq->count++] = xdpf;
 	return 0;
 }
 
 int cpu_map_enqueue(struct bpf_cpu_map_entry *rcpu, struct xdp_buff *xdp,
 		    struct net_device *dev_rx)
 {
-	struct xdp_pkt *xdp_pkt;
+	struct xdp_frame *xdpf;
 
-	xdp_pkt = convert_to_xdp_pkt(xdp);
-	if (unlikely(!xdp_pkt))
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
 		return -EOVERFLOW;
 
 	/* Info needed when constructing SKB on remote CPU */
-	xdp_pkt->dev_rx = dev_rx;
+	xdpf->dev_rx = dev_rx;
 
-	bq_enqueue(rcpu, xdp_pkt);
+	bq_enqueue(rcpu, xdpf);
 	return 0;
 }
 

+ 3 - 0
net/Kconfig

@@ -423,6 +423,9 @@ config MAY_USE_DEVLINK
 	  on MAY_USE_DEVLINK to ensure they do not cause link errors when
 	  devlink is a loadable module and the driver using it is built-in.
 
+config PAGE_POOL
+       bool
+
 endif   # if NET
 
 # Used by archs to tell that they support BPF JIT compiler plus which flavour.

+ 1 - 0
net/core/Makefile

@@ -14,6 +14,7 @@ obj-y		     += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
 			fib_notifier.o xdp.o
 
 obj-y += net-sysfs.o
+obj-$(CONFIG_PAGE_POOL) += page_pool.o
 obj-$(CONFIG_PROC_FS) += net-procfs.o
 obj-$(CONFIG_NET_PKTGEN) += pktgen.o
 obj-$(CONFIG_NETPOLL) += netpoll.o

+ 23 - 2
net/core/filter.c

@@ -2692,6 +2692,7 @@ static unsigned long xdp_get_metalen(const struct xdp_buff *xdp)
 
 BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 {
+	void *xdp_frame_end = xdp->data_hard_start + sizeof(struct xdp_frame);
 	unsigned long metalen = xdp_get_metalen(xdp);
 	void *data_start = xdp->data_hard_start + metalen;
 	void *data = xdp->data + offset;
@@ -2700,6 +2701,13 @@ BPF_CALL_2(bpf_xdp_adjust_head, struct xdp_buff *, xdp, int, offset)
 		     data > xdp->data_end - ETH_HLEN))
 		return -EINVAL;
 
+	/* Avoid info leak, when reusing area prev used by xdp_frame */
+	if (data < xdp_frame_end) {
+		unsigned long clearlen = xdp_frame_end - data;
+
+		memset(data, 0, clearlen);
+	}
+
 	if (metalen)
 		memmove(xdp->data_meta + offset,
 			xdp->data_meta, metalen);
@@ -2749,13 +2757,18 @@ static int __bpf_tx_xdp(struct net_device *dev,
 			struct xdp_buff *xdp,
 			u32 index)
 {
+	struct xdp_frame *xdpf;
 	int err;
 
 	if (!dev->netdev_ops->ndo_xdp_xmit) {
 		return -EOPNOTSUPP;
 	}
 
-	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+	xdpf = convert_to_xdp_frame(xdp);
+	if (unlikely(!xdpf))
+		return -EOVERFLOW;
+
+	err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 	if (err)
 		return err;
 	dev->netdev_ops->ndo_xdp_flush(dev);
@@ -2771,11 +2784,19 @@ static int __bpf_tx_xdp_map(struct net_device *dev_rx, void *fwd,
 
 	if (map->map_type == BPF_MAP_TYPE_DEVMAP) {
 		struct net_device *dev = fwd;
+		struct xdp_frame *xdpf;
 
 		if (!dev->netdev_ops->ndo_xdp_xmit)
 			return -EOPNOTSUPP;
 
-		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdp);
+		xdpf = convert_to_xdp_frame(xdp);
+		if (unlikely(!xdpf))
+			return -EOVERFLOW;
+
+		/* TODO: move to inside map code instead, for bulk support
+		 * err = dev_map_enqueue(dev, xdp);
+		 */
+		err = dev->netdev_ops->ndo_xdp_xmit(dev, xdpf);
 		if (err)
 			return err;
 		__dev_map_insert_ctx(map, index);

+ 317 - 0
net/core/page_pool.c

@@ -0,0 +1,317 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * page_pool.c
+ *	Author:	Jesper Dangaard Brouer <netoptimizer@brouer.com>
+ *	Copyright (C) 2016 Red Hat, Inc.
+ */
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+
+#include <net/page_pool.h>
+#include <linux/dma-direction.h>
+#include <linux/dma-mapping.h>
+#include <linux/page-flags.h>
+#include <linux/mm.h> /* for __put_page() */
+
+static int page_pool_init(struct page_pool *pool,
+			  const struct page_pool_params *params)
+{
+	unsigned int ring_qsize = 1024; /* Default */
+
+	memcpy(&pool->p, params, sizeof(pool->p));
+
+	/* Validate only known flags were used */
+	if (pool->p.flags & ~(PP_FLAG_ALL))
+		return -EINVAL;
+
+	if (pool->p.pool_size)
+		ring_qsize = pool->p.pool_size;
+
+	/* Sanity limit mem that can be pinned down */
+	if (ring_qsize > 32768)
+		return -E2BIG;
+
+	/* DMA direction is either DMA_FROM_DEVICE or DMA_BIDIRECTIONAL.
+	 * DMA_BIDIRECTIONAL is for allowing page used for DMA sending,
+	 * which is the XDP_TX use-case.
+	 */
+	if ((pool->p.dma_dir != DMA_FROM_DEVICE) &&
+	    (pool->p.dma_dir != DMA_BIDIRECTIONAL))
+		return -EINVAL;
+
+	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
+		return -ENOMEM;
+
+	return 0;
+}
+
+struct page_pool *page_pool_create(const struct page_pool_params *params)
+{
+	struct page_pool *pool;
+	int err = 0;
+
+	pool = kzalloc_node(sizeof(*pool), GFP_KERNEL, params->nid);
+	if (!pool)
+		return ERR_PTR(-ENOMEM);
+
+	err = page_pool_init(pool, params);
+	if (err < 0) {
+		pr_warn("%s() gave up with errno %d\n", __func__, err);
+		kfree(pool);
+		return ERR_PTR(err);
+	}
+	return pool;
+}
+EXPORT_SYMBOL(page_pool_create);
+
+/* fast path */
+static struct page *__page_pool_get_cached(struct page_pool *pool)
+{
+	struct ptr_ring *r = &pool->ring;
+	struct page *page;
+
+	/* Quicker fallback, avoid locks when ring is empty */
+	if (__ptr_ring_empty(r))
+		return NULL;
+
+	/* Test for safe-context, caller should provide this guarantee */
+	if (likely(in_serving_softirq())) {
+		if (likely(pool->alloc.count)) {
+			/* Fast-path */
+			page = pool->alloc.cache[--pool->alloc.count];
+			return page;
+		}
+		/* Slower-path: Alloc array empty, time to refill
+		 *
+		 * Open-coded bulk ptr_ring consumer.
+		 *
+		 * Discussion: the ring consumer lock is not really
+		 * needed due to the softirq/NAPI protection, but
+		 * later need the ability to reclaim pages on the
+		 * ring. Thus, keeping the locks.
+		 */
+		spin_lock(&r->consumer_lock);
+		while ((page = __ptr_ring_consume(r))) {
+			if (pool->alloc.count == PP_ALLOC_CACHE_REFILL)
+				break;
+			pool->alloc.cache[pool->alloc.count++] = page;
+		}
+		spin_unlock(&r->consumer_lock);
+		return page;
+	}
+
+	/* Slow-path: Get page from locked ring queue */
+	page = ptr_ring_consume(&pool->ring);
+	return page;
+}
+
+/* slow path */
+noinline
+static struct page *__page_pool_alloc_pages_slow(struct page_pool *pool,
+						 gfp_t _gfp)
+{
+	struct page *page;
+	gfp_t gfp = _gfp;
+	dma_addr_t dma;
+
+	/* We could always set __GFP_COMP, and avoid this branch, as
+	 * prep_new_page() can handle order-0 with __GFP_COMP.
+	 */
+	if (pool->p.order)
+		gfp |= __GFP_COMP;
+
+	/* FUTURE development:
+	 *
+	 * Current slow-path essentially falls back to single page
+	 * allocations, which doesn't improve performance.  This code
+	 * need bulk allocation support from the page allocator code.
+	 */
+
+	/* Cache was empty, do real allocation */
+	page = alloc_pages_node(pool->p.nid, gfp, pool->p.order);
+	if (!page)
+		return NULL;
+
+	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+		goto skip_dma_map;
+
+	/* Setup DMA mapping: use page->private for DMA-addr
+	 * This mapping is kept for lifetime of page, until leaving pool.
+	 */
+	dma = dma_map_page(pool->p.dev, page, 0,
+			   (PAGE_SIZE << pool->p.order),
+			   pool->p.dma_dir);
+	if (dma_mapping_error(pool->p.dev, dma)) {
+		put_page(page);
+		return NULL;
+	}
+	set_page_private(page, dma); /* page->private = dma; */
+
+skip_dma_map:
+	/* When page just alloc'ed is should/must have refcnt 1. */
+	return page;
+}
+
+/* For using page_pool replace: alloc_pages() API calls, but provide
+ * synchronization guarantee for allocation side.
+ */
+struct page *page_pool_alloc_pages(struct page_pool *pool, gfp_t gfp)
+{
+	struct page *page;
+
+	/* Fast-path: Get a page from cache */
+	page = __page_pool_get_cached(pool);
+	if (page)
+		return page;
+
+	/* Slow-path: cache empty, do real allocation */
+	page = __page_pool_alloc_pages_slow(pool, gfp);
+	return page;
+}
+EXPORT_SYMBOL(page_pool_alloc_pages);
+
+/* Cleanup page_pool state from page */
+static void __page_pool_clean_page(struct page_pool *pool,
+				   struct page *page)
+{
+	if (!(pool->p.flags & PP_FLAG_DMA_MAP))
+		return;
+
+	/* DMA unmap */
+	dma_unmap_page(pool->p.dev, page_private(page),
+		       PAGE_SIZE << pool->p.order, pool->p.dma_dir);
+	set_page_private(page, 0);
+}
+
+/* Return a page to the page allocator, cleaning up our state */
+static void __page_pool_return_page(struct page_pool *pool, struct page *page)
+{
+	__page_pool_clean_page(pool, page);
+	put_page(page);
+	/* An optimization would be to call __free_pages(page, pool->p.order)
+	 * knowing page is not part of page-cache (thus avoiding a
+	 * __page_cache_release() call).
+	 */
+}
+
+static bool __page_pool_recycle_into_ring(struct page_pool *pool,
+				   struct page *page)
+{
+	int ret;
+	/* BH protection not needed if current is serving softirq */
+	if (in_serving_softirq())
+		ret = ptr_ring_produce(&pool->ring, page);
+	else
+		ret = ptr_ring_produce_bh(&pool->ring, page);
+
+	return (ret == 0) ? true : false;
+}
+
+/* Only allow direct recycling in special circumstances, into the
+ * alloc side cache.  E.g. during RX-NAPI processing for XDP_DROP use-case.
+ *
+ * Caller must provide appropriate safe context.
+ */
+static bool __page_pool_recycle_direct(struct page *page,
+				       struct page_pool *pool)
+{
+	if (unlikely(pool->alloc.count == PP_ALLOC_CACHE_SIZE))
+		return false;
+
+	/* Caller MUST have verified/know (page_ref_count(page) == 1) */
+	pool->alloc.cache[pool->alloc.count++] = page;
+	return true;
+}
+
+void __page_pool_put_page(struct page_pool *pool,
+			  struct page *page, bool allow_direct)
+{
+	/* This allocator is optimized for the XDP mode that uses
+	 * one-frame-per-page, but have fallbacks that act like the
+	 * regular page allocator APIs.
+	 *
+	 * refcnt == 1 means page_pool owns page, and can recycle it.
+	 */
+	if (likely(page_ref_count(page) == 1)) {
+		/* Read barrier done in page_ref_count / READ_ONCE */
+
+		if (allow_direct && in_serving_softirq())
+			if (__page_pool_recycle_direct(page, pool))
+				return;
+
+		if (!__page_pool_recycle_into_ring(pool, page)) {
+			/* Cache full, fallback to free pages */
+			__page_pool_return_page(pool, page);
+		}
+		return;
+	}
+	/* Fallback/non-XDP mode: API user have elevated refcnt.
+	 *
+	 * Many drivers split up the page into fragments, and some
+	 * want to keep doing this to save memory and do refcnt based
+	 * recycling. Support this use case too, to ease drivers
+	 * switching between XDP/non-XDP.
+	 *
+	 * In-case page_pool maintains the DMA mapping, API user must
+	 * call page_pool_put_page once.  In this elevated refcnt
+	 * case, the DMA is unmapped/released, as driver is likely
+	 * doing refcnt based recycle tricks, meaning another process
+	 * will be invoking put_page.
+	 */
+	__page_pool_clean_page(pool, page);
+	put_page(page);
+}
+EXPORT_SYMBOL(__page_pool_put_page);
+
+static void __page_pool_empty_ring(struct page_pool *pool)
+{
+	struct page *page;
+
+	/* Empty recycle ring */
+	while ((page = ptr_ring_consume(&pool->ring))) {
+		/* Verify the refcnt invariant of cached pages */
+		if (!(page_ref_count(page) == 1))
+			pr_crit("%s() page_pool refcnt %d violation\n",
+				__func__, page_ref_count(page));
+
+		__page_pool_return_page(pool, page);
+	}
+}
+
+static void __page_pool_destroy_rcu(struct rcu_head *rcu)
+{
+	struct page_pool *pool;
+
+	pool = container_of(rcu, struct page_pool, rcu);
+
+	WARN(pool->alloc.count, "API usage violation");
+
+	__page_pool_empty_ring(pool);
+	ptr_ring_cleanup(&pool->ring, NULL);
+	kfree(pool);
+}
+
+/* Cleanup and release resources */
+void page_pool_destroy(struct page_pool *pool)
+{
+	struct page *page;
+
+	/* Empty alloc cache, assume caller made sure this is
+	 * no-longer in use, and page_pool_alloc_pages() cannot be
+	 * call concurrently.
+	 */
+	while (pool->alloc.count) {
+		page = pool->alloc.cache[--pool->alloc.count];
+		__page_pool_return_page(pool, page);
+	}
+
+	/* No more consumers should exist, but producers could still
+	 * be in-flight.
+	 */
+	__page_pool_empty_ring(pool);
+
+	/* An xdp_mem_allocator can still ref page_pool pointer */
+	call_rcu(&pool->rcu, __page_pool_destroy_rcu);
+}
+EXPORT_SYMBOL(page_pool_destroy);

+ 269 - 0
net/core/xdp.c

@@ -5,6 +5,10 @@
  */
 #include <linux/types.h>
 #include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/idr.h>
+#include <linux/rhashtable.h>
+#include <net/page_pool.h>
 
 #include <net/xdp.h>
 
@@ -13,6 +17,104 @@
 #define REG_STATE_UNREGISTERED	0x2
 #define REG_STATE_UNUSED	0x3
 
+static DEFINE_IDA(mem_id_pool);
+static DEFINE_MUTEX(mem_id_lock);
+#define MEM_ID_MAX 0xFFFE
+#define MEM_ID_MIN 1
+static int mem_id_next = MEM_ID_MIN;
+
+static bool mem_id_init; /* false */
+static struct rhashtable *mem_id_ht;
+
+struct xdp_mem_allocator {
+	struct xdp_mem_info mem;
+	union {
+		void *allocator;
+		struct page_pool *page_pool;
+	};
+	struct rhash_head node;
+	struct rcu_head rcu;
+};
+
+static u32 xdp_mem_id_hashfn(const void *data, u32 len, u32 seed)
+{
+	const u32 *k = data;
+	const u32 key = *k;
+
+	BUILD_BUG_ON(FIELD_SIZEOF(struct xdp_mem_allocator, mem.id)
+		     != sizeof(u32));
+
+	/* Use cyclic increasing ID as direct hash key, see rht_bucket_index */
+	return key << RHT_HASH_RESERVED_SPACE;
+}
+
+static int xdp_mem_id_cmp(struct rhashtable_compare_arg *arg,
+			  const void *ptr)
+{
+	const struct xdp_mem_allocator *xa = ptr;
+	u32 mem_id = *(u32 *)arg->key;
+
+	return xa->mem.id != mem_id;
+}
+
+static const struct rhashtable_params mem_id_rht_params = {
+	.nelem_hint = 64,
+	.head_offset = offsetof(struct xdp_mem_allocator, node),
+	.key_offset  = offsetof(struct xdp_mem_allocator, mem.id),
+	.key_len = FIELD_SIZEOF(struct xdp_mem_allocator, mem.id),
+	.max_size = MEM_ID_MAX,
+	.min_size = 8,
+	.automatic_shrinking = true,
+	.hashfn    = xdp_mem_id_hashfn,
+	.obj_cmpfn = xdp_mem_id_cmp,
+};
+
+static void __xdp_mem_allocator_rcu_free(struct rcu_head *rcu)
+{
+	struct xdp_mem_allocator *xa;
+
+	xa = container_of(rcu, struct xdp_mem_allocator, rcu);
+
+	/* Allow this ID to be reused */
+	ida_simple_remove(&mem_id_pool, xa->mem.id);
+
+	/* Notice, driver is expected to free the *allocator,
+	 * e.g. page_pool, and MUST also use RCU free.
+	 */
+
+	/* Poison memory */
+	xa->mem.id = 0xFFFF;
+	xa->mem.type = 0xF0F0;
+	xa->allocator = (void *)0xDEAD9001;
+
+	kfree(xa);
+}
+
+static void __xdp_rxq_info_unreg_mem_model(struct xdp_rxq_info *xdp_rxq)
+{
+	struct xdp_mem_allocator *xa;
+	int id = xdp_rxq->mem.id;
+	int err;
+
+	if (id == 0)
+		return;
+
+	mutex_lock(&mem_id_lock);
+
+	xa = rhashtable_lookup(mem_id_ht, &id, mem_id_rht_params);
+	if (!xa) {
+		mutex_unlock(&mem_id_lock);
+		return;
+	}
+
+	err = rhashtable_remove_fast(mem_id_ht, &xa->node, mem_id_rht_params);
+	WARN_ON(err);
+
+	call_rcu(&xa->rcu, __xdp_mem_allocator_rcu_free);
+
+	mutex_unlock(&mem_id_lock);
+}
+
 void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 {
 	/* Simplify driver cleanup code paths, allow unreg "unused" */
@@ -21,8 +123,14 @@ void xdp_rxq_info_unreg(struct xdp_rxq_info *xdp_rxq)
 
 	WARN(!(xdp_rxq->reg_state == REG_STATE_REGISTERED), "Driver BUG");
 
+	__xdp_rxq_info_unreg_mem_model(xdp_rxq);
+
 	xdp_rxq->reg_state = REG_STATE_UNREGISTERED;
 	xdp_rxq->dev = NULL;
+
+	/* Reset mem info to defaults */
+	xdp_rxq->mem.id = 0;
+	xdp_rxq->mem.type = 0;
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_unreg);
 
@@ -71,3 +179,164 @@ bool xdp_rxq_info_is_reg(struct xdp_rxq_info *xdp_rxq)
 	return (xdp_rxq->reg_state == REG_STATE_REGISTERED);
 }
 EXPORT_SYMBOL_GPL(xdp_rxq_info_is_reg);
+
+static int __mem_id_init_hash_table(void)
+{
+	struct rhashtable *rht;
+	int ret;
+
+	if (unlikely(mem_id_init))
+		return 0;
+
+	rht = kzalloc(sizeof(*rht), GFP_KERNEL);
+	if (!rht)
+		return -ENOMEM;
+
+	ret = rhashtable_init(rht, &mem_id_rht_params);
+	if (ret < 0) {
+		kfree(rht);
+		return ret;
+	}
+	mem_id_ht = rht;
+	smp_mb(); /* mutex lock should provide enough pairing */
+	mem_id_init = true;
+
+	return 0;
+}
+
+/* Allocate a cyclic ID that maps to allocator pointer.
+ * See: https://www.kernel.org/doc/html/latest/core-api/idr.html
+ *
+ * Caller must lock mem_id_lock.
+ */
+static int __mem_id_cyclic_get(gfp_t gfp)
+{
+	int retries = 1;
+	int id;
+
+again:
+	id = ida_simple_get(&mem_id_pool, mem_id_next, MEM_ID_MAX, gfp);
+	if (id < 0) {
+		if (id == -ENOSPC) {
+			/* Cyclic allocator, reset next id */
+			if (retries--) {
+				mem_id_next = MEM_ID_MIN;
+				goto again;
+			}
+		}
+		return id; /* errno */
+	}
+	mem_id_next = id + 1;
+
+	return id;
+}
+
+static bool __is_supported_mem_type(enum xdp_mem_type type)
+{
+	if (type == MEM_TYPE_PAGE_POOL)
+		return is_page_pool_compiled_in();
+
+	if (type >= MEM_TYPE_MAX)
+		return false;
+
+	return true;
+}
+
+int xdp_rxq_info_reg_mem_model(struct xdp_rxq_info *xdp_rxq,
+			       enum xdp_mem_type type, void *allocator)
+{
+	struct xdp_mem_allocator *xdp_alloc;
+	gfp_t gfp = GFP_KERNEL;
+	int id, errno, ret;
+	void *ptr;
+
+	if (xdp_rxq->reg_state != REG_STATE_REGISTERED) {
+		WARN(1, "Missing register, driver bug");
+		return -EFAULT;
+	}
+
+	if (!__is_supported_mem_type(type))
+		return -EOPNOTSUPP;
+
+	xdp_rxq->mem.type = type;
+
+	if (!allocator) {
+		if (type == MEM_TYPE_PAGE_POOL)
+			return -EINVAL; /* Setup time check page_pool req */
+		return 0;
+	}
+
+	/* Delay init of rhashtable to save memory if feature isn't used */
+	if (!mem_id_init) {
+		mutex_lock(&mem_id_lock);
+		ret = __mem_id_init_hash_table();
+		mutex_unlock(&mem_id_lock);
+		if (ret < 0) {
+			WARN_ON(1);
+			return ret;
+		}
+	}
+
+	xdp_alloc = kzalloc(sizeof(*xdp_alloc), gfp);
+	if (!xdp_alloc)
+		return -ENOMEM;
+
+	mutex_lock(&mem_id_lock);
+	id = __mem_id_cyclic_get(gfp);
+	if (id < 0) {
+		errno = id;
+		goto err;
+	}
+	xdp_rxq->mem.id = id;
+	xdp_alloc->mem  = xdp_rxq->mem;
+	xdp_alloc->allocator = allocator;
+
+	/* Insert allocator into ID lookup table */
+	ptr = rhashtable_insert_slow(mem_id_ht, &id, &xdp_alloc->node);
+	if (IS_ERR(ptr)) {
+		errno = PTR_ERR(ptr);
+		goto err;
+	}
+
+	mutex_unlock(&mem_id_lock);
+
+	return 0;
+err:
+	mutex_unlock(&mem_id_lock);
+	kfree(xdp_alloc);
+	return errno;
+}
+EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model);
+
+void xdp_return_frame(struct xdp_frame *xdpf)
+{
+	struct xdp_mem_info *mem = &xdpf->mem;
+	struct xdp_mem_allocator *xa;
+	void *data = xdpf->data;
+	struct page *page;
+
+	switch (mem->type) {
+	case MEM_TYPE_PAGE_POOL:
+		rcu_read_lock();
+		/* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */
+		xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params);
+		page = virt_to_head_page(data);
+		if (xa)
+			page_pool_put_page(xa->page_pool, page);
+		else
+			put_page(page);
+		rcu_read_unlock();
+		break;
+	case MEM_TYPE_PAGE_SHARED:
+		page_frag_free(data);
+		break;
+	case MEM_TYPE_PAGE_ORDER0:
+		page = virt_to_page(data); /* Assumes order0 page*/
+		put_page(page);
+		break;
+	default:
+		/* Not possible, checked in xdp_rxq_info_reg_mem_model() */
+		break;
+	}
+}
+EXPORT_SYMBOL_GPL(xdp_return_frame);