xdp_umem.c

// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048
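
/* Add and remove sockets that share this umem. The list is protected by
 * xsk_list_lock on the update side and walked under RCU by readers.
 */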
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
        unsigned long flags;

        spin_lock_irqsave(&umem->xsk_list_lock, flags);
        list_add_rcu(&xs->list, &umem->xsk_list);
        spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
        unsigned long flags;

        spin_lock_irqsave(&umem->xsk_list_lock, flags);
        list_del_rcu(&xs->list);
        spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

/* The umem is stored both in the _rx struct and the _tx struct as we do
 * not know if the device has more tx queues than rx, or the opposite.
 * This might also change during run time.
 */
static void xdp_reg_umem_at_qid(struct net_device *dev, struct xdp_umem *umem,
                                u16 queue_id)
{
        if (queue_id < dev->real_num_rx_queues)
                dev->_rx[queue_id].umem = umem;
        if (queue_id < dev->real_num_tx_queues)
                dev->_tx[queue_id].umem = umem;
}
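
/* Return the umem bound to the given queue, or NULL if none is bound. */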
struct xdp_umem *xdp_get_umem_from_qid(struct net_device *dev,
                                       u16 queue_id)
{
        if (queue_id < dev->real_num_rx_queues)
                return dev->_rx[queue_id].umem;
        if (queue_id < dev->real_num_tx_queues)
                return dev->_tx[queue_id].umem;

        return NULL;
}

static void xdp_clear_umem_at_qid(struct net_device *dev, u16 queue_id)
{
        if (queue_id < dev->real_num_rx_queues)
                dev->_rx[queue_id].umem = NULL;
        if (queue_id < dev->real_num_tx_queues)
                dev->_tx[queue_id].umem = NULL;
}
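
/* Bind the umem to a device queue. Zero-copy mode is attempted through the
 * driver's ndo_bpf/XDP_SETUP_XSK_UMEM hook; if the driver lacks support and
 * XDP_ZEROCOPY was not forced, the bind falls back to copy mode.
 */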
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
                        u16 queue_id, u16 flags)
{
        bool force_zc, force_copy;
        struct netdev_bpf bpf;
        int err = 0;

        force_zc = flags & XDP_ZEROCOPY;
        force_copy = flags & XDP_COPY;

        if (force_zc && force_copy)
                return -EINVAL;

        rtnl_lock();
        if (xdp_get_umem_from_qid(dev, queue_id)) {
                err = -EBUSY;
                goto out_rtnl_unlock;
        }

        xdp_reg_umem_at_qid(dev, umem, queue_id);
        umem->dev = dev;
        umem->queue_id = queue_id;
        if (force_copy)
                /* For copy-mode, we are done. */
                goto out_rtnl_unlock;

        if (!dev->netdev_ops->ndo_bpf ||
            !dev->netdev_ops->ndo_xsk_async_xmit) {
                err = -EOPNOTSUPP;
                goto err_unreg_umem;
        }

        bpf.command = XDP_SETUP_XSK_UMEM;
        bpf.xsk.umem = umem;
        bpf.xsk.queue_id = queue_id;

        err = dev->netdev_ops->ndo_bpf(dev, &bpf);
        if (err)
                goto err_unreg_umem;
        rtnl_unlock();

        dev_hold(dev);
        umem->zc = true;
        return 0;

err_unreg_umem:
        xdp_clear_umem_at_qid(dev, queue_id);
        if (!force_zc)
                err = 0; /* fallback to copy mode */
out_rtnl_unlock:
        rtnl_unlock();
        return err;
}
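
/* Undo xdp_umem_assign_dev(): tell a zero-copy driver to drop the umem,
 * clear the queue binding and release the device reference.
 */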
static void xdp_umem_clear_dev(struct xdp_umem *umem)
{
        struct netdev_bpf bpf;
        int err;

        if (umem->zc) {
                bpf.command = XDP_SETUP_XSK_UMEM;
                bpf.xsk.umem = NULL;
                bpf.xsk.queue_id = umem->queue_id;

                rtnl_lock();
                err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
                rtnl_unlock();

                if (err)
                        WARN(1, "failed to disable umem!\n");
        }

        if (umem->dev) {
                rtnl_lock();
                xdp_clear_umem_at_qid(umem->dev, umem->queue_id);
                rtnl_unlock();
        }

        if (umem->zc) {
                dev_put(umem->dev);
                umem->zc = false;
        }
}
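
/* Release the pages pinned by xdp_umem_pin_pages(), marking them dirty
 * since the kernel may have written packet data into them.
 */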
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
        unsigned int i;

        for (i = 0; i < umem->npgs; i++) {
                struct page *page = umem->pgs[i];

                set_page_dirty_lock(page);
                put_page(page);
        }

        kfree(umem->pgs);
        umem->pgs = NULL;
}

static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
        if (umem->user) {
                atomic_long_sub(umem->npgs, &umem->user->locked_vm);
                free_uid(umem->user);
        }
}
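
/* Final teardown once the last reference is gone: detach from the device,
 * destroy the fill and completion queues, unpin the user pages and return
 * the locked-memory accounting before freeing the umem itself.
 */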
static void xdp_umem_release(struct xdp_umem *umem)
{
        struct task_struct *task;
        struct mm_struct *mm;

        xdp_umem_clear_dev(umem);

        if (umem->fq) {
                xskq_destroy(umem->fq);
                umem->fq = NULL;
        }

        if (umem->cq) {
                xskq_destroy(umem->cq);
                umem->cq = NULL;
        }

        xsk_reuseq_destroy(umem);

        xdp_umem_unpin_pages(umem);

        task = get_pid_task(umem->pid, PIDTYPE_PID);
        put_pid(umem->pid);
        if (!task)
                goto out;
        mm = get_task_mm(task);
        put_task_struct(task);
        if (!mm)
                goto out;

        mmput(mm);
        kfree(umem->pages);
        umem->pages = NULL;

        xdp_umem_unaccount_pages(umem);
out:
        kfree(umem);
}

static void xdp_umem_release_deferred(struct work_struct *work)
{
        struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

        xdp_umem_release(umem);
}
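
/* Reference counting for the umem. The actual release is deferred to a
 * workqueue because teardown takes rtnl_lock and other sleeping locks,
 * which may not be safe in the context of the final xdp_put_umem() caller.
 */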
void xdp_get_umem(struct xdp_umem *umem)
{
        refcount_inc(&umem->users);
}

void xdp_put_umem(struct xdp_umem *umem)
{
        if (!umem)
                return;

        if (refcount_dec_and_test(&umem->users)) {
                INIT_WORK(&umem->work, xdp_umem_release_deferred);
                schedule_work(&umem->work);
        }
}
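
/* Pin the user-space memory area with get_user_pages() so the frames stay
 * resident for the lifetime of the umem. On a short pin, whatever was
 * pinned is released again and -ENOMEM is returned.
 */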
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
        unsigned int gup_flags = FOLL_WRITE;
        long npgs;
        int err;

        umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
                            GFP_KERNEL | __GFP_NOWARN);
        if (!umem->pgs)
                return -ENOMEM;

        down_write(&current->mm->mmap_sem);
        npgs = get_user_pages(umem->address, umem->npgs,
                              gup_flags, &umem->pgs[0], NULL);
        up_write(&current->mm->mmap_sem);

        if (npgs != umem->npgs) {
                if (npgs >= 0) {
                        umem->npgs = npgs;
                        err = -ENOMEM;
                        goto out_pin;
                }
                err = npgs;
                goto out_pgs;
        }
        return 0;

out_pin:
        xdp_umem_unpin_pages(umem);
out_pgs:
        kfree(umem->pgs);
        umem->pgs = NULL;
        return err;
}
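
/* Charge the pinned pages against the owner's RLIMIT_MEMLOCK limit.
 * Tasks with CAP_IPC_LOCK are exempt from the accounting.
 */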
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
        unsigned long lock_limit, new_npgs, old_npgs;

        if (capable(CAP_IPC_LOCK))
                return 0;

        lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
        umem->user = get_uid(current_user());

        do {
                old_npgs = atomic_long_read(&umem->user->locked_vm);
                new_npgs = old_npgs + umem->npgs;
                if (new_npgs > lock_limit) {
                        free_uid(umem->user);
                        umem->user = NULL;
                        return -ENOBUFS;
                }
        } while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
                                     new_npgs) != old_npgs);
        return 0;
}
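
/* Validate the registration request and set up the umem: the chunk size
 * must be a power of two between XDP_UMEM_MIN_CHUNK_SIZE and PAGE_SIZE,
 * the area must be page aligned, and the headroom must leave room for
 * XDP_PACKET_HEADROOM. On success the pages are accounted and pinned.
 */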
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
        u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
        unsigned int chunks, chunks_per_page;
        u64 addr = mr->addr, size = mr->len;
        int size_chk, err, i;

        if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
                /* Strictly speaking we could support this, if:
                 * - huge pages, or
                 * - using an IOMMU, or
                 * - making sure the memory area is consecutive
                 * but for now, we simply say "computer says no".
                 */
                return -EINVAL;
        }

        if (!is_power_of_2(chunk_size))
                return -EINVAL;

        if (!PAGE_ALIGNED(addr)) {
                /* Memory area has to be page size aligned. For
                 * simplicity, this might change in the future.
                 */
                return -EINVAL;
        }

        if ((addr + size) < addr)
                return -EINVAL;

        chunks = (unsigned int)div_u64(size, chunk_size);
        if (chunks == 0)
                return -EINVAL;

        chunks_per_page = PAGE_SIZE / chunk_size;
        if (chunks < chunks_per_page || chunks % chunks_per_page)
                return -EINVAL;

        headroom = ALIGN(headroom, 64);

        size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
        if (size_chk < 0)
                return -EINVAL;

        umem->pid = get_task_pid(current, PIDTYPE_PID);
        umem->address = (unsigned long)addr;
        umem->chunk_mask = ~((u64)chunk_size - 1);
        umem->size = size;
        umem->headroom = headroom;
        umem->chunk_size_nohr = chunk_size - headroom;
        umem->npgs = size / PAGE_SIZE;
        umem->pgs = NULL;
        umem->user = NULL;
        INIT_LIST_HEAD(&umem->xsk_list);
        spin_lock_init(&umem->xsk_list_lock);

        refcount_set(&umem->users, 1);

        err = xdp_umem_account_pages(umem);
        if (err)
                goto out;

        err = xdp_umem_pin_pages(umem);
        if (err)
                goto out_account;

        umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
        if (!umem->pages) {
                err = -ENOMEM;
                goto out_account;
        }

        for (i = 0; i < umem->npgs; i++)
                umem->pages[i].addr = page_address(umem->pgs[i]);

        return 0;

out_account:
        xdp_umem_unaccount_pages(umem);
out:
        put_pid(umem->pid);
        return err;
}
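
/* Allocate a umem and register the user-supplied memory area described by
 * @mr. Returns the new umem or an ERR_PTR() on failure.
 */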
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
        struct xdp_umem *umem;
        int err;

        umem = kzalloc(sizeof(*umem), GFP_KERNEL);
        if (!umem)
                return ERR_PTR(-ENOMEM);

        err = xdp_umem_reg(umem, mr);
        if (err) {
                kfree(umem);
                return ERR_PTR(err);
        }

        return umem;
}
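
/* A umem is ready for use once both its fill and completion queues exist. */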
bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
        return umem->fq && umem->cq;
}