xdp_umem.c

// SPDX-License-Identifier: GPL-2.0
/* XDP user-space packet buffer
 * Copyright(c) 2018 Intel Corporation.
 */

#include <linux/init.h>
#include <linux/sched/mm.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h>
#include <linux/uaccess.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/mm.h>
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>

#include "xdp_umem.h"
#include "xsk_queue.h"

#define XDP_UMEM_MIN_CHUNK_SIZE 2048

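/* xdp_add_sk_umem() and xdp_del_sk_umem() maintain the list of XDP
 * sockets sharing this umem. The list uses RCU list primitives;
 * xsk_list_lock only serializes writers.
 */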
void xdp_add_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	spin_lock_irqsave(&umem->xsk_list_lock, flags);
	list_add_rcu(&xs->list, &umem->xsk_list);
	spin_unlock_irqrestore(&umem->xsk_list_lock, flags);
}

void xdp_del_sk_umem(struct xdp_umem *umem, struct xdp_sock *xs)
{
	unsigned long flags;

	if (xs->dev) {
		spin_lock_irqsave(&umem->xsk_list_lock, flags);
		list_del_rcu(&xs->list);
		spin_unlock_irqrestore(&umem->xsk_list_lock, flags);

		if (umem->zc)
			synchronize_net();
	}
}

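/* Ask the driver, under RTNL, whether a umem is already installed on
 * @queue_id: returns a negative errno from ndo_bpf, 0 if the queue is
 * free (or the driver lacks ndo_bpf), and 1 if a umem is present.
 */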
int xdp_umem_query(struct net_device *dev, u16 queue_id)
{
	struct netdev_bpf bpf;

	ASSERT_RTNL();

	memset(&bpf, 0, sizeof(bpf));
	bpf.command = XDP_QUERY_XSK_UMEM;
	bpf.xsk.queue_id = queue_id;

	if (!dev->netdev_ops->ndo_bpf)
		return 0;
	return dev->netdev_ops->ndo_bpf(dev, &bpf) ?: !!bpf.xsk.umem;
}

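/* Try to bind the umem to @queue_id on @dev in zero-copy mode. Unless
 * the caller forced XDP_ZEROCOPY, any failure falls back to copy mode
 * and 0 is returned; with XDP_ZEROCOPY set, the error is propagated.
 */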
int xdp_umem_assign_dev(struct xdp_umem *umem, struct net_device *dev,
			u32 queue_id, u16 flags)
{
	bool force_zc, force_copy;
	struct netdev_bpf bpf;
	int err;

	force_zc = flags & XDP_ZEROCOPY;
	force_copy = flags & XDP_COPY;

	if (force_zc && force_copy)
		return -EINVAL;

	if (force_copy)
		return 0;

	if (!dev->netdev_ops->ndo_bpf || !dev->netdev_ops->ndo_xsk_async_xmit)
		return force_zc ? -ENOTSUPP : 0; /* fail or fallback */

	bpf.command = XDP_QUERY_XSK_UMEM;

	rtnl_lock();
	err = xdp_umem_query(dev, queue_id);
	if (err) {
		err = err < 0 ? -ENOTSUPP : -EBUSY;
		goto err_rtnl_unlock;
	}

	bpf.command = XDP_SETUP_XSK_UMEM;
	bpf.xsk.umem = umem;
	bpf.xsk.queue_id = queue_id;

	err = dev->netdev_ops->ndo_bpf(dev, &bpf);
	if (err)
		goto err_rtnl_unlock;
	rtnl_unlock();

	dev_hold(dev);
	umem->dev = dev;
	umem->queue_id = queue_id;
	umem->zc = true;
	return 0;

err_rtnl_unlock:
	rtnl_unlock();
	return force_zc ? err : 0; /* fail or fallback */
}

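/* Detach the umem from its device: issue XDP_SETUP_XSK_UMEM with a
 * NULL umem under RTNL, then drop the device reference.
 */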
static void xdp_umem_clear_dev(struct xdp_umem *umem)
{
	struct netdev_bpf bpf;
	int err;

	if (umem->dev) {
		bpf.command = XDP_SETUP_XSK_UMEM;
		bpf.xsk.umem = NULL;
		bpf.xsk.queue_id = umem->queue_id;

		rtnl_lock();
		err = umem->dev->netdev_ops->ndo_bpf(umem->dev, &bpf);
		rtnl_unlock();

		if (err)
			WARN(1, "failed to disable umem!\n");

		dev_put(umem->dev);
		umem->dev = NULL;
	}
}

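/* Release the pinned user pages: mark them dirty, since the kernel may
 * have written frame data into them, and drop the page references.
 */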
static void xdp_umem_unpin_pages(struct xdp_umem *umem)
{
	unsigned int i;

	for (i = 0; i < umem->npgs; i++) {
		struct page *page = umem->pgs[i];

		set_page_dirty_lock(page);
		put_page(page);
	}

	kfree(umem->pgs);
	umem->pgs = NULL;
}

static void xdp_umem_unaccount_pages(struct xdp_umem *umem)
{
	if (umem->user) {
		atomic_long_sub(umem->npgs, &umem->user->locked_vm);
		free_uid(umem->user);
	}
}

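/* Final teardown, run from a workqueue once the last reference is
 * dropped: detach from the device, destroy the fill and completion
 * queues, unpin and unaccount the pages, and free the umem itself.
 */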
static void xdp_umem_release(struct xdp_umem *umem)
{
	struct task_struct *task;
	struct mm_struct *mm;

	xdp_umem_clear_dev(umem);

	if (umem->fq) {
		xskq_destroy(umem->fq);
		umem->fq = NULL;
	}

	if (umem->cq) {
		xskq_destroy(umem->cq);
		umem->cq = NULL;
	}

	xdp_umem_unpin_pages(umem);

	task = get_pid_task(umem->pid, PIDTYPE_PID);
	put_pid(umem->pid);
	if (!task)
		goto out;
	mm = get_task_mm(task);
	put_task_struct(task);
	if (!mm)
		goto out;

	mmput(mm);
	kfree(umem->pages);
	umem->pages = NULL;

	xdp_umem_unaccount_pages(umem);
out:
	kfree(umem);
}

static void xdp_umem_release_deferred(struct work_struct *work)
{
	struct xdp_umem *umem = container_of(work, struct xdp_umem, work);

	xdp_umem_release(umem);
}

void xdp_get_umem(struct xdp_umem *umem)
{
	refcount_inc(&umem->users);
}

void xdp_put_umem(struct xdp_umem *umem)
{
	if (!umem)
		return;

	if (refcount_dec_and_test(&umem->users)) {
		INIT_WORK(&umem->work, xdp_umem_release_deferred);
		schedule_work(&umem->work);
	}
}

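/* Pin the user memory with get_user_pages() (FOLL_WRITE) while holding
 * mmap_sem. A partial pin is treated as failure: the pages that were
 * pinned are released and -ENOMEM is returned.
 */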
static int xdp_umem_pin_pages(struct xdp_umem *umem)
{
	unsigned int gup_flags = FOLL_WRITE;
	long npgs;
	int err;

	umem->pgs = kcalloc(umem->npgs, sizeof(*umem->pgs),
			    GFP_KERNEL | __GFP_NOWARN);
	if (!umem->pgs)
		return -ENOMEM;

	down_write(&current->mm->mmap_sem);
	npgs = get_user_pages(umem->address, umem->npgs,
			      gup_flags, &umem->pgs[0], NULL);
	up_write(&current->mm->mmap_sem);

	if (npgs != umem->npgs) {
		if (npgs >= 0) {
			umem->npgs = npgs;
			err = -ENOMEM;
			goto out_pin;
		}
		err = npgs;
		goto out_pgs;
	}
	return 0;

out_pin:
	xdp_umem_unpin_pages(umem);
out_pgs:
	kfree(umem->pgs);
	umem->pgs = NULL;
	return err;
}

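/* Charge the pinned pages against the user's RLIMIT_MEMLOCK unless the
 * caller has CAP_IPC_LOCK. The cmpxchg loop updates locked_vm without
 * holding a lock.
 */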
static int xdp_umem_account_pages(struct xdp_umem *umem)
{
	unsigned long lock_limit, new_npgs, old_npgs;

	if (capable(CAP_IPC_LOCK))
		return 0;

	lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
	umem->user = get_uid(current_user());

	do {
		old_npgs = atomic_long_read(&umem->user->locked_vm);
		new_npgs = old_npgs + umem->npgs;
		if (new_npgs > lock_limit) {
			free_uid(umem->user);
			umem->user = NULL;
			return -ENOBUFS;
		}
	} while (atomic_long_cmpxchg(&umem->user->locked_vm, old_npgs,
				     new_npgs) != old_npgs);
	return 0;
}

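/* Validate a registration request (chunk size, alignment, headroom)
 * and set up the umem: account the memory against the memlock limit,
 * pin the pages and record the kernel address of each page.
 */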
static int xdp_umem_reg(struct xdp_umem *umem, struct xdp_umem_reg *mr)
{
	u32 chunk_size = mr->chunk_size, headroom = mr->headroom;
	unsigned int chunks, chunks_per_page;
	u64 addr = mr->addr, size = mr->len;
	int size_chk, err, i;

	if (chunk_size < XDP_UMEM_MIN_CHUNK_SIZE || chunk_size > PAGE_SIZE) {
		/* Strictly speaking we could support this, if:
		 * - huge pages, or
		 * - using an IOMMU, or
		 * - making sure the memory area is consecutive
		 * but for now, we simply say "computer says no".
		 */
		return -EINVAL;
	}

	if (!is_power_of_2(chunk_size))
		return -EINVAL;

	if (!PAGE_ALIGNED(addr)) {
		/* Memory area has to be page size aligned. For
		 * simplicity, this might change.
		 */
		return -EINVAL;
	}

	if ((addr + size) < addr)
		return -EINVAL;

	chunks = (unsigned int)div_u64(size, chunk_size);
	if (chunks == 0)
		return -EINVAL;

	chunks_per_page = PAGE_SIZE / chunk_size;
	if (chunks < chunks_per_page || chunks % chunks_per_page)
		return -EINVAL;

	headroom = ALIGN(headroom, 64);

	size_chk = chunk_size - headroom - XDP_PACKET_HEADROOM;
	if (size_chk < 0)
		return -EINVAL;

	umem->pid = get_task_pid(current, PIDTYPE_PID);
	umem->address = (unsigned long)addr;
	umem->props.chunk_mask = ~((u64)chunk_size - 1);
	umem->props.size = size;
	umem->headroom = headroom;
	umem->chunk_size_nohr = chunk_size - headroom;
	umem->npgs = size / PAGE_SIZE;
	umem->pgs = NULL;
	umem->user = NULL;

	INIT_LIST_HEAD(&umem->xsk_list);
	spin_lock_init(&umem->xsk_list_lock);

	refcount_set(&umem->users, 1);

	err = xdp_umem_account_pages(umem);
	if (err)
		goto out;

	err = xdp_umem_pin_pages(umem);
	if (err)
		goto out_account;

	umem->pages = kcalloc(umem->npgs, sizeof(*umem->pages), GFP_KERNEL);
	if (!umem->pages) {
		err = -ENOMEM;
		goto out_account;
	}

	for (i = 0; i < umem->npgs; i++)
		umem->pages[i].addr = page_address(umem->pgs[i]);

	return 0;

out_account:
	xdp_umem_unaccount_pages(umem);
out:
	put_pid(umem->pid);
	return err;
}

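/* Allocate a umem and register the user memory described by @mr.
 * Returns an ERR_PTR() on failure.
 */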
struct xdp_umem *xdp_umem_create(struct xdp_umem_reg *mr)
{
	struct xdp_umem *umem;
	int err;

	umem = kzalloc(sizeof(*umem), GFP_KERNEL);
	if (!umem)
		return ERR_PTR(-ENOMEM);

	err = xdp_umem_reg(umem, mr);
	if (err) {
		kfree(umem);
		return ERR_PTR(err);
	}

	return umem;
}

bool xdp_umem_validate_queues(struct xdp_umem *umem)
{
	return umem->fq && umem->cq;
}