sockmap.c

/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
/* A BPF sock_map is used to store sock objects. This is primarily used
 * for doing socket redirect with BPF helper routines.
 *
 * A sock map may have BPF programs attached to it; currently a program
 * used to parse packets and a program to provide a verdict and redirect
 * decision on the packet are supported. Any programs attached to a sock
 * map are inherited by sock objects when they are added to the map. If
 * no BPF programs are attached, the sock object may only be used for sock
 * redirect.
 *
 * A sock object may be in multiple maps, but can only inherit a single
 * parse or verdict program. If adding a sock object to a map would result
 * in having multiple parsing programs, the update returns an EBUSY error.
 *
 * For reference, this map is similar to the devmap used in the XDP context;
 * reviewing these together may be useful. For an example please review
 * ./samples/bpf/sockmap/.
 */
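/* Example (illustrative sketch, not part of this file): a minimal sk_skb
 * parser/verdict pair as it might appear in a BPF program built against
 * samples/sockmap. The section names, the map name "sock_map_rx", and the
 * fixed redirect key 0 are assumptions made for this example. The helper
 * bpf_sk_redirect_map() records the target map/key in the skb so that the
 * verdict return value resolves to __SK_REDIRECT below.
 *
 *	struct bpf_map_def SEC("maps") sock_map_rx = {
 *		.type		= BPF_MAP_TYPE_SOCKMAP,
 *		.key_size	= sizeof(int),
 *		.value_size	= sizeof(int),
 *		.max_entries	= 20,
 *	};
 *
 *	SEC("sk_skb1")
 *	int bpf_prog_parser(struct __sk_buff *skb)
 *	{
 *		return skb->len;	// consume the full record
 *	}
 *
 *	SEC("sk_skb2")
 *	int bpf_prog_verdict(struct __sk_buff *skb)
 *	{
 *		// redirect everything to the socket stored at index 0
 *		return bpf_sk_redirect_map(skb, &sock_map_rx, 0, 0);
 *	}
 */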
#include <linux/bpf.h>
#include <net/sock.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/strparser.h>
#include <net/tcp.h>

#define SOCK_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
struct bpf_stab {
	struct bpf_map map;
	struct sock **sock_map;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
};

enum smap_psock_state {
	SMAP_TX_RUNNING,
};

struct smap_psock_map_entry {
	struct list_head list;
	struct sock **entry;
};

struct smap_psock {
	struct rcu_head rcu;
	/* refcnt is used inside sk_callback_lock */
	u32 refcnt;

	/* datapath variables */
	struct sk_buff_head rxqueue;
	bool strp_enabled;

	/* datapath error path cache across tx work invocations */
	int save_rem;
	int save_off;
	struct sk_buff *save_skb;

	struct strparser strp;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
	struct list_head maps;

	/* Back reference used when sock callbacks trigger sockmap operations */
	struct sock *sock;
	unsigned long state;

	struct work_struct tx_work;
	struct work_struct gc_work;

	void (*save_data_ready)(struct sock *sk);
	void (*save_write_space)(struct sock *sk);
	void (*save_state_change)(struct sock *sk);
};
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
	return rcu_dereference_sk_user_data(sk);
}

/* compute the linear packet data range [data, data_end) for skb when
 * sk_skb type programs are in use.
 */
static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
}

enum __sk_action {
	__SK_DROP = 0,
	__SK_PASS,
	__SK_REDIRECT,
};
static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
	int rc;

	if (unlikely(!prog))
		return __SK_DROP;

	skb_orphan(skb);
	/* We need to ensure that BPF metadata for maps is also cleared
	 * when we orphan the skb so that we don't have the possibility
	 * to reference a stale map.
	 */
	TCP_SKB_CB(skb)->bpf.map = NULL;
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	preempt_disable();
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	preempt_enable();
	skb->sk = NULL;

	/* Moving return codes from UAPI namespace into internal namespace */
	return rc == SK_PASS ?
		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
		__SK_DROP;
}

static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
{
	struct sock *sk;
	int rc;

	rc = smap_verdict_func(psock, skb);
	switch (rc) {
	case __SK_REDIRECT:
		sk = do_sk_redirect_map(skb);
		if (likely(sk)) {
			struct smap_psock *peer = smap_psock_sk(sk);

			if (likely(peer &&
				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
				   !sock_flag(sk, SOCK_DEAD) &&
				   sock_writeable(sk))) {
				skb_set_owner_w(skb, sk);
				skb_queue_tail(&peer->rxqueue, skb);
				schedule_work(&peer->tx_work);
				break;
			}
		}
		/* Fall through and free skb otherwise */
	case __SK_DROP:
	default:
		kfree_skb(skb);
	}
}
static void smap_report_sk_error(struct smap_psock *psock, int err)
{
	struct sock *sk = psock->sock;

	sk->sk_err = err;
	sk->sk_error_report(sk);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);

/* Called with lock_sock(sk) held */
static void smap_state_change(struct sock *sk)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;
	struct socket_wq *wq;
	struct sock *osk;

	rcu_read_lock();

	/* Allowing transitions into established and syn_recv states allows
	 * for early binding of sockets to a smap object before the connection
	 * is established.
	 */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
	case TCP_ESTABLISHED:
		break;
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
	case TCP_LAST_ACK:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
	case TCP_LISTEN:
		break;
	case TCP_CLOSE:
		/* Only release if the map entry is in fact the sock in
		 * question. There is a case where the operator deletes
		 * the sock from the map, but the TCP sock is closed before
		 * the psock is detached. Use cmpxchg to verify correct
		 * sock is removed.
		 */
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		write_lock_bh(&sk->sk_callback_lock);
		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
			osk = cmpxchg(e->entry, sk, NULL);
			if (osk == sk) {
				list_del(&e->list);
				smap_release_sock(psock, sk);
			}
		}
		write_unlock_bh(&sk->sk_callback_lock);
		break;
	default:
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		smap_report_sk_error(psock, EPIPE);
		break;
	}

	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
static void smap_read_sock_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	smap_do_verdict(psock, skb);
	rcu_read_unlock();
}

/* Called with lock held on socket */
static void smap_data_ready(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock)) {
		write_lock_bh(&sk->sk_callback_lock);
		strp_data_ready(&psock->strp);
		write_unlock_bh(&sk->sk_callback_lock);
	}
	rcu_read_unlock();
}
static void smap_tx_work(struct work_struct *w)
{
	struct smap_psock *psock;
	struct sk_buff *skb;
	int rem, off, n;

	psock = container_of(w, struct smap_psock, tx_work);

	/* lock sock to avoid losing sk_socket at some point during loop */
	lock_sock(psock->sock);
	if (psock->save_skb) {
		skb = psock->save_skb;
		rem = psock->save_rem;
		off = psock->save_off;
		psock->save_skb = NULL;
		goto start;
	}

	while ((skb = skb_dequeue(&psock->rxqueue))) {
		rem = skb->len;
		off = 0;
start:
		do {
			if (likely(psock->sock->sk_socket))
				n = skb_send_sock_locked(psock->sock,
							 skb, off, rem);
			else
				n = -EINVAL;
			if (n <= 0) {
				if (n == -EAGAIN) {
					/* Retry when space is available */
					psock->save_skb = skb;
					psock->save_rem = rem;
					psock->save_off = off;
					goto out;
				}
				/* Hard errors break pipe and stop xmit */
				smap_report_sk_error(psock, n ? -n : EPIPE);
				clear_bit(SMAP_TX_RUNNING, &psock->state);
				kfree_skb(skb);
				goto out;
			}
			rem -= n;
			off += n;
		} while (rem);
		kfree_skb(skb);
	}
out:
	release_sock(psock->sock);
}
static void smap_write_space(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
		schedule_work(&psock->tx_work);
	rcu_read_unlock();
}

static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
{
	if (!psock->strp_enabled)
		return;

	sk->sk_data_ready = psock->save_data_ready;
	sk->sk_write_space = psock->save_write_space;
	sk->sk_state_change = psock->save_state_change;
	psock->save_data_ready = NULL;
	psock->save_write_space = NULL;
	psock->save_state_change = NULL;
	strp_stop(&psock->strp);
	psock->strp_enabled = false;
}

static void smap_destroy_psock(struct rcu_head *rcu)
{
	struct smap_psock *psock = container_of(rcu,
						struct smap_psock, rcu);

	/* Now that a grace period has passed there is no longer
	 * any reference to this sock in the sockmap so we can
	 * destroy the psock, strparser, and bpf programs. But,
	 * because we use workqueue sync operations we can not
	 * do it in rcu context
	 */
	schedule_work(&psock->gc_work);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
	psock->refcnt--;
	if (psock->refcnt)
		return;

	smap_stop_sock(psock, sock);
	clear_bit(SMAP_TX_RUNNING, &psock->state);
	rcu_assign_sk_user_data(sock, NULL);
	call_rcu_sched(&psock->rcu, smap_destroy_psock);
}
static int smap_parse_func_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;
	struct bpf_prog *prog;
	int rc;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	prog = READ_ONCE(psock->bpf_parse);

	if (unlikely(!prog)) {
		rcu_read_unlock();
		return skb->len;
	}

	/* Attach socket for bpf program to use if needed. We can do this
	 * because strparser clones the skb before handing it to an upper
	 * layer, meaning skb_orphan has been called. We NULL sk on the
	 * way out to ensure we don't trigger a BUG_ON() in skb/sk operations
	 * later and because we are not charging the memory of this skb to
	 * any socket yet.
	 */
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	skb->sk = NULL;
	rcu_read_unlock();
	return rc;
}

static int smap_read_sock_done(struct strparser *strp, int err)
{
	return err;
}
static int smap_init_sock(struct smap_psock *psock,
			  struct sock *sk)
{
	static const struct strp_callbacks cb = {
		.rcv_msg = smap_read_sock_strparser,
		.parse_msg = smap_parse_func_strparser,
		.read_sock_done = smap_read_sock_done,
	};

	return strp_init(&psock->strp, sk, &cb);
}

static void smap_init_progs(struct smap_psock *psock,
			    struct bpf_stab *stab,
			    struct bpf_prog *verdict,
			    struct bpf_prog *parse)
{
	struct bpf_prog *orig_parse, *orig_verdict;

	orig_parse = xchg(&psock->bpf_parse, parse);
	orig_verdict = xchg(&psock->bpf_verdict, verdict);

	if (orig_verdict)
		bpf_prog_put(orig_verdict);
	if (orig_parse)
		bpf_prog_put(orig_parse);
}

static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
{
	if (sk->sk_data_ready == smap_data_ready)
		return;

	psock->save_data_ready = sk->sk_data_ready;
	psock->save_write_space = sk->sk_write_space;
	psock->save_state_change = sk->sk_state_change;
	sk->sk_data_ready = smap_data_ready;
	sk->sk_write_space = smap_write_space;
	sk->sk_state_change = smap_state_change;
	psock->strp_enabled = true;
}

static void sock_map_remove_complete(struct bpf_stab *stab)
{
	bpf_map_area_free(stab->sock_map);
	kfree(stab);
}

static void smap_gc_work(struct work_struct *w)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;

	psock = container_of(w, struct smap_psock, gc_work);

	/* no callback lock needed because we already detached sockmap ops */
	if (psock->strp_enabled)
		strp_done(&psock->strp);

	cancel_work_sync(&psock->tx_work);
	__skb_queue_purge(&psock->rxqueue);

	/* At this point all strparser and xmit work must be complete */
	if (psock->bpf_parse)
		bpf_prog_put(psock->bpf_parse);
	if (psock->bpf_verdict)
		bpf_prog_put(psock->bpf_verdict);

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		list_del(&e->list);
		kfree(e);
	}

	sock_put(psock->sock);
	kfree(psock);
}

static struct smap_psock *smap_init_psock(struct sock *sock,
					  struct bpf_stab *stab)
{
	struct smap_psock *psock;

	psock = kzalloc_node(sizeof(struct smap_psock),
			     GFP_ATOMIC | __GFP_NOWARN,
			     stab->map.numa_node);
	if (!psock)
		return ERR_PTR(-ENOMEM);

	psock->sock = sock;
	skb_queue_head_init(&psock->rxqueue);
	INIT_WORK(&psock->tx_work, smap_tx_work);
	INIT_WORK(&psock->gc_work, smap_gc_work);
	INIT_LIST_HEAD(&psock->maps);
	psock->refcnt = 1;

	rcu_assign_sk_user_data(sock, psock);
	sock_hold(sock);
	return psock;
}
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
	struct bpf_stab *stab;
	int err = -EINVAL;
	u64 cost;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		return ERR_PTR(-E2BIG);

	stab = kzalloc(sizeof(*stab), GFP_USER);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	stab->map.map_type = attr->map_type;
	stab->map.key_size = attr->key_size;
	stab->map.value_size = attr->value_size;
	stab->map.max_entries = attr->max_entries;
	stab->map.map_flags = attr->map_flags;
	stab->map.numa_node = bpf_map_attr_numa_node(attr);

	/* make sure page count doesn't overflow */
	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_stab;

	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(stab->map.pages);
	if (err)
		goto free_stab;

	err = -ENOMEM;
	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
					    sizeof(struct sock *),
					    stab->map.numa_node);
	if (!stab->sock_map)
		goto free_stab;

	return &stab->map;
free_stab:
	kfree(stab);
	return ERR_PTR(err);
}
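/* Example (illustrative sketch): from user space a sockmap is created like
 * any other BPF map, with the 4-byte key and 4-byte value sizes enforced
 * above. This uses the bpf_create_map() wrapper from tools/lib/bpf; the
 * max_entries value of 20 is an arbitrary choice for the example:
 *
 *	int map_fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP,
 *				    sizeof(int), sizeof(int), 20, 0);
 *	if (map_fd < 0)
 *		return map_fd;	// creation failed, e.g. -EPERM or -EINVAL
 */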
static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
{
	struct smap_psock_map_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		if (e->entry == entry) {
			list_del(&e->list);
			break;
		}
	}
}

static void sock_map_free(struct bpf_map *map)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	int i;

	synchronize_rcu();

	/* At this point no update, lookup or delete operations can happen.
	 * However, be aware we can still get socket state event updates
	 * and data ready callbacks that reference the psock from sk_user_data.
	 * Also psock worker threads are still in-flight. So smap_release_sock
	 * will only free the psock after cancel_sync on the worker threads
	 * and a grace period expires to ensure the psock is really safe to
	 * remove.
	 */
	rcu_read_lock();
	for (i = 0; i < stab->map.max_entries; i++) {
		struct smap_psock *psock;
		struct sock *sock;

		sock = xchg(&stab->sock_map[i], NULL);
		if (!sock)
			continue;

		write_lock_bh(&sock->sk_callback_lock);
		psock = smap_psock_sk(sock);
		smap_list_remove(psock, &stab->sock_map[i]);
		smap_release_sock(psock, sock);
		write_unlock_bh(&sock->sk_callback_lock);
	}
	rcu_read_unlock();

	if (stab->bpf_verdict)
		bpf_prog_put(stab->bpf_verdict);
	if (stab->bpf_parse)
		bpf_prog_put(stab->bpf_parse);

	sock_map_remove_complete(stab);
}
static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	u32 i = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (i >= stab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (i == stab->map.max_entries - 1)
		return -ENOENT;

	*next = i + 1;
	return 0;
}

struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);

	if (key >= map->max_entries)
		return NULL;

	return READ_ONCE(stab->sock_map[key]);
}

static int sock_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock *psock;
	int k = *(u32 *)key;
	struct sock *sock;

	if (k >= map->max_entries)
		return -EINVAL;

	sock = xchg(&stab->sock_map[k], NULL);
	if (!sock)
		return -EINVAL;

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);
	if (!psock)
		goto out;

	if (psock->bpf_parse)
		smap_stop_sock(psock, sock);

	smap_list_remove(psock, &stab->sock_map[k]);
	smap_release_sock(psock, sock);
out:
	write_unlock_bh(&sock->sk_callback_lock);
	return 0;
}
/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
 * done inside rcu critical sections. This ensures on updates that the psock
 * will not be released via smap_release_sock() until concurrent updates/deletes
 * complete. All operations operate on sock_map using cmpxchg and xchg
 * operations to ensure we do not get stale references. Any reads into the
 * map must be done with READ_ONCE() because of this.
 *
 * A psock is destroyed via call_rcu and after any worker threads are cancelled
 * and synced, so we are certain all references from the update/lookup/delete
 * operations as well as references in the data path are no longer in use.
 *
 * Psocks may exist in multiple maps, but only a single set of parse/verdict
 * programs may be inherited from the maps it belongs to. A reference count
 * is kept with the total number of references to the psock from all maps. The
 * psock will not be released until this reaches zero. The psock and sock
 * user data use the sk_callback_lock to protect critical data structures
 * from concurrent access. This prevents two updates from modifying the
 * user data in the sock at the same time; the lock is required anyway for
 * modifying the callbacks, we simply increase its scope slightly.
 *
 * Rules to follow,
 * - psock must always be read inside RCU critical section
 * - sk_user_data must only be modified inside sk_callback_lock and read
 *   inside RCU critical section.
 * - psock->maps list must only be read & modified inside sk_callback_lock
 * - sock_map must use READ_ONCE and (cmp)xchg operations
 * - BPF verdict/parse programs must use READ_ONCE and xchg operations
 */
static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
				    struct bpf_map *map,
				    void *key, u64 flags)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock_map_entry *e = NULL;
	struct bpf_prog *verdict, *parse;
	struct sock *osock, *sock;
	struct smap_psock *psock;
	u32 i = *(u32 *)key;
	int err;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (unlikely(i >= stab->map.max_entries))
		return -E2BIG;

	sock = READ_ONCE(stab->sock_map[i]);
	if (flags == BPF_EXIST && !sock)
		return -ENOENT;
	else if (flags == BPF_NOEXIST && sock)
		return -EEXIST;

	sock = skops->sk;

	/* 1. If the sock map has BPF programs those will be inherited by the
	 * sock being added. If the sock is already attached to BPF programs
	 * this results in an error.
	 */
	verdict = READ_ONCE(stab->bpf_verdict);
	parse = READ_ONCE(stab->bpf_parse);

	if (parse && verdict) {
		/* bpf prog refcnt may be zero if a concurrent attach operation
		 * removes the program after the above READ_ONCE() but before
		 * we increment the refcnt. If this is the case abort with an
		 * error.
		 */
		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
		if (IS_ERR(verdict))
			return PTR_ERR(verdict);

		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
		if (IS_ERR(parse)) {
			bpf_prog_put(verdict);
			return PTR_ERR(parse);
		}
	}

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);

	/* 2. Do not allow inheriting programs if a psock exists and has
	 * already inherited programs. This would create confusion on
	 * which parser/verdict program is running. If no psock exists,
	 * create one. Done inside sk_callback_lock to ensure a concurrent
	 * create doesn't update the user data.
	 */
	if (psock) {
		if (READ_ONCE(psock->bpf_parse) && parse) {
			err = -EBUSY;
			goto out_progs;
		}
		psock->refcnt++;
	} else {
		psock = smap_init_psock(sock, stab);
		if (IS_ERR(psock)) {
			err = PTR_ERR(psock);
			goto out_progs;
		}

		set_bit(SMAP_TX_RUNNING, &psock->state);
	}

	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
	if (!e) {
		err = -ENOMEM;
		goto out_progs;
	}
	e->entry = &stab->sock_map[i];

	/* 3. At this point we have a reference to a valid psock that is
	 * running. Attach any BPF programs needed.
	 */
	if (parse && verdict && !psock->strp_enabled) {
		err = smap_init_sock(psock, sock);
		if (err)
			goto out_free;

		smap_init_progs(psock, stab, verdict, parse);
		smap_start_sock(psock, sock);
	}

	/* 4. Place the psock in the sockmap for use and stop any programs on
	 * the old sock, assuming it's not the same sock we are replacing
	 * it with. Because we can only have a single set of programs, if
	 * old_sock has a strp we can stop it.
	 */
	list_add_tail(&e->list, &psock->maps);
	write_unlock_bh(&sock->sk_callback_lock);

	osock = xchg(&stab->sock_map[i], sock);
	if (osock) {
		struct smap_psock *opsock = smap_psock_sk(osock);

		write_lock_bh(&osock->sk_callback_lock);
		if (osock != sock && parse)
			smap_stop_sock(opsock, osock);

		smap_list_remove(opsock, &stab->sock_map[i]);
		smap_release_sock(opsock, osock);
		write_unlock_bh(&osock->sk_callback_lock);
	}
	return 0;
out_free:
	smap_release_sock(psock, sock);
out_progs:
	if (verdict)
		bpf_prog_put(verdict);
	if (parse)
		bpf_prog_put(parse);
	write_unlock_bh(&sock->sk_callback_lock);
	kfree(e);
	return err;
}
int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
		return -EINVAL;

	switch (type) {
	case BPF_SK_SKB_STREAM_PARSER:
		orig = xchg(&stab->bpf_parse, prog);
		break;
	case BPF_SK_SKB_STREAM_VERDICT:
		orig = xchg(&stab->bpf_verdict, prog);
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (orig)
		bpf_prog_put(orig);

	return 0;
}
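/* Example (illustrative sketch): from user space the parser and verdict
 * programs reach sock_map_prog() via BPF_PROG_ATTACH with the map fd as the
 * attach target, e.g. using the bpf_prog_attach() wrapper from tools/lib/bpf.
 * The fd variable names are placeholders for this example:
 *
 *	err = bpf_prog_attach(parse_prog_fd, map_fd,
 *			      BPF_SK_SKB_STREAM_PARSER, 0);
 *	if (!err)
 *		err = bpf_prog_attach(verdict_prog_fd, map_fd,
 *				      BPF_SK_SKB_STREAM_VERDICT, 0);
 */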
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
	return NULL;
}

static int sock_map_update_elem(struct bpf_map *map,
				void *key, void *value, u64 flags)
{
	struct bpf_sock_ops_kern skops;
	u32 fd = *(u32 *)value;
	struct socket *socket;
	int err;

	socket = sockfd_lookup(fd, &err);
	if (!socket)
		return err;

	skops.sk = socket->sk;
	if (!skops.sk) {
		fput(socket->file);
		return -EINVAL;
	}

	if (skops.sk->sk_type != SOCK_STREAM ||
	    skops.sk->sk_protocol != IPPROTO_TCP) {
		fput(socket->file);
		return -EOPNOTSUPP;
	}

	err = sock_map_ctx_update_elem(&skops, map, key, flags);
	fput(socket->file);
	return err;
}
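/* Example (illustrative sketch): from user space the map value is the file
 * descriptor of a connected TCP socket, so adding a socket is an ordinary
 * element update with the socket fd as the value. Variable names are
 * placeholders for this example:
 *
 *	int key = 0;
 *	int sock_fd = accept(listen_fd, NULL, NULL);
 *
 *	err = bpf_map_update_elem(map_fd, &key, &sock_fd, BPF_ANY);
 */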
const struct bpf_map_ops sock_map_ops = {
	.map_alloc = sock_map_alloc,
	.map_free = sock_map_free,
	.map_lookup_elem = sock_map_lookup,
	.map_get_next_key = sock_map_get_next_key,
	.map_update_elem = sock_map_update_elem,
	.map_delete_elem = sock_map_delete_elem,
};

BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
}

const struct bpf_func_proto bpf_sock_map_update_proto = {
	.func		= bpf_sock_map_update,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_PTR_TO_MAP_KEY,
	.arg4_type	= ARG_ANYTHING,
};
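/* Example (illustrative sketch): bpf_sock_map_update() is the BPF-side entry
 * point and is typically called from a BPF_PROG_TYPE_SOCK_OPS program so that
 * sockets add themselves to the map as connections are established. The map
 * name "sock_map_rx" and the fixed key are assumptions made for the example:
 *
 *	SEC("sockops")
 *	int bpf_sockmap(struct bpf_sock_ops *skops)
 *	{
 *		int key = 0;
 *
 *		switch (skops->op) {
 *		case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
 *		case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
 *			// add the newly established socket at index 0
 *			bpf_sock_map_update(skops, &sock_map_rx, &key, BPF_NOEXIST);
 *			break;
 *		}
 *		return 0;
 *	}
 */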