sockmap.c

/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
/* A BPF sock_map is used to store sock objects. This is primarily used
 * for doing socket redirect with BPF helper routines.
 *
 * A sock map may have BPF programs attached to it; currently a program
 * used to parse packets and a program to provide a verdict and redirect
 * decision on the packet are supported. Any programs attached to a sock
 * map are inherited by sock objects when they are added to the map. If
 * no BPF programs are attached the sock object may only be used for sock
 * redirect.
 *
 * A sock object may be in multiple maps, but can only inherit a single
 * parse or verdict program. If adding a sock object to a map would result
 * in having multiple parsing programs the update will return an EBUSY error.
 *
 * For reference this program is similar to devmap used in the XDP context;
 * reviewing these together may be useful. For an example please review
 * ./samples/bpf/sockmap/.
 */
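/* Illustrative sketch (not part of this file): the parse and verdict programs
 * attached to a sockmap are BPF_PROG_TYPE_SK_SKB programs roughly along the
 * lines below. The map name "sock_map", the section names and the fixed key
 * are assumptions for the example, and the usual bpf_helpers.h/SEC() setup
 * from the samples is assumed.
 *
 *	SEC("sk_skb/parser")
 *	int bpf_prog_parser(struct __sk_buff *skb)
 *	{
 *		return skb->len;	// treat every skb as one full message
 *	}
 *
 *	SEC("sk_skb/verdict")
 *	int bpf_prog_verdict(struct __sk_buff *skb)
 *	{
 *		// redirect each message to the socket stored at index 0
 *		return bpf_sk_redirect_map(skb, &sock_map, 0, 0);
 *	}
 */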
#include <linux/bpf.h>
#include <net/sock.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/strparser.h>
#include <net/tcp.h>

#define SOCK_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
struct bpf_stab {
	struct bpf_map map;
	struct sock **sock_map;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
};

enum smap_psock_state {
	SMAP_TX_RUNNING,
};

struct smap_psock_map_entry {
	struct list_head list;
	struct sock **entry;
};
struct smap_psock {
	struct rcu_head rcu;
	/* refcnt is used inside sk_callback_lock */
	u32 refcnt;

	/* datapath variables */
	struct sk_buff_head rxqueue;
	bool strp_enabled;

	/* datapath error path cache across tx work invocations */
	int save_rem;
	int save_off;
	struct sk_buff *save_skb;

	struct strparser strp;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
	struct list_head maps;

	/* Back reference used when sock callbacks trigger sockmap operations */
	struct sock *sock;
	unsigned long state;

	struct work_struct tx_work;
	struct work_struct gc_work;

	void (*save_data_ready)(struct sock *sk);
	void (*save_write_space)(struct sock *sk);
	void (*save_state_change)(struct sock *sk);
};
static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
	return rcu_dereference_sk_user_data(sk);
}

enum __sk_action {
	__SK_DROP = 0,
	__SK_PASS,
	__SK_REDIRECT,
};
static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
	int rc;

	if (unlikely(!prog))
		return __SK_DROP;

	skb_orphan(skb);
	/* We need to ensure that BPF metadata for maps is also cleared
	 * when we orphan the skb so that we don't have the possibility
	 * to reference a stale map.
	 */
	TCP_SKB_CB(skb)->bpf.map = NULL;
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	preempt_disable();
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	preempt_enable();
	skb->sk = NULL;

	/* Moving return codes from UAPI namespace into internal namespace */
	return rc == SK_PASS ?
		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
		__SK_DROP;
}
static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
{
	struct sock *sk;
	int rc;

	rc = smap_verdict_func(psock, skb);
	switch (rc) {
	case __SK_REDIRECT:
		sk = do_sk_redirect_map(skb);
		if (likely(sk)) {
			struct smap_psock *peer = smap_psock_sk(sk);

			if (likely(peer &&
				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
				   !sock_flag(sk, SOCK_DEAD) &&
				   sock_writeable(sk))) {
				skb_set_owner_w(skb, sk);
				skb_queue_tail(&peer->rxqueue, skb);
				schedule_work(&peer->tx_work);
				break;
			}
		}
		/* Fall through and free skb otherwise */
	case __SK_DROP:
	default:
		kfree_skb(skb);
	}
}
static void smap_report_sk_error(struct smap_psock *psock, int err)
{
	struct sock *sk = psock->sock;

	sk->sk_err = err;
	sk->sk_error_report(sk);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);
/* Called with lock_sock(sk) held */
static void smap_state_change(struct sock *sk)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;
	struct socket_wq *wq;
	struct sock *osk;

	rcu_read_lock();

	/* Allowing transitions into an established or syn_recv state allows
	 * for early binding sockets to a smap object before the connection
	 * is established.
	 */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
	case TCP_ESTABLISHED:
		break;
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
	case TCP_LAST_ACK:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
	case TCP_LISTEN:
		break;
	case TCP_CLOSE:
		/* Only release if the map entry is in fact the sock in
		 * question. There is a case where the operator deletes
		 * the sock from the map, but the TCP sock is closed before
		 * the psock is detached. Use cmpxchg to verify the correct
		 * sock is removed.
		 */
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		write_lock_bh(&sk->sk_callback_lock);
		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
			osk = cmpxchg(e->entry, sk, NULL);
			if (osk == sk) {
				list_del(&e->list);
				smap_release_sock(psock, sk);
			}
		}
		write_unlock_bh(&sk->sk_callback_lock);
		break;
	default:
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		smap_report_sk_error(psock, EPIPE);
		break;
	}

	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}
static void smap_read_sock_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	smap_do_verdict(psock, skb);
	rcu_read_unlock();
}

/* Called with lock held on socket */
static void smap_data_ready(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock)) {
		write_lock_bh(&sk->sk_callback_lock);
		strp_data_ready(&psock->strp);
		write_unlock_bh(&sk->sk_callback_lock);
	}
	rcu_read_unlock();
}
static void smap_tx_work(struct work_struct *w)
{
	struct smap_psock *psock;
	struct sk_buff *skb;
	int rem, off, n;

	psock = container_of(w, struct smap_psock, tx_work);

	/* lock sock to avoid losing sk_socket at some point during loop */
	lock_sock(psock->sock);
	if (psock->save_skb) {
		skb = psock->save_skb;
		rem = psock->save_rem;
		off = psock->save_off;
		psock->save_skb = NULL;
		goto start;
	}

	while ((skb = skb_dequeue(&psock->rxqueue))) {
		rem = skb->len;
		off = 0;
start:
		do {
			if (likely(psock->sock->sk_socket))
				n = skb_send_sock_locked(psock->sock,
							 skb, off, rem);
			else
				n = -EINVAL;
			if (n <= 0) {
				if (n == -EAGAIN) {
					/* Retry when space is available */
					psock->save_skb = skb;
					psock->save_rem = rem;
					psock->save_off = off;
					goto out;
				}
				/* Hard errors break pipe and stop xmit */
				smap_report_sk_error(psock, n ? -n : EPIPE);
				clear_bit(SMAP_TX_RUNNING, &psock->state);
				kfree_skb(skb);
				goto out;
			}
			rem -= n;
			off += n;
		} while (rem);
		kfree_skb(skb);
	}
out:
	release_sock(psock->sock);
}
static void smap_write_space(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
		schedule_work(&psock->tx_work);
	rcu_read_unlock();
}

static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
{
	if (!psock->strp_enabled)
		return;

	sk->sk_data_ready = psock->save_data_ready;
	sk->sk_write_space = psock->save_write_space;
	sk->sk_state_change = psock->save_state_change;
	psock->save_data_ready = NULL;
	psock->save_write_space = NULL;
	psock->save_state_change = NULL;
	strp_stop(&psock->strp);
	psock->strp_enabled = false;
}
static void smap_destroy_psock(struct rcu_head *rcu)
{
	struct smap_psock *psock = container_of(rcu,
						struct smap_psock, rcu);

	/* Now that a grace period has passed there is no longer
	 * any reference to this sock in the sockmap so we can
	 * destroy the psock, strparser, and bpf programs. But,
	 * because we use workqueue sync operations we can not
	 * do it in rcu context
	 */
	schedule_work(&psock->gc_work);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
	psock->refcnt--;
	if (psock->refcnt)
		return;

	smap_stop_sock(psock, sock);
	clear_bit(SMAP_TX_RUNNING, &psock->state);
	rcu_assign_sk_user_data(sock, NULL);
	call_rcu_sched(&psock->rcu, smap_destroy_psock);
}
static int smap_parse_func_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;
	struct bpf_prog *prog;
	int rc;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	prog = READ_ONCE(psock->bpf_parse);
	if (unlikely(!prog)) {
		rcu_read_unlock();
		return skb->len;
	}

	/* Attach the socket for the bpf program to use if needed. We can do
	 * this because strparser clones the skb before handing it to an upper
	 * layer, meaning skb_orphan has been called. We NULL sk on the
	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations
	 * later and because we are not charging the memory of this skb to
	 * any socket yet.
	 */
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	skb->sk = NULL;
	rcu_read_unlock();
	return rc;
}

static int smap_read_sock_done(struct strparser *strp, int err)
{
	return err;
}
static int smap_init_sock(struct smap_psock *psock,
			  struct sock *sk)
{
	static const struct strp_callbacks cb = {
		.rcv_msg = smap_read_sock_strparser,
		.parse_msg = smap_parse_func_strparser,
		.read_sock_done = smap_read_sock_done,
	};

	return strp_init(&psock->strp, sk, &cb);
}

static void smap_init_progs(struct smap_psock *psock,
			    struct bpf_stab *stab,
			    struct bpf_prog *verdict,
			    struct bpf_prog *parse)
{
	struct bpf_prog *orig_parse, *orig_verdict;

	orig_parse = xchg(&psock->bpf_parse, parse);
	orig_verdict = xchg(&psock->bpf_verdict, verdict);

	if (orig_verdict)
		bpf_prog_put(orig_verdict);
	if (orig_parse)
		bpf_prog_put(orig_parse);
}

static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
{
	if (sk->sk_data_ready == smap_data_ready)
		return;

	psock->save_data_ready = sk->sk_data_ready;
	psock->save_write_space = sk->sk_write_space;
	psock->save_state_change = sk->sk_state_change;
	sk->sk_data_ready = smap_data_ready;
	sk->sk_write_space = smap_write_space;
	sk->sk_state_change = smap_state_change;
	psock->strp_enabled = true;
}
static void sock_map_remove_complete(struct bpf_stab *stab)
{
	bpf_map_area_free(stab->sock_map);
	kfree(stab);
}

static void smap_gc_work(struct work_struct *w)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;

	psock = container_of(w, struct smap_psock, gc_work);

	/* no callback lock needed because we already detached sockmap ops */
	if (psock->strp_enabled)
		strp_done(&psock->strp);

	cancel_work_sync(&psock->tx_work);
	__skb_queue_purge(&psock->rxqueue);

	/* At this point all strparser and xmit work must be complete */
	if (psock->bpf_parse)
		bpf_prog_put(psock->bpf_parse);
	if (psock->bpf_verdict)
		bpf_prog_put(psock->bpf_verdict);

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		list_del(&e->list);
		kfree(e);
	}

	sock_put(psock->sock);
	kfree(psock);
}
static struct smap_psock *smap_init_psock(struct sock *sock,
					  struct bpf_stab *stab)
{
	struct smap_psock *psock;

	psock = kzalloc_node(sizeof(struct smap_psock),
			     GFP_ATOMIC | __GFP_NOWARN,
			     stab->map.numa_node);
	if (!psock)
		return ERR_PTR(-ENOMEM);

	psock->sock = sock;
	skb_queue_head_init(&psock->rxqueue);
	INIT_WORK(&psock->tx_work, smap_tx_work);
	INIT_WORK(&psock->gc_work, smap_gc_work);
	INIT_LIST_HEAD(&psock->maps);
	psock->refcnt = 1;

	rcu_assign_sk_user_data(sock, psock);
	sock_hold(sock);
	return psock;
}
static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
	struct bpf_stab *stab;
	int err = -EINVAL;
	u64 cost;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		return ERR_PTR(-E2BIG);

	stab = kzalloc(sizeof(*stab), GFP_USER);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	stab->map.map_type = attr->map_type;
	stab->map.key_size = attr->key_size;
	stab->map.value_size = attr->value_size;
	stab->map.max_entries = attr->max_entries;
	stab->map.map_flags = attr->map_flags;
	stab->map.numa_node = bpf_map_attr_numa_node(attr);

	/* make sure page count doesn't overflow */
	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_stab;

	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(stab->map.pages);
	if (err)
		goto free_stab;

	err = -ENOMEM;
	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
					    sizeof(struct sock *),
					    stab->map.numa_node);
	if (!stab->sock_map)
		goto free_stab;

	return &stab->map;
free_stab:
	kfree(stab);
	return ERR_PTR(err);
}
static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
{
	struct smap_psock_map_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		if (e->entry == entry) {
			list_del(&e->list);
			break;
		}
	}
}
static void sock_map_free(struct bpf_map *map)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	int i;

	synchronize_rcu();

	/* At this point no update, lookup or delete operations can happen.
	 * However, be aware we can still get socket state event updates
	 * and data ready callbacks that reference the psock from sk_user_data.
	 * Also psock worker threads are still in-flight. So smap_release_sock
	 * will only free the psock after cancel_sync on the worker threads
	 * and a grace period expires to ensure the psock is really safe to
	 * remove.
	 */
	rcu_read_lock();
	for (i = 0; i < stab->map.max_entries; i++) {
		struct smap_psock *psock;
		struct sock *sock;

		sock = xchg(&stab->sock_map[i], NULL);
		if (!sock)
			continue;

		write_lock_bh(&sock->sk_callback_lock);
		psock = smap_psock_sk(sock);
		smap_list_remove(psock, &stab->sock_map[i]);
		smap_release_sock(psock, sock);
		write_unlock_bh(&sock->sk_callback_lock);
	}
	rcu_read_unlock();

	if (stab->bpf_verdict)
		bpf_prog_put(stab->bpf_verdict);
	if (stab->bpf_parse)
		bpf_prog_put(stab->bpf_parse);

	sock_map_remove_complete(stab);
}
static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	u32 i = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (i >= stab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (i == stab->map.max_entries - 1)
		return -ENOENT;

	*next = i + 1;
	return 0;
}

struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);

	if (key >= map->max_entries)
		return NULL;

	return READ_ONCE(stab->sock_map[key]);
}
static int sock_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock *psock;
	int k = *(u32 *)key;
	struct sock *sock;

	if (k >= map->max_entries)
		return -EINVAL;

	sock = xchg(&stab->sock_map[k], NULL);
	if (!sock)
		return -EINVAL;

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);
	if (!psock)
		goto out;

	if (psock->bpf_parse)
		smap_stop_sock(psock, sock);
	smap_list_remove(psock, &stab->sock_map[k]);
	smap_release_sock(psock, sock);
out:
	write_unlock_bh(&sock->sk_callback_lock);
	return 0;
}
/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
 * done inside rcu critical sections. This ensures on updates that the psock
 * will not be released via smap_release_sock() until concurrent updates/deletes
 * complete. All operations operate on sock_map using cmpxchg and xchg
 * operations to ensure we do not get stale references. Any reads into the
 * map must be done with READ_ONCE() because of this.
 *
 * A psock is destroyed via call_rcu and after any worker threads are cancelled
 * and synced, so we are certain all references from the update/lookup/delete
 * operations as well as references in the data path are no longer in use.
 *
 * Psocks may exist in multiple maps, but only a single set of parse/verdict
 * programs may be inherited from the maps it belongs to. A reference count
 * is kept with the total number of references to the psock from all maps. The
 * psock will not be released until this reaches zero. The psock and sock
 * user data use the sk_callback_lock to protect critical data structures
 * from concurrent access. This allows us to avoid two updates modifying the
 * user data in sock at the same time, and the lock is required anyway for
 * modifying callbacks; we simply increase its scope slightly.
 *
 * Rules to follow,
 * - psock must always be read inside RCU critical section
 * - sk_user_data must only be modified inside sk_callback_lock and read
 *   inside RCU critical section.
 * - psock->maps list must only be read & modified inside sk_callback_lock
 * - sock_map must use READ_ONCE and (cmp)xchg operations
 * - BPF verdict/parse programs must use READ_ONCE and xchg operations
 */
static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
				    struct bpf_map *map,
				    void *key, u64 flags)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock_map_entry *e = NULL;
	struct bpf_prog *verdict, *parse;
	struct sock *osock, *sock;
	struct smap_psock *psock;
	u32 i = *(u32 *)key;
	int err;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (unlikely(i >= stab->map.max_entries))
		return -E2BIG;

	sock = READ_ONCE(stab->sock_map[i]);
	if (flags == BPF_EXIST && !sock)
		return -ENOENT;
	else if (flags == BPF_NOEXIST && sock)
		return -EEXIST;

	sock = skops->sk;

	/* 1. If sock map has BPF programs those will be inherited by the
	 * sock being added. If the sock is already attached to BPF programs
	 * this results in an error.
	 */
	verdict = READ_ONCE(stab->bpf_verdict);
	parse = READ_ONCE(stab->bpf_parse);

	if (parse && verdict) {
		/* bpf prog refcnt may be zero if a concurrent attach operation
		 * removes the program after the above READ_ONCE() but before
		 * we increment the refcnt. If this is the case abort with an
		 * error.
		 */
		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
		if (IS_ERR(verdict))
			return PTR_ERR(verdict);

		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
		if (IS_ERR(parse)) {
			bpf_prog_put(verdict);
			return PTR_ERR(parse);
		}
	}

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);

	/* 2. Do not allow inheriting programs if psock exists and has
	 * already inherited programs. This would create confusion on
	 * which parser/verdict program is running. If no psock exists
	 * create one. Inside sk_callback_lock to ensure concurrent create
	 * doesn't update user data.
	 */
	if (psock) {
		if (READ_ONCE(psock->bpf_parse) && parse) {
			err = -EBUSY;
			goto out_progs;
		}
		psock->refcnt++;
	} else {
		psock = smap_init_psock(sock, stab);
		if (IS_ERR(psock)) {
			err = PTR_ERR(psock);
			goto out_progs;
		}

		set_bit(SMAP_TX_RUNNING, &psock->state);
	}

	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
	if (!e) {
		err = -ENOMEM;
		goto out_progs;
	}
	e->entry = &stab->sock_map[i];

	/* 3. At this point we have a reference to a valid psock that is
	 * running. Attach any BPF programs needed.
	 */
	if (parse && verdict && !psock->strp_enabled) {
		err = smap_init_sock(psock, sock);
		if (err)
			goto out_free;
		smap_init_progs(psock, stab, verdict, parse);
		smap_start_sock(psock, sock);
	}

	/* 4. Place psock in sockmap for use and stop any programs on
	 * the old sock assuming it's not the same sock we are replacing
	 * it with. Because we can only have a single set of programs,
	 * if old_sock has a strp we can stop it.
	 */
	list_add_tail(&e->list, &psock->maps);
	write_unlock_bh(&sock->sk_callback_lock);

	osock = xchg(&stab->sock_map[i], sock);
	if (osock) {
		struct smap_psock *opsock = smap_psock_sk(osock);

		write_lock_bh(&osock->sk_callback_lock);
		if (osock != sock && parse)
			smap_stop_sock(opsock, osock);
		smap_list_remove(opsock, &stab->sock_map[i]);
		smap_release_sock(opsock, osock);
		write_unlock_bh(&osock->sk_callback_lock);
	}
	return 0;
out_free:
	smap_release_sock(psock, sock);
out_progs:
	if (verdict)
		bpf_prog_put(verdict);
	if (parse)
		bpf_prog_put(parse);
	write_unlock_bh(&sock->sk_callback_lock);
	kfree(e);
	return err;
}
int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
		return -EINVAL;

	switch (type) {
	case BPF_SK_SKB_STREAM_PARSER:
		orig = xchg(&stab->bpf_parse, prog);
		break;
	case BPF_SK_SKB_STREAM_VERDICT:
		orig = xchg(&stab->bpf_verdict, prog);
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (orig)
		bpf_prog_put(orig);

	return 0;
}

/* Lookup from the syscall side is not supported; sock entries are only
 * dereferenced in the datapath via __sock_map_lookup_elem() above.
 */
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
	return NULL;
}
static int sock_map_update_elem(struct bpf_map *map,
				void *key, void *value, u64 flags)
{
	struct bpf_sock_ops_kern skops;
	u32 fd = *(u32 *)value;
	struct socket *socket;
	int err;

	socket = sockfd_lookup(fd, &err);
	if (!socket)
		return err;

	skops.sk = socket->sk;
	if (!skops.sk) {
		fput(socket->file);
		return -EINVAL;
	}

	if (skops.sk->sk_type != SOCK_STREAM ||
	    skops.sk->sk_protocol != IPPROTO_TCP) {
		fput(socket->file);
		return -EOPNOTSUPP;
	}

	err = sock_map_ctx_update_elem(&skops, map, key, flags);
	fput(socket->file);
	return err;
}
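/* Illustrative userspace sketch (assumption, not part of this file): the
 * syscall update path above takes a 4-byte index as the key and a TCP
 * socket file descriptor as the value. With the libbpf-style wrappers from
 * the samples this looks roughly like:
 *
 *	int map_fd = bpf_create_map(BPF_MAP_TYPE_SOCKMAP,
 *				    sizeof(int), sizeof(int), 20, 0);
 *	int key = 0;
 *	// sock_fd must be a SOCK_STREAM/IPPROTO_TCP socket
 *	int err = bpf_map_update_elem(map_fd, &key, &sock_fd, BPF_ANY);
 *
 * Parser/verdict programs are attached to the map fd with BPF_PROG_ATTACH,
 * e.g. bpf_prog_attach(prog_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0),
 * which ends up in sock_map_prog() above.
 */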
const struct bpf_map_ops sock_map_ops = {
	.map_alloc = sock_map_alloc,
	.map_free = sock_map_free,
	.map_lookup_elem = sock_map_lookup,
	.map_get_next_key = sock_map_get_next_key,
	.map_update_elem = sock_map_update_elem,
	.map_delete_elem = sock_map_delete_elem,
};

BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
}

const struct bpf_func_proto bpf_sock_map_update_proto = {
	.func = bpf_sock_map_update,
	.gpl_only = false,
	.pkt_access = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_PTR_TO_MAP_KEY,
	.arg4_type = ARG_ANYTHING,
};
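/* Illustrative sketch (assumption, not part of this file): bpf_sock_map_update
 * above is the BPF-side counterpart of the syscall update and is meant to be
 * called from a BPF_PROG_TYPE_SOCK_OPS program, typically on the established
 * callbacks. The map name "sock_map" and the fixed key are assumptions for
 * the example.
 *
 *	SEC("sockops")
 *	int bpf_sockmap(struct bpf_sock_ops *skops)
 *	{
 *		int key = 0;
 *
 *		switch (skops->op) {
 *		case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
 *		case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
 *			bpf_sock_map_update(skops, &sock_map, &key, BPF_ANY);
 *			break;
 *		}
 *		return 0;
 *	}
 */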