sockmap.c

/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
/* A BPF sock_map is used to store sock objects. This is primarily used
 * for doing socket redirect with BPF helper routines.
 *
 * A sock map may have BPF programs attached to it; currently a program
 * used to parse packets and a program to provide a verdict and redirect
 * decision on the packet are supported. Any programs attached to a sock
 * map are inherited by sock objects when they are added to the map. If
 * no BPF programs are attached, the sock object may only be used for sock
 * redirect.
 *
 * A sock object may be in multiple maps, but can only inherit a single
 * parse or verdict program. If adding a sock object to a map would result
 * in having multiple parsing programs the update will return an EBUSY error.
 *
 * For reference this program is similar to devmap used in the XDP context;
 * reviewing these together may be useful. For an example please review
 * ./samples/bpf/sockmap/.
 */
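
/* For illustration only (not part of the original file): a minimal sketch of
 * the SK_SKB parser/verdict program pair this map is designed for, loosely
 * modelled on ./samples/bpf/sockmap/. The map and program names below are
 * hypothetical, and the three-argument bpf_sk_redirect_map() helper matches
 * the era of this file.
 *
 *	SEC("sk_skb/parse")
 *	int parse_prog(struct __sk_buff *skb)
 *	{
 *		return skb->len;	// hand the full record to the verdict program
 *	}
 *
 *	SEC("sk_skb/verdict")
 *	int verdict_prog(struct __sk_buff *skb)
 *	{
 *		// redirect to the sock stored at map key 0; dropped on failure
 *		return bpf_sk_redirect_map(&sock_map, 0, 0);
 *	}
 */
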
#include <linux/bpf.h>
#include <net/sock.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/strparser.h>
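
/* Per-map state: the array of sock pointers backing the map plus the
 * parser and verdict programs attached to the map (if any), which are
 * inherited by socks added to it.
 */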
struct bpf_stab {
	struct bpf_map map;
	struct sock **sock_map;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
};

enum smap_psock_state {
	SMAP_TX_RUNNING,
};

struct smap_psock_map_entry {
	struct list_head list;
	struct sock **entry;
};
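
/* Per-socket state attached via sk_user_data: strparser hooks, queued
 * skbs awaiting transmit, saved socket callbacks, and the list of map
 * slots that currently reference this sock.
 */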
struct smap_psock {
	struct rcu_head rcu;
	/* refcnt is used inside sk_callback_lock */
	u32 refcnt;

	/* datapath variables */
	struct sk_buff_head rxqueue;
	bool strp_enabled;

	/* datapath error path cache across tx work invocations */
	int save_rem;
	int save_off;
	struct sk_buff *save_skb;

	struct strparser strp;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
	struct list_head maps;

	/* Back reference used when sock callbacks trigger sockmap operations */
	struct sock *sock;
	unsigned long state;

	struct work_struct tx_work;
	struct work_struct gc_work;

	void (*save_data_ready)(struct sock *sk);
	void (*save_write_space)(struct sock *sk);
	void (*save_state_change)(struct sock *sk);
};

static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
	return rcu_dereference_sk_user_data(sk);
}

static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
	int rc;

	if (unlikely(!prog))
		return SK_DROP;

	skb_orphan(skb);
	skb->sk = psock->sock;
	bpf_compute_data_end(skb);
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	skb->sk = NULL;

	return rc;
}

static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
{
	struct sock *sk;
	int rc;

	/* Because we use per cpu values to feed input from sock redirect
	 * in BPF program to do_sk_redirect_map() call we need to ensure we
	 * are not preempted. RCU read lock is not sufficient in this case
	 * with CONFIG_PREEMPT_RCU enabled so we must be explicit here.
	 */
	preempt_disable();
	rc = smap_verdict_func(psock, skb);
	switch (rc) {
	case SK_REDIRECT:
		sk = do_sk_redirect_map();
		preempt_enable();
		if (likely(sk)) {
			struct smap_psock *peer = smap_psock_sk(sk);

			if (likely(peer &&
				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
				   !sock_flag(sk, SOCK_DEAD) &&
				   sock_writeable(sk))) {
				skb_set_owner_w(skb, sk);
				skb_queue_tail(&peer->rxqueue, skb);
				schedule_work(&peer->tx_work);
				break;
			}
		}
		/* Fall through and free skb otherwise */
	case SK_DROP:
	default:
		if (rc != SK_REDIRECT)
			preempt_enable();
		kfree_skb(skb);
	}
}

static void smap_report_sk_error(struct smap_psock *psock, int err)
{
	struct sock *sk = psock->sock;

	sk->sk_err = err;
	sk->sk_error_report(sk);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);

/* Called with lock_sock(sk) held */
static void smap_state_change(struct sock *sk)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;
	struct socket_wq *wq;
	struct sock *osk;

	rcu_read_lock();

	/* Allowing transitions into established and syn_recv states allows
	 * for early binding sockets to a smap object before the connection
	 * is established.
	 */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
	case TCP_ESTABLISHED:
		break;
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
	case TCP_LAST_ACK:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
	case TCP_LISTEN:
		break;
	case TCP_CLOSE:
		/* Only release if the map entry is in fact the sock in
		 * question. There is a case where the operator deletes
		 * the sock from the map, but the TCP sock is closed before
		 * the psock is detached. Use cmpxchg to verify correct
		 * sock is removed.
		 */
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		write_lock_bh(&sk->sk_callback_lock);
		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
			osk = cmpxchg(e->entry, sk, NULL);
			if (osk == sk) {
				list_del(&e->list);
				smap_release_sock(psock, sk);
			}
		}
		write_unlock_bh(&sk->sk_callback_lock);
		break;
	default:
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		smap_report_sk_error(psock, EPIPE);
		break;
	}

	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void smap_read_sock_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	smap_do_verdict(psock, skb);
	rcu_read_unlock();
}

/* Called with lock held on socket */
static void smap_data_ready(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock)) {
		write_lock_bh(&sk->sk_callback_lock);
		strp_data_ready(&psock->strp);
		write_unlock_bh(&sk->sk_callback_lock);
	}
	rcu_read_unlock();
}
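
/* Transmit worker: drains the psock receive queue and writes each skb to
 * the socket with skb_send_sock_locked(). Partial progress is cached in
 * save_skb/save_rem/save_off on -EAGAIN and resumed on the next run; hard
 * errors report the error on the socket and stop transmission.
 */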
static void smap_tx_work(struct work_struct *w)
{
	struct smap_psock *psock;
	struct sk_buff *skb;
	int rem, off, n;

	psock = container_of(w, struct smap_psock, tx_work);

	/* lock sock to avoid losing sk_socket at some point during loop */
	lock_sock(psock->sock);

	if (psock->save_skb) {
		skb = psock->save_skb;
		rem = psock->save_rem;
		off = psock->save_off;
		psock->save_skb = NULL;
		goto start;
	}

	while ((skb = skb_dequeue(&psock->rxqueue))) {
		rem = skb->len;
		off = 0;
start:
		do {
			if (likely(psock->sock->sk_socket))
				n = skb_send_sock_locked(psock->sock,
							 skb, off, rem);
			else
				n = -EINVAL;
			if (n <= 0) {
				if (n == -EAGAIN) {
					/* Retry when space is available */
					psock->save_skb = skb;
					psock->save_rem = rem;
					psock->save_off = off;
					goto out;
				}
				/* Hard errors break pipe and stop xmit */
				smap_report_sk_error(psock, n ? -n : EPIPE);
				clear_bit(SMAP_TX_RUNNING, &psock->state);
				kfree_skb(skb);
				goto out;
			}
			rem -= n;
			off += n;
		} while (rem);
		kfree_skb(skb);
	}
out:
	release_sock(psock->sock);
}

static void smap_write_space(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
		schedule_work(&psock->tx_work);
	rcu_read_unlock();
}

static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
{
	if (!psock->strp_enabled)
		return;

	sk->sk_data_ready = psock->save_data_ready;
	sk->sk_write_space = psock->save_write_space;
	sk->sk_state_change = psock->save_state_change;
	psock->save_data_ready = NULL;
	psock->save_write_space = NULL;
	psock->save_state_change = NULL;
	strp_stop(&psock->strp);
	psock->strp_enabled = false;
}

static void smap_destroy_psock(struct rcu_head *rcu)
{
	struct smap_psock *psock = container_of(rcu,
						struct smap_psock, rcu);

	/* Now that a grace period has passed there is no longer
	 * any reference to this sock in the sockmap so we can
	 * destroy the psock, strparser, and bpf programs. But,
	 * because we use workqueue sync operations we can not
	 * do it in rcu context
	 */
	schedule_work(&psock->gc_work);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
	psock->refcnt--;
	if (psock->refcnt)
		return;

	smap_stop_sock(psock, sock);
	clear_bit(SMAP_TX_RUNNING, &psock->state);
	rcu_assign_sk_user_data(sock, NULL);
	call_rcu_sched(&psock->rcu, smap_destroy_psock);
}

static int smap_parse_func_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;
	struct bpf_prog *prog;
	int rc;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	prog = READ_ONCE(psock->bpf_parse);

	if (unlikely(!prog)) {
		rcu_read_unlock();
		return skb->len;
	}

	/* Attach the socket for the bpf program to use if needed. We can do
	 * this because strparser clones the skb before handing it to an upper
	 * layer, meaning skb_orphan has been called. We NULL sk on the
	 * way out to ensure we don't trigger a BUG_ON() in skb/sk operations
	 * later and because we are not charging the memory of this skb to
	 * any socket yet.
	 */
	skb->sk = psock->sock;
	bpf_compute_data_end(skb);
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	skb->sk = NULL;
	rcu_read_unlock();

	return rc;
}

static int smap_read_sock_done(struct strparser *strp, int err)
{
	return err;
}

static int smap_init_sock(struct smap_psock *psock,
			  struct sock *sk)
{
	static const struct strp_callbacks cb = {
		.rcv_msg = smap_read_sock_strparser,
		.parse_msg = smap_parse_func_strparser,
		.read_sock_done = smap_read_sock_done,
	};

	return strp_init(&psock->strp, sk, &cb);
}

static void smap_init_progs(struct smap_psock *psock,
			    struct bpf_stab *stab,
			    struct bpf_prog *verdict,
			    struct bpf_prog *parse)
{
	struct bpf_prog *orig_parse, *orig_verdict;

	orig_parse = xchg(&psock->bpf_parse, parse);
	orig_verdict = xchg(&psock->bpf_verdict, verdict);

	if (orig_verdict)
		bpf_prog_put(orig_verdict);
	if (orig_parse)
		bpf_prog_put(orig_parse);
}

static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
{
	if (sk->sk_data_ready == smap_data_ready)
		return;

	psock->save_data_ready = sk->sk_data_ready;
	psock->save_write_space = sk->sk_write_space;
	psock->save_state_change = sk->sk_state_change;
	sk->sk_data_ready = smap_data_ready;
	sk->sk_write_space = smap_write_space;
	sk->sk_state_change = smap_state_change;
	psock->strp_enabled = true;
}

static void sock_map_remove_complete(struct bpf_stab *stab)
{
	bpf_map_area_free(stab->sock_map);
	kfree(stab);
}

static void smap_gc_work(struct work_struct *w)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;

	psock = container_of(w, struct smap_psock, gc_work);

	/* no callback lock needed because we already detached sockmap ops */
	if (psock->strp_enabled)
		strp_done(&psock->strp);

	cancel_work_sync(&psock->tx_work);
	__skb_queue_purge(&psock->rxqueue);

	/* At this point all strparser and xmit work must be complete */
	if (psock->bpf_parse)
		bpf_prog_put(psock->bpf_parse);
	if (psock->bpf_verdict)
		bpf_prog_put(psock->bpf_verdict);

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		list_del(&e->list);
		kfree(e);
	}

	sock_put(psock->sock);
	kfree(psock);
}
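
/* Allocate and initialize the per-socket psock state, attach it to the
 * sock via sk_user_data, and take a reference on the sock. Returns an
 * ERR_PTR on allocation failure.
 */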
static struct smap_psock *smap_init_psock(struct sock *sock,
					  struct bpf_stab *stab)
{
	struct smap_psock *psock;

	psock = kzalloc_node(sizeof(struct smap_psock),
			     GFP_ATOMIC | __GFP_NOWARN,
			     stab->map.numa_node);
	if (!psock)
		return ERR_PTR(-ENOMEM);

	psock->sock = sock;
	skb_queue_head_init(&psock->rxqueue);
	INIT_WORK(&psock->tx_work, smap_tx_work);
	INIT_WORK(&psock->gc_work, smap_gc_work);
	INIT_LIST_HEAD(&psock->maps);
	psock->refcnt = 1;

	rcu_assign_sk_user_data(sock, psock);
	sock_hold(sock);
	return psock;
}

static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
	struct bpf_stab *stab;
	int err = -EINVAL;
	u64 cost;

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		return ERR_PTR(-E2BIG);

	stab = kzalloc(sizeof(*stab), GFP_USER);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	stab->map.map_type = attr->map_type;
	stab->map.key_size = attr->key_size;
	stab->map.value_size = attr->value_size;
	stab->map.max_entries = attr->max_entries;
	stab->map.map_flags = attr->map_flags;
	stab->map.numa_node = bpf_map_attr_numa_node(attr);

	/* make sure page count doesn't overflow */
	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_stab;

	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(stab->map.pages);
	if (err)
		goto free_stab;

	err = -ENOMEM;
	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
					    sizeof(struct sock *),
					    stab->map.numa_node);
	if (!stab->sock_map)
		goto free_stab;

	return &stab->map;
free_stab:
	kfree(stab);
	return ERR_PTR(err);
}

static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
{
	struct smap_psock_map_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		if (e->entry == entry) {
			list_del(&e->list);
			break;
		}
	}
}

static void sock_map_free(struct bpf_map *map)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	int i;

	synchronize_rcu();

	/* At this point no update, lookup or delete operations can happen.
	 * However, be aware we can still get socket state event updates
	 * and data ready callbacks that reference the psock from sk_user_data.
	 * Also psock worker threads are still in-flight. So smap_release_sock
	 * will only free the psock after cancel_sync on the worker threads
	 * and a grace period expires to ensure the psock is really safe to remove.
	 */
	rcu_read_lock();
	for (i = 0; i < stab->map.max_entries; i++) {
		struct smap_psock *psock;
		struct sock *sock;

		sock = xchg(&stab->sock_map[i], NULL);
		if (!sock)
			continue;

		write_lock_bh(&sock->sk_callback_lock);
		psock = smap_psock_sk(sock);
		smap_list_remove(psock, &stab->sock_map[i]);
		smap_release_sock(psock, sock);
		write_unlock_bh(&sock->sk_callback_lock);
	}
	rcu_read_unlock();

	if (stab->bpf_verdict)
		bpf_prog_put(stab->bpf_verdict);
	if (stab->bpf_parse)
		bpf_prog_put(stab->bpf_parse);

	sock_map_remove_complete(stab);
}
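
/* Iterator used by the generic map get_next_key operation: restarts from
 * key 0 when the supplied key is absent or out of range, and returns
 * -ENOENT once the last slot has been reached.
 */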
static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	u32 i = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (i >= stab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (i == stab->map.max_entries - 1)
		return -ENOENT;

	*next = i + 1;
	return 0;
}

struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);

	if (key >= map->max_entries)
		return NULL;

	return READ_ONCE(stab->sock_map[key]);
}

static int sock_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock *psock;
	int k = *(u32 *)key;
	struct sock *sock;

	if (k >= map->max_entries)
		return -EINVAL;

	sock = xchg(&stab->sock_map[k], NULL);
	if (!sock)
		return -EINVAL;

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);
	if (!psock)
		goto out;

	if (psock->bpf_parse)
		smap_stop_sock(psock, sock);
	smap_list_remove(psock, &stab->sock_map[k]);
	smap_release_sock(psock, sock);
out:
	write_unlock_bh(&sock->sk_callback_lock);
	return 0;
}

/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
 * done inside rcu critical sections. This ensures on updates that the psock
 * will not be released via smap_release_sock() until concurrent updates/deletes
 * complete. All operations operate on sock_map using cmpxchg and xchg
 * operations to ensure we do not get stale references. Any reads into the
 * map must be done with READ_ONCE() because of this.
 *
 * A psock is destroyed via call_rcu and after any worker threads are cancelled
 * and synced, so we are certain all references from the update/lookup/delete
 * operations as well as references in the data path are no longer in use.
 *
 * Psocks may exist in multiple maps, but only a single set of parse/verdict
 * programs may be inherited from the maps it belongs to. A reference count
 * is kept with the total number of references to the psock from all maps. The
 * psock will not be released until this reaches zero. The psock and sock
 * user data use the sk_callback_lock to protect critical data structures
 * from concurrent access. This prevents two updates from modifying the user
 * data in sock at the same time, and since the lock is required anyway for
 * modifying callbacks, we simply increase its scope slightly.
 *
 * Rules to follow:
 * - psock must always be read inside an RCU critical section
 * - sk_user_data must only be modified inside sk_callback_lock and read
 *   inside an RCU critical section.
 * - psock->maps list must only be read & modified inside sk_callback_lock
 * - sock_map must use READ_ONCE() and (cmp)xchg operations
 * - BPF verdict/parse programs must use READ_ONCE() and xchg operations
 */
static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
				    struct bpf_map *map,
				    void *key, u64 flags)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock_map_entry *e = NULL;
	struct bpf_prog *verdict, *parse;
	struct sock *osock, *sock;
	struct smap_psock *psock;
	u32 i = *(u32 *)key;
	int err;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (unlikely(i >= stab->map.max_entries))
		return -E2BIG;

	sock = READ_ONCE(stab->sock_map[i]);
	if (flags == BPF_EXIST && !sock)
		return -ENOENT;
	else if (flags == BPF_NOEXIST && sock)
		return -EEXIST;

	sock = skops->sk;

	/* 1. If the sock map has BPF programs those will be inherited by the
	 * sock being added. If the sock is already attached to BPF programs
	 * this results in an error.
	 */
	verdict = READ_ONCE(stab->bpf_verdict);
	parse = READ_ONCE(stab->bpf_parse);

	if (parse && verdict) {
		/* bpf prog refcnt may be zero if a concurrent attach operation
		 * removes the program after the above READ_ONCE() but before
		 * we increment the refcnt. If this is the case abort with an
		 * error.
		 */
		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
		if (IS_ERR(verdict))
			return PTR_ERR(verdict);

		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
		if (IS_ERR(parse)) {
			bpf_prog_put(verdict);
			return PTR_ERR(parse);
		}
	}

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);

	/* 2. Do not allow inheriting programs if psock exists and has
	 * already inherited programs. This would create confusion on
	 * which parser/verdict program is running. If no psock exists
	 * create one. Inside sk_callback_lock to ensure concurrent create
	 * doesn't update user data.
	 */
	if (psock) {
		if (READ_ONCE(psock->bpf_parse) && parse) {
			err = -EBUSY;
			goto out_progs;
		}
		psock->refcnt++;
	} else {
		psock = smap_init_psock(sock, stab);
		if (IS_ERR(psock)) {
			err = PTR_ERR(psock);
			goto out_progs;
		}
		set_bit(SMAP_TX_RUNNING, &psock->state);
	}

	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
	if (!e) {
		err = -ENOMEM;
		goto out_progs;
	}
	e->entry = &stab->sock_map[i];

	/* 3. At this point we have a reference to a valid psock that is
	 * running. Attach any BPF programs needed.
	 */
	if (parse && verdict && !psock->strp_enabled) {
		err = smap_init_sock(psock, sock);
		if (err)
			goto out_free;
		smap_init_progs(psock, stab, verdict, parse);
		smap_start_sock(psock, sock);
	}

	/* 4. Place psock in sockmap for use and stop any programs on
	 * the old sock, assuming it's not the same sock we are replacing
	 * it with. Because we can only have a single set of programs, if
	 * old_sock has a strp we can stop it.
	 */
	list_add_tail(&e->list, &psock->maps);
	write_unlock_bh(&sock->sk_callback_lock);

	osock = xchg(&stab->sock_map[i], sock);
	if (osock) {
		struct smap_psock *opsock = smap_psock_sk(osock);

		write_lock_bh(&osock->sk_callback_lock);
		if (osock != sock && parse)
			smap_stop_sock(opsock, osock);
		smap_list_remove(opsock, &stab->sock_map[i]);
		smap_release_sock(opsock, osock);
		write_unlock_bh(&osock->sk_callback_lock);
	}
	return 0;
out_free:
	smap_release_sock(psock, sock);
out_progs:
	if (verdict)
		bpf_prog_put(verdict);
	if (parse)
		bpf_prog_put(parse);
	write_unlock_bh(&sock->sk_callback_lock);
	kfree(e);
	return err;
}

int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
		return -EINVAL;

	switch (type) {
	case BPF_SK_SKB_STREAM_PARSER:
		orig = xchg(&stab->bpf_parse, prog);
		break;
	case BPF_SK_SKB_STREAM_VERDICT:
		orig = xchg(&stab->bpf_verdict, prog);
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (orig)
		bpf_prog_put(orig);

	return 0;
}
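
/* For illustration only (not part of the original file): from user space the
 * stream parser/verdict programs reach sock_map_prog() via BPF_PROG_ATTACH
 * with the map fd as the attach target, and sockets are added through an
 * ordinary map update whose value is a socket fd. A rough sketch using
 * libbpf's syscall wrappers (all fd variables are hypothetical):
 *
 *	bpf_prog_attach(parse_prog_fd, map_fd, BPF_SK_SKB_STREAM_PARSER, 0);
 *	bpf_prog_attach(verdict_prog_fd, map_fd, BPF_SK_SKB_STREAM_VERDICT, 0);
 *
 *	int key = 0;
 *	int sock_fd = accept(listen_fd, NULL, NULL);
 *	bpf_map_update_elem(map_fd, &key, &sock_fd, BPF_ANY);
 */
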
static void *sock_map_lookup(struct bpf_map *map, void *key)
{
	return NULL;
}

static int sock_map_update_elem(struct bpf_map *map,
				void *key, void *value, u64 flags)
{
	struct bpf_sock_ops_kern skops;
	u32 fd = *(u32 *)value;
	struct socket *socket;
	int err;

	socket = sockfd_lookup(fd, &err);
	if (!socket)
		return err;

	skops.sk = socket->sk;
	if (!skops.sk) {
		fput(socket->file);
		return -EINVAL;
	}

	err = sock_map_ctx_update_elem(&skops, map, key, flags);
	fput(socket->file);
	return err;
}

const struct bpf_map_ops sock_map_ops = {
	.map_alloc = sock_map_alloc,
	.map_free = sock_map_free,
	.map_lookup_elem = sock_map_lookup,
	.map_get_next_key = sock_map_get_next_key,
	.map_update_elem = sock_map_update_elem,
	.map_delete_elem = sock_map_delete_elem,
};
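
/* Helper backing bpf_sock_map_update() for sock_ops programs: expects the
 * RCU read lock to be held by the caller (WARN_ON_ONCE otherwise) and
 * forwards to sock_map_ctx_update_elem().
 */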
BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
}

const struct bpf_func_proto bpf_sock_map_update_proto = {
	.func		= bpf_sock_map_update,
	.gpl_only	= false,
	.pkt_access	= true,
	.ret_type	= RET_INTEGER,
	.arg1_type	= ARG_PTR_TO_CTX,
	.arg2_type	= ARG_CONST_MAP_PTR,
	.arg3_type	= ARG_PTR_TO_MAP_KEY,
	.arg4_type	= ARG_ANYTHING,
};