sockmap.c

/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

/* A BPF sock_map is used to store sock objects. This is primarily used
 * for doing socket redirect with BPF helper routines.
 *
 * A sock map may have BPF programs attached to it; currently a program
 * used to parse packets and a program to provide a verdict and redirect
 * decision on the packet are supported. Any programs attached to a sock
 * map are inherited by sock objects when they are added to the map. If
 * no BPF programs are attached, the sock object may only be used for sock
 * redirect.
 *
 * A sock object may be in multiple maps, but can only inherit a single
 * parse or verdict program. If adding a sock object to a map would result
 * in having multiple parsing programs the update will return an EBUSY error.
 *
 * For reference this program is similar to the devmap used in the XDP
 * context; reviewing these together may be useful. For an example please
 * review ./samples/bpf/sockmap/.
 */
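/* Illustrative sketch (not part of the original file): a minimal pair of
 * sk_skb programs of the kind described above, in the style of the
 * samples. The map layout, section names and the fixed key 0 are
 * assumptions made for the example only; see ./samples/bpf/sockmap/ for
 * a complete, buildable version.
 *
 *	struct bpf_map_def SEC("maps") sock_map = {
 *		.type		= BPF_MAP_TYPE_SOCKMAP,
 *		.key_size	= sizeof(int),
 *		.value_size	= sizeof(int),
 *		.max_entries	= 20,
 *	};
 *
 *	SEC("sk_skb1")
 *	int bpf_prog_parser(struct __sk_buff *skb)
 *	{
 *		return skb->len;	// parser: consume the whole skb
 *	}
 *
 *	SEC("sk_skb2")
 *	int bpf_prog_verdict(struct __sk_buff *skb)
 *	{
 *		// redirect every skb to the socket stored at index 0
 *		return bpf_sk_redirect_map(skb, &sock_map, 0, 0);
 *	}
 */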
#include <linux/bpf.h>
#include <net/sock.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/strparser.h>
#include <net/tcp.h>

struct bpf_stab {
	struct bpf_map map;
	struct sock **sock_map;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
};

enum smap_psock_state {
	SMAP_TX_RUNNING,
};

struct smap_psock_map_entry {
	struct list_head list;
	struct sock **entry;
};

struct smap_psock {
	struct rcu_head rcu;
	/* refcnt is used inside sk_callback_lock */
	u32 refcnt;

	/* datapath variables */
	struct sk_buff_head rxqueue;
	bool strp_enabled;

	/* datapath error path cache across tx work invocations */
	int save_rem;
	int save_off;
	struct sk_buff *save_skb;

	struct strparser strp;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
	struct list_head maps;

	/* Back reference used when sock callbacks trigger sockmap operations */
	struct sock *sock;
	unsigned long state;

	struct work_struct tx_work;
	struct work_struct gc_work;

	void (*save_data_ready)(struct sock *sk);
	void (*save_write_space)(struct sock *sk);
	void (*save_state_change)(struct sock *sk);
};

static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
	return rcu_dereference_sk_user_data(sk);
}

/* compute the linear packet data range [data, data_end) for skb when
 * sk_skb type programs are in use.
 */
static inline void bpf_compute_data_end_sk_skb(struct sk_buff *skb)
{
	TCP_SKB_CB(skb)->bpf.data_end = skb->data + skb_headlen(skb);
}

enum __sk_action {
	__SK_DROP = 0,
	__SK_PASS,
	__SK_REDIRECT,
};

static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
	int rc;

	if (unlikely(!prog))
		return __SK_DROP;

	skb_orphan(skb);
	/* We need to ensure that BPF metadata for maps is also cleared
	 * when we orphan the skb so that we don't have the possibility
	 * to reference a stale map.
	 */
	TCP_SKB_CB(skb)->bpf.map = NULL;
	skb->sk = psock->sock;
	bpf_compute_data_end_sk_skb(skb);
	preempt_disable();
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	preempt_enable();
	skb->sk = NULL;

	/* Moving return codes from UAPI namespace into internal namespace */
	return rc == SK_PASS ?
		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
		__SK_DROP;
}

static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
{
	struct sock *sk;
	int rc;

	rc = smap_verdict_func(psock, skb);
	switch (rc) {
	case __SK_REDIRECT:
		sk = do_sk_redirect_map(skb);
		if (likely(sk)) {
			struct smap_psock *peer = smap_psock_sk(sk);

			if (likely(peer &&
				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
				   !sock_flag(sk, SOCK_DEAD) &&
				   sock_writeable(sk))) {
				skb_set_owner_w(skb, sk);
				skb_queue_tail(&peer->rxqueue, skb);
				schedule_work(&peer->tx_work);
				break;
			}
		}
		/* Fall through and free skb otherwise */
	case __SK_DROP:
	default:
		kfree_skb(skb);
	}
}

static void smap_report_sk_error(struct smap_psock *psock, int err)
{
	struct sock *sk = psock->sock;

	sk->sk_err = err;
	sk->sk_error_report(sk);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);

/* Called with lock_sock(sk) held */
static void smap_state_change(struct sock *sk)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;
	struct socket_wq *wq;
	struct sock *osk;

	rcu_read_lock();

	/* Allowing transitions into the ESTABLISHED and SYN_RECV states
	 * allows for early binding of sockets to a smap object before the
	 * connection is established.
	 */
	switch (sk->sk_state) {
	case TCP_SYN_SENT:
	case TCP_SYN_RECV:
	case TCP_ESTABLISHED:
		break;
	case TCP_CLOSE_WAIT:
	case TCP_CLOSING:
	case TCP_LAST_ACK:
	case TCP_FIN_WAIT1:
	case TCP_FIN_WAIT2:
	case TCP_LISTEN:
		break;
	case TCP_CLOSE:
		/* Only release if the map entry is in fact the sock in
		 * question. There is a case where the operator deletes
		 * the sock from the map, but the TCP sock is closed before
		 * the psock is detached. Use cmpxchg to verify the correct
		 * sock is removed.
		 */
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		write_lock_bh(&sk->sk_callback_lock);
		list_for_each_entry_safe(e, tmp, &psock->maps, list) {
			osk = cmpxchg(e->entry, sk, NULL);
			if (osk == sk) {
				list_del(&e->list);
				smap_release_sock(psock, sk);
			}
		}
		write_unlock_bh(&sk->sk_callback_lock);
		break;
	default:
		psock = smap_psock_sk(sk);
		if (unlikely(!psock))
			break;
		smap_report_sk_error(psock, EPIPE);
		break;
	}

	wq = rcu_dereference(sk->sk_wq);
	if (skwq_has_sleeper(wq))
		wake_up_interruptible_all(&wq->wait);
	rcu_read_unlock();
}

static void smap_read_sock_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	smap_do_verdict(psock, skb);
	rcu_read_unlock();
}

/* Called with lock held on socket */
static void smap_data_ready(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock)) {
		write_lock_bh(&sk->sk_callback_lock);
		strp_data_ready(&psock->strp);
		write_unlock_bh(&sk->sk_callback_lock);
	}
	rcu_read_unlock();
}

static void smap_tx_work(struct work_struct *w)
{
	struct smap_psock *psock;
	struct sk_buff *skb;
	int rem, off, n;

	psock = container_of(w, struct smap_psock, tx_work);

	/* lock sock to avoid losing sk_socket at some point during loop */
	lock_sock(psock->sock);
	if (psock->save_skb) {
		skb = psock->save_skb;
		rem = psock->save_rem;
		off = psock->save_off;
		psock->save_skb = NULL;
		goto start;
	}

	while ((skb = skb_dequeue(&psock->rxqueue))) {
		rem = skb->len;
		off = 0;
start:
		do {
			if (likely(psock->sock->sk_socket))
				n = skb_send_sock_locked(psock->sock,
							 skb, off, rem);
			else
				n = -EINVAL;
			if (n <= 0) {
				if (n == -EAGAIN) {
					/* Retry when space is available */
					psock->save_skb = skb;
					psock->save_rem = rem;
					psock->save_off = off;
					goto out;
				}
				/* Hard errors break pipe and stop xmit */
				smap_report_sk_error(psock, n ? -n : EPIPE);
				clear_bit(SMAP_TX_RUNNING, &psock->state);
				kfree_skb(skb);
				goto out;
			}
			rem -= n;
			off += n;
		} while (rem);
		kfree_skb(skb);
	}
out:
	release_sock(psock->sock);
}

static void smap_write_space(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
		schedule_work(&psock->tx_work);
	rcu_read_unlock();
}

static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
{
	if (!psock->strp_enabled)
		return;

	sk->sk_data_ready = psock->save_data_ready;
	sk->sk_write_space = psock->save_write_space;
	sk->sk_state_change = psock->save_state_change;
	psock->save_data_ready = NULL;
	psock->save_write_space = NULL;
	psock->save_state_change = NULL;
	strp_stop(&psock->strp);
	psock->strp_enabled = false;
}

static void smap_destroy_psock(struct rcu_head *rcu)
{
	struct smap_psock *psock = container_of(rcu,
						struct smap_psock, rcu);

	/* Now that a grace period has passed there is no longer
	 * any reference to this sock in the sockmap so we can
	 * destroy the psock, strparser, and bpf programs. But,
	 * because we use workqueue sync operations we can not
	 * do it in rcu context
	 */
	schedule_work(&psock->gc_work);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
	psock->refcnt--;
	if (psock->refcnt)
		return;

	smap_stop_sock(psock, sock);
	clear_bit(SMAP_TX_RUNNING, &psock->state);
	rcu_assign_sk_user_data(sock, NULL);
	call_rcu_sched(&psock->rcu, smap_destroy_psock);
}

static int smap_parse_func_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;
	struct bpf_prog *prog;
	int rc;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	prog = READ_ONCE(psock->bpf_parse);

	if (unlikely(!prog)) {
		rcu_read_unlock();
		return skb->len;
	}

	/* Attach the socket for the bpf program to use if needed. We can
	 * do this because strparser clones the skb before handing it to an
	 * upper layer, meaning skb_orphan has been called. We NULL sk on
	 * the way out to ensure we don't trigger a BUG_ON() in skb/sk
	 * operations later and because we are not charging the memory of
	 * this skb to any socket yet.
	 */
	skb->sk = psock->sock;
	bpf_compute_data_end_sk_skb(skb);
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	skb->sk = NULL;
	rcu_read_unlock();
	return rc;
}

static int smap_read_sock_done(struct strparser *strp, int err)
{
	return err;
}

static int smap_init_sock(struct smap_psock *psock,
			  struct sock *sk)
{
	static const struct strp_callbacks cb = {
		.rcv_msg = smap_read_sock_strparser,
		.parse_msg = smap_parse_func_strparser,
		.read_sock_done = smap_read_sock_done,
	};

	return strp_init(&psock->strp, sk, &cb);
}

static void smap_init_progs(struct smap_psock *psock,
			    struct bpf_stab *stab,
			    struct bpf_prog *verdict,
			    struct bpf_prog *parse)
{
	struct bpf_prog *orig_parse, *orig_verdict;

	orig_parse = xchg(&psock->bpf_parse, parse);
	orig_verdict = xchg(&psock->bpf_verdict, verdict);

	if (orig_verdict)
		bpf_prog_put(orig_verdict);
	if (orig_parse)
		bpf_prog_put(orig_parse);
}

static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
{
	if (sk->sk_data_ready == smap_data_ready)
		return;

	psock->save_data_ready = sk->sk_data_ready;
	psock->save_write_space = sk->sk_write_space;
	psock->save_state_change = sk->sk_state_change;
	sk->sk_data_ready = smap_data_ready;
	sk->sk_write_space = smap_write_space;
	sk->sk_state_change = smap_state_change;
	psock->strp_enabled = true;
}

static void sock_map_remove_complete(struct bpf_stab *stab)
{
	bpf_map_area_free(stab->sock_map);
	kfree(stab);
}

static void smap_gc_work(struct work_struct *w)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;

	psock = container_of(w, struct smap_psock, gc_work);

	/* no callback lock needed because we already detached sockmap ops */
	if (psock->strp_enabled)
		strp_done(&psock->strp);

	cancel_work_sync(&psock->tx_work);
	__skb_queue_purge(&psock->rxqueue);

	/* At this point all strparser and xmit work must be complete */
	if (psock->bpf_parse)
		bpf_prog_put(psock->bpf_parse);
	if (psock->bpf_verdict)
		bpf_prog_put(psock->bpf_verdict);

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		list_del(&e->list);
		kfree(e);
	}

	sock_put(psock->sock);
	kfree(psock);
}

static struct smap_psock *smap_init_psock(struct sock *sock,
					  struct bpf_stab *stab)
{
	struct smap_psock *psock;

	psock = kzalloc_node(sizeof(struct smap_psock),
			     GFP_ATOMIC | __GFP_NOWARN,
			     stab->map.numa_node);
	if (!psock)
		return ERR_PTR(-ENOMEM);

	psock->sock = sock;
	skb_queue_head_init(&psock->rxqueue);
	INIT_WORK(&psock->tx_work, smap_tx_work);
	INIT_WORK(&psock->gc_work, smap_gc_work);
	INIT_LIST_HEAD(&psock->maps);
	psock->refcnt = 1;

	rcu_assign_sk_user_data(sock, psock);
	sock_hold(sock);
	return psock;
}

static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
	struct bpf_stab *stab;
	int err = -EINVAL;
	u64 cost;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~BPF_F_NUMA_NODE)
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		return ERR_PTR(-E2BIG);

	stab = kzalloc(sizeof(*stab), GFP_USER);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	stab->map.map_type = attr->map_type;
	stab->map.key_size = attr->key_size;
	stab->map.value_size = attr->value_size;
	stab->map.max_entries = attr->max_entries;
	stab->map.map_flags = attr->map_flags;
	stab->map.numa_node = bpf_map_attr_numa_node(attr);

	/* make sure page count doesn't overflow */
	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_stab;

	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(stab->map.pages);
	if (err)
		goto free_stab;

	err = -ENOMEM;
	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
					    sizeof(struct sock *),
					    stab->map.numa_node);
	if (!stab->sock_map)
		goto free_stab;

	return &stab->map;

free_stab:
	kfree(stab);
	return ERR_PTR(err);
}

static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
{
	struct smap_psock_map_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		if (e->entry == entry) {
			list_del(&e->list);
			break;
		}
	}
}

static void sock_map_free(struct bpf_map *map)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	int i;

	synchronize_rcu();

	/* At this point no update, lookup or delete operations can happen.
	 * However, we can still get socket state event updates and data
	 * ready callbacks that reference the psock from sk_user_data. Also
	 * psock worker threads are still in-flight. So smap_release_sock
	 * will only free the psock after cancel_work_sync() on the worker
	 * threads and after a grace period expires, to ensure the psock is
	 * really safe to remove.
	 */
	rcu_read_lock();
	for (i = 0; i < stab->map.max_entries; i++) {
		struct smap_psock *psock;
		struct sock *sock;

		sock = xchg(&stab->sock_map[i], NULL);
		if (!sock)
			continue;

		write_lock_bh(&sock->sk_callback_lock);
		psock = smap_psock_sk(sock);
		smap_list_remove(psock, &stab->sock_map[i]);
		smap_release_sock(psock, sock);
		write_unlock_bh(&sock->sk_callback_lock);
	}
	rcu_read_unlock();

	if (stab->bpf_verdict)
		bpf_prog_put(stab->bpf_verdict);
	if (stab->bpf_parse)
		bpf_prog_put(stab->bpf_parse);

	sock_map_remove_complete(stab);
}

static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	u32 i = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (i >= stab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (i == stab->map.max_entries - 1)
		return -ENOENT;

	*next = i + 1;
	return 0;
}

struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);

	if (key >= map->max_entries)
		return NULL;

	return READ_ONCE(stab->sock_map[key]);
}

static int sock_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock *psock;
	int k = *(u32 *)key;
	struct sock *sock;

	if (k >= map->max_entries)
		return -EINVAL;

	sock = xchg(&stab->sock_map[k], NULL);
	if (!sock)
		return -EINVAL;

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);
	if (!psock)
		goto out;

	if (psock->bpf_parse)
		smap_stop_sock(psock, sock);
	smap_list_remove(psock, &stab->sock_map[k]);
	smap_release_sock(psock, sock);
out:
	write_unlock_bh(&sock->sk_callback_lock);
	return 0;
}

/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
 * done inside rcu critical sections. This ensures on updates that the psock
 * will not be released via smap_release_sock() until concurrent updates/deletes
 * complete. All operations operate on sock_map using cmpxchg and xchg
 * operations to ensure we do not get stale references. Any reads into the
 * map must be done with READ_ONCE() because of this.
 *
 * A psock is destroyed via call_rcu and after any worker threads are cancelled
 * and synced, so we are certain all references from the update/lookup/delete
 * operations as well as references in the data path are no longer in use.
 *
 * Psocks may exist in multiple maps, but only a single set of parse/verdict
 * programs may be inherited from the maps it belongs to. A reference count
 * is kept with the total number of references to the psock from all maps. The
 * psock will not be released until this reaches zero. The psock and sock
 * user data use the sk_callback_lock to protect critical data structures
 * from concurrent access. This prevents two updates from modifying the user
 * data in sock at the same time, and since the lock is required anyway for
 * modifying the callbacks, we simply increase its scope slightly.
 *
 * Rules to follow,
 *  - psock must always be read inside RCU critical section
 *  - sk_user_data must only be modified inside sk_callback_lock and read
 *    inside RCU critical section.
 *  - psock->maps list must only be read & modified inside sk_callback_lock
 *  - sock_map must use READ_ONCE and (cmp)xchg operations
 *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
 */
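/* Illustrative sketch (not part of the original file) of the access rules
 * above, using only patterns already present in this file; "i" stands for
 * an arbitrary in-range index chosen for the example:
 *
 *	rcu_read_lock();
 *	sock = READ_ONCE(stab->sock_map[i]);	// map reads use READ_ONCE
 *	if (sock) {
 *		write_lock_bh(&sock->sk_callback_lock);
 *		psock = smap_psock_sk(sock);	// sk_user_data, valid under RCU
 *		if (psock) {
 *			// read/modify psock->maps only while holding
 *			// sk_callback_lock
 *		}
 *		write_unlock_bh(&sock->sk_callback_lock);
 *	}
 *	rcu_read_unlock();
 */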
static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
				    struct bpf_map *map,
				    void *key, u64 flags)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock_map_entry *e = NULL;
	struct bpf_prog *verdict, *parse;
	struct sock *osock, *sock;
	struct smap_psock *psock;
	u32 i = *(u32 *)key;
	int err;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (unlikely(i >= stab->map.max_entries))
		return -E2BIG;

	sock = READ_ONCE(stab->sock_map[i]);
	if (flags == BPF_EXIST && !sock)
		return -ENOENT;
	else if (flags == BPF_NOEXIST && sock)
		return -EEXIST;

	sock = skops->sk;

	/* 1. If the sock map has BPF programs, they will be inherited by
	 * the sock being added. If the sock is already attached to BPF
	 * programs, this results in an error.
	 */
	verdict = READ_ONCE(stab->bpf_verdict);
	parse = READ_ONCE(stab->bpf_parse);

	if (parse && verdict) {
		/* The bpf prog refcnt may be zero if a concurrent attach
		 * operation removes the program after the above READ_ONCE()
		 * but before we increment the refcnt. If this is the case,
		 * abort with an error.
		 */
		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
		if (IS_ERR(verdict))
			return PTR_ERR(verdict);

		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
		if (IS_ERR(parse)) {
			bpf_prog_put(verdict);
			return PTR_ERR(parse);
		}
	}

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);

	/* 2. Do not allow inheriting programs if a psock exists and has
	 * already inherited programs. This would create confusion about
	 * which parser/verdict program is running. If no psock exists,
	 * create one. Done inside sk_callback_lock to ensure a concurrent
	 * create doesn't update the user data.
	 */
	if (psock) {
		if (READ_ONCE(psock->bpf_parse) && parse) {
			err = -EBUSY;
			goto out_progs;
		}
		psock->refcnt++;
	} else {
		psock = smap_init_psock(sock, stab);
		if (IS_ERR(psock)) {
			err = PTR_ERR(psock);
			goto out_progs;
		}

		set_bit(SMAP_TX_RUNNING, &psock->state);
	}

	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
	if (!e) {
		err = -ENOMEM;
		goto out_progs;
	}
	e->entry = &stab->sock_map[i];

	/* 3. At this point we have a reference to a valid psock that is
	 * running. Attach any BPF programs needed.
	 */
	if (parse && verdict && !psock->strp_enabled) {
		err = smap_init_sock(psock, sock);
		if (err)
			goto out_free;
		smap_init_progs(psock, stab, verdict, parse);
		smap_start_sock(psock, sock);
	}

	/* 4. Place psock in sockmap for use and stop any programs on
	 * the old sock, assuming it's not the same sock we are replacing
	 * it with. Because we can only have a single set of programs, if
	 * old_sock has a strp we can stop it.
	 */
	list_add_tail(&e->list, &psock->maps);
	write_unlock_bh(&sock->sk_callback_lock);

	osock = xchg(&stab->sock_map[i], sock);
	if (osock) {
		struct smap_psock *opsock = smap_psock_sk(osock);

		write_lock_bh(&osock->sk_callback_lock);
		if (osock != sock && parse)
			smap_stop_sock(opsock, osock);
		smap_list_remove(opsock, &stab->sock_map[i]);
		smap_release_sock(opsock, osock);
		write_unlock_bh(&osock->sk_callback_lock);
	}
	return 0;
out_free:
	smap_release_sock(psock, sock);
out_progs:
	if (verdict)
		bpf_prog_put(verdict);
	if (parse)
		bpf_prog_put(parse);
	write_unlock_bh(&sock->sk_callback_lock);
	kfree(e);
	return err;
}

int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
		return -EINVAL;

	switch (type) {
	case BPF_SK_SKB_STREAM_PARSER:
		orig = xchg(&stab->bpf_parse, prog);
		break;
	case BPF_SK_SKB_STREAM_VERDICT:
		orig = xchg(&stab->bpf_verdict, prog);
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (orig)
		bpf_prog_put(orig);

	return 0;
}

static void *sock_map_lookup(struct bpf_map *map, void *key)
{
	return NULL;
}

static int sock_map_update_elem(struct bpf_map *map,
				void *key, void *value, u64 flags)
{
	struct bpf_sock_ops_kern skops;
	u32 fd = *(u32 *)value;
	struct socket *socket;
	int err;

	socket = sockfd_lookup(fd, &err);
	if (!socket)
		return err;

	skops.sk = socket->sk;
	if (!skops.sk) {
		fput(socket->file);
		return -EINVAL;
	}

	if (skops.sk->sk_type != SOCK_STREAM ||
	    skops.sk->sk_protocol != IPPROTO_TCP) {
		fput(socket->file);
		return -EOPNOTSUPP;
	}

	err = sock_map_ctx_update_elem(&skops, map, key, flags);
	fput(socket->file);
	return err;
}

const struct bpf_map_ops sock_map_ops = {
	.map_alloc = sock_map_alloc,
	.map_free = sock_map_free,
	.map_lookup_elem = sock_map_lookup,
	.map_get_next_key = sock_map_get_next_key,
	.map_update_elem = sock_map_update_elem,
	.map_delete_elem = sock_map_delete_elem,
};

BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
}

const struct bpf_func_proto bpf_sock_map_update_proto = {
	.func = bpf_sock_map_update,
	.gpl_only = false,
	.pkt_access = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_PTR_TO_MAP_KEY,
	.arg4_type = ARG_ANYTHING,
};
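
/* Illustrative sketch (not part of the original file): the helper above is
 * intended to be called from a BPF sock_ops program so that sockets can be
 * added to a sockmap as connections are established. The map name and key
 * are assumptions for the example; see ./samples/bpf/sockmap/ for a
 * complete version:
 *
 *	SEC("sockops")
 *	int bpf_sockmap(struct bpf_sock_ops *skops)
 *	{
 *		int key = 0;
 *
 *		switch (skops->op) {
 *		case BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB:
 *		case BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB:
 *			// add this socket at index "key"; returns 0 on
 *			// success or a negative error
 *			bpf_sock_map_update(skops, &sock_map, &key,
 *					    BPF_NOEXIST);
 *			break;
 *		default:
 *			break;
 *		}
 *		return 0;
 *	}
 */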