sockmap.c

/* Copyright (c) 2017 Covalent IO, Inc. http://covalent.io
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

/* A BPF sock_map is used to store sock objects. This is primarily used
 * for doing socket redirect with BPF helper routines.
 *
 * A sock map may have BPF programs attached to it; currently a program
 * used to parse packets and a program to provide a verdict and redirect
 * decision on the packet are supported. Any programs attached to a sock
 * map are inherited by sock objects when they are added to the map. If
 * no BPF programs are attached the sock object may only be used for sock
 * redirect.
 *
 * A sock object may be in multiple maps, but can only inherit a single
 * parse or verdict program. If adding a sock object to a map would result
 * in having multiple parsing programs the update will return an EBUSY error.
 *
 * For reference this map is similar to the devmap used in the XDP context;
 * reviewing these together may be useful. For an example please review
 * ./samples/bpf/sockmap/.
 */
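
/* For illustration only, a hedged sketch (not part of this file) of the kind
 * of BPF_PROG_TYPE_SK_SKB parser/verdict programs this map is designed to work
 * with, loosely following ./samples/bpf/sockmap/; the map name sock_map, the
 * section names, and the fixed index 0 are placeholders. The verdict program
 * returns the result of bpf_sk_redirect_map() to redirect the skb to the
 * socket stored at that index.
 *
 *	struct bpf_map_def SEC("maps") sock_map = {
 *		.type = BPF_MAP_TYPE_SOCKMAP,
 *		.key_size = sizeof(int),
 *		.value_size = sizeof(int),
 *		.max_entries = 20,
 *	};
 *
 *	SEC("sk_skb/stream_parser")
 *	int bpf_prog_parser(struct __sk_buff *skb)
 *	{
 *		return skb->len;
 *	}
 *
 *	SEC("sk_skb/stream_verdict")
 *	int bpf_prog_verdict(struct __sk_buff *skb)
 *	{
 *		int key = 0;
 *
 *		return bpf_sk_redirect_map(skb, &sock_map, key, 0);
 *	}
 */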

#include <linux/bpf.h>
#include <net/sock.h>
#include <linux/filter.h>
#include <linux/errno.h>
#include <linux/file.h>
#include <linux/kernel.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/workqueue.h>
#include <linux/list.h>
#include <net/strparser.h>
#include <net/tcp.h>

#define SOCK_CREATE_FLAG_MASK \
	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)

struct bpf_stab {
	struct bpf_map map;
	struct sock **sock_map;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
};

enum smap_psock_state {
	SMAP_TX_RUNNING,
};

struct smap_psock_map_entry {
	struct list_head list;
	struct sock **entry;
};

struct smap_psock {
	struct rcu_head rcu;
	/* refcnt is used inside sk_callback_lock */
	u32 refcnt;

	/* datapath variables */
	struct sk_buff_head rxqueue;
	bool strp_enabled;

	/* datapath error path cache across tx work invocations */
	int save_rem;
	int save_off;
	struct sk_buff *save_skb;

	struct strparser strp;
	struct bpf_prog *bpf_parse;
	struct bpf_prog *bpf_verdict;
	struct list_head maps;

	/* Back reference used when sock callbacks trigger sockmap operations */
	struct sock *sock;
	unsigned long state;

	struct work_struct tx_work;
	struct work_struct gc_work;

	struct proto *sk_proto;
	void (*save_close)(struct sock *sk, long timeout);
	void (*save_data_ready)(struct sock *sk);
	void (*save_write_space)(struct sock *sk);
};

static inline struct smap_psock *smap_psock_sk(const struct sock *sk)
{
	return rcu_dereference_sk_user_data(sk);
}

static struct proto tcp_bpf_proto;

static int bpf_tcp_init(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return -EINVAL;
	}

	if (unlikely(psock->sk_proto)) {
		rcu_read_unlock();
		return -EBUSY;
	}

	psock->save_close = sk->sk_prot->close;
	psock->sk_proto = sk->sk_prot;
	sk->sk_prot = &tcp_bpf_proto;
	rcu_read_unlock();
	return 0;
}

static void bpf_tcp_release(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock)) {
		sk->sk_prot = psock->sk_proto;
		psock->sk_proto = NULL;
	}
	rcu_read_unlock();
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock);

static void bpf_tcp_close(struct sock *sk, long timeout)
{
	void (*close_fun)(struct sock *sk, long timeout);
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;
	struct sock *osk;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (unlikely(!psock)) {
		rcu_read_unlock();
		return sk->sk_prot->close(sk, timeout);
	}

	/* The psock may be destroyed anytime after exiting the RCU critical
	 * section so by the time we use close_fun the psock may no longer
	 * be valid. However, bpf_tcp_close is called with the sock lock
	 * held so the close hook and sk are still valid.
	 */
	close_fun = psock->save_close;

	write_lock_bh(&sk->sk_callback_lock);
	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		osk = cmpxchg(e->entry, sk, NULL);
		if (osk == sk) {
			list_del(&e->list);
			smap_release_sock(psock, sk);
		}
	}
	write_unlock_bh(&sk->sk_callback_lock);
	rcu_read_unlock();
	close_fun(sk, timeout);
}

enum __sk_action {
	__SK_DROP = 0,
	__SK_PASS,
	__SK_REDIRECT,
};

static struct tcp_ulp_ops bpf_tcp_ulp_ops __read_mostly = {
	.name = "bpf_tcp",
	.uid = TCP_ULP_BPF,
	.user_visible = false,
	.owner = NULL,
	.init = bpf_tcp_init,
	.release = bpf_tcp_release,
};

static int bpf_tcp_ulp_register(void)
{
	tcp_bpf_proto = tcp_prot;
	tcp_bpf_proto.close = bpf_tcp_close;
	return tcp_register_ulp(&bpf_tcp_ulp_ops);
}

static int smap_verdict_func(struct smap_psock *psock, struct sk_buff *skb)
{
	struct bpf_prog *prog = READ_ONCE(psock->bpf_verdict);
	int rc;

	if (unlikely(!prog))
		return __SK_DROP;

	skb_orphan(skb);
	/* We need to ensure that BPF metadata for maps is also cleared
	 * when we orphan the skb so that we don't have the possibility
	 * to reference a stale map.
	 */
	TCP_SKB_CB(skb)->bpf.map = NULL;
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	preempt_disable();
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	preempt_enable();
	skb->sk = NULL;

	/* Moving return codes from UAPI namespace into internal namespace */
	return rc == SK_PASS ?
		(TCP_SKB_CB(skb)->bpf.map ? __SK_REDIRECT : __SK_PASS) :
		__SK_DROP;
}

static void smap_do_verdict(struct smap_psock *psock, struct sk_buff *skb)
{
	struct sock *sk;
	int rc;

	rc = smap_verdict_func(psock, skb);
	switch (rc) {
	case __SK_REDIRECT:
		sk = do_sk_redirect_map(skb);
		if (likely(sk)) {
			struct smap_psock *peer = smap_psock_sk(sk);

			if (likely(peer &&
				   test_bit(SMAP_TX_RUNNING, &peer->state) &&
				   !sock_flag(sk, SOCK_DEAD) &&
				   sock_writeable(sk))) {
				skb_set_owner_w(skb, sk);
				skb_queue_tail(&peer->rxqueue, skb);
				schedule_work(&peer->tx_work);
				break;
			}
		}
		/* Fall through and free skb otherwise */
	case __SK_DROP:
	default:
		kfree_skb(skb);
	}
}

static void smap_report_sk_error(struct smap_psock *psock, int err)
{
	struct sock *sk = psock->sock;

	sk->sk_err = err;
	sk->sk_error_report(sk);
}

static void smap_read_sock_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	smap_do_verdict(psock, skb);
	rcu_read_unlock();
}

/* Called with lock held on socket */
static void smap_data_ready(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock)) {
		write_lock_bh(&sk->sk_callback_lock);
		strp_data_ready(&psock->strp);
		write_unlock_bh(&sk->sk_callback_lock);
	}
	rcu_read_unlock();
}

static void smap_tx_work(struct work_struct *w)
{
	struct smap_psock *psock;
	struct sk_buff *skb;
	int rem, off, n;

	psock = container_of(w, struct smap_psock, tx_work);

	/* lock sock to avoid losing sk_socket at some point during loop */
	lock_sock(psock->sock);
	if (psock->save_skb) {
		skb = psock->save_skb;
		rem = psock->save_rem;
		off = psock->save_off;
		psock->save_skb = NULL;
		goto start;
	}

	while ((skb = skb_dequeue(&psock->rxqueue))) {
		rem = skb->len;
		off = 0;
start:
		do {
			if (likely(psock->sock->sk_socket))
				n = skb_send_sock_locked(psock->sock,
							 skb, off, rem);
			else
				n = -EINVAL;
			if (n <= 0) {
				if (n == -EAGAIN) {
					/* Retry when space is available */
					psock->save_skb = skb;
					psock->save_rem = rem;
					psock->save_off = off;
					goto out;
				}
				/* Hard errors break pipe and stop xmit */
				smap_report_sk_error(psock, n ? -n : EPIPE);
				clear_bit(SMAP_TX_RUNNING, &psock->state);
				kfree_skb(skb);
				goto out;
			}
			rem -= n;
			off += n;
		} while (rem);
		kfree_skb(skb);
	}
out:
	release_sock(psock->sock);
}

static void smap_write_space(struct sock *sk)
{
	struct smap_psock *psock;

	rcu_read_lock();
	psock = smap_psock_sk(sk);
	if (likely(psock && test_bit(SMAP_TX_RUNNING, &psock->state)))
		schedule_work(&psock->tx_work);
	rcu_read_unlock();
}

static void smap_stop_sock(struct smap_psock *psock, struct sock *sk)
{
	if (!psock->strp_enabled)
		return;

	sk->sk_data_ready = psock->save_data_ready;
	sk->sk_write_space = psock->save_write_space;
	psock->save_data_ready = NULL;
	psock->save_write_space = NULL;
	strp_stop(&psock->strp);
	psock->strp_enabled = false;
}

static void smap_destroy_psock(struct rcu_head *rcu)
{
	struct smap_psock *psock = container_of(rcu,
						struct smap_psock, rcu);

	/* Now that a grace period has passed there is no longer
	 * any reference to this sock in the sockmap so we can
	 * destroy the psock, strparser, and bpf programs. But,
	 * because we use workqueue sync operations we cannot
	 * do it in RCU context.
	 */
	schedule_work(&psock->gc_work);
}

static void smap_release_sock(struct smap_psock *psock, struct sock *sock)
{
	psock->refcnt--;
	if (psock->refcnt)
		return;

	tcp_cleanup_ulp(sock);
	smap_stop_sock(psock, sock);
	clear_bit(SMAP_TX_RUNNING, &psock->state);
	rcu_assign_sk_user_data(sock, NULL);
	call_rcu_sched(&psock->rcu, smap_destroy_psock);
}

static int smap_parse_func_strparser(struct strparser *strp,
				     struct sk_buff *skb)
{
	struct smap_psock *psock;
	struct bpf_prog *prog;
	int rc;

	rcu_read_lock();
	psock = container_of(strp, struct smap_psock, strp);
	prog = READ_ONCE(psock->bpf_parse);

	if (unlikely(!prog)) {
		rcu_read_unlock();
		return skb->len;
	}

	/* Attach the socket for the bpf program to use if needed. We can
	 * do this because strparser clones the skb before handing it to an
	 * upper layer, meaning skb_orphan has been called. We NULL sk on the
	 * way out to ensure we don't trigger a BUG_ON in skb/sk operations
	 * later, and because we are not charging the memory of this skb to
	 * any socket yet.
	 */
	skb->sk = psock->sock;
	bpf_compute_data_pointers(skb);
	rc = (*prog->bpf_func)(skb, prog->insnsi);
	skb->sk = NULL;
	rcu_read_unlock();
	return rc;
}

static int smap_read_sock_done(struct strparser *strp, int err)
{
	return err;
}

static int smap_init_sock(struct smap_psock *psock,
			  struct sock *sk)
{
	static const struct strp_callbacks cb = {
		.rcv_msg = smap_read_sock_strparser,
		.parse_msg = smap_parse_func_strparser,
		.read_sock_done = smap_read_sock_done,
	};

	return strp_init(&psock->strp, sk, &cb);
}

static void smap_init_progs(struct smap_psock *psock,
			    struct bpf_stab *stab,
			    struct bpf_prog *verdict,
			    struct bpf_prog *parse)
{
	struct bpf_prog *orig_parse, *orig_verdict;

	orig_parse = xchg(&psock->bpf_parse, parse);
	orig_verdict = xchg(&psock->bpf_verdict, verdict);

	if (orig_verdict)
		bpf_prog_put(orig_verdict);
	if (orig_parse)
		bpf_prog_put(orig_parse);
}

static void smap_start_sock(struct smap_psock *psock, struct sock *sk)
{
	if (sk->sk_data_ready == smap_data_ready)
		return;

	psock->save_data_ready = sk->sk_data_ready;
	psock->save_write_space = sk->sk_write_space;
	sk->sk_data_ready = smap_data_ready;
	sk->sk_write_space = smap_write_space;
	psock->strp_enabled = true;
}

static void sock_map_remove_complete(struct bpf_stab *stab)
{
	bpf_map_area_free(stab->sock_map);
	kfree(stab);
}

static void smap_gc_work(struct work_struct *w)
{
	struct smap_psock_map_entry *e, *tmp;
	struct smap_psock *psock;

	psock = container_of(w, struct smap_psock, gc_work);

	/* no callback lock needed because we already detached sockmap ops */
	if (psock->strp_enabled)
		strp_done(&psock->strp);

	cancel_work_sync(&psock->tx_work);
	__skb_queue_purge(&psock->rxqueue);

	/* At this point all strparser and xmit work must be complete */
	if (psock->bpf_parse)
		bpf_prog_put(psock->bpf_parse);
	if (psock->bpf_verdict)
		bpf_prog_put(psock->bpf_verdict);

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		list_del(&e->list);
		kfree(e);
	}

	sock_put(psock->sock);
	kfree(psock);
}

static struct smap_psock *smap_init_psock(struct sock *sock,
					  struct bpf_stab *stab)
{
	struct smap_psock *psock;

	psock = kzalloc_node(sizeof(struct smap_psock),
			     GFP_ATOMIC | __GFP_NOWARN,
			     stab->map.numa_node);
	if (!psock)
		return ERR_PTR(-ENOMEM);

	psock->sock = sock;
	skb_queue_head_init(&psock->rxqueue);
	INIT_WORK(&psock->tx_work, smap_tx_work);
	INIT_WORK(&psock->gc_work, smap_gc_work);
	INIT_LIST_HEAD(&psock->maps);
	psock->refcnt = 1;

	rcu_assign_sk_user_data(sock, psock);
	sock_hold(sock);
	return psock;
}

static struct bpf_map *sock_map_alloc(union bpf_attr *attr)
{
	struct bpf_stab *stab;
	int err = -EINVAL;
	u64 cost;

	if (!capable(CAP_NET_ADMIN))
		return ERR_PTR(-EPERM);

	/* check sanity of attributes */
	if (attr->max_entries == 0 || attr->key_size != 4 ||
	    attr->value_size != 4 || attr->map_flags & ~SOCK_CREATE_FLAG_MASK)
		return ERR_PTR(-EINVAL);

	if (attr->value_size > KMALLOC_MAX_SIZE)
		return ERR_PTR(-E2BIG);

	err = bpf_tcp_ulp_register();
	if (err && err != -EEXIST)
		return ERR_PTR(err);

	stab = kzalloc(sizeof(*stab), GFP_USER);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	bpf_map_init_from_attr(&stab->map, attr);

	/* make sure page count doesn't overflow */
	cost = (u64) stab->map.max_entries * sizeof(struct sock *);
	if (cost >= U32_MAX - PAGE_SIZE)
		goto free_stab;

	stab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(stab->map.pages);
	if (err)
		goto free_stab;

	err = -ENOMEM;
	stab->sock_map = bpf_map_area_alloc(stab->map.max_entries *
					    sizeof(struct sock *),
					    stab->map.numa_node);
	if (!stab->sock_map)
		goto free_stab;

	return &stab->map;

free_stab:
	kfree(stab);
	return ERR_PTR(err);
}

static void smap_list_remove(struct smap_psock *psock, struct sock **entry)
{
	struct smap_psock_map_entry *e, *tmp;

	list_for_each_entry_safe(e, tmp, &psock->maps, list) {
		if (e->entry == entry) {
			list_del(&e->list);
			break;
		}
	}
}

static void sock_map_free(struct bpf_map *map)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	int i;

	synchronize_rcu();

	/* At this point no update, lookup or delete operations can happen.
	 * However, be aware we can still get socket state event updates
	 * and data_ready callbacks that reference the psock from sk_user_data.
	 * Also psock worker threads are still in-flight. So smap_release_sock
	 * will only free the psock after cancel_sync on the worker threads
	 * and a grace period expires to ensure the psock is really safe
	 * to remove.
	 */
	rcu_read_lock();
	for (i = 0; i < stab->map.max_entries; i++) {
		struct smap_psock *psock;
		struct sock *sock;

		sock = xchg(&stab->sock_map[i], NULL);
		if (!sock)
			continue;

		write_lock_bh(&sock->sk_callback_lock);
		psock = smap_psock_sk(sock);
		/* This check handles a racing sock event that can get the
		 * sk_callback_lock before this case but after xchg happens
		 * causing the refcnt to hit zero and sock user data (psock)
		 * to be null and queued for garbage collection.
		 */
		if (likely(psock)) {
			smap_list_remove(psock, &stab->sock_map[i]);
			smap_release_sock(psock, sock);
		}
		write_unlock_bh(&sock->sk_callback_lock);
	}
	rcu_read_unlock();

	sock_map_remove_complete(stab);
}

static int sock_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	u32 i = key ? *(u32 *)key : U32_MAX;
	u32 *next = (u32 *)next_key;

	if (i >= stab->map.max_entries) {
		*next = 0;
		return 0;
	}

	if (i == stab->map.max_entries - 1)
		return -ENOENT;

	*next = i + 1;
	return 0;
}

struct sock *__sock_map_lookup_elem(struct bpf_map *map, u32 key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);

	if (key >= map->max_entries)
		return NULL;

	return READ_ONCE(stab->sock_map[key]);
}

static int sock_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock *psock;
	int k = *(u32 *)key;
	struct sock *sock;

	if (k >= map->max_entries)
		return -EINVAL;

	sock = xchg(&stab->sock_map[k], NULL);
	if (!sock)
		return -EINVAL;

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);
	if (!psock)
		goto out;

	if (psock->bpf_parse)
		smap_stop_sock(psock, sock);
	smap_list_remove(psock, &stab->sock_map[k]);
	smap_release_sock(psock, sock);
out:
	write_unlock_bh(&sock->sk_callback_lock);
	return 0;
}

/* Locking notes: Concurrent updates, deletes, and lookups are allowed and are
 * done inside rcu critical sections. This ensures on updates that the psock
 * will not be released via smap_release_sock() until concurrent updates/deletes
 * complete. All operations operate on sock_map using cmpxchg and xchg
 * operations to ensure we do not get stale references. Any reads into the
 * map must be done with READ_ONCE() because of this.
 *
 * A psock is destroyed via call_rcu and after any worker threads are cancelled
 * and synced, so we are certain all references from the update/lookup/delete
 * operations as well as references in the data path are no longer in use.
 *
 * Psocks may exist in multiple maps, but only a single set of parse/verdict
 * programs may be inherited from the maps it belongs to. A reference count
 * is kept with the total number of references to the psock from all maps. The
 * psock will not be released until this reaches zero. The psock and sock
 * user data use the sk_callback_lock to protect critical data structures
 * from concurrent access. This prevents two updates from modifying the
 * user data in sock at the same time; the lock is required anyway for
 * modifying the callbacks, we simply increase its scope slightly.
 *
 * Rules to follow,
 *  - psock must always be read inside RCU critical section
 *  - sk_user_data must only be modified inside sk_callback_lock and read
 *    inside RCU critical section.
 *  - psock->maps list must only be read & modified inside sk_callback_lock
 *  - sock_map must use READ_ONCE and (cmp)xchg operations
 *  - BPF verdict/parse programs must use READ_ONCE and xchg operations
 */
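
/* For illustration only, a minimal sketch (not taken from this file) of the
 * read pattern the rules above require: the map slot is read with READ_ONCE()
 * and the psock is only dereferenced inside an RCU read-side critical section,
 * so it cannot be released underneath the reader. The variables stab and i
 * stand in for any sockmap and slot index.
 *
 *	rcu_read_lock();
 *	sock = READ_ONCE(stab->sock_map[i]);
 *	if (sock) {
 *		struct smap_psock *psock = smap_psock_sk(sock);
 *
 *		if (psock)
 *			;	// safe to use psock until rcu_read_unlock()
 *	}
 *	rcu_read_unlock();
 */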

static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops,
				    struct bpf_map *map,
				    void *key, u64 flags)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct smap_psock_map_entry *e = NULL;
	struct bpf_prog *verdict, *parse;
	struct sock *osock, *sock;
	struct smap_psock *psock;
	u32 i = *(u32 *)key;
	int err;

	if (unlikely(flags > BPF_EXIST))
		return -EINVAL;

	if (unlikely(i >= stab->map.max_entries))
		return -E2BIG;

	sock = READ_ONCE(stab->sock_map[i]);
	if (flags == BPF_EXIST && !sock)
		return -ENOENT;
	else if (flags == BPF_NOEXIST && sock)
		return -EEXIST;

	sock = skops->sk;

	/* 1. If the sock map has BPF programs those will be inherited by the
	 * sock being added. If the sock is already attached to BPF programs
	 * this results in an error.
	 */
	verdict = READ_ONCE(stab->bpf_verdict);
	parse = READ_ONCE(stab->bpf_parse);

	if (parse && verdict) {
		/* bpf prog refcnt may be zero if a concurrent attach operation
		 * removes the program after the above READ_ONCE() but before
		 * we increment the refcnt. If this is the case abort with an
		 * error.
		 */
		verdict = bpf_prog_inc_not_zero(stab->bpf_verdict);
		if (IS_ERR(verdict))
			return PTR_ERR(verdict);

		parse = bpf_prog_inc_not_zero(stab->bpf_parse);
		if (IS_ERR(parse)) {
			bpf_prog_put(verdict);
			return PTR_ERR(parse);
		}
	}

	write_lock_bh(&sock->sk_callback_lock);
	psock = smap_psock_sk(sock);

	/* 2. Do not allow inheriting programs if a psock exists and has
	 * already inherited programs. This would create confusion about
	 * which parser/verdict program is running. If no psock exists,
	 * create one. Done inside sk_callback_lock to ensure a concurrent
	 * create doesn't update the user data.
	 */
	if (psock) {
		if (READ_ONCE(psock->bpf_parse) && parse) {
			err = -EBUSY;
			goto out_progs;
		}
		psock->refcnt++;
	} else {
		psock = smap_init_psock(sock, stab);
		if (IS_ERR(psock)) {
			err = PTR_ERR(psock);
			goto out_progs;
		}

		err = tcp_set_ulp_id(sock, TCP_ULP_BPF);
		if (err)
			goto out_progs;

		set_bit(SMAP_TX_RUNNING, &psock->state);
	}

	e = kzalloc(sizeof(*e), GFP_ATOMIC | __GFP_NOWARN);
	if (!e) {
		err = -ENOMEM;
		goto out_progs;
	}
	e->entry = &stab->sock_map[i];

	/* 3. At this point we have a reference to a valid psock that is
	 * running. Attach any BPF programs needed.
	 */
	if (parse && verdict && !psock->strp_enabled) {
		err = smap_init_sock(psock, sock);
		if (err)
			goto out_free;
		smap_init_progs(psock, stab, verdict, parse);
		smap_start_sock(psock, sock);
	}

	/* 4. Place psock in sockmap for use and stop any programs on
	 * the old sock assuming it's not the same sock we are replacing
	 * it with. Because we can only have a single set of programs, if
	 * old_sock has a strp we can stop it.
	 */
	list_add_tail(&e->list, &psock->maps);
	write_unlock_bh(&sock->sk_callback_lock);

	osock = xchg(&stab->sock_map[i], sock);
	if (osock) {
		struct smap_psock *opsock = smap_psock_sk(osock);

		write_lock_bh(&osock->sk_callback_lock);
		if (osock != sock && parse)
			smap_stop_sock(opsock, osock);
		smap_list_remove(opsock, &stab->sock_map[i]);
		smap_release_sock(opsock, osock);
		write_unlock_bh(&osock->sk_callback_lock);
	}
	return 0;
out_free:
	smap_release_sock(psock, sock);
out_progs:
	if (verdict)
		bpf_prog_put(verdict);
	if (parse)
		bpf_prog_put(parse);
	write_unlock_bh(&sock->sk_callback_lock);
	kfree(e);
	return err;
}

int sock_map_prog(struct bpf_map *map, struct bpf_prog *prog, u32 type)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	if (unlikely(map->map_type != BPF_MAP_TYPE_SOCKMAP))
		return -EINVAL;

	switch (type) {
	case BPF_SK_SKB_STREAM_PARSER:
		orig = xchg(&stab->bpf_parse, prog);
		break;
	case BPF_SK_SKB_STREAM_VERDICT:
		orig = xchg(&stab->bpf_verdict, prog);
		break;
	default:
		return -EOPNOTSUPP;
	}

	if (orig)
		bpf_prog_put(orig);

	return 0;
}

static void *sock_map_lookup(struct bpf_map *map, void *key)
{
	return NULL;
}

static int sock_map_update_elem(struct bpf_map *map,
				void *key, void *value, u64 flags)
{
	struct bpf_sock_ops_kern skops;
	u32 fd = *(u32 *)value;
	struct socket *socket;
	int err;

	socket = sockfd_lookup(fd, &err);
	if (!socket)
		return err;

	skops.sk = socket->sk;
	if (!skops.sk) {
		fput(socket->file);
		return -EINVAL;
	}

	if (skops.sk->sk_type != SOCK_STREAM ||
	    skops.sk->sk_protocol != IPPROTO_TCP) {
		fput(socket->file);
		return -EOPNOTSUPP;
	}

	err = sock_map_ctx_update_elem(&skops, map, key, flags);
	fput(socket->file);
	return err;
}
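
/* For illustration only, a hedged userspace sketch (not part of this file) of
 * how an element might be added with the libbpf syscall wrapper: the key is a
 * u32 slot index and the value is a TCP socket file descriptor, which
 * sock_map_update_elem() above resolves via sockfd_lookup(). The names map_fd
 * and sock_fd are placeholders.
 *
 *	int key = 0;
 *	int err = bpf_map_update_elem(map_fd, &key, &sock_fd, BPF_ANY);
 *
 *	if (err)
 *		perror("bpf_map_update_elem");
 */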

static void sock_map_release(struct bpf_map *map, struct file *map_file)
{
	struct bpf_stab *stab = container_of(map, struct bpf_stab, map);
	struct bpf_prog *orig;

	orig = xchg(&stab->bpf_parse, NULL);
	if (orig)
		bpf_prog_put(orig);
	orig = xchg(&stab->bpf_verdict, NULL);
	if (orig)
		bpf_prog_put(orig);
}

const struct bpf_map_ops sock_map_ops = {
	.map_alloc = sock_map_alloc,
	.map_free = sock_map_free,
	.map_lookup_elem = sock_map_lookup,
	.map_get_next_key = sock_map_get_next_key,
	.map_update_elem = sock_map_update_elem,
	.map_delete_elem = sock_map_delete_elem,
	.map_release = sock_map_release,
};

BPF_CALL_4(bpf_sock_map_update, struct bpf_sock_ops_kern *, bpf_sock,
	   struct bpf_map *, map, void *, key, u64, flags)
{
	WARN_ON_ONCE(!rcu_read_lock_held());
	return sock_map_ctx_update_elem(bpf_sock, map, key, flags);
}

const struct bpf_func_proto bpf_sock_map_update_proto = {
	.func = bpf_sock_map_update,
	.gpl_only = false,
	.pkt_access = true,
	.ret_type = RET_INTEGER,
	.arg1_type = ARG_PTR_TO_CTX,
	.arg2_type = ARG_CONST_MAP_PTR,
	.arg3_type = ARG_PTR_TO_MAP_KEY,
	.arg4_type = ARG_ANYTHING,
};
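
/* For illustration only, a hedged sketch (not part of this file) of how the
 * helper above is typically used from a BPF_PROG_TYPE_SOCK_OPS program to add
 * the current socket to a sockmap, loosely following ./samples/bpf/sockmap/;
 * the map name sock_map and the fixed index 0 are placeholders.
 *
 *	SEC("sockops")
 *	int bpf_sockmap(struct bpf_sock_ops *skops)
 *	{
 *		int key = 0;
 *
 *		if (skops->op == BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB ||
 *		    skops->op == BPF_SOCK_OPS_PASSIVE_ESTABLISHED_CB)
 *			bpf_sock_map_update(skops, &sock_map, &key, BPF_NOEXIST);
 *		return 0;
 *	}
 */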