syscall.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>
static LIST_HEAD(bpf_map_types);

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *tl;
	struct bpf_map *map;

	list_for_each_entry(tl, &bpf_map_types, list_node) {
		if (tl->type == attr->map_type) {
			map = tl->ops->map_alloc(attr);
			if (IS_ERR(map))
				return map;
			map->ops = tl->ops;
			map->map_type = attr->map_type;
			return map;
		}
	}
	return ERR_PTR(-EINVAL);
}

/* boot time registration of different map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic_dec_and_test(&map->refcnt)) {
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}
static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
		/* prog_array stores refcnt-ed bpf_prog pointers
		 * release them all when user space closes prog_array_fd
		 */
		bpf_prog_array_map_clear(map);

	bpf_map_put(map);
	return 0;
}

static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};
/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
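/* For example, with BPF_MAP_CREATE_LAST_FIELD defined as max_entries below,
 * CHECK_ATTR(BPF_MAP_CREATE) evaluates to true (and the command is rejected
 * with -EINVAL) whenever any byte of 'union bpf_attr' past max_entries is
 * non-zero, so attributes set by newer user space are refused rather than
 * silently ignored.
 */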
#define BPF_MAP_CREATE_LAST_FIELD max_entries
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	struct bpf_map *map;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	atomic_set(&map->refcnt, 1);

	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_map;

	return err;

free_map:
	map->ops->map_free(map);
	return err;
}
/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *bpf_map_get(struct fd f)
{
	struct bpf_map *map;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	map = f.file->private_data;

	return map;
}
/* helper to convert user pointers passed inside __aligned_u64 fields */
static void __user *u64_to_ptr(__u64 val)
{
	return (void __user *) (unsigned long) val;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
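/* BPF_MAP_LOOKUP_ELEM command: copy the key from user space, look up the
 * element under rcu_read_lock() and, if found, copy its value back to the
 * user-supplied buffer.
 */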
static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key, *value, *ptr;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	rcu_read_lock();
	ptr = map->ops->map_lookup_elem(map, key);
	if (ptr)
		memcpy(value, ptr, map->value_size);
	rcu_read_unlock();

	err = -ENOENT;
	if (!ptr)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, map->value_size) != 0)
		goto free_value;

	err = 0;

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
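/* BPF_MAP_UPDATE_ELEM command: copy key and value from user space and insert
 * or replace the element via the map implementation's update callback.
 */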
static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key, *value;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, map->value_size) != 0)
		goto free_value;

	/* eBPF programs that use maps are running under rcu_read_lock(),
	 * therefore all map accessors rely on this fact, so do the same here
	 */
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, attr->flags);
	rcu_read_unlock();

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
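/* BPF_MAP_DELETE_ELEM command: copy the key from user space and remove the
 * matching element, if any, under rcu_read_lock().
 */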
static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();

free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
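/* BPF_MAP_GET_NEXT_KEY command: given a key, return the next key in the map
 * to user space, which allows iterating over all elements key by key.
 */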
static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *unext_key = u64_to_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key, *next_key;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	err = 0;

free_next_key:
	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

static LIST_HEAD(bpf_prog_types);
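/* look up the ops registered for a program type and attach them to the
 * program being loaded; fails if the type was never registered
 */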
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	struct bpf_prog_type_list *tl;

	list_for_each_entry(tl, &bpf_prog_types, list_node) {
		if (tl->type == type) {
			prog->aux->ops = tl->ops;
			prog->type = type;
			return 0;
		}
	}

	return -EINVAL;
}

void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
	list_add(&tl->list_node, &bpf_prog_types);
}
/* fixup insn->imm field of bpf_call instructions:
 * if (insn->imm == BPF_FUNC_map_lookup_elem)
 *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 * else if (insn->imm == BPF_FUNC_map_update_elem)
 *      insn->imm = bpf_map_update_elem - __bpf_call_base;
 * else ...
 *
 * this function is called after the eBPF program has passed verification
 */
static void fixup_bpf_calls(struct bpf_prog *prog)
{
	const struct bpf_func_proto *fn;
	int i;

	for (i = 0; i < prog->len; i++) {
		struct bpf_insn *insn = &prog->insnsi[i];

		if (insn->code == (BPF_JMP | BPF_CALL)) {
			/* we reach here when program has bpf_call instructions
			 * and it passed bpf_check(), which means that
			 * ops->get_func_proto must have been supplied, check it
			 */
			BUG_ON(!prog->aux->ops->get_func_proto);

			if (insn->imm == BPF_FUNC_tail_call) {
				/* mark bpf_tail_call as different opcode
				 * to avoid conditional branch in
				 * interpreter for every normal call
				 * and to prevent accidental JITing by
				 * JIT compiler that doesn't support
				 * bpf_tail_call yet
				 */
				insn->imm = 0;
				insn->code |= BPF_X;
				continue;
			}

			fn = prog->aux->ops->get_func_proto(insn->imm);
			/* all functions that have a prototype and that the
			 * verifier allowed programs to call must be real
			 * in-kernel functions
			 */
			BUG_ON(!fn->func);
			insn->imm = fn->func - __bpf_call_base;
		}
	}
}
/* drop refcnt on maps used by eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
	int i;

	for (i = 0; i < aux->used_map_cnt; i++)
		bpf_map_put(aux->used_maps[i]);

	kfree(aux->used_maps);
}
static void __prog_put_rcu(struct rcu_head *rcu)
{
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

	free_used_maps(aux);
	bpf_prog_free(aux->prog);
}

/* version of bpf_prog_put() that is called after a grace period */
void bpf_prog_put_rcu(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		prog->aux->prog = prog;
		call_rcu(&prog->aux->rcu, __prog_put_rcu);
	}
}
void bpf_prog_put(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		free_used_maps(prog->aux);
		bpf_prog_free(prog);
	}
}
EXPORT_SYMBOL_GPL(bpf_prog_put);

static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	struct bpf_prog *prog = filp->private_data;

	bpf_prog_put_rcu(prog);
	return 0;
}

static const struct file_operations bpf_prog_fops = {
	.release = bpf_prog_release,
};
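/* convert an fd into its bpf_prog, verifying that the file really is an
 * eBPF program; on mismatch the fd reference is dropped here
 */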
static struct bpf_prog *get_prog(struct fd f)
{
	struct bpf_prog *prog;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_prog_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	prog = f.file->private_data;

	return prog;
}

/* called by sockets/tracing/seccomp before attaching program to an event
 * pairs with bpf_prog_put()
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog;

	prog = get_prog(f);
	if (IS_ERR(prog))
		return prog;

	atomic_inc(&prog->aux->refcnt);
	fdput(f);
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get);

/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version
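/* BPF_PROG_LOAD command: copy the license and instructions from user space,
 * run the verifier, fix up helper call offsets, select the runtime
 * (interpreter or JIT) and return a new program fd on success.
 */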
static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* copy eBPF program license from user space */
	if (strncpy_from_user(license, u64_to_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	is_gpl = license_is_gpl_compatible(license);

	if (attr->insn_cnt >= BPF_MAXINSNS)
		return -EINVAL;

	if (type == BPF_PROG_TYPE_KPROBE &&
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
			   prog->len * sizeof(struct bpf_insn)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = false;

	atomic_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl;

	/* find program type: socket_filter vs tracing_filter */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	/* run eBPF verifier */
	err = bpf_check(&prog, attr);
	if (err < 0)
		goto free_used_maps;

	/* fixup BPF_CALL->imm field */
	fixup_bpf_calls(prog);

	/* eBPF program is ready to be JITed */
	err = bpf_prog_select_runtime(prog);
	if (err < 0)
		goto free_used_maps;

	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_used_maps;

	return err;

free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_free(prog);
	return err;
}
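/* Illustrative user-space invocation (not part of this file): a command,
 * an attribute union and its size are passed to the bpf(2) syscall, e.g.
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_HASH,
 *		.key_size    = 4,
 *		.value_size  = 8,
 *		.max_entries = 1024,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 * All fields of 'attr' that a given command does not use must be zero.
 */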
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when security audit is clean. Note that eBPF+tracing must have
	 * this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			err = get_user(val, addr);
			if (err)
				return err;
			if (val)
				return -E2BIG;
		}
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}