syscall.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>

static LIST_HEAD(bpf_map_types);

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *tl;
	struct bpf_map *map;

	list_for_each_entry(tl, &bpf_map_types, list_node) {
		if (tl->type == attr->map_type) {
			map = tl->ops->map_alloc(attr);
			if (IS_ERR(map))
				return map;
			map->ops = tl->ops;
			map->map_type = attr->map_type;
			return map;
		}
	}
	return ERR_PTR(-EINVAL);
}

/* boot time registration of different map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}

/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (the underlying map implementation's ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic_dec_and_test(&map->refcnt)) {
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}

static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
		/* prog_array stores refcnt-ed bpf_prog pointers;
		 * release them all when user space closes prog_array_fd
		 */
		bpf_fd_array_map_clear(map);

	bpf_map_put(map);
	return 0;
}

static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};

/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL

#define BPF_MAP_CREATE_LAST_FIELD max_entries
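
/* Illustrative note (not part of the original source): for BPF_MAP_CREATE,
 * whose last used field is max_entries, CHECK_ATTR(BPF_MAP_CREATE) expands
 * to roughly the following, i.e. it scans every byte of 'union bpf_attr'
 * that lies past max_entries and evaluates true if any of them is non-zero:
 *
 *	memchr_inv((void *) &attr->max_entries + sizeof(attr->max_entries), 0,
 *		   sizeof(*attr) - offsetof(union bpf_attr, max_entries) -
 *		   sizeof(attr->max_entries)) != NULL
 */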

/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	struct bpf_map *map;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	atomic_set(&map->refcnt, 1);

	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_map;

	return err;

free_map:
	map->ops->map_free(map);
	return err;
}
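
/* Illustrative user-space sketch (not part of this file): creating a map
 * through this command would look roughly like the following, assuming the
 * raw syscall(2) wrapper and an array map type; the field names follow
 * 'union bpf_attr' from uapi/linux/bpf.h:
 *
 *	union bpf_attr attr = {
 *		.map_type    = BPF_MAP_TYPE_ARRAY,
 *		.key_size    = sizeof(__u32),
 *		.value_size  = sizeof(__u64),
 *		.max_entries = 64,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 * On success the return value is a new file descriptor whose release
 * handler is bpf_map_release() above.
 */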

/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *bpf_map_get(struct fd f)
{
	struct bpf_map *map;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	map = f.file->private_data;

	return map;
}

/* helper to convert user pointers passed inside __aligned_u64 fields */
static void __user *u64_to_ptr(__u64 val)
{
	return (void __user *) (unsigned long) val;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value

static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value, *ptr;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	rcu_read_lock();
	ptr = map->ops->map_lookup_elem(map, key);
	if (ptr)
		memcpy(value, ptr, map->value_size);
	rcu_read_unlock();

	err = -ENOENT;
	if (!ptr)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, map->value_size) != 0)
		goto free_value;

	err = 0;

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, map->value_size) != 0)
		goto free_value;

	/* eBPF programs that use maps run under rcu_read_lock(),
	 * and all map accessors rely on that, so do the same here
	 */
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, attr->flags);
	rcu_read_unlock();

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
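
/* Illustrative user-space sketch (not part of this file): updating and then
 * looking up one element on the fd obtained from the BPF_MAP_CREATE sketch
 * above; 'key' and 'value' are passed as user pointers packed into
 * __aligned_u64 fields, which is what u64_to_ptr() unpacks:
 *
 *	__u32 key = 3;
 *	__u64 value = 42, out;
 *	union bpf_attr attr = {
 *		.map_fd = map_fd,
 *		.key    = (__u64) (unsigned long) &key,
 *		.value  = (__u64) (unsigned long) &value,
 *		.flags  = BPF_ANY,
 *	};
 *	syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
 *
 *	attr.value = (__u64) (unsigned long) &out;
 *	syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
 */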

#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	struct fd f;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();

free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *unext_key = u64_to_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *next_key;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	err = 0;

free_next_key:
	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
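
/* Illustrative user-space sketch (not part of this file): walking all keys
 * of a map by calling BPF_MAP_GET_NEXT_KEY until it fails with -ENOENT.
 * Whether a starting key that is not in the map yields the first key is up
 * to each map implementation; the array map treats an out-of-range index as
 * "start from 0", so a deliberately invalid starting key works there:
 *
 *	__u32 key = -1, next_key;
 *	union bpf_attr attr = {
 *		.map_fd   = map_fd,
 *		.key      = (__u64) (unsigned long) &key,
 *		.next_key = (__u64) (unsigned long) &next_key,
 *	};
 *	while (syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)) == 0) {
 *		key = next_key;		// consume next_key, then continue from it
 *	}
 */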

static LIST_HEAD(bpf_prog_types);

static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	struct bpf_prog_type_list *tl;

	list_for_each_entry(tl, &bpf_prog_types, list_node) {
		if (tl->type == type) {
			prog->aux->ops = tl->ops;
			prog->type = type;
			return 0;
		}
	}

	return -EINVAL;
}

void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
	list_add(&tl->list_node, &bpf_prog_types);
}

/* fixup insn->imm field of bpf_call instructions:
 * if (insn->imm == BPF_FUNC_map_lookup_elem)
 *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 * else if (insn->imm == BPF_FUNC_map_update_elem)
 *      insn->imm = bpf_map_update_elem - __bpf_call_base;
 * else ...
 *
 * this function is called after the eBPF program passed verification
 */
static void fixup_bpf_calls(struct bpf_prog *prog)
{
	const struct bpf_func_proto *fn;
	int i;

	for (i = 0; i < prog->len; i++) {
		struct bpf_insn *insn = &prog->insnsi[i];

		if (insn->code == (BPF_JMP | BPF_CALL)) {
			/* we reach here when the program has bpf_call
			 * instructions and it passed bpf_check(), which means
			 * ops->get_func_proto must have been supplied; check it
			 */
			BUG_ON(!prog->aux->ops->get_func_proto);

			if (insn->imm == BPF_FUNC_tail_call) {
				/* mark bpf_tail_call as a different opcode
				 * to avoid a conditional branch in the
				 * interpreter for every normal call
				 * and to prevent accidental JITing by a
				 * JIT compiler that doesn't support
				 * bpf_tail_call yet
				 */
				insn->imm = 0;
				insn->code |= BPF_X;
				continue;
			}

			fn = prog->aux->ops->get_func_proto(insn->imm);
			/* all functions that have a prototype and that the
			 * verifier allowed programs to call must be real
			 * in-kernel functions
			 */
			BUG_ON(!fn->func);
			insn->imm = fn->func - __bpf_call_base;
		}
	}
}

/* drop refcnt on maps used by eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
	int i;

	for (i = 0; i < aux->used_map_cnt; i++)
		bpf_map_put(aux->used_maps[i]);

	kfree(aux->used_maps);
}

static void __prog_put_rcu(struct rcu_head *rcu)
{
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

	free_used_maps(aux);
	bpf_prog_free(aux->prog);
}

/* version of bpf_prog_put() that is called after a grace period */
void bpf_prog_put_rcu(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		prog->aux->prog = prog;
		call_rcu(&prog->aux->rcu, __prog_put_rcu);
	}
}

void bpf_prog_put(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		free_used_maps(prog->aux);
		bpf_prog_free(prog);
	}
}
EXPORT_SYMBOL_GPL(bpf_prog_put);

static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	struct bpf_prog *prog = filp->private_data;

	bpf_prog_put_rcu(prog);
	return 0;
}

static const struct file_operations bpf_prog_fops = {
	.release = bpf_prog_release,
};

static struct bpf_prog *get_prog(struct fd f)
{
	struct bpf_prog *prog;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_prog_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	prog = f.file->private_data;

	return prog;
}

/* called by sockets/tracing/seccomp before attaching program to an event;
 * pairs with bpf_prog_put()
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog;

	prog = get_prog(f);
	if (IS_ERR(prog))
		return prog;

	atomic_inc(&prog->aux->refcnt);
	fdput(f);
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get);

/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version

static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* copy eBPF program license from user space */
	if (strncpy_from_user(license, u64_to_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	is_gpl = license_is_gpl_compatible(license);

	if (attr->insn_cnt >= BPF_MAXINSNS)
		return -EINVAL;

	if (type == BPF_PROG_TYPE_KPROBE &&
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
			   prog->len * sizeof(struct bpf_insn)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = false;

	atomic_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl;

	/* find program type: socket_filter vs tracing_filter */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	/* run eBPF verifier */
	err = bpf_check(&prog, attr);
	if (err < 0)
		goto free_used_maps;

	/* fixup BPF_CALL->imm field */
	fixup_bpf_calls(prog);

	/* eBPF program is ready to be JITed */
	err = bpf_prog_select_runtime(prog);
	if (err < 0)
		goto free_used_maps;

	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_used_maps;

	return err;

free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_free(prog);
	return err;
}
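
/* Illustrative user-space sketch (not part of this file): loading a minimal
 * "return 0" program of type BPF_PROG_TYPE_SOCKET_FILTER; the two
 * instructions are written out by hand rather than with insn-building
 * macros to keep the example self-contained:
 *
 *	struct bpf_insn insns[] = {
 *		{ .code = BPF_ALU64 | BPF_MOV | BPF_K,
 *		  .dst_reg = BPF_REG_0, .imm = 0 },	// r0 = 0
 *		{ .code = BPF_JMP | BPF_EXIT },		// return r0
 *	};
 *	union bpf_attr attr = {
 *		.prog_type = BPF_PROG_TYPE_SOCKET_FILTER,
 *		.insns     = (__u64) (unsigned long) insns,
 *		.insn_cnt  = 2,
 *		.license   = (__u64) (unsigned long) "GPL",
 *	};
 *	int prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 *
 * The returned fd can then be attached, e.g. via setsockopt(sock,
 * SOL_SOCKET, SO_ATTACH_BPF, &prog_fd, sizeof(prog_fd)).
 */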

SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when the security audit is clean. Note that eBPF+tracing must
	 * have this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			err = get_user(val, addr);
			if (err)
				return err;
			if (val)
				return -E2BIG;
		}
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}
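
/* Illustrative user-space preamble (not part of this file) assumed by the
 * sketches above: at this point bpf(2) has no libc wrapper, so the examples
 * invoke it through syscall(2) directly; the numeric fallback shown here is
 * the x86-64 syscall number:
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/bpf.h>
 *
 *	#ifndef __NR_bpf
 *	#define __NR_bpf 321
 *	#endif
 */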