syscall.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>

static LIST_HEAD(bpf_map_types);

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *tl;
	struct bpf_map *map;

	list_for_each_entry(tl, &bpf_map_types, list_node) {
		if (tl->type == attr->map_type) {
			map = tl->ops->map_alloc(attr);
			if (IS_ERR(map))
				return map;
			map->ops = tl->ops;
			map->map_type = attr->map_type;
			return map;
		}
	}
	return ERR_PTR(-EINVAL);
}

/* boot time registration of different map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}
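/* Illustrative sketch (not part of this file): a map backend typically
 * registers itself at boot roughly as follows. The my_array_* names are
 * hypothetical placeholders for whatever a real backend (e.g. arraymap
 * or hashtab) defines; only the bpf_map_ops callbacks used elsewhere in
 * this file and bpf_register_map_type() itself are real.
 *
 *	static const struct bpf_map_ops my_array_ops = {
 *		.map_alloc	  = my_array_map_alloc,
 *		.map_free	  = my_array_map_free,
 *		.map_lookup_elem  = my_array_map_lookup_elem,
 *		.map_update_elem  = my_array_map_update_elem,
 *		.map_delete_elem  = my_array_map_delete_elem,
 *		.map_get_next_key = my_array_map_get_next_key,
 *	};
 *
 *	static struct bpf_map_type_list my_array_type __read_mostly = {
 *		.ops  = &my_array_ops,
 *		.type = BPF_MAP_TYPE_ARRAY,
 *	};
 *
 *	static int __init register_my_array_map(void)
 *	{
 *		bpf_register_map_type(&my_array_type);
 *		return 0;
 *	}
 *	late_initcall(register_my_array_map);
 */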
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (the underlying map implementation's ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic_dec_and_test(&map->refcnt)) {
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}
static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	if (map->map_type == BPF_MAP_TYPE_PROG_ARRAY)
		/* prog_array stores refcnt-ed bpf_prog pointers;
		 * release them all when user space closes prog_array_fd
		 */
		bpf_fd_array_map_clear(map);

	bpf_map_put(map);
	return 0;
}

static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};

/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL

#define BPF_MAP_CREATE_LAST_FIELD max_entries
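/* Rough expansion sketch (for illustration only): with the definitions
 * above, CHECK_ATTR(BPF_MAP_CREATE) in map_create() effectively becomes
 *
 *	memchr_inv((void *) &attr->max_entries + sizeof(attr->max_entries),
 *		   0,
 *		   sizeof(*attr) - offsetof(union bpf_attr, max_entries) -
 *		   sizeof(attr->max_entries)) != NULL
 *
 * i.e. it evaluates to true (and the command returns -EINVAL) if any byte
 * of 'union bpf_attr' past the command's last used field is non-zero.
 */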
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	struct bpf_map *map;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	atomic_set(&map->refcnt, 1);

	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_map;

	return err;

free_map:
	map->ops->map_free(map);
	return err;
}

/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *bpf_map_get(struct fd f)
{
	struct bpf_map *map;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	map = f.file->private_data;

	return map;
}

/* helper to convert user pointers passed inside __aligned_u64 fields */
static void __user *u64_to_ptr(__u64 val)
{
	return (void __user *) (unsigned long) val;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value

static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value, *ptr;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	rcu_read_lock();
	ptr = map->ops->map_lookup_elem(map, key);
	if (ptr)
		memcpy(value, ptr, map->value_size);
	rcu_read_unlock();

	err = -ENOENT;
	if (!ptr)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, map->value_size) != 0)
		goto free_value;

	err = 0;

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *value;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, map->value_size) != 0)
		goto free_value;

	/* eBPF programs that use maps run under rcu_read_lock(), and all
	 * map accessors rely on that, so do the same here
	 */
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, attr->flags);
	rcu_read_unlock();

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	struct fd f;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();

free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *unext_key = u64_to_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct bpf_map *map;
	void *key, *next_key;
	struct fd f;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	f = fdget(ufd);
	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	err = 0;

free_next_key:
	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

static LIST_HEAD(bpf_prog_types);

static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	struct bpf_prog_type_list *tl;

	list_for_each_entry(tl, &bpf_prog_types, list_node) {
		if (tl->type == type) {
			prog->aux->ops = tl->ops;
			prog->type = type;
			return 0;
		}
	}
	return -EINVAL;
}

void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
	list_add(&tl->list_node, &bpf_prog_types);
}
/* fixup insn->imm field of bpf_call instructions:
 * if (insn->imm == BPF_FUNC_map_lookup_elem)
 *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 * else if (insn->imm == BPF_FUNC_map_update_elem)
 *      insn->imm = bpf_map_update_elem - __bpf_call_base;
 * else ...
 *
 * this function is called after the eBPF program passed verification
 */
static void fixup_bpf_calls(struct bpf_prog *prog)
{
	const struct bpf_func_proto *fn;
	int i;

	for (i = 0; i < prog->len; i++) {
		struct bpf_insn *insn = &prog->insnsi[i];

		if (insn->code == (BPF_JMP | BPF_CALL)) {
			/* we reach here when the program has bpf_call
			 * instructions and has passed bpf_check(), which
			 * means ops->get_func_proto must have been
			 * supplied; check it
			 */
			BUG_ON(!prog->aux->ops->get_func_proto);

			if (insn->imm == BPF_FUNC_get_route_realm)
				prog->dst_needed = 1;
			if (insn->imm == BPF_FUNC_tail_call) {
				/* mark bpf_tail_call as a different opcode
				 * to avoid a conditional branch in the
				 * interpreter for every normal call, and to
				 * prevent accidental JITing by a JIT
				 * compiler that doesn't support
				 * bpf_tail_call yet
				 */
				insn->imm = 0;
				insn->code |= BPF_X;
				continue;
			}

			fn = prog->aux->ops->get_func_proto(insn->imm);
			/* all functions that have a prototype and that the
			 * verifier allowed programs to call must be real
			 * in-kernel functions
			 */
			BUG_ON(!fn->func);
			insn->imm = fn->func - __bpf_call_base;
		}
	}
}
/* drop refcnt on maps used by eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
	int i;

	for (i = 0; i < aux->used_map_cnt; i++)
		bpf_map_put(aux->used_maps[i]);

	kfree(aux->used_maps);
}

static void __prog_put_rcu(struct rcu_head *rcu)
{
	struct bpf_prog_aux *aux = container_of(rcu, struct bpf_prog_aux, rcu);

	free_used_maps(aux);
	bpf_prog_free(aux->prog);
}

/* version of bpf_prog_put() that is called after a grace period */
void bpf_prog_put_rcu(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		prog->aux->prog = prog;
		call_rcu(&prog->aux->rcu, __prog_put_rcu);
	}
}

void bpf_prog_put(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		free_used_maps(prog->aux);
		bpf_prog_free(prog);
	}
}
EXPORT_SYMBOL_GPL(bpf_prog_put);

static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	struct bpf_prog *prog = filp->private_data;

	bpf_prog_put_rcu(prog);
	return 0;
}

static const struct file_operations bpf_prog_fops = {
	.release = bpf_prog_release,
};

static struct bpf_prog *get_prog(struct fd f)
{
	struct bpf_prog *prog;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_prog_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	prog = f.file->private_data;

	return prog;
}

/* called by sockets/tracing/seccomp before attaching program to an event;
 * pairs with bpf_prog_put()
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog;

	prog = get_prog(f);
	if (IS_ERR(prog))
		return prog;

	atomic_inc(&prog->aux->refcnt);
	fdput(f);
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get);
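/* Illustrative sketch (not part of this file): how an in-kernel user of
 * this API is expected to take and later drop a program reference. The
 * my_event structure and my_event_attach_prog() wrapper are hypothetical;
 * only bpf_prog_get()/bpf_prog_put() are real.
 *
 *	static int my_event_attach_prog(struct my_event *ev, u32 prog_fd)
 *	{
 *		struct bpf_prog *prog = bpf_prog_get(prog_fd);
 *
 *		if (IS_ERR(prog))
 *			return PTR_ERR(prog);
 *		ev->prog = prog;	// keeps the reference taken above
 *		return 0;
 *	}
 *
 * and when the event is torn down:
 *
 *	bpf_prog_put(ev->prog);
 */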
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version

static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* copy eBPF program license from user space */
	if (strncpy_from_user(license, u64_to_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	is_gpl = license_is_gpl_compatible(license);

	if (attr->insn_cnt >= BPF_MAXINSNS)
		return -EINVAL;

	if (type == BPF_PROG_TYPE_KPROBE &&
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
			   prog->len * sizeof(struct bpf_insn)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = 0;

	atomic_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl ? 1 : 0;

	/* find program type: socket_filter vs tracing_filter */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	/* run eBPF verifier */
	err = bpf_check(&prog, attr);
	if (err < 0)
		goto free_used_maps;

	/* fixup BPF_CALL->imm field */
	fixup_bpf_calls(prog);

	/* eBPF program is ready to be JITed */
	err = bpf_prog_select_runtime(prog);
	if (err < 0)
		goto free_used_maps;

	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_used_maps;

	return err;

free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_free(prog);
	return err;
}
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when the security audit is clean. Note that eBPF+tracing must
	 * have this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			err = get_user(val, addr);
			if (err)
				return err;
			if (val)
				return -E2BIG;
		}
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}
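/* Illustrative user-space sketch (not part of this file): creating a small
 * array map and updating one element through this syscall. The constants
 * and attr fields come from <linux/bpf.h>; the variable names are
 * hypothetical, and error handling is omitted for brevity.
 *
 *	#include <stdint.h>
 *	#include <string.h>
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/bpf.h>
 *
 *	union bpf_attr attr;
 *	int map_fd, key = 0;
 *	uint64_t value = 42;
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.map_type    = BPF_MAP_TYPE_ARRAY;
 *	attr.key_size    = sizeof(key);
 *	attr.value_size  = sizeof(value);
 *	attr.max_entries = 4;
 *	map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 *	memset(&attr, 0, sizeof(attr));
 *	attr.map_fd = map_fd;
 *	attr.key    = (uint64_t)(unsigned long)&key;
 *	attr.value  = (uint64_t)(unsigned long)&value;
 *	attr.flags  = BPF_ANY;
 *	syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr));
 */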