syscall.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>
#include <linux/version.h>

static LIST_HEAD(bpf_map_types);
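/* walk the list of registered map types and let the matching one allocate
 * the map; returns -EINVAL when attr->map_type is unknown
 */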
static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *tl;
	struct bpf_map *map;

	list_for_each_entry(tl, &bpf_map_types, list_node) {
		if (tl->type == attr->map_type) {
			map = tl->ops->map_alloc(attr);
			if (IS_ERR(map))
				return map;
			map->ops = tl->ops;
			map->map_type = attr->map_type;
			return map;
		}
	}
	return ERR_PTR(-EINVAL);
}
/* boot time registration of different map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}
/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}
/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic_dec_and_test(&map->refcnt)) {
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}
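/* called when the last fd referencing the map is closed;
 * drops the reference taken in map_create()
 */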
static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	bpf_map_put(map);
	return 0;
}

static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};
/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL
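/* e.g. CHECK_ATTR(BPF_MAP_CREATE) evaluates to true if any byte of 'attr'
 * past BPF_MAP_CREATE_LAST_FIELD is non-zero, i.e. user space set a field
 * this command does not understand
 */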
#define BPF_MAP_CREATE_LAST_FIELD max_entries
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	struct bpf_map *map;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	atomic_set(&map->refcnt, 1);

	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_map;

	return err;

free_map:
	map->ops->map_free(map);
	return err;
}
/* if error is returned, fd is released.
 * On success caller should complete fd access with matching fdput()
 */
struct bpf_map *bpf_map_get(struct fd f)
{
	struct bpf_map *map;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	map = f.file->private_data;

	return map;
}
/* helper to convert user pointers passed inside __aligned_u64 fields */
static void __user *u64_to_ptr(__u64 val)
{
	return (void __user *) (unsigned long) val;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value
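/* called via syscall: copy the key in from user space, look the element up
 * under rcu_read_lock() and copy the value back out to user space
 */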
static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key, *value, *ptr;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	rcu_read_lock();
	ptr = map->ops->map_lookup_elem(map, key);
	if (ptr)
		memcpy(value, ptr, map->value_size);
	rcu_read_unlock();

	err = -ENOENT;
	if (!ptr)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, map->value_size) != 0)
		goto free_value;

	err = 0;

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags
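/* called via syscall: copy key and value in from user space and create or
 * update the element under rcu_read_lock() according to attr->flags
 */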
static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key, *value;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, map->value_size) != 0)
		goto free_value;

	/* eBPF programs that use maps run under rcu_read_lock(), and all map
	 * accessors rely on that fact, so do the same here
	 */
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, attr->flags);
	rcu_read_unlock();

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key
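/* called via syscall: copy the key in from user space and delete the
 * matching element under rcu_read_lock()
 */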
static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();

free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key
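/* called via syscall: given a key, copy the key that follows it in the map
 * back to user space; repeated calls iterate over all elements
 */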
static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *unext_key = u64_to_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key, *next_key;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	err = 0;

free_next_key:
	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
static LIST_HEAD(bpf_prog_types);
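/* look up the ops registered for a program type and attach them to the
 * program being loaded; returns -EINVAL for an unknown type
 */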
static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	struct bpf_prog_type_list *tl;

	list_for_each_entry(tl, &bpf_prog_types, list_node) {
		if (tl->type == type) {
			prog->aux->ops = tl->ops;
			prog->type = type;
			return 0;
		}
	}
	return -EINVAL;
}
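/* boot time registration of different program types */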
void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
	list_add(&tl->list_node, &bpf_prog_types);
}
/* fixup insn->imm field of bpf_call instructions:
 * if (insn->imm == BPF_FUNC_map_lookup_elem)
 *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 * else if (insn->imm == BPF_FUNC_map_update_elem)
 *      insn->imm = bpf_map_update_elem - __bpf_call_base;
 * else ...
 *
 * this function is called after the eBPF program passed verification
 */
static void fixup_bpf_calls(struct bpf_prog *prog)
{
	const struct bpf_func_proto *fn;
	int i;

	for (i = 0; i < prog->len; i++) {
		struct bpf_insn *insn = &prog->insnsi[i];

		if (insn->code == (BPF_JMP | BPF_CALL)) {
			/* we reach here when the program has bpf_call
			 * instructions and it passed bpf_check(), which means
			 * that ops->get_func_proto must have been supplied
			 */
			BUG_ON(!prog->aux->ops->get_func_proto);

			fn = prog->aux->ops->get_func_proto(insn->imm);
			/* all functions that have a prototype and that the
			 * verifier allowed programs to call must be real
			 * in-kernel functions
			 */
			BUG_ON(!fn->func);
			insn->imm = fn->func - __bpf_call_base;
		}
	}
}
/* drop refcnt on maps used by eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
	int i;

	for (i = 0; i < aux->used_map_cnt; i++)
		bpf_map_put(aux->used_maps[i]);

	kfree(aux->used_maps);
}
void bpf_prog_put(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		free_used_maps(prog->aux);
		bpf_prog_free(prog);
	}
}
EXPORT_SYMBOL_GPL(bpf_prog_put);

static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	struct bpf_prog *prog = filp->private_data;

	bpf_prog_put(prog);
	return 0;
}

static const struct file_operations bpf_prog_fops = {
	.release = bpf_prog_release,
};
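/* fetch the bpf_prog behind an fd; returns -EINVAL (and drops the fd
 * reference) if the fd does not refer to an eBPF program
 */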
static struct bpf_prog *get_prog(struct fd f)
{
	struct bpf_prog *prog;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_prog_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	prog = f.file->private_data;

	return prog;
}
/* called by sockets/tracing/seccomp before attaching program to an event
 * pairs with bpf_prog_put()
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog;

	prog = get_prog(f);

	if (IS_ERR(prog))
		return prog;

	atomic_inc(&prog->aux->refcnt);
	fdput(f);
	return prog;
}
EXPORT_SYMBOL_GPL(bpf_prog_get);
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD kern_version
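/* called via syscall: copy the license and instructions in from user space,
 * run the verifier, fix up helper calls, select the runtime (interpreter or
 * JIT) and return a new program fd
 */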
static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* copy eBPF program license from user space */
	if (strncpy_from_user(license, u64_to_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	is_gpl = license_is_gpl_compatible(license);

	if (attr->insn_cnt >= BPF_MAXINSNS)
		return -EINVAL;

	if (type == BPF_PROG_TYPE_KPROBE &&
	    attr->kern_version != LINUX_VERSION_CODE)
		return -EINVAL;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
			   prog->len * sizeof(struct bpf_insn)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = false;

	atomic_set(&prog->aux->refcnt, 1);
	prog->gpl_compatible = is_gpl;

	/* find program type: socket_filter vs tracing_filter */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	/* run eBPF verifier */
	err = bpf_check(&prog, attr);
	if (err < 0)
		goto free_used_maps;

	/* fixup BPF_CALL->imm field */
	fixup_bpf_calls(prog);

	/* eBPF program is ready to be JITed */
	bpf_prog_select_runtime(prog);

	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_used_maps;

	return err;

free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_free(prog);
	return err;
}
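/* the bpf() syscall multiplexer: validate 'uattr' and 'size', copy the
 * attributes into the kernel and dispatch to the per-command handlers above
 */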
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when security audit is clean. Note that eBPF+tracing must have
	 * this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			err = get_user(val, addr);
			if (err)
				return err;
			if (val)
				return -E2BIG;
		}
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}