syscall.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>

static LIST_HEAD(bpf_map_types);

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
	struct bpf_map_type_list *tl;
	struct bpf_map *map;

	list_for_each_entry(tl, &bpf_map_types, list_node) {
		if (tl->type == attr->map_type) {
			map = tl->ops->map_alloc(attr);
			if (IS_ERR(map))
				return map;
			map->ops = tl->ops;
			map->map_type = attr->map_type;
			return map;
		}
	}
	return ERR_PTR(-EINVAL);
}

/* boot time registration of different map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
	list_add(&tl->list_node, &bpf_map_types);
}
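
/* A sketch (not part of this file) of how a map implementation might register
 * itself at boot, in the style of the in-tree map backends; the ops, the init
 * function name and the map type value are illustrative only:
 *
 *	static const struct bpf_map_ops my_map_ops = {
 *		.map_alloc	= my_map_alloc,
 *		.map_free	= my_map_free,
 *		.map_lookup_elem	= my_map_lookup_elem,
 *		.map_update_elem	= my_map_update_elem,
 *		.map_delete_elem	= my_map_delete_elem,
 *		.map_get_next_key	= my_map_get_next_key,
 *	};
 *
 *	static struct bpf_map_type_list my_map_type __read_mostly = {
 *		.ops	= &my_map_ops,
 *		.type	= BPF_MAP_TYPE_HASH,	(whichever enum bpf_map_type this backend implements)
 *	};
 *
 *	static int __init register_my_map(void)
 *	{
 *		bpf_register_map_type(&my_map_type);
 *		return 0;
 *	}
 *	late_initcall(register_my_map);
 */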

/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
	struct bpf_map *map = container_of(work, struct bpf_map, work);

	/* implementation dependent freeing */
	map->ops->map_free(map);
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (the underlying map implementation's ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
	if (atomic_dec_and_test(&map->refcnt)) {
		INIT_WORK(&map->work, bpf_map_free_deferred);
		schedule_work(&map->work);
	}
}

static int bpf_map_release(struct inode *inode, struct file *filp)
{
	struct bpf_map *map = filp->private_data;

	bpf_map_put(map);
	return 0;
}

static const struct file_operations bpf_map_fops = {
	.release = bpf_map_release,
};

/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
	memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
		   sizeof(attr->CMD##_LAST_FIELD), 0, \
		   sizeof(*attr) - \
		   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
		   sizeof(attr->CMD##_LAST_FIELD)) != NULL

#define BPF_MAP_CREATE_LAST_FIELD max_entries
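
/* Illustration: with the definitions above, CHECK_ATTR(BPF_MAP_CREATE) is true
 * when any byte of *attr past 'max_entries' (the last field BPF_MAP_CREATE
 * understands) is non-zero, i.e. user space filled in a field this command
 * does not know about, and map_create() below then returns -EINVAL.
 */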

/* called via syscall */
static int map_create(union bpf_attr *attr)
{
	struct bpf_map *map;
	int err;

	err = CHECK_ATTR(BPF_MAP_CREATE);
	if (err)
		return -EINVAL;

	/* find map type and init map: hashtable vs rbtree vs bloom vs ... */
	map = find_and_alloc_map(attr);
	if (IS_ERR(map))
		return PTR_ERR(map);

	atomic_set(&map->refcnt, 1);

	err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_map;

	return err;

free_map:
	map->ops->map_free(map);
	return err;
}
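
/* Userspace sketch (not part of this file): creating a map with BPF_MAP_CREATE.
 * There is no libc wrapper for bpf(2), so the raw syscall is used; the map type
 * is an example value and must match one registered via bpf_register_map_type():
 *
 *	union bpf_attr attr = {
 *		.map_type	= BPF_MAP_TYPE_HASH,
 *		.key_size	= sizeof(__u32),
 *		.value_size	= sizeof(__u64),
 *		.max_entries	= 1024,
 *	};
 *	int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *	if (map_fd < 0)
 *		perror("BPF_MAP_CREATE");
 *
 * On success the returned fd holds the only reference on the map; the map is
 * freed once the last fd referring to it is closed (see bpf_map_release()).
 */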

/* If an error is returned, the fd is released.
 * On success the caller should complete fd access with a matching fdput()
 */
struct bpf_map *bpf_map_get(struct fd f)
{
	struct bpf_map *map;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_map_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	map = f.file->private_data;

	return map;
}

/* helper to convert user pointers passed inside __aligned_u64 fields */
static void __user *u64_to_ptr(__u64 val)
{
	return (void __user *) (unsigned long) val;
}
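
/* The userspace counterpart (illustrative) packs pointers the opposite way
 * before filling attr->key / attr->value / attr->insns and friends:
 *
 *	attr.key = (__u64) (unsigned long) key_ptr;
 */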

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value

static int map_lookup_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key, *value, *ptr;
	int err;

	if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	rcu_read_lock();
	ptr = map->ops->map_lookup_elem(map, key);
	if (ptr)
		memcpy(value, ptr, map->value_size);
	rcu_read_unlock();

	err = -ENOENT;
	if (!ptr)
		goto free_value;

	err = -EFAULT;
	if (copy_to_user(uvalue, value, map->value_size) != 0)
		goto free_value;

	err = 0;

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
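
/* Userspace sketch (illustrative): looking up one element in the map created
 * above; key and value sizes must match those given at BPF_MAP_CREATE time:
 *
 *	__u32 key = 1;
 *	__u64 value;
 *	union bpf_attr attr = {
 *		.map_fd	= map_fd,
 *		.key	= (__u64) (unsigned long) &key,
 *		.value	= (__u64) (unsigned long) &value,
 *	};
 *	if (syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0)
 *		printf("value = %llu\n", (unsigned long long) value);
 *	else if (errno == ENOENT)
 *		printf("no such key\n");
 */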

#define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags

static int map_update_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *uvalue = u64_to_ptr(attr->value);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key, *value;
	int err;

	if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	value = kmalloc(map->value_size, GFP_USER);
	if (!value)
		goto free_key;

	err = -EFAULT;
	if (copy_from_user(value, uvalue, map->value_size) != 0)
		goto free_value;

	/* eBPF programs that use maps run under rcu_read_lock(); all map
	 * accessors rely on that, so do the same here
	 */
	rcu_read_lock();
	err = map->ops->map_update_elem(map, key, value, attr->flags);
	rcu_read_unlock();

free_value:
	kfree(value);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
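
/* Userspace sketch (illustrative): inserting or overwriting an element. The
 * flags word is passed straight through to the map implementation's
 * map_update_elem(); 0 is the common "create or update" value:
 *
 *	__u32 key = 1;
 *	__u64 value = 42;
 *	union bpf_attr attr = {
 *		.map_fd	= map_fd,
 *		.key	= (__u64) (unsigned long) &key,
 *		.value	= (__u64) (unsigned long) &value,
 *		.flags	= 0,
 *	};
 *	if (syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &attr, sizeof(attr)) != 0)
 *		perror("BPF_MAP_UPDATE_ELEM");
 */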

#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

static int map_delete_elem(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key;
	int err;

	if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_delete_elem(map, key);
	rcu_read_unlock();

free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

static int map_get_next_key(union bpf_attr *attr)
{
	void __user *ukey = u64_to_ptr(attr->key);
	void __user *unext_key = u64_to_ptr(attr->next_key);
	int ufd = attr->map_fd;
	struct fd f = fdget(ufd);
	struct bpf_map *map;
	void *key, *next_key;
	int err;

	if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
		return -EINVAL;

	map = bpf_map_get(f);
	if (IS_ERR(map))
		return PTR_ERR(map);

	err = -ENOMEM;
	key = kmalloc(map->key_size, GFP_USER);
	if (!key)
		goto err_put;

	err = -EFAULT;
	if (copy_from_user(key, ukey, map->key_size) != 0)
		goto free_key;

	err = -ENOMEM;
	next_key = kmalloc(map->key_size, GFP_USER);
	if (!next_key)
		goto free_key;

	rcu_read_lock();
	err = map->ops->map_get_next_key(map, key, next_key);
	rcu_read_unlock();
	if (err)
		goto free_next_key;

	err = -EFAULT;
	if (copy_to_user(unext_key, next_key, map->key_size) != 0)
		goto free_next_key;

	err = 0;

free_next_key:
	kfree(next_key);
free_key:
	kfree(key);
err_put:
	fdput(f);
	return err;
}
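
/* Userspace sketch (illustrative): walking all keys of a map by repeatedly
 * asking for the key after the current one until no key is left. The starting
 * key need not exist; how a missing start key is treated is up to the map
 * implementation's map_get_next_key():
 *
 *	__u32 key = 0, next_key;
 *	union bpf_attr attr = {
 *		.map_fd		= map_fd,
 *		.key		= (__u64) (unsigned long) &key,
 *		.next_key	= (__u64) (unsigned long) &next_key,
 *	};
 *	while (syscall(__NR_bpf, BPF_MAP_GET_NEXT_KEY, &attr, sizeof(attr)) == 0) {
 *		printf("key %u\n", next_key);
 *		key = next_key;
 *	}
 */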

static LIST_HEAD(bpf_prog_types);

static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
	struct bpf_prog_type_list *tl;

	list_for_each_entry(tl, &bpf_prog_types, list_node) {
		if (tl->type == type) {
			prog->aux->ops = tl->ops;
			prog->aux->prog_type = type;
			return 0;
		}
	}
	return -EINVAL;
}

void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
	list_add(&tl->list_node, &bpf_prog_types);
}

/* fixup insn->imm field of bpf_call instructions:
 * if (insn->imm == BPF_FUNC_map_lookup_elem)
 *	insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 * else if (insn->imm == BPF_FUNC_map_update_elem)
 *	insn->imm = bpf_map_update_elem - __bpf_call_base;
 * else ...
 *
 * this function is called after the eBPF program passed verification
 */
static void fixup_bpf_calls(struct bpf_prog *prog)
{
	const struct bpf_func_proto *fn;
	int i;

	for (i = 0; i < prog->len; i++) {
		struct bpf_insn *insn = &prog->insnsi[i];

		if (insn->code == (BPF_JMP | BPF_CALL)) {
			/* we reach here when the program has bpf_call instructions
			 * and it passed bpf_check(), which means that
			 * ops->get_func_proto must have been supplied; check it
			 */
			BUG_ON(!prog->aux->ops->get_func_proto);

			fn = prog->aux->ops->get_func_proto(insn->imm);

			/* all functions that have a prototype and that the verifier
			 * allowed programs to call must be real in-kernel functions
			 */
			BUG_ON(!fn->func);

			insn->imm = fn->func - __bpf_call_base;
		}
	}
}
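
/* Illustration: before this pass a call instruction carries the helper's enum
 * value, e.g. insn->imm == BPF_FUNC_map_lookup_elem; afterwards imm holds the
 * offset of the in-kernel helper relative to __bpf_call_base, so the
 * interpreter can call (__bpf_call_base + insn->imm)(...) and a JIT can emit
 * a direct call to the same address.
 */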

/* drop the refcnt on maps used by the eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
	int i;

	for (i = 0; i < aux->used_map_cnt; i++)
		bpf_map_put(aux->used_maps[i]);

	kfree(aux->used_maps);
}

void bpf_prog_put(struct bpf_prog *prog)
{
	if (atomic_dec_and_test(&prog->aux->refcnt)) {
		free_used_maps(prog->aux);
		bpf_prog_free(prog);
	}
}

static int bpf_prog_release(struct inode *inode, struct file *filp)
{
	struct bpf_prog *prog = filp->private_data;

	bpf_prog_put(prog);
	return 0;
}

static const struct file_operations bpf_prog_fops = {
	.release = bpf_prog_release,
};

static struct bpf_prog *get_prog(struct fd f)
{
	struct bpf_prog *prog;

	if (!f.file)
		return ERR_PTR(-EBADF);

	if (f.file->f_op != &bpf_prog_fops) {
		fdput(f);
		return ERR_PTR(-EINVAL);
	}

	prog = f.file->private_data;

	return prog;
}

/* called by sockets/tracing/seccomp before attaching a program to an event;
 * pairs with bpf_prog_put()
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
	struct fd f = fdget(ufd);
	struct bpf_prog *prog;

	prog = get_prog(f);
	if (IS_ERR(prog))
		return prog;

	atomic_inc(&prog->aux->refcnt);
	fdput(f);
	return prog;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD log_buf

static int bpf_prog_load(union bpf_attr *attr)
{
	enum bpf_prog_type type = attr->prog_type;
	struct bpf_prog *prog;
	int err;
	char license[128];
	bool is_gpl;

	if (CHECK_ATTR(BPF_PROG_LOAD))
		return -EINVAL;

	/* copy eBPF program license from user space */
	if (strncpy_from_user(license, u64_to_ptr(attr->license),
			      sizeof(license) - 1) < 0)
		return -EFAULT;
	license[sizeof(license) - 1] = 0;

	/* eBPF programs must be GPL compatible to use GPL-ed functions */
	is_gpl = license_is_gpl_compatible(license);

	if (attr->insn_cnt >= BPF_MAXINSNS)
		return -EINVAL;

	/* plain bpf_prog allocation */
	prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
	if (!prog)
		return -ENOMEM;

	prog->len = attr->insn_cnt;

	err = -EFAULT;
	if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
			   prog->len * sizeof(struct bpf_insn)) != 0)
		goto free_prog;

	prog->orig_prog = NULL;
	prog->jited = false;

	atomic_set(&prog->aux->refcnt, 1);
	prog->aux->is_gpl_compatible = is_gpl;

	/* find program type: socket_filter vs tracing_filter */
	err = find_prog_type(type, prog);
	if (err < 0)
		goto free_prog;

	/* run eBPF verifier */
	err = bpf_check(prog, attr);
	if (err < 0)
		goto free_used_maps;

	/* fixup BPF_CALL->imm field */
	fixup_bpf_calls(prog);

	/* eBPF program is ready to be JITed */
	bpf_prog_select_runtime(prog);

	err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
	if (err < 0)
		/* failed to allocate fd */
		goto free_used_maps;

	return err;

free_used_maps:
	free_used_maps(prog->aux);
free_prog:
	bpf_prog_free(prog);
	return err;
}
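
/* Userspace sketch (illustrative): loading a trivial two-instruction program
 * ("r0 = 0; exit") with a verifier log buffer. The program type is an example
 * value and must match one registered via bpf_register_prog_type():
 *
 *	struct bpf_insn insns[2] = {
 *		{ .code = BPF_ALU64 | BPF_MOV | BPF_K, .dst_reg = BPF_REG_0, .imm = 0 },
 *		{ .code = BPF_JMP | BPF_EXIT },
 *	};
 *	char log[4096];
 *	union bpf_attr attr = {
 *		.prog_type	= BPF_PROG_TYPE_SOCKET_FILTER,
 *		.insn_cnt	= 2,
 *		.insns		= (__u64) (unsigned long) insns,
 *		.license	= (__u64) (unsigned long) "GPL",
 *		.log_level	= 1,
 *		.log_size	= sizeof(log),
 *		.log_buf	= (__u64) (unsigned long) log,
 *	};
 *	int prog_fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 *	if (prog_fd < 0)
 *		fprintf(stderr, "BPF_PROG_LOAD: %s\n%s\n", strerror(errno), log);
 */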

SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
	union bpf_attr attr = {};
	int err;

	/* the syscall is limited to root temporarily. This restriction will be
	 * lifted when the security audit is clean. Note that eBPF+tracing must
	 * keep this restriction, since it may pass kernel data to user space
	 */
	if (!capable(CAP_SYS_ADMIN))
		return -EPERM;

	if (!access_ok(VERIFY_READ, uattr, 1))
		return -EFAULT;

	if (size > PAGE_SIZE)	/* silly large */
		return -E2BIG;

	/* If we're handed a bigger struct than we know of,
	 * ensure all the unknown bits are 0 - i.e. new
	 * user-space does not rely on any kernel feature
	 * extensions we don't know about yet.
	 */
	if (size > sizeof(attr)) {
		unsigned char __user *addr;
		unsigned char __user *end;
		unsigned char val;

		addr = (void __user *)uattr + sizeof(attr);
		end  = (void __user *)uattr + size;

		for (; addr < end; addr++) {
			err = get_user(val, addr);
			if (err)
				return err;
			if (val)
				return -E2BIG;
		}
		size = sizeof(attr);
	}

	/* copy attributes from user space, may be less than sizeof(bpf_attr) */
	if (copy_from_user(&attr, uattr, size) != 0)
		return -EFAULT;

	switch (cmd) {
	case BPF_MAP_CREATE:
		err = map_create(&attr);
		break;
	case BPF_MAP_LOOKUP_ELEM:
		err = map_lookup_elem(&attr);
		break;
	case BPF_MAP_UPDATE_ELEM:
		err = map_update_elem(&attr);
		break;
	case BPF_MAP_DELETE_ELEM:
		err = map_delete_elem(&attr);
		break;
	case BPF_MAP_GET_NEXT_KEY:
		err = map_get_next_key(&attr);
		break;
	case BPF_PROG_LOAD:
		err = bpf_prog_load(&attr);
		break;
	default:
		err = -EINVAL;
		break;
	}

	return err;
}
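
/* Userspace sketch (illustrative): since libc provides no bpf() wrapper, a
 * small helper around the raw syscall keeps the examples above readable; the
 * helper name is hypothetical:
 *
 *	#include <unistd.h>
 *	#include <sys/syscall.h>
 *	#include <linux/bpf.h>
 *
 *	static int sys_bpf(int cmd, union bpf_attr *attr, unsigned int size)
 *	{
 *		return syscall(__NR_bpf, cmd, attr, size);
 *	}
 *
 * Passing sizeof(*attr) for 'size' is the normal case; a larger value is
 * accepted only if every extra byte is zero, as enforced above.
 */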