syscall.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/syscalls.h>
#include <linux/slab.h>
#include <linux/anon_inodes.h>
#include <linux/file.h>
#include <linux/license.h>
#include <linux/filter.h>

static LIST_HEAD(bpf_map_types);

static struct bpf_map *find_and_alloc_map(union bpf_attr *attr)
{
        struct bpf_map_type_list *tl;
        struct bpf_map *map;

        list_for_each_entry(tl, &bpf_map_types, list_node) {
                if (tl->type == attr->map_type) {
                        map = tl->ops->map_alloc(attr);
                        if (IS_ERR(map))
                                return map;
                        map->ops = tl->ops;
                        map->map_type = attr->map_type;
                        return map;
                }
        }
        return ERR_PTR(-EINVAL);
}

/* boot time registration of different map implementations */
void bpf_register_map_type(struct bpf_map_type_list *tl)
{
        list_add(&tl->list_node, &bpf_map_types);
}

/* called from workqueue */
static void bpf_map_free_deferred(struct work_struct *work)
{
        struct bpf_map *map = container_of(work, struct bpf_map, work);

        /* implementation dependent freeing */
        map->ops->map_free(map);
}

/* decrement map refcnt and schedule it for freeing via workqueue
 * (underlying map implementation ops->map_free() might sleep)
 */
void bpf_map_put(struct bpf_map *map)
{
        if (atomic_dec_and_test(&map->refcnt)) {
                INIT_WORK(&map->work, bpf_map_free_deferred);
                schedule_work(&map->work);
        }
}

static int bpf_map_release(struct inode *inode, struct file *filp)
{
        struct bpf_map *map = filp->private_data;

        bpf_map_put(map);
        return 0;
}

static const struct file_operations bpf_map_fops = {
        .release = bpf_map_release,
};

/* helper macro to check that unused fields of 'union bpf_attr' are zero */
#define CHECK_ATTR(CMD) \
        memchr_inv((void *) &attr->CMD##_LAST_FIELD + \
                   sizeof(attr->CMD##_LAST_FIELD), 0, \
                   sizeof(*attr) - \
                   offsetof(union bpf_attr, CMD##_LAST_FIELD) - \
                   sizeof(attr->CMD##_LAST_FIELD)) != NULL
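
/* Illustrative expansion (not in the original source): with the
 * BPF_MAP_CREATE_LAST_FIELD define just below, CHECK_ATTR(BPF_MAP_CREATE)
 * becomes roughly:
 *
 *      memchr_inv((void *) &attr->max_entries + sizeof(attr->max_entries), 0,
 *                 sizeof(*attr) - offsetof(union bpf_attr, max_entries) -
 *                 sizeof(attr->max_entries)) != NULL
 *
 * i.e. it is true whenever any byte of 'union bpf_attr' past the last field
 * this command understands is non-zero.
 */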
#define BPF_MAP_CREATE_LAST_FIELD max_entries
/* called via syscall */
static int map_create(union bpf_attr *attr)
{
        struct bpf_map *map;
        int err;

        err = CHECK_ATTR(BPF_MAP_CREATE);
        if (err)
                return -EINVAL;

        /* find map type and init map: hashtable vs rbtree vs bloom vs ... */
        map = find_and_alloc_map(attr);
        if (IS_ERR(map))
                return PTR_ERR(map);

        atomic_set(&map->refcnt, 1);

        err = anon_inode_getfd("bpf-map", &bpf_map_fops, map, O_RDWR | O_CLOEXEC);
        if (err < 0)
                /* failed to allocate fd */
                goto free_map;

        return err;

free_map:
        map->ops->map_free(map);
        return err;
}
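
/* Note (not in the original source): the value returned above is the new
 * file descriptor, and it holds the map's only reference.  close()-ing it
 * reaches bpf_map_release() -> bpf_map_put(), which frees the map.
 */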
/* If an error is returned, the fd is released.
 * On success the caller should complete fd access with a matching fdput()
 */
struct bpf_map *bpf_map_get(struct fd f)
{
        struct bpf_map *map;

        if (!f.file)
                return ERR_PTR(-EBADF);

        if (f.file->f_op != &bpf_map_fops) {
                fdput(f);
                return ERR_PTR(-EINVAL);
        }

        map = f.file->private_data;

        return map;
}

/* helper to convert user pointers passed inside __aligned_u64 fields */
static void __user *u64_to_ptr(__u64 val)
{
        return (void __user *) (unsigned long) val;
}

/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_LOOKUP_ELEM_LAST_FIELD value

static int map_lookup_elem(union bpf_attr *attr)
{
        void __user *ukey = u64_to_ptr(attr->key);
        void __user *uvalue = u64_to_ptr(attr->value);
        int ufd = attr->map_fd;
        struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key, *value;
        int err;

        if (CHECK_ATTR(BPF_MAP_LOOKUP_ELEM))
                return -EINVAL;

        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);

        err = -ENOMEM;
        key = kmalloc(map->key_size, GFP_USER);
        if (!key)
                goto err_put;

        err = -EFAULT;
        if (copy_from_user(key, ukey, map->key_size) != 0)
                goto free_key;

        err = -ESRCH;
        rcu_read_lock();
        value = map->ops->map_lookup_elem(map, key);
        if (!value)
                goto err_unlock;

        err = -EFAULT;
        if (copy_to_user(uvalue, value, map->value_size) != 0)
                goto err_unlock;

        err = 0;

err_unlock:
        rcu_read_unlock();
free_key:
        kfree(key);
err_put:
        fdput(f);
        return err;
}
#define BPF_MAP_UPDATE_ELEM_LAST_FIELD value

static int map_update_elem(union bpf_attr *attr)
{
        void __user *ukey = u64_to_ptr(attr->key);
        void __user *uvalue = u64_to_ptr(attr->value);
        int ufd = attr->map_fd;
        struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key, *value;
        int err;

        if (CHECK_ATTR(BPF_MAP_UPDATE_ELEM))
                return -EINVAL;

        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);

        err = -ENOMEM;
        key = kmalloc(map->key_size, GFP_USER);
        if (!key)
                goto err_put;

        err = -EFAULT;
        if (copy_from_user(key, ukey, map->key_size) != 0)
                goto free_key;

        err = -ENOMEM;
        value = kmalloc(map->value_size, GFP_USER);
        if (!value)
                goto free_key;

        err = -EFAULT;
        if (copy_from_user(value, uvalue, map->value_size) != 0)
                goto free_value;

        /* eBPF programs that use maps run under rcu_read_lock(), and all
         * map accessors rely on that fact, so do the same here
         */
        rcu_read_lock();
        err = map->ops->map_update_elem(map, key, value);
        rcu_read_unlock();

free_value:
        kfree(value);
free_key:
        kfree(key);
err_put:
        fdput(f);
        return err;
}
#define BPF_MAP_DELETE_ELEM_LAST_FIELD key

static int map_delete_elem(union bpf_attr *attr)
{
        void __user *ukey = u64_to_ptr(attr->key);
        int ufd = attr->map_fd;
        struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key;
        int err;

        if (CHECK_ATTR(BPF_MAP_DELETE_ELEM))
                return -EINVAL;

        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);

        err = -ENOMEM;
        key = kmalloc(map->key_size, GFP_USER);
        if (!key)
                goto err_put;

        err = -EFAULT;
        if (copy_from_user(key, ukey, map->key_size) != 0)
                goto free_key;

        rcu_read_lock();
        err = map->ops->map_delete_elem(map, key);
        rcu_read_unlock();

free_key:
        kfree(key);
err_put:
        fdput(f);
        return err;
}
/* last field in 'union bpf_attr' used by this command */
#define BPF_MAP_GET_NEXT_KEY_LAST_FIELD next_key

static int map_get_next_key(union bpf_attr *attr)
{
        void __user *ukey = u64_to_ptr(attr->key);
        void __user *unext_key = u64_to_ptr(attr->next_key);
        int ufd = attr->map_fd;
        struct fd f = fdget(ufd);
        struct bpf_map *map;
        void *key, *next_key;
        int err;

        if (CHECK_ATTR(BPF_MAP_GET_NEXT_KEY))
                return -EINVAL;

        map = bpf_map_get(f);
        if (IS_ERR(map))
                return PTR_ERR(map);

        err = -ENOMEM;
        key = kmalloc(map->key_size, GFP_USER);
        if (!key)
                goto err_put;

        err = -EFAULT;
        if (copy_from_user(key, ukey, map->key_size) != 0)
                goto free_key;

        err = -ENOMEM;
        next_key = kmalloc(map->key_size, GFP_USER);
        if (!next_key)
                goto free_key;

        rcu_read_lock();
        err = map->ops->map_get_next_key(map, key, next_key);
        rcu_read_unlock();
        if (err)
                goto free_next_key;

        err = -EFAULT;
        if (copy_to_user(unext_key, next_key, map->key_size) != 0)
                goto free_next_key;

        err = 0;

free_next_key:
        kfree(next_key);
free_key:
        kfree(key);
err_put:
        fdput(f);
        return err;
}
static LIST_HEAD(bpf_prog_types);

static int find_prog_type(enum bpf_prog_type type, struct bpf_prog *prog)
{
        struct bpf_prog_type_list *tl;

        list_for_each_entry(tl, &bpf_prog_types, list_node) {
                if (tl->type == type) {
                        prog->aux->ops = tl->ops;
                        prog->aux->prog_type = type;
                        return 0;
                }
        }
        return -EINVAL;
}

void bpf_register_prog_type(struct bpf_prog_type_list *tl)
{
        list_add(&tl->list_node, &bpf_prog_types);
}

/* fixup insn->imm field of bpf_call instructions:
 * if (insn->imm == BPF_FUNC_map_lookup_elem)
 *      insn->imm = bpf_map_lookup_elem - __bpf_call_base;
 * else if (insn->imm == BPF_FUNC_map_update_elem)
 *      insn->imm = bpf_map_update_elem - __bpf_call_base;
 * else ...
 *
 * this function is called after the eBPF program has passed verification
 */
static void fixup_bpf_calls(struct bpf_prog *prog)
{
        const struct bpf_func_proto *fn;
        int i;

        for (i = 0; i < prog->len; i++) {
                struct bpf_insn *insn = &prog->insnsi[i];

                if (insn->code == (BPF_JMP | BPF_CALL)) {
                        /* we reach here when the program has bpf_call
                         * instructions and it passed bpf_check(), which means
                         * ops->get_func_proto must have been supplied; check it
                         */
                        BUG_ON(!prog->aux->ops->get_func_proto);

                        fn = prog->aux->ops->get_func_proto(insn->imm);
                        /* all functions that have a prototype and that the
                         * verifier allowed programs to call must be real
                         * in-kernel functions
                         */
                        BUG_ON(!fn->func);

                        insn->imm = fn->func - __bpf_call_base;
                }
        }
}
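
/* Note (not in the original source): the relocation above is undone at run
 * time; the interpreter and the JITs are expected to recover the helper's
 * address as __bpf_call_base + insn->imm when executing a BPF_CALL insn.
 */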
/* drop refcnt on maps used by eBPF program and free auxiliary data */
static void free_used_maps(struct bpf_prog_aux *aux)
{
        int i;

        for (i = 0; i < aux->used_map_cnt; i++)
                bpf_map_put(aux->used_maps[i]);

        kfree(aux->used_maps);
}

void bpf_prog_put(struct bpf_prog *prog)
{
        if (atomic_dec_and_test(&prog->aux->refcnt)) {
                free_used_maps(prog->aux);
                bpf_prog_free(prog);
        }
}

static int bpf_prog_release(struct inode *inode, struct file *filp)
{
        struct bpf_prog *prog = filp->private_data;

        bpf_prog_put(prog);
        return 0;
}

static const struct file_operations bpf_prog_fops = {
        .release = bpf_prog_release,
};

static struct bpf_prog *get_prog(struct fd f)
{
        struct bpf_prog *prog;

        if (!f.file)
                return ERR_PTR(-EBADF);

        if (f.file->f_op != &bpf_prog_fops) {
                fdput(f);
                return ERR_PTR(-EINVAL);
        }

        prog = f.file->private_data;

        return prog;
}

/* called by sockets/tracing/seccomp before attaching a program to an event
 * pairs with bpf_prog_put()
 */
struct bpf_prog *bpf_prog_get(u32 ufd)
{
        struct fd f = fdget(ufd);
        struct bpf_prog *prog;

        prog = get_prog(f);
        if (IS_ERR(prog))
                return prog;

        atomic_inc(&prog->aux->refcnt);
        fdput(f);
        return prog;
}
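
/* Hedged usage sketch (not part of the original source): a consumer such as
 * the socket-filter or tracing code is expected to pair the two calls above,
 * roughly:
 *
 *      struct bpf_prog *prog = bpf_prog_get(ufd);
 *
 *      if (IS_ERR(prog))
 *              return PTR_ERR(prog);
 *      ... attach 'prog' to the socket or event ...
 *
 * and later, on detach (or if attaching failed), drop the reference with
 * bpf_prog_put(prog).
 */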
/* last field in 'union bpf_attr' used by this command */
#define BPF_PROG_LOAD_LAST_FIELD log_buf

static int bpf_prog_load(union bpf_attr *attr)
{
        enum bpf_prog_type type = attr->prog_type;
        struct bpf_prog *prog;
        int err;
        char license[128];
        bool is_gpl;

        if (CHECK_ATTR(BPF_PROG_LOAD))
                return -EINVAL;

        /* copy eBPF program license from user space */
        if (strncpy_from_user(license, u64_to_ptr(attr->license),
                              sizeof(license) - 1) < 0)
                return -EFAULT;
        license[sizeof(license) - 1] = 0;

        /* eBPF programs must be GPL compatible to use GPL-ed functions */
        is_gpl = license_is_gpl_compatible(license);

        if (attr->insn_cnt >= BPF_MAXINSNS)
                return -EINVAL;

        /* plain bpf_prog allocation */
        prog = bpf_prog_alloc(bpf_prog_size(attr->insn_cnt), GFP_USER);
        if (!prog)
                return -ENOMEM;

        prog->len = attr->insn_cnt;

        err = -EFAULT;
        if (copy_from_user(prog->insns, u64_to_ptr(attr->insns),
                           prog->len * sizeof(struct bpf_insn)) != 0)
                goto free_prog;

        prog->orig_prog = NULL;
        prog->jited = false;

        atomic_set(&prog->aux->refcnt, 1);
        prog->aux->is_gpl_compatible = is_gpl;

        /* find program type: socket_filter vs tracing_filter */
        err = find_prog_type(type, prog);
        if (err < 0)
                goto free_prog;

        /* run eBPF verifier */
        err = bpf_check(prog, attr);
        if (err < 0)
                goto free_used_maps;

        /* fixup BPF_CALL->imm field */
        fixup_bpf_calls(prog);

        /* eBPF program is ready to be JITed */
        bpf_prog_select_runtime(prog);

        err = anon_inode_getfd("bpf-prog", &bpf_prog_fops, prog, O_RDWR | O_CLOEXEC);
        if (err < 0)
                /* failed to allocate fd */
                goto free_used_maps;

        return err;

free_used_maps:
        free_used_maps(prog->aux);
free_prog:
        bpf_prog_free(prog);
        return err;
}
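
/* Note (not in the original source): as with maps, the fd returned above is
 * the loaded program's only handle.  Closing it reaches bpf_prog_release()
 * -> bpf_prog_put(), which also drops the references held on any maps the
 * program uses.
 */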
SYSCALL_DEFINE3(bpf, int, cmd, union bpf_attr __user *, uattr, unsigned int, size)
{
        union bpf_attr attr = {};
        int err;

        /* the syscall is limited to root temporarily. This restriction will be
         * lifted when the security audit is clean. Note that eBPF+tracing must
         * have this restriction, since it may pass kernel data to user space
         */
        if (!capable(CAP_SYS_ADMIN))
                return -EPERM;

        if (!access_ok(VERIFY_READ, uattr, 1))
                return -EFAULT;

        if (size > PAGE_SIZE)   /* silly large */
                return -E2BIG;

        /* If we're handed a bigger struct than we know of,
         * ensure all the unknown bits are 0 - i.e. new
         * user-space does not rely on any kernel feature
         * extensions we don't know about yet.
         */
        if (size > sizeof(attr)) {
                unsigned char __user *addr;
                unsigned char __user *end;
                unsigned char val;

                addr = (void __user *)uattr + sizeof(attr);
                end  = (void __user *)uattr + size;

                for (; addr < end; addr++) {
                        err = get_user(val, addr);
                        if (err)
                                return err;
                        if (val)
                                return -E2BIG;
                }
                size = sizeof(attr);
        }

        /* copy attributes from user space, may be less than sizeof(bpf_attr) */
        if (copy_from_user(&attr, uattr, size) != 0)
                return -EFAULT;

        switch (cmd) {
        case BPF_MAP_CREATE:
                err = map_create(&attr);
                break;
        case BPF_MAP_LOOKUP_ELEM:
                err = map_lookup_elem(&attr);
                break;
        case BPF_MAP_UPDATE_ELEM:
                err = map_update_elem(&attr);
                break;
        case BPF_MAP_DELETE_ELEM:
                err = map_delete_elem(&attr);
                break;
        case BPF_MAP_GET_NEXT_KEY:
                err = map_get_next_key(&attr);
                break;
        case BPF_PROG_LOAD:
                err = bpf_prog_load(&attr);
                break;
        default:
                err = -EINVAL;
                break;
        }

        return err;
}
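
/* Hedged userspace sketch (not part of the original source): the commands
 * above are reached through the multiplexing bpf(2) syscall.  Assuming a map
 * implementation such as BPF_MAP_TYPE_HASH has been registered via
 * bpf_register_map_type(), a minimal round trip from userspace (needs
 * <unistd.h>, <sys/syscall.h> and <linux/bpf.h>) could look like:
 *
 *      union bpf_attr attr = {
 *              .map_type    = BPF_MAP_TYPE_HASH,      // assumed map type
 *              .key_size    = sizeof(int),
 *              .value_size  = sizeof(long),
 *              .max_entries = 256,
 *      };
 *      int map_fd = syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
 *
 *      int key = 1;
 *      long value = 42;
 *      union bpf_attr elem = {
 *              .map_fd = map_fd,
 *              .key    = (__u64)(unsigned long)&key,
 *              .value  = (__u64)(unsigned long)&value,
 *      };
 *      syscall(__NR_bpf, BPF_MAP_UPDATE_ELEM, &elem, sizeof(elem));
 *      syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &elem, sizeof(elem));
 *
 * Error handling is omitted; each call returns a negative value with errno
 * set on failure.  The user pointers are passed through the __aligned_u64
 * fields and converted back in the kernel by u64_to_ptr() above.
 */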