seccomp.c

/*
 * linux/kernel/seccomp.c
 *
 * Copyright 2004-2005 Andrea Arcangeli <andrea@cpushare.com>
 *
 * Copyright (C) 2012 Google, Inc.
 * Will Drewry <wad@chromium.org>
 *
 * This defines a simple but solid secure-computing facility.
 *
 * Mode 1 uses a fixed list of allowed system calls.
 * Mode 2 allows user-defined system call filters in the form
 * of Berkeley Packet Filters/Linux Socket Filters.
 */

#include <linux/refcount.h>
#include <linux/audit.h>
#include <linux/compat.h>
#include <linux/coredump.h>
#include <linux/kmemleak.h>
#include <linux/sched.h>
#include <linux/sched/task_stack.h>
#include <linux/seccomp.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/sysctl.h>

#ifdef CONFIG_HAVE_ARCH_SECCOMP_FILTER
#include <asm/syscall.h>
#endif

#ifdef CONFIG_SECCOMP_FILTER
#include <linux/filter.h>
#include <linux/pid.h>
#include <linux/ptrace.h>
#include <linux/security.h>
#include <linux/tracehook.h>
#include <linux/uaccess.h>

/**
 * struct seccomp_filter - container for seccomp BPF programs
 *
 * @usage: reference count to manage the object lifetime.
 *         get/put helpers should be used when accessing an instance
 *         outside of a lifetime-guarded section.  In general, this
 *         is only needed for handling filters shared across tasks.
 * @log: true if all actions except for SECCOMP_RET_ALLOW should be logged
 * @prev: points to a previously installed, or inherited, filter
 * @prog: the BPF program to evaluate
 *
 * seccomp_filter objects are organized in a tree linked via the @prev
 * pointer.  For any task, it appears to be a singly-linked list starting
 * with current->seccomp.filter, the most recently attached or inherited filter.
 * However, multiple filters may share a @prev node, by way of fork(), which
 * results in a unidirectional tree existing in memory.  This is similar to
 * how namespaces work.
 *
 * seccomp_filter objects should never be modified after being attached
 * to a task_struct (other than @usage).
 */
struct seccomp_filter {
        refcount_t usage;
        bool log;
        struct seccomp_filter *prev;
        struct bpf_prog *prog;
};

/* Limit any path through the tree to 256KB worth of instructions. */
#define MAX_INSNS_PER_PATH ((1 << 18) / sizeof(struct sock_filter))
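
/*
 * Editorial note (illustrative): with the classic 8-byte struct
 * sock_filter, this budget works out to (1 << 18) / 8 = 32768
 * instructions along any filter chain.
 */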

/*
 * Endianness is explicitly ignored and left for BPF program authors to manage
 * as per the specific architecture.
 */
static void populate_seccomp_data(struct seccomp_data *sd)
{
        struct task_struct *task = current;
        struct pt_regs *regs = task_pt_regs(task);
        unsigned long args[6];

        sd->nr = syscall_get_nr(task, regs);
        sd->arch = syscall_get_arch();
        syscall_get_arguments(task, regs, 0, 6, args);
        sd->args[0] = args[0];
        sd->args[1] = args[1];
        sd->args[2] = args[2];
        sd->args[3] = args[3];
        sd->args[4] = args[4];
        sd->args[5] = args[5];
        sd->instruction_pointer = KSTK_EIP(task);
}

/**
 * seccomp_check_filter - verify seccomp filter code
 * @filter: filter to verify
 * @flen: length of filter
 *
 * Takes a previously checked filter (by bpf_check_classic) and
 * redirects all filter code that loads struct sk_buff data
 * and related data through seccomp_bpf_load.  It also
 * enforces length and alignment checking of those loads.
 *
 * Returns 0 if the rule set is legal or -EINVAL if not.
 */
static int seccomp_check_filter(struct sock_filter *filter, unsigned int flen)
{
        int pc;

        for (pc = 0; pc < flen; pc++) {
                struct sock_filter *ftest = &filter[pc];
                u16 code = ftest->code;
                u32 k = ftest->k;

                switch (code) {
                case BPF_LD | BPF_W | BPF_ABS:
                        ftest->code = BPF_LDX | BPF_W | BPF_ABS;
                        /* 32-bit aligned and not out of bounds. */
                        if (k >= sizeof(struct seccomp_data) || k & 3)
                                return -EINVAL;
                        continue;
                case BPF_LD | BPF_W | BPF_LEN:
                        ftest->code = BPF_LD | BPF_IMM;
                        ftest->k = sizeof(struct seccomp_data);
                        continue;
                case BPF_LDX | BPF_W | BPF_LEN:
                        ftest->code = BPF_LDX | BPF_IMM;
                        ftest->k = sizeof(struct seccomp_data);
                        continue;
                /* Explicitly include allowed calls. */
                case BPF_RET | BPF_K:
                case BPF_RET | BPF_A:
                case BPF_ALU | BPF_ADD | BPF_K:
                case BPF_ALU | BPF_ADD | BPF_X:
                case BPF_ALU | BPF_SUB | BPF_K:
                case BPF_ALU | BPF_SUB | BPF_X:
                case BPF_ALU | BPF_MUL | BPF_K:
                case BPF_ALU | BPF_MUL | BPF_X:
                case BPF_ALU | BPF_DIV | BPF_K:
                case BPF_ALU | BPF_DIV | BPF_X:
                case BPF_ALU | BPF_AND | BPF_K:
                case BPF_ALU | BPF_AND | BPF_X:
                case BPF_ALU | BPF_OR | BPF_K:
                case BPF_ALU | BPF_OR | BPF_X:
                case BPF_ALU | BPF_XOR | BPF_K:
                case BPF_ALU | BPF_XOR | BPF_X:
                case BPF_ALU | BPF_LSH | BPF_K:
                case BPF_ALU | BPF_LSH | BPF_X:
                case BPF_ALU | BPF_RSH | BPF_K:
                case BPF_ALU | BPF_RSH | BPF_X:
                case BPF_ALU | BPF_NEG:
                case BPF_LD | BPF_IMM:
                case BPF_LDX | BPF_IMM:
                case BPF_MISC | BPF_TAX:
                case BPF_MISC | BPF_TXA:
                case BPF_LD | BPF_MEM:
                case BPF_LDX | BPF_MEM:
                case BPF_ST:
                case BPF_STX:
                case BPF_JMP | BPF_JA:
                case BPF_JMP | BPF_JEQ | BPF_K:
                case BPF_JMP | BPF_JEQ | BPF_X:
                case BPF_JMP | BPF_JGE | BPF_K:
                case BPF_JMP | BPF_JGE | BPF_X:
                case BPF_JMP | BPF_JGT | BPF_K:
                case BPF_JMP | BPF_JGT | BPF_X:
                case BPF_JMP | BPF_JSET | BPF_K:
                case BPF_JMP | BPF_JSET | BPF_X:
                        continue;
                default:
                        return -EINVAL;
                }
        }

        return 0;
}
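
/*
 * Illustrative example (editorial addition, not part of the original
 * source): a minimal classic-BPF program that passes the checks above.
 * It loads the syscall number, a 32-bit aligned field at offset 0 of
 * struct seccomp_data, and returns SECCOMP_RET_ALLOW.  A hypothetical
 * userspace sketch:
 *
 *        struct sock_filter insns[] = {
 *                BPF_STMT(BPF_LD | BPF_W | BPF_ABS,
 *                         offsetof(struct seccomp_data, nr)),
 *                BPF_STMT(BPF_RET | BPF_K, SECCOMP_RET_ALLOW),
 *        };
 *        struct sock_fprog fprog = {
 *                .len = sizeof(insns) / sizeof(insns[0]),
 *                .filter = insns,
 *        };
 *
 * The BPF_LD | BPF_W | BPF_ABS load satisfies both the bounds check
 * (k < sizeof(struct seccomp_data)) and the alignment check (k & 3 == 0).
 */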

/**
 * seccomp_run_filters - evaluates all seccomp filters against @sd
 * @sd: optional seccomp data to be passed to filters
 * @match: stores struct seccomp_filter that resulted in the return value,
 *         unless filter returned SECCOMP_RET_ALLOW, in which case it will
 *         be unchanged.
 *
 * Returns valid seccomp BPF response codes.
 */
#define ACTION_ONLY(ret) ((s32)((ret) & (SECCOMP_RET_ACTION_FULL)))
static u32 seccomp_run_filters(const struct seccomp_data *sd,
                               struct seccomp_filter **match)
{
        struct seccomp_data sd_local;
        u32 ret = SECCOMP_RET_ALLOW;
        /* Make sure cross-thread synced filter points somewhere sane. */
        struct seccomp_filter *f =
                        lockless_dereference(current->seccomp.filter);

        /* Ensure unexpected behavior doesn't result in failing open. */
        if (unlikely(WARN_ON(f == NULL)))
                return SECCOMP_RET_KILL_PROCESS;

        if (!sd) {
                populate_seccomp_data(&sd_local);
                sd = &sd_local;
        }

        /*
         * All filters in the list are evaluated and the lowest BPF return
         * value always takes priority (ignoring the DATA).
         */
        for (; f; f = f->prev) {
                u32 cur_ret = BPF_PROG_RUN(f->prog, sd);

                if (ACTION_ONLY(cur_ret) < ACTION_ONLY(ret)) {
                        ret = cur_ret;
                        *match = f;
                }
        }
        return ret;
}
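
/*
 * Editorial note (illustrative): the "lowest action wins" rule above is
 * evaluated on the action bits cast to s32, so with the uapi values
 * SECCOMP_RET_KILL_PROCESS (0x80000000) sorts as INT_MIN and always takes
 * priority, while SECCOMP_RET_ALLOW (0x7fff0000) is the largest and only
 * wins when every filter in the chain allows.  For example, stacking a
 * filter returning SECCOMP_RET_ERRNO (0x00050000) with one returning
 * SECCOMP_RET_ALLOW yields RET_ERRNO for that syscall.
 */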

#endif /* CONFIG_SECCOMP_FILTER */

static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode)
{
        assert_spin_locked(&current->sighand->siglock);

        if (current->seccomp.mode && current->seccomp.mode != seccomp_mode)
                return false;

        return true;
}

static inline void seccomp_assign_mode(struct task_struct *task,
                                       unsigned long seccomp_mode)
{
        assert_spin_locked(&task->sighand->siglock);

        task->seccomp.mode = seccomp_mode;
        /*
         * Make sure TIF_SECCOMP cannot be set before the mode (and
         * filter) is set.
         */
        smp_mb__before_atomic();
        set_tsk_thread_flag(task, TIF_SECCOMP);
}

#ifdef CONFIG_SECCOMP_FILTER
/* Returns 1 if the parent is an ancestor of the child. */
static int is_ancestor(struct seccomp_filter *parent,
                       struct seccomp_filter *child)
{
        /* NULL is the root ancestor. */
        if (parent == NULL)
                return 1;
        for (; child; child = child->prev)
                if (child == parent)
                        return 1;
        return 0;
}

/**
 * seccomp_can_sync_threads: checks if all threads can be synchronized
 *
 * Expects sighand and cred_guard_mutex locks to be held.
 *
 * Returns 0 on success, -ve on error, or the pid of a thread that was
 * either not in the correct seccomp mode or did not have an ancestral
 * seccomp filter.
 */
static inline pid_t seccomp_can_sync_threads(void)
{
        struct task_struct *thread, *caller;

        BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
        assert_spin_locked(&current->sighand->siglock);

        /* Validate all threads being eligible for synchronization. */
        caller = current;
        for_each_thread(caller, thread) {
                pid_t failed;

                /* Skip current, since it is initiating the sync. */
                if (thread == caller)
                        continue;

                if (thread->seccomp.mode == SECCOMP_MODE_DISABLED ||
                    (thread->seccomp.mode == SECCOMP_MODE_FILTER &&
                     is_ancestor(thread->seccomp.filter,
                                 caller->seccomp.filter)))
                        continue;

                /* Return the first thread that cannot be synchronized. */
                failed = task_pid_vnr(thread);
                /* If the pid cannot be resolved, then return -ESRCH. */
                if (unlikely(WARN_ON(failed == 0)))
                        failed = -ESRCH;
                return failed;
        }

        return 0;
}

/**
 * seccomp_sync_threads: sets all threads to use current's filter
 *
 * Expects sighand and cred_guard_mutex locks to be held, and for
 * seccomp_can_sync_threads() to have returned success already
 * without dropping the locks.
 */
static inline void seccomp_sync_threads(void)
{
        struct task_struct *thread, *caller;

        BUG_ON(!mutex_is_locked(&current->signal->cred_guard_mutex));
        assert_spin_locked(&current->sighand->siglock);

        /* Synchronize all threads. */
        caller = current;
        for_each_thread(caller, thread) {
                /* Skip current, since it needs no changes. */
                if (thread == caller)
                        continue;

                /* Get a task reference for the new leaf node. */
                get_seccomp_filter(caller);
                /*
                 * Drop the task reference to the shared ancestor since
                 * current's path will hold a reference.  (This also
                 * allows a put before the assignment.)
                 */
                put_seccomp_filter(thread);
                smp_store_release(&thread->seccomp.filter,
                                  caller->seccomp.filter);

                /*
                 * Don't let an unprivileged task work around
                 * the no_new_privs restriction by creating
                 * a thread that sets it up, enters seccomp,
                 * then dies.
                 */
                if (task_no_new_privs(caller))
                        task_set_no_new_privs(thread);

                /*
                 * Opt the other thread into seccomp if needed.
                 * As threads are considered to be trust-realm
                 * equivalent (see ptrace_may_access), it is safe to
                 * allow one thread to transition the other.
                 */
                if (thread->seccomp.mode == SECCOMP_MODE_DISABLED)
                        seccomp_assign_mode(thread, SECCOMP_MODE_FILTER);
        }
}

/**
 * seccomp_prepare_filter: Prepares a seccomp filter for use.
 * @fprog: BPF program to install
 *
 * Returns filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *seccomp_prepare_filter(struct sock_fprog *fprog)
{
        struct seccomp_filter *sfilter;
        int ret;
        const bool save_orig = IS_ENABLED(CONFIG_CHECKPOINT_RESTORE);

        if (fprog->len == 0 || fprog->len > BPF_MAXINSNS)
                return ERR_PTR(-EINVAL);

        BUG_ON(INT_MAX / fprog->len < sizeof(struct sock_filter));

        /*
         * Installing a seccomp filter requires that the task has
         * CAP_SYS_ADMIN in its namespace or be running with no_new_privs.
         * This avoids scenarios where unprivileged tasks can affect the
         * behavior of privileged children.
         */
        if (!task_no_new_privs(current) &&
            security_capable_noaudit(current_cred(), current_user_ns(),
                                     CAP_SYS_ADMIN) != 0)
                return ERR_PTR(-EACCES);

        /* Allocate a new seccomp_filter. */
        sfilter = kzalloc(sizeof(*sfilter), GFP_KERNEL | __GFP_NOWARN);
        if (!sfilter)
                return ERR_PTR(-ENOMEM);

        ret = bpf_prog_create_from_user(&sfilter->prog, fprog,
                                        seccomp_check_filter, save_orig);
        if (ret < 0) {
                kfree(sfilter);
                return ERR_PTR(ret);
        }

        refcount_set(&sfilter->usage, 1);

        return sfilter;
}

/**
 * seccomp_prepare_user_filter - prepares a user-supplied sock_fprog
 * @user_filter: pointer to the user data containing a sock_fprog.
 *
 * Returns the prepared filter on success or an ERR_PTR on failure.
 */
static struct seccomp_filter *
seccomp_prepare_user_filter(const char __user *user_filter)
{
        struct sock_fprog fprog;
        struct seccomp_filter *filter = ERR_PTR(-EFAULT);

#ifdef CONFIG_COMPAT
        if (in_compat_syscall()) {
                struct compat_sock_fprog fprog32;

                if (copy_from_user(&fprog32, user_filter, sizeof(fprog32)))
                        goto out;
                fprog.len = fprog32.len;
                fprog.filter = compat_ptr(fprog32.filter);
        } else /* falls through to the if below. */
#endif
        if (copy_from_user(&fprog, user_filter, sizeof(fprog)))
                goto out;
        filter = seccomp_prepare_filter(&fprog);
out:
        return filter;
}

/**
 * seccomp_attach_filter: validate and attach filter
 * @flags: flags to change filter behavior
 * @filter: seccomp filter to add to the current process
 *
 * Caller must be holding current->sighand->siglock lock.
 *
 * Returns 0 on success, -ve on error.
 */
static long seccomp_attach_filter(unsigned int flags,
                                  struct seccomp_filter *filter)
{
        unsigned long total_insns;
        struct seccomp_filter *walker;

        assert_spin_locked(&current->sighand->siglock);

        /* Validate resulting filter length. */
        total_insns = filter->prog->len;
        for (walker = current->seccomp.filter; walker; walker = walker->prev)
                total_insns += walker->prog->len + 4; /* 4 instr penalty */
        if (total_insns > MAX_INSNS_PER_PATH)
                return -ENOMEM;

        /* If thread sync has been requested, check that it is possible. */
        if (flags & SECCOMP_FILTER_FLAG_TSYNC) {
                int ret;

                ret = seccomp_can_sync_threads();
                if (ret)
                        return ret;
        }

        /* Set log flag, if present. */
        if (flags & SECCOMP_FILTER_FLAG_LOG)
                filter->log = true;

        /*
         * If there is an existing filter, make it the prev and don't drop its
         * task reference.
         */
        filter->prev = current->seccomp.filter;
        current->seccomp.filter = filter;

        /* Now that the new filter is in place, synchronize to all threads. */
        if (flags & SECCOMP_FILTER_FLAG_TSYNC)
                seccomp_sync_threads();

        return 0;
}

void __get_seccomp_filter(struct seccomp_filter *filter)
{
        /* Reference count is bounded by the number of total processes. */
        refcount_inc(&filter->usage);
}

/* get_seccomp_filter - increments the reference count of the filter on @tsk */
void get_seccomp_filter(struct task_struct *tsk)
{
        struct seccomp_filter *orig = tsk->seccomp.filter;

        if (!orig)
                return;
        __get_seccomp_filter(orig);
}

static inline void seccomp_filter_free(struct seccomp_filter *filter)
{
        if (filter) {
                bpf_prog_destroy(filter->prog);
                kfree(filter);
        }
}

static void __put_seccomp_filter(struct seccomp_filter *orig)
{
        /* Clean up single-reference branches iteratively. */
        while (orig && refcount_dec_and_test(&orig->usage)) {
                struct seccomp_filter *freeme = orig;

                orig = orig->prev;
                seccomp_filter_free(freeme);
        }
}

/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */
void put_seccomp_filter(struct task_struct *tsk)
{
        __put_seccomp_filter(tsk->seccomp.filter);
}

static void seccomp_init_siginfo(siginfo_t *info, int syscall, int reason)
{
        memset(info, 0, sizeof(*info));
        info->si_signo = SIGSYS;
        info->si_code = SYS_SECCOMP;
        info->si_call_addr = (void __user *)KSTK_EIP(current);
        info->si_errno = reason;
        info->si_arch = syscall_get_arch();
        info->si_syscall = syscall;
}

/**
 * seccomp_send_sigsys - signals the task to allow in-process syscall emulation
 * @syscall: syscall number to send to userland
 * @reason: filter-supplied reason code to send to userland (via si_errno)
 *
 * Forces a SIGSYS with a code of SYS_SECCOMP and related sigsys info.
 */
static void seccomp_send_sigsys(int syscall, int reason)
{
        struct siginfo info;

        seccomp_init_siginfo(&info, syscall, reason);
        force_sig_info(SIGSYS, &info, current);
}
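
/*
 * Illustrative userspace counterpart (editorial addition, hypothetical
 * sketch): a process that receives the SIGSYS forced above can recover
 * the fields filled in by seccomp_init_siginfo() from its siginfo_t,
 * assuming a handler installed with sigaction() and SA_SIGINFO:
 *
 *        static void sigsys_handler(int sig, siginfo_t *info, void *uctx)
 *        {
 *                if (info->si_code != SYS_SECCOMP)
 *                        return;
 *                // info->si_syscall:   number of the trapped syscall
 *                // info->si_errno:     SECCOMP_RET_DATA bits from the filter
 *                // info->si_call_addr: user instruction pointer at the trap
 *        }
 */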

#endif /* CONFIG_SECCOMP_FILTER */

/* For use with seccomp_actions_logged */
#define SECCOMP_LOG_KILL_PROCESS        (1 << 0)
#define SECCOMP_LOG_KILL_THREAD         (1 << 1)
#define SECCOMP_LOG_TRAP                (1 << 2)
#define SECCOMP_LOG_ERRNO               (1 << 3)
#define SECCOMP_LOG_TRACE               (1 << 4)
#define SECCOMP_LOG_LOG                 (1 << 5)
#define SECCOMP_LOG_ALLOW               (1 << 6)

static u32 seccomp_actions_logged = SECCOMP_LOG_KILL_PROCESS |
                                    SECCOMP_LOG_KILL_THREAD |
                                    SECCOMP_LOG_TRAP |
                                    SECCOMP_LOG_ERRNO |
                                    SECCOMP_LOG_TRACE |
                                    SECCOMP_LOG_LOG;

static inline void seccomp_log(unsigned long syscall, long signr, u32 action,
                               bool requested)
{
        bool log = false;

        switch (action) {
        case SECCOMP_RET_ALLOW:
                break;
        case SECCOMP_RET_TRAP:
                log = requested && seccomp_actions_logged & SECCOMP_LOG_TRAP;
                break;
        case SECCOMP_RET_ERRNO:
                log = requested && seccomp_actions_logged & SECCOMP_LOG_ERRNO;
                break;
        case SECCOMP_RET_TRACE:
                log = requested && seccomp_actions_logged & SECCOMP_LOG_TRACE;
                break;
        case SECCOMP_RET_LOG:
                log = seccomp_actions_logged & SECCOMP_LOG_LOG;
                break;
        case SECCOMP_RET_KILL_THREAD:
                log = seccomp_actions_logged & SECCOMP_LOG_KILL_THREAD;
                break;
        case SECCOMP_RET_KILL_PROCESS:
        default:
                log = seccomp_actions_logged & SECCOMP_LOG_KILL_PROCESS;
        }

        /*
         * Force an audit message to be emitted when the action is RET_KILL_*,
         * RET_LOG, or the FILTER_FLAG_LOG bit was set and the action is
         * allowed to be logged by the admin.
         */
        if (log)
                return __audit_seccomp(syscall, signr, action);

        /*
         * Let the audit subsystem decide if the action should be audited based
         * on whether the current task itself is being audited.
         */
        return audit_seccomp(syscall, signr, action);
}

/*
 * Secure computing mode 1 allows only read/write/exit/sigreturn.
 * To be fully secure this must be combined with rlimit
 * to limit the stack allocations too.
 */
static const int mode1_syscalls[] = {
        __NR_seccomp_read, __NR_seccomp_write, __NR_seccomp_exit, __NR_seccomp_sigreturn,
        0, /* null terminated */
};

static void __secure_computing_strict(int this_syscall)
{
        const int *syscall_whitelist = mode1_syscalls;

#ifdef CONFIG_COMPAT
        if (in_compat_syscall())
                syscall_whitelist = get_compat_mode1_syscalls();
#endif
        do {
                if (*syscall_whitelist == this_syscall)
                        return;
        } while (*++syscall_whitelist);

#ifdef SECCOMP_DEBUG
        dump_stack();
#endif
        seccomp_log(this_syscall, SIGKILL, SECCOMP_RET_KILL_THREAD, true);
        do_exit(SIGKILL);
}

#ifndef CONFIG_HAVE_ARCH_SECCOMP_FILTER
void secure_computing_strict(int this_syscall)
{
        int mode = current->seccomp.mode;

        if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
            unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
                return;

        if (mode == SECCOMP_MODE_DISABLED)
                return;
        else if (mode == SECCOMP_MODE_STRICT)
                __secure_computing_strict(this_syscall);
        else
                BUG();
}
#else

#ifdef CONFIG_SECCOMP_FILTER
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                            const bool recheck_after_trace)
{
        u32 filter_ret, action;
        struct seccomp_filter *match = NULL;
        int data;

        /*
         * Make sure that any changes to mode from another thread have
         * been seen after TIF_SECCOMP was seen.
         */
        rmb();

        filter_ret = seccomp_run_filters(sd, &match);
        data = filter_ret & SECCOMP_RET_DATA;
        action = filter_ret & SECCOMP_RET_ACTION_FULL;

        switch (action) {
        case SECCOMP_RET_ERRNO:
                /* Set low-order bits as an errno, capped at MAX_ERRNO. */
                if (data > MAX_ERRNO)
                        data = MAX_ERRNO;
                syscall_set_return_value(current, task_pt_regs(current),
                                         -data, 0);
                goto skip;

        case SECCOMP_RET_TRAP:
                /* Show the handler the original registers. */
                syscall_rollback(current, task_pt_regs(current));
                /* Let the filter pass back 16 bits of data. */
                seccomp_send_sigsys(this_syscall, data);
                goto skip;

        case SECCOMP_RET_TRACE:
                /* We've been put in this state by the ptracer already. */
                if (recheck_after_trace)
                        return 0;

                /* ENOSYS these calls if there is no tracer attached. */
                if (!ptrace_event_enabled(current, PTRACE_EVENT_SECCOMP)) {
                        syscall_set_return_value(current,
                                                 task_pt_regs(current),
                                                 -ENOSYS, 0);
                        goto skip;
                }

                /* Allow the BPF to provide the event message. */
                ptrace_event(PTRACE_EVENT_SECCOMP, data);
                /*
                 * The delivery of a fatal signal during event
                 * notification may silently skip tracer notification,
                 * which could leave us with a potentially unmodified
                 * syscall that the tracer would have liked to have
                 * changed.  Since the process is about to die, we just
                 * force the syscall to be skipped and let the signal
                 * kill the process and correctly handle any tracer exit
                 * notifications.
                 */
                if (fatal_signal_pending(current))
                        goto skip;
                /* Check if the tracer forced the syscall to be skipped. */
                this_syscall = syscall_get_nr(current, task_pt_regs(current));
                if (this_syscall < 0)
                        goto skip;

                /*
                 * Recheck the syscall, since it may have changed.  This
                 * intentionally uses a NULL struct seccomp_data to force
                 * a reload of all registers.  This does not goto skip since
                 * a skip would have already been reported.
                 */
                if (__seccomp_filter(this_syscall, NULL, true))
                        return -1;

                return 0;

        case SECCOMP_RET_LOG:
                seccomp_log(this_syscall, 0, action, true);
                return 0;

        case SECCOMP_RET_ALLOW:
                /*
                 * Note that the "match" filter will always be NULL for
                 * this action since SECCOMP_RET_ALLOW is the starting
                 * state in seccomp_run_filters().
                 */
                return 0;

        case SECCOMP_RET_KILL_THREAD:
        case SECCOMP_RET_KILL_PROCESS:
        default:
                seccomp_log(this_syscall, SIGSYS, action, true);
                /* Dump core only if this is the last remaining thread. */
                if (action == SECCOMP_RET_KILL_PROCESS ||
                    get_nr_threads(current) == 1) {
                        siginfo_t info;

                        /* Show the original registers in the dump. */
                        syscall_rollback(current, task_pt_regs(current));
                        /* Trigger a manual coredump since do_exit skips it. */
                        seccomp_init_siginfo(&info, this_syscall, data);
                        do_coredump(&info);
                }
                if (action == SECCOMP_RET_KILL_PROCESS)
                        do_group_exit(SIGSYS);
                else
                        do_exit(SIGSYS);
        }

        unreachable();

skip:
        seccomp_log(this_syscall, 0, action, match ? match->log : false);
        return -1;
}
#else
static int __seccomp_filter(int this_syscall, const struct seccomp_data *sd,
                            const bool recheck_after_trace)
{
        BUG();
}
#endif

int __secure_computing(const struct seccomp_data *sd)
{
        int mode = current->seccomp.mode;
        int this_syscall;

        if (IS_ENABLED(CONFIG_CHECKPOINT_RESTORE) &&
            unlikely(current->ptrace & PT_SUSPEND_SECCOMP))
                return 0;

        this_syscall = sd ? sd->nr :
                syscall_get_nr(current, task_pt_regs(current));

        switch (mode) {
        case SECCOMP_MODE_STRICT:
                __secure_computing_strict(this_syscall); /* may call do_exit */
                return 0;
        case SECCOMP_MODE_FILTER:
                return __seccomp_filter(this_syscall, sd, false);
        default:
                BUG();
        }
}
#endif /* CONFIG_HAVE_ARCH_SECCOMP_FILTER */

long prctl_get_seccomp(void)
{
        return current->seccomp.mode;
}

/**
 * seccomp_set_mode_strict: internal function for setting strict seccomp
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_strict(void)
{
        const unsigned long seccomp_mode = SECCOMP_MODE_STRICT;
        long ret = -EINVAL;

        spin_lock_irq(&current->sighand->siglock);

        if (!seccomp_may_assign_mode(seccomp_mode))
                goto out;

#ifdef TIF_NOTSC
        disable_TSC();
#endif
        seccomp_assign_mode(current, seccomp_mode);
        ret = 0;

out:
        spin_unlock_irq(&current->sighand->siglock);

        return ret;
}

#ifdef CONFIG_SECCOMP_FILTER
/**
 * seccomp_set_mode_filter: internal function for setting seccomp filter
 * @flags: flags to change filter behavior
 * @filter: struct sock_fprog containing filter
 *
 * This function may be called repeatedly to install additional filters.
 * Every filter successfully installed will be evaluated (in reverse order)
 * for each system call the task makes.
 *
 * Once current->seccomp.mode is non-zero, it may not be changed.
 *
 * Returns 0 on success or -EINVAL on failure.
 */
static long seccomp_set_mode_filter(unsigned int flags,
                                    const char __user *filter)
{
        const unsigned long seccomp_mode = SECCOMP_MODE_FILTER;
        struct seccomp_filter *prepared = NULL;
        long ret = -EINVAL;

        /* Validate flags. */
        if (flags & ~SECCOMP_FILTER_FLAG_MASK)
                return -EINVAL;

        /* Prepare the new filter before holding any locks. */
        prepared = seccomp_prepare_user_filter(filter);
        if (IS_ERR(prepared))
                return PTR_ERR(prepared);

        /*
         * Make sure we cannot change seccomp or nnp state via TSYNC
         * while another thread is in the middle of calling exec.
         */
        if (flags & SECCOMP_FILTER_FLAG_TSYNC &&
            mutex_lock_killable(&current->signal->cred_guard_mutex))
                goto out_free;

        spin_lock_irq(&current->sighand->siglock);

        if (!seccomp_may_assign_mode(seccomp_mode))
                goto out;

        ret = seccomp_attach_filter(flags, prepared);
        if (ret)
                goto out;
        /* Do not free the successfully attached filter. */
        prepared = NULL;

        seccomp_assign_mode(current, seccomp_mode);
out:
        spin_unlock_irq(&current->sighand->siglock);
        if (flags & SECCOMP_FILTER_FLAG_TSYNC)
                mutex_unlock(&current->signal->cred_guard_mutex);
out_free:
        seccomp_filter_free(prepared);
        return ret;
}
#else
static inline long seccomp_set_mode_filter(unsigned int flags,
                                           const char __user *filter)
{
        return -EINVAL;
}
#endif

static long seccomp_get_action_avail(const char __user *uaction)
{
        u32 action;

        if (copy_from_user(&action, uaction, sizeof(action)))
                return -EFAULT;

        switch (action) {
        case SECCOMP_RET_KILL_PROCESS:
        case SECCOMP_RET_KILL_THREAD:
        case SECCOMP_RET_TRAP:
        case SECCOMP_RET_ERRNO:
        case SECCOMP_RET_TRACE:
        case SECCOMP_RET_LOG:
        case SECCOMP_RET_ALLOW:
                break;
        default:
                return -EOPNOTSUPP;
        }

        return 0;
}

/* Common entry point for both prctl and syscall. */
static long do_seccomp(unsigned int op, unsigned int flags,
                       const char __user *uargs)
{
        switch (op) {
        case SECCOMP_SET_MODE_STRICT:
                if (flags != 0 || uargs != NULL)
                        return -EINVAL;
                return seccomp_set_mode_strict();
        case SECCOMP_SET_MODE_FILTER:
                return seccomp_set_mode_filter(flags, uargs);
        case SECCOMP_GET_ACTION_AVAIL:
                if (flags != 0)
                        return -EINVAL;
                return seccomp_get_action_avail(uargs);
        default:
                return -EINVAL;
        }
}

SYSCALL_DEFINE3(seccomp, unsigned int, op, unsigned int, flags,
                const char __user *, uargs)
{
        return do_seccomp(op, flags, uargs);
}
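
/*
 * Illustrative userspace usage (editorial addition, hypothetical sketch):
 * an unprivileged process typically sets no_new_privs first, then installs
 * a filter through the raw syscall, since libc may not wrap seccomp(2):
 *
 *        prctl(PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0);
 *        syscall(__NR_seccomp, SECCOMP_SET_MODE_FILTER,
 *                SECCOMP_FILTER_FLAG_TSYNC, &fprog);
 *
 * PR_SET_NO_NEW_PRIVS satisfies the capability check in
 * seccomp_prepare_filter() for unprivileged callers, and
 * SECCOMP_FILTER_FLAG_TSYNC asks seccomp_attach_filter() to apply the
 * filter to all threads via seccomp_sync_threads().  &fprog is a struct
 * sock_fprog like the one sketched after seccomp_check_filter() above.
 */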

/**
 * prctl_set_seccomp: configures current->seccomp.mode
 * @seccomp_mode: requested mode to use
 * @filter: optional struct sock_fprog for use with SECCOMP_MODE_FILTER
 *
 * Returns 0 on success or -EINVAL on failure.
 */
long prctl_set_seccomp(unsigned long seccomp_mode, char __user *filter)
{
        unsigned int op;
        char __user *uargs;

        switch (seccomp_mode) {
        case SECCOMP_MODE_STRICT:
                op = SECCOMP_SET_MODE_STRICT;
                /*
                 * Setting strict mode through prctl has always ignored the
                 * filter argument, so make sure it is always NULL here to
                 * pass the internal check in do_seccomp().
                 */
                uargs = NULL;
                break;
        case SECCOMP_MODE_FILTER:
                op = SECCOMP_SET_MODE_FILTER;
                uargs = filter;
                break;
        default:
                return -EINVAL;
        }

        /* prctl interface doesn't have flags, so they are always zero. */
        return do_seccomp(op, 0, uargs);
}

#if defined(CONFIG_SECCOMP_FILTER) && defined(CONFIG_CHECKPOINT_RESTORE)
long seccomp_get_filter(struct task_struct *task, unsigned long filter_off,
                        void __user *data)
{
        struct seccomp_filter *filter;
        struct sock_fprog_kern *fprog;
        long ret;
        unsigned long count = 0;

        if (!capable(CAP_SYS_ADMIN) ||
            current->seccomp.mode != SECCOMP_MODE_DISABLED) {
                return -EACCES;
        }

        spin_lock_irq(&task->sighand->siglock);
        if (task->seccomp.mode != SECCOMP_MODE_FILTER) {
                ret = -EINVAL;
                goto out;
        }

        filter = task->seccomp.filter;
        while (filter) {
                filter = filter->prev;
                count++;
        }

        if (filter_off >= count) {
                ret = -ENOENT;
                goto out;
        }
        count -= filter_off;

        filter = task->seccomp.filter;
        while (filter && count > 1) {
                filter = filter->prev;
                count--;
        }

        if (WARN_ON(count != 1 || !filter)) {
                /* The filter tree shouldn't shrink while we're using it. */
                ret = -ENOENT;
                goto out;
        }

        fprog = filter->prog->orig_prog;
        if (!fprog) {
                /* This must be a new non-cBPF filter, since we save
                 * every cBPF filter's orig_prog above when
                 * CONFIG_CHECKPOINT_RESTORE is enabled.
                 */
                ret = -EMEDIUMTYPE;
                goto out;
        }

        ret = fprog->len;
        if (!data)
                goto out;

        __get_seccomp_filter(filter);
        spin_unlock_irq(&task->sighand->siglock);

        if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog)))
                ret = -EFAULT;

        __put_seccomp_filter(filter);
        return ret;

out:
        spin_unlock_irq(&task->sighand->siglock);
        return ret;
}
#endif

#ifdef CONFIG_SYSCTL

/* Human readable action names for friendly sysctl interaction */
#define SECCOMP_RET_KILL_PROCESS_NAME   "kill_process"
#define SECCOMP_RET_KILL_THREAD_NAME    "kill_thread"
#define SECCOMP_RET_TRAP_NAME           "trap"
#define SECCOMP_RET_ERRNO_NAME          "errno"
#define SECCOMP_RET_TRACE_NAME          "trace"
#define SECCOMP_RET_LOG_NAME            "log"
#define SECCOMP_RET_ALLOW_NAME          "allow"

static const char seccomp_actions_avail[] =
                                SECCOMP_RET_KILL_PROCESS_NAME " "
                                SECCOMP_RET_KILL_THREAD_NAME " "
                                SECCOMP_RET_TRAP_NAME " "
                                SECCOMP_RET_ERRNO_NAME " "
                                SECCOMP_RET_TRACE_NAME " "
                                SECCOMP_RET_LOG_NAME " "
                                SECCOMP_RET_ALLOW_NAME;

struct seccomp_log_name {
        u32             log;
        const char      *name;
};

static const struct seccomp_log_name seccomp_log_names[] = {
        { SECCOMP_LOG_KILL_PROCESS, SECCOMP_RET_KILL_PROCESS_NAME },
        { SECCOMP_LOG_KILL_THREAD, SECCOMP_RET_KILL_THREAD_NAME },
        { SECCOMP_LOG_TRAP, SECCOMP_RET_TRAP_NAME },
        { SECCOMP_LOG_ERRNO, SECCOMP_RET_ERRNO_NAME },
        { SECCOMP_LOG_TRACE, SECCOMP_RET_TRACE_NAME },
        { SECCOMP_LOG_LOG, SECCOMP_RET_LOG_NAME },
        { SECCOMP_LOG_ALLOW, SECCOMP_RET_ALLOW_NAME },
        { }
};

static bool seccomp_names_from_actions_logged(char *names, size_t size,
                                              u32 actions_logged)
{
        const struct seccomp_log_name *cur;
        bool append_space = false;

        for (cur = seccomp_log_names; cur->name && size; cur++) {
                ssize_t ret;

                if (!(actions_logged & cur->log))
                        continue;

                if (append_space) {
                        ret = strscpy(names, " ", size);
                        if (ret < 0)
                                return false;

                        names += ret;
                        size -= ret;
                } else
                        append_space = true;

                ret = strscpy(names, cur->name, size);
                if (ret < 0)
                        return false;

                names += ret;
                size -= ret;
        }

        return true;
}

static bool seccomp_action_logged_from_name(u32 *action_logged,
                                            const char *name)
{
        const struct seccomp_log_name *cur;

        for (cur = seccomp_log_names; cur->name; cur++) {
                if (!strcmp(cur->name, name)) {
                        *action_logged = cur->log;
                        return true;
                }
        }

        return false;
}

static bool seccomp_actions_logged_from_names(u32 *actions_logged, char *names)
{
        char *name;

        *actions_logged = 0;
        while ((name = strsep(&names, " ")) && *name) {
                u32 action_logged = 0;

                if (!seccomp_action_logged_from_name(&action_logged, name))
                        return false;

                *actions_logged |= action_logged;
        }

        return true;
}

static int seccomp_actions_logged_handler(struct ctl_table *ro_table, int write,
                                          void __user *buffer, size_t *lenp,
                                          loff_t *ppos)
{
        char names[sizeof(seccomp_actions_avail)];
        struct ctl_table table;
        int ret;

        if (write && !capable(CAP_SYS_ADMIN))
                return -EPERM;

        memset(names, 0, sizeof(names));

        if (!write) {
                if (!seccomp_names_from_actions_logged(names, sizeof(names),
                                                       seccomp_actions_logged))
                        return -EINVAL;
        }

        table = *ro_table;
        table.data = names;
        table.maxlen = sizeof(names);
        ret = proc_dostring(&table, write, buffer, lenp, ppos);
        if (ret)
                return ret;

        if (write) {
                u32 actions_logged;

                if (!seccomp_actions_logged_from_names(&actions_logged,
                                                       table.data))
                        return -EINVAL;

                if (actions_logged & SECCOMP_LOG_ALLOW)
                        return -EINVAL;

                seccomp_actions_logged = actions_logged;
        }

        return 0;
}
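
/*
 * Illustrative administration note (editorial addition): the handler above
 * backs /proc/sys/kernel/seccomp/actions_logged.  For example, as root:
 *
 *        # cat /proc/sys/kernel/seccomp/actions_logged
 *        kill_process kill_thread trap errno trace log
 *        # echo "kill_process kill_thread" > /proc/sys/kernel/seccomp/actions_logged
 *
 * Writes that name "allow" are rejected with -EINVAL, per the
 * SECCOMP_LOG_ALLOW check above.
 */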

static struct ctl_path seccomp_sysctl_path[] = {
        { .procname = "kernel", },
        { .procname = "seccomp", },
        { }
};

static struct ctl_table seccomp_sysctl_table[] = {
        {
                .procname       = "actions_avail",
                .data           = (void *) &seccomp_actions_avail,
                .maxlen         = sizeof(seccomp_actions_avail),
                .mode           = 0444,
                .proc_handler   = proc_dostring,
        },
        {
                .procname       = "actions_logged",
                .mode           = 0644,
                .proc_handler   = seccomp_actions_logged_handler,
        },
        { }
};

static int __init seccomp_sysctl_init(void)
{
        struct ctl_table_header *hdr;

        hdr = register_sysctl_paths(seccomp_sysctl_path, seccomp_sysctl_table);
        if (!hdr)
                pr_warn("seccomp: sysctl registration failed\n");
        else
                kmemleak_not_leak(hdr);

        return 0;
}

device_initcall(seccomp_sysctl_init)

#endif /* CONFIG_SYSCTL */