From: Po Lu Date: Mon, 1 Jul 2024 10:11:58 +0000 (+0800) Subject: Optimize process execution on Android X-Git-Url: http://git.eshelyaron.com/gitweb/?a=commitdiff_plain;h=b9c624412465296cee800772d4e64ce81b94b59e;p=emacs.git Optimize process execution on Android * exec/configure.ac (REENTRANT): Remove option for reentrancy. (PROGRAM_COUNTER, HAVE_SECCOMP): Define register providing the program counter and enable seccomp if its headers are available. * exec/exec.c (write_load_command): Avoid defining unused variable. (exec_0): Remove code specific to REENTRANT configurations. * exec/exec.h (struct exec_tracee) : New fields for loader instructions and their size. * exec/exec1.c (main): Call exec_init before forking. * exec/mipsel-user.h (ELF_NGREG): Delete definition. (struct mipsel_regs): Reduce number of gregs to 32, but introduce separate fields for special registers. * exec/trace.c (use_seccomp_p): New variable; defile to false if !HAVE_SECCOMP. (remove_tracee): Cease providing for non-reentrant configurations. Release executable data if present. (handle_clone_prepare): Likewise. Resume process with PTRACE_CONT if seccomp-based interception is enabled. (handle_clone, check_signal): Resume processes as above. (handle_exec): Divide into two functions, with only rewriting the system call and generating instructions for the loader remaining in the first, and copying such instructions into the loader's stack removed into a new function, `finish_exec'. (finish_exec): New function. (handle_readlinkat, handle_openat): Abolish non-REENTRANT configurations. (process_system_call): Divide exec system calls into two phases, disambiguated by the value of tracee->waiting_for_syscall. Typo fixes. Accommodate syscall-exit-stops where the signal was initially intercepted by `seccomp_system_call'. (interesting_syscalls): New array. (ARRAYELTS): New macro. (seccomp_system_call, establish_seccomp_filter): New function. (tracing_execve) [HAVE_SECCOMP]: Establish a seccomp filter if this is to be enabled. (after_fork): Provide PTRACE_O_TRACESECCOMP. Resume process with PTRACE_CONT if seccomp-based interception is enabled. (exec_waitpid): Resume process with PTRACE_CONT if seccomp-based interception is enabled. Dispatch stops identifying as PTRACE_EVENT_SECCOMP to `seccomp_system_call'. (exec_init): Establish whether it is possible to enable seccomp. (cherry picked from commit ebf5bcb9f0b6adeb97a3918b8f3845844c9091b0) --- diff --git a/exec/configure.ac b/exec/configure.ac index 5be8a983718..c3e895740be 100644 --- a/exec/configure.ac +++ b/exec/configure.ac @@ -42,11 +42,6 @@ General Public License for more details. You should have received a copy of the GNU General Public License along with GNU Emacs. If not, see . */]) -AC_ARG_WITH([reentrancy], - [AS_HELP_STRING([--with-reentrancy], - [Generate library which can be used within a signal handler.])], - [AC_DEFINE([REENTRANT], [1])]) - AC_USE_SYSTEM_EXTENSIONS AC_PROG_CC AC_PROG_CPP @@ -74,9 +69,9 @@ AC_CHECK_FUNC([process_vm_readv], ]])])]) AC_CHECK_HEADERS([sys/param.h sys/uio.h]) AC_CHECK_MEMBERS([siginfo_t.si_syscall], [], [], - [[ +[[ #include - ]]) +]]) AH_BOTTOM([ #ifdef HAVE_STDBOOL_H @@ -120,6 +115,7 @@ AH_TEMPLATE([SYSCALL_ARG2_REG], [Define to register holding arg2 to system calls AH_TEMPLATE([SYSCALL_ARG3_REG], [Define to register holding arg3 to system calls.]) AH_TEMPLATE([SYSCALL_RET_REG], [Define to register holding value of system calls.]) AH_TEMPLATE([STACK_POINTER], [Define to register holding the stack pointer.]) +AH_TEMPLATE([PROGRAM_COUNTER], [Define to register holding the program counter.]) AH_TEMPLATE([EXEC_SYSCALL], [Define to number of the `exec' system call.]) AH_TEMPLATE([USER_WORD], [Define to word type used by tracees.]) AH_TEMPLATE([USER_SWORD], [Define to signed word type used by tracees.]) @@ -134,7 +130,8 @@ AH_TEMPLATE([READLINK_SYSCALL], [Define to number of the `readlink' system call. AH_TEMPLATE([READLINKAT_SYSCALL], [Define to number of the `readlinkat' system call.]) AH_TEMPLATE([OPEN_SYSCALL], [Define to number of the `open' system call.]) AH_TEMPLATE([OPENAT_SYSCALL], [Define to number of the `openat' system call.]) -AH_TEMPLATE([REENTRANT], [Define to 1 if the library is used within a signal handler.]) +AH_TEMPLATE([HAVE_SECCOMP], [Define to 1 if secure computing filters are available +to accelerate interception of system calls.]) AC_CANONICAL_HOST @@ -250,6 +247,7 @@ AS_CASE([$host], [x86_64-*linux*], AC_DEFINE([SYSCALL_ARG2_REG], [rdx]) AC_DEFINE([SYSCALL_ARG3_REG], [r10]) AC_DEFINE([STACK_POINTER], [rsp]) + AC_DEFINE([PROGRAM_COUNTER], [rip]) AC_DEFINE([EXEC_SYSCALL], [__NR_execve]) AC_DEFINE([USER_WORD], [uintptr_t]) AC_DEFINE([USER_SWORD], [intptr_t]) @@ -283,6 +281,7 @@ AS_CASE([$host], [x86_64-*linux*], AC_DEFINE([SYSCALL_ARG2_REG], [edx]) AC_DEFINE([SYSCALL_ARG3_REG], [esi]) AC_DEFINE([STACK_POINTER], [esp]) + AC_DEFINE([PROGRAM_COUNTER], [eip]) AC_DEFINE([EXEC_SYSCALL], [__NR_execve]) AC_DEFINE([USER_WORD], [uintptr_t]) AC_DEFINE([USER_SWORD], [intptr_t]) @@ -314,6 +313,7 @@ AS_CASE([$host], [x86_64-*linux*], AC_DEFINE([SYSCALL_ARG2_REG], [[regs[2]]]) AC_DEFINE([SYSCALL_ARG3_REG], [[regs[3]]]) AC_DEFINE([STACK_POINTER], [sp]) + AC_DEFINE([PROGRAM_COUNTER], [pc]) AC_DEFINE([EXEC_SYSCALL], [__NR_execve]) AC_DEFINE([USER_WORD], [uintptr_t]) AC_DEFINE([USER_SWORD], [intptr_t]) @@ -346,6 +346,7 @@ AS_CASE([$host], [x86_64-*linux*], AC_DEFINE([SYSCALL_ARG2_REG], [[uregs[2]]]) AC_DEFINE([SYSCALL_ARG3_REG], [[uregs[3]]]) AC_DEFINE([STACK_POINTER], [[uregs[13]]]) + AC_DEFINE([PROGRAM_COUNTER], [[uregs[15]]]) AC_DEFINE([EXEC_SYSCALL], [__NR_execve]) AC_DEFINE([USER_WORD], [uintptr_t]) AC_DEFINE([USER_SWORD], [intptr_t]) @@ -371,6 +372,7 @@ AS_CASE([$host], [x86_64-*linux*], AC_DEFINE([SYSCALL_ARG2_REG], [[uregs[2]]]) AC_DEFINE([SYSCALL_ARG3_REG], [[uregs[3]]]) AC_DEFINE([STACK_POINTER], [[uregs[13]]]) + AC_DEFINE([STACK_POINTER], [[uregs[15]]]) AC_DEFINE([EXEC_SYSCALL], [__NR_execve]) AC_DEFINE([USER_WORD], [uintptr_t]) AC_DEFINE([USER_SWORD], [intptr_t]) @@ -402,6 +404,7 @@ AS_CASE([$host], [x86_64-*linux*], AC_DEFINE([SYSCALL_ARG2_REG], [[gregs[4]]]) # a2 AC_DEFINE([SYSCALL_ARG3_REG], [[gregs[5]]]) # a3 AC_DEFINE([STACK_POINTER], [[gregs[29]]]) # sp + AC_DEFINE([PROGRAM_COUNTER], [[cp0_epc]]) # pc AC_DEFINE([EXEC_SYSCALL], [__NR_execve]) AC_DEFINE([USER_WORD], [uintptr_t]) AC_DEFINE([USER_SWORD], [intptr_t]) @@ -432,6 +435,7 @@ AS_CASE([$host], [x86_64-*linux*], AC_DEFINE([SYSCALL_ARG2_REG], [[gregs[4]]]) # a2 AC_DEFINE([SYSCALL_ARG3_REG], [[gregs[5]]]) # a3 AC_DEFINE([STACK_POINTER], [[gregs[29]]]) # sp + AC_DEFINE([PROGRAM_COUNTER], [[cp0_epc]]) # pc AC_DEFINE([EXEC_SYSCALL], [__NR_execve]) AC_DEFINE([USER_WORD], [uintptr_t]) AC_DEFINE([USER_SWORD], [intptr_t]) @@ -480,6 +484,12 @@ AC_ARG_VAR([LOADERFLAGS], [Flags used to link the loader.]) AC_ARG_VAR([ARFLAGS], [Flags for the archiver.]) AC_ARG_VAR([ASFLAGS], [Flags for the assembler.]) +# Search for seccomp headers and declarations. +AC_CHECK_HEADERS([linux/seccomp.h linux/filter.h], + [AC_CHECK_DECLS([SECCOMP_SET_MODE_FILTER, SECCOMP_FILTER_FLAG_TSYNC, SECCOMP_RET_TRACE], + [AC_DEFINE([HAVE_SECCOMP], [1])], [], + [[#include ]])]) + # Make the assembler optimize for code size. Don't do this on MIPS, # as the assembler code manages branch delays manually. diff --git a/exec/exec.c b/exec/exec.c index cbe22d4f18c..fad345c532c 100644 --- a/exec/exec.c +++ b/exec/exec.c @@ -292,7 +292,9 @@ write_load_command (program_header *header, bool use_alternate, struct exec_map_command command1; USER_WORD start, end; bool need_command1; +#ifndef PAGE_MASK static long pagesize; +#endif /* !PAGE_MASK */ /* First, write the commands necessary to map the specified segment itself. @@ -306,14 +308,14 @@ write_load_command (program_header *header, bool use_alternate, #ifdef HAVE_GETPAGESIZE if (!pagesize) pagesize = getpagesize (); -#else /* HAVE_GETPAGESIZE */ +#else /* !HAVE_GETPAGESIZE */ if (!pagesize) pagesize = sysconf (_SC_PAGESIZE); -#endif /* HAVE_GETPAGESIZE */ +#endif /* !HAVE_GETPAGESIZE */ #define PAGE_MASK (~(pagesize - 1)) #define PAGE_SIZE (pagesize) -#endif /* PAGE_MASK */ +#endif /* !PAGE_MASK */ start = header->p_vaddr & PAGE_MASK; end = ((header->p_vaddr + header->p_filesz @@ -895,10 +897,6 @@ format_pid (char *in, unsigned int pid) with #!; in that case, find the program to open and use that instead. - If REENTRANT is not defined, NAME is actually a buffer of size - PATH_MAX + 80. In that case, copy over the file name actually - opened. - Next, read the executable header, and add the necessary memory mappings for each file. Finally, return the action data and its size in *SIZE. @@ -976,11 +974,6 @@ exec_0 (char *name, struct exec_tracee *tracee, rewrite = buffer1 + link_size; remaining = buffer1 + sizeof buffer1 - rewrite - 1; memcpy (rewrite, name, strnlen (name, remaining)); - - /* Replace name with buffer1. */ -#ifndef REENTRANT - strcpy (name, buffer1); -#endif /* REENTRANT */ } } diff --git a/exec/exec.h b/exec/exec.h index 3ce06c35311..59963587573 100644 --- a/exec/exec.h +++ b/exec/exec.h @@ -152,6 +152,16 @@ struct exec_tracee completion. */ USER_WORD sp; + /* Name of the executable being run. */ + char *exec_file; + + /* Pointer to a storage area holding instructions for loading an + executable if an `exec' system call is outstanding, or NULL. */ + char *exec_data; + + /* Number of bytes in exec_data. */ + size_t data_size; + /* The thread ID of this process. */ pid_t pid; @@ -162,11 +172,6 @@ struct exec_tracee /* Whether or not the tracee has been created but is not yet processed by `handle_clone'. */ bool new_child : 1; - -#ifndef REENTRANT - /* Name of the executable being run. */ - char *exec_file; -#endif /* !REENTRANT */ }; diff --git a/exec/exec1.c b/exec/exec1.c index aaff9a94c62..cbd756d3d5c 100644 --- a/exec/exec1.c +++ b/exec/exec1.c @@ -42,6 +42,9 @@ main (int argc, char **argv) extern char **environ; int wstatus; + /* Provide the file name of the loader. */ + exec_init (argv[1]); + pid1 = getpid (); pid = fork (); @@ -58,9 +61,6 @@ main (int argc, char **argv) } else { - /* Provide the file name of the loader. */ - exec_init (argv[1]); - if (after_fork (pid)) exit (127); diff --git a/exec/mipsel-user.h b/exec/mipsel-user.h index 04f4a2a5089..14d8f6d0d5e 100644 --- a/exec/mipsel-user.h +++ b/exec/mipsel-user.h @@ -24,10 +24,6 @@ along with GNU Emacs. If not, see . */ #include -#ifndef ELF_NGREG -#define ELF_NGREG 45 -#endif /* ELF_NGREG */ - /* This file defines a structure containing user mode general purpose @@ -36,7 +32,15 @@ along with GNU Emacs. If not, see . */ struct mipsel_regs { /* General purpose registers. */ - uint64_t gregs[ELF_NGREG]; + uint64_t gregs[32]; + + /* Saved special registers. */ + uint64_t lo; + uint64_t hi; + uint64_t cp0_epc; + uint64_t cp0_badvaddr; + uint64_t cp0_status; + uint64_t cp0_cause; }; #endif /* _MIPSEL_USER_H_ */ diff --git a/exec/trace.c b/exec/trace.c index 05d862f5b9f..7cf95ed5733 100644 --- a/exec/trace.c +++ b/exec/trace.c @@ -49,11 +49,21 @@ along with GNU Emacs. If not, see . */ #ifndef SYS_SECCOMP #define SYS_SECCOMP 1 -#endif /* SYS_SECCOMP */ +#endif /* !defined SYS_SECCOMP */ #ifndef PTRACE_GETEVENTMSG #define PTRACE_GETEVENTMSG 0x4201 -#endif /* PTRACE_GETEVENTMSG */ +#endif /* !defined PTRACE_GETEVENTMSG */ + +#ifdef HAVE_SECCOMP +#include +#include + +#include +#include + +#include +#endif /* !defined HAVE_SECCOMP */ @@ -70,6 +80,15 @@ along with GNU Emacs. If not, see . */ /* Number of tracees children are allowed to create. */ #define MAX_TRACEES 4096 +#ifdef HAVE_SECCOMP + +/* Whether to enable seccomp acceleration. */ +static bool use_seccomp_p; + +#else /* !HAVE_SECCOMP */ +#define use_seccomp_p (false) +#endif /* HAVE_SECCOMP */ + #ifdef __aarch64__ /* Place PID's registers into *REGS. Return 1 upon failure, else @@ -105,8 +124,7 @@ aarch64_set_regs (pid_t pid, USER_REGS_STRUCT *regs, iov.iov_base = regs; iov.iov_len = sizeof *regs; - rc = ptrace (PTRACE_SETREGSET, pid, NT_PRSTATUS, - &iov); + rc = ptrace (PTRACE_SETREGSET, pid, NT_PRSTATUS, &iov); if (rc < 0) return 1; @@ -367,14 +385,17 @@ remove_tracee (struct exec_tracee *tracee) /* Link the tracee onto the list of free tracees. */ tracee->next = free_tracees; -#ifndef REENTRANT /* Free the exec file, if any. */ free (tracee->exec_file); tracee->exec_file = NULL; -#endif /* REENTRANT */ - free_tracees = tracee; + /* Likewise with any loader instructions that might be + present. */ + free (tracee->exec_data); + tracee->exec_data = NULL; + /* Return this tracee to the list of free ones. */ + free_tracees = tracee; return; } else @@ -419,7 +440,6 @@ find_tracee (pid_t process) static void handle_clone_prepare (struct exec_tracee *parent) { -#ifndef REENTRANT long rc; unsigned long pid; struct exec_tracee *tracee; @@ -440,7 +460,8 @@ handle_clone_prepare (struct exec_tracee *parent) assert (tracee->new_child); tracee->new_child = false; tracee->exec_file = NULL; - ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + tracee->pid, 0, 0); if (parent->exec_file) tracee->exec_file = strdup (parent->exec_file); @@ -457,12 +478,9 @@ handle_clone_prepare (struct exec_tracee *parent) tracee = &static_tracees[tracees]; tracees++; } -#ifndef REENTRANT - /* Try to allocate a tracee using `malloc' if this library is - not being built to run inside a signal handler. */ + /* Try to allocate a tracee using `malloc'. */ else if ((tracee = malloc (sizeof *tracee))) ; -#endif /* REENTRANT */ else return; @@ -477,7 +495,6 @@ handle_clone_prepare (struct exec_tracee *parent) if (parent->exec_file) tracee->exec_file = strdup (parent->exec_file); -#endif /* REENTRANT */ } /* Handle the completion of a `clone' or `clone3' system call, @@ -513,21 +530,18 @@ handle_clone (struct exec_tracee *tracee, pid_t pid) tracee = &static_tracees[tracees]; tracees++; } -#ifndef REENTRANT - /* Try to allocate a tracee using `malloc' if this library is - not being built to run inside a signal handler. */ + /* Try to allocate a tracee using `malloc'. */ else if ((tracee = malloc (sizeof *tracee))) ; -#endif /* REENTRANT */ else return 1; tracee->pid = pid; tracee->next = tracing_processes; tracee->waiting_for_syscall = false; -#ifndef REENTRANT tracee->exec_file = NULL; -#endif /* REENTRANT */ + tracee->exec_data = NULL; + tracee->data_size = 0; tracing_processes = tracee; tracee->new_child = true; @@ -549,6 +563,11 @@ handle_clone (struct exec_tracee *tracee, pid_t pid) flags |= PTRACE_O_TRACESYSGOOD; flags |= PTRACE_O_TRACEEXIT; +#ifdef HAVE_SECCOMP + if (use_seccomp_p) + flags |= PTRACE_O_TRACESECCOMP; +#endif /* HAVE_SECCOMP */ + rc = ptrace (PTRACE_SETOPTIONS, pid, 0, flags); if (rc) @@ -559,7 +578,8 @@ handle_clone (struct exec_tracee *tracee, pid_t pid) /* The new tracee is currently stopped. Continue it until the next system call. */ - rc = ptrace (PTRACE_SYSCALL, pid, 0, 0); + rc = ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + pid, 0, 0); if (rc) goto bail; @@ -618,9 +638,11 @@ check_signal (struct exec_tracee *tracee, int status) { if (siginfo.si_code < 0) /* SIGTRAP delivered from userspace. Pass it on. */ - ptrace (PTRACE_SYSCALL, tracee->pid, 0, SIGTRAP); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + tracee->pid, 0, SIGTRAP); else - ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + tracee->pid, 0, 0); return 1; } @@ -639,26 +661,28 @@ check_signal (struct exec_tracee *tracee, int status) it. */ #ifdef HAVE_SIGINFO_T_SI_SYSCALL #ifndef __arm__ - ptrace (PTRACE_SYSCALL, tracee->pid, + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), tracee->pid, 0, ((siginfo.si_code == SYS_SECCOMP && siginfo.si_syscall == -1) ? 0 : status)); #else /* __arm__ */ - ptrace (PTRACE_SYSCALL, tracee->pid, + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), tracee->pid, 0, ((siginfo.si_code == SYS_SECCOMP && siginfo.si_syscall == 222) ? 0 : status)); #endif /* !__arm__ */ #else /* !HAVE_SIGINFO_T_SI_SYSCALL */ /* Drop this signal, since what caused it is unknown. */ - ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), tracee->pid, + 0, 0); #endif /* HAVE_SIGINFO_T_SI_SYSCALL */ return 1; #endif /* SIGSYS */ default: /* Continue the process until the next syscall. */ - ptrace (PTRACE_SYSCALL, tracee->pid, 0, status); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + tracee->pid, 0, status); return 1; } @@ -667,17 +691,16 @@ check_signal (struct exec_tracee *tracee, int status) -/* Handle an `exec' system call from the given TRACEE. REGS are the - tracee's current user-mode registers. +/* Handle the first stage of an `exec' system call from the given + TRACEE. REGS are the tracee's current user-mode registers. Rewrite the system call arguments to use the loader binary. Then, - continue the system call until the loader is loaded. Write the - information necessary to load the original executable into the - loader's stack. + resume the process till the loader is loaded and about to begin + execution. Save instructions to load the original executable into + TRACEE->exec_data. Value is 0 upon success, 1 upon a generic failure before the loader - is loaded, 2 if the process has stopped, and 3 if something failed, - but it is too late to handle it. + is loaded. Set errno appropriately upon returning a generic failure. */ @@ -687,16 +710,10 @@ handle_exec (struct exec_tracee *tracee, USER_REGS_STRUCT *regs) char buffer[PATH_MAX + 80], *area; USER_REGS_STRUCT original; size_t size, loader_size; - USER_WORD loader, size1, sp; - int rc, wstatus; - siginfo_t siginfo; - - /* Save the old stack pointer. */ - sp = regs->STACK_POINTER; + USER_WORD loader; /* Read the file name. */ - read_memory (tracee, buffer, PATH_MAX, - regs->SYSCALL_ARG_REG); + read_memory (tracee, buffer, PATH_MAX, regs->SYSCALL_ARG_REG); /* Make sure BUFFER is NULL terminated. */ @@ -722,8 +739,18 @@ handle_exec (struct exec_tracee *tracee, USER_REGS_STRUCT *regs) return 1; } - /* Rewrite the first argument to point to the loader. */ + /* Save this area in the tracee. */ + assert (!tracee->exec_data); + tracee->exec_data = malloc (size); + if (!tracee->exec_data) + { + errno = ENOMEM; + return 1; + } + memcpy (tracee->exec_data, area, size); + tracee->data_size = size; + /* Rewrite the first argument to point to the loader. */ loader_size = strlen (loader_name) + 1; loader = user_alloca (tracee, &original, regs, loader_size); @@ -731,14 +758,14 @@ handle_exec (struct exec_tracee *tracee, USER_REGS_STRUCT *regs) if (!loader) { errno = ENOMEM; - return 1; + goto free_data_error; } if (user_copy (tracee, (unsigned char *) loader_name, loader, loader_size)) { errno = EIO; - return 1; + goto free_data_error; } regs->SYSCALL_ARG_REG = loader; @@ -748,151 +775,113 @@ handle_exec (struct exec_tracee *tracee, USER_REGS_STRUCT *regs) if (aarch64_set_regs (tracee->pid, regs, false)) { errno = EIO; - return 1; + goto free_data_error; } #else /* !__aarch64__ */ - if (ptrace (PTRACE_SETREGS, tracee->pid, NULL, - regs)) + if (ptrace (PTRACE_SETREGS, tracee->pid, NULL, regs)) { errno = EIO; - return 1; + goto free_data_error; } #endif /* __aarch64__ */ - /* Continue the system call until loader starts. */ + /* Resume the process till the loader is executed. */ if (ptrace (PTRACE_SYSCALL, tracee->pid, NULL, NULL)) { errno = EIO; - return 1; + goto free_data_error; } -#ifndef REENTRANT - /* Now that the loader has started, record the value to use for - /proc/self/exe. Don't give up just because strdup fails. + /* Now that the loader has been executed, record the value to + substitute for /proc/self/exe. Don't give up just because strdup + fails. Note that exec_0 copies the absolute file name into buffer. */ if (tracee->exec_file) free (tracee->exec_file); tracee->exec_file = strdup (buffer); -#endif /* REENTRANT */ - - again: - rc = waitpid (tracee->pid, &wstatus, __WALL); - if (rc == -1 && errno == EINTR) - goto again; - - if (rc < 0) - return 1; - - if (!WIFSTOPPED (wstatus)) - /* The process has been killed in response to a signal. - In this case, simply return 2. */ - return 2; - else - { - /* Then, check if STATUS is not a syscall-stop, and try again if - it isn't. */ - rc = check_signal (tracee, wstatus); - - if (rc == -1) - return 2; - else if (rc) - goto again; - - /* Retrieve the signal information and determine whether or not - the system call has completed. */ - - if (ptrace (PTRACE_GETSIGINFO, tracee->pid, 0, - &siginfo)) - return 3; - - if (!syscall_trap_p (&siginfo)) - { - /* Continue. */ - if (ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0)) - return 3; - - goto again; - } - } - -#ifdef __aarch64__ - - if (aarch64_get_regs (tracee->pid, &original)) - return 3; + return 0; -#else /* !__aarch64__ */ + free_data_error: + free (tracee->exec_data); + tracee->exec_data = NULL; + return 1; +} - /* The system call has now completed. Get the registers again. */ +/* Complete an `exec' system call issued by TRACEE. Write the + instructions stored in TRACEE->exec_data to an appropriate location + in TRACEE's stack, and resume TRACEE, releasing TRACEE->exec_data. + REGS should be the TRACEE's user registers. If the reissued system + call did not succeed in starting the executable loader, restore + TRACEE->sp (recorded by process_system_call or seccomp_system_call), + and resume execution, so that the failure may be reported. */ - if (ptrace (PTRACE_GETREGS, tracee->pid, NULL, - &original)) - return 3; +static void +finish_exec (struct exec_tracee *tracee, USER_REGS_STRUCT *regs) +{ + USER_WORD size1, loader; + USER_REGS_STRUCT original; -#endif /* __aarch64__ */ + size1 = tracee->data_size; - *regs = original; + /* Record the registers' values as they originally were. */ + memcpy (&original, regs, sizeof *regs); - /* Upon failure, wait for the next system call and return - success. */ + /* Any non-zero value of `original.SYSCALL_RET_REG' indicates that the + reissued `exec' call was unsuccessful, and the loader is not + executing. Restore the previous stack pointer and permit the + tracee to run to completion. */ if (original.SYSCALL_RET_REG) { - /* Restore the original stack pointer. */ - regs->STACK_POINTER = sp; - + regs->STACK_POINTER = tracee->sp; #ifdef __aarch64__ aarch64_set_regs (tracee->pid, regs, false); #else /* !__aarch64__ */ ptrace (PTRACE_SETREGS, tracee->pid, NULL, regs); #endif /* __aarch64__ */ - - goto exec_failure; + return; } /* Write the loader area to the stack, followed by its size and the original stack pointer. */ loader = user_alloca (tracee, &original, regs, - size + sizeof loader * 2); + size1 + sizeof loader * 2); if (!loader) - return 3; - - size1 = size; + goto error; #ifndef STACK_GROWS_DOWNWARDS - - NOT_IMPLEMENTED; - + not implemented, you lose. #else /* STACK_GROWS_DOWNWARDS */ - if (user_copy (tracee, (unsigned char *) area, - loader + sizeof size1 * 2, size) + if (user_copy (tracee, (unsigned char *) tracee->exec_data, + loader + sizeof size1 * 2, size1) || user_copy (tracee, (unsigned char *) &size1, loader + sizeof size1, sizeof size1)) - return 3; + goto error; size1 = original.STACK_POINTER; if (user_copy (tracee, (unsigned char *) &size1, loader, sizeof size1)) - return 3; + goto error; #endif /* STACK_GROWS_DOWNWARDS */ /* Continue. */ - if (ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0)) - return 3; - - return 0; + if (ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + tracee->pid, 0, 0)) + goto error; - exec_failure: - return 3; + error: + free (tracee->exec_data); + tracee->exec_data = NULL; } @@ -1007,13 +996,6 @@ static int handle_readlinkat (USER_WORD callno, USER_REGS_STRUCT *regs, struct exec_tracee *tracee, USER_WORD *result) { -#ifdef REENTRANT - /* readlinkat cannot be handled specially when the library is built - to be reentrant, as the file name information cannot be - recorded. */ - return 0; -#else /* !REENTRANT */ - char buffer[PATH_MAX + 1]; USER_WORD address, return_buffer, size; size_t length; @@ -1086,7 +1068,6 @@ handle_readlinkat (USER_WORD callno, USER_REGS_STRUCT *regs, *result = length; return 2; -#endif /* REENTRANT */ } /* Handle an `open' or `openat' system call. @@ -1104,12 +1085,6 @@ static int handle_openat (USER_WORD callno, USER_REGS_STRUCT *regs, struct exec_tracee *tracee, USER_WORD *result) { -#ifdef REENTRANT - /* readlinkat cannot be handled specially when the library is built - to be reentrant, as the file name information cannot be - recorded. */ - return 0; -#else /* !REENTRANT */ char buffer[PATH_MAX + 1]; USER_WORD address; size_t length; @@ -1199,7 +1174,6 @@ handle_openat (USER_WORD callno, USER_REGS_STRUCT *regs, fail: errno = EIO; return 1; -#endif /* REENTRANT */ } /* Process the system call at which TRACEE is stopped. If the system @@ -1229,30 +1203,43 @@ process_system_call (struct exec_tracee *tracee) /* Save the stack pointer. */ sp = regs.STACK_POINTER; - /* Now dispatch based on the system call. */ - callno = regs.SYSCALL_NUM_REG; + /* Now dispatch based on the system call. If TRACEE->exec_data is + set, this must be exec, whatever the value of SYSCALL_NUM_REG, + which is erased when exec loads another image. */ + + callno = (!tracee->exec_data ? regs.SYSCALL_NUM_REG : EXEC_SYSCALL); switch (callno) { case EXEC_SYSCALL: - /* exec system calls should be handled synchronously. */ - assert (!tracee->waiting_for_syscall); - rc = handle_exec (tracee, ®s); - - switch (rc) + if (!tracee->waiting_for_syscall) { - case 3: - /* It's too late to do anything about this error,. */ - break; + /* The outstanding syscall flag must not be inconsistent with + the presence of instructions for the loader. */ + assert (!tracee->exec_data); + rc = handle_exec (tracee, ®s); - case 2: - /* The process has gone away. */ - remove_tracee (tracee); - break; + if (rc) + /* An error has occurred; errno is set to the error. */ + goto report_syscall_error; + + /* The process has been resumed. Assert that the instructions + for loading this executable have been generated and + recorded, and set waiting_for_syscall. */ + tracee->waiting_for_syscall = true; + assert (tracee->exec_data); - case 1: - /* An error has occurred; errno is set to the error. */ - goto report_syscall_error; + /* Record the initial stack pointer also. */ + tracee->sp = sp; + } + else + { + assert (tracee->exec_data); + finish_exec (tracee, ®s); + + /* The process has been resumed and has become capable of + executing independently. */ + tracee->waiting_for_syscall = false; } break; @@ -1311,7 +1298,7 @@ process_system_call (struct exec_tracee *tracee) regs.STACK_POINTER = tracee->sp; #ifdef __aarch64__ - if (aarch64_set_regs (tracee->pid, ®s, true)) + if (aarch64_set_regs (tracee->pid, ®s, false)) return; #else /* !__aarch64__ */ if (ptrace (PTRACE_SETREGS, tracee->pid, NULL, ®s)) @@ -1327,11 +1314,35 @@ process_system_call (struct exec_tracee *tracee) will DTRT upon the next call to PTRACE_SYSCALL after the syscall-trap signal is delivered. */ - rc = ptrace (PTRACE_SYSCALL, tracee->pid, + rc = ptrace (((use_seccomp_p + /* open and openat are not processed synchronously, + nor can they afford to dispense with + post-syscall finalization. */ + + && ((callno != OPENAT_SYSCALL +#ifdef OPEN_SYSCALL + && callno != OPEN_SYSCALL +#endif /* OPEN_SYSCALL */ + ) + /* Since syscall initialization should be + reserved for seccomp_system_call, resume the + process if this system call is already + complete. */ + || !tracee->waiting_for_syscall)) + ? PTRACE_CONT : PTRACE_SYSCALL), tracee->pid, NULL, NULL); if (rc < 0) return; +#ifdef HAVE_SECCOMP + if (!(use_seccomp_p + && ((callno != OPENAT_SYSCALL +#ifdef OPEN_SYSCALL + && callno != OPEN_SYSCALL +#endif /* OPEN_SYSCALL */ + ) + || !tracee->waiting_for_syscall))) +#endif /* !HAVE_SECCOMP */ tracee->waiting_for_syscall = !tracee->waiting_for_syscall; } @@ -1345,9 +1356,10 @@ process_system_call (struct exec_tracee *tracee) reporting_error = false; common: - /* Reporting an error or emulating a system call works by setting - the system call number to -1, letting it continue, and then - substituting errno for ENOSYS in the case of an error. + /* Reporting an error or emulating a system call works by replacing + the system call number with -1 or another nonexistent syscall, + letting it continue, and then substituting errno for ENOSYS in the + case of an error. Make sure that the stack pointer is restored to its original position upon exit, or bad things can happen. */ @@ -1426,13 +1438,13 @@ process_system_call (struct exec_tracee *tracee) #endif /* __aarch64__ */ /* Now wait for the next system call to happen. */ - ptrace (PTRACE_SYSCALL, tracee->pid, NULL, NULL); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + tracee->pid, NULL, NULL); } else { /* No error is being reported. Return the result in the appropriate registers. */ - #ifdef __mips__ /* MIPS systems place errno in v0 and set a3 to 1. */ regs.gregs[2] = result; @@ -1449,14 +1461,367 @@ process_system_call (struct exec_tracee *tracee) #endif /* __aarch64__ */ /* Now wait for the next system call to happen. */ - ptrace (PTRACE_SYSCALL, tracee->pid, NULL, NULL); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + tracee->pid, NULL, NULL); } } +#ifdef HAVE_SECCOMP + +/* Seccomp acceleration. + + Seccomp enables selectively filtering signals so that the tracing + process is only notified of such system calls as it is interested in + intercepting, i.e., exec and open*. This improves performance + enormously over the traditional approach of pausing the tracee before + each system call. */ + +/* Whether the kernel's version is 4.7.x or earlier. */ +static bool kernel_4_7_or_earlier; + +/* Array of system calls this module is interested in intercepting. */ +static int interesting_syscalls[] = + { + EXEC_SYSCALL, +#ifdef OPEN_SYSCALL + OPEN_SYSCALL, +#endif /* OPEN_SYSCALL */ + OPENAT_SYSCALL, +#ifdef READLINK_SYSCALL + READLINK_SYSCALL, +#endif /* READLINK_SYSCALL */ + READLINKAT_SYSCALL, + }; + +/* Number of elements in an array. */ +#define ARRAYELTS(arr) (sizeof (arr) / sizeof (arr)[0]) + +/* Install a secure computing filter that will notify attached tracers + when a system call of interest to this module is received. Value is + 0 if successful, 1 otherwise. */ + +static int +establish_seccomp_filter (void) +{ + struct sock_filter statements[1 + ARRAYELTS (interesting_syscalls) + 2]; + struct sock_fprog program; + int index, rc; + + index = 0; + + /* As the exec wrapper will reject executables for an inappropriate + architecture, verifying the same here would only be redundant. + Proceed to load the current system call number. */ + + statements[index++] = ((struct sock_filter) + BPF_STMT (BPF_LD + BPF_W + BPF_ABS, + offsetof (struct seccomp_data, nr))); + + /* Search for system calls of interest. */ + + statements[index] + = ((struct sock_filter) + BPF_JUMP (BPF_JMP + BPF_JEQ + BPF_K, EXEC_SYSCALL, + ARRAYELTS (interesting_syscalls), 0)); index++; +#ifdef OPEN_SYSCALL + statements[index] + = ((struct sock_filter) + BPF_JUMP (BPF_JMP + BPF_JEQ + BPF_K, OPEN_SYSCALL, + ARRAYELTS (interesting_syscalls) - index + 1, 0)); index++; +#endif /* OPEN_SYSCALL */ + statements[index] + = ((struct sock_filter) + BPF_JUMP (BPF_JMP + BPF_JEQ + BPF_K, OPENAT_SYSCALL, + ARRAYELTS (interesting_syscalls) - index + 1, 0)); index++; +#ifdef READLINK_SYSCALL + statements[index] + = ((struct sock_filter) + BPF_JUMP (BPF_JMP + BPF_JEQ + BPF_K, READLINK_SYSCALL, + ARRAYELTS (interesting_syscalls) - index + 1, 0)); index++; +#endif /* READLINK_SYSCALL */ + statements[index] + = ((struct sock_filter) + BPF_JUMP (BPF_JMP + BPF_JEQ + BPF_K, READLINKAT_SYSCALL, + ARRAYELTS (interesting_syscalls) - index + 1, 0)); index++; + + /* If not intercepted above, permit this system call to execute as + normal. */ + statements[index++] + = (struct sock_filter) BPF_STMT (BPF_RET + BPF_K, SECCOMP_RET_ALLOW); + statements[index++] + = (struct sock_filter) BPF_STMT (BPF_RET + BPF_K, SECCOMP_RET_TRACE); + + rc = prctl (PR_SET_NO_NEW_PRIVS, 1, 0, 0, 0); + if (rc) + return 1; + + program.len = ARRAYELTS (statements); + program.filter = statements; + rc = prctl (PR_SET_SECCOMP, SECCOMP_MODE_FILTER, &program); + if (rc) + return 1; + + return 0; +} + +/* Intercept or resume and dismiss the system call at which TRACEE is + paused, similarly to process_system_call. */ + +static void +seccomp_system_call (struct exec_tracee *tracee) +{ + USER_REGS_STRUCT regs; + int rc, wstatus, save_errno; + USER_WORD callno, sp; + USER_WORD result; + bool reporting_error; + + if (kernel_4_7_or_earlier) + { + /* On kernel 4.7 and earlier, following a PTRACE_EVENT_SECCOMP by + a PTRACE_SYSCALL will give rise to a syscall-entry stop event, + and seccomp filters will be suppressed till the system call + runs its course. */ + ptrace (PTRACE_SYSCALL, tracee->pid, 0, 0); + return; + } + +#ifdef __aarch64__ + rc = aarch64_get_regs (tracee->pid, ®s); +#else /* !__aarch64__ */ + rc = ptrace (PTRACE_GETREGS, tracee->pid, NULL, + ®s); +#endif /* __aarch64__ */ + + /* TODO: what to do if this fails? */ + if (rc < 0) + return; + + /* On kernel 4.8, processes resumed after being paused so as to + produce a PTRACE_EVENT_SECCOMP will execute till the system call + completes, or indefinitely if resumed with PTRACE_CONT. + + In this context processes are resumed with PTRACE_CONT unless it is + an `open' syscall that is being intercepted, which, if successfully + intercepted, must receive adjustments to their stack pointer upon + completion of said system call. */ + assert (!tracee->waiting_for_syscall); + + /* Save the stack pointer. */ + sp = regs.STACK_POINTER; + + /* Now dispatch based on the system call. */ + callno = regs.SYSCALL_NUM_REG; + switch (callno) + { + case EXEC_SYSCALL: + assert (!tracee->exec_data); + rc = handle_exec (tracee, ®s); + + if (rc) + /* An error has occurred; errno is set to the error. */ + goto report_syscall_error; + + /* The process has been resumed. Assert that the instructions for + loading this executable have been generated and recorded, and + set waiting_for_syscall. */ + tracee->waiting_for_syscall = true; + assert (tracee->exec_data); + + /* Record the initial stack pointer also. */ + tracee->sp = sp; + break; + +#ifdef READLINK_SYSCALL + case READLINK_SYSCALL: +#endif /* READLINK_SYSCALL */ + case READLINKAT_SYSCALL: + /* Handle this readlinkat system call. */ + rc = handle_readlinkat (callno, ®s, tracee, + &result); + + /* rc means the same as in `handle_exec'. */ + + if (rc == 1) + goto report_syscall_error; + else if (rc == 2) + goto emulate_syscall; + + goto continue_syscall; + +#ifdef OPEN_SYSCALL + case OPEN_SYSCALL: +#endif /* OPEN_SYSCALL */ + case OPENAT_SYSCALL: + /* Handle this open system call. */ + rc = handle_openat (callno, ®s, tracee, &result); + + /* rc means the same as in `handle_exec', except that `open' + is never emulated. */ + + if (rc == 1) + goto report_syscall_error; + + /* The stack pointer must be restored after it was modified + by `user_alloca'; record sp in TRACEE, which will be + restored after this system call completes. */ + tracee->sp = sp; + + /* As such, arrange to enter `process_system_call' on its + completion. */ + rc = ptrace (PTRACE_SYSCALL, tracee->pid, + NULL, NULL); + if (rc < 0) + return; + + tracee->waiting_for_syscall = !tracee->waiting_for_syscall; + break; + + default: + continue_syscall: + rc = ptrace (PTRACE_CONT, tracee->pid, NULL, NULL); + if (rc < 0) + return; + } + + return; + + report_syscall_error: + reporting_error = true; + goto common; + + emulate_syscall: + reporting_error = false; + common: + + /* Reporting an error or emulating a system call works by replacing + the system call number with -1 or another nonexistent syscall, + letting it continue, and then substituting errno for ENOSYS in the + case of an error. + + Make sure that the stack pointer is restored to its original + position upon exit, or bad things can happen. */ + + /* First, save errno; system calls below will clobber it. */ + save_errno = errno; + + regs.SYSCALL_NUM_REG = -1; + regs.STACK_POINTER = sp; + +#ifdef __aarch64__ + if (aarch64_set_regs (tracee->pid, ®s, true)) + return; +#else /* !__aarch64__ */ + +#ifdef __arm__ + /* On ARM systems, a special request is used to update the system + call number as known to the kernel. In addition, the system call + number must be valid, so use `tuxcall'. Hopefully, nobody will + run this on a kernel with Tux. */ + + if (ptrace (PTRACE_SET_SYSCALL, tracee->pid, NULL, 222)) + return; +#endif /* __arm__ */ + + if (ptrace (PTRACE_SETREGS, tracee->pid, NULL, ®s)) + return; +#endif /* __aarch64__ */ + + /* Do this invalid system call. */ + if (ptrace (PTRACE_SYSCALL, tracee->pid, NULL, NULL)) + return; + + again1: + rc = waitpid (tracee->pid, &wstatus, __WALL); + if (rc == -1 && errno == EINTR) + goto again1; + + /* Return if waitpid fails. */ + + if (rc == -1) + return; + + /* If the process received a signal, see if the signal is SIGSYS + and/or from seccomp. If so, discard it. */ + + if (WIFSTOPPED (wstatus)) + { + rc = check_signal (tracee, wstatus); + + if (rc == -1) + return; + else if (rc) + goto again1; + } + + if (!WIFSTOPPED (wstatus)) + /* The process has been killed in response to a signal. In this + case, simply unlink the tracee and return. */ + remove_tracee (tracee); + else if (reporting_error) + { +#ifdef __mips__ + /* MIPS systems place errno in v0 and set a3 to 1. */ + regs.gregs[2] = save_errno; + regs.gregs[7] = 1; +#else /* !__mips__ */ + regs.SYSCALL_RET_REG = -save_errno; +#endif /* __mips__ */ + + /* Report errno. */ +#ifdef __aarch64__ + aarch64_set_regs (tracee->pid, ®s, false); +#else /* !__aarch64__ */ + ptrace (PTRACE_SETREGS, tracee->pid, NULL, ®s); +#endif /* __aarch64__ */ + + /* Resume the process till the next interception by its filter. */ + ptrace (PTRACE_CONT, tracee->pid, NULL, NULL); + } + else + { + /* No error is being reported. Return the result in the + appropriate registers. */ + +#ifdef __mips__ + /* MIPS systems place errno in v0 and set a3 to 1. */ + regs.gregs[2] = result; + regs.gregs[7] = 0; +#else /* !__mips__ */ + regs.SYSCALL_RET_REG = result; +#endif /* __mips__ */ + + /* Report errno. */ +#ifdef __aarch64__ + aarch64_set_regs (tracee->pid, ®s, false); +#else /* !__aarch64__ */ + ptrace (PTRACE_SETREGS, tracee->pid, NULL, ®s); +#endif /* __aarch64__ */ + + /* Resume the process till the next interception by its filter. */ + ptrace (PTRACE_CONT, tracee->pid, NULL, NULL); + } +} + +#ifndef PTRACE_EVENT_SECCOMP +#define PTRACE_EVENT_SECCOMP 7 +#endif /* !PTRACE_EVENT_SECCOMP */ + +#ifndef PTRACE_O_TRACESECCOMP +#define PTRACE_O_TRACESECCOMP (1 << PTRACE_EVENT_SECCOMP) +#endif /* !PTRACE_O_TRACESECCOMP */ + +#ifndef SIGSYS +#define SIGSYS 31 +#endif /* !SIGSYS */ +#endif /* HAVE_SECCOMP */ + + + /* Like `execve', but asks the parent to begin tracing this thread. - Fail if tracing is unsuccessful. */ + Fail by returning a non-zero value if tracing is unsuccessful. */ int tracing_execve (const char *file, char *const *argv, @@ -1471,6 +1836,13 @@ tracing_execve (const char *file, char *const *argv, /* Notify the parent to enter signal-delivery-stop. */ raise (SIGSTOP); + +#ifdef HAVE_SECCOMP + /* Install the seccomp filter. */ + if (use_seccomp_p && establish_seccomp_filter ()) + return 1; +#endif /* HAVE_SECCOMP */ + return execve (file, argv, envp); } @@ -1510,6 +1882,10 @@ after_fork (pid_t pid) flags |= PTRACE_O_TRACEFORK; flags |= PTRACE_O_TRACESYSGOOD; flags |= PTRACE_O_TRACEEXIT; +#ifdef HAVE_SECCOMP + if (use_seccomp_p) + flags |= PTRACE_O_TRACESECCOMP; +#endif /* HAVE_SECCOMP */ rc = ptrace (PTRACE_SETOPTIONS, pid, 0, flags); @@ -1521,8 +1897,10 @@ after_fork (pid_t pid) return 1; } - /* Request that the child stop upon the next system call. */ - rc = ptrace (PTRACE_SYSCALL, pid, 0, 0); + /* Request that the child stop upon the next system call, or the next + filter event. */ + rc = ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + pid, 0, 0); if (rc) return 1; @@ -1543,9 +1921,9 @@ after_fork (pid_t pid) tracee->next = tracing_processes; tracee->waiting_for_syscall = false; tracee->new_child = false; -#ifndef REENTRANT tracee->exec_file = NULL; -#endif /* REENTRANT */ + tracee->exec_data = NULL; + tracee->data_size = 0; tracing_processes = tracee; return 0; } @@ -1604,9 +1982,11 @@ exec_waitpid (pid_t pid, int *wstatus, int options) { if (siginfo.si_code < 0) /* SIGTRAP delivered from userspace. Pass it on. */ - ptrace (PTRACE_SYSCALL, pid, 0, SIGTRAP); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + pid, 0, SIGTRAP); else - ptrace (PTRACE_SYSCALL, pid, 0, 0); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + pid, 0, 0); return -1; } @@ -1644,8 +2024,16 @@ exec_waitpid (pid_t pid, int *wstatus, int options) /* These events are handled by tracing SIGSTOP signals sent to unknown tracees. Make sure not to pass through status, as there's no signal really being delivered. */ - ptrace (PTRACE_SYSCALL, pid, 0, 0); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), pid, 0, 0); + return -1; + +#ifdef HAVE_SECCOMP + case SIGTRAP | (PTRACE_EVENT_SECCOMP << 8): + /* Intercept and process this system call if the event was + produced by our filter. */ + seccomp_system_call (tracee); return -1; +#endif /* HAVE_SECCOMP */ #ifdef SIGSYS case SIGSYS: @@ -1657,17 +2045,20 @@ exec_waitpid (pid_t pid, int *wstatus, int options) it. */ #ifdef HAVE_SIGINFO_T_SI_SYSCALL #ifndef __arm__ - ptrace (PTRACE_SYSCALL, pid, 0, ((siginfo.si_code == SYS_SECCOMP - && siginfo.si_syscall == -1) - ? 0 : status)); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + pid, 0, ((siginfo.si_code == SYS_SECCOMP + && siginfo.si_syscall == -1) + ? 0 : status)); #else /* __arm__ */ - ptrace (PTRACE_SYSCALL, pid, 0, ((siginfo.si_code == SYS_SECCOMP - && siginfo.si_syscall == 222) - ? 0 : status)); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), + pid, 0, ((siginfo.si_code == SYS_SECCOMP + && siginfo.si_syscall == 222) + ? 0 : status)); #endif /* !__arm__ */ #else /* !HAVE_SIGINFO_T_SI_SYSCALL */ /* Drop this signal, since what caused it is unknown. */ - ptrace (PTRACE_SYSCALL, pid, 0, 0); + ptrace ((use_seccomp_p ? PTRACE_CONT : PTRACE_SYSCALL), pid, + 0, 0); #endif /* HAVE_SIGINFO_T_SI_SYSCALL */ return -1; #endif /* SIGSYS */ @@ -1698,5 +2089,49 @@ exec_waitpid (pid_t pid, int *wstatus, int options) void exec_init (const char *loader) { +#ifdef HAVE_SECCOMP + struct utsname u; + int major, minor; +#endif /* HAVE_SECCOMP */ + loader_name = loader; +#ifdef HAVE_SECCOMP + errno = 0; + prctl (PR_GET_SECCOMP); + + /* PR_GET_SECCOMP should not set errno if the kernel was configured + with support for seccomp. */ + if (!errno) + use_seccomp_p = true; + else + return; + + /* Establish whether the kernel is 4.7.x or older. */ + uname (&u); + if ((sscanf (u.release, "%d.%d", &major, &minor) == 2)) + { + /* Certain required ptrace features were introduced in kernel + 3.5. */ + if (major < 3 || (major == 3 && minor < 5)) + use_seccomp_p = false; + else + { + if (major < 4 || (major == 4 && minor <= 7)) + { +#ifndef __ANDROID__ + kernel_4_7_or_earlier = true; +#else /* __ANDROID __ */ + /* Certain Android kernels have received backports of + those new PTRACE_EVENT_SECCOMP semantics which were + introduced in kernel version 4.8, so that it is + necessary to actively establish which variant is in + place. This being much too involved for code I cannot + test, simply disable seccomp on kernel releases subject + to these uncertainties. */ + use_seccomp_p = false; +#endif /* !__ANDROID__ */ + } + } + } +#endif /* HAVE_SECCOMP */ }