2005-04-17 00:20:36 +02:00
|
|
|
/*
|
|
|
|
* syscalls.h - Linux syscall interfaces (non-arch-specific)
|
|
|
|
*
|
|
|
|
* Copyright (c) 2004 Randy Dunlap
|
|
|
|
* Copyright (c) 2004 Open Source Development Labs
|
|
|
|
*
|
|
|
|
* This file is released under the GPLv2.
|
|
|
|
* See the file COPYING for more details.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#ifndef _LINUX_SYSCALLS_H
|
|
|
|
#define _LINUX_SYSCALLS_H
|
|
|
|
|
|
|
|
struct epoll_event;
|
|
|
|
struct iattr;
|
|
|
|
struct inode;
|
|
|
|
struct iocb;
|
|
|
|
struct io_event;
|
|
|
|
struct iovec;
|
|
|
|
struct itimerspec;
|
|
|
|
struct itimerval;
|
|
|
|
struct kexec_segment;
|
|
|
|
struct linux_dirent;
|
|
|
|
struct linux_dirent64;
|
|
|
|
struct list_head;
|
|
|
|
struct msgbuf;
|
|
|
|
struct msghdr;
|
|
|
|
struct msqid_ds;
|
|
|
|
struct new_utsname;
|
|
|
|
struct nfsctl_arg;
|
|
|
|
struct __old_kernel_stat;
|
|
|
|
struct pollfd;
|
|
|
|
struct rlimit;
|
|
|
|
struct rusage;
|
|
|
|
struct sched_param;
|
|
|
|
struct semaphore;
|
|
|
|
struct sembuf;
|
|
|
|
struct shmid_ds;
|
|
|
|
struct sockaddr;
|
|
|
|
struct stat;
|
|
|
|
struct stat64;
|
|
|
|
struct statfs;
|
|
|
|
struct statfs64;
|
|
|
|
struct __sysctl_args;
|
|
|
|
struct sysinfo;
|
|
|
|
struct timespec;
|
|
|
|
struct timeval;
|
|
|
|
struct timex;
|
|
|
|
struct timezone;
|
|
|
|
struct tms;
|
|
|
|
struct utimbuf;
|
|
|
|
struct mq_attr;
|
2006-02-01 12:04:33 +01:00
|
|
|
struct compat_stat;
|
|
|
|
struct compat_timeval;
|
2006-05-23 16:46:40 +02:00
|
|
|
struct robust_list_head;
|
2006-09-26 10:52:28 +02:00
|
|
|
struct getcpu_cache;
|
2009-01-14 14:13:55 +01:00
|
|
|
struct old_linux_dirent;
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/aio_abi.h>
|
|
|
|
#include <linux/capability.h>
|
|
|
|
#include <linux/list.h>
|
|
|
|
#include <linux/sem.h>
|
|
|
|
#include <asm/siginfo.h>
|
|
|
|
#include <asm/signal.h>
|
|
|
|
#include <linux/quota.h>
|
|
|
|
#include <linux/key.h>
|
|
|
|
|
2009-01-14 14:13:59 +01:00
|
|
|
#define __SC_DECL1(t1, a1) t1 a1
|
|
|
|
#define __SC_DECL2(t2, a2, ...) t2 a2, __SC_DECL1(__VA_ARGS__)
|
|
|
|
#define __SC_DECL3(t3, a3, ...) t3 a3, __SC_DECL2(__VA_ARGS__)
|
|
|
|
#define __SC_DECL4(t4, a4, ...) t4 a4, __SC_DECL3(__VA_ARGS__)
|
|
|
|
#define __SC_DECL5(t5, a5, ...) t5 a5, __SC_DECL4(__VA_ARGS__)
|
|
|
|
#define __SC_DECL6(t6, a6, ...) t6 a6, __SC_DECL5(__VA_ARGS__)
|
|
|
|
|
|
|
|
#define __SC_LONG1(t1, a1) long a1
|
|
|
|
#define __SC_LONG2(t2, a2, ...) long a2, __SC_LONG1(__VA_ARGS__)
|
|
|
|
#define __SC_LONG3(t3, a3, ...) long a3, __SC_LONG2(__VA_ARGS__)
|
|
|
|
#define __SC_LONG4(t4, a4, ...) long a4, __SC_LONG3(__VA_ARGS__)
|
|
|
|
#define __SC_LONG5(t5, a5, ...) long a5, __SC_LONG4(__VA_ARGS__)
|
|
|
|
#define __SC_LONG6(t6, a6, ...) long a6, __SC_LONG5(__VA_ARGS__)
|
|
|
|
|
|
|
|
#define __SC_CAST1(t1, a1) (t1) a1
|
|
|
|
#define __SC_CAST2(t2, a2, ...) (t2) a2, __SC_CAST1(__VA_ARGS__)
|
|
|
|
#define __SC_CAST3(t3, a3, ...) (t3) a3, __SC_CAST2(__VA_ARGS__)
|
|
|
|
#define __SC_CAST4(t4, a4, ...) (t4) a4, __SC_CAST3(__VA_ARGS__)
|
|
|
|
#define __SC_CAST5(t5, a5, ...) (t5) a5, __SC_CAST4(__VA_ARGS__)
|
|
|
|
#define __SC_CAST6(t6, a6, ...) (t6) a6, __SC_CAST5(__VA_ARGS__)
|
|
|
|
|
|
|
|
#define __SC_TEST(type) BUILD_BUG_ON(sizeof(type) > sizeof(long))
|
|
|
|
#define __SC_TEST1(t1, a1) __SC_TEST(t1)
|
|
|
|
#define __SC_TEST2(t2, a2, ...) __SC_TEST(t2); __SC_TEST1(__VA_ARGS__)
|
|
|
|
#define __SC_TEST3(t3, a3, ...) __SC_TEST(t3); __SC_TEST2(__VA_ARGS__)
|
|
|
|
#define __SC_TEST4(t4, a4, ...) __SC_TEST(t4); __SC_TEST3(__VA_ARGS__)
|
|
|
|
#define __SC_TEST5(t5, a5, ...) __SC_TEST(t5); __SC_TEST4(__VA_ARGS__)
|
|
|
|
#define __SC_TEST6(t6, a6, ...) __SC_TEST(t6); __SC_TEST5(__VA_ARGS__)
|
|
|
|
|
|
|
|
#define SYSCALL_DEFINE0(name) asmlinkage long sys_##name(void)
|
|
|
|
#define SYSCALL_DEFINE1(...) SYSCALL_DEFINEx(1, __VA_ARGS__)
|
|
|
|
#define SYSCALL_DEFINE2(...) SYSCALL_DEFINEx(2, __VA_ARGS__)
|
|
|
|
#define SYSCALL_DEFINE3(...) SYSCALL_DEFINEx(3, __VA_ARGS__)
|
|
|
|
#define SYSCALL_DEFINE4(...) SYSCALL_DEFINEx(4, __VA_ARGS__)
|
|
|
|
#define SYSCALL_DEFINE5(...) SYSCALL_DEFINEx(5, __VA_ARGS__)
|
|
|
|
#define SYSCALL_DEFINE6(...) SYSCALL_DEFINEx(6, __VA_ARGS__)
|
|
|
|
|
|
|
|
#define SYSCALL_ALIAS(alias, name) \
|
|
|
|
asm ("\t.globl " #alias "\n\t.set " #alias ", " #name)
|
|
|
|
|
|
|
|
#ifdef CONFIG_HAVE_SYSCALL_WRAPPERS
|
|
|
|
|
|
|
|
#define SYSCALL_DEFINE(name) static inline long SYSC_##name
|
|
|
|
#define SYSCALL_DEFINEx(x, name, ...) \
|
|
|
|
asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__)); \
|
|
|
|
static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__)); \
|
|
|
|
asmlinkage long SyS_##name(__SC_LONG##x(__VA_ARGS__)) \
|
|
|
|
{ \
|
|
|
|
__SC_TEST##x(__VA_ARGS__); \
|
|
|
|
return (long) SYSC_##name(__SC_CAST##x(__VA_ARGS__)); \
|
|
|
|
} \
|
|
|
|
SYSCALL_ALIAS(sys_##name, SyS_##name); \
|
|
|
|
static inline long SYSC_##name(__SC_DECL##x(__VA_ARGS__))
|
|
|
|
|
|
|
|
#else /* CONFIG_HAVE_SYSCALL_WRAPPERS */
|
|
|
|
|
|
|
|
#define SYSCALL_DEFINE(name) asmlinkage long sys_##name
|
|
|
|
#define SYSCALL_DEFINEx(x, name, ...) \
|
|
|
|
asmlinkage long sys_##name(__SC_DECL##x(__VA_ARGS__))
|
|
|
|
|
|
|
|
#endif /* CONFIG_HAVE_SYSCALL_WRAPPERS */
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_time(time_t __user *tloc);
|
|
|
|
asmlinkage long sys_stime(time_t __user *tptr);
|
|
|
|
asmlinkage long sys_gettimeofday(struct timeval __user *tv,
|
|
|
|
struct timezone __user *tz);
|
|
|
|
asmlinkage long sys_settimeofday(struct timeval __user *tv,
|
|
|
|
struct timezone __user *tz);
|
|
|
|
asmlinkage long sys_adjtimex(struct timex __user *txc_p);
|
|
|
|
|
|
|
|
asmlinkage long sys_times(struct tms __user *tbuf);
|
|
|
|
|
|
|
|
asmlinkage long sys_gettid(void);
|
|
|
|
asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp);
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_alarm(unsigned int seconds);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_getpid(void);
|
|
|
|
asmlinkage long sys_getppid(void);
|
|
|
|
asmlinkage long sys_getuid(void);
|
|
|
|
asmlinkage long sys_geteuid(void);
|
|
|
|
asmlinkage long sys_getgid(void);
|
|
|
|
asmlinkage long sys_getegid(void);
|
|
|
|
asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid);
|
|
|
|
asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid);
|
|
|
|
asmlinkage long sys_getpgid(pid_t pid);
|
|
|
|
asmlinkage long sys_getpgrp(void);
|
|
|
|
asmlinkage long sys_getsid(pid_t pid);
|
|
|
|
asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist);
|
|
|
|
|
|
|
|
asmlinkage long sys_setregid(gid_t rgid, gid_t egid);
|
|
|
|
asmlinkage long sys_setgid(gid_t gid);
|
|
|
|
asmlinkage long sys_setreuid(uid_t ruid, uid_t euid);
|
|
|
|
asmlinkage long sys_setuid(uid_t uid);
|
|
|
|
asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid);
|
|
|
|
asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid);
|
|
|
|
asmlinkage long sys_setfsuid(uid_t uid);
|
|
|
|
asmlinkage long sys_setfsgid(gid_t gid);
|
|
|
|
asmlinkage long sys_setpgid(pid_t pid, pid_t pgid);
|
|
|
|
asmlinkage long sys_setsid(void);
|
|
|
|
asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist);
|
|
|
|
|
|
|
|
asmlinkage long sys_acct(const char __user *name);
|
|
|
|
asmlinkage long sys_capget(cap_user_header_t header,
|
|
|
|
cap_user_data_t dataptr);
|
|
|
|
asmlinkage long sys_capset(cap_user_header_t header,
|
|
|
|
const cap_user_data_t data);
|
|
|
|
asmlinkage long sys_personality(u_long personality);
|
|
|
|
|
|
|
|
asmlinkage long sys_sigpending(old_sigset_t __user *set);
|
|
|
|
asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set,
|
|
|
|
old_sigset_t __user *oset);
|
|
|
|
asmlinkage long sys_getitimer(int which, struct itimerval __user *value);
|
|
|
|
asmlinkage long sys_setitimer(int which,
|
|
|
|
struct itimerval __user *value,
|
|
|
|
struct itimerval __user *ovalue);
|
|
|
|
asmlinkage long sys_timer_create(clockid_t which_clock,
|
|
|
|
struct sigevent __user *timer_event_spec,
|
|
|
|
timer_t __user * created_timer_id);
|
|
|
|
asmlinkage long sys_timer_gettime(timer_t timer_id,
|
|
|
|
struct itimerspec __user *setting);
|
|
|
|
asmlinkage long sys_timer_getoverrun(timer_t timer_id);
|
|
|
|
asmlinkage long sys_timer_settime(timer_t timer_id, int flags,
|
|
|
|
const struct itimerspec __user *new_setting,
|
|
|
|
struct itimerspec __user *old_setting);
|
|
|
|
asmlinkage long sys_timer_delete(timer_t timer_id);
|
|
|
|
asmlinkage long sys_clock_settime(clockid_t which_clock,
|
|
|
|
const struct timespec __user *tp);
|
|
|
|
asmlinkage long sys_clock_gettime(clockid_t which_clock,
|
|
|
|
struct timespec __user *tp);
|
|
|
|
asmlinkage long sys_clock_getres(clockid_t which_clock,
|
|
|
|
struct timespec __user *tp);
|
|
|
|
asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
|
|
|
|
const struct timespec __user *rqtp,
|
|
|
|
struct timespec __user *rmtp);
|
|
|
|
|
|
|
|
asmlinkage long sys_nice(int increment);
|
|
|
|
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
|
|
|
|
struct sched_param __user *param);
|
|
|
|
asmlinkage long sys_sched_setparam(pid_t pid,
|
|
|
|
struct sched_param __user *param);
|
|
|
|
asmlinkage long sys_sched_getscheduler(pid_t pid);
|
|
|
|
asmlinkage long sys_sched_getparam(pid_t pid,
|
|
|
|
struct sched_param __user *param);
|
|
|
|
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
|
|
|
|
unsigned long __user *user_mask_ptr);
|
|
|
|
asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
|
|
|
|
unsigned long __user *user_mask_ptr);
|
|
|
|
asmlinkage long sys_sched_yield(void);
|
|
|
|
asmlinkage long sys_sched_get_priority_max(int policy);
|
|
|
|
asmlinkage long sys_sched_get_priority_min(int policy);
|
|
|
|
asmlinkage long sys_sched_rr_get_interval(pid_t pid,
|
|
|
|
struct timespec __user *interval);
|
|
|
|
asmlinkage long sys_setpriority(int which, int who, int niceval);
|
|
|
|
asmlinkage long sys_getpriority(int which, int who);
|
|
|
|
|
|
|
|
asmlinkage long sys_shutdown(int, int);
|
|
|
|
asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
|
|
|
|
void __user *arg);
|
|
|
|
asmlinkage long sys_restart_syscall(void);
|
2005-06-25 23:58:28 +02:00
|
|
|
asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
|
|
|
|
struct kexec_segment __user *segments,
|
|
|
|
unsigned long flags);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
asmlinkage long sys_exit(int error_code);
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_exit_group(int error_code);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
|
|
|
|
int options, struct rusage __user *ru);
|
|
|
|
asmlinkage long sys_waitid(int which, pid_t pid,
|
|
|
|
struct siginfo __user *infop,
|
|
|
|
int options, struct rusage __user *ru);
|
|
|
|
asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options);
|
|
|
|
asmlinkage long sys_set_tid_address(int __user *tidptr);
|
[PATCH] pi-futex: futex code cleanups
We are pleased to announce "lightweight userspace priority inheritance" (PI)
support for futexes. The following patchset and glibc patch implements it,
ontop of the robust-futexes patchset which is included in 2.6.16-mm1.
We are calling it lightweight for 3 reasons:
- in the user-space fastpath a PI-enabled futex involves no kernel work
(or any other PI complexity) at all. No registration, no extra kernel
calls - just pure fast atomic ops in userspace.
- in the slowpath (in the lock-contention case), the system call and
scheduling pattern is in fact better than that of normal futexes, due to
the 'integrated' nature of FUTEX_LOCK_PI. [more about that further down]
- the in-kernel PI implementation is streamlined around the mutex
abstraction, with strict rules that keep the implementation relatively
simple: only a single owner may own a lock (i.e. no read-write lock
support), only the owner may unlock a lock, no recursive locking, etc.
Priority Inheritance - why, oh why???
-------------------------------------
Many of you heard the horror stories about the evil PI code circling Linux for
years, which makes no real sense at all and is only used by buggy applications
and which has horrible overhead. Some of you have dreaded this very moment,
when someone actually submits working PI code ;-)
So why would we like to see PI support for futexes?
We'd like to see it done purely for technological reasons. We dont think it's
a buggy concept, we think it's useful functionality to offer to applications,
which functionality cannot be achieved in other ways. We also think it's the
right thing to do, and we think we've got the right arguments and the right
numbers to prove that. We also believe that we can address all the
counter-arguments as well. For these reasons (and the reasons outlined below)
we are submitting this patch-set for upstream kernel inclusion.
What are the benefits of PI?
The short reply:
----------------
User-space PI helps achieving/improving determinism for user-space
applications. In the best-case, it can help achieve determinism and
well-bound latencies. Even in the worst-case, PI will improve the statistical
distribution of locking related application delays.
The longer reply:
-----------------
Firstly, sharing locks between multiple tasks is a common programming
technique that often cannot be replaced with lockless algorithms. As we can
see it in the kernel [which is a quite complex program in itself], lockless
structures are rather the exception than the norm - the current ratio of
lockless vs. locky code for shared data structures is somewhere between 1:10
and 1:100. Lockless is hard, and the complexity of lockless algorithms often
endangers to ability to do robust reviews of said code. I.e. critical RT
apps often choose lock structures to protect critical data structures, instead
of lockless algorithms. Furthermore, there are cases (like shared hardware,
or other resource limits) where lockless access is mathematically impossible.
Media players (such as Jack) are an example of reasonable application design
with multiple tasks (with multiple priority levels) sharing short-held locks:
for example, a highprio audio playback thread is combined with medium-prio
construct-audio-data threads and low-prio display-colory-stuff threads. Add
video and decoding to the mix and we've got even more priority levels.
So once we accept that synchronization objects (locks) are an unavoidable fact
of life, and once we accept that multi-task userspace apps have a very fair
expectation of being able to use locks, we've got to think about how to offer
the option of a deterministic locking implementation to user-space.
Most of the technical counter-arguments against doing priority inheritance
only apply to kernel-space locks. But user-space locks are different, there
we cannot disable interrupts or make the task non-preemptible in a critical
section, so the 'use spinlocks' argument does not apply (user-space spinlocks
have the same priority inversion problems as other user-space locking
constructs). Fact is, pretty much the only technique that currently enables
good determinism for userspace locks (such as futex-based pthread mutexes) is
priority inheritance:
Currently (without PI), if a high-prio and a low-prio task shares a lock [this
is a quite common scenario for most non-trivial RT applications], even if all
critical sections are coded carefully to be deterministic (i.e. all critical
sections are short in duration and only execute a limited number of
instructions), the kernel cannot guarantee any deterministic execution of the
high-prio task: any medium-priority task could preempt the low-prio task while
it holds the shared lock and executes the critical section, and could delay it
indefinitely.
Implementation:
---------------
As mentioned before, the userspace fastpath of PI-enabled pthread mutexes
involves no kernel work at all - they behave quite similarly to normal
futex-based locks: a 0 value means unlocked, and a value==TID means locked.
(This is the same method as used by list-based robust futexes.) Userspace uses
atomic ops to lock/unlock these mutexes without entering the kernel.
To handle the slowpath, we have added two new futex ops:
FUTEX_LOCK_PI
FUTEX_UNLOCK_PI
If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to TID
fails], then FUTEX_LOCK_PI is called. The kernel does all the remaining work:
if there is no futex-queue attached to the futex address yet then the code
looks up the task that owns the futex [it has put its own TID into the futex
value], and attaches a 'PI state' structure to the futex-queue. The pi_state
includes an rt-mutex, which is a PI-aware, kernel-based synchronization
object. The 'other' task is made the owner of the rt-mutex, and the
FUTEX_WAITERS bit is atomically set in the futex value. Then this task tries
to lock the rt-mutex, on which it blocks. Once it returns, it has the mutex
acquired, and it sets the futex value to its own TID and returns. Userspace
has no other work to perform - it now owns the lock, and futex value contains
FUTEX_WAITERS|TID.
If the unlock side fastpath succeeds, [i.e. userspace manages to do a TID ->
0 atomic transition of the futex value], then no kernel work is triggered.
If the unlock fastpath fails (because the FUTEX_WAITERS bit is set), then
FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on the behalf of
userspace - and it also unlocks the attached pi_state->rt_mutex and thus wakes
up any potential waiters.
Note that under this approach, contrary to other PI-futex approaches, there is
no prior 'registration' of a PI-futex. [which is not quite possible anyway,
due to existing ABI properties of pthread mutexes.]
Also, under this scheme, 'robustness' and 'PI' are two orthogonal properties
of futexes, and all four combinations are possible: futex, robust-futex,
PI-futex, robust+PI-futex.
glibc support:
--------------
Ulrich Drepper and Jakub Jelinek have written glibc support for PI-futexes
(and robust futexes), enabling robust and PI (PTHREAD_PRIO_INHERIT) POSIX
mutexes. (PTHREAD_PRIO_PROTECT support will be added later on too, no
additional kernel changes are needed for that). [NOTE: The glibc patch is
obviously inofficial and unsupported without matching upstream kernel
functionality.]
the patch-queue and the glibc patch can also be downloaded from:
http://redhat.com/~mingo/PI-futex-patches/
Many thanks go to the people who helped us create this kernel feature: Steven
Rostedt, Esben Nielsen, Benedikt Spranger, Daniel Walker, John Cooper, Arjan
van de Ven, Oleg Nesterov and others. Credits for related prior projects goes
to Dirk Grambow, Inaky Perez-Gonzalez, Bill Huey and many others.
Clean up the futex code, before adding more features to it:
- use u32 as the futex field type - that's the ABI
- use __user and pointers to u32 instead of unsigned long
- code style / comment style cleanups
- rename hash-bucket name from 'bh' to 'hb'.
I checked the pre and post futex.o object files to make sure this
patch has no code effects.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:47 +02:00
|
|
|
asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
|
2005-04-17 00:20:36 +02:00
|
|
|
struct timespec __user *utime, u32 __user *uaddr2,
|
[PATCH] pi-futex: futex code cleanups
We are pleased to announce "lightweight userspace priority inheritance" (PI)
support for futexes. The following patchset and glibc patch implements it,
ontop of the robust-futexes patchset which is included in 2.6.16-mm1.
We are calling it lightweight for 3 reasons:
- in the user-space fastpath a PI-enabled futex involves no kernel work
(or any other PI complexity) at all. No registration, no extra kernel
calls - just pure fast atomic ops in userspace.
- in the slowpath (in the lock-contention case), the system call and
scheduling pattern is in fact better than that of normal futexes, due to
the 'integrated' nature of FUTEX_LOCK_PI. [more about that further down]
- the in-kernel PI implementation is streamlined around the mutex
abstraction, with strict rules that keep the implementation relatively
simple: only a single owner may own a lock (i.e. no read-write lock
support), only the owner may unlock a lock, no recursive locking, etc.
Priority Inheritance - why, oh why???
-------------------------------------
Many of you heard the horror stories about the evil PI code circling Linux for
years, which makes no real sense at all and is only used by buggy applications
and which has horrible overhead. Some of you have dreaded this very moment,
when someone actually submits working PI code ;-)
So why would we like to see PI support for futexes?
We'd like to see it done purely for technological reasons. We dont think it's
a buggy concept, we think it's useful functionality to offer to applications,
which functionality cannot be achieved in other ways. We also think it's the
right thing to do, and we think we've got the right arguments and the right
numbers to prove that. We also believe that we can address all the
counter-arguments as well. For these reasons (and the reasons outlined below)
we are submitting this patch-set for upstream kernel inclusion.
What are the benefits of PI?
The short reply:
----------------
User-space PI helps achieving/improving determinism for user-space
applications. In the best-case, it can help achieve determinism and
well-bound latencies. Even in the worst-case, PI will improve the statistical
distribution of locking related application delays.
The longer reply:
-----------------
Firstly, sharing locks between multiple tasks is a common programming
technique that often cannot be replaced with lockless algorithms. As we can
see it in the kernel [which is a quite complex program in itself], lockless
structures are rather the exception than the norm - the current ratio of
lockless vs. locky code for shared data structures is somewhere between 1:10
and 1:100. Lockless is hard, and the complexity of lockless algorithms often
endangers to ability to do robust reviews of said code. I.e. critical RT
apps often choose lock structures to protect critical data structures, instead
of lockless algorithms. Furthermore, there are cases (like shared hardware,
or other resource limits) where lockless access is mathematically impossible.
Media players (such as Jack) are an example of reasonable application design
with multiple tasks (with multiple priority levels) sharing short-held locks:
for example, a highprio audio playback thread is combined with medium-prio
construct-audio-data threads and low-prio display-colory-stuff threads. Add
video and decoding to the mix and we've got even more priority levels.
So once we accept that synchronization objects (locks) are an unavoidable fact
of life, and once we accept that multi-task userspace apps have a very fair
expectation of being able to use locks, we've got to think about how to offer
the option of a deterministic locking implementation to user-space.
Most of the technical counter-arguments against doing priority inheritance
only apply to kernel-space locks. But user-space locks are different, there
we cannot disable interrupts or make the task non-preemptible in a critical
section, so the 'use spinlocks' argument does not apply (user-space spinlocks
have the same priority inversion problems as other user-space locking
constructs). Fact is, pretty much the only technique that currently enables
good determinism for userspace locks (such as futex-based pthread mutexes) is
priority inheritance:
Currently (without PI), if a high-prio and a low-prio task shares a lock [this
is a quite common scenario for most non-trivial RT applications], even if all
critical sections are coded carefully to be deterministic (i.e. all critical
sections are short in duration and only execute a limited number of
instructions), the kernel cannot guarantee any deterministic execution of the
high-prio task: any medium-priority task could preempt the low-prio task while
it holds the shared lock and executes the critical section, and could delay it
indefinitely.
Implementation:
---------------
As mentioned before, the userspace fastpath of PI-enabled pthread mutexes
involves no kernel work at all - they behave quite similarly to normal
futex-based locks: a 0 value means unlocked, and a value==TID means locked.
(This is the same method as used by list-based robust futexes.) Userspace uses
atomic ops to lock/unlock these mutexes without entering the kernel.
To handle the slowpath, we have added two new futex ops:
FUTEX_LOCK_PI
FUTEX_UNLOCK_PI
If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to TID
fails], then FUTEX_LOCK_PI is called. The kernel does all the remaining work:
if there is no futex-queue attached to the futex address yet then the code
looks up the task that owns the futex [it has put its own TID into the futex
value], and attaches a 'PI state' structure to the futex-queue. The pi_state
includes an rt-mutex, which is a PI-aware, kernel-based synchronization
object. The 'other' task is made the owner of the rt-mutex, and the
FUTEX_WAITERS bit is atomically set in the futex value. Then this task tries
to lock the rt-mutex, on which it blocks. Once it returns, it has the mutex
acquired, and it sets the futex value to its own TID and returns. Userspace
has no other work to perform - it now owns the lock, and futex value contains
FUTEX_WAITERS|TID.
If the unlock side fastpath succeeds, [i.e. userspace manages to do a TID ->
0 atomic transition of the futex value], then no kernel work is triggered.
If the unlock fastpath fails (because the FUTEX_WAITERS bit is set), then
FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on the behalf of
userspace - and it also unlocks the attached pi_state->rt_mutex and thus wakes
up any potential waiters.
Note that under this approach, contrary to other PI-futex approaches, there is
no prior 'registration' of a PI-futex. [which is not quite possible anyway,
due to existing ABI properties of pthread mutexes.]
Also, under this scheme, 'robustness' and 'PI' are two orthogonal properties
of futexes, and all four combinations are possible: futex, robust-futex,
PI-futex, robust+PI-futex.
glibc support:
--------------
Ulrich Drepper and Jakub Jelinek have written glibc support for PI-futexes
(and robust futexes), enabling robust and PI (PTHREAD_PRIO_INHERIT) POSIX
mutexes. (PTHREAD_PRIO_PROTECT support will be added later on too, no
additional kernel changes are needed for that). [NOTE: The glibc patch is
obviously inofficial and unsupported without matching upstream kernel
functionality.]
the patch-queue and the glibc patch can also be downloaded from:
http://redhat.com/~mingo/PI-futex-patches/
Many thanks go to the people who helped us create this kernel feature: Steven
Rostedt, Esben Nielsen, Benedikt Spranger, Daniel Walker, John Cooper, Arjan
van de Ven, Oleg Nesterov and others. Credits for related prior projects goes
to Dirk Grambow, Inaky Perez-Gonzalez, Bill Huey and many others.
Clean up the futex code, before adding more features to it:
- use u32 as the futex field type - that's the ABI
- use __user and pointers to u32 instead of unsigned long
- code style / comment style cleanups
- rename hash-bucket name from 'bh' to 'hb'.
I checked the pre and post futex.o object files to make sure this
patch has no code effects.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 11:54:47 +02:00
|
|
|
u32 val3);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
asmlinkage long sys_init_module(void __user *umod, unsigned long len,
|
|
|
|
const char __user *uargs);
|
|
|
|
asmlinkage long sys_delete_module(const char __user *name_user,
|
|
|
|
unsigned int flags);
|
|
|
|
|
|
|
|
asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set,
|
|
|
|
sigset_t __user *oset, size_t sigsetsize);
|
|
|
|
asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize);
|
|
|
|
asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
|
|
|
|
siginfo_t __user *uinfo,
|
|
|
|
const struct timespec __user *uts,
|
|
|
|
size_t sigsetsize);
|
|
|
|
asmlinkage long sys_kill(int pid, int sig);
|
|
|
|
asmlinkage long sys_tgkill(int tgid, int pid, int sig);
|
|
|
|
asmlinkage long sys_tkill(int pid, int sig);
|
|
|
|
asmlinkage long sys_rt_sigqueueinfo(int pid, int sig, siginfo_t __user *uinfo);
|
|
|
|
asmlinkage long sys_sgetmask(void);
|
|
|
|
asmlinkage long sys_ssetmask(int newmask);
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_signal(int sig, __sighandler_t handler);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_pause(void);
|
|
|
|
|
|
|
|
asmlinkage long sys_sync(void);
|
|
|
|
asmlinkage long sys_fsync(unsigned int fd);
|
|
|
|
asmlinkage long sys_fdatasync(unsigned int fd);
|
|
|
|
asmlinkage long sys_bdflush(int func, long data);
|
|
|
|
asmlinkage long sys_mount(char __user *dev_name, char __user *dir_name,
|
|
|
|
char __user *type, unsigned long flags,
|
|
|
|
void __user *data);
|
|
|
|
asmlinkage long sys_umount(char __user *name, int flags);
|
|
|
|
asmlinkage long sys_oldumount(char __user *name);
|
|
|
|
asmlinkage long sys_truncate(const char __user *path,
|
|
|
|
unsigned long length);
|
|
|
|
asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
|
|
|
|
asmlinkage long sys_stat(char __user *filename,
|
|
|
|
struct __old_kernel_stat __user *statbuf);
|
|
|
|
asmlinkage long sys_statfs(const char __user * path,
|
|
|
|
struct statfs __user *buf);
|
|
|
|
asmlinkage long sys_statfs64(const char __user *path, size_t sz,
|
|
|
|
struct statfs64 __user *buf);
|
|
|
|
asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
|
|
|
|
asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
|
|
|
|
struct statfs64 __user *buf);
|
|
|
|
asmlinkage long sys_lstat(char __user *filename,
|
|
|
|
struct __old_kernel_stat __user *statbuf);
|
|
|
|
asmlinkage long sys_fstat(unsigned int fd,
|
|
|
|
struct __old_kernel_stat __user *statbuf);
|
|
|
|
asmlinkage long sys_newstat(char __user *filename,
|
|
|
|
struct stat __user *statbuf);
|
|
|
|
asmlinkage long sys_newlstat(char __user *filename,
|
|
|
|
struct stat __user *statbuf);
|
|
|
|
asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
|
|
|
|
asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf);
|
|
|
|
#if BITS_PER_LONG == 32
|
|
|
|
asmlinkage long sys_stat64(char __user *filename,
|
|
|
|
struct stat64 __user *statbuf);
|
|
|
|
asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
|
|
|
|
asmlinkage long sys_lstat64(char __user *filename,
|
|
|
|
struct stat64 __user *statbuf);
|
|
|
|
asmlinkage long sys_truncate64(const char __user *path, loff_t length);
|
|
|
|
asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length);
|
|
|
|
#endif
|
|
|
|
|
2008-04-29 09:59:41 +02:00
|
|
|
asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
|
|
|
|
const void __user *value, size_t size, int flags);
|
|
|
|
asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name,
|
|
|
|
const void __user *value, size_t size, int flags);
|
|
|
|
asmlinkage long sys_fsetxattr(int fd, const char __user *name,
|
|
|
|
const void __user *value, size_t size, int flags);
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_getxattr(const char __user *path, const char __user *name,
|
|
|
|
void __user *value, size_t size);
|
|
|
|
asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name,
|
|
|
|
void __user *value, size_t size);
|
|
|
|
asmlinkage long sys_fgetxattr(int fd, const char __user *name,
|
|
|
|
void __user *value, size_t size);
|
|
|
|
asmlinkage long sys_listxattr(const char __user *path, char __user *list,
|
|
|
|
size_t size);
|
|
|
|
asmlinkage long sys_llistxattr(const char __user *path, char __user *list,
|
|
|
|
size_t size);
|
|
|
|
asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size);
|
2008-04-29 09:59:41 +02:00
|
|
|
asmlinkage long sys_removexattr(const char __user *path,
|
|
|
|
const char __user *name);
|
|
|
|
asmlinkage long sys_lremovexattr(const char __user *path,
|
|
|
|
const char __user *name);
|
|
|
|
asmlinkage long sys_fremovexattr(int fd, const char __user *name);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_brk(unsigned long brk);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_mprotect(unsigned long start, size_t len,
|
|
|
|
unsigned long prot);
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_mremap(unsigned long addr,
|
|
|
|
unsigned long old_len, unsigned long new_len,
|
|
|
|
unsigned long flags, unsigned long new_addr);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
|
|
|
|
unsigned long prot, unsigned long pgoff,
|
|
|
|
unsigned long flags);
|
|
|
|
asmlinkage long sys_msync(unsigned long start, size_t len, int flags);
|
|
|
|
asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice);
|
|
|
|
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice);
|
|
|
|
asmlinkage long sys_munmap(unsigned long addr, size_t len);
|
|
|
|
asmlinkage long sys_mlock(unsigned long start, size_t len);
|
|
|
|
asmlinkage long sys_munlock(unsigned long start, size_t len);
|
|
|
|
asmlinkage long sys_mlockall(int flags);
|
|
|
|
asmlinkage long sys_munlockall(void);
|
|
|
|
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
|
|
|
|
asmlinkage long sys_mincore(unsigned long start, size_t len,
|
|
|
|
unsigned char __user * vec);
|
|
|
|
|
|
|
|
asmlinkage long sys_pivot_root(const char __user *new_root,
|
|
|
|
const char __user *put_old);
|
|
|
|
asmlinkage long sys_chroot(const char __user *filename);
|
|
|
|
asmlinkage long sys_mknod(const char __user *filename, int mode,
|
|
|
|
unsigned dev);
|
|
|
|
asmlinkage long sys_link(const char __user *oldname,
|
|
|
|
const char __user *newname);
|
|
|
|
asmlinkage long sys_symlink(const char __user *old, const char __user *new);
|
|
|
|
asmlinkage long sys_unlink(const char __user *pathname);
|
|
|
|
asmlinkage long sys_rename(const char __user *oldname,
|
|
|
|
const char __user *newname);
|
|
|
|
asmlinkage long sys_chmod(const char __user *filename, mode_t mode);
|
|
|
|
asmlinkage long sys_fchmod(unsigned int fd, mode_t mode);
|
|
|
|
|
|
|
|
asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg);
|
|
|
|
#if BITS_PER_LONG == 32
|
|
|
|
asmlinkage long sys_fcntl64(unsigned int fd,
|
|
|
|
unsigned int cmd, unsigned long arg);
|
|
|
|
#endif
|
|
|
|
asmlinkage long sys_dup(unsigned int fildes);
|
|
|
|
asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd);
|
2008-07-24 06:29:29 +02:00
|
|
|
asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
|
|
|
|
asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd,
|
|
|
|
unsigned long arg);
|
|
|
|
asmlinkage long sys_flock(unsigned int fd, unsigned int cmd);
|
|
|
|
asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx);
|
|
|
|
asmlinkage long sys_io_destroy(aio_context_t ctx);
|
|
|
|
asmlinkage long sys_io_getevents(aio_context_t ctx_id,
|
|
|
|
long min_nr,
|
|
|
|
long nr,
|
|
|
|
struct io_event __user *events,
|
|
|
|
struct timespec __user *timeout);
|
|
|
|
asmlinkage long sys_io_submit(aio_context_t, long,
|
|
|
|
struct iocb __user * __user *);
|
|
|
|
asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
|
|
|
|
struct io_event __user *result);
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_sendfile(int out_fd, int in_fd,
|
|
|
|
off_t __user *offset, size_t count);
|
|
|
|
asmlinkage long sys_sendfile64(int out_fd, int in_fd,
|
|
|
|
loff_t __user *offset, size_t count);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_readlink(const char __user *path,
|
|
|
|
char __user *buf, int bufsiz);
|
|
|
|
asmlinkage long sys_creat(const char __user *pathname, int mode);
|
|
|
|
asmlinkage long sys_open(const char __user *filename,
|
|
|
|
int flags, int mode);
|
|
|
|
asmlinkage long sys_close(unsigned int fd);
|
|
|
|
asmlinkage long sys_access(const char __user *filename, int mode);
|
|
|
|
asmlinkage long sys_vhangup(void);
|
|
|
|
asmlinkage long sys_chown(const char __user *filename,
|
|
|
|
uid_t user, gid_t group);
|
|
|
|
asmlinkage long sys_lchown(const char __user *filename,
|
|
|
|
uid_t user, gid_t group);
|
|
|
|
asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
|
|
|
|
#ifdef CONFIG_UID16
|
|
|
|
asmlinkage long sys_chown16(const char __user *filename,
|
|
|
|
old_uid_t user, old_gid_t group);
|
|
|
|
asmlinkage long sys_lchown16(const char __user *filename,
|
|
|
|
old_uid_t user, old_gid_t group);
|
|
|
|
asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group);
|
|
|
|
asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid);
|
|
|
|
asmlinkage long sys_setgid16(old_gid_t gid);
|
|
|
|
asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid);
|
|
|
|
asmlinkage long sys_setuid16(old_uid_t uid);
|
|
|
|
asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid);
|
|
|
|
asmlinkage long sys_getresuid16(old_uid_t __user *ruid,
|
|
|
|
old_uid_t __user *euid, old_uid_t __user *suid);
|
|
|
|
asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid);
|
|
|
|
asmlinkage long sys_getresgid16(old_gid_t __user *rgid,
|
|
|
|
old_gid_t __user *egid, old_gid_t __user *sgid);
|
|
|
|
asmlinkage long sys_setfsuid16(old_uid_t uid);
|
|
|
|
asmlinkage long sys_setfsgid16(old_gid_t gid);
|
|
|
|
asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist);
|
|
|
|
asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist);
|
|
|
|
asmlinkage long sys_getuid16(void);
|
|
|
|
asmlinkage long sys_geteuid16(void);
|
|
|
|
asmlinkage long sys_getgid16(void);
|
|
|
|
asmlinkage long sys_getegid16(void);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
asmlinkage long sys_utime(char __user *filename,
|
|
|
|
struct utimbuf __user *times);
|
|
|
|
asmlinkage long sys_utimes(char __user *filename,
|
|
|
|
struct timeval __user *utimes);
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_lseek(unsigned int fd, off_t offset,
|
|
|
|
unsigned int origin);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
|
|
|
|
unsigned long offset_low, loff_t __user *result,
|
|
|
|
unsigned int origin);
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count);
|
|
|
|
asmlinkage long sys_readahead(int fd, loff_t offset, size_t count);
|
|
|
|
asmlinkage long sys_readv(unsigned long fd,
|
|
|
|
const struct iovec __user *vec,
|
|
|
|
unsigned long vlen);
|
|
|
|
asmlinkage long sys_write(unsigned int fd, const char __user *buf,
|
|
|
|
size_t count);
|
|
|
|
asmlinkage long sys_writev(unsigned long fd,
|
|
|
|
const struct iovec __user *vec,
|
|
|
|
unsigned long vlen);
|
|
|
|
asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
|
|
|
|
size_t count, loff_t pos);
|
|
|
|
asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
|
|
|
|
size_t count, loff_t pos);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
|
|
|
|
asmlinkage long sys_mkdir(const char __user *pathname, int mode);
|
|
|
|
asmlinkage long sys_chdir(const char __user *filename);
|
|
|
|
asmlinkage long sys_fchdir(unsigned int fd);
|
|
|
|
asmlinkage long sys_rmdir(const char __user *pathname);
|
|
|
|
asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len);
|
|
|
|
asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special,
|
|
|
|
qid_t id, void __user *addr);
|
|
|
|
asmlinkage long sys_getdents(unsigned int fd,
|
|
|
|
struct linux_dirent __user *dirent,
|
|
|
|
unsigned int count);
|
|
|
|
asmlinkage long sys_getdents64(unsigned int fd,
|
|
|
|
struct linux_dirent64 __user *dirent,
|
|
|
|
unsigned int count);
|
|
|
|
|
|
|
|
asmlinkage long sys_setsockopt(int fd, int level, int optname,
|
|
|
|
char __user *optval, int optlen);
|
|
|
|
asmlinkage long sys_getsockopt(int fd, int level, int optname,
|
|
|
|
char __user *optval, int __user *optlen);
|
|
|
|
asmlinkage long sys_bind(int, struct sockaddr __user *, int);
|
|
|
|
asmlinkage long sys_connect(int, struct sockaddr __user *, int);
|
|
|
|
asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *);
|
reintroduce accept4
Introduce a new accept4() system call. The addition of this system call
matches analogous changes in 2.6.27 (dup3(), evenfd2(), signalfd4(),
inotify_init1(), epoll_create1(), pipe2()) which added new system calls
that differed from analogous traditional system calls in adding a flags
argument that can be used to access additional functionality.
The accept4() system call is exactly the same as accept(), except that
it adds a flags bit-mask argument. Two flags are initially implemented.
(Most of the new system calls in 2.6.27 also had both of these flags.)
SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled
for the new file descriptor returned by accept4(). This is a useful
security feature to avoid leaking information in a multithreaded
program where one thread is doing an accept() at the same time as
another thread is doing a fork() plus exec(). More details here:
http://udrepper.livejournal.com/20407.html "Secure File Descriptor Handling",
Ulrich Drepper).
The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag
to be enabled on the new open file description created by accept4().
(This flag is merely a convenience, saving the use of additional calls
fcntl(F_GETFL) and fcntl (F_SETFL) to achieve the same result.
Here's a test program. Works on x86-32. Should work on x86-64, but
I (mtk) don't have a system to hand to test with.
It tests accept4() with each of the four possible combinations of
SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies
that the appropriate flags are set on the file descriptor/open file
description returned by accept4().
I tested Ulrich's patch in this thread by applying against 2.6.28-rc2,
and it passes according to my test program.
/* test_accept4.c
Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk
<mtk.manpages@gmail.com>
Licensed under the GNU GPLv2 or later.
*/
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#define PORT_NUM 33333
#define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0)
/**********************************************************************/
/* The following is what we need until glibc gets a wrapper for
accept4() */
/* Flags for socket(), socketpair(), accept4() */
#ifndef SOCK_CLOEXEC
#define SOCK_CLOEXEC O_CLOEXEC
#endif
#ifndef SOCK_NONBLOCK
#define SOCK_NONBLOCK O_NONBLOCK
#endif
#ifdef __x86_64__
#define SYS_accept4 288
#elif __i386__
#define USE_SOCKETCALL 1
#define SYS_ACCEPT4 18
#else
#error "Sorry -- don't know the syscall # on this architecture"
#endif
static int
accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags)
{
printf("Calling accept4(): flags = %x", flags);
if (flags != 0) {
printf(" (");
if (flags & SOCK_CLOEXEC)
printf("SOCK_CLOEXEC");
if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK))
printf(" ");
if (flags & SOCK_NONBLOCK)
printf("SOCK_NONBLOCK");
printf(")");
}
printf("\n");
#if USE_SOCKETCALL
long args[6];
args[0] = fd;
args[1] = (long) sockaddr;
args[2] = (long) addrlen;
args[3] = flags;
return syscall(SYS_socketcall, SYS_ACCEPT4, args);
#else
return syscall(SYS_accept4, fd, sockaddr, addrlen, flags);
#endif
}
/**********************************************************************/
static int
do_test(int lfd, struct sockaddr_in *conn_addr,
int closeonexec_flag, int nonblock_flag)
{
int connfd, acceptfd;
int fdf, flf, fdf_pass, flf_pass;
struct sockaddr_in claddr;
socklen_t addrlen;
printf("=======================================\n");
connfd = socket(AF_INET, SOCK_STREAM, 0);
if (connfd == -1)
die("socket");
if (connect(connfd, (struct sockaddr *) conn_addr,
sizeof(struct sockaddr_in)) == -1)
die("connect");
addrlen = sizeof(struct sockaddr_in);
acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen,
closeonexec_flag | nonblock_flag);
if (acceptfd == -1) {
perror("accept4()");
close(connfd);
return 0;
}
fdf = fcntl(acceptfd, F_GETFD);
if (fdf == -1)
die("fcntl:F_GETFD");
fdf_pass = ((fdf & FD_CLOEXEC) != 0) ==
((closeonexec_flag & SOCK_CLOEXEC) != 0);
printf("Close-on-exec flag is %sset (%s); ",
(fdf & FD_CLOEXEC) ? "" : "not ",
fdf_pass ? "OK" : "failed");
flf = fcntl(acceptfd, F_GETFL);
if (flf == -1)
die("fcntl:F_GETFD");
flf_pass = ((flf & O_NONBLOCK) != 0) ==
((nonblock_flag & SOCK_NONBLOCK) !=0);
printf("nonblock flag is %sset (%s)\n",
(flf & O_NONBLOCK) ? "" : "not ",
flf_pass ? "OK" : "failed");
close(acceptfd);
close(connfd);
printf("Test result: %s\n", (fdf_pass && flf_pass) ? "PASS" : "FAIL");
return fdf_pass && flf_pass;
}
static int
create_listening_socket(int port_num)
{
struct sockaddr_in svaddr;
int lfd;
int optval;
memset(&svaddr, 0, sizeof(struct sockaddr_in));
svaddr.sin_family = AF_INET;
svaddr.sin_addr.s_addr = htonl(INADDR_ANY);
svaddr.sin_port = htons(port_num);
lfd = socket(AF_INET, SOCK_STREAM, 0);
if (lfd == -1)
die("socket");
optval = 1;
if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval,
sizeof(optval)) == -1)
die("setsockopt");
if (bind(lfd, (struct sockaddr *) &svaddr,
sizeof(struct sockaddr_in)) == -1)
die("bind");
if (listen(lfd, 5) == -1)
die("listen");
return lfd;
}
int
main(int argc, char *argv[])
{
struct sockaddr_in conn_addr;
int lfd;
int port_num;
int passed;
passed = 1;
port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM;
memset(&conn_addr, 0, sizeof(struct sockaddr_in));
conn_addr.sin_family = AF_INET;
conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
conn_addr.sin_port = htons(port_num);
lfd = create_listening_socket(port_num);
if (!do_test(lfd, &conn_addr, 0, 0))
passed = 0;
if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0))
passed = 0;
if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK))
passed = 0;
if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK))
passed = 0;
close(lfd);
exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);
}
[mtk.manpages@gmail.com: rewrote changelog, updated test program]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <linux-api@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-11-20 00:36:14 +01:00
|
|
|
asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *);
|
|
|
|
asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *);
|
|
|
|
asmlinkage long sys_send(int, void __user *, size_t, unsigned);
|
|
|
|
asmlinkage long sys_sendto(int, void __user *, size_t, unsigned,
|
|
|
|
struct sockaddr __user *, int);
|
|
|
|
asmlinkage long sys_sendmsg(int fd, struct msghdr __user *msg, unsigned flags);
|
|
|
|
asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
|
|
|
|
asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
|
|
|
|
struct sockaddr __user *, int __user *);
|
|
|
|
asmlinkage long sys_recvmsg(int fd, struct msghdr __user *msg, unsigned flags);
|
|
|
|
asmlinkage long sys_socket(int, int, int);
|
|
|
|
asmlinkage long sys_socketpair(int, int, int, int __user *);
|
|
|
|
asmlinkage long sys_socketcall(int call, unsigned long __user *args);
|
|
|
|
asmlinkage long sys_listen(int, int);
|
|
|
|
asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
|
|
|
|
long timeout);
|
|
|
|
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
|
|
|
|
fd_set __user *exp, struct timeval __user *tvp);
|
|
|
|
asmlinkage long sys_epoll_create(int size);
|
2008-07-24 06:29:43 +02:00
|
|
|
asmlinkage long sys_epoll_create1(int flags);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
|
|
|
|
struct epoll_event __user *event);
|
|
|
|
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
|
|
|
|
int maxevents, int timeout);
|
2006-10-11 10:21:44 +02:00
|
|
|
asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
|
|
|
|
int maxevents, int timeout,
|
|
|
|
const sigset_t __user *sigmask,
|
|
|
|
size_t sigsetsize);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_gethostname(char __user *name, int len);
|
|
|
|
asmlinkage long sys_sethostname(char __user *name, int len);
|
|
|
|
asmlinkage long sys_setdomainname(char __user *name, int len);
|
|
|
|
asmlinkage long sys_newuname(struct new_utsname __user *name);
|
|
|
|
|
|
|
|
asmlinkage long sys_getrlimit(unsigned int resource,
|
|
|
|
struct rlimit __user *rlim);
|
2008-07-24 06:28:50 +02:00
|
|
|
#if defined(COMPAT_RLIM_OLD_INFINITY) || !(defined(CONFIG_IA64))
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim);
|
|
|
|
#endif
|
|
|
|
asmlinkage long sys_setrlimit(unsigned int resource,
|
|
|
|
struct rlimit __user *rlim);
|
|
|
|
asmlinkage long sys_getrusage(int who, struct rusage __user *ru);
|
|
|
|
asmlinkage long sys_umask(int mask);
|
|
|
|
|
|
|
|
asmlinkage long sys_msgget(key_t key, int msgflg);
|
|
|
|
asmlinkage long sys_msgsnd(int msqid, struct msgbuf __user *msgp,
|
|
|
|
size_t msgsz, int msgflg);
|
|
|
|
asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp,
|
|
|
|
size_t msgsz, long msgtyp, int msgflg);
|
|
|
|
asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf);
|
|
|
|
|
|
|
|
asmlinkage long sys_semget(key_t key, int nsems, int semflg);
|
|
|
|
asmlinkage long sys_semop(int semid, struct sembuf __user *sops,
|
|
|
|
unsigned nsops);
|
|
|
|
asmlinkage long sys_semctl(int semid, int semnum, int cmd, union semun arg);
|
|
|
|
asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops,
|
|
|
|
unsigned nsops,
|
|
|
|
const struct timespec __user *timeout);
|
2005-05-01 17:59:12 +02:00
|
|
|
asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_shmget(key_t key, size_t size, int flag);
|
|
|
|
asmlinkage long sys_shmdt(char __user *shmaddr);
|
|
|
|
asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
|
|
|
|
|
|
|
|
asmlinkage long sys_mq_open(const char __user *name, int oflag, mode_t mode, struct mq_attr __user *attr);
|
|
|
|
asmlinkage long sys_mq_unlink(const char __user *name);
|
|
|
|
asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
|
2009-01-14 14:13:54 +01:00
|
|
|
asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
|
2005-04-17 00:20:36 +02:00
|
|
|
asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
|
|
|
|
asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
|
|
|
|
|
|
|
|
asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn);
|
|
|
|
asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
|
|
|
|
unsigned long off, unsigned long len,
|
|
|
|
void __user *buf);
|
|
|
|
asmlinkage long sys_pciconfig_write(unsigned long bus, unsigned long dfn,
|
|
|
|
unsigned long off, unsigned long len,
|
|
|
|
void __user *buf);
|
|
|
|
|
|
|
|
asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
|
|
|
|
unsigned long arg4, unsigned long arg5);
|
|
|
|
asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
|
|
|
|
asmlinkage long sys_swapoff(const char __user *specialfile);
|
|
|
|
asmlinkage long sys_sysctl(struct __sysctl_args __user *args);
|
|
|
|
asmlinkage long sys_sysinfo(struct sysinfo __user *info);
|
|
|
|
asmlinkage long sys_sysfs(int option,
|
|
|
|
unsigned long arg1, unsigned long arg2);
|
|
|
|
asmlinkage long sys_nfsservctl(int cmd,
|
|
|
|
struct nfsctl_arg __user *arg,
|
|
|
|
void __user *res);
|
|
|
|
asmlinkage long sys_syslog(int type, char __user *buf, int len);
|
|
|
|
asmlinkage long sys_uselib(const char __user *library);
|
|
|
|
asmlinkage long sys_ni_syscall(void);
|
2005-10-31 00:02:22 +01:00
|
|
|
asmlinkage long sys_ptrace(long request, long pid, long addr, long data);
|
2005-04-17 00:20:36 +02:00
|
|
|
|
|
|
|
asmlinkage long sys_add_key(const char __user *_type,
|
|
|
|
const char __user *_description,
|
|
|
|
const void __user *_payload,
|
|
|
|
size_t plen,
|
|
|
|
key_serial_t destringid);
|
|
|
|
|
|
|
|
asmlinkage long sys_request_key(const char __user *_type,
|
|
|
|
const char __user *_description,
|
|
|
|
const char __user *_callout_info,
|
|
|
|
key_serial_t destringid);
|
|
|
|
|
|
|
|
asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
|
|
|
|
unsigned long arg4, unsigned long arg5);
|
|
|
|
|
2005-07-08 02:56:13 +02:00
|
|
|
asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
|
|
|
|
asmlinkage long sys_ioprio_get(int which, int who);
|
2005-09-21 18:55:43 +02:00
|
|
|
asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
|
2006-01-19 02:43:04 +01:00
|
|
|
unsigned long maxnode);
|
2006-01-08 10:00:51 +01:00
|
|
|
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
|
2006-01-19 02:43:04 +01:00
|
|
|
const unsigned long __user *from,
|
|
|
|
const unsigned long __user *to);
|
2006-06-23 11:03:55 +02:00
|
|
|
asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
|
|
|
|
const void __user * __user *pages,
|
|
|
|
const int __user *nodes,
|
|
|
|
int __user *status,
|
|
|
|
int flags);
|
2006-01-19 02:43:04 +01:00
|
|
|
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
|
|
|
|
unsigned long mode,
|
|
|
|
unsigned long __user *nmask,
|
|
|
|
unsigned long maxnode,
|
|
|
|
unsigned flags);
|
|
|
|
asmlinkage long sys_get_mempolicy(int __user *policy,
|
|
|
|
unsigned long __user *nmask,
|
|
|
|
unsigned long maxnode,
|
|
|
|
unsigned long addr, unsigned long flags);
|
|
|
|
|
|
|
|
asmlinkage long sys_inotify_init(void);
|
2008-07-24 06:29:32 +02:00
|
|
|
asmlinkage long sys_inotify_init1(int flags);
|
2006-01-19 02:43:04 +01:00
|
|
|
asmlinkage long sys_inotify_add_watch(int fd, const char __user *path,
|
|
|
|
u32 mask);
|
2009-01-05 13:19:16 +01:00
|
|
|
asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd);
|
2005-07-08 02:56:13 +02:00
|
|
|
|
2005-11-15 21:53:48 +01:00
|
|
|
asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
|
|
|
|
__u32 __user *ustatus);
|
|
|
|
asmlinkage long sys_spu_create(const char __user *name,
|
2007-07-20 21:39:47 +02:00
|
|
|
unsigned int flags, mode_t mode, int fd);
|
2005-11-15 21:53:48 +01:00
|
|
|
|
2006-02-01 12:04:33 +01:00
|
|
|
asmlinkage long sys_mknodat(int dfd, const char __user * filename, int mode,
|
|
|
|
unsigned dev);
|
|
|
|
asmlinkage long sys_mkdirat(int dfd, const char __user * pathname, int mode);
|
|
|
|
asmlinkage long sys_unlinkat(int dfd, const char __user * pathname, int flag);
|
|
|
|
asmlinkage long sys_symlinkat(const char __user * oldname,
|
|
|
|
int newdfd, const char __user * newname);
|
|
|
|
asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
|
2006-02-24 22:04:21 +01:00
|
|
|
int newdfd, const char __user *newname, int flags);
|
2006-02-01 12:04:33 +01:00
|
|
|
asmlinkage long sys_renameat(int olddfd, const char __user * oldname,
|
|
|
|
int newdfd, const char __user * newname);
|
|
|
|
asmlinkage long sys_futimesat(int dfd, char __user *filename,
|
|
|
|
struct timeval __user *utimes);
|
|
|
|
asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode);
|
|
|
|
asmlinkage long sys_fchmodat(int dfd, const char __user * filename,
|
|
|
|
mode_t mode);
|
|
|
|
asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
|
|
|
|
gid_t group, int flag);
|
|
|
|
asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
|
|
|
|
int mode);
|
|
|
|
asmlinkage long sys_newfstatat(int dfd, char __user *filename,
|
|
|
|
struct stat __user *statbuf, int flag);
|
2006-02-12 02:55:47 +01:00
|
|
|
asmlinkage long sys_fstatat64(int dfd, char __user *filename,
|
|
|
|
struct stat64 __user *statbuf, int flag);
|
2006-02-01 12:04:33 +01:00
|
|
|
asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
|
|
|
|
int bufsiz);
|
2007-05-09 11:32:35 +02:00
|
|
|
asmlinkage long sys_utimensat(int dfd, char __user *filename,
|
|
|
|
struct timespec __user *utimes, int flags);
|
2006-03-24 12:15:08 +01:00
|
|
|
asmlinkage long sys_unshare(unsigned long unshare_flags);
|
2006-04-10 15:18:58 +02:00
|
|
|
|
|
|
|
asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
|
|
|
|
int fd_out, loff_t __user *off_out,
|
|
|
|
size_t len, unsigned int flags);
|
|
|
|
|
2006-04-26 10:59:21 +02:00
|
|
|
asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
|
|
|
|
unsigned long nr_segs, unsigned int flags);
|
|
|
|
|
2006-04-11 15:51:17 +02:00
|
|
|
asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
|
|
|
|
|
2006-03-31 12:30:42 +02:00
|
|
|
asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
|
2006-04-11 07:53:57 +02:00
|
|
|
unsigned int flags);
|
Introduce fixed sys_sync_file_range2() syscall, implement on PowerPC and ARM
Not all the world is an i386. Many architectures need 64-bit arguments to be
aligned in suitable pairs of registers, and the original
sys_sync_file_range(int, loff_t, loff_t, int) was therefore wasting an
argument register for padding after the first integer. Since we don't
normally have more than 6 arguments for system calls, that left no room for
the final argument on some architectures.
Fix this by introducing sys_sync_file_range2(int, int, loff_t, loff_t) which
all fits nicely. In fact, ARM already had that, but called it
sys_arm_sync_file_range. Move it to fs/sync.c and rename it, then implement
the needed compatibility routine. And stop the missing syscall check from
bitching about the absence of sys_sync_file_range() if we've implemented
sys_sync_file_range2() instead.
Tested on PPC32 and with 32-bit and 64-bit userspace on PPC64.
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-27 23:10:09 +02:00
|
|
|
asmlinkage long sys_sync_file_range2(int fd, unsigned int flags,
|
|
|
|
loff_t offset, loff_t nbytes);
|
2006-05-23 16:46:40 +02:00
|
|
|
asmlinkage long sys_get_robust_list(int pid,
|
2006-10-10 23:46:07 +02:00
|
|
|
struct robust_list_head __user * __user *head_ptr,
|
2006-05-23 16:46:40 +02:00
|
|
|
size_t __user *len_ptr);
|
|
|
|
asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
|
|
|
|
size_t len);
|
2006-09-29 10:58:35 +02:00
|
|
|
asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
|
signal/timer/event: signalfd core
This patch series implements the new signalfd() system call.
I took part of the original Linus code (and you know how badly it can be
broken :), and I added even more breakage ;) Signals are fetched from the same
signal queue used by the process, so signalfd will compete with standard
kernel delivery in dequeue_signal(). If you want to reliably fetch signals on
the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This
seems to be working fine on my Dual Opteron machine. I made a quick test
program for it:
http://www.xmailserver.org/signafd-test.c
The signalfd() system call implements signal delivery into a file descriptor
receiver. The signalfd file descriptor if created with the following API:
int signalfd(int ufd, const sigset_t *mask, size_t masksize);
The "ufd" parameter allows to change an existing signalfd sigmask, w/out going
to close/create cycle (Linus idea). Use "ufd" == -1 if you want a brand new
signalfd file.
The "mask" allows to specify the signal mask of signals that we are interested
in. The "masksize" parameter is the size of "mask".
The signalfd fd supports the poll(2) and read(2) system calls. The poll(2)
will return POLLIN when signals are available to be dequeued. As a direct
consequence of supporting the Linux poll subsystem, the signalfd fd can use
used together with epoll(2) too.
The read(2) system call will return a "struct signalfd_siginfo" structure in
the userspace supplied buffer. The return value is the number of bytes copied
in the supplied buffer, or -1 in case of error. The read(2) call can also
return 0, in case the sighand structure to which the signalfd was attached,
has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will
return -EAGAIN in case no signal is available.
If the size of the buffer passed to read(2) is lower than sizeof(struct
signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also
return -ERESTARTSYS in case a signal hits the process. The format of the
struct signalfd_siginfo is, and the valid fields depends of the (->code &
__SI_MASK) value, in the same way a struct siginfo would:
struct signalfd_siginfo {
__u32 signo; /* si_signo */
__s32 err; /* si_errno */
__s32 code; /* si_code */
__u32 pid; /* si_pid */
__u32 uid; /* si_uid */
__s32 fd; /* si_fd */
__u32 tid; /* si_fd */
__u32 band; /* si_band */
__u32 overrun; /* si_overrun */
__u32 trapno; /* si_trapno */
__s32 status; /* si_status */
__s32 svint; /* si_int */
__u64 svptr; /* si_ptr */
__u64 utime; /* si_utime */
__u64 stime; /* si_stime */
__u64 addr; /* si_addr */
};
[akpm@linux-foundation.org: fix signalfd_copyinfo() on i386]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-11 07:23:13 +02:00
|
|
|
asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask);
|
flag parameters: signalfd
This patch adds the new signalfd4 syscall. It extends the old signalfd
syscall by one parameter which is meant to hold a flag value. In this
patch the only flag support is SFD_CLOEXEC which causes the close-on-exec
flag for the returned file descriptor to be set.
A new name SFD_CLOEXEC is introduced which in this implementation must
have the same value as O_CLOEXEC.
The following test must be adjusted for architectures other than x86 and
x86-64 and in case the syscall numbers changed.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#ifndef __NR_signalfd4
# ifdef __x86_64__
# define __NR_signalfd4 289
# elif defined __i386__
# define __NR_signalfd4 327
# else
# error "need __NR_signalfd4"
# endif
#endif
#define SFD_CLOEXEC O_CLOEXEC
int
main (void)
{
sigset_t ss;
sigemptyset (&ss);
sigaddset (&ss, SIGUSR1);
int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0);
if (fd == -1)
{
puts ("signalfd4(0) failed");
return 1;
}
int coe = fcntl (fd, F_GETFD);
if (coe == -1)
{
puts ("fcntl failed");
return 1;
}
if (coe & FD_CLOEXEC)
{
puts ("signalfd4(0) set close-on-exec flag");
return 1;
}
close (fd);
fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_CLOEXEC);
if (fd == -1)
{
puts ("signalfd4(SFD_CLOEXEC) failed");
return 1;
}
coe = fcntl (fd, F_GETFD);
if (coe == -1)
{
puts ("fcntl failed");
return 1;
}
if ((coe & FD_CLOEXEC) == 0)
{
puts ("signalfd4(SFD_CLOEXEC) does not set close-on-exec flag");
return 1;
}
close (fd);
puts ("OK");
return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[akpm@linux-foundation.org: add sys_ni stub]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 06:29:24 +02:00
|
|
|
asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags);
|
timerfd: new timerfd API
This is the new timerfd API as it is implemented by the following patch:
int timerfd_create(int clockid, int flags);
int timerfd_settime(int ufd, int flags,
const struct itimerspec *utmr,
struct itimerspec *otmr);
int timerfd_gettime(int ufd, struct itimerspec *otmr);
The timerfd_create() API creates an un-programmed timerfd fd. The "clockid"
parameter can be either CLOCK_MONOTONIC or CLOCK_REALTIME.
The timerfd_settime() API give new settings by the timerfd fd, by optionally
retrieving the previous expiration time (in case the "otmr" parameter is not
NULL).
The time value specified in "utmr" is absolute, if the TFD_TIMER_ABSTIME bit
is set in the "flags" parameter. Otherwise it's a relative time.
The timerfd_gettime() API returns the next expiration time of the timer, or
{0, 0} if the timerfd has not been set yet.
Like the previous timerfd API implementation, read(2) and poll(2) are
supported (with the same interface). Here's a simple test program I used to
exercise the new timerfd APIs:
http://www.xmailserver.org/timerfd-test2.c
[akpm@linux-foundation.org: coding-style cleanups]
[akpm@linux-foundation.org: fix ia64 build]
[akpm@linux-foundation.org: fix m68k build]
[akpm@linux-foundation.org: fix mips build]
[akpm@linux-foundation.org: fix alpha, arm, blackfin, cris, m68k, s390, sparc and sparc64 builds]
[heiko.carstens@de.ibm.com: fix s390]
[akpm@linux-foundation.org: fix powerpc build]
[akpm@linux-foundation.org: fix sparc64 more]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 07:27:26 +01:00
|
|
|
asmlinkage long sys_timerfd_create(int clockid, int flags);
|
|
|
|
asmlinkage long sys_timerfd_settime(int ufd, int flags,
|
|
|
|
const struct itimerspec __user *utmr,
|
|
|
|
struct itimerspec __user *otmr);
|
|
|
|
asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
|
signal/timer/event: eventfd core
This is a very simple and light file descriptor, that can be used as event
wait/dispatch by userspace (both wait and dispatch) and by the kernel
(dispatch only). It can be used instead of pipe(2) in all cases where those
would simply be used to signal events. Their kernel overhead is much lower
than pipes, and they do not consume two fds. When used in the kernel, it can
offer an fd-bridge to enable, for example, functionalities like KAIO or
syslets/threadlets to signal to an fd the completion of certain operations.
But more in general, an eventfd can be used by the kernel to signal readiness,
in a POSIX poll/select way, of interfaces that would otherwise be incompatible
with it. The API is:
int eventfd(unsigned int count);
The eventfd API accepts an initial "count" parameter, and returns an eventfd
fd. It supports poll(2) (POLLIN, POLLOUT, POLLERR), read(2) and write(2).
The POLLIN flag is raised when the internal counter is greater than zero.
The POLLOUT flag is raised when at least a value of "1" can be written to the
internal counter.
The POLLERR flag is raised when an overflow in the counter value is detected.
The write(2) operation can never overflow the counter, since it blocks (unless
O_NONBLOCK is set, in which case -EAGAIN is returned).
But the eventfd_signal() function can do it, since it's supposed to not sleep
during its operation.
The read(2) function reads the __u64 counter value, and reset the internal
value to zero. If the value read is equal to (__u64) -1, an overflow happened
on the internal counter (due to 2^64 eventfd_signal() posts that has never
been retired - unlickely, but possible).
The write(2) call writes an __u64 count value, and adds it to the current
counter. The eventfd fd supports O_NONBLOCK also.
On the kernel side, we have:
struct file *eventfd_fget(int fd);
int eventfd_signal(struct file *file, unsigned int n);
The eventfd_fget() should be called to get a struct file* from an eventfd fd
(this is an fget() + check of f_op being an eventfd fops pointer).
The kernel can then call eventfd_signal() every time it wants to post an event
to userspace. The eventfd_signal() function can be called from any context.
An eventfd() simple test and bench is available here:
http://www.xmailserver.org/eventfd-bench.c
This is the eventfd-based version of pipetest-4 (pipe(2) based):
http://www.xmailserver.org/pipetest-4.c
Not that performance matters much in the eventfd case, but eventfd-bench
shows almost as double as performance than pipetest-4.
[akpm@linux-foundation.org: fix i386 build]
[akpm@linux-foundation.org: add sys_eventfd to sys_ni.c]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-11 07:23:19 +02:00
|
|
|
asmlinkage long sys_eventfd(unsigned int count);
|
2008-07-24 06:29:25 +02:00
|
|
|
asmlinkage long sys_eventfd2(unsigned int count, int flags);
|
sys_fallocate() implementation on i386, x86_64 and powerpc
fallocate() is a new system call being proposed here which will allow
applications to preallocate space to any file(s) in a file system.
Each file system implementation that wants to use this feature will need
to support an inode operation called ->fallocate().
Applications can use this feature to avoid fragmentation to certain
level and thus get faster access speed. With preallocation, applications
also get a guarantee of space for particular file(s) - even if later the
the system becomes full.
Currently, glibc provides an interface called posix_fallocate() which
can be used for similar cause. Though this has the advantage of working
on all file systems, but it is quite slow (since it writes zeroes to
each block that has to be preallocated). Without a doubt, file systems
can do this more efficiently within the kernel, by implementing
the proposed fallocate() system call. It is expected that
posix_fallocate() will be modified to call this new system call first
and incase the kernel/filesystem does not implement it, it should fall
back to the current implementation of writing zeroes to the new blocks.
ToDos:
1. Implementation on other architectures (other than i386, x86_64,
and ppc). Patches for s390(x) and ia64 are already available from
previous posts, but it was decided that they should be added later
once fallocate is in the mainline. Hence not including those patches
in this take.
2. Changes to glibc,
a) to support fallocate() system call
b) to make posix_fallocate() and posix_fallocate64() call fallocate()
Signed-off-by: Amit Arora <aarora@in.ibm.com>
2007-07-18 03:42:44 +02:00
|
|
|
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
|
2009-01-14 14:13:55 +01:00
|
|
|
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
|
2006-02-01 12:04:33 +01:00
|
|
|
|
2006-10-02 11:18:31 +02:00
|
|
|
int kernel_execve(const char *filename, char *const argv[], char *const envp[]);
|
|
|
|
|
2005-04-17 00:20:36 +02:00
|
|
|
#endif
|