diff -Nru a/arch/i386/kernel/cpu/proc.c b/arch/i386/kernel/cpu/proc.c
--- a/arch/i386/kernel/cpu/proc.c	Tue Oct 21 16:05:27 2003
+++ b/arch/i386/kernel/cpu/proc.c	Tue Oct 21 16:05:27 2003
@@ -4,6 +4,12 @@
 #include
 #include
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+#include
+#include
+#include
+#endif
+
 /*
  * Get CPU information for use by the procfs.
  */
@@ -63,12 +69,22 @@
 	if (!cpu_online(n))
 		return 0;
 #endif
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+	/* show only CPUs in current cpuset */
+	if (!cpu_isset(n, current->cpuset->cpus_allowed))
+		return 0;
+#endif /* CONFIG_CPUSETS_PROC_CPUINFO */
+
 	seq_printf(m, "processor\t: %d\n"
 		"vendor_id\t: %s\n"
 		"cpu family\t: %d\n"
 		"model\t\t: %d\n"
 		"model name\t: %s\n",
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+		cpuset_realtologic_cpuid(current->cpuset, n),
+#else
 		n,
+#endif
 		c->x86_vendor_id[0] ? c->x86_vendor_id : "unknown",
 		c->x86,
 		c->x86_model,
diff -Nru a/arch/i386/kernel/entry.S b/arch/i386/kernel/entry.S
--- a/arch/i386/kernel/entry.S	Tue Oct 21 16:05:27 2003
+++ b/arch/i386/kernel/entry.S	Tue Oct 21 16:05:27 2003
@@ -880,5 +880,14 @@
 	.long sys_utimes
 	.long sys_fadvise64_64
 	.long sys_ni_syscall	/* sys_vserver */
+	.long sys_ni_syscall
+	.long sys_ni_syscall	/* 275 */
+	.long sys_ni_syscall
+	.long sys_cpuset_create
+	.long sys_cpuset_destroy
+	.long sys_cpuset_alloc
+	.long sys_cpuset_attach
+	.long sys_cpuset_getfreecpus
+
 nr_syscalls=(.-sys_call_table)/4
diff -Nru a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
--- a/arch/ia64/kernel/entry.S	Tue Oct 21 16:05:27 2003
+++ b/arch/ia64/kernel/entry.S	Tue Oct 21 16:05:27 2003
@@ -1481,11 +1481,19 @@
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall	// 1265
 	data8 ia64_ni_syscall
+#ifdef CONFIG_CPUSETS
+	data8 sys_cpuset_create
+	data8 sys_cpuset_destroy
+	data8 sys_cpuset_alloc
+	data8 sys_cpuset_attach	// 1270
+	data8 sys_cpuset_getfreecpus
+#else
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall	// 1270
 	data8 ia64_ni_syscall
+#endif
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
 	data8 ia64_ni_syscall
diff -Nru a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
--- a/arch/ia64/kernel/setup.c	Tue Oct 21 16:05:27 2003
+++ b/arch/ia64/kernel/setup.c	Tue Oct 21 16:05:27 2003
@@ -50,6 +50,10 @@
 #include
 #include
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+# include
+#endif
+
 #if defined(CONFIG_SMP) && (IA64_CPU_SIZE > PAGE_SIZE)
 # error "struct cpuinfo_ia64 too big!"
 #endif
@@ -383,6 +387,15 @@
 	unsigned long mask;
 	int i;
 
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+	/* show only CPUs in current cpuset */
+	if (!current->cpuset)
+		BUG();
+
+	if (!cpu_isset(cpunum, current->cpuset->cpus_allowed))
+		return 0;
+#endif /* CONFIG_CPUSETS_PROC_CPUINFO */
+
 	mask = c->features;
 
 	switch (c->family) {
@@ -427,7 +440,12 @@
 		"cpu MHz : %lu.%06lu\n"
 		"itc MHz : %lu.%06lu\n"
 		"BogoMIPS : %lu.%02lu\n\n",
-	       cpunum, c->vendor, family, c->model, c->revision, c->archrev,
+#ifdef CONFIG_CPUSETS_PROC_CPUINFO
+	       cpuset_realtologic_cpuid(current->cpuset, cpunum),
+#else
+	       cpunum,
+#endif
+	       c->vendor, family, c->model, c->revision, c->archrev,
 	       features, c->ppn, c->number,
 	       c->proc_freq / 1000000, c->proc_freq % 1000000,
 	       c->itc_freq / 1000000, c->itc_freq % 1000000,
diff -Nru a/fs/proc/base.c b/fs/proc/base.c
--- a/fs/proc/base.c	Tue Oct 21 16:05:27 2003
+++ b/fs/proc/base.c	Tue Oct 21 16:05:27 2003
@@ -60,6 +60,9 @@
 	PROC_TGID_MAPS,
 	PROC_TGID_MOUNTS,
 	PROC_TGID_WCHAN,
+#ifdef CONFIG_CPUSETS_PROC
+	PROC_TGID_CPUSET,
+#endif
 #ifdef CONFIG_SECURITY
 	PROC_TGID_ATTR,
 	PROC_TGID_ATTR_CURRENT,
@@ -123,6 +126,9 @@
 #ifdef CONFIG_KALLSYMS
 	E(PROC_TGID_WCHAN, "wchan", S_IFREG|S_IRUGO),
 #endif
+#ifdef CONFIG_CPUSETS_PROC
+	E(PROC_TGID_CPUSET, "cpuset", S_IFREG|S_IRUGO),
+#endif
 	{0,0,NULL,0}
 };
 static struct pid_entry tid_base_stuff[] = {
@@ -366,6 +372,11 @@
 }
 #endif /* CONFIG_KALLSYMS */
 
+
+#ifdef CONFIG_CPUSETS_PROC
+int proc_pid_cpuset(struct task_struct *task, char *buffer);
+#endif /* CONFIG_CPUSETS_PROC */
+
 /************************************************************************/
 /* Here the fs part begins */
 /************************************************************************/
@@ -1359,6 +1370,12 @@
 		case PROC_TGID_WCHAN:
 			inode->i_fop = &proc_info_file_operations;
 			ei->op.proc_read = proc_pid_wchan;
+			break;
+#endif
+#ifdef CONFIG_CPUSETS_PROC
+		case PROC_TGID_CPUSET:
+			inode->i_fop = &proc_info_file_operations;
+			ei->op.proc_read = proc_pid_cpuset;
 			break;
 #endif
 		default:
diff -Nru a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
--- a/fs/proc/proc_misc.c	Tue Oct 21 16:05:27 2003
+++ b/fs/proc/proc_misc.c	Tue Oct 21 16:05:27 2003
@@ -51,6 +51,10 @@
 #include
 #include
+#ifdef CONFIG_CPUSETS_PROC_STAT
+# include
+#endif
+
 #define LOAD_INT(x) ((x) >> FSHIFT)
 #define LOAD_FRAC(x) LOAD_INT(((x) & (FIXED_1-1)) * 100)
 /*
@@ -382,6 +386,12 @@
 		int j;
 
 		if (!cpu_online(i)) continue;
+#ifdef CONFIG_CPUSETS_PROC_STAT
+		/* show only CPUs in current cpuset */
+		if (!cpu_isset(i, current->cpuset->cpus_allowed))
+			continue;
+#endif
+
 		user += kstat_cpu(i).cpustat.user;
 		nice += kstat_cpu(i).cpustat.nice;
 		system += kstat_cpu(i).cpustat.system;
@@ -403,8 +413,17 @@
 		jiffies_to_clock_t(softirq));
 	for (i = 0; i < NR_CPUS; i++){
 		if (!cpu_online(i)) continue;
+#ifdef CONFIG_CPUSETS_PROC_STAT
+		/* show only CPUs in current cpuset */
+		if (!cpu_isset(i, current->cpuset->cpus_allowed))
+			continue;
+#endif
 		seq_printf(p, "cpu%d %u %u %u %u %u %u %u\n",
+#ifdef CONFIG_CPUSETS_PROC_STAT
+			cpuset_realtologic_cpuid(current->cpuset, i),
+#else
 			i,
+#endif
 			jiffies_to_clock_t(kstat_cpu(i).cpustat.user),
 			jiffies_to_clock_t(kstat_cpu(i).cpustat.nice),
 			jiffies_to_clock_t(kstat_cpu(i).cpustat.system),
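
For illustration only, not part of the patch: with CONFIG_CPUSETS_PROC_CPUINFO and CONFIG_CPUSETS_PROC_STAT enabled, a task whose cpuset contains, say, physical CPUs 2 and 3 of a hypothetical 4-way box sees only those two CPUs, renumbered from 0 by cpuset_realtologic_cpuid(): /proc/cpuinfo reports "processor : 0" and "processor : 1", and /proc/stat reports "cpu0" and "cpu1" lines, with the aggregate "cpu" line summing only the cpuset's CPUs.
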
diff -Nru a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
--- a/include/asm-i386/unistd.h	Tue Oct 21 16:05:27 2003
+++ b/include/asm-i386/unistd.h	Tue Oct 21 16:05:27 2003
@@ -280,7 +280,13 @@
 #define __NR_fadvise64_64 272
 #define __NR_vserver 273
 
-#define NR_syscalls 274
+#define __NR_sys_cpuset_create 277
+#define __NR_sys_cpuset_destroy 278
+#define __NR_sys_cpuset_alloc 279
+#define __NR_sys_cpuset_attach 280
+#define __NR_sys_cpuset_getfreecpus 281
+
+#define NR_syscalls 282
 
 /* user-visible error numbers are in the range -1 - -124: see */
diff -Nru a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
--- a/include/asm-ia64/unistd.h	Tue Oct 21 16:05:27 2003
+++ b/include/asm-ia64/unistd.h	Tue Oct 21 16:05:27 2003
@@ -253,6 +253,12 @@
 
 #define NR_syscalls 256 /* length of syscall table */
 
+#define __NR_sys_cpuset_create 1267
+#define __NR_sys_cpuset_destroy 1268
+#define __NR_sys_cpuset_alloc 1269
+#define __NR_sys_cpuset_attach 1270
+#define __NR_sys_cpuset_getfreecpus 1271
+
 #if !defined(__ASSEMBLY__) && !defined(ASSEMBLER)
 
 extern long __ia64_syscall (long a0, long a1, long a2, long a3, long a4, long nr);
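
For illustration only, not part of the patch: given the numbers above (i386 values 277-281, ia64 values 1267-1271), thin userspace wrappers can be built on the generic syscall(2) interface. The wrapper names below are invented for the example; only the __NR_sys_cpuset_* constants, the flag values and the argument lists come from the patch, and the patched asm/unistd.h is assumed to be visible to userspace.

#include <unistd.h>
#include <sys/types.h>
#include <sys/syscall.h>	/* pulls in the patched asm/unistd.h numbers */

typedef unsigned int cpuset_t;	/* as in include/linux/cpuset.h below */

#define CPUSET_STRICT    0x00000001
#define CPUSET_AUTOCLEAN 0x00000002

/* hypothetical wrappers, one per new syscall */
static inline int cpuset_create(cpuset_t *csp, int flags)
{
	return syscall(__NR_sys_cpuset_create, csp, flags);
}

static inline int cpuset_destroy(cpuset_t cs)
{
	return syscall(__NR_sys_cpuset_destroy, cs);
}

static inline int cpuset_alloc(cpuset_t cs, int len, unsigned long *mask)
{
	return syscall(__NR_sys_cpuset_alloc, cs, len, mask);
}

static inline int cpuset_attach(cpuset_t cs, pid_t pid)
{
	return syscall(__NR_sys_cpuset_attach, cs, pid);
}

static inline int cpuset_getfreecpus(int flags, int len, unsigned long *mask)
{
	return syscall(__NR_sys_cpuset_getfreecpus, flags, len, mask);
}
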
diff -Nru a/include/linux/cpuset.h b/include/linux/cpuset.h
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/include/linux/cpuset.h	Tue Oct 21 16:05:27 2003
@@ -0,0 +1,29 @@
+/*
+ * BULL cpuset interface
+ */
+
+#ifndef _LINUX_CPUSET_H
+#define _LINUX_CPUSET_H
+
+typedef unsigned int cpuset_t;
+
+#define CPUSET_STRICT 0x00000001
+#define CPUSET_AUTOCLEAN 0x00000002
+
+#ifdef __KERNEL__
+
+extern struct cpuset top_cpuset;
+
+void use_cpuset(struct cpuset *);
+void release_cpuset(struct cpuset *);
+
+struct task_struct;
+int cpuset_setaffinity(struct task_struct * task, unsigned long mask);
+
+void cpusets_update_cpus_online(void);
+
+int cpuset_realtologic_cpuid(struct cpuset * cs, int cpuid);
+
+#endif /* __KERNEL__ */
+
+#endif /* _LINUX_CPUSET_H */
diff -Nru a/include/linux/cpuset_types.h b/include/linux/cpuset_types.h
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/include/linux/cpuset_types.h	Tue Oct 21 16:05:27 2003
@@ -0,0 +1,39 @@
+#ifndef _LINUX_CPUSET_TYPES_H
+#define _LINUX_CPUSET_TYPES_H
+
+
+struct cpuset {
+	cpuset_t id;
+	int flags;
+	int has_been_attached;
+
+	/* bitmask of the cpus present in this cpuset */
+	cpumask_t cpus_allowed;
+
+	/* bitmask of the cpus reserved in this cpuset */
+	cpumask_t cpus_reserved;
+
+	/* bitmask of the cpus reserved with CPUSET_STRICT */
+	cpumask_t cpus_strictly_reserved;
+
+	struct cpuset * parent;
+	struct list_head list;	/* for the whole list */
+
+	struct list_head children;
+	struct list_head brothers;
+
+	/* overall users (processes + children) */
+	/* will be replaced by atomic_t in the future */
+	atomic_t count;
+
+	spinlock_t attach_lock;
+
+	/* owner */
+	uid_t uid;
+	uid_t suid;
+
+
+};
+
+
+#endif
diff -Nru a/include/linux/init_task.h b/include/linux/init_task.h
--- a/include/linux/init_task.h	Tue Oct 21 16:05:27 2003
+++ b/include/linux/init_task.h	Tue Oct 21 16:05:27 2003
@@ -56,6 +56,12 @@
 	.siglock = SPIN_LOCK_UNLOCKED, \
 }
 
+#ifdef CONFIG_CPUSETS
+#define CPUSET_TSKINIT(a,b) .a = b,
+#else
+#define CPUSET_TSKINIT(a,b)
+#endif
+
 /*
  * INIT_TASK is used to set up the first task table, touch at
  * your own risk!. Base=0, limit=0x1fffff (=2MB)
@@ -108,6 +114,9 @@
 	.proc_lock = SPIN_LOCK_UNLOCKED, \
 	.switch_lock = SPIN_LOCK_UNLOCKED, \
 	.journal_info = NULL, \
+	CPUSET_TSKINIT(cpus_wanted, CPU_MASK_ALL) \
+	CPUSET_TSKINIT(cpuset, &top_cpuset) \
+	CPUSET_TSKINIT(cpuset_attach_lock, SPIN_LOCK_UNLOCKED) \
 }
diff -Nru a/include/linux/sched.h b/include/linux/sched.h
--- a/include/linux/sched.h	Tue Oct 21 16:05:27 2003
+++ b/include/linux/sched.h	Tue Oct 21 16:05:27 2003
@@ -29,6 +29,7 @@
 #include
 #include
 #include
+#include
 
 struct exec_domain;
 
@@ -464,6 +465,13 @@
 	unsigned long ptrace_message;
 	siginfo_t *last_siginfo; /* For ptrace use.  */
+
+/* cpuset info */
+#ifdef CONFIG_CPUSETS
+	struct cpuset * cpuset;
+	unsigned long cpus_wanted;
+	spinlock_t cpuset_attach_lock;
+#endif
 };
 
 static inline pid_t process_group(struct task_struct *tsk)
diff -Nru a/init/Kconfig b/init/Kconfig
--- a/init/Kconfig	Tue Oct 21 16:05:27 2003
+++ b/init/Kconfig	Tue Oct 21 16:05:27 2003
@@ -194,6 +194,41 @@
 	  Disabling this option will cause the kernel to be built without
 	  support for epoll family of system calls.
 
+if X86 || IA64
+
+config CPUSETS
+	bool "cpusets"
+	depends on SMP
+	help
+	  This option lets you create and manage sets of CPUs on which
+	  you can run processes.
+
+	  Say N if unsure.
+
+config CPUSETS_PROC
+	bool "/proc/cpusets support"
+	depends on CPUSETS
+	help
+	  Provides information about the cpusets that exist on your system.
+	  To use this option, you have to ensure that "/proc file system
+	  support" (CONFIG_PROC_FS) is enabled, too.
+
+config CPUSETS_PROC_CPUINFO
+	bool "/proc/cpuinfo uses current cpuset"
+	depends on CPUSETS_PROC
+	help
+	  With this option enabled, a process reading /proc/cpuinfo will
+	  only see the CPUs that are in its current cpuset.
+
+config CPUSETS_PROC_STAT
+	bool "/proc/stat uses current cpuset"
+	depends on CPUSETS_PROC
+	help
+	  With this option enabled, a process reading /proc/stat will
+	  only see the CPUs that are in its current cpuset.
+
+endif
+
 source "drivers/block/Kconfig.iosched"
 
 endmenu	# General setup
diff -Nru a/init/main.c b/init/main.c
--- a/init/main.c	Tue Oct 21 16:05:27 2003
+++ b/init/main.c	Tue Oct 21 16:05:27 2003
@@ -39,6 +39,13 @@
 #include
 #include
 
+
+
+#ifdef CONFIG_CPUSETS
+#include
+#endif
+
+
 #include
 #include
 
@@ -85,6 +92,7 @@
 extern void free_initmem(void);
 extern void populate_rootfs(void);
 extern void driver_init(void);
+extern void cpusets_init(void);
 
 #ifdef CONFIG_TC
 extern void tc_init(void);
@@ -456,6 +464,10 @@
 #ifdef CONFIG_PROC_FS
 	proc_root_init();
 #endif
+#ifdef CONFIG_CPUSETS
+	cpusets_init();
+#endif
+
 	check_bugs();
 	printk("POSIX conformance testing by UNIFIX\n");
 
@@ -524,6 +536,10 @@
  */
 static void __init do_basic_setup(void)
 {
+#ifdef CONFIG_CPUSETS
+	cpusets_update_cpus_online();
+#endif
+
 	driver_init();
 
 #ifdef CONFIG_SYSCTL
@@ -579,6 +595,7 @@
 	do_basic_setup();
 
 	prepare_namespace();
+
 	/*
 	 * Ok, we have completed the initial bootup, and
diff -Nru a/kernel/Makefile b/kernel/Makefile
--- a/kernel/Makefile	Tue Oct 21 16:05:27 2003
+++ b/kernel/Makefile	Tue Oct 21 16:05:27 2003
@@ -19,6 +19,7 @@
 obj-$(CONFIG_COMPAT) += compat.o
 obj-$(CONFIG_IKCONFIG) += configs.o
 obj-$(CONFIG_IKCONFIG_PROC) += configs.o
+obj-$(CONFIG_CPUSETS) += cpuset.o
 
 ifneq ($(CONFIG_IA64),y)
 # According to Alan Modra , the -fno-omit-frame-pointer is
diff -Nru a/kernel/cpuset.c b/kernel/cpuset.c
--- /dev/null	Wed Dec 31 16:00:00 1969
+++ b/kernel/cpuset.c	Tue Oct 21 16:05:27 2003
@@ -0,0 +1,785 @@
+#include
+#include
+#include	/* for kmalloc */
+#include
+#include	/* for find_task_by_pid and task_struct */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#define info(args...) do {} while(0)
+//#define info(args...) printk(KERN_INFO args)
+
+
+#ifdef CPU_ARRAY_SIZE
+#warning "CPU ARRAY SIZE !"
+#endif +rwlock_t cpuset_lock = RW_LOCK_UNLOCKED; + +#define CPUSET_TOP_ID 1 + +struct cpuset top_cpuset = { + .id = CPUSET_TOP_ID, + .flags = CPUSET_STRICT, + .cpus_reserved = CPU_MASK_NONE, + .cpus_strictly_reserved = CPU_MASK_NONE, + .parent = 0, + .children = LIST_HEAD_INIT(top_cpuset.children), + .brothers = LIST_HEAD_INIT(top_cpuset.brothers), + .list = LIST_HEAD_INIT(top_cpuset.list), + .count = ATOMIC_INIT(1), /* this cpuset can't be deleted */ + .has_been_attached = 0, + .uid = 0, + .attach_lock = SPIN_LOCK_UNLOCKED, + .suid = 0 +}; + + +static int proc_cpusets_init(void); + +int __init cpusets_init(void) +{ + info("cpusets ("__FILE__ " compiled " __DATE__ " " __TIME__ "initializing..\n"); + + +#ifdef CONFIG_CPUSETS_PROC + proc_cpusets_init(); +#endif /* CONFIG_CPUSETS_PROC */ + return 0; +} + +/* + * later this function may be used to indicate that a CPU has been put + * online/offline + * BUT currently it only exists because cpu_online_map becomes available + * only late during kernel boot + */ +void cpusets_update_cpus_online(void) +{ + top_cpuset.cpus_allowed = cpu_online_map ; +} + + +static const int N = (8*sizeof(cpumask_t)); +/* mask must NOT be ZERO ! */ +/* this is a cyclic version of next_cpu */ +static inline void _next_cpu(const cpumask_t mask, int * index) +{ + for(;;) { + if (++*index >= N) *index = 0; + if (cpu_isset(*index, mask)) return; + } +} + +static unsigned long cpuset_combine_mask(const cpumask_t wanted, const cpumask_t allowed) +{ + int i; + cpumask_t mask; + + /* start with current cpu out of the mask + * so the first call to next_cpu will take the first cpu + * even if it is cpu zero + */ + int cpu = N; + + if (cpus_empty(wanted)) return 0; + if (cpus_empty(allowed)) return 0; + + cpus_clear(mask); + + for(i=0; i < N; i++) { + _next_cpu(allowed, &cpu); + if (cpu_isset(i, wanted)) + cpu_set(cpu, mask); + } + info("cpuset_combine_mask: %016lx + %016lx --> %016lx\n", + wanted, allowed, mask); + return mask; +} + +/* translate a "real" cpu number to a "inside cpuset" (aka logical) + * cpu number. 
Used for /proc/cpuinfo + */ +int cpuset_realtologic_cpuid(struct cpuset * cs, int cpuid) +{ + int i; + int l = 0; + for(i=0; i < NR_CPUS; i++) + { + if (i == cpuid) return l; + if (cpu_isset(i, cs->cpus_allowed)) + l++; + } + /* NOT REACHED */ + BUG(); + return 0; +} + +static struct cpuset * find_cpuset_by_id(cpuset_t id) +{ + struct cpuset * cs; + if (id == CPUSET_TOP_ID) return &top_cpuset; + + list_for_each_entry(cs, &top_cpuset.list, list) { + if (cs->id == id) return cs; + } + /* Not found */ + return 0; +} + +/* increment a cpuset use count */ +void use_cpuset(struct cpuset * cs) +{ + atomic_inc(&cs->count); +} + +static void check_cpuset_autoclean(struct cpuset *); + +/* decrement a cpuset use count, and maybe autodestroy it */ +/* cpuset_lock MUST NOT BE HELD */ +void release_cpuset(struct cpuset * cs) +{ + if (atomic_dec_and_test(&cs->count)) + check_cpuset_autoclean(cs); +} + +/* find a free cpuset ID */ +static cpuset_t cpuset_mkid(void) +{ + static cpuset_t curid = CPUSET_TOP_ID; + + while (find_cpuset_by_id(++curid)); + + return curid; +} + +asmlinkage long sys_cpuset_create(cpuset_t * cpusetp, int flags) +{ + struct cpuset * cs; + + info("sys_cpuset_create(%016lx, %d) called\n", + (unsigned long) cpusetp, flags); + + /* can only create a strict cs in another strict cs */ + if ((flags & CPUSET_STRICT) && (!(current->cpuset->flags & CPUSET_STRICT))) + return -EINVAL; + + /* check if given pointer is valid */ + if (verify_area(VERIFY_WRITE, cpusetp, sizeof(cpuset_t))) + return -EFAULT; + + cs = (struct cpuset *) kmalloc(sizeof(struct cpuset), GFP_KERNEL); + if (!cs) + return -ENOMEM; + + cs->flags = flags; + atomic_set(&cs->count, 0); + INIT_LIST_HEAD(&cs->children); + cs->cpus_allowed = 0; + cs->cpus_reserved = 0; + cs->cpus_strictly_reserved = 0; + cs->has_been_attached = 0; + cs->uid = current->uid; + cs->suid = current->suid; + cs->attach_lock = SPIN_LOCK_UNLOCKED; + + cs->parent = current->cpuset; + + use_cpuset(cs->parent); + + write_lock(&cpuset_lock); + + cs->id = cpuset_mkid(); + list_add(&cs->brothers, &cs->parent->children); + list_add(&cs->list, &top_cpuset.list); + + write_unlock(&cpuset_lock); + + if (put_user(cs->id, cpusetp)) + info("put_user failed !\n"); + + return 0; +} + + +static inline int bad_permission(struct cpuset * cs) +{ + return ((current->euid) && (current->euid != cs->uid) && (current->euid != cs->suid)); +} + +static void __cpuset_destroy(struct cpuset * cs); + +asmlinkage long sys_cpuset_destroy(cpuset_t cpuset) +{ + struct cpuset * cs; + + info("sys_cpuset_destroy(%d) called\n", cpuset); + + if (cpuset == CPUSET_TOP_ID) + return -EINVAL; + + read_lock(&cpuset_lock); + cs = find_cpuset_by_id(cpuset); + + if (!cs) { + read_unlock(&cpuset_lock); + return -EINVAL; + } + + use_cpuset(cs); + read_unlock(&cpuset_lock); + + if (bad_permission(cs)) { + release_cpuset(cs); + return -EPERM; + } + + write_lock(&cpuset_lock); + /* there's at least 1 user (us), if there's more, we can't destroy cs */ + if (atomic_read(&cs->count) > 1) { + write_unlock(&cpuset_lock); + release_cpuset(cs); + return -EBUSY; + } + + /* everything OK, destroy it */ + __cpuset_destroy(cs); + /* write_unlock(&cpuset_lock) will be done inside __cpuset_destroy */ + + return 0; +} + +static void rebuild_reserved_masks(struct cpuset * csp) { + cpumask_t r; + cpumask_t sr; + struct cpuset * cs; + info("Updating cpuset %d masks\n", csp->id); + + cpus_clear(r); + cpus_clear(sr); + + list_for_each_entry(cs, &csp->children, brothers) { + info(" child %d\n", cs->id); + cpus_or(r, r, 
cs->cpus_allowed); + if (cs->flags & CPUSET_STRICT) + cpus_or(sr, sr, cs->cpus_allowed); + } + csp->cpus_reserved = r; + csp->cpus_strictly_reserved = sr; +} + +/* REALLY destroy a cpuset + * NOTE: + * -> write cpuset_lock must be held + * -> ----------------- WILL BE RELEASED + * this ugly hack is necessary to call release_cpuset(parent) + */ +static void __cpuset_destroy(struct cpuset * cs) +{ + list_del(&cs->list); + list_del(&cs->brothers); + + /* cs will never be top_cpuset, so ->parent exists */ + rebuild_reserved_masks(cs->parent); + + write_unlock(&cpuset_lock); + release_cpuset(cs->parent); + + kfree(cs); +} + +/* remove an unused cpuset if it has the CPUSET_AUTOCLEAN flag */ +static void check_cpuset_autoclean(struct cpuset * cs) +{ + if (!(cs->flags & CPUSET_AUTOCLEAN)) return; /* not autoclean */ + if (!cs->has_been_attached) return; + + write_lock(&cpuset_lock); + + if (atomic_read(&cs->count) > 0) { /* still in use */ + write_unlock(&cpuset_lock); + return; + } + + info("autocleaning cpuset %d\n", cs->id); + + __cpuset_destroy(cs); + /* write_unlock(&cpuset_lock) will be done inside __cpuset_destroy */ +} + +asmlinkage long sys_cpuset_attach(cpuset_t cpuset, pid_t pid) +{ + struct cpuset * cs; + struct task_struct * task; + + info("sys_cpuset_attach(%d, %d) called\n", cpuset, pid); + + read_lock(&cpuset_lock); + cs = find_cpuset_by_id(cpuset); + + if (!cs) { + read_unlock(&cpuset_lock); + return -EINVAL; + } + + + use_cpuset(cs); + + read_unlock(&cpuset_lock); + + if (bad_permission(cs)) { + release_cpuset(cs); + return -EPERM; + } + + if (!cs->cpus_allowed) { /* cannot attach a cpuset with no CPU */ + release_cpuset(cs); + return -EINVAL; + } + + if (pid) { + read_lock(&tasklist_lock); + + task = find_task_by_pid(pid); + if (!task) { + read_unlock(&tasklist_lock); + release_cpuset(cs); + return -ESRCH; + } + + get_task_struct(task); + read_unlock(&tasklist_lock); + + if ((current->euid) && (current->euid != task->uid) && (current->euid != task->suid)) { + put_task_struct(task); + release_cpuset(cs); + return -EPERM; + } + } + else { + task = current; + get_task_struct(task); + } + + set_cpus_allowed(task, cpuset_combine_mask(task->cpus_wanted, cs->cpus_allowed)); + cs->has_been_attached = 1; + + /* release the current cpu set of the task */ + /* lock to prevent a race where two cpuset_attach would be called on the same + * task at the same time, and task->cpuset would be released twice + */ + spin_lock(&task->cpuset_attach_lock); + if (!task->cpuset) { /* task with no cpuset ? means it is exiting */ + spin_unlock(&task->cpuset_attach_lock); + put_task_struct(task); + release_cpuset(cs); + return -ESRCH; + } + release_cpuset(task->cpuset); + /* now lock the cpuset, to protect any running migrate_cpuset...() + * from being disturbed by us + */ + spin_lock(&cs->attach_lock); + task->cpuset = cs; + spin_unlock(&cs->attach_lock); + + spin_unlock(&task->cpuset_attach_lock); + + + put_task_struct(task); + + /* don't call release_cpuset here, + * the task being attached to the cpuset + * is really a new user ! 
+ */ + + return 0; +} + + +static int __cpuset_setaffinity(struct task_struct * task) +{ + cpumask_t allowed; + cpumask_t last = CPU_MASK_NONE; /* remember : 0 is not a valid mask */ + + /* We cannot hold any lock while calling set_cpus_allowed + * since it might sleep + * Thus we try until we are sure we did it with the right mask + */ + for(;;) { + spin_lock(&task->cpuset_attach_lock); + if (!task->cpuset) { + /* task exiting */ + spin_unlock(&task->cpuset_attach_lock); + return 0; + } + allowed = task->cpuset->cpus_allowed; + spin_unlock(&task->cpuset_attach_lock); + + if (last == allowed) + return 0; + + int ret; + ret = set_cpus_allowed(task, cpuset_combine_mask(task->cpus_wanted, allowed)); + if (ret < 0) + return ret; + + last = allowed; + } +} + +/* Our replacement function for set_cpus_allowed */ +int cpuset_setaffinity(struct task_struct * task, cpumask_t mask) +{ + task->cpus_wanted = mask; + return __cpuset_setaffinity(task); +} + +/* When a cpuset with attached processes is being realloc'ed CPUs + * update the processes' masks and migrate them + */ +static void migrate_cpuset_processes(struct cpuset * cs) +{ + struct task_struct *g, *p; + /* This should be a RARE use of the cpusets. + * therefore we'll prefer an inefficient operation here + * (searching the whole process list) + * than adding another list_head in task_t + * and locks and list_add for each fork() + */ + + /* we need to lock tasklist_lock for reading the processes list + * BUT we cannot call set_cpus_allowed with any spinlock held + * => we need to store the list of task struct in an array + */ + struct task_struct ** array; + int nb = 0; + int sz; + + spin_lock(&cs->attach_lock); + /* at most cs->count - 1 processes to migrate */ + sz = atomic_read(&cs->count) - 1; + array = (struct task_struct **) kmalloc(sz * sizeof(struct task_struct *), GFP_ATOMIC); + if (!array) { + spin_unlock(&cs->attach_lock); + printk("Error allocating array in migrate_cpuset_processes !\n"); + return; + } + /* see linux/sched.h for this nested for/do-while loop */ + read_lock(&tasklist_lock); + do_each_thread(g, p) { + if (p->cpuset == cs) { + if (nb == sz) { + printk("migrate_cpuset_processes: array full !\n"); + goto end_loop; /* break won't work in this double loop */ + } + get_task_struct(p); + array[nb++] = p; + } + } while_each_thread(g, p); +end_loop: + read_unlock(&tasklist_lock); + spin_unlock(&cs->attach_lock); + + while(nb) { + struct task_struct * p = array[--nb]; + __cpuset_setaffinity(p); + put_task_struct(p); + } + kfree(array); +} + + + +/* see if mask b is included in mask a */ +/* old version : #define MASK_INCLUDED(a, b) (((a)|(b)) == (a)) */ +static inline int MASK_INCLUDED(cpumask_t a, cpumask_t b) +{ + cpumask_t r; + cpus_or(r, a, b); + return cpus_equal(r, a); +} + +static inline cpumask_t CPUS_NOT(cpumask_t a) +{ + cpus_complement(a); + return a; +} + +static inline cpumask_t CPUS_OR(cpumask_t a, cpumask_t b) +{ + cpumask_t r; + cpus_or(r, a, b); + return r; +} + +static inline cpumask_t CPUS_AND(cpumask_t a, cpumask_t b) +{ + cpumask_t r; + cpus_and(r, a, b); + return r; +} + + +asmlinkage long sys_cpuset_alloc(cpuset_t cpuset, int len, unsigned long * user_mask_ptr) +{ + cpumask_t new_mask; + cpumask_t old_mask; + struct cpuset * cs ; + int retval; + + info("sys_cpuset_alloc(%d, ...) 
called\n", cpuset); + + if (cpuset == CPUSET_TOP_ID) + return -EINVAL; + + if (len < sizeof(new_mask)) + return -EINVAL; + + if (copy_from_user(&new_mask, user_mask_ptr, sizeof(new_mask))) + return -EFAULT; + + /* do some sanity checks on the mask */ + /* must have at least ONE cpu */ + if (cpus_empty(new_mask)) + return -EINVAL; + + /* XXX phys_cpu_present_map has changed type -- + * I disable this test for now + * anyway it is not _NEEDED_ since new_mask will have to stay + * in the parent's mask + * (just some overhead in a _really_ rare case) */ +#if 0 + /* must only have existing CPUs */ + if (!MASK_INCLUDED(phys_cpu_present_map, new_mask)) + return -EINVAL; +#endif + + info(" with mask %016lx\n", new_mask); + + read_lock(&cpuset_lock); + cs = find_cpuset_by_id(cpuset); + + + if (!cs) { + read_unlock(&cpuset_lock); + return -EINVAL; + } + + use_cpuset(cs); + read_unlock(&cpuset_lock); + + if (bad_permission(cs)) { + release_cpuset(cs); + return -EPERM; + } + + /* lock early - we do not want the parent's masks to change under us */ + write_lock(&cpuset_lock); + /* must only have CPUs in the parent cpuset (if any) */ + retval = -EACCES; + if (!MASK_INCLUDED(cs->parent->cpus_allowed, new_mask)) + goto mask_error; + + old_mask = cs->cpus_allowed; + + retval = -EBUSY; + /* must only have free cpus */ + if (cs->flags & CPUSET_STRICT) { + /* CPUs already in this cs ARE free for us ! -> old_mask */ + /* The next few lines mean : + * if (!MASK_INCLUDED(~cs->parent->cpus_reserved, new_mask & (~old_mask))) + * (just obfuscated my the cpus_ macros) + */ + if (!MASK_INCLUDED(CPUS_NOT(cs->parent->cpus_reserved), + CPUS_AND(new_mask, CPUS_NOT(old_mask)))) + goto mask_error; + } + else { + if (!MASK_INCLUDED(CPUS_NOT(cs->parent->cpus_strictly_reserved), new_mask)) + goto mask_error; + } + + + /* are we trying to FREE reserved CPUs + * (i.e. reserved by children cpusets) + * from a non-unused cpuset ? */ + /* if (cs->cpus_reserved & ~new_mask) */ + if (!cpus_empty(CPUS_AND(cs->cpus_reserved, CPUS_NOT(new_mask)))) + goto mask_error; + + /* everything is OK */ + cs->cpus_allowed = new_mask; + rebuild_reserved_masks(cs->parent); + write_unlock(&cpuset_lock); + + /* did we change a non-unused cpuset ? 
*/ + if ((atomic_read(&cs->count) > 1) && !cpus_equal(new_mask, old_mask)) { + migrate_cpuset_processes(cs); + } + + release_cpuset(cs); + return 0; + +mask_error: + write_unlock(&cpuset_lock); + release_cpuset(cs); + return retval; +} + +asmlinkage long sys_cpuset_getfreecpus(int flags, int len, unsigned long * user_mask_ptr) +{ + cpumask_t reserved; + cpumask_t free; + + int real_len = sizeof(unsigned long); + if (len < real_len) + return -EINVAL; + + if (flags & CPUSET_STRICT) + reserved = current->cpuset->cpus_reserved; + else + reserved = current->cpuset->cpus_strictly_reserved; + + free = CPUS_AND(current->cpuset->cpus_allowed, CPUS_NOT(reserved)); + + if (copy_to_user(user_mask_ptr, &free, real_len)) + return -EFAULT; + + return real_len; +} + +/************************************************************* + ***************** /proc/cpusets stuff *********************** + ************************************************************* + */ +#ifdef CONFIG_CPUSETS_PROC + +static void *proc_cpusets_start(struct seq_file *m, loff_t *pos) +{ + loff_t n = *pos; + struct list_head *p; + + read_lock(&cpuset_lock); + if (!n) seq_puts(m, "cpusets info \n"); + + p = &top_cpuset.list; + while (n--) { + p = p->next; + if (p == &top_cpuset.list) + return NULL; + } + return list_entry(p, struct cpuset, list); +} + +static void *proc_cpusets_next(struct seq_file *m, void *p, loff_t *pos) +{ + struct cpuset * cs = p; + ++*pos; + return cs->list.next == &top_cpuset.list ? NULL + : list_entry(cs->list.next, struct cpuset, list); +} + +/* How many chars needed to print a long (as a mask) ? */ +#define CHARS_FOR_LONG (BITS_PER_LONG / 4) +#define CFL CHARS_FOR_LONG +static void sprint_mask(char * buf, cpumask_t mask) +{ +#ifdef CPU_ARRAY_SIZE + int l; + for (l = CPU_ARRAY_SIZE - 1; l>=0; l--) { + /* XXX only 64 bits long supported here ! */ + sprintf(buf, "%016lx", mask.mask[l]); + buf += CFL; + } +#else + /* XXX only 64 bits long supported here ! */ + sprintf(buf, "%016lx", mask); +#endif +} + + +static int proc_cpusets_show(struct seq_file *m, void *p) +{ + struct cpuset * cs = p; +#ifdef CPU_ARRAY_SIZE + char maskbuf[CPU_ARRAY_SIZE * CFL + 1]; +#else + char maskbuf[CFL + 1]; +#endif + + seq_printf(m, "cpuset %d {\n" + "\tparent = %d\n" + "\tflags = %d\n" + "\tcount = %d\n" + "\thba = %d\n" + "\tuid & suid = %d & %d\n", + cs->id, cs->parent ? 
cs->parent->id : -1, + cs->flags, atomic_read(&cs->count), cs->has_been_attached, + cs->uid, cs->suid); + + sprint_mask(maskbuf, cs->cpus_allowed); + seq_printf(m,"\tcpus_allowed = %s\n", maskbuf); + sprint_mask(maskbuf, cs->cpus_reserved); + seq_printf(m,"\tcpus_reserved = %s\n", maskbuf); + sprint_mask(maskbuf, cs->cpus_strictly_reserved); + seq_printf(m,"\tcpus_strictly_reserved = %s\n", maskbuf); + + seq_printf(m, "}\n\n"); + + return 0; +} + +static void proc_cpusets_stop(struct seq_file *m, void *p) +{ + read_unlock(&cpuset_lock); +} + +static struct seq_operations cpusets_op = { + .start = proc_cpusets_start, + .next = proc_cpusets_next, + .stop = proc_cpusets_stop, + .show = proc_cpusets_show +}; + + +static int proc_cpusets_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &cpusets_op); +} + +static struct file_operations proc_cpusets_operations = { + .open = proc_cpusets_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +static int __init proc_cpusets_init(void) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry("cpusets", 0, NULL); + if (entry) + entry->proc_fops = &proc_cpusets_operations; + return 0; +} + +/************************************************************* + *********** /proc/xxx/cpuset ******************************** + ************************************************************* + */ +int proc_pid_cpuset(struct task_struct *task, char *buffer) +{ + return sprintf(buffer, "%d\n", task->cpuset->id); +} + +#endif /* CONFIG_CPUSETS_PROC */ + diff -Nru a/kernel/exit.c b/kernel/exit.c --- a/kernel/exit.c Tue Oct 21 16:05:27 2003 +++ b/kernel/exit.c Tue Oct 21 16:05:27 2003 @@ -54,6 +54,19 @@ BUG_ON(p->state < TASK_ZOMBIE); + +#ifdef CONFIG_CPUSETS + spin_lock(&p->cpuset_attach_lock); + release_cpuset(p->cpuset); + + /* mark that this process's cpuset has already been released + * another process might still try to cpuset_attach this process + */ + p->cpuset = NULL; + spin_unlock(&p->cpuset_attach_lock); +#endif /* CONFIG_CPUSETS */ + + atomic_dec(&p->user->processes); spin_lock(&p->proc_lock); proc_dentry = proc_pid_unhash(p); @@ -87,6 +100,7 @@ spin_unlock(&p->proc_lock); proc_pid_flush(proc_dentry); release_thread(p); + put_task_struct(p); } diff -Nru a/kernel/fork.c b/kernel/fork.c --- a/kernel/fork.c Tue Oct 21 16:05:27 2003 +++ b/kernel/fork.c Tue Oct 21 16:05:27 2003 @@ -31,6 +31,10 @@ #include #include +#ifdef CONFIG_CPUSETS +#include +#endif + #include #include #include @@ -1035,6 +1039,11 @@ SET_LINKS(p); if (p->ptrace & PT_PTRACED) __ptrace_link(p, current->parent); + +#ifdef CONFIG_CPUSETS + use_cpuset(p->cpuset); +#endif + attach_pid(p, PIDTYPE_PID, p->pid); if (thread_group_leader(p)) { diff -Nru a/kernel/sched.c b/kernel/sched.c --- a/kernel/sched.c Tue Oct 21 16:05:27 2003 +++ b/kernel/sched.c Tue Oct 21 16:05:27 2003 @@ -38,6 +38,10 @@ #include #include +#ifdef CONFIG_CPUSETS +#include +#endif + #ifdef CONFIG_NUMA #define cpu_to_node_mask(cpu) node_to_cpumask(cpu_to_node(cpu)) #else @@ -2203,7 +2207,11 @@ !capable(CAP_SYS_NICE)) goto out_unlock; +#ifdef CONFIG_CPUSETS + retval = cpuset_setaffinity(p, new_mask); +#else retval = set_cpus_allowed(p, new_mask); +#endif out_unlock: put_task_struct(p); @@ -2236,7 +2244,11 @@ goto out_unlock; retval = 0; +#ifdef CONFIG_CPUSETS + mask = p->cpus_wanted; +#else cpus_and(mask, p->cpus_allowed, cpu_online_map); +#endif out_unlock: read_unlock(&tasklist_lock);
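
As a usage sketch, also not part of the patch: a launcher running in the top cpuset could reserve two currently free CPUs, move itself into a new autocleaned cpuset and exec a job. It relies on the hypothetical wrapper functions sketched after the unistd.h hunks above, and assumes a kernel where cpumask_t fits in a single unsigned long (NR_CPUS <= BITS_PER_LONG), since the example passes masks as plain words; error handling is abbreviated.

#include <stdio.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	cpuset_t cs;
	unsigned long freemask = 0, newmask = 0;
	int i, picked = 0;

	if (argc < 2) {
		fprintf(stderr, "usage: %s command [args...]\n", argv[0]);
		return 1;
	}

	/* CPUs of the current cpuset not yet reserved by child cpusets */
	if (cpuset_getfreecpus(0, sizeof(freemask), &freemask) < 0) {
		perror("cpuset_getfreecpus");
		return 1;
	}

	/* pick the first two free CPUs */
	for (i = 0; i < (int)(8 * sizeof(freemask)) && picked < 2; i++) {
		if (freemask & (1UL << i)) {
			newmask |= 1UL << i;
			picked++;
		}
	}
	if (picked < 2) {
		fprintf(stderr, "not enough free cpus\n");
		return 1;
	}

	/* create an autocleaned cpuset, give it the two CPUs, join it (pid 0 = self) */
	if (cpuset_create(&cs, CPUSET_AUTOCLEAN) < 0 ||
	    cpuset_alloc(cs, sizeof(newmask), &newmask) < 0 ||
	    cpuset_attach(cs, 0) < 0) {
		perror("cpuset setup");
		return 1;
	}

	/* from here on, /proc/cpuinfo, /proc/stat and sched_setaffinity()
	 * are confined to the two CPUs of the new cpuset */
	execvp(argv[1], argv + 1);
	perror("execvp");
	return 1;
}

While the job runs, /proc/cpusets (CONFIG_CPUSETS_PROC) and /proc/<pid>/cpuset show the new cpuset and its id; because it was created with CPUSET_AUTOCLEAN and has been attached, it is destroyed automatically once its last user exits.
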