From bd8fbdee6562ee526f3c2582a3b373ef195015dd Mon Sep 17 00:00:00 2001 From: Rui Sousa Date: Wed, 3 Sep 2008 17:53:07 +0200 Subject: lockdep: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT is not set This patch fixes compilation if CONFIG_TRACE_IRQFLAGS_SUPPORT is ever disabled (which is currently not allowed by Kconfig). Alternatively we could just remove the option altogether and the associated code paths. Since the compilation error has been in the tree for at least two years and no one noticed it, I guess we don't really have the need for CONFIG_TRACE_IRQFLAGS_SUPPORT=n. Boot tested on x86 UP. Signed-off-by: Rui Sousa Signed-off-by: Ingo Molnar --- include/linux/irqflags.h | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index 74bde13224c..f2993512b3b 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -52,10 +52,10 @@ # define start_critical_timings() do { } while (0) #endif -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT - #include +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT + #define local_irq_enable() \ do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) #define local_irq_disable() \ @@ -84,21 +84,20 @@ * The local_irq_*() APIs are equal to the raw_local_irq*() * if !TRACE_IRQFLAGS. */ -# define raw_local_irq_disable() local_irq_disable() -# define raw_local_irq_enable() local_irq_enable() -# define raw_local_irq_save(flags) \ +#define local_irq_disable() raw_local_irq_disable() +#define local_irq_enable() raw_local_irq_enable() +#define local_irq_save(flags) \ do { \ typecheck(unsigned long, flags); \ - local_irq_save(flags); \ + raw_local_irq_save(flags); \ } while (0) -# define raw_local_irq_restore(flags) \ +# define local_irq_restore(flags) \ do { \ typecheck(unsigned long, flags); \ - local_irq_restore(flags); \ + raw_local_irq_restore(flags); \ } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #define safe_halt() \ do { \ trace_hardirqs_on(); \ @@ -124,6 +123,5 @@ typecheck(unsigned long, flags); \ raw_irqs_disabled_flags(flags); \ }) -#endif /* CONFIG_X86 */ #endif -- cgit v1.2.3 From ab7476cf76e560f0efda2a631a70aabe93009025 Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Fri, 15 Aug 2008 15:29:38 -0700 Subject: debug: add notifier chain debugging, v2 - unbreak ia64 (and powerpc) where function pointers dont point at code but at data (reported by Tony Luck) [ mingo@elte.hu: various cleanups ] Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2651f805ba6..4e1366b552a 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -187,6 +187,9 @@ extern unsigned long long memparse(char *ptr, char **retptr); extern int core_kernel_text(unsigned long addr); extern int __kernel_text_address(unsigned long addr); extern int kernel_text_address(unsigned long addr); +extern int func_ptr_is_kernel_text(void *ptr); +extern void *dereference_function_descriptor(void *ptr); + struct pid; extern struct pid *session_of_pgrp(struct pid *pgrp); -- cgit v1.2.3 From 76b189e91845eab3a9d52bb97f971d312d25652d Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 10 Sep 2008 09:57:35 +0200 Subject: lockdep: add might_lock() / might_lock_read() useful to establish a lock dependency in case the actual dependency is rare or hard to trigger. Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 331e5f1c2d8..0aa657aa8a1 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -480,4 +480,22 @@ static inline void print_irqtrace_events(struct task_struct *curr) # define lock_map_release(l) do { } while (0) #endif +#ifdef CONFIG_PROVE_LOCKING +# define might_lock(lock) \ +do { \ + typecheck(struct lockdep_map *, &(lock)->dep_map); \ + lock_acquire(&(lock)->dep_map, 0, 0, 0, 2, NULL, _THIS_IP_); \ + lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ +} while (0) +# define might_lock_read(lock) \ +do { \ + typecheck(struct lockdep_map *, &(lock)->dep_map); \ + lock_acquire(&(lock)->dep_map, 0, 0, 1, 2, NULL, _THIS_IP_); \ + lock_release(&(lock)->dep_map, 0, _THIS_IP_); \ +} while (0) +#else +# define might_lock(lock) do { } while (0) +# define might_lock_read(lock) do { } while (0) +#endif + #endif /* __LINUX_LOCKDEP_H */ -- cgit v1.2.3 From 3ee1afa308f2a38e5d1e2ad3752ad7abcf480da1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 10 Sep 2008 13:37:17 +0200 Subject: x86: some lock annotations for user copy paths, v2 - introduce might_fault() - handle the atomic user copy paths correctly [ mingo@elte.hu: move might_sleep() outside of in_atomic(). ] Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/kernel.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 2651f805ba6..e580ec09576 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -140,6 +140,15 @@ extern int _cond_resched(void); (__x < 0) ? -__x : __x; \ }) +#ifdef CONFIG_PROVE_LOCKING +void might_fault(void); +#else +static inline void might_fault(void) +{ + might_sleep(); +} +#endif + extern struct atomic_notifier_head panic_notifier_list; extern long (*panic_blink)(long time); NORET_TYPE void panic(const char * fmt, ...) -- cgit v1.2.3 From 1d18ef489509314506328b9e464dd47c24c1d68f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Thu, 11 Sep 2008 20:53:21 +0200 Subject: x86: some lock annotations for user copy paths, v3 - add annotation back to clear_user() - change probe_kernel_address() to _inatomic*() method Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index fec6decfb98..2062293e57e 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to, \ set_fs(KERNEL_DS); \ pagefault_disable(); \ - ret = __get_user(retval, (__force typeof(retval) __user *)(addr)); \ + ret = __copy_from_user_inatomic((__force typeof(retval) __user *)(addr), &(retval), sizeof(retval)); \ pagefault_enable(); \ set_fs(old_fs); \ ret; \ -- cgit v1.2.3 From 53b9d87f41a3d8838210ad7cdef02d814817ce85 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Thu, 11 Sep 2008 17:02:58 -0700 Subject: lock debug: sit tight when we are already in a panic in: > http://bugzilla.kernel.org/show_bug.cgi?id=11543 The panic code called the kexec code which called mutex_trylock() which called spin_lock_mutex() which then stupidly went and blurted a load of debug stuff because of in_interrupt(). Keep the lock debug code from escallating an already crappy situation. Signed-off-by: Ingo Molnar --- include/linux/debug_locks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index 4aaa4afb1cb..096476f1fb3 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -17,7 +17,7 @@ extern int debug_locks_off(void); ({ \ int __ret = 0; \ \ - if (unlikely(c)) { \ + if (!oops_in_progress && unlikely(c)) { \ if (debug_locks_off() && !debug_locks_silent) \ WARN_ON(1); \ __ret = 1; \ -- cgit v1.2.3 From 30742d5c2277c325fb0e9d2d817d55a19995fe8f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Sun, 14 Sep 2008 14:43:39 +0200 Subject: Revert "lockdep: fix compilation when CONFIG_TRACE_IRQFLAGS_SUPPORT is not set" This reverts commit bd8fbdee6562ee526f3c2582a3b373ef195015dd. This broke the powerpc build - more fixes are needed before we can undo this revert. --- include/linux/irqflags.h | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index f2993512b3b..74bde13224c 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -52,10 +52,10 @@ # define start_critical_timings() do { } while (0) #endif -#include - #ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +#include + #define local_irq_enable() \ do { trace_hardirqs_on(); raw_local_irq_enable(); } while (0) #define local_irq_disable() \ @@ -84,20 +84,21 @@ * The local_irq_*() APIs are equal to the raw_local_irq*() * if !TRACE_IRQFLAGS. */ -#define local_irq_disable() raw_local_irq_disable() -#define local_irq_enable() raw_local_irq_enable() -#define local_irq_save(flags) \ +# define raw_local_irq_disable() local_irq_disable() +# define raw_local_irq_enable() local_irq_enable() +# define raw_local_irq_save(flags) \ do { \ typecheck(unsigned long, flags); \ - raw_local_irq_save(flags); \ + local_irq_save(flags); \ } while (0) -# define local_irq_restore(flags) \ +# define raw_local_irq_restore(flags) \ do { \ typecheck(unsigned long, flags); \ - raw_local_irq_restore(flags); \ + local_irq_restore(flags); \ } while (0) #endif /* CONFIG_TRACE_IRQFLAGS_SUPPORT */ +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT #define safe_halt() \ do { \ trace_hardirqs_on(); \ @@ -123,5 +124,6 @@ typecheck(unsigned long, flags); \ raw_irqs_disabled_flags(flags); \ }) +#endif /* CONFIG_X86 */ #endif -- cgit v1.2.3 From fb71e45338453698bd7460f7e8f171ea0304d218 Mon Sep 17 00:00:00 2001 From: Hiroshi Shimamoto Date: Mon, 15 Sep 2008 18:04:26 -0700 Subject: uaccess: fix parameters inversion for __copy_from_user_inatomic() The following patch changes to use __copy_from_user_inatomic(), but the passing parameters incorrect: x86: some lock annotations for user copy paths, v3 This fixes the netfilter crash reported by Steven Noonan. Reported-by: Steven Noonan Signed-off-by: Hiroshi Shimamoto Tested-by: Steven Noonan Signed-off-by: Ingo Molnar --- include/linux/uaccess.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/uaccess.h b/include/linux/uaccess.h index 2062293e57e..6b58367d145 100644 --- a/include/linux/uaccess.h +++ b/include/linux/uaccess.h @@ -78,7 +78,7 @@ static inline unsigned long __copy_from_user_nocache(void *to, \ set_fs(KERNEL_DS); \ pagefault_disable(); \ - ret = __copy_from_user_inatomic((__force typeof(retval) __user *)(addr), &(retval), sizeof(retval)); \ + ret = __copy_from_user_inatomic(&(retval), (__force typeof(retval) __user *)(addr), sizeof(retval)); \ pagefault_enable(); \ set_fs(old_fs); \ ret; \ -- cgit v1.2.3 From 38d47c1b7075bd7ec3881141bb3629da58f88dab Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Fri, 26 Sep 2008 19:32:20 +0200 Subject: futex: rely on get_user_pages() for shared futexes On the way of getting rid of the mmap_sem requirement for shared futexes, start by relying on get_user_pages(). Signed-off-by: Peter Zijlstra Acked-by: Nick Piggin Signed-off-by: Ingo Molnar --- include/linux/futex.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 586ab56a3ec..8f627b9ae2b 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -164,6 +164,8 @@ union futex_key { } both; }; +#define FUTEX_KEY_INIT (union futex_key) { .both = { .ptr = NULL } } + #ifdef CONFIG_FUTEX extern void exit_robust_list(struct task_struct *curr); extern void exit_pi_state_list(struct task_struct *curr); -- cgit v1.2.3 From c7e78cff6b7518212247fb20b1dc6411540dc9af Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 16 Oct 2008 23:17:09 +0200 Subject: lockstat: contend with points We currently only provide points that have to wait on contention, also lists the points we have to wait for. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 0aa657aa8a1..fc9f8e88123 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -73,6 +73,8 @@ struct lock_class_key { struct lockdep_subclass_key subkeys[MAX_LOCKDEP_SUBCLASSES]; }; +#define LOCKSTAT_POINTS 4 + /* * The lock-class itself: */ @@ -119,7 +121,8 @@ struct lock_class { int name_version; #ifdef CONFIG_LOCK_STAT - unsigned long contention_point[4]; + unsigned long contention_point[LOCKSTAT_POINTS]; + unsigned long contending_point[LOCKSTAT_POINTS]; #endif }; @@ -144,6 +147,7 @@ enum bounce_type { struct lock_class_stats { unsigned long contention_point[4]; + unsigned long contending_point[4]; struct lock_time read_waittime; struct lock_time write_waittime; struct lock_time read_holdtime; @@ -165,6 +169,7 @@ struct lockdep_map { const char *name; #ifdef CONFIG_LOCK_STAT int cpu; + unsigned long ip; #endif }; @@ -355,7 +360,7 @@ struct lock_class_key { }; #ifdef CONFIG_LOCK_STAT extern void lock_contended(struct lockdep_map *lock, unsigned long ip); -extern void lock_acquired(struct lockdep_map *lock); +extern void lock_acquired(struct lockdep_map *lock, unsigned long ip); #define LOCK_CONTENDED(_lock, try, lock) \ do { \ @@ -363,13 +368,13 @@ do { \ lock_contended(&(_lock)->dep_map, _RET_IP_); \ lock(_lock); \ } \ - lock_acquired(&(_lock)->dep_map); \ + lock_acquired(&(_lock)->dep_map, _RET_IP_); \ } while (0) #else /* CONFIG_LOCK_STAT */ #define lock_contended(lockdep_map, ip) do {} while (0) -#define lock_acquired(lockdep_map) do {} while (0) +#define lock_acquired(lockdep_map, ip) do {} while (0) #define LOCK_CONTENDED(_lock, try, lock) \ lock(_lock) -- cgit v1.2.3 From 2cb1599f9b2ecdd7a9e59feeee647eb258966839 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 30 Oct 2008 17:32:23 +1100 Subject: Inode: Allow external initialisers To allow XFS to combine the XFS and linux inodes into a single structure, we need to drive inode lookup from the XFS inode cache, not the generic inode cache. This means that we need initialise a struct inode from a context outside alloc_inode() as it is no longer used by XFS. Factor and export the struct inode initialisation code from alloc_inode() to inode_init_always() as a counterpart to inode_init_once(). i.e. we have to call this init function for each inode instantiation (always), as opposed inode_init_once() which is only called on slab object instantiation (once). Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 5b248d61430..04abead4b02 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1881,6 +1881,7 @@ extern loff_t default_llseek(struct file *file, loff_t offset, int origin); extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); +extern struct inode * inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); extern void iput(struct inode *); extern struct inode * igrab(struct inode *); -- cgit v1.2.3 From 8290c35f87304a6b73d4fd17b03580b4f7425de8 Mon Sep 17 00:00:00 2001 From: David Chinner Date: Thu, 30 Oct 2008 17:35:24 +1100 Subject: Inode: Allow external list initialisation To allow XFS to combine the XFS and linux inodes into a single structure, we need to drive inode lookup from the XFS inode cache, not the generic inode cache. This means that we need initialise a struct inode from a context outside alloc_inode() as it is no longer used by XFS. After inode allocation and initialisation, we need to add the inode to the superblock list, the in-use list, hash it and do some accounting. This all needs to be done with the inode_lock held and there are already several places in fs/inode.c that do this list manipulation. Factor out the common code, add a locking wrapper and export the function so ti can be called from XFS. Signed-off-by: Dave Chinner Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- include/linux/fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 04abead4b02..1deedf235d5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1883,6 +1883,7 @@ extern loff_t vfs_llseek(struct file *file, loff_t offset, int origin); extern struct inode * inode_init_always(struct super_block *, struct inode *); extern void inode_init_once(struct inode *); +extern void inode_add_to_lists(struct super_block *, struct inode *); extern void iput(struct inode *); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); -- cgit v1.2.3 From d98d38f2014ab79f28c126ff175d034891f7aefc Mon Sep 17 00:00:00 2001 From: Arjan van de Ven Date: Wed, 29 Oct 2008 14:24:09 -0700 Subject: mutex: improve header comment to be actually informative about the API Impact: improve documentation It's nice to say that mutex_trylock follows the spin_trylock convention. It's a lot nicer if the comment also says which that is... make it so. Signed-off-by: Arjan van de Ven Signed-off-by: Ingo Molnar Signed-off-by: Andrew Morton --- include/linux/mutex.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mutex.h b/include/linux/mutex.h index bc6da10ceee..7a0e5c4f807 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -144,6 +144,8 @@ extern int __must_check mutex_lock_killable(struct mutex *lock); /* * NOTE: mutex_trylock() follows the spin_trylock() convention, * not the down_trylock() convention! + * + * Returns 1 if the mutex has been acquired successfully, and 0 on contention. */ extern int mutex_trylock(struct mutex *lock); extern void mutex_unlock(struct mutex *lock); -- cgit v1.2.3 From 29cbda77a67cf263d636feea65d3bbc9c7de2e24 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 3 Nov 2008 09:16:39 -0800 Subject: rcu: increase RCU stall-check timeouts Impact: increase timeout of debug check feature Increase RCU stall period timeouts to reduce the likelyhood of false positives. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/rcuclassic.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/rcuclassic.h b/include/linux/rcuclassic.h index 5f89b62e698..301dda829e3 100644 --- a/include/linux/rcuclassic.h +++ b/include/linux/rcuclassic.h @@ -41,7 +41,7 @@ #include #ifdef CONFIG_RCU_CPU_STALL_DETECTOR -#define RCU_SECONDS_TILL_STALL_CHECK ( 3 * HZ) /* for rcp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rcp->jiffies_stall */ #define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rcp->jiffies_stall */ #endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ -- cgit v1.2.3 From e25cf3db560e803292946ef23a30c69e341ce56f Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 17 Oct 2008 15:55:07 +0200 Subject: lockdep: include/linux/lockdep.h - fix warning in net/bluetooth/af_bluetooth.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit fix this warning: net/bluetooth/af_bluetooth.c:60: warning: ‘bt_key_strings’ defined but not used net/bluetooth/af_bluetooth.c:71: warning: ‘bt_slock_key_strings’ defined but not used this is a lockdep macro problem in the !LOCKDEP case. We cannot convert it to an inline because the macro works on multiple types, but we can mark the parameter used. [ also clean up a misaligned tab in sock_lock_init_class_and_name() ] [ also remove #ifdefs from around af_family_clock_key strings - which were certainly added to get rid of the ugly build warnings. ] Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index fc9f8e88123..8956daf64ab 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -336,10 +336,11 @@ static inline void lockdep_on(void) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) -# define lockdep_init_map(lock, name, key, sub) do { (void)(key); } while (0) +# define lockdep_init_map(lock, name, key, sub) \ + do { (void)(name); (void)(key); } while (0) # define lockdep_set_class(lock, key) do { (void)(key); } while (0) # define lockdep_set_class_and_name(lock, key, name) \ - do { (void)(key); } while (0) + do { (void)(key); (void)(name); } while (0) #define lockdep_set_class_and_subclass(lock, key, sub) \ do { (void)(key); } while (0) #define lockdep_set_subclass(lock, sub) do { } while (0) -- cgit v1.2.3 From e262a7ba31f0cd4dae225e6d2e9037e5ac6108e8 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Sun, 23 Nov 2008 14:34:43 +0000 Subject: irq.h: remove padding from irq_desc on 64bits Impact: reduce struct irq_desc size struct irq_desc: reorder to remove padding on 64bits shrinks irq_desc to 128 bytes which saves data space & cache lines On a generic x86_64/SMP build this reduces the reported data size by 64k. Signed-off-by: Richard Kennedy Signed-off-by: Ingo Molnar --- include/linux/irq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index d058c57be02..838a977885e 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -142,8 +142,8 @@ struct irq_chip { * @depth: disable-depth, for nested irq_disable() calls * @wake_depth: enable depth, for multiple set_irq_wake() callers * @irq_count: stats field to detect stalled irqs - * @irqs_unhandled: stats field for spurious unhandled interrupts * @last_unhandled: aging timer for unhandled count + * @irqs_unhandled: stats field for spurious unhandled interrupts * @lock: locking for SMP * @affinity: IRQ affinity on SMP * @cpu: cpu index useful for balancing @@ -165,8 +165,8 @@ struct irq_desc { unsigned int depth; /* nested irq disables */ unsigned int wake_depth; /* nested wake enables */ unsigned int irq_count; /* For detecting broken IRQs */ - unsigned int irqs_unhandled; unsigned long last_unhandled; /* Aging timer for unhandled count */ + unsigned int irqs_unhandled; spinlock_t lock; #ifdef CONFIG_SMP cpumask_t affinity; -- cgit v1.2.3 From 1acdac104668a0834cfa267de9946fac7764d486 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Thu, 20 Nov 2008 10:02:53 -0800 Subject: futex: make clock selectable for FUTEX_WAIT_BITSET FUTEX_WAIT_BITSET could be used instead of FUTEX_WAIT by setting the bit set to FUTEX_BITSET_MATCH_ANY, but FUTEX_WAIT uses CLOCK_REALTIME while FUTEX_WAIT_BITSET uses CLOCK_MONOTONIC. Add a flag to select CLOCK_REALTIME for FUTEX_WAIT_BITSET so glibc can replace the FUTEX_WAIT logic which needs to do gettimeofday() calls before and after the syscall to convert the absolute timeout to a relative timeout for FUTEX_WAIT. Signed-off-by: Thomas Gleixner Cc: Ulrich Drepper --- include/linux/futex.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/futex.h b/include/linux/futex.h index 8f627b9ae2b..3bf5bb5a34f 100644 --- a/include/linux/futex.h +++ b/include/linux/futex.h @@ -25,7 +25,8 @@ union ktime; #define FUTEX_WAKE_BITSET 10 #define FUTEX_PRIVATE_FLAG 128 -#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG +#define FUTEX_CLOCK_REALTIME 256 +#define FUTEX_CMD_MASK ~(FUTEX_PRIVATE_FLAG | FUTEX_CLOCK_REALTIME) #define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) #define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) -- cgit v1.2.3 From ca109491f612aab5c8152207631c0444f63da97f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Tue, 25 Nov 2008 12:43:51 +0100 Subject: hrtimer: removing all ur callback modes Impact: cleanup, move all hrtimer processing into hardirq context This is an attempt at removing some of the hrtimer complexity by reducing the number of callback modes to 1. This means that all hrtimer callback functions will be ran from HARD-irq context. I went through all the 30 odd hrtimer callback functions in the kernel and saw only one that I'm not quite sure of, which is the one in net/can/bcm.c - hence I'm CC-ing the folks responsible for that code. Furthermore, the hrtimer core now calls callbacks directly with IRQs disabled in case you try to enqueue an expired timer. If this timer is a periodic timer (which should use hrtimer_forward() to advance its time) then it might be possible to end up in an inf. recursive loop due to the fact that hrtimer_forward() doesn't round up to the next timer granularity, and therefore keeps on calling the callback - obviously this needs a fix. Aside from that, this seems to compile and actually boot on my dual core test box - although I'm sure there are some bugs in, me not hitting any makes me certain :-) Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/hrtimer.h | 34 ++-------------------------------- include/linux/interrupt.h | 3 --- 2 files changed, 2 insertions(+), 35 deletions(-) (limited to 'include/linux') diff --git a/include/linux/hrtimer.h b/include/linux/hrtimer.h index 3eba43878dc..bd37078c2d7 100644 --- a/include/linux/hrtimer.h +++ b/include/linux/hrtimer.h @@ -42,26 +42,6 @@ enum hrtimer_restart { HRTIMER_RESTART, /* Timer must be restarted */ }; -/* - * hrtimer callback modes: - * - * HRTIMER_CB_SOFTIRQ: Callback must run in softirq context - * HRTIMER_CB_IRQSAFE_PERCPU: Callback must run in hardirq context - * Special mode for tick emulation and - * scheduler timer. Such timers are per - * cpu and not allowed to be migrated on - * cpu unplug. - * HRTIMER_CB_IRQSAFE_UNLOCKED: Callback should run in hardirq context - * with timer->base lock unlocked - * used for timers which call wakeup to - * avoid lock order problems with rq->lock - */ -enum hrtimer_cb_mode { - HRTIMER_CB_SOFTIRQ, - HRTIMER_CB_IRQSAFE_PERCPU, - HRTIMER_CB_IRQSAFE_UNLOCKED, -}; - /* * Values to track state of the timer * @@ -70,7 +50,6 @@ enum hrtimer_cb_mode { * 0x00 inactive * 0x01 enqueued into rbtree * 0x02 callback function running - * 0x04 callback pending (high resolution mode) * * Special cases: * 0x03 callback function running and enqueued @@ -92,8 +71,7 @@ enum hrtimer_cb_mode { #define HRTIMER_STATE_INACTIVE 0x00 #define HRTIMER_STATE_ENQUEUED 0x01 #define HRTIMER_STATE_CALLBACK 0x02 -#define HRTIMER_STATE_PENDING 0x04 -#define HRTIMER_STATE_MIGRATE 0x08 +#define HRTIMER_STATE_MIGRATE 0x04 /** * struct hrtimer - the basic hrtimer structure @@ -109,8 +87,6 @@ enum hrtimer_cb_mode { * @function: timer expiry callback function * @base: pointer to the timer base (per cpu and per clock) * @state: state information (See bit values above) - * @cb_mode: high resolution timer feature to select the callback execution - * mode * @cb_entry: list head to enqueue an expired timer into the callback list * @start_site: timer statistics field to store the site where the timer * was started @@ -129,7 +105,6 @@ struct hrtimer { struct hrtimer_clock_base *base; unsigned long state; struct list_head cb_entry; - enum hrtimer_cb_mode cb_mode; #ifdef CONFIG_TIMER_STATS int start_pid; void *start_site; @@ -188,15 +163,11 @@ struct hrtimer_clock_base { * @check_clocks: Indictator, when set evaluate time source and clock * event devices whether high resolution mode can be * activated. - * @cb_pending: Expired timers are moved from the rbtree to this - * list in the timer interrupt. The list is processed - * in the softirq. * @nr_events: Total number of timer interrupt events */ struct hrtimer_cpu_base { spinlock_t lock; struct hrtimer_clock_base clock_base[HRTIMER_MAX_CLOCK_BASES]; - struct list_head cb_pending; #ifdef CONFIG_HIGH_RES_TIMERS ktime_t expires_next; int hres_active; @@ -404,8 +375,7 @@ static inline int hrtimer_active(const struct hrtimer *timer) */ static inline int hrtimer_is_queued(struct hrtimer *timer) { - return timer->state & - (HRTIMER_STATE_ENQUEUED | HRTIMER_STATE_PENDING); + return timer->state & HRTIMER_STATE_ENQUEUED; } /* diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f58a0cf8929..d6210a97a8c 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -251,9 +251,6 @@ enum BLOCK_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, -#ifdef CONFIG_HIGH_RES_TIMERS - HRTIMER_SOFTIRQ, -#endif RCU_SOFTIRQ, /* Preferable RCU should always be the last softirq */ NR_SOFTIRQS -- cgit v1.2.3 From ce71e27c6fdc43c29f36d307b9100bde70c947fc Mon Sep 17 00:00:00 2001 From: Eduard - Gabriel Munteanu Date: Tue, 19 Aug 2008 20:43:25 +0300 Subject: SLUB: Replace __builtin_return_address(0) with _RET_IP_. This patch replaces __builtin_return_address(0) with _RET_IP_, since a previous patch moved _RET_IP_ and _THIS_IP_ to include/linux/kernel.h and they're widely available now. This makes for shorter and easier to read code. [penberg@cs.helsinki.fi: remove _RET_IP_ casts to void pointer] Signed-off-by: Eduard - Gabriel Munteanu Signed-off-by: Pekka Enberg --- include/linux/slab.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 000da12b5cf..c97ed28559e 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -253,9 +253,9 @@ static inline void *kmem_cache_alloc_node(struct kmem_cache *cachep, * request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_track_caller(size_t, gfp_t, void*); +extern void *__kmalloc_track_caller(size_t, gfp_t, unsigned long); #define kmalloc_track_caller(size, flags) \ - __kmalloc_track_caller(size, flags, __builtin_return_address(0)) + __kmalloc_track_caller(size, flags, _RET_IP_) #else #define kmalloc_track_caller(size, flags) \ __kmalloc(size, flags) @@ -271,10 +271,10 @@ extern void *__kmalloc_track_caller(size_t, gfp_t, void*); * allocation request comes from. */ #if defined(CONFIG_DEBUG_SLAB) || defined(CONFIG_SLUB) -extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *); +extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, unsigned long); #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node_track_caller(size, flags, node, \ - __builtin_return_address(0)) + _RET_IP_) #else #define kmalloc_node_track_caller(size, flags, node) \ __kmalloc_node(size, flags, node) -- cgit v1.2.3 From 8b752e3ef6e3f5cde87afc649dd51d92b1e549c1 Mon Sep 17 00:00:00 2001 From: Liming Wang Date: Fri, 28 Nov 2008 09:52:40 +0800 Subject: softirq: remove useless function __local_bh_enable Impact: remove unused code __local_bh_enable has been replaced with _local_bh_enable. As comments says "it always nests inside local_bh_enable() sections" has not been valid now. Also there is no reason to use __local_bh_enable anywhere, so we can remove this useless function. Signed-off-by: Liming Wang Signed-off-by: Ingo Molnar --- include/linux/bottom_half.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index 777dbf695d4..27b1bcffe40 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -2,7 +2,6 @@ #define _LINUX_BH_H extern void local_bh_disable(void); -extern void __local_bh_enable(void); extern void _local_bh_enable(void); extern void local_bh_enable(void); extern void local_bh_enable_ip(unsigned long ip); -- cgit v1.2.3 From 00ef9f7348dfd2fc223ec42aceb30836e86b367f Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 4 Dec 2008 09:00:17 +0100 Subject: lockdep: change a held lock's class Impact: introduce new lockdep API Allow to change a held lock's class. Basically the same as the existing code to change a subclass therefore reuse all that. The XFS code will be able to use this to annotate their inode locking. Signed-off-by: Peter Zijlstra Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 8956daf64ab..37a0361f468 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -314,8 +314,15 @@ extern void lock_acquire(struct lockdep_map *lock, unsigned int subclass, extern void lock_release(struct lockdep_map *lock, int nested, unsigned long ip); -extern void lock_set_subclass(struct lockdep_map *lock, unsigned int subclass, - unsigned long ip); +extern void lock_set_class(struct lockdep_map *lock, const char *name, + struct lock_class_key *key, unsigned int subclass, + unsigned long ip); + +static inline void lock_set_subclass(struct lockdep_map *lock, + unsigned int subclass, unsigned long ip) +{ + lock_set_class(lock, lock->name, lock->key, subclass, ip); +} # define INIT_LOCKDEP .lockdep_recursion = 0, @@ -333,6 +340,7 @@ static inline void lockdep_on(void) # define lock_acquire(l, s, t, r, c, n, i) do { } while (0) # define lock_release(l, n, i) do { } while (0) +# define lock_set_class(l, n, k, s, i) do { } while (0) # define lock_set_subclass(l, s, i) do { } while (0) # define lockdep_init() do { } while (0) # define lockdep_info() do { } while (0) -- cgit v1.2.3 From c9bb6003dd096ad190e1594a7d835ae1c39fae8f Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 29 Oct 2008 23:46:09 -0700 Subject: of: Fix comment, sparc no longer uses of_device objects on special busses. It only uses of_platform_bus_type. Signed-off-by: David S. Miller --- include/linux/of_platform.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/of_platform.h b/include/linux/of_platform.h index a8efcfeea73..3d327b67d7e 100644 --- a/include/linux/of_platform.h +++ b/include/linux/of_platform.h @@ -26,8 +26,7 @@ extern struct bus_type of_platform_bus_type; /* * An of_platform_driver driver is attached to a basic of_device on - * the "platform bus" (of_platform_bus_type) (or ISA, EBUS and SBUS - * busses on sparc). + * the "platform bus" (of_platform_bus_type). */ struct of_platform_driver { -- cgit v1.2.3 From 0b8f1efad30bd58f89961b82dfe68b9edf8fd2ac Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 5 Dec 2008 18:58:31 -0800 Subject: sparse irq_desc[] array: core kernel and x86 changes Impact: new feature Problem on distro kernels: irq_desc[NR_IRQS] takes megabytes of RAM with NR_CPUS set to large values. The goal is to be able to scale up to much larger NR_IRQS value without impacting the (important) common case. To solve this, we generalize irq_desc[NR_IRQS] to an (optional) array of irq_desc pointers. When CONFIG_SPARSE_IRQ=y is used, we use kzalloc_node to get irq_desc, this also makes the IRQ descriptors NUMA-local (to the site that calls request_irq()). This gets rid of the irq_cfg[] static array on x86 as well: irq_cfg now uses desc->chip_data for x86 to store irq_cfg. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/interrupt.h | 2 ++ include/linux/irq.h | 54 ++++++++++++++++++++++++++++++++++++++++++++- include/linux/irqnr.h | 14 +++--------- include/linux/kernel_stat.h | 14 +++++++++++- include/linux/random.h | 51 ++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 122 insertions(+), 13 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index f58a0cf8929..79e915e7e8a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -18,6 +18,8 @@ #include #include +extern int nr_irqs; + /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When diff --git a/include/linux/irq.h b/include/linux/irq.h index 3dddfa703eb..63b00439d4d 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -129,6 +129,8 @@ struct irq_chip { const char *typename; }; +struct timer_rand_state; +struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @irq: interrupt number for this descriptor @@ -154,6 +156,13 @@ struct irq_chip { */ struct irq_desc { unsigned int irq; +#ifdef CONFIG_SPARSE_IRQ + struct timer_rand_state *timer_rand_state; + unsigned int *kstat_irqs; +# ifdef CONFIG_INTR_REMAP + struct irq_2_iommu *irq_2_iommu; +# endif +#endif irq_flow_handler_t handle_irq; struct irq_chip *chip; struct msi_desc *msi_desc; @@ -181,14 +190,52 @@ struct irq_desc { const char *name; } ____cacheline_internodealigned_in_smp; +extern void early_irq_init(void); +extern void arch_early_irq_init(void); +extern void arch_init_chip_data(struct irq_desc *desc, int cpu); +extern void arch_init_copy_chip_data(struct irq_desc *old_desc, + struct irq_desc *desc, int cpu); +extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); + +#ifndef CONFIG_SPARSE_IRQ extern struct irq_desc irq_desc[NR_IRQS]; static inline struct irq_desc *irq_to_desc(unsigned int irq) { - return (irq < nr_irqs) ? irq_desc + irq : NULL; + return (irq < NR_IRQS) ? irq_desc + irq : NULL; +} +static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) +{ + return irq_to_desc(irq); } +#ifdef CONFIG_GENERIC_HARDIRQS +# define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ + irq >= 0; irq--, desc--) +#endif + +#else + +extern struct irq_desc *irq_to_desc(unsigned int irq); +extern struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu); +extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); + +# define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_to_desc(irq); irq < nr_irqs; irq++, desc = irq_to_desc(irq)) +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1, desc = irq_to_desc(irq); irq >= 0; irq--, desc = irq_to_desc(irq)) + +#define kstat_irqs_this_cpu(DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]) +#define kstat_incr_irqs_this_cpu(irqno, DESC) \ + ((DESC)->kstat_irqs[smp_processor_id()]++) + +#endif + /* * Migration helpers for obsolete names, they will go away: */ @@ -380,6 +427,11 @@ extern int set_irq_msi(unsigned int irq, struct msi_desc *entry); #define get_irq_data(irq) (irq_to_desc(irq)->handler_data) #define get_irq_msi(irq) (irq_to_desc(irq)->msi_desc) +#define get_irq_desc_chip(desc) ((desc)->chip) +#define get_irq_desc_chip_data(desc) ((desc)->chip_data) +#define get_irq_desc_data(desc) ((desc)->handler_data) +#define get_irq_desc_msi(desc) ((desc)->msi_desc) + #endif /* CONFIG_GENERIC_HARDIRQS */ #endif /* !CONFIG_S390 */ diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 452c280c811..7a299e989f8 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -7,18 +7,10 @@ # define for_each_irq_desc(irq, desc) \ for (irq = 0; irq < nr_irqs; irq++) -#else -extern int nr_irqs; -# define for_each_irq_desc(irq, desc) \ - for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) - -# define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ - irq >= 0; irq--, desc--) +static inline early_sparse_irq_init(void) +{ +} #endif -#define for_each_irq_nr(irq) \ - for (irq = 0; irq < nr_irqs; irq++) - #endif diff --git a/include/linux/kernel_stat.h b/include/linux/kernel_stat.h index 4a145caeee0..4ee4b3d2316 100644 --- a/include/linux/kernel_stat.h +++ b/include/linux/kernel_stat.h @@ -28,7 +28,9 @@ struct cpu_usage_stat { struct kernel_stat { struct cpu_usage_stat cpustat; - unsigned int irqs[NR_IRQS]; +#ifndef CONFIG_SPARSE_IRQ + unsigned int irqs[NR_IRQS]; +#endif }; DECLARE_PER_CPU(struct kernel_stat, kstat); @@ -39,6 +41,10 @@ DECLARE_PER_CPU(struct kernel_stat, kstat); extern unsigned long long nr_context_switches(void); +#ifndef CONFIG_SPARSE_IRQ +#define kstat_irqs_this_cpu(irq) \ + (kstat_this_cpu.irqs[irq]) + struct irq_desc; static inline void kstat_incr_irqs_this_cpu(unsigned int irq, @@ -46,11 +52,17 @@ static inline void kstat_incr_irqs_this_cpu(unsigned int irq, { kstat_this_cpu.irqs[irq]++; } +#endif + +#ifndef CONFIG_SPARSE_IRQ static inline unsigned int kstat_irqs_cpu(unsigned int irq, int cpu) { return kstat_cpu(cpu).irqs[irq]; } +#else +extern unsigned int kstat_irqs_cpu(unsigned int irq, int cpu); +#endif /* * Number of interrupts per specific IRQ source, since bootup diff --git a/include/linux/random.h b/include/linux/random.h index 36f125c0c60..ad9daa2374d 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -44,6 +44,57 @@ struct rand_pool_info { extern void rand_initialize_irq(int irq); +struct timer_rand_state; +#ifndef CONFIG_SPARSE_IRQ + +extern struct timer_rand_state *irq_timer_state[]; + +extern int nr_irqs; +static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq) +{ + if (irq >= nr_irqs) + return NULL; + + return irq_timer_state[irq]; +} + +static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) +{ + if (irq >= nr_irqs) + return; + + irq_timer_state[irq] = state; +} + +#else + +#include +static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if (!desc) + return NULL; + + return desc->timer_rand_state; +} + +static inline void set_timer_rand_state(unsigned int irq, struct timer_rand_state *state) +{ + struct irq_desc *desc; + + desc = irq_to_desc(irq); + + if (!desc) + return; + + desc->timer_rand_state = state; +} +#endif + + extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value); extern void add_interrupt_randomness(int irq); -- cgit v1.2.3 From 3145e941fcfe2548fa2270afb1a05bab3a6bc418 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Fri, 5 Dec 2008 18:58:34 -0800 Subject: x86, MSI: pass irq_cfg and irq_desc Impact: simplify code Pass irq_desc and cfg around, instead of raw IRQ numbers - this way we dont have to look it up again and again. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/msi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/msi.h b/include/linux/msi.h index 8f293922720..d2b8a1e8ca1 100644 --- a/include/linux/msi.h +++ b/include/linux/msi.h @@ -10,8 +10,11 @@ struct msi_msg { }; /* Helper functions */ +struct irq_desc; extern void mask_msi_irq(unsigned int irq); extern void unmask_msi_irq(unsigned int irq); +extern void read_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); +extern void write_msi_msg_desc(struct irq_desc *desc, struct msi_msg *msg); extern void read_msi_msg(unsigned int irq, struct msi_msg *msg); extern void write_msi_msg(unsigned int irq, struct msi_msg *msg); -- cgit v1.2.3 From 240d367b4e6c6e3c5075e034db14dba60a6f5fa7 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Mon, 8 Dec 2008 14:06:17 -0800 Subject: sparseirq: fix Alpha build failure Impact: build fix on Alpha -tip testing found this build failure on the Alpha defconfig: /home/mingo/tip/fs/proc/stat.c: In function 'show_stat': /home/mingo/tip/fs/proc/stat.c:48: error: implicit declaration of function 'for_each_irq_desc' /home/mingo/tip/fs/proc/stat.c:48: error: expected ';' before '{' token can not use irq_desc() in stat.c on older architectures. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 9 --------- include/linux/irqnr.h | 19 ++++++++++++++++--- 2 files changed, 16 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 63b00439d4d..b5749db3e5a 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -198,7 +198,6 @@ extern void arch_init_copy_chip_data(struct irq_desc *old_desc, extern void arch_free_chip_data(struct irq_desc *old_desc, struct irq_desc *desc); #ifndef CONFIG_SPARSE_IRQ - extern struct irq_desc irq_desc[NR_IRQS]; static inline struct irq_desc *irq_to_desc(unsigned int irq) @@ -210,14 +209,6 @@ static inline struct irq_desc *irq_to_desc_alloc_cpu(unsigned int irq, int cpu) return irq_to_desc(irq); } -#ifdef CONFIG_GENERIC_HARDIRQS -# define for_each_irq_desc(irq, desc) \ - for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) -# define for_each_irq_desc_reverse(irq, desc) \ - for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ - irq >= 0; irq--, desc--) -#endif - #else extern struct irq_desc *irq_to_desc(unsigned int irq); diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 7a299e989f8..13754f81358 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -8,9 +8,22 @@ # define for_each_irq_desc(irq, desc) \ for (irq = 0; irq < nr_irqs; irq++) -static inline early_sparse_irq_init(void) -{ -} +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1; irq >= 0; irq--) +#else +#ifndef CONFIG_SPARSE_IRQ + +struct irq_desc; +extern int nr_irqs; +# define for_each_irq_desc(irq, desc) \ + for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) +# define for_each_irq_desc_reverse(irq, desc) \ + for (irq = nr_irqs - 1, desc = irq_desc + (nr_irqs - 1); \ + irq >= 0; irq--, desc--) #endif +#endif + +#define for_each_irq_nr(irq) \ + for (irq = 0; irq < nr_irqs; irq++) #endif -- cgit v1.2.3 From 58494487581cb143a0d763e3056a894d5009d60a Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 26 Nov 2008 12:02:53 +0100 Subject: oprofile: update comment for oprofile_add_sample() The cpu argument is no longer part of the parameter list. Signed-off-by: Robert Richter --- include/linux/oprofile.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 5231861f357..1ce9fe572e5 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -86,8 +86,7 @@ int oprofile_arch_init(struct oprofile_operations * ops); void oprofile_arch_exit(void); /** - * Add a sample. This may be called from any context. Pass - * smp_processor_id() as cpu. + * Add a sample. This may be called from any context. */ void oprofile_add_sample(struct pt_regs * const regs, unsigned long event); -- cgit v1.2.3 From e09373f22e76cc048ca5fe10a9ff9012f5d64309 Mon Sep 17 00:00:00 2001 From: Robert Richter Date: Wed, 26 Nov 2008 14:04:19 +0100 Subject: ring_buffer: add remaining cpu functions to ring_buffer.h These functions are not yet in ring_buffer.h though they seems to be part of the API. Cc: Steven Rostedt Signed-off-by: Robert Richter --- include/linux/ring_buffer.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ring_buffer.h b/include/linux/ring_buffer.h index e097c2e6b6d..de9d8c12e5e 100644 --- a/include/linux/ring_buffer.h +++ b/include/linux/ring_buffer.h @@ -116,6 +116,8 @@ void ring_buffer_record_enable_cpu(struct ring_buffer *buffer, int cpu); unsigned long ring_buffer_entries(struct ring_buffer *buffer); unsigned long ring_buffer_overruns(struct ring_buffer *buffer); +unsigned long ring_buffer_entries_cpu(struct ring_buffer *buffer, int cpu); +unsigned long ring_buffer_overrun_cpu(struct ring_buffer *buffer, int cpu); u64 ring_buffer_time_stamp(int cpu); void ring_buffer_normalize_time_stamp(int cpu, u64 *ts); -- cgit v1.2.3 From 4d4be482a4d78ca906f45e99fd9fdb91e907f5ad Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Dec 2008 04:47:33 -0500 Subject: [XFS] add a FMODE flag to make XFS invisible I/O less hacky XFS has a mode called invisble I/O that doesn't update any of the timestamps. It's used for HSM-style applications and exposed through the nasty open by handle ioctl. Instead of doing directly assignment of file operations that set an internal flag for it add a new FMODE_NOCMTIME flag that we can check in the normal file operations. (addition of the generic VFS flag has been ACKed by Al as an interims solution) Signed-off-by: Christoph Hellwig Signed-off-by: Lachlan McIlroy --- include/linux/fs.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 51bd9370d43..965b9ba3865 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -81,6 +81,14 @@ extern int dir_notify_enable; #define FMODE_WRITE_IOCTL ((__force fmode_t)128) #define FMODE_NDELAY_NOW ((__force fmode_t)256) +/* + * Don't update ctime and mtime. + * + * Currently a special hack for the XFS open_by_handle ioctl, but we'll + * hopefully graduate it to a proper O_CMTIME flag supported by open(2) soon. + */ +#define FMODE_NOCMTIME ((__force fmode_t)2048) + #define RW_MASK 1 #define RWA_MASK 2 #define READ 0 -- cgit v1.2.3 From 0ebb26e7a4e2c5337502e98b2221e037fda911b9 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 11:26:39 +0100 Subject: sparse irqs: handle !GENIRQ platforms Impact: build fix fix: In file included from /home/mingo/tip/arch/m68k/amiga/amiints.c:39: /home/mingo/tip/include/linux/interrupt.h:21: error: expected identifier or '(' /home/mingo/tip/arch/m68k/amiga/amiints.c: In function 'amiga_init_IRQ': Signed-off-by: Ingo Molnar --- include/linux/interrupt.h | 4 ++-- include/linux/irqnr.h | 11 ++++++++++- include/linux/random.h | 2 +- 3 files changed, 13 insertions(+), 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 79e915e7e8a..777f89e00b4 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -14,12 +14,12 @@ #include #include #include +#include + #include #include #include -extern int nr_irqs; - /* * These correspond to the IORESOURCE_IRQ_* defines in * linux/ioport.h to select the interrupt line behaviour. When diff --git a/include/linux/irqnr.h b/include/linux/irqnr.h index 13754f81358..95d2b74641f 100644 --- a/include/linux/irqnr.h +++ b/include/linux/irqnr.h @@ -1,6 +1,11 @@ #ifndef _LINUX_IRQNR_H #define _LINUX_IRQNR_H +/* + * Generic irq_desc iterators: + */ +#ifdef __KERNEL__ + #ifndef CONFIG_GENERIC_HARDIRQS #include # define nr_irqs NR_IRQS @@ -11,10 +16,12 @@ # define for_each_irq_desc_reverse(irq, desc) \ for (irq = nr_irqs - 1; irq >= 0; irq--) #else + +extern int nr_irqs; + #ifndef CONFIG_SPARSE_IRQ struct irq_desc; -extern int nr_irqs; # define for_each_irq_desc(irq, desc) \ for (irq = 0, desc = irq_desc; irq < nr_irqs; irq++, desc++) # define for_each_irq_desc_reverse(irq, desc) \ @@ -26,4 +33,6 @@ extern int nr_irqs; #define for_each_irq_nr(irq) \ for (irq = 0; irq < nr_irqs; irq++) +#endif /* __KERNEL__ */ + #endif diff --git a/include/linux/random.h b/include/linux/random.h index ad9daa2374d..adbf3bd3c6b 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -8,6 +8,7 @@ #define _LINUX_RANDOM_H #include +#include /* ioctl()'s for the random number generator */ @@ -49,7 +50,6 @@ struct timer_rand_state; extern struct timer_rand_state *irq_timer_state[]; -extern int nr_irqs; static inline struct timer_rand_state *get_timer_rand_state(unsigned int irq) { if (irq >= nr_irqs) -- cgit v1.2.3 From 30cb367ea2be76bf71dbd275f38d0fd3b6f4142b Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Fri, 12 Dec 2008 12:19:57 +0100 Subject: sparse irqs: add irqnr.h to the user headers list Impact: fix build error /home/mingo/tip/usr/include/linux/random.h:11: included file 'linux/irqnr.h' is not exported Signed-off-by: Ingo Molnar --- include/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/Kbuild b/include/linux/Kbuild index e531783e5d7..95ac82340c3 100644 --- a/include/linux/Kbuild +++ b/include/linux/Kbuild @@ -313,6 +313,7 @@ unifdef-y += ptrace.h unifdef-y += qnx4_fs.h unifdef-y += quota.h unifdef-y += random.h +unifdef-y += irqnr.h unifdef-y += reboot.h unifdef-y += reiserfs_fs.h unifdef-y += reiserfs_xattr.h -- cgit v1.2.3 From 27af4245b6ce99e08c6a7c38825383bf51119cc9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 1 Dec 2008 14:18:13 -0800 Subject: posix-timers: use "struct pid*" instead of "struct task_struct*" Impact: restructure, clean up code k_itimer holds the ref to the ->it_process until sys_timer_delete(). This allows to pin up to RLIMIT_SIGPENDING dead task_struct's. Change the code to use "struct pid *" instead. The patch doesn't kill ->it_process, it places ->it_pid into the union. ->it_process is still used by do_cpu_nanosleep() as before. It would be trivial to change the nanosleep code as well, but since it uses it_process in a special way I think it is better to keep this field for grep. The patch bloats the kernel by 104 bytes and it also adds the new pointer, ->it_signal, to k_itimer. It is used by lock_timer() to verify that the found timer was not created by another process. It is not clear why do we use the global database (and thus the global idr_lock) for posix timers. We still need the signal_struct->posix_timers which contains all useable timers, perhaps it is better to use some form of per-process array instead. Signed-off-by: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/posix-timers.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/posix-timers.h b/include/linux/posix-timers.h index a7c72135554..4f71bf4e628 100644 --- a/include/linux/posix-timers.h +++ b/include/linux/posix-timers.h @@ -45,7 +45,11 @@ struct k_itimer { int it_requeue_pending; /* waiting to requeue this timer */ #define REQUEUE_PENDING 1 int it_sigev_notify; /* notify word of sigevent struct */ - struct task_struct *it_process; /* process to send signal to */ + struct signal_struct *it_signal; + union { + struct pid *it_pid; /* pid of process to send signal to */ + struct task_struct *it_process; /* for clock_nanosleep */ + }; struct sigqueue *sigq; /* signal queue entry. */ union { struct { -- cgit v1.2.3 From c29541b24fb2c6301021637229ae5347c877330a Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Mon, 1 Dec 2008 14:18:11 -0800 Subject: linux/timex.h: cleanup for userspace Impact: fix user-space exported use Move all the kernel-specific defines and includes into the __KERNEL__ section so that they don't get leaked into userspace. [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Mike Frysinger Signed-off-by: Andrew Morton Signed-off-by: Thomas Gleixner --- include/linux/timex.h | 73 ++++++++++++++++++++++++++------------------------- 1 file changed, 37 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/timex.h b/include/linux/timex.h index 9007313b5b7..998a55d80ac 100644 --- a/include/linux/timex.h +++ b/include/linux/timex.h @@ -53,46 +53,10 @@ #ifndef _LINUX_TIMEX_H #define _LINUX_TIMEX_H -#include #include -#include - #define NTP_API 4 /* NTP API version */ -/* - * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen - * for a slightly underdamped convergence characteristic. SHIFT_KH - * establishes the damping of the FLL and is chosen by wisdom and black - * art. - * - * MAXTC establishes the maximum time constant of the PLL. With the - * SHIFT_KG and SHIFT_KF values given and a time constant range from - * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours, - * respectively. - */ -#define SHIFT_PLL 4 /* PLL frequency factor (shift) */ -#define SHIFT_FLL 2 /* FLL frequency factor (shift) */ -#define MAXTC 10 /* maximum time constant (shift) */ - -/* - * SHIFT_USEC defines the scaling (shift) of the time_freq and - * time_tolerance variables, which represent the current frequency - * offset and maximum frequency tolerance. - */ -#define SHIFT_USEC 16 /* frequency offset scale (shift) */ -#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) -#define PPM_SCALE_INV_SHIFT 19 -#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ - PPM_SCALE + 1) - -#define MAXPHASE 500000000l /* max phase error (ns) */ -#define MAXFREQ 500000 /* max frequency error (ns/s) */ -#define MAXFREQ_SCALED ((s64)MAXFREQ << NTP_SCALE_SHIFT) -#define MINSEC 256 /* min interval between updates (s) */ -#define MAXSEC 2048 /* max interval between updates (s) */ -#define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */ - /* * syscall interface - used (mainly by NTP daemon) * to discipline kernel clock oscillator @@ -199,8 +163,45 @@ struct timex { #define TIME_BAD TIME_ERROR /* bw compat */ #ifdef __KERNEL__ +#include +#include +#include + #include +/* + * SHIFT_KG and SHIFT_KF establish the damping of the PLL and are chosen + * for a slightly underdamped convergence characteristic. SHIFT_KH + * establishes the damping of the FLL and is chosen by wisdom and black + * art. + * + * MAXTC establishes the maximum time constant of the PLL. With the + * SHIFT_KG and SHIFT_KF values given and a time constant range from + * zero to MAXTC, the PLL will converge in 15 minutes to 16 hours, + * respectively. + */ +#define SHIFT_PLL 4 /* PLL frequency factor (shift) */ +#define SHIFT_FLL 2 /* FLL frequency factor (shift) */ +#define MAXTC 10 /* maximum time constant (shift) */ + +/* + * SHIFT_USEC defines the scaling (shift) of the time_freq and + * time_tolerance variables, which represent the current frequency + * offset and maximum frequency tolerance. + */ +#define SHIFT_USEC 16 /* frequency offset scale (shift) */ +#define PPM_SCALE (NSEC_PER_USEC << (NTP_SCALE_SHIFT - SHIFT_USEC)) +#define PPM_SCALE_INV_SHIFT 19 +#define PPM_SCALE_INV ((1ll << (PPM_SCALE_INV_SHIFT + NTP_SCALE_SHIFT)) / \ + PPM_SCALE + 1) + +#define MAXPHASE 500000000l /* max phase error (ns) */ +#define MAXFREQ 500000 /* max frequency error (ns/s) */ +#define MAXFREQ_SCALED ((s64)MAXFREQ << NTP_SCALE_SHIFT) +#define MINSEC 256 /* min interval between updates (s) */ +#define MAXSEC 2048 /* max interval between updates (s) */ +#define NTP_PHASE_LIMIT ((MAXPHASE / NSEC_PER_USEC) << 5) /* beyond max. dispersion */ + /* * kernel variables * Note: maximum error = NTP synch distance = dispersion + delay / 2; -- cgit v1.2.3 From b690ace50be7d10d77cb7a6d5ef1bd9de649852f Mon Sep 17 00:00:00 2001 From: Ben Dooks Date: Tue, 21 Oct 2008 14:07:03 +0100 Subject: [ARM] S3C6400: serial support for S3C6400 and S3C6410 SoCs Add support to the Samsung serial driver for the S3C6400 and S3C6410 serial ports. Signed-off-by: Ben Dooks --- include/linux/serial_core.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/serial_core.h b/include/linux/serial_core.h index 4e4f1277f3b..feb3b939ec4 100644 --- a/include/linux/serial_core.h +++ b/include/linux/serial_core.h @@ -158,6 +158,8 @@ /* SH-SCI */ #define PORT_SCIFA 83 +#define PORT_S3C6400 84 + #ifdef __KERNEL__ #include -- cgit v1.2.3 From 8c5df16bec8a60bb8589fc232b9e26cac0ed4b2c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Tue, 16 Dec 2008 12:17:26 -0800 Subject: swiotlb: allow architectures to override swiotlb pool allocation Impact: generalize swiotlb allocation code Architectures may need to allocate memory specially for use with the swiotlb. Create the weak function swiotlb_alloc_boot() and swiotlb_alloc() defaulting to the current behaviour. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ian Campbell Signed-off-by: Ingo Molnar --- include/linux/swiotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b18ec5533e8..b8c5fc766a5 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -10,6 +10,9 @@ struct scatterlist; extern void swiotlb_init(void); +extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs); +extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); + extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); -- cgit v1.2.3 From 0016fdee927f7aa0f428494bcf11ae60c7470a02 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 16 Dec 2008 12:17:27 -0800 Subject: swiotlb: move some definitions to header Impact: cleanup Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/linux/swiotlb.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index b8c5fc766a5..58b996a642f 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -7,6 +7,20 @@ struct device; struct dma_attrs; struct scatterlist; +/* + * Maximum allowable number of contiguous slabs to map, + * must be a power of 2. What is the appropriate value ? + * The complexity of {map,unmap}_single is linearly dependent on this value. + */ +#define IO_TLB_SEGSIZE 128 + + +/* + * log of the size of each IO TLB slab. The number of slabs is command line + * controllable. + */ +#define IO_TLB_SHIFT 11 + extern void swiotlb_init(void); -- cgit v1.2.3 From 48a1b10aff588833b73994704c47bbd0deb73e9c Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 11 Dec 2008 00:15:01 -0800 Subject: x86, sparseirq: move irq_desc according to smp_affinity, v7 Impact: improve NUMA handling by migrating irq_desc on smp_affinity changes if CONFIG_NUMA_MIGRATE_IRQ_DESC is set: - make irq_desc to go with affinity aka irq_desc moving etc - call move_irq_desc in irq_complete_move() - legacy irq_desc is not moved, because they are allocated via static array for logical apic mode, need to add move_desc_in_progress_in_same_domain, otherwise it will not be moved ==> also could need two phases to get irq_desc moved. Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- include/linux/irq.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index b5749db3e5a..36a01574678 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -227,6 +227,16 @@ extern struct irq_desc *move_irq_desc(struct irq_desc *old_desc, int cpu); #endif +static inline struct irq_desc * +irq_remap_to_desc(unsigned int irq, struct irq_desc *desc) +{ +#ifdef CONFIG_NUMA_MIGRATE_IRQ_DESC + return irq_to_desc(irq); +#else + return desc; +#endif +} + /* * Migration helpers for obsolete names, they will go away: */ -- cgit v1.2.3 From e08e1f7adba522378e8d2ae941bf25443866136d Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 16 Dec 2008 12:17:30 -0800 Subject: swiotlb: allow architectures to override phys<->bus<->phys conversions Impact: generalize phys<->bus<->phys conversions in the swiotlb code Architectures may need to override these conversions. Implement a __weak hook point containing the default implementation. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/linux/swiotlb.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 58b996a642f..694f1839cbc 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -27,6 +27,9 @@ swiotlb_init(void); extern void *swiotlb_alloc_boot(size_t bytes, unsigned long nslabs); extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); +extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address); +extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); + extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); -- cgit v1.2.3 From b81ea27b2329bf44b30c427800954f845896d476 Mon Sep 17 00:00:00 2001 From: Ian Campbell Date: Tue, 16 Dec 2008 12:17:31 -0800 Subject: swiotlb: add arch hook to force mapping Impact: generalize the sw-IOTLB range checks Some architectures require special rules to determine whether a range needs mapping or not. This adds a weak function for architectures to override. Signed-off-by: Ian Campbell Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- include/linux/swiotlb.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/swiotlb.h b/include/linux/swiotlb.h index 694f1839cbc..325af1de035 100644 --- a/include/linux/swiotlb.h +++ b/include/linux/swiotlb.h @@ -30,6 +30,8 @@ extern void *swiotlb_alloc(unsigned order, unsigned long nslabs); extern dma_addr_t swiotlb_phys_to_bus(phys_addr_t address); extern phys_addr_t swiotlb_bus_to_phys(dma_addr_t address); +extern int swiotlb_arch_range_needs_mapping(void *ptr, size_t size); + extern void *swiotlb_alloc_coherent(struct device *hwdev, size_t size, dma_addr_t *dma_handle, gfp_t flags); -- cgit v1.2.3 From 74c8a6130486bed224e960790f4aa72dd09c061e Mon Sep 17 00:00:00 2001 From: KOSAKI Motohiro Date: Wed, 17 Dec 2008 19:40:33 +0900 Subject: locking, irq: enclose irq_desc_lock_class in CONFIG_LOCKDEP Impact: simplify code commit "08678b0: generic: sparse irqs: use irq_desc() [...]" introduced the irq_desc_lock_class variable. But it is used only if CONFIG_SPARSE_IRQ=Y or CONFIG_TRACE_IRQFLAGS=Y. Otherwise, following warnings happen: CC kernel/irq/handle.o kernel/irq/handle.c:26: warning: 'irq_desc_lock_class' defined but not used Actually, current early_init_irq_lock_class has a bit strange and messy ifdef. In addition, it is not valueable. 1. this function is protected by !CONFIG_SPARSE_IRQ, but that is not necessary. if CONFIG_SPARSE_IRQ=Y, desc of all irq number are initialized by NULL at first - then this function calling is safe. 2. this function protected by CONFIG_TRACE_IRQFLAGS too. but it is not necessary either, because lockdep_set_class() doesn't have bad side effect even if CONFIG_TRACE_IRQFLAGS=n. This patch bloat kernel size a bit on CONFIG_TRACE_IRQFLAGS=n and CONFIG_SPARSE_IRQ=Y - but that's ok. early_init_irq_lock_class() is not a fastpatch at all. To avoid messy ifdefs is more important than a few bytes diet. Signed-off-by: KOSAKI Motohiro Signed-off-by: Ingo Molnar --- include/linux/lockdep.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h index 29aec6e1002..9dba554c802 100644 --- a/include/linux/lockdep.h +++ b/include/linux/lockdep.h @@ -377,7 +377,7 @@ do { \ #endif /* CONFIG_LOCK_STAT */ -#if defined(CONFIG_TRACE_IRQFLAGS) && defined(CONFIG_GENERIC_HARDIRQS) +#ifdef CONFIG_GENERIC_HARDIRQS extern void early_init_irq_lock_class(void); #else static inline void early_init_irq_lock_class(void) -- cgit v1.2.3 From 64db4cfff99c04cd5f550357edcc8780f96b54a2 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 18 Dec 2008 21:55:32 +0100 Subject: "Tree RCU": scalable classic RCU implementation This patch fixes a long-standing performance bug in classic RCU that results in massive internal-to-RCU lock contention on systems with more than a few hundred CPUs. Although this patch creates a separate flavor of RCU for ease of review and patch maintenance, it is intended to replace classic RCU. This patch still handles stress better than does mainline, so I am still calling it ready for inclusion. This patch is against the -tip tree. Nevertheless, experience on an actual 1000+ CPU machine would still be most welcome. Most of the changes noted below were found while creating an rcutiny (which should permit ejecting the current rcuclassic) and while doing detailed line-by-line documentation. Updates from v9 (http://lkml.org/lkml/2008/12/2/334): o Fixes from remainder of line-by-line code walkthrough, including comment spelling, initialization, undesirable narrowing due to type conversion, removing redundant memory barriers, removing redundant local-variable initialization, and removing redundant local variables. I do not believe that any of these fixes address the CPU-hotplug issues that Andi Kleen was seeing, but please do give it a whirl in case the machine is smarter than I am. A writeup from the walkthrough may be found at the following URL, in case you are suffering from terminal insomnia or masochism: http://www.kernel.org/pub/linux/kernel/people/paulmck/tmp/rcutree-walkthrough.2008.12.16a.pdf o Made rcutree tracing use seq_file, as suggested some time ago by Lai Jiangshan. o Added a .csv variant of the rcudata debugfs trace file, to allow people having thousands of CPUs to drop the data into a spreadsheet. Tested with oocalc and gnumeric. Updated documentation to suit. Updates from v8 (http://lkml.org/lkml/2008/11/15/139): o Fix a theoretical race between grace-period initialization and force_quiescent_state() that could occur if more than three jiffies were required to carry out the grace-period initialization. Which it might, if you had enough CPUs. o Apply Ingo's printk-standardization patch. o Substitute local variables for repeated accesses to global variables. o Fix comment misspellings and redundant (but harmless) increments of ->n_rcu_pending (this latter after having explicitly added it). o Apply checkpatch fixes. Updates from v7 (http://lkml.org/lkml/2008/10/10/291): o Fixed a number of problems noted by Gautham Shenoy, including the cpu-stall-detection bug that he was having difficulty convincing me was real. ;-) o Changed cpu-stall detection to wait for ten seconds rather than three in order to reduce false positive, as suggested by Ingo Molnar. o Produced a design document (http://lwn.net/Articles/305782/). The act of writing this document uncovered a number of both theoretical and "here and now" bugs as noted below. o Fix dynticks_nesting accounting confusion, simplify WARN_ON() condition, fix kerneldoc comments, and add memory barriers in dynticks interface functions. o Add more data to tracing. o Remove unused "rcu_barrier" field from rcu_data structure. o Count calls to rcu_pending() from scheduling-clock interrupt to use as a surrogate timebase should jiffies stop counting. o Fix a theoretical race between force_quiescent_state() and grace-period initialization. Yes, initialization does have to go on for some jiffies for this race to occur, but given enough CPUs... Updates from v6 (http://lkml.org/lkml/2008/9/23/448): o Fix a number of checkpatch.pl complaints. o Apply review comments from Ingo Molnar and Lai Jiangshan on the stall-detection code. o Fix several bugs in !CONFIG_SMP builds. o Fix a misspelled config-parameter name so that RCU now announces at boot time if stall detection is configured. o Run tests on numerous combinations of configurations parameters, which after the fixes above, now build and run correctly. Updates from v5 (http://lkml.org/lkml/2008/9/15/92, bad subject line): o Fix a compiler error in the !CONFIG_FANOUT_EXACT case (blew a changeset some time ago, and finally got around to retesting this option). o Fix some tracing bugs in rcupreempt that caused incorrect totals to be printed. o I now test with a more brutal random-selection online/offline script (attached). Probably more brutal than it needs to be on the people reading it as well, but so it goes. o A number of optimizations and usability improvements: o Make rcu_pending() ignore the grace-period timeout when there is no grace period in progress. o Make force_quiescent_state() avoid going for a global lock in the case where there is no grace period in progress. o Rearrange struct fields to improve struct layout. o Make call_rcu() initiate a grace period if RCU was idle, rather than waiting for the next scheduling clock interrupt. o Invoke rcu_irq_enter() and rcu_irq_exit() only when idle, as suggested by Andi Kleen. I still don't completely trust this change, and might back it out. o Make CONFIG_RCU_TRACE be the single config variable manipulated for all forms of RCU, instead of the prior confusion. o Document tracing files and formats for both rcupreempt and rcutree. Updates from v4 for those missing v5 given its bad subject line: o Separated dynticks interface so that NMIs and irqs call separate functions, greatly simplifying it. In particular, this code no longer requires a proof of correctness. ;-) o Separated dynticks state out into its own per-CPU structure, avoiding the duplicated accounting. o The case where a dynticks-idle CPU runs an irq handler that invokes call_rcu() is now correctly handled, forcing that CPU out of dynticks-idle mode. o Review comments have been applied (thank you all!!!). For but one example, fixed the dynticks-ordering issue that Manfred pointed out, saving me much debugging. ;-) o Adjusted rcuclassic and rcupreempt to handle dynticks changes. Attached is an updated patch to Classic RCU that applies a hierarchy, greatly reducing the contention on the top-level lock for large machines. This passes 10-hour concurrent rcutorture and online-offline testing on 128-CPU ppc64 without dynticks enabled, and exposes some timekeeping bugs in presence of dynticks (exciting working on a system where "sleep 1" hangs until interrupted...), which were fixed in the 2.6.27 kernel. It is getting more reliable than mainline by some measures, so the next version will be against -tip for inclusion. See also Manfred Spraul's recent patches (or his earlier work from 2004 at http://marc.info/?l=linux-kernel&m=108546384711797&w=2). We will converge onto a common patch in the fullness of time, but are currently exploring different regions of the design space. That said, I have already gratefully stolen quite a few of Manfred's ideas. This patch provides CONFIG_RCU_FANOUT, which controls the bushiness of the RCU hierarchy. Defaults to 32 on 32-bit machines and 64 on 64-bit machines. If CONFIG_NR_CPUS is less than CONFIG_RCU_FANOUT, there is no hierarchy. By default, the RCU initialization code will adjust CONFIG_RCU_FANOUT to balance the hierarchy, so strongly NUMA architectures may choose to set CONFIG_RCU_FANOUT_EXACT to disable this balancing, allowing the hierarchy to be exactly aligned to the underlying hardware. Up to two levels of hierarchy are permitted (in addition to the root node), allowing up to 16,384 CPUs on 32-bit systems and up to 262,144 CPUs on 64-bit systems. I just know that I am going to regret saying this, but this seems more than sufficient for the foreseeable future. (Some architectures might wish to set CONFIG_RCU_FANOUT=4, which would limit such architectures to 64 CPUs. If this becomes a real problem, additional levels can be added, but I doubt that it will make a significant difference on real hardware.) In the common case, a given CPU will manipulate its private rcu_data structure and the rcu_node structure that it shares with its immediate neighbors. This can reduce both lock and memory contention by multiple orders of magnitude, which should eliminate the need for the strange manipulations that are reported to be required when running Linux on very large systems. Some shortcomings: o More bugs will probably surface as a result of an ongoing line-by-line code inspection. Patches will be provided as required. o There are probably hangs, rcutorture failures, &c. Seems quite stable on a 128-CPU machine, but that is kind of small compared to 4096 CPUs. However, seems to do better than mainline. Patches will be provided as required. o The memory footprint of this version is several KB larger than rcuclassic. A separate UP-only rcutiny patch will be provided, which will reduce the memory footprint significantly, even compared to the old rcuclassic. One such patch passes light testing, and has a memory footprint smaller even than rcuclassic. Initial reaction from various embedded guys was "it is not worth it", so am putting it aside. Credits: o Manfred Spraul for ideas, review comments, and bugs spotted, as well as some good friendly competition. ;-) o Josh Triplett, Ingo Molnar, Peter Zijlstra, Mathieu Desnoyers, Lai Jiangshan, Andi Kleen, Andy Whitcroft, and Andrew Morton for reviews and comments. o Thomas Gleixner for much-needed help with some timer issues (see patches below). o Jon M. Tollefson, Tim Pepper, Andrew Theurer, Jose R. Santos, Andy Whitcroft, Darrick Wong, Nishanth Aravamudan, Anton Blanchard, Dave Kleikamp, and Nathan Lynch for keeping machines alive despite my heavy abuse^Wtesting. Signed-off-by: Paul E. McKenney Signed-off-by: Ingo Molnar --- include/linux/hardirq.h | 14 +- include/linux/rcupdate.h | 10 +- include/linux/rcutree.h | 329 +++++++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 344 insertions(+), 9 deletions(-) create mode 100644 include/linux/rcutree.h (limited to 'include/linux') diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 181006cc94a..9b70b923169 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -118,13 +118,17 @@ static inline void account_system_vtime(struct task_struct *tsk) } #endif -#if defined(CONFIG_PREEMPT_RCU) && defined(CONFIG_NO_HZ) +#if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) extern void rcu_irq_enter(void); extern void rcu_irq_exit(void); +extern void rcu_nmi_enter(void); +extern void rcu_nmi_exit(void); #else # define rcu_irq_enter() do { } while (0) # define rcu_irq_exit() do { } while (0) -#endif /* CONFIG_PREEMPT_RCU */ +# define rcu_nmi_enter() do { } while (0) +# define rcu_nmi_exit() do { } while (0) +#endif /* #if defined(CONFIG_NO_HZ) && !defined(CONFIG_CLASSIC_RCU) */ /* * It is safe to do non-atomic ops on ->hardirq_context, @@ -134,7 +138,6 @@ extern void rcu_irq_exit(void); */ #define __irq_enter() \ do { \ - rcu_irq_enter(); \ account_system_vtime(current); \ add_preempt_count(HARDIRQ_OFFSET); \ trace_hardirq_enter(); \ @@ -153,7 +156,6 @@ extern void irq_enter(void); trace_hardirq_exit(); \ account_system_vtime(current); \ sub_preempt_count(HARDIRQ_OFFSET); \ - rcu_irq_exit(); \ } while (0) /* @@ -161,7 +163,7 @@ extern void irq_enter(void); */ extern void irq_exit(void); -#define nmi_enter() do { lockdep_off(); __irq_enter(); } while (0) -#define nmi_exit() do { __irq_exit(); lockdep_on(); } while (0) +#define nmi_enter() do { lockdep_off(); rcu_nmi_enter(); __irq_enter(); } while (0) +#define nmi_exit() do { __irq_exit(); rcu_nmi_exit(); lockdep_on(); } while (0) #endif /* LINUX_HARDIRQ_H */ diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 86f1f5e43e3..bfd289aff57 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -52,11 +52,15 @@ struct rcu_head { void (*func)(struct rcu_head *head); }; -#ifdef CONFIG_CLASSIC_RCU +#if defined(CONFIG_CLASSIC_RCU) #include -#else /* #ifdef CONFIG_CLASSIC_RCU */ +#elif defined(CONFIG_TREE_RCU) +#include +#elif defined(CONFIG_PREEMPT_RCU) #include -#endif /* #else #ifdef CONFIG_CLASSIC_RCU */ +#else +#error "Unknown RCU implementation specified to kernel configuration" +#endif /* #else #if defined(CONFIG_CLASSIC_RCU) */ #define RCU_HEAD_INIT { .next = NULL, .func = NULL } #define RCU_HEAD(head) struct rcu_head head = RCU_HEAD_INIT diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h new file mode 100644 index 00000000000..d4368b7975c --- /dev/null +++ b/include/linux/rcutree.h @@ -0,0 +1,329 @@ +/* + * Read-Copy Update mechanism for mutual exclusion (tree-based version) + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Copyright IBM Corporation, 2008 + * + * Author: Dipankar Sarma + * Paul E. McKenney Hierarchical algorithm + * + * Based on the original work by Paul McKenney + * and inputs from Rusty Russell, Andrea Arcangeli and Andi Kleen. + * + * For detailed explanation of Read-Copy Update mechanism see - + * Documentation/RCU + */ + +#ifndef __LINUX_RCUTREE_H +#define __LINUX_RCUTREE_H + +#include +#include +#include +#include +#include +#include + +/* + * Define shape of hierarchy based on NR_CPUS and CONFIG_RCU_FANOUT. + * In theory, it should be possible to add more levels straightforwardly. + * In practice, this has not been tested, so there is probably some + * bug somewhere. + */ +#define MAX_RCU_LVLS 3 +#define RCU_FANOUT (CONFIG_RCU_FANOUT) +#define RCU_FANOUT_SQ (RCU_FANOUT * RCU_FANOUT) +#define RCU_FANOUT_CUBE (RCU_FANOUT_SQ * RCU_FANOUT) + +#if NR_CPUS <= RCU_FANOUT +# define NUM_RCU_LVLS 1 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (NR_CPUS) +# define NUM_RCU_LVL_2 0 +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_SQ +# define NUM_RCU_LVLS 2 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT - 1) / RCU_FANOUT) +# define NUM_RCU_LVL_2 (NR_CPUS) +# define NUM_RCU_LVL_3 0 +#elif NR_CPUS <= RCU_FANOUT_CUBE +# define NUM_RCU_LVLS 3 +# define NUM_RCU_LVL_0 1 +# define NUM_RCU_LVL_1 (((NR_CPUS) + RCU_FANOUT_SQ - 1) / RCU_FANOUT_SQ) +# define NUM_RCU_LVL_2 (((NR_CPUS) + (RCU_FANOUT) - 1) / (RCU_FANOUT)) +# define NUM_RCU_LVL_3 NR_CPUS +#else +# error "CONFIG_RCU_FANOUT insufficient for NR_CPUS" +#endif /* #if (NR_CPUS) <= RCU_FANOUT */ + +#define RCU_SUM (NUM_RCU_LVL_0 + NUM_RCU_LVL_1 + NUM_RCU_LVL_2 + NUM_RCU_LVL_3) +#define NUM_RCU_NODES (RCU_SUM - NR_CPUS) + +/* + * Dynticks per-CPU state. + */ +struct rcu_dynticks { + int dynticks_nesting; /* Track nesting level, sort of. */ + int dynticks; /* Even value for dynticks-idle, else odd. */ + int dynticks_nmi; /* Even value for either dynticks-idle or */ + /* not in nmi handler, else odd. So this */ + /* remains even for nmi from irq handler. */ +}; + +/* + * Definition for node within the RCU grace-period-detection hierarchy. + */ +struct rcu_node { + spinlock_t lock; + unsigned long qsmask; /* CPUs or groups that need to switch in */ + /* order for current grace period to proceed.*/ + unsigned long qsmaskinit; + /* Per-GP initialization for qsmask. */ + unsigned long grpmask; /* Mask to apply to parent qsmask. */ + int grplo; /* lowest-numbered CPU or group here. */ + int grphi; /* highest-numbered CPU or group here. */ + u8 grpnum; /* CPU/group number for next level up. */ + u8 level; /* root is at level 0. */ + struct rcu_node *parent; +} ____cacheline_internodealigned_in_smp; + +/* Index values for nxttail array in struct rcu_data. */ +#define RCU_DONE_TAIL 0 /* Also RCU_WAIT head. */ +#define RCU_WAIT_TAIL 1 /* Also RCU_NEXT_READY head. */ +#define RCU_NEXT_READY_TAIL 2 /* Also RCU_NEXT head. */ +#define RCU_NEXT_TAIL 3 +#define RCU_NEXT_SIZE 4 + +/* Per-CPU data for read-copy update. */ +struct rcu_data { + /* 1) quiescent-state and grace-period handling : */ + long completed; /* Track rsp->completed gp number */ + /* in order to detect GP end. */ + long gpnum; /* Highest gp number that this CPU */ + /* is aware of having started. */ + long passed_quiesc_completed; + /* Value of completed at time of qs. */ + bool passed_quiesc; /* User-mode/idle loop etc. */ + bool qs_pending; /* Core waits for quiesc state. */ + bool beenonline; /* CPU online at least once. */ + struct rcu_node *mynode; /* This CPU's leaf of hierarchy */ + unsigned long grpmask; /* Mask to apply to leaf qsmask. */ + + /* 2) batch handling */ + /* + * If nxtlist is not NULL, it is partitioned as follows. + * Any of the partitions might be empty, in which case the + * pointer to that partition will be equal to the pointer for + * the following partition. When the list is empty, all of + * the nxttail elements point to nxtlist, which is NULL. + * + * [*nxttail[RCU_NEXT_READY_TAIL], NULL = *nxttail[RCU_NEXT_TAIL]): + * Entries that might have arrived after current GP ended + * [*nxttail[RCU_WAIT_TAIL], *nxttail[RCU_NEXT_READY_TAIL]): + * Entries known to have arrived before current GP ended + * [*nxttail[RCU_DONE_TAIL], *nxttail[RCU_WAIT_TAIL]): + * Entries that batch # <= ->completed - 1: waiting for current GP + * [nxtlist, *nxttail[RCU_DONE_TAIL]): + * Entries that batch # <= ->completed + * The grace period for these entries has completed, and + * the other grace-period-completed entries may be moved + * here temporarily in rcu_process_callbacks(). + */ + struct rcu_head *nxtlist; + struct rcu_head **nxttail[RCU_NEXT_SIZE]; + long qlen; /* # of queued callbacks */ + long blimit; /* Upper limit on a processed batch */ + +#ifdef CONFIG_NO_HZ + /* 3) dynticks interface. */ + struct rcu_dynticks *dynticks; /* Shared per-CPU dynticks state. */ + int dynticks_snap; /* Per-GP tracking for dynticks. */ + int dynticks_nmi_snap; /* Per-GP tracking for dynticks_nmi. */ +#endif /* #ifdef CONFIG_NO_HZ */ + + /* 4) reasons this CPU needed to be kicked by force_quiescent_state */ +#ifdef CONFIG_NO_HZ + unsigned long dynticks_fqs; /* Kicked due to dynticks idle. */ +#endif /* #ifdef CONFIG_NO_HZ */ + unsigned long offline_fqs; /* Kicked due to being offline. */ + unsigned long resched_ipi; /* Sent a resched IPI. */ + + /* 5) state to allow this CPU to force_quiescent_state on others */ + long n_rcu_pending; /* rcu_pending() calls since boot. */ + long n_rcu_pending_force_qs; /* when to force quiescent states. */ + + int cpu; +}; + +/* Values for signaled field in struct rcu_state. */ +#define RCU_GP_INIT 0 /* Grace period being initialized. */ +#define RCU_SAVE_DYNTICK 1 /* Need to scan dyntick state. */ +#define RCU_FORCE_QS 2 /* Need to force quiescent state. */ +#ifdef CONFIG_NO_HZ +#define RCU_SIGNAL_INIT RCU_SAVE_DYNTICK +#else /* #ifdef CONFIG_NO_HZ */ +#define RCU_SIGNAL_INIT RCU_FORCE_QS +#endif /* #else #ifdef CONFIG_NO_HZ */ + +#define RCU_JIFFIES_TILL_FORCE_QS 3 /* for rsp->jiffies_force_qs */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR +#define RCU_SECONDS_TILL_STALL_CHECK (10 * HZ) /* for rsp->jiffies_stall */ +#define RCU_SECONDS_TILL_STALL_RECHECK (30 * HZ) /* for rsp->jiffies_stall */ +#define RCU_STALL_RAT_DELAY 2 /* Allow other CPUs time */ + /* to take at least one */ + /* scheduling clock irq */ + /* before ratting on them. */ + +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ + +/* + * RCU global state, including node hierarchy. This hierarchy is + * represented in "heap" form in a dense array. The root (first level) + * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second + * level in ->node[1] through ->node[m] (->node[1] referenced by ->level[1]), + * and the third level in ->node[m+1] and following (->node[m+1] referenced + * by ->level[2]). The number of levels is determined by the number of + * CPUs and by CONFIG_RCU_FANOUT. Small systems will have a "hierarchy" + * consisting of a single rcu_node. + */ +struct rcu_state { + struct rcu_node node[NUM_RCU_NODES]; /* Hierarchy. */ + struct rcu_node *level[NUM_RCU_LVLS]; /* Hierarchy levels. */ + u32 levelcnt[MAX_RCU_LVLS + 1]; /* # nodes in each level. */ + u8 levelspread[NUM_RCU_LVLS]; /* kids/node in each level. */ + struct rcu_data *rda[NR_CPUS]; /* array of rdp pointers. */ + + /* The following fields are guarded by the root rcu_node's lock. */ + + u8 signaled ____cacheline_internodealigned_in_smp; + /* Force QS state. */ + long gpnum; /* Current gp number. */ + long completed; /* # of last completed gp. */ + spinlock_t onofflock; /* exclude on/offline and */ + /* starting new GP. */ + spinlock_t fqslock; /* Only one task forcing */ + /* quiescent states. */ + unsigned long jiffies_force_qs; /* Time at which to invoke */ + /* force_quiescent_state(). */ + unsigned long n_force_qs; /* Number of calls to */ + /* force_quiescent_state(). */ + unsigned long n_force_qs_lh; /* ~Number of calls leaving */ + /* due to lock unavailable. */ + unsigned long n_force_qs_ngp; /* Number of calls leaving */ + /* due to no GP active. */ +#ifdef CONFIG_RCU_CPU_STALL_DETECTOR + unsigned long gp_start; /* Time at which GP started, */ + /* but in jiffies. */ + unsigned long jiffies_stall; /* Time at which to check */ + /* for CPU stalls. */ +#endif /* #ifdef CONFIG_RCU_CPU_STALL_DETECTOR */ +#ifdef CONFIG_NO_HZ + long dynticks_completed; /* Value of completed @ snap. */ +#endif /* #ifdef CONFIG_NO_HZ */ +}; + +extern struct rcu_state rcu_state; +DECLARE_PER_CPU(struct rcu_data, rcu_data); + +extern struct rcu_state rcu_bh_state; +DECLARE_PER_CPU(struct rcu_data, rcu_bh_data); + +/* + * Increment the quiescent state counter. + * The counter is a bit degenerated: We do not need to know + * how many quiescent states passed, just if there was at least + * one since the start of the grace period. Thus just a flag. + */ +static inline void rcu_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} +static inline void rcu_bh_qsctr_inc(int cpu) +{ + struct rcu_data *rdp = &per_cpu(rcu_bh_data, cpu); + rdp->passed_quiesc = 1; + rdp->passed_quiesc_completed = rdp->completed; +} + +extern int rcu_pending(int cpu); +extern int rcu_needs_cpu(int cpu); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +extern struct lockdep_map rcu_lock_map; +# define rcu_read_acquire() \ + lock_acquire(&rcu_lock_map, 0, 0, 2, 1, NULL, _THIS_IP_) +# define rcu_read_release() lock_release(&rcu_lock_map, 1, _THIS_IP_) +#else +# define rcu_read_acquire() do { } while (0) +# define rcu_read_release() do { } while (0) +#endif + +static inline void __rcu_read_lock(void) +{ + preempt_disable(); + __acquire(RCU); + rcu_read_acquire(); +} +static inline void __rcu_read_unlock(void) +{ + rcu_read_release(); + __release(RCU); + preempt_enable(); +} +static inline void __rcu_read_lock_bh(void) +{ + local_bh_disable(); + __acquire(RCU_BH); + rcu_read_acquire(); +} +static inline void __rcu_read_unlock_bh(void) +{ + rcu_read_release(); + __release(RCU_BH); + local_bh_enable(); +} + +#define __synchronize_sched() synchronize_rcu() + +#define call_rcu_sched(head, func) call_rcu(head, func) + +static inline void rcu_init_sched(void) +{ +} + +extern void __rcu_init(void); +extern void rcu_check_callbacks(int cpu, int user); +extern void rcu_restart_cpu(int cpu); + +extern long rcu_batches_completed(void); +extern long rcu_batches_completed_bh(void); + +#ifdef CONFIG_NO_HZ +void rcu_enter_nohz(void); +void rcu_exit_nohz(void); +#else /* CONFIG_NO_HZ */ +static inline void rcu_enter_nohz(void) +{ +} +static inline void rcu_exit_nohz(void) +{ +} +#endif /* CONFIG_NO_HZ */ + +#endif /* __LINUX_RCUTREE_H */ -- cgit v1.2.3 From 078a55db075bf4dcc03115dc94875b3dd83f69d4 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 18 Dec 2008 16:57:52 -0800 Subject: sparseirq: add kernel-doc notation for new member in irq_desc, -v2 Signed-off-by: Yinghai Lu Acked-by: Randy Dunlap Signed-off-by: Ingo Molnar --- include/linux/irq.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/irq.h b/include/linux/irq.h index 36a01574678..7fa7f30fccb 100644 --- a/include/linux/irq.h +++ b/include/linux/irq.h @@ -134,6 +134,9 @@ struct irq_2_iommu; /** * struct irq_desc - interrupt descriptor * @irq: interrupt number for this descriptor + * @timer_rand_state: pointer to timer rand state struct + * @kstat_irqs: irq stats per cpu + * @irq_2_iommu: iommu with this irq * @handle_irq: highlevel irq-events handler [if NULL, __do_IRQ()] * @chip: low level interrupt hardware access * @msi_desc: MSI descriptor -- cgit v1.2.3 From 88a9fe8cae3bb52e82489447f45e8d7ba1409ca8 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Dec 2008 15:21:31 -0500 Subject: SUNRPC: Remove the last remnant of the BKL... Somehow, this escaped the previous purge. There should be no need to keep any extra locks in the XDR callbacks. The NFS client XDR code only writes into private objects, whereas all reads of shared objects are confined to fields that do not change, such as filehandles... Ditto for lockd, the NFSv2/v3 client mount code, and rpcbind. The nfsd XDR code may require the BKL, but since it does a synchronous RPC call from a thread that already holds the lock, that issue is moot. Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xdr.h | 15 --------------- 1 file changed, 15 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h index e4057d729f0..49e1eb45446 100644 --- a/include/linux/sunrpc/xdr.h +++ b/include/linux/sunrpc/xdr.h @@ -36,21 +36,6 @@ struct xdr_netobj { */ typedef int (*kxdrproc_t)(void *rqstp, __be32 *data, void *obj); -/* - * We're still requiring the BKL in the xdr code until it's been - * more carefully audited, at which point this wrapper will become - * unnecessary. - */ -static inline int rpc_call_xdrproc(kxdrproc_t xdrproc, void *rqstp, __be32 *data, void *obj) -{ - int ret; - - lock_kernel(); - ret = xdrproc(rqstp, data, obj); - unlock_kernel(); - return ret; -} - /* * Basic structure for transmission/reception of a client XDR message. * Features a header (for a linear buffer containing RPC headers -- cgit v1.2.3 From 146ec944bbd31d241a44a00518b054fb01921d22 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 23 Dec 2008 15:21:34 -0500 Subject: NFS: Move declaration of nfs_mount() to fs/nfs/internal.h Clean up: The nfs_mount() function is not to be used outside of the NFS client. Move its public declaration to fs/nfs/internal.h. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 4eaa8347a0d..f11077285a6 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -532,12 +532,6 @@ static inline void nfs3_forget_cached_acls(struct inode *inode) } #endif /* CONFIG_NFS_V3_ACL */ -/* - * linux/fs/mount_clnt.c - */ -extern int nfs_mount(struct sockaddr *, size_t, char *, char *, - int, int, struct nfs_fh *); - /* * inline functions */ -- cgit v1.2.3 From d740351bf0960e89ce1aef45cfe00167cb0f9e5b Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 23 Dec 2008 15:21:37 -0500 Subject: NFS: add "[no]resvport" mount option The standard default security setting for NFS is AUTH_SYS. An NFS client connects to NFS servers via a privileged source port and a fixed standard destination port (2049). The client sends raw uid and gid numbers to identify users making NFS requests, and the server assumes an appropriate authority on the client has vetted these values because the source port is privileged. On Linux, by default in-kernel RPC services use a privileged port in the range between 650 and 1023 to avoid using source ports of well- known IP services. Using such a small range limits the number of NFS mount points and the number of unique NFS servers to which a client can connect concurrently. An NFS client can use unprivileged source ports to expand the range of source port numbers, allowing more concurrent server connections and more NFS mount points. Servers must explicitly allow NFS connections from unprivileged ports for this to work. In the past, bumping the value of the sunrpc.max_resvport sysctl on the client would permit the NFS client to use unprivileged ports. Bumping this setting also changes the maximum port number used by other in-kernel RPC services, some of which still required a port number less than 1023. This is exacerbated by the way source port numbers are chosen by the Linux RPC client, which starts at the top of the range and works downwards. It means that bumping the maximum means all RPC services requesting a source port will likely get an unprivileged port instead of a privileged one. Changing this setting effects all NFS mount points on a client. A sysadmin could not selectively choose which mount points would use non-privileged ports and which could not. Lastly, this mechanism of expanding the limit on the number of NFS mount points was entirely undocumented. To address the need for the NFS client to use a large range of source ports without interfering with the activity of other in-kernel RPC services, we introduce a new NFS mount option. This option explicitly tells only the NFS client to use a non-privileged source port when communicating with the NFS server for one specific mount point. This new mount option is called "resvport," like the similar NFS mount option on FreeBSD and Mac OS X. A sister patch for nfs-utils will be submitted that documents this new option in nfs(5). The default setting for this new mount option requires the NFS client to use a privileged port, as before. Explicitly specifying the "noresvport" mount option allows the NFS client to use an unprivileged source port for this mount point when connecting to the NFS server port. This mount option is supported only for text-based NFS mounts. [ Sidebar: it is widely known that security mechanisms based on the use of privileged source ports are ineffective. However, the NFS client can combine the use of unprivileged ports with the use of secure authentication mechanisms, such as Kerberos. This allows a large number of connections and mount points while ensuring a useful level of security. Eventually we may change the default setting for this option depending on the security flavor used for the mount. For example, if the mount is using only AUTH_SYS, then the default setting will be "resvport;" if the mount is using a strong security flavor such as krb5, the default setting will be "noresvport." ] Signed-off-by: Chuck Lever [Trond.Myklebust@netapp.com: Fixed a bug whereby nfs4_init_client() was being called with incorrect arguments.] Signed-off-by: Trond Myklebust --- include/linux/nfs_mount.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/nfs_mount.h b/include/linux/nfs_mount.h index 6549a06ac16..4499016e6d0 100644 --- a/include/linux/nfs_mount.h +++ b/include/linux/nfs_mount.h @@ -45,7 +45,7 @@ struct nfs_mount_data { char context[NFS_MAX_CONTEXT_LEN + 1]; /* 6 */ }; -/* bits in the flags field */ +/* bits in the flags field visible to user space */ #define NFS_MOUNT_SOFT 0x0001 /* 1 */ #define NFS_MOUNT_INTR 0x0002 /* 1 */ /* now unused, but ABI */ @@ -68,5 +68,6 @@ struct nfs_mount_data { /* The following are for internal use only */ #define NFS_MOUNT_LOOKUP_CACHE_NONEG 0x10000 #define NFS_MOUNT_LOOKUP_CACHE_NONE 0x20000 +#define NFS_MOUNT_NORESVPORT 0x40000 #endif -- cgit v1.2.3 From 0cb2659b818eca99235e17c04291cfa9985c14f7 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 23 Dec 2008 15:21:38 -0500 Subject: NLM: allow lockd requests from an unprivileged port If the admin has specified the "noresvport" option for an NFS mount point, the kernel's NFS client uses an unprivileged source port for the main NFS transport. The kernel's lockd client should use an unprivileged port in this case as well. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/linux/lockd/bind.h | 1 + include/linux/lockd/lockd.h | 4 +++- 2 files changed, 4 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lockd/bind.h b/include/linux/lockd/bind.h index e5872dc994c..fbc48f89852 100644 --- a/include/linux/lockd/bind.h +++ b/include/linux/lockd/bind.h @@ -41,6 +41,7 @@ struct nlmclnt_initdata { size_t addrlen; unsigned short protocol; u32 nfs_version; + int noresvport; }; /* diff --git a/include/linux/lockd/lockd.h b/include/linux/lockd/lockd.h index b56d5aa9b19..23da3fa69ef 100644 --- a/include/linux/lockd/lockd.h +++ b/include/linux/lockd/lockd.h @@ -49,6 +49,7 @@ struct nlm_host { unsigned short h_proto; /* transport proto */ unsigned short h_reclaiming : 1, h_server : 1, /* server side, not client side */ + h_noresvport : 1, h_inuse : 1; wait_queue_head_t h_gracewait; /* wait while reclaiming */ struct rw_semaphore h_rwsem; /* Reboot recovery lock */ @@ -220,7 +221,8 @@ struct nlm_host *nlmclnt_lookup_host(const struct sockaddr *sap, const size_t salen, const unsigned short protocol, const u32 version, - const char *hostname); + const char *hostname, + int noresvport); struct nlm_host *nlmsvc_lookup_host(const struct svc_rqst *rqstp, const char *hostname, const size_t hostname_len); -- cgit v1.2.3 From 95d35cb4c473c754824967c0b069bbeb7efa4847 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Dec 2008 15:21:45 -0500 Subject: NFSv4: Remove nfs_client->cl_sem Now that we're using the flags to indicate state that needs to be recovered, as well as having implemented proper refcounting and spinlocking on the state and open_owners, we can get rid of nfs_client->cl_sem. The only remaining case that was dubious was the file locking, and that case is now covered by the nfsi->rwsem. Signed-off-by: Trond Myklebust --- include/linux/nfs_fs_sb.h | 6 ------ 1 file changed, 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h index 4e477ae5869..9bb81aec91c 100644 --- a/include/linux/nfs_fs_sb.h +++ b/include/linux/nfs_fs_sb.h @@ -42,12 +42,6 @@ struct nfs_client { struct rb_root cl_openowner_id; struct rb_root cl_lockowner_id; - /* - * The following rwsem ensures exclusive access to the server - * while we recover the state following a lease expiration. - */ - struct rw_semaphore cl_sem; - struct list_head cl_delegations; struct rb_root cl_state_owners; spinlock_t cl_lock; -- cgit v1.2.3 From bd7bf9d540c001055fba796ebf146d90e4dd2eb2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Dec 2008 15:21:53 -0500 Subject: NFSv4: Convert delegation->type field to fmode_t Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 2 +- include/linux/nfs_xdr.h | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index f11077285a6..8d71d7b7c78 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -180,7 +180,7 @@ struct nfs_inode { /* NFSv4 state */ struct list_head open_states; struct nfs_delegation *delegation; - int delegation_state; + fmode_t delegation_state; struct rw_semaphore rwsem; #endif /* CONFIG_NFS_V4*/ struct inode vfs_inode; diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index c1c31acb8a2..32c1a0ecdbf 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -126,7 +126,7 @@ struct nfs_openargs { struct iattr * attrs; /* UNCHECKED, GUARDED */ nfs4_verifier verifier; /* EXCLUSIVE */ nfs4_stateid delegation; /* CLAIM_DELEGATE_CUR */ - int delegation_type; /* CLAIM_PREVIOUS */ + fmode_t delegation_type; /* CLAIM_PREVIOUS */ } u; const struct qstr * name; const struct nfs_server *server; /* Needed for ID mapping */ @@ -143,7 +143,7 @@ struct nfs_openres { struct nfs_fattr * dir_attr; struct nfs_seqid * seqid; const struct nfs_server *server; - int delegation_type; + fmode_t delegation_type; nfs4_stateid delegation; __u32 do_recall; __u64 maxsize; -- cgit v1.2.3 From dc0b027dfadfcb8a5504f7d8052754bf8d501ab9 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 23 Dec 2008 15:21:56 -0500 Subject: NFSv4: Convert the open and close ops to use fmode Signed-off-by: Trond Myklebust --- include/linux/nfs_fs.h | 4 ++-- include/linux/nfs_xdr.h | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 8d71d7b7c78..b8d9c6dd4f6 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -83,7 +83,7 @@ struct nfs_open_context { struct rpc_cred *cred; struct nfs4_state *state; fl_owner_t lockowner; - int mode; + fmode_t mode; unsigned long flags; #define NFS_CONTEXT_ERROR_WRITE (0) @@ -342,7 +342,7 @@ extern int nfs_setattr(struct dentry *, struct iattr *); extern void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr); extern struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx); extern void put_nfs_open_context(struct nfs_open_context *ctx); -extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, int mode); +extern struct nfs_open_context *nfs_find_open_context(struct inode *inode, struct rpc_cred *cred, fmode_t mode); extern u64 nfs_compat_user_ino64(u64 fileid); extern void nfs_fattr_init(struct nfs_fattr *fattr); diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 32c1a0ecdbf..a550b528319 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -120,6 +120,7 @@ struct nfs_openargs { const struct nfs_fh * fh; struct nfs_seqid * seqid; int open_flags; + fmode_t fmode; __u64 clientid; __u64 id; union { @@ -171,7 +172,7 @@ struct nfs_closeargs { struct nfs_fh * fh; nfs4_stateid * stateid; struct nfs_seqid * seqid; - int open_flags; + fmode_t fmode; const u32 * bitmask; }; -- cgit v1.2.3 From 64672d55d93c26fb4035fd1a84a803cbc09cb058 Mon Sep 17 00:00:00 2001 From: Peter Staubach Date: Tue, 23 Dec 2008 15:21:56 -0500 Subject: optimize attribute timeouts for "noac" and "actimeo=0" Hi. I've been looking at a bugzilla which describes a problem where a customer was advised to use either the "noac" or "actimeo=0" mount options to solve a consistency problem that they were seeing in the file attributes. It turned out that this solution did not work reliably for them because sometimes, the local attribute cache was believed to be valid and not timed out. (With an attribute cache timeout of 0, the cache should always appear to be timed out.) In looking at this situation, it appears to me that the problem is that the attribute cache timeout code has an off-by-one error in it. It is assuming that the cache is valid in the region, [read_cache_jiffies, read_cache_jiffies + attrtimeo]. The cache should be considered valid only in the region, [read_cache_jiffies, read_cache_jiffies + attrtimeo). With this change, the options, "noac" and "actimeo=0", work as originally expected. This problem was previously addressed by special casing the attrtimeo == 0 case. However, since the problem is only an off- by-one error, the cleaner solution is address the off-by-one error and thus, not require the special case. Thanx... ps Signed-off-by: Peter Staubach Signed-off-by: Trond Myklebust --- include/linux/jiffies.h | 10 ++++++++++ include/linux/nfs_fs.h | 5 ++++- 2 files changed, 14 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/jiffies.h b/include/linux/jiffies.h index abb6ac639e8..1a9cf78bfce 100644 --- a/include/linux/jiffies.h +++ b/include/linux/jiffies.h @@ -115,10 +115,20 @@ static inline u64 get_jiffies_64(void) ((long)(a) - (long)(b) >= 0)) #define time_before_eq(a,b) time_after_eq(b,a) +/* + * Calculate whether a is in the range of [b, c]. + */ #define time_in_range(a,b,c) \ (time_after_eq(a,b) && \ time_before_eq(a,c)) +/* + * Calculate whether a is in the range of [b, c). + */ +#define time_in_range_open(a,b,c) \ + (time_after_eq(a,b) && \ + time_before(a,c)) + /* Same as above, but does so with platform independent 64bit types. * These must be used when utilizing jiffies_64 (i.e. return value of * get_jiffies_64() */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index b8d9c6dd4f6..db867b04ac3 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -130,7 +130,10 @@ struct nfs_inode { * * We need to revalidate the cached attrs for this inode if * - * jiffies - read_cache_jiffies > attrtimeo + * jiffies - read_cache_jiffies >= attrtimeo + * + * Please note the comparison is greater than or equal + * so that zero timeout values can be specified. */ unsigned long read_cache_jiffies; unsigned long attrtimeo; -- cgit v1.2.3 From c977a2ef40a38c45537ad03823d0a004f06373f0 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Tue, 23 Dec 2008 16:06:13 -0500 Subject: sunrpc: get rid of rpc_rqst.rq_bufsize rq_bufsize is not used. Signed-off-by: Mike Sager Signed-off-by: Benny Halevy Signed-off-by: Trond Myklebust --- include/linux/sunrpc/xprt.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h index 4d80a118d53..11fc71d50c1 100644 --- a/include/linux/sunrpc/xprt.h +++ b/include/linux/sunrpc/xprt.h @@ -76,8 +76,7 @@ struct rpc_rqst { struct list_head rq_list; __u32 * rq_buffer; /* XDR encode buffer */ - size_t rq_bufsize, - rq_callsize, + size_t rq_callsize, rq_rcvsize; struct xdr_buf rq_private_buf; /* The receive buffer -- cgit v1.2.3 From c381060869317b3c84430d4f54965d409cbfe65f Mon Sep 17 00:00:00 2001 From: "\\\"J. Bruce Fields\\" Date: Tue, 23 Dec 2008 16:08:32 -0500 Subject: rpc: add an rpc_pipe_open method We want to transition to a new gssd upcall which is text-based and more easily extensible. To simplify upgrades, as well as testing and debugging, it will help if we can upgrade gssd (to a version which understands the new upcall) without having to choose at boot (or module-load) time whether we want the new or the old upcall. We will do this by providing two different pipes: one named, as currently, after the mechanism (normally "krb5"), and supporting the old upcall. One named "gssd" and supporting the new upcall version. We allow gssd to indicate which version it supports by its choice of which pipe to open. As we have no interest in supporting *simultaneous* use of both versions, we'll forbid opening both pipes at the same time. So, add a new pipe_open callback to the rpc_pipefs api, which the gss code can use to track which pipes have been open, and to refuse opens of incompatible pipes. We only need this to be called on the first open of a given pipe. Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/rpc_pipe_fs.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/rpc_pipe_fs.h b/include/linux/sunrpc/rpc_pipe_fs.h index 51b977a4ca2..cea764c2359 100644 --- a/include/linux/sunrpc/rpc_pipe_fs.h +++ b/include/linux/sunrpc/rpc_pipe_fs.h @@ -15,6 +15,7 @@ struct rpc_pipe_ops { ssize_t (*upcall)(struct file *, struct rpc_pipe_msg *, char __user *, size_t); ssize_t (*downcall)(struct file *, const char __user *, size_t); void (*release_pipe)(struct inode *); + int (*open_pipe)(struct inode *); void (*destroy_msg)(struct rpc_pipe_msg *); }; -- cgit v1.2.3 From 68e76ad0baf8f5d5060377c2423ee6eed5c63057 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 23 Dec 2008 16:17:15 -0500 Subject: nfsd: pass client principal name in rsc downcall Two principals are involved in krb5 authentication: the target, who we authenticate *to* (normally the name of the server, like nfs/server.citi.umich.edu@CITI.UMICH.EDU), and the source, we we authenticate *as* (normally a user, like bfields@UMICH.EDU) In the case of NFSv4 callbacks, the target of the callback should be the source of the client's setclientid call, and the source should be the nfs server's own principal. Therefore we allow svcgssd to pass down the name of the principal that just authenticated, so that on setclientid we can store that principal name with the new client, to be used later on callbacks. Signed-off-by: Olga Kornievskaia Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/nfsd/state.h | 1 + include/linux/sunrpc/svcauth_gss.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index d0fe2e37845..ce7cbf4b7c9 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -124,6 +124,7 @@ struct nfs4_client { nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ __be32 cl_addr; /* client ipaddress */ + char *cl_principal; /* setclientid principal name */ struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ nfs4_verifier cl_confirm; /* generated by server */ diff --git a/include/linux/sunrpc/svcauth_gss.h b/include/linux/sunrpc/svcauth_gss.h index c9165d9771a..ca7d725861f 100644 --- a/include/linux/sunrpc/svcauth_gss.h +++ b/include/linux/sunrpc/svcauth_gss.h @@ -20,6 +20,7 @@ int gss_svc_init(void); void gss_svc_shutdown(void); int svcauth_gss_register_pseudoflavor(u32 pseudoflavor, char * name); u32 svcauth_gss_flavor(struct auth_domain *dom); +char *svc_gss_principal(struct svc_rqst *); #endif /* __KERNEL__ */ #endif /* _LINUX_SUNRPC_SVCAUTH_GSS_H */ -- cgit v1.2.3 From 608207e8884e083ad8b8d33eda868da70f0d63e8 Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 23 Dec 2008 16:17:40 -0500 Subject: rpc: pass target name down to rpc level on callbacks The rpc client needs to know the principal that the setclientid was done as, so it can tell gssd who to authenticate to. Signed-off-by: Olga Kornievskaia Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/sunrpc/clnt.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h index 6f0ee1b84a4..c39a21040dc 100644 --- a/include/linux/sunrpc/clnt.h +++ b/include/linux/sunrpc/clnt.h @@ -58,6 +58,7 @@ struct rpc_clnt { struct rpc_timeout cl_timeout_default; struct rpc_program * cl_program; char cl_inline_name[32]; + char *cl_principal; /* target to authenticate to */ }; /* @@ -108,6 +109,7 @@ struct rpc_create_args { u32 version; rpc_authflavor_t authflavor; unsigned long flags; + char *client_name; }; /* Values for "flags" field */ -- cgit v1.2.3 From 61054b14d545e257b9415d5ca0cd5f43762b4d0c Mon Sep 17 00:00:00 2001 From: Olga Kornievskaia Date: Tue, 23 Dec 2008 16:19:00 -0500 Subject: nfsd: support callbacks with gss flavors This patch adds server-side support for callbacks other than AUTH_SYS. Signed-off-by: Olga Kornievskaia Signed-off-by: J. Bruce Fields Signed-off-by: Trond Myklebust --- include/linux/nfsd/state.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/nfsd/state.h b/include/linux/nfsd/state.h index ce7cbf4b7c9..128298c0362 100644 --- a/include/linux/nfsd/state.h +++ b/include/linux/nfsd/state.h @@ -124,6 +124,7 @@ struct nfs4_client { nfs4_verifier cl_verifier; /* generated by client */ time_t cl_time; /* time of last lease renewal */ __be32 cl_addr; /* client ipaddress */ + u32 cl_flavor; /* setclientid pseudoflavor */ char *cl_principal; /* setclientid principal name */ struct svc_cred cl_cred; /* setclientid principal */ clientid_t cl_clientid; /* generated by server */ -- cgit v1.2.3 From 1eca4365be25c540650693e941bc06a66cf38f94 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Nov 2008 20:03:17 +0900 Subject: libata: beef up iterators There currently are the following looping constructs. * __ata_port_for_each_link() for all available links * ata_port_for_each_link() for edge links * ata_link_for_each_dev() for all devices * ata_link_for_each_dev_reverse() for all devices in reverse order Now there's a need for looping construct which is similar to __ata_port_for_each_link() but iterates over PMP links before the host link. Instead of adding another one with long name, do the following cleanup. * Implement and export ata_link_next() and ata_dev_next() which take @mode parameter and can be used to build custom loop. * Implement ata_for_each_link() and ata_for_each_dev() which take looping mode explicitly. The following iteration modes are implemented. * ATA_LITER_EDGE : loop over edge links * ATA_LITER_HOST_FIRST : loop over all links, host link first * ATA_LITER_PMP_FIRST : loop over all links, PMP links first * ATA_DITER_ENABLED : loop over enabled devices * ATA_DITER_ENABLED_REVERSE : loop over enabled devices in reverse order * ATA_DITER_ALL : loop over all devices * ATA_DITER_ALL_REVERSE : loop over all devices in reverse order This change removes exlicit device enabledness checks from many loops and makes it clear which ones are iterated over in which direction. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 76 +++++++++++++++++++++++++++++++++++++------------- 1 file changed, 56 insertions(+), 20 deletions(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index ed3f26eb5df..3b2a0c6444e 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -1285,26 +1285,62 @@ static inline int ata_link_active(struct ata_link *link) return ata_tag_valid(link->active_tag) || link->sactive; } -extern struct ata_link *__ata_port_next_link(struct ata_port *ap, - struct ata_link *link, - bool dev_only); - -#define __ata_port_for_each_link(link, ap) \ - for ((link) = __ata_port_next_link((ap), NULL, false); (link); \ - (link) = __ata_port_next_link((ap), (link), false)) - -#define ata_port_for_each_link(link, ap) \ - for ((link) = __ata_port_next_link((ap), NULL, true); (link); \ - (link) = __ata_port_next_link((ap), (link), true)) - -#define ata_link_for_each_dev(dev, link) \ - for ((dev) = (link)->device; \ - (dev) < (link)->device + ata_link_max_devices(link) || ((dev) = NULL); \ - (dev)++) - -#define ata_link_for_each_dev_reverse(dev, link) \ - for ((dev) = (link)->device + ata_link_max_devices(link) - 1; \ - (dev) >= (link)->device || ((dev) = NULL); (dev)--) +/* + * Iterators + * + * ATA_LITER_* constants are used to select link iteration mode and + * ATA_DITER_* device iteration mode. + * + * For a custom iteration directly using ata_{link|dev}_next(), if + * @link or @dev, respectively, is NULL, the first element is + * returned. @dev and @link can be any valid device or link and the + * next element according to the iteration mode will be returned. + * After the last element, NULL is returned. + */ +enum ata_link_iter_mode { + ATA_LITER_EDGE, /* if present, PMP links only; otherwise, + * host link. no slave link */ + ATA_LITER_HOST_FIRST, /* host link followed by PMP or slave links */ + ATA_LITER_PMP_FIRST, /* PMP links followed by host link, + * slave link still comes after host link */ +}; + +enum ata_dev_iter_mode { + ATA_DITER_ENABLED, + ATA_DITER_ENABLED_REVERSE, + ATA_DITER_ALL, + ATA_DITER_ALL_REVERSE, +}; + +extern struct ata_link *ata_link_next(struct ata_link *link, + struct ata_port *ap, + enum ata_link_iter_mode mode); + +extern struct ata_device *ata_dev_next(struct ata_device *dev, + struct ata_link *link, + enum ata_dev_iter_mode mode); + +/* + * Shortcut notation for iterations + * + * ata_for_each_link() iterates over each link of @ap according to + * @mode. @link points to the current link in the loop. @link is + * NULL after loop termination. ata_for_each_dev() works the same way + * except that it iterates over each device of @link. + * + * Note that the mode prefixes ATA_{L|D}ITER_ shouldn't need to be + * specified when using the following shorthand notations. Only the + * mode itself (EDGE, HOST_FIRST, ENABLED, etc...) should be + * specified. This not only increases brevity but also makes it + * impossible to use ATA_LITER_* for device iteration or vice-versa. + */ +#define ata_for_each_link(link, ap, mode) \ + for ((link) = ata_link_next(NULL, (ap), ATA_LITER_##mode); (link); \ + (link) = ata_link_next((link), (ap), ATA_LITER_##mode)) + +#define ata_for_each_dev(dev, link, mode) \ + for ((dev) = ata_dev_next(NULL, (link), ATA_DITER_##mode); (dev); \ + (dev) = ata_dev_next((dev), (link), ATA_DITER_##mode)) /** * ata_ncq_enabled - Test whether NCQ is enabled -- cgit v1.2.3 From ece180d1cfe5fa751eaa85bf796cf28b2150af15 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 3 Nov 2008 20:04:37 +0900 Subject: libata: perform port detach in EH ata_port_detach() first made sure EH saw ATA_PFLAG_UNLOADING and then assumed EH context belongs to it and performed detach operation itself. However, UNLOADING doesn't disable all of EH and this could lead to problems including triggering WARN_ON()'s in EH path. This patch makes port detach behave more like other EH actions such that ata_port_detach() requests EH to detach and waits for completion. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- include/linux/libata.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/libata.h b/include/linux/libata.h index 3b2a0c6444e..3449de597ef 100644 --- a/include/linux/libata.h +++ b/include/linux/libata.h @@ -213,10 +213,11 @@ enum { ATA_PFLAG_FROZEN = (1 << 2), /* port is frozen */ ATA_PFLAG_RECOVERED = (1 << 3), /* recovery action performed */ ATA_PFLAG_LOADING = (1 << 4), /* boot/loading probe */ - ATA_PFLAG_UNLOADING = (1 << 5), /* module is unloading */ ATA_PFLAG_SCSI_HOTPLUG = (1 << 6), /* SCSI hotplug scheduled */ ATA_PFLAG_INITIALIZING = (1 << 7), /* being initialized, don't touch */ ATA_PFLAG_RESETTING = (1 << 8), /* reset in progress */ + ATA_PFLAG_UNLOADING = (1 << 9), /* driver is being unloaded */ + ATA_PFLAG_UNLOADED = (1 << 10), /* driver is unloaded */ ATA_PFLAG_SUSPENDED = (1 << 17), /* port is suspended (power) */ ATA_PFLAG_PM_PENDING = (1 << 18), /* PM operation pending */ -- cgit v1.2.3 From 88e740f1654bf28565edd528a060695c1f2b5ad7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fernando=20Luis=20V=C3=A1zquez=20Cao?= Date: Mon, 27 Oct 2008 18:44:46 +0900 Subject: block: add queue flag for paravirt frontend drivers As is the case with SSD devices, we do not want to idle in AS/CFQ when the block device is a paravirt front-end driver. This patch adds a flag (QUEUE_FLAG_VIRT) which should be used by front-end drivers such as virtio_blk and xen-blkfront to indicate a paravirtualized device. Signed-off-by: Fernando Luis Vazquez Cao Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 031a315c050..482e9600f7a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -449,6 +449,7 @@ struct request_queue #define QUEUE_FLAG_FAIL_IO 12 /* fake timeout */ #define QUEUE_FLAG_STACKABLE 13 /* supports request stacking */ #define QUEUE_FLAG_NONROT 14 /* non-rotational device (SSD) */ +#define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ static inline int queue_is_locked(struct request_queue *q) { -- cgit v1.2.3 From 08bafc0341f2f7920e9045bc32c40299cac8c21b Mon Sep 17 00:00:00 2001 From: Keith Mannthey Date: Tue, 25 Nov 2008 10:24:35 +0100 Subject: block: Supress Buffer I/O errors when SCSI REQ_QUIET flag set Allow the scsi request REQ_QUIET flag to be propagated to the buffer file system layer. The basic ideas is to pass the flag from the scsi request to the bio (block IO) and then to the buffer layer. The buffer layer can then suppress needless printks. This patch declutters the kernel log by removed the 40-50 (per lun) buffer io error messages seen during a boot in my multipath setup . It is a good chance any real errors will be missed in the "noise" it the logs without this patch. During boot I see blocks of messages like " __ratelimit: 211 callbacks suppressed Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242847 Buffer I/O error on device sdm, logical block 1 Buffer I/O error on device sdm, logical block 5242878 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242879 Buffer I/O error on device sdm, logical block 5242872 " in my logs. My disk environment is multipath fiber channel using the SCSI_DH_RDAC code and multipathd. This topology includes an "active" and "ghost" path for each lun. IO's to the "ghost" path will never complete and the SCSI layer, via the scsi device handler rdac code, quick returns the IOs to theses paths and sets the REQ_QUIET scsi flag to suppress the scsi layer messages. I am wanting to extend the QUIET behavior to include the buffer file system layer to deal with these errors as well. I have been running this patch for a while now on several boxes without issue. A few runs of bonnie++ show no noticeable difference in performance in my setup. Thanks for John Stultz for the quiet_error finalization. Submitted-by: Keith Mannthey Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + include/linux/buffer_head.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 6a642098e5c..cf132bfbbac 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -117,6 +117,7 @@ struct bio { #define BIO_CPU_AFFINE 8 /* complete bio on same CPU as submitted */ #define BIO_NULL_MAPPED 9 /* contains invalid user pages */ #define BIO_FS_INTEGRITY 10 /* fs owns integrity data, not block layer */ +#define BIO_QUIET 11 /* Make BIO Quiet */ #define bio_flagged(bio, flag) ((bio)->bi_flags & (1 << (flag))) /* diff --git a/include/linux/buffer_head.h b/include/linux/buffer_head.h index 3ce64b90118..8605f8a74df 100644 --- a/include/linux/buffer_head.h +++ b/include/linux/buffer_head.h @@ -35,6 +35,7 @@ enum bh_state_bits { BH_Ordered, /* ordered write */ BH_Eopnotsupp, /* operation not supported (barrier) */ BH_Unwritten, /* Buffer is allocated on disk but not written */ + BH_Quiet, /* Buffer Error Prinks to be quiet */ BH_PrivateStart,/* not a state bit, but the first bit available * for private allocation by other entities -- cgit v1.2.3 From 64d01dc9e1927e6535627d73f2336c75d1dd3fe2 Mon Sep 17 00:00:00 2001 From: Cheng Renquan Date: Wed, 3 Dec 2008 12:41:39 +0100 Subject: block: use cancel_work_sync() instead of kblockd_flush_work() After many improvements on kblockd_flush_work, it is now identical to cancel_work_sync, so a direct call to cancel_work_sync is suggested. The only difference is that cancel_work_sync is a GPL symbol, so no non-GPL modules anymore. Signed-off-by: Cheng Renquan Cc: Jens Axboe Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 482e9600f7a..e9bb73ff1d6 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -978,7 +978,6 @@ static inline void put_dev_sector(Sector p) struct work_struct; int kblockd_schedule_work(struct request_queue *q, struct work_struct *work); -void kblockd_flush_work(struct work_struct *work); #define MODULE_ALIAS_BLOCKDEV(major,minor) \ MODULE_ALIAS("block-major-" __stringify(major) "-" __stringify(minor)) -- cgit v1.2.3 From ba744d5e290055d171c68067259fcc1e2721f542 Mon Sep 17 00:00:00 2001 From: Richard Kennedy Date: Wed, 3 Dec 2008 12:41:40 +0100 Subject: block: reorder struct bio to remove padding on 64bit Remove 8 bytes of padding from struct bio which also removes 16 bytes from struct bio_pair to make it 248 bytes. bio_pair then fits into one fewer cache lines & into a smaller slab. Signed-off-by: Richard Kennedy Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index cf132bfbbac..3ed714eb54d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -90,10 +90,11 @@ struct bio { unsigned int bi_comp_cpu; /* completion CPU */ + atomic_t bi_cnt; /* pin count */ + struct bio_vec *bi_io_vec; /* the actual vec list */ bio_end_io_t *bi_end_io; - atomic_t bi_cnt; /* pin count */ void *bi_private; #if defined(CONFIG_BLK_DEV_INTEGRITY) -- cgit v1.2.3 From 313e42999dbc0f234ca5909a236f78f082cb43b1 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:02 +0900 Subject: block: reorganize QUEUE_ORDERED_* constants Separate out ordering type (drain,) and action masks (preflush, postflush, fua) from visible ordering mode selectors (QUEUE_ORDERED_*). Ordering types are now named QUEUE_ORDERED_BY_* while action masks are named QUEUE_ORDERED_DO_*. This change is necessary to add QUEUE_ORDERED_DO_BAR and make it optional to improve empty barrier implementation. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e9bb73ff1d6..5c92b443239 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -523,22 +523,29 @@ enum { * TAG_FLUSH : ordering by tag w/ pre and post flushes * TAG_FUA : ordering by tag w/ pre flush and FUA write */ - QUEUE_ORDERED_NONE = 0x00, - QUEUE_ORDERED_DRAIN = 0x01, - QUEUE_ORDERED_TAG = 0x02, - - QUEUE_ORDERED_PREFLUSH = 0x10, - QUEUE_ORDERED_POSTFLUSH = 0x20, - QUEUE_ORDERED_FUA = 0x40, - - QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, - QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, - QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_POSTFLUSH, - QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | - QUEUE_ORDERED_PREFLUSH | QUEUE_ORDERED_FUA, + QUEUE_ORDERED_BY_DRAIN = 0x01, + QUEUE_ORDERED_BY_TAG = 0x02, + QUEUE_ORDERED_DO_PREFLUSH = 0x10, + QUEUE_ORDERED_DO_POSTFLUSH = 0x40, + QUEUE_ORDERED_DO_FUA = 0x80, + + QUEUE_ORDERED_NONE = 0x00, + + QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN, + QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH, + QUEUE_ORDERED_DRAIN_FUA = QUEUE_ORDERED_DRAIN | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_FUA, + + QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG, + QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_POSTFLUSH, + QUEUE_ORDERED_TAG_FUA = QUEUE_ORDERED_TAG | + QUEUE_ORDERED_DO_PREFLUSH | + QUEUE_ORDERED_DO_FUA, /* * Ordered operation sequence -- cgit v1.2.3 From f671620e7d895af221bdfeda751d54fa55ed9546 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:04 +0900 Subject: block: make every barrier action optional In all barrier sequences, the barrier write itself was always assumed to be issued and thus didn't have corresponding control flag. This patch adds QUEUE_ORDERED_DO_BAR and unify action mask handling in start_ordered() such that any barrier action can be skipped. This patch doesn't introduce any visible behavior changes. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 5c92b443239..b044267009e 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -526,12 +526,14 @@ enum { QUEUE_ORDERED_BY_DRAIN = 0x01, QUEUE_ORDERED_BY_TAG = 0x02, QUEUE_ORDERED_DO_PREFLUSH = 0x10, + QUEUE_ORDERED_DO_BAR = 0x20, QUEUE_ORDERED_DO_POSTFLUSH = 0x40, QUEUE_ORDERED_DO_FUA = 0x80, QUEUE_ORDERED_NONE = 0x00, - QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN, + QUEUE_ORDERED_DRAIN = QUEUE_ORDERED_BY_DRAIN | + QUEUE_ORDERED_DO_BAR, QUEUE_ORDERED_DRAIN_FLUSH = QUEUE_ORDERED_DRAIN | QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_POSTFLUSH, @@ -539,7 +541,8 @@ enum { QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_FUA, - QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG, + QUEUE_ORDERED_TAG = QUEUE_ORDERED_BY_TAG | + QUEUE_ORDERED_DO_BAR, QUEUE_ORDERED_TAG_FLUSH = QUEUE_ORDERED_TAG | QUEUE_ORDERED_DO_PREFLUSH | QUEUE_ORDERED_DO_POSTFLUSH, -- cgit v1.2.3 From 8f11b3e99a1136fcbb67316c3260f085299c0bff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:05 +0900 Subject: block: make barrier completion more robust Barrier completion had the following assumptions. * start_ordered() couldn't finish the whole sequence properly. If all actions are to be skipped, q->ordseq is set correctly but the actual completion was never triggered thus hanging the barrier request. * Drain completion in elv_complete_request() assumed that there's always at least one request in the queue when drain completes. Both assumptions are true but these assumptions need to be removed to improve empty barrier implementation. This patch makes the following changes. * Make start_ordered() use blk_ordered_complete_seq() to mark skipped steps complete and notify __elv_next_request() that it should fetch the next request if the whole barrier has completed inside start_ordered(). * Make drain completion path in elv_complete_request() check whether the queue is empty. Empty queue also indicates drain completion. * While at it, convert 0/1 return from blk_do_ordered() to false/true. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b044267009e..3c7078e0129 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -866,10 +866,10 @@ extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); extern int blk_queue_ordered(struct request_queue *, unsigned, prepare_flush_fn *); -extern int blk_do_ordered(struct request_queue *, struct request **); +extern bool blk_do_ordered(struct request_queue *, struct request **); extern unsigned blk_ordered_cur_seq(struct request_queue *); extern unsigned blk_ordered_req_seq(struct request *); -extern void blk_ordered_complete_seq(struct request_queue *, unsigned, int); +extern bool blk_ordered_complete_seq(struct request_queue *, unsigned, int); extern int blk_rq_map_sg(struct request_queue *, struct request *, struct scatterlist *); extern void blk_dump_rq_flags(struct request *, char *); -- cgit v1.2.3 From 58eea927d2de43dc6f03d1ba2c46e55854b31540 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 28 Nov 2008 13:32:06 +0900 Subject: block: simplify empty barrier implementation Empty barrier required special handling in __elv_next_request() to complete it without letting the low level driver see it. With previous changes, barrier code is now flexible enough to skip the BAR step using the same barrier sequence selection mechanism. Drop the special handling and mask off q->ordered from start_ordered(). Remove blk_empty_barrier() test which now has no user. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 3c7078e0129..41bbadfd17f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -596,7 +596,6 @@ enum { #define blk_fua_rq(rq) ((rq)->cmd_flags & REQ_FUA) #define blk_discard_rq(rq) ((rq)->cmd_flags & REQ_DISCARD) #define blk_bidi_rq(rq) ((rq)->next_rq != NULL) -#define blk_empty_barrier(rq) (blk_barrier_rq(rq) && blk_fs_request(rq) && !(rq)->hard_nr_sectors) /* rq->queuelist of dequeued request must be list_empty() */ #define blk_queued_rq(rq) (!list_empty(&(rq)->queuelist)) -- cgit v1.2.3 From 7ff9345ffac56743b5001561bc2dc1e041b79149 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Dec 2008 11:53:43 +0100 Subject: bio: only mempool back the largest bio_vec slab cache We only very rarely need the mempool backing, so it makes sense to get rid of all but one of the mempool in a bio_set. So keep the largest bio_vec count mempool so we can always honor the largest allocation, and "upgrade" callers that fail. Signed-off-by: Jens Axboe --- include/linux/bio.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 3ed714eb54d..d76e4bf22f2 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -397,13 +397,14 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) */ #define BIO_POOL_SIZE 2 #define BIOVEC_NR_POOLS 6 +#define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) struct bio_set { mempool_t *bio_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; #endif - mempool_t *bvec_pools[BIOVEC_NR_POOLS]; + mempool_t *bvec_pool; }; struct biovec_slab { -- cgit v1.2.3 From 1b4344986926da324b5cd10b683e5a1a5e1b7db3 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 22 Oct 2008 20:32:58 +0200 Subject: bio: move the slab pointer inside the bio_set In preparation for adding differently sized bios. Signed-off-by: Jens Axboe --- include/linux/bio.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index d76e4bf22f2..9340098d75d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -400,6 +400,7 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) #define BIOVEC_MAX_IDX (BIOVEC_NR_POOLS - 1) struct bio_set { + struct kmem_cache *bio_slab; mempool_t *bio_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; -- cgit v1.2.3 From bb799ca0202a360fa74d5f17039b9100caebdde7 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Wed, 10 Dec 2008 15:35:05 +0100 Subject: bio: allow individual slabs in the bio_set Instead of having a global bio slab cache, add a reference to one in each bio_set that is created. This allows for personalized slabs in each bio_set, so that they can have bios of different sizes. This means we can personalize the bios we return. File systems may want to embed the bio inside another structure, to avoid allocation more items (and stuffing them in ->bi_private) after the get a bio. Or we may want to embed a number of bio_vecs directly at the end of a bio, to avoid doing two allocations to return a bio. This is now possible. Signed-off-by: Jens Axboe --- include/linux/bio.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 9340098d75d..4b80d3537f9 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -334,7 +334,7 @@ struct bio_pair { extern struct bio_pair *bio_split(struct bio *bi, int first_sectors); extern void bio_pair_release(struct bio_pair *dbio); -extern struct bio_set *bioset_create(int, int); +extern struct bio_set *bioset_create(unsigned int, unsigned int); extern void bioset_free(struct bio_set *); extern struct bio *bio_alloc(gfp_t, int); @@ -379,6 +379,7 @@ extern struct bio *bio_copy_user_iov(struct request_queue *, extern int bio_uncopy_user(struct bio *); void zero_fill_bio(struct bio *bio); extern struct bio_vec *bvec_alloc_bs(gfp_t, int, unsigned long *, struct bio_set *); +extern void bvec_free_bs(struct bio_set *, struct bio_vec *, unsigned int); extern unsigned int bvec_nr_vecs(unsigned short idx); /* @@ -401,6 +402,8 @@ static inline void bio_set_completion_cpu(struct bio *bio, unsigned int cpu) struct bio_set { struct kmem_cache *bio_slab; + unsigned int front_pad; + mempool_t *bio_pool; #if defined(CONFIG_BLK_DEV_INTEGRITY) mempool_t *bio_integrity_pool; @@ -415,6 +418,7 @@ struct biovec_slab { }; extern struct bio_set *fs_bio_set; +extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly; /* * a small number of entries is fine, not going to be performance critical. -- cgit v1.2.3 From 392ddc32982a5c661dd90dd49a3cb37f1c68b782 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 23 Dec 2008 12:42:54 +0100 Subject: bio: add support for inlining a number of bio_vecs inside the bio When we go and allocate a bio for IO, we actually do two allocations. One for the bio itself, and one for the bi_io_vec that holds the actual pages we are interested in. This feature inlines a definable amount of io vecs inside the bio itself, so we eliminate the bio_vec array allocation for IO's up to a certain size. It defaults to 4 vecs, which is typically 16k of IO. Signed-off-by: Jens Axboe --- include/linux/bio.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bio.h b/include/linux/bio.h index 4b80d3537f9..18462c5b8ff 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -102,6 +102,13 @@ struct bio { #endif bio_destructor_t *bi_destructor; /* destructor */ + + /* + * We can inline a number of vecs at the end of the bio, to avoid + * double allocations for a small number of bio_vecs. This member + * MUST obviously be kept at the very end of the bio. + */ + struct bio_vec bi_inline_vecs[0]; }; /* @@ -213,6 +220,11 @@ static inline void *bio_data(struct bio *bio) return NULL; } +static inline int bio_has_allocated_vec(struct bio *bio) +{ + return bio->bi_io_vec && bio->bi_io_vec != bio->bi_inline_vecs; +} + /* * will die */ -- cgit v1.2.3 From abf137dd7712132ee56d5b3143c2ff61a72a5faa Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 9 Dec 2008 08:11:22 +0100 Subject: aio: make the lookup_ioctx() lockless The mm->ioctx_list is currently protected by a reader-writer lock, so we always grab that lock on the read side for doing ioctx lookups. As the workload is extremely reader biased, turn this into an rcu hlist so we can make lookup_ioctx() lockless. Get rid of the rwlock and use a spinlock for providing update side exclusion. There's usually only 1 entry on this list, so it doesn't make sense to look into fancier data structures. Reviewed-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/linux/aio.h | 5 ++++- include/linux/mm_types.h | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/aio.h b/include/linux/aio.h index f6b8cf99b59..b16a957030f 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -5,6 +5,7 @@ #include #include #include +#include #include @@ -183,7 +184,7 @@ struct kioctx { /* This needs improving */ unsigned long user_id; - struct kioctx *next; + struct hlist_node list; wait_queue_head_t wait; @@ -199,6 +200,8 @@ struct kioctx { struct aio_ring_info ring_info; struct delayed_work wq; + + struct rcu_head rcu_head; }; /* prototypes */ diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index fe825471d5a..9cfc9b627fd 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -232,8 +232,9 @@ struct mm_struct { struct core_state *core_state; /* coredumping support */ /* aio bits */ - rwlock_t ioctx_list_lock; /* aio lock */ - struct kioctx *ioctx_list; + spinlock_t ioctx_lock; + struct hlist_head ioctx_list; + #ifdef CONFIG_MM_OWNER /* * "owner" points to a task that is regarded as the canonical -- cgit v1.2.3 From b374d18a4bfce705e4a99ae9f501b53e86ecb283 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 31 Oct 2008 10:05:07 +0100 Subject: block: get rid of elevator_t typedef Just use struct elevator_queue everywhere instead. Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +-- include/linux/elevator.h | 8 ++++---- 2 files changed, 5 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 41bbadfd17f..7035cec583b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -26,7 +26,6 @@ struct scsi_ioctl_command; struct request_queue; struct elevator_queue; -typedef struct elevator_queue elevator_t; struct request_pm_state; struct blk_trace; struct request; @@ -313,7 +312,7 @@ struct request_queue */ struct list_head queue_head; struct request *last_merge; - elevator_t *elevator; + struct elevator_queue *elevator; /* * the queue request freelist, one for reads and one for writes diff --git a/include/linux/elevator.h b/include/linux/elevator.h index 92f6f634e3e..7a204256b15 100644 --- a/include/linux/elevator.h +++ b/include/linux/elevator.h @@ -28,7 +28,7 @@ typedef void (elevator_activate_req_fn) (struct request_queue *, struct request typedef void (elevator_deactivate_req_fn) (struct request_queue *, struct request *); typedef void *(elevator_init_fn) (struct request_queue *); -typedef void (elevator_exit_fn) (elevator_t *); +typedef void (elevator_exit_fn) (struct elevator_queue *); struct elevator_ops { @@ -62,8 +62,8 @@ struct elevator_ops struct elv_fs_entry { struct attribute attr; - ssize_t (*show)(elevator_t *, char *); - ssize_t (*store)(elevator_t *, const char *, size_t); + ssize_t (*show)(struct elevator_queue *, char *); + ssize_t (*store)(struct elevator_queue *, const char *, size_t); }; /* @@ -130,7 +130,7 @@ extern ssize_t elv_iosched_show(struct request_queue *, char *); extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); extern int elevator_init(struct request_queue *, char *); -extern void elevator_exit(elevator_t *); +extern void elevator_exit(struct elevator_queue *); extern int elv_rq_merge_ok(struct request *, struct bio *); /* -- cgit v1.2.3 From a6f23657d3072bde6844055bbc2290e497f33fbc Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Oct 2008 12:52:42 +0200 Subject: block: add one-hit cache for disk partition lookup disk_map_sector_rcu() returns a partition from a sector offset, which we use for IO statistics on a per-partition basis. The lookup itself is an O(N) list lookup, where N is the number of partitions. This actually hurts performance quite a bit, even on the lower end partitions. On higher numbered partitions, it can get pretty bad. Solve this by adding a one-hit cache for partition lookup. This makes the lookup O(1) for the case where we do most IO to one partition. Even for mixed partition workloads, amortized cost is pretty close to O(1) since the natural IO batching makes the one-hit cache last for lots of IOs. Signed-off-by: Jens Axboe --- include/linux/genhd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 3df7742ce24..16948eaecae 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -126,6 +126,7 @@ struct blk_scsi_cmd_filter { struct disk_part_tbl { struct rcu_head rcu_head; int len; + struct hd_struct *last_lookup; struct hd_struct *part[]; }; -- cgit v1.2.3 From b3a6ffe16b5cc48abe7db8d04882dc45280eb693 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 12 Dec 2008 09:51:16 +0100 Subject: Get rid of CONFIG_LSF We have two seperate config entries for large devices/files. One is CONFIG_LBD that guards just the devices, the other is CONFIG_LSF that handles large files. This doesn't make a lot of sense, you typically want both or none. So get rid of CONFIG_LSF and change CONFIG_LBD wording to indicate that it covers both. Acked-by: Jean Delvare Signed-off-by: Jens Axboe --- include/linux/types.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/types.h b/include/linux/types.h index 1d98330b1f2..121f349cb7e 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -135,19 +135,14 @@ typedef __s64 int64_t; * * Linux always considers sectors to be 512 bytes long independently * of the devices real block size. + * + * blkcnt_t is the type of the inode's block count. */ #ifdef CONFIG_LBD typedef u64 sector_t; -#else -typedef unsigned long sector_t; -#endif - -/* - * The type of the inode's block count. - */ -#ifdef CONFIG_LSF typedef u64 blkcnt_t; #else +typedef unsigned long sector_t; typedef unsigned long blkcnt_t; #endif -- cgit v1.2.3 From f453ba0460742ad027ae0c4c7d61e62817b3e7ef Mon Sep 17 00:00:00 2001 From: Dave Airlie Date: Fri, 7 Nov 2008 14:05:41 -0800 Subject: DRM: add mode setting support Add mode setting support to the DRM layer. This is a fairly big chunk of work that allows DRM drivers to provide full output control and configuration capabilities to userspace. It was motivated by several factors: - the fb layer's APIs aren't suited for anything but simple configurations - coordination between the fb layer, DRM layer, and various userspace drivers is poor to non-existent (radeonfb excepted) - user level mode setting drivers makes displaying panic & oops messages more difficult - suspend/resume of graphics state is possible in many more configurations with kernel level support This commit just adds the core DRM part of the mode setting APIs. Driver specific commits using these new structure and APIs will follow. Co-authors: Jesse Barnes , Jakob Bornecrantz Contributors: Alan Hourihane , Maarten Maathuis Signed-off-by: Jesse Barnes Signed-off-by: Eric Anholt Signed-off-by: Dave Airlie --- include/linux/console.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/console.h b/include/linux/console.h index 248e6e3b9b7..a67a90cf826 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -153,4 +153,8 @@ void vcs_remove_sysfs(struct tty_struct *tty); #define VESA_HSYNC_SUSPEND 2 #define VESA_POWERDOWN 3 +#ifdef CONFIG_VGA_CONSOLE +extern bool vgacon_text_force(void); +#endif + #endif /* _LINUX_CONSOLE_H */ -- cgit v1.2.3 From 773ff60e841461cb1f9374a713ffcda029b8c317 Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Tue, 23 Dec 2008 19:37:01 +0900 Subject: SLUB: failslab support Currently fault-injection capability for SLAB allocator is only available to SLAB. This patch makes it available to SLUB, too. [penberg@cs.helsinki.fi: unify slab and slub implementations] Cc: Christoph Lameter Cc: Matt Mackall Signed-off-by: Akinobu Mita Signed-off-by: Pekka Enberg --- include/linux/fault-inject.h | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fault-inject.h b/include/linux/fault-inject.h index 32368c4f032..06ca9b21dad 100644 --- a/include/linux/fault-inject.h +++ b/include/linux/fault-inject.h @@ -81,4 +81,13 @@ static inline void cleanup_fault_attr_dentries(struct fault_attr *attr) #endif /* CONFIG_FAULT_INJECTION */ +#ifdef CONFIG_FAILSLAB +extern bool should_failslab(size_t size, gfp_t gfpflags); +#else +static inline bool should_failslab(size_t size, gfp_t gfpflags) +{ + return false; +} +#endif /* CONFIG_FAILSLAB */ + #endif /* _LINUX_FAULT_INJECT_H */ -- cgit v1.2.3 From dfcd3610289132a762b7dc0eaf33998262cd9e20 Mon Sep 17 00:00:00 2001 From: Pascal Terjan Date: Tue, 25 Nov 2008 15:08:19 +0100 Subject: slab: Fix comment on #endif This #endif in slab.h is described as closing the inner block while it's for the big CONFIG_NUMA one. That makes reading the code a bit harder. This trivial patch fixes the comment. Signed-off-by: Pascal Terjan Signed-off-by: Pekka Enberg --- include/linux/slab.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 000da12b5cf..9d8ca14be3c 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -285,7 +285,7 @@ extern void *__kmalloc_node_track_caller(size_t, gfp_t, int, void *); #define kmalloc_node_track_caller(size, flags, node) \ kmalloc_track_caller(size, flags) -#endif /* DEBUG_SLAB */ +#endif /* CONFIG_NUMA */ /* * Shortcuts -- cgit v1.2.3 From d61c72e52b98411d1cfef1fdb3f5a8bb070f8966 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Wed, 10 Dec 2008 14:07:21 +0100 Subject: DMI: add dmi_match Add a wrapper for testing system_info which will handle also NULL system infos. This will be used by the ata PIIX driver. Signed-off-by: Jiri Slaby Cc: Alexandru Romanescu Cc: Tejun Heo Cc: Alan Cox Signed-off-by: Jeff Garzik --- include/linux/dmi.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dmi.h b/include/linux/dmi.h index 2bfda178f27..34161907b2f 100644 --- a/include/linux/dmi.h +++ b/include/linux/dmi.h @@ -47,6 +47,7 @@ extern int dmi_name_in_vendors(const char *str); extern int dmi_name_in_serial(const char *str); extern int dmi_available; extern int dmi_walk(void (*decode)(const struct dmi_header *)); +extern bool dmi_match(enum dmi_field f, const char *str); #else @@ -61,6 +62,8 @@ static inline int dmi_name_in_serial(const char *s) { return 0; } #define dmi_available 0 static inline int dmi_walk(void (*decode)(const struct dmi_header *)) { return -1; } +static inline bool dmi_match(enum dmi_field f, const char *str) + { return false; } #endif -- cgit v1.2.3 From 2a2ca6a96194c4744a2adeefbc09ce881f3c5abe Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:31 +0100 Subject: ide: replace the global ide_lock spinlock by per-hwgroup spinlocks (v2) Now that (almost) all host drivers have been fixed not to abuse ide_lock and core code usage of ide_lock has been sanitized we may safely replace ide_lock by per-hwgroup locks. This patch is partially based on earlier patch from Ravikiran G Thirumalai. While at it: - don't use deprecated HWIF() and HWGROUP() macros - update locking documentation in ide.h v2: Add missing spin_lock_init(&hwgroup->lock). (Noticed by Elias Oltmanns) Cc: Vaibhav V. Nivargi Cc: Alok N. Kataria Cc: Shai Fultheim Signed-off-by: Ravikiran Thirumalai Cc: Elias Oltmanns Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 010fb26a157..c871d325ced 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -909,6 +909,8 @@ typedef struct hwgroup_s { int req_gen; int req_gen_timer; + + spinlock_t lock; } ide_hwgroup_t; typedef struct ide_driver_s ide_driver_t; @@ -1610,13 +1612,13 @@ extern struct mutex ide_cfg_mtx; /* * Structure locking: * - * ide_cfg_mtx and ide_lock together protect changes to - * ide_hwif_t->{next,hwgroup} + * ide_cfg_mtx and hwgroup->lock together protect changes to + * ide_hwif_t->next * ide_drive_t->next * - * ide_hwgroup_t->busy: ide_lock - * ide_hwgroup_t->hwif: ide_lock - * ide_hwif_t->mate: constant, no locking + * ide_hwgroup_t->busy: hwgroup->lock + * ide_hwgroup_t->hwif: hwgroup->lock + * ide_hwif_t->{hwgroup,mate}: constant, no locking * ide_drive_t->hwif: constant, no locking */ -- cgit v1.2.3 From 27c01c2db05c3cf8824975e50403cd4fd9356dca Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:32 +0100 Subject: ide-cd: remove obsolete seek optimization It doesn't make much sense nowadays and is problematic on some drives. Cc: Borislav Petkov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 4 ---- 1 file changed, 4 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index c871d325ced..0b8af0535d8 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -122,8 +122,6 @@ struct ide_io_ports { #define MAX_DRIVES 2 /* per interface; 2 assumed by lots of code */ #define SECTOR_SIZE 512 -#define IDE_LARGE_SEEK(b1,b2,t) (((b1) > (b2) + (t)) || ((b2) > (b1) + (t))) - /* * Timeouts for various operations: */ @@ -496,8 +494,6 @@ enum { * when more than one interrupt is needed. */ IDE_AFLAG_LIMIT_NFRAMES = (1 << 7), - /* Seeking in progress. */ - IDE_AFLAG_SEEKING = (1 << 8), /* Saved TOC information is current. */ IDE_AFLAG_TOC_VALID = (1 << 9), /* We think that the drive door is locked. */ -- cgit v1.2.3 From 6b5cde3629701258004b94cde75dd1089b556b02 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:32 +0100 Subject: cmd64x: set IDE_HFLAG_SERIALIZE explictly for CMD646 * Set IDE_HFLAG_SERIALIZE explictly for CMD646. * Remove no longer needed ide_cmd646 chipset type (which has a nice side-effect of fixing handling of unexpected IRQs). Cc: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 0b8af0535d8..150e42311ee 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -171,7 +171,7 @@ enum { ide_unknown, ide_generic, ide_pci, ide_cmd640, ide_dtc2278, ide_ali14xx, ide_qd65xx, ide_umc8672, ide_ht6560b, ide_rz1000, ide_trm290, - ide_cmd646, ide_cy82c693, ide_4drives, + ide_cy82c693, ide_4drives, ide_pmac, ide_acorn, ide_au1xxx, ide_palm3710 }; -- cgit v1.2.3 From f58c1ab8deebc2360cef998f169a6727c288210f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:33 +0100 Subject: ide: always set nIEN on idle devices * Set nIEN for previous port/device in ide_do_request() also if port uses a non-shared IRQ. * Remove no longer needed ide_hwif_t.sharing_irq. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 150e42311ee..1d28006aec6 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -842,7 +842,6 @@ typedef struct hwif_s { unsigned present : 1; /* this interface exists */ unsigned serialized : 1; /* serialized all channel operation */ - unsigned sharing_irq: 1; /* 1 = sharing irq with another hwif */ unsigned sg_mapped : 1; /* sg_table and sg_nents are ready */ struct device gendev; -- cgit v1.2.3 From 7f1ac8c4b9dadc55ec656b368f5f470f2cbe3083 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:33 +0100 Subject: rz1000: apply chipset quirks early (v2) * Use pci_name(dev) instead of hwif->name in init_hwif_rz1000(). * init_hwif_rz1000() -> rz1000_init_chipset(). Update rz1000_init_one() to use rz1000_init_chipset() and add now required rz1000_remove(). * Remove superfluous ide_rz1000 chipset type. v2: * unsigned int rz1000_init_chipset() -> int rz1000_disable_readahead() per Sergei's suggestion. Cc: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 1d28006aec6..fc1a966c7b7 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -170,8 +170,7 @@ typedef int (ide_ack_intr_t)(struct hwif_s *); enum { ide_unknown, ide_generic, ide_pci, ide_cmd640, ide_dtc2278, ide_ali14xx, ide_qd65xx, ide_umc8672, ide_ht6560b, - ide_rz1000, ide_trm290, - ide_cy82c693, ide_4drives, + ide_trm290, ide_cy82c693, ide_4drives, ide_pmac, ide_acorn, ide_au1xxx, ide_palm3710 }; -- cgit v1.2.3 From 6b4924962c49655494d2c8e9d3faab0e349a3062 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:34 +0100 Subject: ide: add ->max_sectors field to struct ide_port_info * Add ->max_sectors field to struct ide_port_info to allow host drivers to specify value used for hwif->rqsize (if smaller than the default). * Convert pdc202xx_old to use ->max_sectors and remove no longer needed IDE_HFLAG_RQSIZE_256 flag. There should be no functional changes caused by this patch. Acked-by: Sergei Shtyltov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index fc1a966c7b7..2574dda4a3e 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1372,8 +1372,6 @@ enum { IDE_HFLAG_LEGACY_IRQS = (1 << 21), /* force use of legacy IRQs */ IDE_HFLAG_FORCE_LEGACY_IRQS = (1 << 22), - /* limit LBA48 requests to 256 sectors */ - IDE_HFLAG_RQSIZE_256 = (1 << 23), /* use 32-bit I/O ops */ IDE_HFLAG_IO_32BIT = (1 << 24), /* unmask IRQs */ @@ -1411,6 +1409,9 @@ struct ide_port_info { ide_pci_enablebit_t enablebits[2]; hwif_chipset_t chipset; + + u16 max_sectors; /* if < than the default one */ + u32 host_flags; u8 pio_mask; u8 swdma_mask; -- cgit v1.2.3 From 1f66019bdf902cb59adf959e462bcd3f4c01f683 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:34 +0100 Subject: trm290: add IDE_HFLAG_TRM290 host flag * Add IDE_HFLAG_TRM290 host flag and use it in ide_build_dmatable(). * Remove no longer needed ide_trm290 chipset type. Acked-by: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 2574dda4a3e..f62d35a5fb7 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -170,7 +170,7 @@ typedef int (ide_ack_intr_t)(struct hwif_s *); enum { ide_unknown, ide_generic, ide_pci, ide_cmd640, ide_dtc2278, ide_ali14xx, ide_qd65xx, ide_umc8672, ide_ht6560b, - ide_trm290, ide_cy82c693, ide_4drives, + ide_cy82c693, ide_4drives, ide_pmac, ide_acorn, ide_au1xxx, ide_palm3710 }; @@ -1372,6 +1372,8 @@ enum { IDE_HFLAG_LEGACY_IRQS = (1 << 21), /* force use of legacy IRQs */ IDE_HFLAG_FORCE_LEGACY_IRQS = (1 << 22), + /* host is TRM290 */ + IDE_HFLAG_TRM290 = (1 << 23), /* use 32-bit I/O ops */ IDE_HFLAG_IO_32BIT = (1 << 24), /* unmask IRQs */ -- cgit v1.2.3 From b7876a6fb6e9bf6cbcf7b40cf034aa4138d7978f Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:34 +0100 Subject: cy82c693: remove superfluous ide_cy82c693 chipset type Since CY82C693 doesn't require serialization we may as well use the default ide_pci chipset type. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index f62d35a5fb7..f89d6d69a38 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -170,8 +170,7 @@ typedef int (ide_ack_intr_t)(struct hwif_s *); enum { ide_unknown, ide_generic, ide_pci, ide_cmd640, ide_dtc2278, ide_ali14xx, ide_qd65xx, ide_umc8672, ide_ht6560b, - ide_cy82c693, ide_4drives, - ide_pmac, ide_acorn, + ide_4drives, ide_pmac, ide_acorn, ide_au1xxx, ide_palm3710 }; -- cgit v1.2.3 From 702c026be87ef8374ae58122969a4b0b081ce6f2 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:36 +0100 Subject: ide: rework handling of serialized ports (v2) * hpt366: set IDE_HFLAG_SERIALIZE in ->host_flags if needed in init_hwif_hpt366(). Remove HPT_SERIALIZE_IO while at it. * Set IDE_HFLAG_SERIALIZE in ->host_flags if needed in ide_init_port(). * Convert init_irq() to use IDE_HFLAG_SERIALIZE together with hwif->host to find out ports which need to be serialized. * Remove no longer needed save_match() and ide_hwif_t.serialized. v2: * Set host's ->host_flags field instead of port's copy. This patch should fix the incorrect grouping of port(s) from host(s) that need serialization with port(s) that happen to use the same IRQ(s) but are from the host(s) that don't need it. Cc: Sergei Shtylyov Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index f89d6d69a38..9b89cab6d49 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -839,7 +839,6 @@ typedef struct hwif_s { unsigned extra_ports; /* number of extra dma ports */ unsigned present : 1; /* this interface exists */ - unsigned serialized : 1; /* serialized all channel operation */ unsigned sg_mapped : 1; /* sg_table and sg_nents are ready */ struct device gendev; -- cgit v1.2.3 From e2984c628c924442132304ae662da433f41c05c9 Mon Sep 17 00:00:00 2001 From: Bartlomiej Zolnierkiewicz Date: Mon, 29 Dec 2008 20:27:37 +0100 Subject: ide: move Power Management support to ide-pm.c There should be no functional changes caused by this patch. Signed-off-by: Bartlomiej Zolnierkiewicz --- include/linux/ide.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include/linux') diff --git a/include/linux/ide.h b/include/linux/ide.h index 9b89cab6d49..e99c56de7f5 100644 --- a/include/linux/ide.h +++ b/include/linux/ide.h @@ -1116,6 +1116,14 @@ enum { IDE_PM_COMPLETED, }; +int generic_ide_suspend(struct device *, pm_message_t); +int generic_ide_resume(struct device *); + +void ide_complete_power_step(ide_drive_t *, struct request *); +ide_startstop_t ide_start_power_step(ide_drive_t *, struct request *); +void ide_complete_pm_request(ide_drive_t *, struct request *); +void ide_check_pm_state(ide_drive_t *, struct request *); + /* * Subdrivers support. * -- cgit v1.2.3 From 69acdf1e5a9146ec6667f6c4b439acd38c18f5ea Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Tue, 4 Nov 2008 21:59:37 -0300 Subject: V4L/DVB (9530): Add new pixel format VYUY 16 bits wide. There were already 3 YUV formats defined : - YUYV - YVYU - UYVY The only left combination is VYUY, which is added in this patch. Signed-off-by: Robert Jarzmik Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 4669d7e72e7..ec311d4616c 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -293,6 +293,7 @@ struct v4l2_pix_format { #define V4L2_PIX_FMT_YVU420 v4l2_fourcc('Y', 'V', '1', '2') /* 12 YVU 4:2:0 */ #define V4L2_PIX_FMT_YUYV v4l2_fourcc('Y', 'U', 'Y', 'V') /* 16 YUV 4:2:2 */ #define V4L2_PIX_FMT_UYVY v4l2_fourcc('U', 'Y', 'V', 'Y') /* 16 YUV 4:2:2 */ +#define V4L2_PIX_FMT_VYUY v4l2_fourcc('V', 'Y', 'U', 'Y') /* 16 YUV 4:2:2 */ #define V4L2_PIX_FMT_YUV422P v4l2_fourcc('4', '2', '2', 'P') /* 16 YVU422 planar */ #define V4L2_PIX_FMT_YUV411P v4l2_fourcc('4', '1', '1', 'P') /* 16 YVU411 planar */ #define V4L2_PIX_FMT_Y41P v4l2_fourcc('Y', '4', '1', 'P') /* 12 YUV 4:1:1 */ -- cgit v1.2.3 From 480daab42c4dd74b3c07031ddf9031251c530c77 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:25:56 -0600 Subject: virtio: Don't use PAGE_SIZE in virtio_pci.c The virtio PCI devices don't depend on the guest page size. This matters now PowerPC virtio is gaining ground (they like 64k pages). Signed-off-by: Rusty Russell --- include/linux/virtio_pci.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h index cdef3574293..e13d7ebcf57 100644 --- a/include/linux/virtio_pci.h +++ b/include/linux/virtio_pci.h @@ -53,4 +53,8 @@ /* Virtio ABI version, this must match exactly */ #define VIRTIO_PCI_ABI_VERSION 0 + +/* How many bits to shift physical queue address written to QUEUE_PFN. + * 12 is historical, and due to x86 page size. */ +#define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12 #endif -- cgit v1.2.3 From 5f0d1d7f2286c8a02dab69f5f0bd51681fab161e Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:25:57 -0600 Subject: virtio: rename 'pagesize' arg to vring_init/vring_size It's really the alignment desired for consumer/producer separation; historically this x86 pagesize, but with PowerPC it'll still be x86 pagesize. And in theory lguest could choose a different value. Signed-off-by: Rusty Russell --- include/linux/virtio_ring.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index c4a598fb382..01bf3124e31 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -83,7 +83,7 @@ struct vring { * __u16 avail_idx; * __u16 available[num]; * - * // Padding to the next page boundary. + * // Padding to the next align boundary. * char pad[]; * * // A ring of used descriptor heads with free-running index. @@ -93,19 +93,19 @@ struct vring { * }; */ static inline void vring_init(struct vring *vr, unsigned int num, void *p, - unsigned long pagesize) + unsigned long align) { vr->num = num; vr->desc = p; vr->avail = p + num*sizeof(struct vring_desc); - vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + pagesize-1) - & ~(pagesize - 1)); + vr->used = (void *)(((unsigned long)&vr->avail->ring[num] + align-1) + & ~(align - 1)); } -static inline unsigned vring_size(unsigned int num, unsigned long pagesize) +static inline unsigned vring_size(unsigned int num, unsigned long align) { return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num) - + pagesize - 1) & ~(pagesize - 1)) + + align - 1) & ~(align - 1)) + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num; } -- cgit v1.2.3 From 498af14783935af487d17dbee4ac451783cbc2b7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:25:57 -0600 Subject: virtio: Don't use PAGE_SIZE for vring alignment in virtio_pci. That doesn't work for non-4k guests which are now appearing. Signed-off-by: Rusty Russell --- include/linux/virtio_pci.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h index e13d7ebcf57..cd0fd5d181a 100644 --- a/include/linux/virtio_pci.h +++ b/include/linux/virtio_pci.h @@ -57,4 +57,8 @@ /* How many bits to shift physical queue address written to QUEUE_PFN. * 12 is historical, and due to x86 page size. */ #define VIRTIO_PCI_QUEUE_ADDR_SHIFT 12 + +/* The alignment to use between consumer and producer parts of vring. + * x86 pagesize again. */ +#define VIRTIO_PCI_VRING_ALIGN 4096 #endif -- cgit v1.2.3 From 2966af73e70dee461c256b5eb877b2ff757f8c82 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:25:58 -0600 Subject: virtio: use LGUEST_VRING_ALIGN instead of relying on pagesize This doesn't really matter, since lguest is i386 only at the moment, but we could actually choose a different value. (lguest doesn't have a guarenteed ABI). Signed-off-by: Rusty Russell --- include/linux/lguest_launcher.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index e7217dc58f3..bd0eba76052 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -59,4 +59,8 @@ enum lguest_req LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ }; + +/* The alignment to use between consumer and producer parts of vring. + * x86 pagesize for historical reasons. */ +#define LGUEST_VRING_ALIGN 4096 #endif /* _LINUX_LGUEST_LAUNCHER */ -- cgit v1.2.3 From 87c7d57c17ade5024d95b6ca0da249da49b0672a Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Tue, 30 Dec 2008 09:26:03 -0600 Subject: virtio: hand virtio ring alignment as argument to vring_new_virtqueue This allows each virtio user to hand in the alignment appropriate to their virtio_ring structures. Signed-off-by: Rusty Russell Acked-by: Christian Borntraeger --- include/linux/virtio_ring.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index 01bf3124e31..71e03722fb5 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -115,6 +115,7 @@ struct virtio_device; struct virtqueue; struct virtqueue *vring_new_virtqueue(unsigned int num, + unsigned int vring_align, struct virtio_device *vdev, void *pages, void (*notify)(struct virtqueue *vq), -- cgit v1.2.3 From 1b4aa2faeca1b9922033daf2475b6fc13b0ffea6 Mon Sep 17 00:00:00 2001 From: Hollis Blanchard Date: Thu, 13 Nov 2008 15:48:33 -0600 Subject: virtio: avoid implicit use of Linux page size in balloon interface Make the balloon interface always use 4K pages, and convert Linux pfns if necessary. This patch assumes that Linux's PAGE_SHIFT will never be less than 12. Signed-off-by: Hollis Blanchard Signed-off-by: Rusty Russell (modified) --- include/linux/virtio_balloon.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h index c30c7bfbf39..8726ff77763 100644 --- a/include/linux/virtio_balloon.h +++ b/include/linux/virtio_balloon.h @@ -10,6 +10,9 @@ /* The feature bitmap for virtio balloon */ #define VIRTIO_BALLOON_F_MUST_TELL_HOST 0 /* Tell before reclaiming pages */ +/* Size of a PFN in the balloon interface. */ +#define VIRTIO_BALLOON_PFN_SHIFT 12 + struct virtio_balloon_config { /* Number of pages host wants Guest to give up. */ -- cgit v1.2.3 From c29834584ea4eafccf2f62a0b8a32e64f792044c Mon Sep 17 00:00:00 2001 From: Christian Borntraeger Date: Tue, 25 Nov 2008 13:36:26 +0100 Subject: virtio_console: support console resizing this patch uses the new hvc callback hvc_resize to set the window size which allows to change the tty size of hvc_console via a hvc_resize function. I have added a new feature bit VIRTIO_CONSOLE_F_SIZE. The driver will change the window size on tty open and via the config_changed callback of the transport. Currently lguest and kvm_s390 have not implemented this callback, but the callback can be implemented at a later point in time. Signed-off-by: Christian Borntraeger Signed-off-by: Rusty Russell --- include/linux/virtio_console.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include/linux') diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h index 19a0da0dba4..7615ffcdd55 100644 --- a/include/linux/virtio_console.h +++ b/include/linux/virtio_console.h @@ -7,6 +7,17 @@ /* The ID for virtio console */ #define VIRTIO_ID_CONSOLE 3 +/* Feature bits */ +#define VIRTIO_CONSOLE_F_SIZE 0 /* Does host provide console size? */ + +struct virtio_console_config { + /* colums of the screens */ + __u16 cols; + /* rows of the screens */ + __u16 rows; +} __attribute__((packed)); + + #ifdef __KERNEL__ int __init virtio_cons_early_init(int (*put_chars)(u32, const char *, int)); #endif /* __KERNEL__ */ -- cgit v1.2.3 From 58a24566449892dda409b9ad92c2e56c76c5670c Mon Sep 17 00:00:00 2001 From: Matias Zabaljauregui Date: Mon, 29 Sep 2008 01:40:07 -0300 Subject: lguest: move the initial guest page table creation code to the host This patch moves the initial guest page table creation code to the host, so the launcher keeps working with PAE enabled configs. Signed-off-by: Matias Zabaljauregui Signed-off-by: Rusty Russell --- include/linux/lguest_launcher.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/lguest_launcher.h b/include/linux/lguest_launcher.h index bd0eba76052..a53407a4165 100644 --- a/include/linux/lguest_launcher.h +++ b/include/linux/lguest_launcher.h @@ -54,7 +54,7 @@ struct lguest_vqconfig { /* Write command first word is a request. */ enum lguest_req { - LHREQ_INITIALIZE, /* + base, pfnlimit, pgdir, start */ + LHREQ_INITIALIZE, /* + base, pfnlimit, start */ LHREQ_GETDMA, /* No longer used */ LHREQ_IRQ, /* + irq */ LHREQ_BREAK, /* + on/off flag (on blocks until someone does off) */ -- cgit v1.2.3 From 0877258d98154565def498833ccb208234c85084 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 14 Dec 2008 16:21:16 -0300 Subject: V4L/DVB (9897): v4l2: Add camera zoom controls The zoom controls move the zoom lens group to a an absolute position, as a relative displacement or at a given speed until reaching physical device limits. Positive values move the zoom lens group towards the telephoto direction, negative values towards the wide-angle direction. Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index ec311d4616c..e450a92a622 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1118,6 +1118,10 @@ enum v4l2_exposure_auto_type { #define V4L2_CID_FOCUS_RELATIVE (V4L2_CID_CAMERA_CLASS_BASE+11) #define V4L2_CID_FOCUS_AUTO (V4L2_CID_CAMERA_CLASS_BASE+12) +#define V4L2_CID_ZOOM_ABSOLUTE (V4L2_CID_CAMERA_CLASS_BASE+13) +#define V4L2_CID_ZOOM_RELATIVE (V4L2_CID_CAMERA_CLASS_BASE+14) +#define V4L2_CID_ZOOM_CONTINUOUS (V4L2_CID_CAMERA_CLASS_BASE+15) + /* * T U N I N G */ -- cgit v1.2.3 From 046425f8c4ac431db00c09a6d9fba16560b8e5b9 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Sun, 14 Dec 2008 16:22:05 -0300 Subject: V4L/DVB (9898): v4l2: Add privacy control The privacy control prevents video from being acquired by the camera. A true value indicates that no image can be captured. Devices that implement the privacy control must support read access and may support write access. Signed-off-by: Laurent Pinchart Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index e450a92a622..6e6743bd0f9 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1122,6 +1122,8 @@ enum v4l2_exposure_auto_type { #define V4L2_CID_ZOOM_RELATIVE (V4L2_CID_CAMERA_CLASS_BASE+14) #define V4L2_CID_ZOOM_CONTINUOUS (V4L2_CID_CAMERA_CLASS_BASE+15) +#define V4L2_CID_PRIVACY (V4L2_CID_CAMERA_CLASS_BASE+16) + /* * T U N I N G */ -- cgit v1.2.3 From 92f45badbbaccdbc1be25085292a1e258948e221 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Sun, 21 Dec 2008 10:35:25 -0300 Subject: V4L/DVB (9932): v4l2-compat32: fix 32-64 compatibility module Added all missing v4l1/2 ioctls and fix several broken conversions. Partially based on work done by Cody Pisto . Tested-by: Brandon Jenkins Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 6e6743bd0f9..e2cfd6add78 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1465,6 +1465,8 @@ struct v4l2_chip_ident { #define VIDIOC_G_CHIP_IDENT _IOWR('V', 81, struct v4l2_chip_ident) #endif #define VIDIOC_S_HW_FREQ_SEEK _IOW('V', 82, struct v4l2_hw_freq_seek) +/* Reminder: when adding new ioctls please add support for them to + drivers/media/video/v4l2-compat-ioctl32.c as well! */ #ifdef __OLD_VIDIOC_ /* for compatibility, will go away some day */ -- cgit v1.2.3 From 4b00eb25340c1a9b9eedaf0bc5b0f0d18eddb028 Mon Sep 17 00:00:00 2001 From: Hans Verkuil Date: Fri, 19 Dec 2008 11:17:56 -0300 Subject: V4L/DVB (9944): videodev2.h: fix typo. The comment said CX2584X instead of CX2341X. Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index e2cfd6add78..754c8d9685a 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1051,7 +1051,7 @@ enum v4l2_mpeg_video_bitrate_mode { #define V4L2_CID_MPEG_VIDEO_MUTE (V4L2_CID_MPEG_BASE+210) #define V4L2_CID_MPEG_VIDEO_MUTE_YUV (V4L2_CID_MPEG_BASE+211) -/* MPEG-class control IDs specific to the CX2584x driver as defined by V4L2 */ +/* MPEG-class control IDs specific to the CX2341x driver as defined by V4L2 */ #define V4L2_CID_MPEG_CX2341X_BASE (V4L2_CTRL_CLASS_MPEG | 0x1000) #define V4L2_CID_MPEG_CX2341X_VIDEO_SPATIAL_FILTER_MODE (V4L2_CID_MPEG_CX2341X_BASE+0) enum v4l2_mpeg_cx2341x_video_spatial_filter_mode { -- cgit v1.2.3 From 531c98e71805b32e9ea35a218119100bbd2b7615 Mon Sep 17 00:00:00 2001 From: Mauro Carvalho Chehab Date: Mon, 22 Dec 2008 13:18:27 -0300 Subject: V4L/DVB (9953): em28xx: Add suport for debugging AC97 anciliary chips The em28xx driver can be coupled to an anciliary AC97 chip. This patch allows read/write AC97 registers directly. Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index 754c8d9685a..e31144d2223 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -1376,6 +1376,7 @@ struct v4l2_streamparm { #define V4L2_CHIP_MATCH_HOST 0 /* Match against chip ID on host (0 for the host) */ #define V4L2_CHIP_MATCH_I2C_DRIVER 1 /* Match against I2C driver ID */ #define V4L2_CHIP_MATCH_I2C_ADDR 2 /* Match against I2C 7-bit address */ +#define V4L2_CHIP_MATCH_AC97 3 /* Match against anciliary AC97 chip */ struct v4l2_register { __u32 match_type; /* Match type */ -- cgit v1.2.3 From 91962fa713bd8bf47434b02ac661fdc201365fa5 Mon Sep 17 00:00:00 2001 From: Magnus Damm Date: Thu, 18 Dec 2008 11:45:00 -0300 Subject: V4L/DVB (10078): video: add NV16 and NV61 pixel formats This patch adds support for NV16 and NV61 pixel formats. These pixel formats use two planes; one for 8-bit Y values and one for interleaved 8-bit U and V values. NV16/NV61 formats are very similar to NV12/NV21 with the exception that NV16/NV61 are using the same number of lines for both planes. The difference between NV16 and NV61 is the U and V byte order. The fourcc values are extrapolated from the NV12/NV21 case. Signed-off-by: Magnus Damm Signed-off-by: Guennadi Liakhovetski Signed-off-by: Mauro Carvalho Chehab --- include/linux/videodev2.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/videodev2.h b/include/linux/videodev2.h index e31144d2223..1f126e30766 100644 --- a/include/linux/videodev2.h +++ b/include/linux/videodev2.h @@ -305,6 +305,8 @@ struct v4l2_pix_format { /* two planes -- one Y, one Cr + Cb interleaved */ #define V4L2_PIX_FMT_NV12 v4l2_fourcc('N', 'V', '1', '2') /* 12 Y/CbCr 4:2:0 */ #define V4L2_PIX_FMT_NV21 v4l2_fourcc('N', 'V', '2', '1') /* 12 Y/CrCb 4:2:0 */ +#define V4L2_PIX_FMT_NV16 v4l2_fourcc('N', 'V', '1', '6') /* 16 Y/CbCr 4:2:2 */ +#define V4L2_PIX_FMT_NV61 v4l2_fourcc('N', 'V', '6', '1') /* 16 Y/CrCb 4:2:2 */ /* The following formats are not defined in the V4L2 specification */ #define V4L2_PIX_FMT_YUV410 v4l2_fourcc('Y', 'U', 'V', '9') /* 9 YUV 4:1:0 */ -- cgit v1.2.3