From cf417141cbb3a4ceb5cca15b2c1f099bd0a6603c Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Mon, 11 Aug 2008 14:33:53 -0700 Subject: sched, cpuset: rework sched domains and CPU hotplug handling (v4) This is an updated version of my previous cpuset patch on top of the latest mainline git. The patch fixes CPU hotplug handling issues in the current cpusets code, namely circular locking in rebuild_sched_domains() and unsafe access to the cpu_online_map in the cpuset cpu hotplug handler. This version includes changes suggested by Paul Jackson (naming, comments, style, etc.). I also got rid of the separate workqueue thread because it is now safe to call get_online_cpus() from workqueue callbacks. Here are some more details: rebuild_sched_domains() is the only way to rebuild sched domains correctly based on the current cpuset settings. What this means is that we need to be able to call it from different contexts, like cpu hotplug for example. Also, the latest scheduler code in -tip now calls rebuild_sched_domains() directly from functions like arch_reinit_sched_domains(). In order to support that properly we need to rework cpuset locking rules to avoid circular dependencies, which is what this patch does. New lock nesting rules are explained in the comments. We can now safely call rebuild_sched_domains() from virtually any context. The only requirement is that it needs to be called under get_online_cpus(). This allows cpu hotplug handlers and the scheduler to call rebuild_sched_domains() directly. The rest of the cpuset code now offloads sched domains rebuilds to a workqueue (async_rebuild_sched_domains()). This version of the patch addresses comments from the previous review. I fixed all mis-formatted comments and trailing spaces. I also factored out the code that builds domain masks and split up CPU and memory hotplug handling. This was needed to simplify locking, to avoid unsafe access to the cpu_online_map from the mem hotplug handler, and in general to make things cleaner. The patch passes moderate testing (building a kernel with -j 16, creating & removing domains and bringing cpus off/online at the same time) on a quad-core2 based machine. It passes lockdep checks, even with preemptable RCU enabled. This time I also tested it with the suspend/resume path and everything is working as expected. Signed-off-by: Max Krasnyansky Acked-by: Paul Jackson Cc: menage@google.com Cc: a.p.zijlstra@chello.nl Cc: vegard.nossum@gmail.com Signed-off-by: Ingo Molnar --- kernel/cpuset.c | 312 +++++++++++++++++++++++++++++++++----------------------- 1 file changed, 182 insertions(+), 130 deletions(-) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index d5ab79cf516..f227bc17269 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -14,6 +14,8 @@ * 2003-10-22 Updates by Stephen Hemminger. * 2004 May-July Rework by Paul Jackson. * 2006 Rework by Paul Menage to use generic cgroups + * 2008 Rework of the scheduler domains and CPU hotplug handling + * by Max Krasnyansky * * This file is subject to the terms and conditions of the GNU General Public * License. See the file COPYING in the main directory of the Linux @@ -236,9 +238,11 @@ static struct cpuset top_cpuset = { static DEFINE_MUTEX(callback_mutex); -/* This is ugly, but preserves the userspace API for existing cpuset +/* + * This is ugly, but preserves the userspace API for existing cpuset * users.
If someone tries to mount the "cpuset" filesystem, we - * silently switch it to mount "cgroup" instead */ + * silently switch it to mount "cgroup" instead + */ static int cpuset_get_sb(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data, struct vfsmount *mnt) @@ -473,10 +477,9 @@ static int validate_change(const struct cpuset *cur, const struct cpuset *trial) } /* - * Helper routine for rebuild_sched_domains(). + * Helper routine for generate_sched_domains(). * Do cpusets a, b have overlapping cpus_allowed masks? */ - static int cpusets_overlap(struct cpuset *a, struct cpuset *b) { return cpus_intersects(a->cpus_allowed, b->cpus_allowed); @@ -518,26 +521,15 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) } /* - * rebuild_sched_domains() - * - * This routine will be called to rebuild the scheduler's dynamic - * sched domains: - * - if the flag 'sched_load_balance' of any cpuset with non-empty - * 'cpus' changes, - * - or if the 'cpus' allowed changes in any cpuset which has that - * flag enabled, - * - or if the 'sched_relax_domain_level' of any cpuset which has - * that flag enabled and with non-empty 'cpus' changes, - * - or if any cpuset with non-empty 'cpus' is removed, - * - or if a cpu gets offlined. - * - * This routine builds a partial partition of the systems CPUs - * (the set of non-overlappping cpumask_t's in the array 'part' - * below), and passes that partial partition to the kernel/sched.c - * partition_sched_domains() routine, which will rebuild the - * schedulers load balancing domains (sched domains) as specified - * by that partial partition. A 'partial partition' is a set of - * non-overlapping subsets whose union is a subset of that set. + * generate_sched_domains() + * + * This function builds a partial partition of the systems CPUs + * A 'partial partition' is a set of non-overlapping subsets whose + * union is a subset of that set. + * The output of this function needs to be passed to kernel/sched.c + * partition_sched_domains() routine, which will rebuild the scheduler's + * load balancing domains (sched domains) as specified by that partial + * partition. * * See "What is sched_load_balance" in Documentation/cpusets.txt * for a background explanation of this. @@ -547,13 +539,7 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * domains when operating in the severe memory shortage situations * that could cause allocation failures below. * - * Call with cgroup_mutex held. May take callback_mutex during - * call due to the kfifo_alloc() and kmalloc() calls. May nest - * a call to the get_online_cpus()/put_online_cpus() pair. - * Must not be called holding callback_mutex, because we must not - * call get_online_cpus() while holding callback_mutex. Elsewhere - * the kernel nests callback_mutex inside get_online_cpus() calls. - * So the reverse nesting would risk an ABBA deadlock. + * Must be called with cgroup_lock held. * * The three key local variables below are: * q - a linked-list queue of cpuset pointers, used to implement a @@ -588,10 +574,10 @@ update_domain_attr_tree(struct sched_domain_attr *dattr, struct cpuset *c) * element of the partition (one sched domain) to be passed to * partition_sched_domains(). 
*/ - -void rebuild_sched_domains(void) +static int generate_sched_domains(cpumask_t **domains, + struct sched_domain_attr **attributes) { - LIST_HEAD(q); /* queue of cpusets to be scanned*/ + LIST_HEAD(q); /* queue of cpusets to be scanned */ struct cpuset *cp; /* scans q */ struct cpuset **csa; /* array of all cpuset ptrs */ int csn; /* how many cpuset ptrs in csa so far */ @@ -601,23 +587,26 @@ void rebuild_sched_domains(void) int ndoms; /* number of sched domains in result */ int nslot; /* next empty doms[] cpumask_t slot */ - csa = NULL; + ndoms = 0; doms = NULL; dattr = NULL; + csa = NULL; /* Special case for the 99% of systems with one, full, sched domain */ if (is_sched_load_balance(&top_cpuset)) { - ndoms = 1; doms = kmalloc(sizeof(cpumask_t), GFP_KERNEL); if (!doms) - goto rebuild; + goto done; + dattr = kmalloc(sizeof(struct sched_domain_attr), GFP_KERNEL); if (dattr) { *dattr = SD_ATTR_INIT; update_domain_attr_tree(dattr, &top_cpuset); } *doms = top_cpuset.cpus_allowed; - goto rebuild; + + ndoms = 1; + goto done; } csa = kmalloc(number_of_cpusets * sizeof(cp), GFP_KERNEL); @@ -680,61 +669,141 @@ restart: } } - /* Convert to */ + /* + * Now we know how many domains to create. + * Convert to and populate cpu masks. + */ doms = kmalloc(ndoms * sizeof(cpumask_t), GFP_KERNEL); - if (!doms) - goto rebuild; + if (!doms) { + ndoms = 0; + goto done; + } + + /* + * The rest of the code, including the scheduler, can deal with + * dattr==NULL case. No need to abort if alloc fails. + */ dattr = kmalloc(ndoms * sizeof(struct sched_domain_attr), GFP_KERNEL); for (nslot = 0, i = 0; i < csn; i++) { struct cpuset *a = csa[i]; + cpumask_t *dp; int apn = a->pn; - if (apn >= 0) { - cpumask_t *dp = doms + nslot; - - if (nslot == ndoms) { - static int warnings = 10; - if (warnings) { - printk(KERN_WARNING - "rebuild_sched_domains confused:" - " nslot %d, ndoms %d, csn %d, i %d," - " apn %d\n", - nslot, ndoms, csn, i, apn); - warnings--; - } - continue; + if (apn < 0) { + /* Skip completed partitions */ + continue; + } + + dp = doms + nslot; + + if (nslot == ndoms) { + static int warnings = 10; + if (warnings) { + printk(KERN_WARNING + "rebuild_sched_domains confused:" + " nslot %d, ndoms %d, csn %d, i %d," + " apn %d\n", + nslot, ndoms, csn, i, apn); + warnings--; } + continue; + } - cpus_clear(*dp); - if (dattr) - *(dattr + nslot) = SD_ATTR_INIT; - for (j = i; j < csn; j++) { - struct cpuset *b = csa[j]; - - if (apn == b->pn) { - cpus_or(*dp, *dp, b->cpus_allowed); - b->pn = -1; - if (dattr) - update_domain_attr_tree(dattr - + nslot, b); - } + cpus_clear(*dp); + if (dattr) + *(dattr + nslot) = SD_ATTR_INIT; + for (j = i; j < csn; j++) { + struct cpuset *b = csa[j]; + + if (apn == b->pn) { + cpus_or(*dp, *dp, b->cpus_allowed); + if (dattr) + update_domain_attr_tree(dattr + nslot, b); + + /* Done with this partition */ + b->pn = -1; } - nslot++; } + nslot++; } BUG_ON(nslot != ndoms); -rebuild: - /* Have scheduler rebuild sched domains */ +done: + kfree(csa); + + *domains = doms; + *attributes = dattr; + return ndoms; +} + +/* + * Rebuild scheduler domains. + * + * Call with neither cgroup_mutex held nor within get_online_cpus(). + * Takes both cgroup_mutex and get_online_cpus(). + * + * Cannot be directly called from cpuset code handling changes + * to the cpuset pseudo-filesystem, because it cannot be called + * from code that already holds cgroup_mutex. 
+ */ +static void do_rebuild_sched_domains(struct work_struct *unused) +{ + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + get_online_cpus(); - partition_sched_domains(ndoms, doms, dattr); + + /* Generate domain masks and attrs */ + cgroup_lock(); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + put_online_cpus(); +} -done: - kfree(csa); - /* Don't kfree(doms) -- partition_sched_domains() does that. */ - /* Don't kfree(dattr) -- partition_sched_domains() does that. */ +static DECLARE_WORK(rebuild_sched_domains_work, do_rebuild_sched_domains); + +/* + * Rebuild scheduler domains, asynchronously via workqueue. + * + * If the flag 'sched_load_balance' of any cpuset with non-empty + * 'cpus' changes, or if the 'cpus' allowed changes in any cpuset + * which has that flag enabled, or if any cpuset with a non-empty + * 'cpus' is removed, then call this routine to rebuild the + * scheduler's dynamic sched domains. + * + * The rebuild_sched_domains() and partition_sched_domains() + * routines must nest cgroup_lock() inside get_online_cpus(), + * but such cpuset changes as these must nest that locking the + * other way, holding cgroup_lock() for much of the code. + * + * So in order to avoid an ABBA deadlock, the cpuset code handling + * these user changes delegates the actual sched domain rebuilding + * to a separate workqueue thread, which ends up processing the + * above do_rebuild_sched_domains() function. + */ +static void async_rebuild_sched_domains(void) +{ + schedule_work(&rebuild_sched_domains_work); +} + +/* + * Accomplishes the same scheduler domain rebuild as the above + * async_rebuild_sched_domains(), however it directly calls the + * rebuild routine synchronously rather than calling it via an + * asynchronous work thread. + * + * This can only be called from code that is not holding + * cgroup_mutex (not nested in a cgroup_lock() call.) + */ +void rebuild_sched_domains(void) +{ + do_rebuild_sched_domains(NULL); } /** @@ -863,7 +932,7 @@ static int update_cpumask(struct cpuset *cs, const char *buf) return retval; if (is_load_balanced) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1090,7 +1159,7 @@ static int update_relax_domain_level(struct cpuset *cs, s64 val) if (val != cs->relax_domain_level) { cs->relax_domain_level = val; if (!cpus_empty(cs->cpus_allowed) && is_sched_load_balance(cs)) - rebuild_sched_domains(); + async_rebuild_sched_domains(); } return 0; @@ -1131,7 +1200,7 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, mutex_unlock(&callback_mutex); if (cpus_nonempty && balance_flag_changed) - rebuild_sched_domains(); + async_rebuild_sched_domains(); return 0; } @@ -1492,6 +1561,9 @@ static u64 cpuset_read_u64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unreachable but makes gcc happy */ + return 0; } static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) @@ -1504,6 +1576,9 @@ static s64 cpuset_read_s64(struct cgroup *cont, struct cftype *cft) default: BUG(); } + + /* Unrechable but makes gcc happy */ + return 0; } @@ -1692,15 +1767,9 @@ static struct cgroup_subsys_state *cpuset_create( } /* - * Locking note on the strange update_flag() call below: - * * If the cpuset being removed has its flag 'sched_load_balance' * enabled, then simulate turning sched_load_balance off, which - * will call rebuild_sched_domains(). 
The get_online_cpus() - * call in rebuild_sched_domains() must not be made while holding - * callback_mutex. Elsewhere the kernel nests callback_mutex inside - * get_online_cpus() calls. So the reverse nesting would risk an - * ABBA deadlock. + * will call async_rebuild_sched_domains(). */ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) @@ -1719,7 +1788,7 @@ static void cpuset_destroy(struct cgroup_subsys *ss, struct cgroup *cont) struct cgroup_subsys cpuset_subsys = { .name = "cpuset", .create = cpuset_create, - .destroy = cpuset_destroy, + .destroy = cpuset_destroy, .can_attach = cpuset_can_attach, .attach = cpuset_attach, .populate = cpuset_populate, @@ -1811,7 +1880,7 @@ static void move_member_tasks_to_cpuset(struct cpuset *from, struct cpuset *to) } /* - * If common_cpu_mem_hotplug_unplug(), below, unplugs any CPUs + * If CPU and/or memory hotplug handlers, below, unplug any CPUs * or memory nodes, we need to walk over the cpuset hierarchy, * removing that CPU or node from all cpusets. If this removes the * last CPU or node from a cpuset, then move the tasks in the empty @@ -1902,35 +1971,6 @@ static void scan_for_empty_cpusets(const struct cpuset *root) } } -/* - * The cpus_allowed and mems_allowed nodemasks in the top_cpuset track - * cpu_online_map and node_states[N_HIGH_MEMORY]. Force the top cpuset to - * track what's online after any CPU or memory node hotplug or unplug event. - * - * Since there are two callers of this routine, one for CPU hotplug - * events and one for memory node hotplug events, we could have coded - * two separate routines here. We code it as a single common routine - * in order to minimize text size. - */ - -static void common_cpu_mem_hotplug_unplug(int rebuild_sd) -{ - cgroup_lock(); - - top_cpuset.cpus_allowed = cpu_online_map; - top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - scan_for_empty_cpusets(&top_cpuset); - - /* - * Scheduler destroys domains on hotplug events. - * Rebuild them based on the current settings. - */ - if (rebuild_sd) - rebuild_sched_domains(); - - cgroup_unlock(); -} - /* * The top_cpuset tracks what CPUs and Memory Nodes are online, * period. This is necessary in order to make cpusets transparent @@ -1939,40 +1979,52 @@ static void common_cpu_mem_hotplug_unplug(int rebuild_sd) * * This routine ensures that top_cpuset.cpus_allowed tracks * cpu_online_map on each CPU hotplug (cpuhp) event. + * + * Called within get_online_cpus(). Needs to call cgroup_lock() + * before calling generate_sched_domains(). */ - -static int cpuset_handle_cpuhp(struct notifier_block *unused_nb, +static int cpuset_track_online_cpus(struct notifier_block *unused_nb, unsigned long phase, void *unused_cpu) { + struct sched_domain_attr *attr; + cpumask_t *doms; + int ndoms; + switch (phase) { - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DOWN_FAILED: - case CPU_DOWN_FAILED_FROZEN: case CPU_ONLINE: case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - common_cpu_mem_hotplug_unplug(1); break; + default: return NOTIFY_DONE; } + cgroup_lock(); + top_cpuset.cpus_allowed = cpu_online_map; + scan_for_empty_cpusets(&top_cpuset); + ndoms = generate_sched_domains(&doms, &attr); + cgroup_unlock(); + + /* Have scheduler rebuild the domains */ + partition_sched_domains(ndoms, doms, attr); + return NOTIFY_OK; } #ifdef CONFIG_MEMORY_HOTPLUG /* * Keep top_cpuset.mems_allowed tracking node_states[N_HIGH_MEMORY]. - * Call this routine anytime after you change - * node_states[N_HIGH_MEMORY]. 
- See also the previous routine cpuset_handle_cpuhp(). + * Call this routine anytime after node_states[N_HIGH_MEMORY] changes. + * See also the previous routine cpuset_track_online_cpus(). */ - void cpuset_track_online_nodes(void) { - common_cpu_mem_hotplug_unplug(0); + cgroup_lock(); + top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; + scan_for_empty_cpusets(&top_cpuset); + cgroup_unlock(); } #endif @@ -1987,7 +2039,7 @@ void __init cpuset_init_smp(void) top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_states[N_HIGH_MEMORY]; - hotcpu_notifier(cpuset_handle_cpuhp, 0); + hotcpu_notifier(cpuset_track_online_cpus, 0); } /** -- cgit v1.2.3 From 04da11bfcf511544ae19e0a7e5f994b3237752ac Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 20 Aug 2008 17:16:34 +0300 Subject: UBIFS: fix zero-length truncations Always allow truncations to zero, even if budgeting thinks there is no space. UBIFS reserves some space for deletions anyway. Otherwise, the following happens: 1. create a file, and write as much as possible there, until ENOSPC 2. truncate the file, which fails with ENOSPC, which is not good. Signed-off-by: Artem Bityutskiy --- fs/ubifs/dir.c | 1 - fs/ubifs/file.c | 20 ++++++++++++++++---- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index 5c96f1fb701..2b267c9a180 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -587,7 +587,6 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) if (err) { if (err != -ENOSPC) return err; - err = 0; budgeted = 0; } diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 4071d1cae29..3d698e2022b 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -793,7 +793,7 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, int err; struct ubifs_budget_req req; loff_t old_size = inode->i_size, new_size = attr->ia_size; - int offset = new_size & (UBIFS_BLOCK_SIZE - 1); + int offset = new_size & (UBIFS_BLOCK_SIZE - 1), budgeted = 1; struct ubifs_inode *ui = ubifs_inode(inode); dbg_gen("ino %lu, size %lld -> %lld", inode->i_ino, old_size, new_size); @@ -811,8 +811,15 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, /* A funny way to budget for truncation node */ req.dirtied_ino_d = UBIFS_TRUN_NODE_SZ; err = ubifs_budget_space(c, &req); - if (err) - return err; + if (err) { + /* + * Treat truncations to zero as deletion and always allow them, + * just like we do for '->unlink()'.
+ */ + if (new_size || err != -ENOSPC) + return err; + budgeted = 0; + } err = vmtruncate(inode, new_size); if (err) @@ -869,7 +876,12 @@ static int do_truncation(struct ubifs_info *c, struct inode *inode, err = ubifs_jnl_truncate(c, inode, old_size, new_size); mutex_unlock(&ui->ui_mutex); out_budg: - ubifs_release_budget(c, &req); + if (budgeted) + ubifs_release_budget(c, &req); + else { + c->nospace = c->nospace_rp = 0; + smp_wmb(); + } return err; } -- cgit v1.2.3 From 2338263e2cb3cd2f77cdc3fcab0312b6c7fc02c3 Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Fri, 22 Aug 2008 16:18:22 -0400 Subject: Input: bcm5974 - add maintainer entry Signed-off-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- MAINTAINERS | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/MAINTAINERS b/MAINTAINERS index deedc0d827b..cdedf043e02 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -413,6 +413,12 @@ L: linux-laptop@vger.kernel.org W: http://www.canb.auug.org.au/~sfr/ S: Supported +APPLE BCM5974 MULTITOUCH DRIVER +P: Henrik Rydberg +M: rydberg@euromail.se +L: linux-input@vger.kernel.org +S: Maintained + APPLE SMC DRIVER P: Nicolas Boichat M: nicolas@boichat.ch -- cgit v1.2.3 From 761e29f3bb19b05bea55285dfdf2d28e001a63b8 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Wed, 20 Aug 2008 16:32:40 +0300 Subject: UBIFS: always read hashed-key nodes under TNC mutex Leaf-nodes that have a hashed key are stored in the leaf-node-cache (LNC) which is protected by the TNC mutex. Consequently, when reading a leaf node with a hashed key (i.e. directory entries, xattr entries) the TNC mutex is always required. Signed-off-by: Adrian Hunter --- fs/ubifs/tnc.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index e909f4a9644..4fbc5921688 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -1498,7 +1498,6 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, { int found, n, err; struct ubifs_znode *znode; - struct ubifs_zbranch zbr; dbg_tnc("name '%.*s' key %s", nm->len, nm->name, DBGKEY(key)); mutex_lock(&c->tnc_mutex); @@ -1522,11 +1521,7 @@ static int do_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, goto out_unlock; } - zbr = znode->zbranch[n]; - mutex_unlock(&c->tnc_mutex); - - err = tnc_read_node_nm(c, &zbr, node); - return err; + err = tnc_read_node_nm(c, &znode->zbranch[n], node); out_unlock: mutex_unlock(&c->tnc_mutex); -- cgit v1.2.3 From 601c0bc46753007be011b513ba4fc50ed8e30aef Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Fri, 22 Aug 2008 14:23:35 +0300 Subject: UBIFS: allow for racing between GC and TNC The TNC mutex is unlocked prematurely when reading leaf nodes with non-hashed keys. This is unsafe because the node may be moved by garbage collection and the eraseblock unmapped, although that has never actually happened during stress testing. This patch fixes the flaw by detecting the race and retrying with the TNC mutex locked. 
Signed-off-by: Adrian Hunter --- fs/ubifs/gc.c | 6 +++ fs/ubifs/misc.h | 17 +++++++++ fs/ubifs/tnc.c | 109 ++++++++++++++++++++++++++++++------------------------- fs/ubifs/ubifs.h | 6 ++- 4 files changed, 87 insertions(+), 51 deletions(-) diff --git a/fs/ubifs/gc.c b/fs/ubifs/gc.c index d0f3dac2908..13f1019c859 100644 --- a/fs/ubifs/gc.c +++ b/fs/ubifs/gc.c @@ -344,6 +344,12 @@ int ubifs_garbage_collect_leb(struct ubifs_info *c, struct ubifs_lprops *lp) if (err) goto out; + /* Allow for races with TNC */ + c->gced_lnum = lnum; + smp_wmb(); + c->gc_seq += 1; + smp_wmb(); + if (c->gc_lnum == -1) { c->gc_lnum = lnum; err = LEB_RETAINED; diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 87dabf9fe74..87ced4c74a6 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -325,4 +325,21 @@ static inline struct timespec ubifs_current_time(struct inode *inode) current_fs_time(inode->i_sb) : CURRENT_TIME_SEC; } +/** + * ubifs_tnc_lookup - look up a file-system node. + * @c: UBIFS file-system description object + * @key: node key to lookup + * @node: the node is returned here + * + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. + */ +static inline int ubifs_tnc_lookup(struct ubifs_info *c, + const union ubifs_key *key, void *node) +{ + return ubifs_tnc_locate(c, key, node, NULL, NULL); +} + #endif /* __UBIFS_MISC_H__ */ diff --git a/fs/ubifs/tnc.c b/fs/ubifs/tnc.c index 4fbc5921688..7da209ab937 100644 --- a/fs/ubifs/tnc.c +++ b/fs/ubifs/tnc.c @@ -506,7 +506,7 @@ static int fallible_read_node(struct ubifs_info *c, const union ubifs_key *key, if (keys_cmp(c, key, &node_key) != 0) ret = 0; } - if (ret == 0) + if (ret == 0 && c->replaying) dbg_mnt("dangling branch LEB %d:%d len %d, key %s", zbr->lnum, zbr->offs, zbr->len, DBGKEY(key)); return ret; @@ -1382,50 +1382,39 @@ static int lookup_level0_dirty(struct ubifs_info *c, const union ubifs_key *key, } /** - * ubifs_tnc_lookup - look up a file-system node. + * maybe_leb_gced - determine if a LEB may have been garbage collected. * @c: UBIFS file-system description object - * @key: node key to lookup - * @node: the node is returned here + * @lnum: LEB number + * @gc_seq1: garbage collection sequence number * - * This function look up and reads node with key @key. The caller has to make - * sure the @node buffer is large enough to fit the node. Returns zero in case - * of success, %-ENOENT if the node was not found, and a negative error code in - * case of failure. + * This function determines if @lnum may have been garbage collected since + * sequence number @gc_seq1. If it may have been then %1 is returned, otherwise + * %0 is returned. 
*/ -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node) +static int maybe_leb_gced(struct ubifs_info *c, int lnum, int gc_seq1) { - int found, n, err; - struct ubifs_znode *znode; - struct ubifs_zbranch zbr, *zt; - - mutex_lock(&c->tnc_mutex); - found = ubifs_lookup_level0(c, key, &znode, &n); - if (!found) { - err = -ENOENT; - goto out; - } else if (found < 0) { - err = found; - goto out; - } - zt = &znode->zbranch[n]; - if (is_hash_key(c, key)) { - /* - * In this case the leaf node cache gets used, so we pass the - * address of the zbranch and keep the mutex locked - */ - err = tnc_read_node_nm(c, zt, node); - goto out; - } - zbr = znode->zbranch[n]; - mutex_unlock(&c->tnc_mutex); - - err = ubifs_tnc_read_node(c, &zbr, node); - return err; + int gc_seq2, gced_lnum; -out: - mutex_unlock(&c->tnc_mutex); - return err; + gced_lnum = c->gced_lnum; + smp_rmb(); + gc_seq2 = c->gc_seq; + /* Same seq means no GC */ + if (gc_seq1 == gc_seq2) + return 0; + /* Different by more than 1 means we don't know */ + if (gc_seq1 + 1 != gc_seq2) + return 1; + /* + * We have seen the sequence number has increased by 1. Now we need to + * be sure we read the right LEB number, so read it again. + */ + smp_rmb(); + if (gced_lnum != c->gced_lnum) + return 1; + /* Finally we can check lnum */ + if (gced_lnum == lnum) + return 1; + return 0; } /** @@ -1436,16 +1425,19 @@ out: * @lnum: LEB number is returned here * @offs: offset is returned here * - * This function is the same as 'ubifs_tnc_lookup()' but it returns the node - * location also. See 'ubifs_tnc_lookup()'. + * This function look up and reads node with key @key. The caller has to make + * sure the @node buffer is large enough to fit the node. Returns zero in case + * of success, %-ENOENT if the node was not found, and a negative error code in + * case of failure. The node location can be returned in @lnum and @offs. */ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, void *node, int *lnum, int *offs) { - int found, n, err; + int found, n, err, safely = 0, gc_seq1; struct ubifs_znode *znode; struct ubifs_zbranch zbr, *zt; +again: mutex_lock(&c->tnc_mutex); found = ubifs_lookup_level0(c, key, &znode, &n); if (!found) { @@ -1456,24 +1448,43 @@ int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, goto out; } zt = &znode->zbranch[n]; + if (lnum) { + *lnum = zt->lnum; + *offs = zt->offs; + } if (is_hash_key(c, key)) { /* * In this case the leaf node cache gets used, so we pass the * address of the zbranch and keep the mutex locked */ - *lnum = zt->lnum; - *offs = zt->offs; err = tnc_read_node_nm(c, zt, node); goto out; } + if (safely) { + err = ubifs_tnc_read_node(c, zt, node); + goto out; + } + /* Drop the TNC mutex prematurely and race with garbage collection */ zbr = znode->zbranch[n]; + gc_seq1 = c->gc_seq; mutex_unlock(&c->tnc_mutex); - *lnum = zbr.lnum; - *offs = zbr.offs; + if (ubifs_get_wbuf(c, zbr.lnum)) { + /* We do not GC journal heads */ + err = ubifs_tnc_read_node(c, &zbr, node); + return err; + } - err = ubifs_tnc_read_node(c, &zbr, node); - return err; + err = fallible_read_node(c, key, &zbr, node); + if (maybe_leb_gced(c, zbr.lnum, gc_seq1)) { + /* + * The node may have been GC'ed out from under us so try again + * while keeping the TNC mutex locked. 
+ */ + safely = 1; + goto again; + } + return 0; out: mutex_unlock(&c->tnc_mutex); diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index d7f706f7a30..7828d69ca4f 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1028,6 +1028,8 @@ struct ubifs_mount_opts { * @sbuf: a buffer of LEB size used by GC and replay for scanning * @idx_gc: list of index LEBs that have been garbage collected * @idx_gc_cnt: number of elements on the idx_gc list + * @gc_seq: incremented for every non-index LEB garbage collected + * @gced_lnum: last non-index LEB that was garbage collected * * @infos_list: links all 'ubifs_info' objects * @umount_mutex: serializes shrinker and un-mount @@ -1257,6 +1259,8 @@ struct ubifs_info { void *sbuf; struct list_head idx_gc; int idx_gc_cnt; + volatile int gc_seq; + volatile int gced_lnum; struct list_head infos_list; struct mutex umount_mutex; @@ -1451,8 +1455,6 @@ int ubifs_save_dirty_idx_lnums(struct ubifs_info *c); /* tnc.c */ int ubifs_lookup_level0(struct ubifs_info *c, const union ubifs_key *key, struct ubifs_znode **zn, int *n); -int ubifs_tnc_lookup(struct ubifs_info *c, const union ubifs_key *key, - void *node); int ubifs_tnc_lookup_nm(struct ubifs_info *c, const union ubifs_key *key, void *node, const struct qstr *nm); int ubifs_tnc_locate(struct ubifs_info *c, const union ubifs_key *key, -- cgit v1.2.3 From 8191e1fa8131a422f4bf7b0f2dc1f8543fd17783 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 22 Aug 2008 12:26:40 +0300 Subject: UBIFS: do not update min_idx_lebs in statfs This is bad because the rest of the code should not depend on it, and this may hide bugs instead of revealing them. Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 15409815747..ac0d2e1e73b 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -741,7 +741,6 @@ long long ubifs_budg_get_free_space(struct ubifs_info *c) available = ubifs_calc_available(c, min_idx_lebs); outstanding = c->budg_data_growth + c->budg_dd_growth; - c->min_idx_lebs = min_idx_lebs; spin_unlock(&c->space_lock); if (available > outstanding) -- cgit v1.2.3 From 9e5de3549615818cae9c20a0ee1fd3ad4a747758 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 Aug 2008 17:29:43 +0300 Subject: UBIFS: push empty flash hack down We have a hack which forces the amount of flash space to be equivalent to 'c->block_cnt' in case of an empty FS. This is to make users happy and see '0%' used in 'df' when they mount an empty FS. This hack is not needed in 'ubifs_calc_available()'; it is only needed in the caller, 'ubifs_budg_get_free_space()'. So push it down there. Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 24 +++++++++++------------- fs/ubifs/super.c | 2 -- 2 files changed, 11 insertions(+), 15 deletions(-) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index ac0d2e1e73b..f6d2eaa7a06 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -302,18 +302,6 @@ long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs) int subtract_lebs; long long available; - /* - * Force the amount available to the total size reported if the used - * space is zero.
- */ - if (c->lst.total_used <= UBIFS_INO_NODE_SZ && - c->budg_data_growth + c->budg_dd_growth == 0) { - /* Do the same calculation as for c->block_cnt */ - available = c->main_lebs - 2; - available *= c->leb_size - c->dark_wm; - return available; - } - available = c->main_bytes - c->lst.total_used; /* @@ -739,8 +727,18 @@ long long ubifs_budg_get_free_space(struct ubifs_info *c) return 0; } - available = ubifs_calc_available(c, min_idx_lebs); outstanding = c->budg_data_growth + c->budg_dd_growth; + + /* + * Force the amount available to the total size reported if the used + * space is zero. + */ + if (c->lst.total_used <= UBIFS_INO_NODE_SZ && !outstanding) { + spin_unlock(&c->space_lock); + return (long long)c->block_cnt << UBIFS_BLOCK_SHIFT; + } + + available = ubifs_calc_available(c, min_idx_lebs); spin_unlock(&c->space_lock); if (available > outstanding) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f71e6b8822c..1018053519e 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -649,8 +649,6 @@ static int init_constants_late(struct ubifs_info *c) * * Subtract the LEB reserved for GC and the LEB which is reserved for * deletions. - * - * Review 'ubifs_calc_available()' if changing this calculation. */ tmp64 = c->main_lebs - 2; tmp64 *= (uint64_t)c->leb_size - c->dark_wm; -- cgit v1.2.3 From 8aabb75017291ba68c09ff5fdb998ef0a1fdaaf9 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 Aug 2008 16:02:31 +0300 Subject: UBIFS: remove incorrect index space check When we report free space to user-space, we should not report 0 if the amount of empty LEBs is too low, because they would be produced by GC when needed. Thus, just call 'ubifs_calc_available()' straight away which would take 'min_idx_lebs' into account anyway. Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 15 +-------------- 1 file changed, 1 insertion(+), 14 deletions(-) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index f6d2eaa7a06..9ef630a594c 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -709,24 +709,11 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, */ long long ubifs_budg_get_free_space(struct ubifs_info *c) { - int min_idx_lebs, rsvd_idx_lebs; + int min_idx_lebs; long long available, outstanding, free; - /* Do exactly the same calculations as in 'do_budget_space()' */ spin_lock(&c->space_lock); min_idx_lebs = ubifs_calc_min_idx_lebs(c); - - if (min_idx_lebs > c->lst.idx_lebs) - rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; - else - rsvd_idx_lebs = 0; - - if (rsvd_idx_lebs > c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - - c->lst.taken_empty_lebs) { - spin_unlock(&c->space_lock); - return 0; - } - outstanding = c->budg_data_growth + c->budg_dd_growth; /* -- cgit v1.2.3 From 4b5f2762ec914c9dfd0e9d2377c0574f2ee9a8f9 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 Aug 2008 16:15:56 +0300 Subject: UBIFS: improve statfs reporting Make free space calculation less pessimistic and more realistic, which in turn improves 'statfs()' reports. Now it lies by 10%-20%, instead of 20%-30% (10% more honest). Results of "freespace" test (120MiB volume, 16KiB LEB size, 512 bytes page size). 
Before the change: freespace: Test 1: fill the space we have 3 times freespace: was free: 78274560 bytes 74.6 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 18214912 bytes 17.4 MiB, wrote 23.3% more than predicted freespace: was free: 76754944 bytes 73.2 MiB, wrote: 96493568 bytes 92.0 MiB, delta: 19738624 bytes 18.8 MiB, wrote 25.7% more than predicted freespace: was free: 76759040 bytes 73.2 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 19730432 bytes 18.8 MiB, wrote 25.7% more than predicted freespace: Test 1 finished freespace: Test 2: gradually lessen amount of free space and fill the FS freespace: do 10 steps, lessen free space by 6977722 bytes 6.7 MiB each time freespace: was free: 72273920 bytes 68.9 MiB, wrote: 88891392 bytes 84.8 MiB, delta: 16617472 bytes 15.8 MiB, wrote 23.0% more than predicted freespace: was free: 66154496 bytes 63.1 MiB, wrote: 81506304 bytes 77.7 MiB, delta: 15351808 bytes 14.6 MiB, wrote 23.2% more than predicted freespace: was free: 58732544 bytes 56.0 MiB, wrote: 72572928 bytes 69.2 MiB, delta: 13840384 bytes 13.2 MiB, wrote 23.6% more than predicted freespace: was free: 51552256 bytes 49.2 MiB, wrote: 63754240 bytes 60.8 MiB, delta: 12201984 bytes 11.6 MiB, wrote 23.7% more than predicted freespace: was free: 44404736 bytes 42.3 MiB, wrote: 54943744 bytes 52.4 MiB, delta: 10539008 bytes 10.1 MiB, wrote 23.7% more than predicted freespace: was free: 37285888 bytes 35.6 MiB, wrote: 46161920 bytes 44.0 MiB, delta: 8876032 bytes 8.5 MiB, wrote 23.8% more than predicted freespace: was free: 30171136 bytes 28.8 MiB, wrote: 37384192 bytes 35.7 MiB, delta: 7213056 bytes 6.9 MiB, wrote 23.9% more than predicted freespace: was free: 23048192 bytes 22.0 MiB, wrote: 28606464 bytes 27.3 MiB, delta: 5558272 bytes 5.3 MiB, wrote 24.1% more than predicted freespace: was free: 15941632 bytes 15.2 MiB, wrote: 19828736 bytes 18.9 MiB, delta: 3887104 bytes 3.7 MiB, wrote 24.4% more than predicted freespace: was free: 8830976 bytes 8.4 MiB, wrote: 11063296 bytes 10.6 MiB, delta: 2232320 bytes 2.1 MiB, wrote 25.3% more than predicted freespace: Test 2 finished freespace: Test 3: gradually lessen amount of free space by trashing and fill the FS freespace: do 10 steps, lessen free space by 6985541 bytes 6.7 MiB each time freespace: trashing: was free: 76840960 bytes 73.3 MiB, need free: 6985550 bytes 6.7 MiB, files created: 248311, delete 225737 (90.9% of them) freespace: was free: 65228800 bytes 62.2 MiB, wrote: 82530304 bytes 78.7 MiB, delta: 17301504 bytes 16.5 MiB, wrote 26.5% more than predicted freespace: trashing: was free: 74485760 bytes 71.0 MiB, need free: 13971091 bytes 13.3 MiB, files created: 248712, delete 202061 (81.2% of them) freespace: was free: 55025664 bytes 52.5 MiB, wrote: 71925760 bytes 68.6 MiB, delta: 16900096 bytes 16.1 MiB, wrote 30.7% more than predicted freespace: trashing: was free: 75550720 bytes 72.1 MiB, need free: 20956632 bytes 20.0 MiB, files created: 248849, delete 179822 (72.3% of them) freespace: was free: 46669824 bytes 44.5 MiB, wrote: 63197184 bytes 60.3 MiB, delta: 16527360 bytes 15.8 MiB, wrote 35.4% more than predicted freespace: trashing: was free: 76214272 bytes 72.7 MiB, need free: 27942173 bytes 26.6 MiB, files created: 248789, delete 157576 (63.3% of them) freespace: was free: 39129088 bytes 37.3 MiB, wrote: 55164928 bytes 52.6 MiB, delta: 16035840 bytes 15.3 MiB, wrote 41.0% more than predicted freespace: trashing: was free: 77398016 bytes 73.8 MiB, need free: 34927714 bytes 33.3 MiB, files created: 248711, delete 136474 (54.9% of 
them) freespace: was free: 32325632 bytes 30.8 MiB, wrote: 48234496 bytes 46.0 MiB, delta: 15908864 bytes 15.2 MiB, wrote 49.2% more than predicted freespace: trashing: was free: 75796480 bytes 72.3 MiB, need free: 41913255 bytes 40.0 MiB, files created: 248674, delete 111164 (44.7% of them) freespace: was free: 25079808 bytes 23.9 MiB, wrote: 40775680 bytes 38.9 MiB, delta: 15695872 bytes 15.0 MiB, wrote 62.6% more than predicted freespace: trashing: was free: 78209024 bytes 74.6 MiB, need free: 48898796 bytes 46.6 MiB, files created: 248708, delete 93207 (37.5% of them) freespace: was free: 20582400 bytes 19.6 MiB, wrote: 34844672 bytes 33.2 MiB, delta: 14262272 bytes 13.6 MiB, wrote 69.3% more than predicted freespace: trashing: was free: 77328384 bytes 73.7 MiB, need free: 55884337 bytes 53.3 MiB, files created: 248644, delete 68951 (27.7% of them) freespace: was free: 14368768 bytes 13.7 MiB, wrote: 28278784 bytes 27.0 MiB, delta: 13910016 bytes 13.3 MiB, wrote 96.8% more than predicted freespace: trashing: was free: 77434880 bytes 73.8 MiB, need free: 62869878 bytes 60.0 MiB, files created: 248640, delete 46767 (18.8% of them) freespace: was free: 8286208 bytes 7.9 MiB, wrote: 21811200 bytes 20.8 MiB, delta: 13524992 bytes 12.9 MiB, wrote 163.2% more than predicted freespace: trashing: was free: 77856768 bytes 74.2 MiB, need free: 69855419 bytes 66.6 MiB, files created: 248576, delete 25546 (10.3% of them) freespace: was free: 5570560 bytes 5.3 MiB, wrote: 8187904 bytes 7.8 MiB, delta: 2617344 bytes 2.5 MiB, wrote 47.0% more than predicted freespace: Test 3 finished freespace: finished successfully After the change: freespace: Test 1: fill the space we have 3 times freespace: was free: 85204992 bytes 81.3 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 11284480 bytes 10.8 MiB, wrote 13.2% more than predicted freespace: was free: 83554304 bytes 79.7 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 12935168 bytes 12.3 MiB, wrote 15.5% more than predicted freespace: was free: 83554304 bytes 79.7 MiB, wrote: 96493568 bytes 92.0 MiB, delta: 12939264 bytes 12.3 MiB, wrote 15.5% more than predicted freespace: Test 1 finished freespace: Test 2: gradually lessen amount of free space and fill the FS freespace: do 10 steps, lessen free space by 7596218 bytes 7.2 MiB each time freespace: was free: 78675968 bytes 75.0 MiB, wrote: 88903680 bytes 84.8 MiB, delta: 10227712 bytes 9.8 MiB, wrote 13.0% more than predicted freespace: was free: 72015872 bytes 68.7 MiB, wrote: 81514496 bytes 77.7 MiB, delta: 9498624 bytes 9.1 MiB, wrote 13.2% more than predicted freespace: was free: 63938560 bytes 61.0 MiB, wrote: 72589312 bytes 69.2 MiB, delta: 8650752 bytes 8.2 MiB, wrote 13.5% more than predicted freespace: was free: 56127488 bytes 53.5 MiB, wrote: 63762432 bytes 60.8 MiB, delta: 7634944 bytes 7.3 MiB, wrote 13.6% more than predicted freespace: was free: 48336896 bytes 46.1 MiB, wrote: 54935552 bytes 52.4 MiB, delta: 6598656 bytes 6.3 MiB, wrote 13.7% more than predicted freespace: was free: 40587264 bytes 38.7 MiB, wrote: 46157824 bytes 44.0 MiB, delta: 5570560 bytes 5.3 MiB, wrote 13.7% more than predicted freespace: was free: 32841728 bytes 31.3 MiB, wrote: 37384192 bytes 35.7 MiB, delta: 4542464 bytes 4.3 MiB, wrote 13.8% more than predicted freespace: was free: 25100288 bytes 23.9 MiB, wrote: 28618752 bytes 27.3 MiB, delta: 3518464 bytes 3.4 MiB, wrote 14.0% more than predicted freespace: was free: 17342464 bytes 16.5 MiB, wrote: 19841024 bytes 18.9 MiB, delta: 2498560 bytes 2.4 MiB, wrote 14.4% more than 
predicted freespace: was free: 9605120 bytes 9.2 MiB, wrote: 11063296 bytes 10.6 MiB, delta: 1458176 bytes 1.4 MiB, wrote 15.2% more than predicted freespace: Test 2 finished freespace: Test 3: gradually lessen amount of free space by trashing and fill the FS freespace: do 10 steps, lessen free space by 7606272 bytes 7.3 MiB each time freespace: trashing: was free: 83668992 bytes 79.8 MiB, need free: 7606272 bytes 7.3 MiB, files created: 248297, delete 225724 (90.9% of them) freespace: was free: 70803456 bytes 67.5 MiB, wrote: 82485248 bytes 78.7 MiB, delta: 11681792 bytes 11.1 MiB, wrote 16.5% more than predicted freespace: trashing: was free: 81080320 bytes 77.3 MiB, need free: 15212544 bytes 14.5 MiB, files created: 248711, delete 202047 (81.2% of them) freespace: was free: 59867136 bytes 57.1 MiB, wrote: 71897088 bytes 68.6 MiB, delta: 12029952 bytes 11.5 MiB, wrote 20.1% more than predicted freespace: trashing: was free: 82243584 bytes 78.4 MiB, need free: 22818816 bytes 21.8 MiB, files created: 248866, delete 179817 (72.3% of them) freespace: was free: 50905088 bytes 48.5 MiB, wrote: 63168512 bytes 60.2 MiB, delta: 12263424 bytes 11.7 MiB, wrote 24.1% more than predicted freespace: trashing: was free: 83402752 bytes 79.5 MiB, need free: 30425088 bytes 29.0 MiB, files created: 248920, delete 158114 (63.5% of them) freespace: was free: 42651648 bytes 40.7 MiB, wrote: 55406592 bytes 52.8 MiB, delta: 12754944 bytes 12.2 MiB, wrote 29.9% more than predicted freespace: trashing: was free: 84402176 bytes 80.5 MiB, need free: 38031360 bytes 36.3 MiB, files created: 248709, delete 136641 (54.9% of them) freespace: was free: 35233792 bytes 33.6 MiB, wrote: 48250880 bytes 46.0 MiB, delta: 13017088 bytes 12.4 MiB, wrote 36.9% more than predicted freespace: trashing: was free: 82530304 bytes 78.7 MiB, need free: 45637632 bytes 43.5 MiB, files created: 248778, delete 111208 (44.7% of them) freespace: was free: 27287552 bytes 26.0 MiB, wrote: 40267776 bytes 38.4 MiB, delta: 12980224 bytes 12.4 MiB, wrote 47.6% more than predicted freespace: trashing: was free: 85114880 bytes 81.2 MiB, need free: 53243904 bytes 50.8 MiB, files created: 248508, delete 93052 (37.4% of them) freespace: was free: 22437888 bytes 21.4 MiB, wrote: 35328000 bytes 33.7 MiB, delta: 12890112 bytes 12.3 MiB, wrote 57.4% more than predicted freespace: trashing: was free: 84103168 bytes 80.2 MiB, need free: 60850176 bytes 58.0 MiB, files created: 248637, delete 68743 (27.6% of them) freespace: was free: 15536128 bytes 14.8 MiB, wrote: 28319744 bytes 27.0 MiB, delta: 12783616 bytes 12.2 MiB, wrote 82.3% more than predicted freespace: trashing: was free: 84357120 bytes 80.4 MiB, need free: 68456448 bytes 65.3 MiB, files created: 248567, delete 46852 (18.8% of them) freespace: was free: 9015296 bytes 8.6 MiB, wrote: 22044672 bytes 21.0 MiB, delta: 13029376 bytes 12.4 MiB, wrote 144.5% more than predicted freespace: trashing: was free: 84942848 bytes 81.0 MiB, need free: 76062720 bytes 72.5 MiB, files created: 248636, delete 25993 (10.5% of them) freespace: was free: 6086656 bytes 5.8 MiB, wrote: 8331264 bytes 7.9 MiB, delta: 2244608 bytes 2.1 MiB, wrote 36.9% more than predicted freespace: Test 3 finished freespace: finished successfully Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ fs/ubifs/misc.h | 32 -------------------------------- fs/ubifs/ubifs.h | 1 + 3 files changed, 46 insertions(+), 32 deletions(-) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 
9ef630a594c..7851480a6ce 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -701,6 +701,51 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, ubifs_release_budget(c, &req); } +/** + * ubifs_reported_space - calculate reported free space. + * @c: the UBIFS file-system description object + * @free: amount of free space + * + * This function calculates amount of free space which will be reported to + * user-space. User-space application tend to expect that if the file-system + * (e.g., via the 'statfs()' call) reports that it has N bytes available, they + * are able to write a file of size N. UBIFS attaches node headers to each data + * node and it has to write indexind nodes as well. This introduces additional + * overhead, and UBIFS it has to report sligtly less free space to meet the + * above expectetion. + * + * This function assumes free space is made up of uncompressed data nodes and + * full index nodes (one per data node, tripled because we always allow enough + * space to write the index thrice). + * + * Note, the calculation is pessimistic, which means that most of the time + * UBIFS reports less space than it actually has. + */ +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) +{ + int divisor, factor; + + /* + * Reported space size is @free * X, where X is UBIFS block size + * divided by UBIFS block size + all overhead one data block + * introduces. The overhead is the node header + indexing overhead. + * + * Indexing overhead is calculations are based on the following + * formula: I = N/(f - 1) + 1, where I - number of indexing nodes, N - + * number of data nodes, f - fanout. Because effective UBIFS fanout is + * twice as less than maximum fanout, we assume that each data node + * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. + * Note, the multiplier 3 is because UBIFS reseves thrice as more space + * for the index. + */ + factor = UBIFS_BLOCK_SIZE; + divisor = UBIFS_MAX_DATA_NODE_SZ; + divisor += (c->max_idx_node_sz * 3) / ((c->fanout >> 1) - 1); + free *= factor; + do_div(free, divisor); + return free; +} + /** * ubifs_budg_get_free_space - return amount of free space. * @c: UBIFS file-system description object diff --git a/fs/ubifs/misc.h b/fs/ubifs/misc.h index 87ced4c74a6..4c12a9215d7 100644 --- a/fs/ubifs/misc.h +++ b/fs/ubifs/misc.h @@ -283,38 +283,6 @@ static inline void *ubifs_idx_key(const struct ubifs_info *c, return (void *)((struct ubifs_branch *)idx->branches)->key; } -/** - * ubifs_reported_space - calculate reported free space. - * @c: the UBIFS file-system description object - * @free: amount of free space - * - * This function calculates amount of free space which will be reported to - * user-space. User-space application tend to expect that if the file-system - * (e.g., via the 'statfs()' call) reports that it has N bytes available, they - * are able to write a file of size N. UBIFS attaches node headers to each data - * node and it has to write indexind nodes as well. This introduces additional - * overhead, and UBIFS it has to report sligtly less free space to meet the - * above expectetion. - * - * This function assumes free space is made up of uncompressed data nodes and - * full index nodes (one per data node, doubled because we always allow enough - * space to write the index twice). - * - * Note, the calculation is pessimistic, which means that most of the time - * UBIFS reports less space than it actually has. 
- */ -static inline long long ubifs_reported_space(const struct ubifs_info *c, - uint64_t free) -{ - int divisor, factor; - - divisor = UBIFS_MAX_DATA_NODE_SZ + (c->max_idx_node_sz * 3); - factor = UBIFS_MAX_DATA_NODE_SZ - UBIFS_DATA_NODE_SZ; - do_div(free, divisor); - - return free * factor; -} - /** * ubifs_current_time - round current time to time granularity. * @inode: inode diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 7828d69ca4f..681d46e1628 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1441,6 +1441,7 @@ void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, long long ubifs_budg_get_free_space(struct ubifs_info *c); int ubifs_calc_min_idx_lebs(struct ubifs_info *c); void ubifs_convert_page_budget(struct ubifs_info *c); +long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); long long ubifs_calc_available(const struct ubifs_info *c, int min_idx_lebs); /* find.c */ -- cgit v1.2.3 From ad507653a39e0d27404291e5d813683265388a20 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 Aug 2008 18:32:57 +0300 Subject: UBIFS: fix assertion The assertion was incorrect, because it did not take into account free space. This patch also amends the comments correspondingly, and cleans them up a little. Signed-off-by: Artem Bityutskiy --- fs/ubifs/find.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index adee7b5ddea..9fc55ae7b03 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -211,14 +211,8 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * dirty index heap, and it falls-back to LPT scanning if the heaps are empty * or do not have an LEB which satisfies the @min_space criteria. * - * Note: - * o LEBs which have less than dead watermark of dirty space are never picked - * by this function; - * - * Returns zero and the LEB properties of - * found dirty LEB in case of success, %-ENOSPC if no dirty LEB was found and a - * negative error code in case of other failures. The returned LEB is marked as - * "taken". + * Note, LEBs which have less than dead watermark of free + dirty space are + * never picked by this function. * * The additional @pick_free argument controls if this function has to return a * free or freeable LEB if one is present. For example, GC must to set it to %1, @@ -231,6 +225,10 @@ static const struct ubifs_lprops *scan_for_dirty(struct ubifs_info *c, * * In addition @pick_free is set to %2 by the recovery process in order to * recover gc_lnum in which case an index LEB must not be returned. + * + * This function returns zero and the LEB properties of found dirty LEB in case + * of success, %-ENOSPC if no dirty LEB was found and a negative error code in + * case of other failures. The returned LEB is marked as "taken". */ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int min_space, int pick_free) @@ -317,7 +315,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, lp = idx_lp; if (lp) { - ubifs_assert(lp->dirty >= c->dead_wm); + ubifs_assert(lp->free + lp->dirty >= c->dead_wm); goto found; } -- cgit v1.2.3 From 131130b9a1e6e523c64b34137b14f88ae1382a6a Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 Aug 2008 18:34:45 +0300 Subject: UBIFS: add forgotten gc_idx_lebs component We add this component at other similar places, but not in this one. 
Signed-off-by: Artem Bityutskiy --- fs/ubifs/find.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/find.c b/fs/ubifs/find.c index 9fc55ae7b03..e045c8b5542 100644 --- a/fs/ubifs/find.c +++ b/fs/ubifs/find.c @@ -243,7 +243,7 @@ int ubifs_find_dirty_leb(struct ubifs_info *c, struct ubifs_lprops *ret_lp, int lebs, rsvd_idx_lebs = 0; spin_lock(&c->space_lock); - lebs = c->lst.empty_lebs; + lebs = c->lst.empty_lebs + c->idx_gc_cnt; lebs += c->freeable_cnt - c->lst.taken_empty_lebs; /* -- cgit v1.2.3 From 9bbb5726efb64e2cfed42f6eec07db80cd87e63b Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Fri, 22 Aug 2008 18:23:22 +0300 Subject: UBIFS: introduce LEB overhead This is a preparational patch for the following statfs() report fix. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 6 ++++++ fs/ubifs/ubifs.h | 5 +++++ 2 files changed, 11 insertions(+) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1018053519e..be23fd3cfd8 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -530,6 +530,12 @@ static int init_constants_early(struct ubifs_info *c) c->dead_wm = ALIGN(MIN_WRITE_SZ, c->min_io_size); c->dark_wm = ALIGN(UBIFS_MAX_NODE_SZ, c->min_io_size); + /* + * Calculate how many bytes would be wasted at the end of LEB if it was + * fully filled with data nodes of maximum size. This is used in + * calculations when reporting free space. + */ + c->leb_overhead = c->leb_size % UBIFS_MAX_DATA_NODE_SZ; return 0; } diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 681d46e1628..57e58541de2 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -995,6 +995,9 @@ struct ubifs_mount_opts { * @max_idx_node_sz: maximum indexing node aligned on 8-bytes boundary * @max_inode_sz: maximum possible inode size in bytes * @max_znode_sz: size of znode in bytes + * + * @leb_overhead: how many bytes are wasted in an LEB when it is filled with + * data nodes of maximum size - used in free space reporting * @dead_wm: LEB dead space watermark * @dark_wm: LEB dark space watermark * @block_cnt: count of 4KiB blocks on the FS @@ -1226,6 +1229,8 @@ struct ubifs_info { int max_idx_node_sz; long long max_inode_sz; int max_znode_sz; + + int leb_overhead; int dead_wm; int dark_wm; int block_cnt; -- cgit v1.2.3 From 7dad181bbe58b8fe9e170da28bcd5f6ec9addd6d Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 25 Aug 2008 18:58:19 +0300 Subject: UBIFS: improve statfs reporting even more Since free space we report in statfs is file size which should fit to the FS - change the way we calculate free space and use leb_overhead instead of dark_wm in calculations. Results of "freespace" test (120MiB volume, 16KiB LEB size, 512 bytes page size). 
Before the change: freespace: Test 1: fill the space we have 3 times freespace: was free: 85204992 bytes 81.3 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 11284480 bytes 10.8 MiB, wrote 13.2% more than predicted freespace: was free: 83554304 bytes 79.7 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 12935168 bytes 12.3 MiB, wrote 15.5% more than predicted freespace: was free: 83554304 bytes 79.7 MiB, wrote: 96493568 bytes 92.0 MiB, delta: 12939264 bytes 12.3 MiB, wrote 15.5% more than predicted freespace: Test 1 finished freespace: Test 2: gradually lessen amount of free space and fill the FS freespace: do 10 steps, lessen free space by 7596218 bytes 7.2 MiB each time freespace: was free: 78675968 bytes 75.0 MiB, wrote: 88903680 bytes 84.8 MiB, delta: 10227712 bytes 9.8 MiB, wrote 13.0% more than predicted freespace: was free: 72015872 bytes 68.7 MiB, wrote: 81514496 bytes 77.7 MiB, delta: 9498624 bytes 9.1 MiB, wrote 13.2% more than predicted freespace: was free: 63938560 bytes 61.0 MiB, wrote: 72589312 bytes 69.2 MiB, delta: 8650752 bytes 8.2 MiB, wrote 13.5% more than predicted freespace: was free: 56127488 bytes 53.5 MiB, wrote: 63762432 bytes 60.8 MiB, delta: 7634944 bytes 7.3 MiB, wrote 13.6% more than predicted freespace: was free: 48336896 bytes 46.1 MiB, wrote: 54935552 bytes 52.4 MiB, delta: 6598656 bytes 6.3 MiB, wrote 13.7% more than predicted freespace: was free: 40587264 bytes 38.7 MiB, wrote: 46157824 bytes 44.0 MiB, delta: 5570560 bytes 5.3 MiB, wrote 13.7% more than predicted freespace: was free: 32841728 bytes 31.3 MiB, wrote: 37384192 bytes 35.7 MiB, delta: 4542464 bytes 4.3 MiB, wrote 13.8% more than predicted freespace: was free: 25100288 bytes 23.9 MiB, wrote: 28618752 bytes 27.3 MiB, delta: 3518464 bytes 3.4 MiB, wrote 14.0% more than predicted freespace: was free: 17342464 bytes 16.5 MiB, wrote: 19841024 bytes 18.9 MiB, delta: 2498560 bytes 2.4 MiB, wrote 14.4% more than predicted freespace: was free: 9605120 bytes 9.2 MiB, wrote: 11063296 bytes 10.6 MiB, delta: 1458176 bytes 1.4 MiB, wrote 15.2% more than predicted freespace: Test 2 finished freespace: Test 3: gradually lessen amount of free space by trashing and fill the FS freespace: do 10 steps, lessen free space by 7606272 bytes 7.3 MiB each time freespace: trashing: was free: 83668992 bytes 79.8 MiB, need free: 7606272 bytes 7.3 MiB, files created: 248297, delete 225724 (90.9% of them) freespace: was free: 70803456 bytes 67.5 MiB, wrote: 82485248 bytes 78.7 MiB, delta: 11681792 bytes 11.1 MiB, wrote 16.5% more than predicted freespace: trashing: was free: 81080320 bytes 77.3 MiB, need free: 15212544 bytes 14.5 MiB, files created: 248711, delete 202047 (81.2% of them) freespace: was free: 59867136 bytes 57.1 MiB, wrote: 71897088 bytes 68.6 MiB, delta: 12029952 bytes 11.5 MiB, wrote 20.1% more than predicted freespace: trashing: was free: 82243584 bytes 78.4 MiB, need free: 22818816 bytes 21.8 MiB, files created: 248866, delete 179817 (72.3% of them) freespace: was free: 50905088 bytes 48.5 MiB, wrote: 63168512 bytes 60.2 MiB, delta: 12263424 bytes 11.7 MiB, wrote 24.1% more than predicted freespace: trashing: was free: 83402752 bytes 79.5 MiB, need free: 30425088 bytes 29.0 MiB, files created: 248920, delete 158114 (63.5% of them) freespace: was free: 42651648 bytes 40.7 MiB, wrote: 55406592 bytes 52.8 MiB, delta: 12754944 bytes 12.2 MiB, wrote 29.9% more than predicted freespace: trashing: was free: 84402176 bytes 80.5 MiB, need free: 38031360 bytes 36.3 MiB, files created: 248709, delete 136641 (54.9% of them) 
freespace: was free: 35233792 bytes 33.6 MiB, wrote: 48250880 bytes 46.0 MiB, delta: 13017088 bytes 12.4 MiB, wrote 36.9% more than predicted freespace: trashing: was free: 82530304 bytes 78.7 MiB, need free: 45637632 bytes 43.5 MiB, files created: 248778, delete 111208 (44.7% of them) freespace: was free: 27287552 bytes 26.0 MiB, wrote: 40267776 bytes 38.4 MiB, delta: 12980224 bytes 12.4 MiB, wrote 47.6% more than predicted freespace: trashing: was free: 85114880 bytes 81.2 MiB, need free: 53243904 bytes 50.8 MiB, files created: 248508, delete 93052 (37.4% of them) freespace: was free: 22437888 bytes 21.4 MiB, wrote: 35328000 bytes 33.7 MiB, delta: 12890112 bytes 12.3 MiB, wrote 57.4% more than predicted freespace: trashing: was free: 84103168 bytes 80.2 MiB, need free: 60850176 bytes 58.0 MiB, files created: 248637, delete 68743 (27.6% of them) freespace: was free: 15536128 bytes 14.8 MiB, wrote: 28319744 bytes 27.0 MiB, delta: 12783616 bytes 12.2 MiB, wrote 82.3% more than predicted freespace: trashing: was free: 84357120 bytes 80.4 MiB, need free: 68456448 bytes 65.3 MiB, files created: 248567, delete 46852 (18.8% of them) freespace: was free: 9015296 bytes 8.6 MiB, wrote: 22044672 bytes 21.0 MiB, delta: 13029376 bytes 12.4 MiB, wrote 144.5% more than predicted freespace: trashing: was free: 84942848 bytes 81.0 MiB, need free: 76062720 bytes 72.5 MiB, files created: 248636, delete 25993 (10.5% of them) freespace: was free: 6086656 bytes 5.8 MiB, wrote: 8331264 bytes 7.9 MiB, delta: 2244608 bytes 2.1 MiB, wrote 36.9% more than predicted freespace: Test 3 finished freespace: finished successfully After the change: freespace: Test 1: fill the space we have 3 times freespace: was free: 94048256 bytes 89.7 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 2441216 bytes 2.3 MiB, wrote 2.6% more than predicted freespace: was free: 92246016 bytes 88.0 MiB, wrote: 96493568 bytes 92.0 MiB, delta: 4247552 bytes 4.1 MiB, wrote 4.6% more than predicted freespace: was free: 92254208 bytes 88.0 MiB, wrote: 96489472 bytes 92.0 MiB, delta: 4235264 bytes 4.0 MiB, wrote 4.6% more than predicted freespace: Test 1 finished freespace: Test 2: gradually lessen amount of free space and fill the FS freespace: do 10 steps, lessen free space by 8386001 bytes 8.0 MiB each time freespace: was free: 86605824 bytes 82.6 MiB, wrote: 88252416 bytes 84.2 MiB, delta: 1646592 bytes 1.6 MiB, wrote 1.9% more than predicted freespace: was free: 78667776 bytes 75.0 MiB, wrote: 80715776 bytes 77.0 MiB, delta: 2048000 bytes 2.0 MiB, wrote 2.6% more than predicted freespace: was free: 69615616 bytes 66.4 MiB, wrote: 71630848 bytes 68.3 MiB, delta: 2015232 bytes 1.9 MiB, wrote 2.9% more than predicted freespace: was free: 61018112 bytes 58.2 MiB, wrote: 62783488 bytes 59.9 MiB, delta: 1765376 bytes 1.7 MiB, wrote 2.9% more than predicted freespace: was free: 52424704 bytes 50.0 MiB, wrote: 53968896 bytes 51.5 MiB, delta: 1544192 bytes 1.5 MiB, wrote 2.9% more than predicted freespace: was free: 43880448 bytes 41.8 MiB, wrote: 45199360 bytes 43.1 MiB, delta: 1318912 bytes 1.3 MiB, wrote 3.0% more than predicted freespace: was free: 35332096 bytes 33.7 MiB, wrote: 36425728 bytes 34.7 MiB, delta: 1093632 bytes 1.0 MiB, wrote 3.1% more than predicted freespace: was free: 26771456 bytes 25.5 MiB, wrote: 27643904 bytes 26.4 MiB, delta: 872448 bytes 852.0 KiB, wrote 3.3% more than predicted freespace: was free: 18231296 bytes 17.4 MiB, wrote: 18878464 bytes 18.0 MiB, delta: 647168 bytes 632.0 KiB, wrote 3.5% more than predicted freespace: 
was free: 9674752 bytes 9.2 MiB, wrote: 10088448 bytes 9.6 MiB, delta: 413696 bytes 404.0 KiB, wrote 4.3% more than predicted freespace: Test 2 finished freespace: Test 3: gradually lessen amount of free space by trashing and fill the FS freespace: do 10 steps, lessen free space by 8397544 bytes 8.0 MiB each time freespace: trashing: was free: 92372992 bytes 88.1 MiB, need free: 8397552 bytes 8.0 MiB, files created: 248296, delete 225723 (90.9% of them) freespace: was free: 71909376 bytes 68.6 MiB, wrote: 82472960 bytes 78.7 MiB, delta: 10563584 bytes 10.1 MiB, wrote 14.7% more than predicted freespace: trashing: was free: 88989696 bytes 84.9 MiB, need free: 16795096 bytes 16.0 MiB, files created: 248794, delete 201838 (81.1% of them) freespace: was free: 60354560 bytes 57.6 MiB, wrote: 71782400 bytes 68.5 MiB, delta: 11427840 bytes 10.9 MiB, wrote 18.9% more than predicted freespace: trashing: was free: 90304512 bytes 86.1 MiB, need free: 25192640 bytes 24.0 MiB, files created: 248733, delete 179342 (72.1% of them) freespace: was free: 51187712 bytes 48.8 MiB, wrote: 62943232 bytes 60.0 MiB, delta: 11755520 bytes 11.2 MiB, wrote 23.0% more than predicted freespace: trashing: was free: 91209728 bytes 87.0 MiB, need free: 33590184 bytes 32.0 MiB, files created: 248779, delete 157160 (63.2% of them) freespace: was free: 42704896 bytes 40.7 MiB, wrote: 55050240 bytes 52.5 MiB, delta: 12345344 bytes 11.8 MiB, wrote 28.9% more than predicted freespace: trashing: was free: 92700672 bytes 88.4 MiB, need free: 41987728 bytes 40.0 MiB, files created: 248848, delete 136135 (54.7% of them) freespace: was free: 35250176 bytes 33.6 MiB, wrote: 48115712 bytes 45.9 MiB, delta: 12865536 bytes 12.3 MiB, wrote 36.5% more than predicted freespace: trashing: was free: 93986816 bytes 89.6 MiB, need free: 50385272 bytes 48.1 MiB, files created: 248723, delete 115385 (46.4% of them) freespace: was free: 29995008 bytes 28.6 MiB, wrote: 41582592 bytes 39.7 MiB, delta: 11587584 bytes 11.1 MiB, wrote 38.6% more than predicted freespace: trashing: was free: 91881472 bytes 87.6 MiB, need free: 58782816 bytes 56.1 MiB, files created: 248645, delete 89569 (36.0% of them) freespace: was free: 22511616 bytes 21.5 MiB, wrote: 34705408 bytes 33.1 MiB, delta: 12193792 bytes 11.6 MiB, wrote 54.2% more than predicted freespace: trashing: was free: 91774976 bytes 87.5 MiB, need free: 67180360 bytes 64.1 MiB, files created: 248580, delete 66616 (26.8% of them) freespace: was free: 16908288 bytes 16.1 MiB, wrote: 26898432 bytes 25.7 MiB, delta: 9990144 bytes 9.5 MiB, wrote 59.1% more than predicted freespace: trashing: was free: 92450816 bytes 88.2 MiB, need free: 75577904 bytes 72.1 MiB, files created: 248654, delete 45381 (18.3% of them) freespace: was free: 10170368 bytes 9.7 MiB, wrote: 19111936 bytes 18.2 MiB, delta: 8941568 bytes 8.5 MiB, wrote 87.9% more than predicted freespace: trashing: was free: 93282304 bytes 89.0 MiB, need free: 83975448 bytes 80.1 MiB, files created: 248513, delete 24794 (10.0% of them) freespace: was free: 3911680 bytes 3.7 MiB, wrote: 7872512 bytes 7.5 MiB, delta: 3960832 bytes 3.8 MiB, wrote 101.3% more than predicted freespace: Test 3 finished freespace: finished successfully Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 38 ++++++++++++++++++++++++++++++++++---- fs/ubifs/super.c | 10 +++++----- fs/ubifs/ubifs.h | 2 +- 3 files changed, 40 insertions(+), 10 deletions(-) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 7851480a6ce..101d278c591 100644 --- a/fs/ubifs/budget.c +++ 
b/fs/ubifs/budget.c @@ -747,14 +747,24 @@ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) } /** - * ubifs_budg_get_free_space - return amount of free space. + * ubifs_get_free_space - return amount of free space. * @c: UBIFS file-system description object * - * This function returns amount of free space on the file-system. + * This function calculates the amount of free space to report to user-space. + * + * Because UBIFS may introduce substantial overhead (the index, node headers, + * alignment, wastage at the end of eraseblocks, etc), it cannot report the real + * amount of free flash space it has (well, because not all dirty space is + * reclaimable, UBIFS does not actually know the real amount). If UBIFS did so, + * it would break user expectations about what free space is. Users seem to be + * accustomed to assuming that if the file-system reports N bytes of free space, + * they would be able to fit a file of N bytes on the FS. This almost works for + * traditional file-systems, because they have way less overhead than UBIFS. + * So, to keep users happy, UBIFS tries to take the overhead into account. */ -long long ubifs_budg_get_free_space(struct ubifs_info *c) +long long ubifs_get_free_space(struct ubifs_info *c) { - int min_idx_lebs; + int min_idx_lebs, rsvd_idx_lebs, lebs; long long available, outstanding, free; spin_lock(&c->space_lock); @@ -771,6 +781,26 @@ long long ubifs_budg_get_free_space(struct ubifs_info *c) } available = ubifs_calc_available(c, min_idx_lebs); + + /* + * When reporting free space to user-space, UBIFS guarantees that it is + * possible to write a file of free space size. This means that for + * empty LEBs we may use more precise calculations than + * 'ubifs_calc_available()' is using. Namely, we know that in empty + * LEBs we would waste only @c->leb_overhead bytes, not @c->dark_wm. + * Thus, amend the available space. + * + * Note, the calculations below are similar to what we have in + * 'do_budget_space()', so refer there for comments. + */ + if (min_idx_lebs > c->lst.idx_lebs) + rsvd_idx_lebs = min_idx_lebs - c->lst.idx_lebs; + else + rsvd_idx_lebs = 0; + lebs = c->lst.empty_lebs + c->freeable_cnt + c->idx_gc_cnt - + c->lst.taken_empty_lebs; + lebs -= rsvd_idx_lebs; + available += lebs * (c->dark_wm - c->leb_overhead); spin_unlock(&c->space_lock); if (available > outstanding) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index be23fd3cfd8..1207bd51ead 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -371,7 +371,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) struct ubifs_info *c = dentry->d_sb->s_fs_info; unsigned long long free; - free = ubifs_budg_get_free_space(c); + free = ubifs_get_free_space(c); dbg_gen("free space %lld bytes (%lld blocks)", free, free >> UBIFS_BLOCK_SHIFT); @@ -653,11 +653,11 @@ static int init_constants_late(struct ubifs_info *c) * internally because it does not make much sense for UBIFS, but it is * necessary to report something for the 'statfs()' call. * - * Subtract the LEB reserved for GC and the LEB which is reserved for - * deletions. + * Subtract the LEB reserved for GC, the LEB which is reserved for + * deletions, and assume only one journal head is available.
*/ - tmp64 = c->main_lebs - 2; - tmp64 *= (uint64_t)c->leb_size - c->dark_wm; + tmp64 = c->main_lebs - 2 - c->jhead_cnt + 1; + tmp64 *= (uint64_t)c->leb_size - c->leb_overhead; tmp64 = ubifs_reported_space(c, tmp64); c->block_cnt = tmp64 >> UBIFS_BLOCK_SHIFT; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 57e58541de2..17c620b93ee 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -1443,7 +1443,7 @@ void ubifs_release_ino_dirty(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); void ubifs_cancel_ino_op(struct ubifs_info *c, struct inode *inode, struct ubifs_budget_req *req); -long long ubifs_budg_get_free_space(struct ubifs_info *c); +long long ubifs_get_free_space(struct ubifs_info *c); int ubifs_calc_min_idx_lebs(struct ubifs_info *c); void ubifs_convert_page_budget(struct ubifs_info *c); long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free); -- cgit v1.2.3 From b3385c278d3c32aec68d4900b35bc07df1b2240c Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Sun, 31 Aug 2008 17:13:18 +0300 Subject: UBIFS: fill f_fsid UBIFS stores a 16-byte UUID in the superblock, and it is a good idea to return part of it in the 'f_fsid' field of the kstatfs structure. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 1207bd51ead..0dee4042c6c 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -386,6 +386,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = UBIFS_MAX_NLEN; + memcpy(&buf->f_fsid, c->uuid, sizeof(__kernel_fsid_t)); return 0; } -- cgit v1.2.3 From 9e3f544d792fd2ff7e31ca4a72e5194f1491ed14 Mon Sep 17 00:00:00 2001 From: Haavard Skinnemoen Date: Wed, 20 Aug 2008 15:46:24 +0200 Subject: avr32: Fix lockup after Java stack underflow in user mode When using the Java Extension Module hardware, a Java stack underflow or overflow trap may cause the system to enter an infinite exception loop. Although there's no kernel support for the Java hardware yet, we need to be able to recover from this situation and keep the system running. This patch adds code to detect and fix up this situation in the critical exception handler and terminate the faulting process. We may have to rethink how to handle this more gracefully when the necessary kernel support for hardware-accelerated Java is added. Reported-by: Guennadi Liakhovetski Signed-off-by: Haavard Skinnemoen --- arch/avr32/kernel/asm-offsets.c | 6 ++++ arch/avr32/kernel/entry-avr32b.S | 59 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 63 insertions(+), 2 deletions(-) diff --git a/arch/avr32/kernel/asm-offsets.c b/arch/avr32/kernel/asm-offsets.c index e4796c67a83..d6a8193a1d2 100644 --- a/arch/avr32/kernel/asm-offsets.c +++ b/arch/avr32/kernel/asm-offsets.c @@ -4,6 +4,8 @@ * to extract and format the required data.
*/ +#include +#include #include #include @@ -17,4 +19,8 @@ void foo(void) OFFSET(TI_rar_saved, thread_info, rar_saved); OFFSET(TI_rsr_saved, thread_info, rsr_saved); OFFSET(TI_restart_block, thread_info, restart_block); + BLANK(); + OFFSET(TSK_active_mm, task_struct, active_mm); + BLANK(); + OFFSET(MM_pgd, mm_struct, pgd); } diff --git a/arch/avr32/kernel/entry-avr32b.S b/arch/avr32/kernel/entry-avr32b.S index 2b398cae110..33d49377b8b 100644 --- a/arch/avr32/kernel/entry-avr32b.S +++ b/arch/avr32/kernel/entry-avr32b.S @@ -334,9 +334,64 @@ save_full_context_ex: /* Low-level exception handlers */ handle_critical: + /* + * AT32AP700x errata: + * + * After a Java stack overflow or underflow trap, any CPU + * memory access may cause erratic behavior. This will happen + * when the four least significant bits of the JOSP system + * register contains any value between 9 and 15 (inclusive). + * + * Possible workarounds: + * - Don't use the Java Extension Module + * - Ensure that the stack overflow and underflow trap + * handlers do not do any memory access or trigger any + * exceptions before the overflow/underflow condition is + * cleared (by incrementing or decrementing the JOSP) + * - Make sure that JOSP does not contain any problematic + * value before doing any exception or interrupt + * processing. + * - Set up a critical exception handler which writes a + * known-to-be-safe value, e.g. 4, to JOSP before doing + * any further processing. + * + * We'll use the last workaround for now since we cannot + * guarantee that user space processes don't use Java mode. + * Non-well-behaving userland will be terminated with extreme + * prejudice. + */ +#ifdef CONFIG_CPU_AT32AP700X + /* + * There's a chance we can't touch memory, so temporarily + * borrow PTBR to save the stack pointer while we fix things + * up... + */ + mtsr SYSREG_PTBR, sp + mov sp, 4 + mtsr SYSREG_JOSP, sp + mfsr sp, SYSREG_PTBR + sub pc, -2 + + /* Push most of pt_regs on stack. We'll do the rest later */ sub sp, 4 - stmts --sp, r0-lr - rcall save_full_context_ex + pushm r0-r12 + + /* PTBR mirrors current_thread_info()->task->active_mm->pgd */ + get_thread_info r0 + ld.w r1, r0[TI_task] + ld.w r2, r1[TSK_active_mm] + ld.w r3, r2[MM_pgd] + mtsr SYSREG_PTBR, r3 +#else + sub sp, 4 + pushm r0-r12 +#endif + sub r0, sp, -(14 * 4) + mov r1, lr + mfsr r2, SYSREG_RAR_EX + mfsr r3, SYSREG_RSR_EX + pushm r0-r3 + mfsr r12, SYSREG_ECR mov r11, sp rcall do_critical_exception -- cgit v1.2.3 From 84c4f2f21a8e6e6d4bdfff95bf5ddc7925df4e01 Mon Sep 17 00:00:00 2001 From: Humphrey Bucknell Date: Fri, 29 Aug 2008 16:27:00 +0100 Subject: avr32: pm_standby low-power ram bug fix The value stored into the SDRAMC LPR register should be the current value of the register with the Self-refresh value set in the lower bit field. The bug involved only the Self-refresh value being written to the register, thus over writing any low-power ram settings. 
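
For illustration, the intended behaviour can be sketched as a read-modify-write in plain C; the macro values and helper name below are invented for this example and are not taken from the real AT32AP headers:

#define SDRAMC_LPR_LPCB_MASK            0x3u    /* assumed: two-bit LPCB field at bits 0..1 */
#define SDRAMC_LPR_LPCB_SELF_RFR        0x1u    /* assumed: self-refresh field value */

/*
 * Keep every other low-power RAM setting in the register value and update
 * only the LPCB field, which is what the bfins sequence prepares in r10
 * before it is stored back to SDRAMC_LPR.
 */
static unsigned int lpr_set_self_refresh(unsigned int lpr)
{
        return (lpr & ~SDRAMC_LPR_LPCB_MASK) | SDRAMC_LPR_LPCB_SELF_RFR;
}

The bug described above corresponds to writing back only SDRAMC_LPR_LPCB_SELF_RFR, discarding the rest of the register contents.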
Signed-off-by: Humphrey Bucknell Signed-off-by: Haavard Skinnemoen --- arch/avr32/mach-at32ap/pm-at32ap700x.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/avr32/mach-at32ap/pm-at32ap700x.S b/arch/avr32/mach-at32ap/pm-at32ap700x.S index 5be4de65b20..17503b0ed6c 100644 --- a/arch/avr32/mach-at32ap/pm-at32ap700x.S +++ b/arch/avr32/mach-at32ap/pm-at32ap700x.S @@ -134,7 +134,7 @@ pm_standby: mov r11, SDRAMC_LPR_LPCB_SELF_RFR bfins r10, r11, 0, 2 /* LPCB <- self Refresh */ sync 0 /* flush write buffer */ - st.w r12[SDRAMC_LPR], r11 /* put SDRAM in self-refresh mode */ + st.w r12[SDRAMC_LPR], r10 /* put SDRAM in self-refresh mode */ ld.w r11, r12[SDRAMC_LPR] unmask_interrupts sleep CPU_SLEEP_FROZEN -- cgit v1.2.3 From e5bd1c3fdd06b6c0fa6dfb98ce31cea1820ce4e9 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Wed, 3 Sep 2008 02:14:39 -0700 Subject: sparc64: Fix IPI call locking. When I switched sparc64 over to the generic helpers for smp_call_function(), I didn't convert the dinky call_lock we had. Use ipi_call_lock() and ipi_call_unlock(). Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 743ccad61c6..0712a445f98 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -80,8 +80,6 @@ void smp_bogo(struct seq_file *m) i, cpu_data(i).clock_tick); } -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(call_lock); - extern void setup_sparc64_timer(void); static volatile unsigned long callin_flag = 0; @@ -120,9 +118,9 @@ void __cpuinit smp_callin(void) while (!cpu_isset(cpuid, smp_commenced_mask)) rmb(); - spin_lock(&call_lock); + ipi_call_lock(); cpu_set(cpuid, cpu_online_map); - spin_unlock(&call_lock); + ipi_call_unlock(); /* idle thread is expected to have preempt disabled */ preempt_disable(); @@ -1305,9 +1303,9 @@ int __cpu_disable(void) c->core_id = 0; c->proc_id = -1; - spin_lock(&call_lock); + ipi_call_lock(); cpu_clear(cpu, cpu_online_map); - spin_unlock(&call_lock); + ipi_call_unlock(); smp_wmb(); -- cgit v1.2.3 From 69114a47af524badc73de8fc86959941a03f0600 Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 3 Sep 2008 10:15:26 +0100 Subject: [ARM] omap: fix gpio.c build error arch/arm/plat-omap/gpio.c: In function '_omap_gpio_init': arch/arm/plat-omap/gpio.c:1492: error: 'omap_mpuio_device' undeclared (first use in this function) Signed-off-by: Russell King --- arch/arm/plat-omap/gpio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/plat-omap/gpio.c b/arch/arm/plat-omap/gpio.c index 3e76ee2bc73..9e1341ebc14 100644 --- a/arch/arm/plat-omap/gpio.c +++ b/arch/arm/plat-omap/gpio.c @@ -1488,7 +1488,7 @@ static int __init _omap_gpio_init(void) bank->chip.set = gpio_set; if (bank_is_mpuio(bank)) { bank->chip.label = "mpuio"; -#ifdef CONFIG_ARCH_OMAP1 +#ifdef CONFIG_ARCH_OMAP16XX bank->chip.dev = &omap_mpuio_device.dev; #endif bank->chip.base = OMAP_MPUIO(0); -- cgit v1.2.3 From 4d084617fb0d025c42c242362d1f27d337e2d407 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 3 Sep 2008 02:15:30 -0700 Subject: sparc64: Prevent sparc64 from invoking irq handlers on offline CPUs Make sparc64 refrain from clearing a given to-be-offlined CPU's bit in the cpu_online_mask until it has processed pending irqs. This change prevents other CPUs from being blindsided by an apparently offline CPU nevertheless changing globally visible state. Signed-off-by: Paul E. 
McKenney Signed-off-by: David S. Miller --- arch/sparc64/kernel/smp.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/sparc64/kernel/smp.c b/arch/sparc64/kernel/smp.c index 0712a445f98..2be166c544c 100644 --- a/arch/sparc64/kernel/smp.c +++ b/arch/sparc64/kernel/smp.c @@ -1303,10 +1303,6 @@ int __cpu_disable(void) c->core_id = 0; c->proc_id = -1; - ipi_call_lock(); - cpu_clear(cpu, cpu_online_map); - ipi_call_unlock(); - smp_wmb(); /* Make sure no interrupts point to this cpu. */ @@ -1316,6 +1312,10 @@ int __cpu_disable(void) mdelay(1); local_irq_disable(); + ipi_call_lock(); + cpu_clear(cpu, cpu_online_map); + ipi_call_unlock(); + return 0; } -- cgit v1.2.3 From 7c7cbadf7341a0792879c67d6e3020f040d6cd7f Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 3 Sep 2008 14:16:42 +0300 Subject: UBIFS: amend f_fsid David Woodhouse suggested to be consistent with other FSes and xor the beginning and the end of the UUID. Signed-off-by: Artem Bityutskiy --- fs/ubifs/super.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index 0dee4042c6c..7562464ac83 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -370,6 +370,7 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct ubifs_info *c = dentry->d_sb->s_fs_info; unsigned long long free; + __le32 *uuid = (__le32 *)c->uuid; free = ubifs_get_free_space(c); dbg_gen("free space %lld bytes (%lld blocks)", @@ -386,8 +387,8 @@ static int ubifs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_files = 0; buf->f_ffree = 0; buf->f_namelen = UBIFS_MAX_NLEN; - memcpy(&buf->f_fsid, c->uuid, sizeof(__kernel_fsid_t)); - + buf->f_fsid.val[0] = le32_to_cpu(uuid[0]) ^ le32_to_cpu(uuid[2]); + buf->f_fsid.val[1] = le32_to_cpu(uuid[1]) ^ le32_to_cpu(uuid[3]); return 0; } -- cgit v1.2.3 From c3df1a2685004bce97827bdbe7a304734009849b Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 3 Sep 2008 23:59:23 +0100 Subject: [ARM] omap: fix build error in ohci-omap.c drivers/usb/host/ohci-omap.c: In function 'ohci_omap_init': drivers/usb/host/ohci-omap.c:228: error: 'start_hnp' undeclared (first use in this function) Signed-off-by: Russell King --- drivers/usb/host/ohci-omap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/ohci-omap.c b/drivers/usb/host/ohci-omap.c index 1eb64d08b60..95b3ec89c12 100644 --- a/drivers/usb/host/ohci-omap.c +++ b/drivers/usb/host/ohci-omap.c @@ -208,7 +208,7 @@ static int ohci_omap_init(struct usb_hcd *hcd) if (cpu_is_omap16xx()) ocpi_enable(); -#ifdef CONFIG_ARCH_OMAP_OTG +#ifdef CONFIG_USB_OTG if (need_transceiver) { ohci->transceiver = otg_get_transceiver(); if (ohci->transceiver) { -- cgit v1.2.3 From 8b540fdcb7c445d61955991f071e956ef40a2f17 Mon Sep 17 00:00:00 2001 From: Huang Weiyi Date: Sat, 23 Aug 2008 13:56:02 +0800 Subject: [ARM] remove unused #include The driver(s) below do not use LINUX_VERSION_CODE nor KERNEL_VERSION. arch/arm/plat-mxc/clock.c This patch removes the said #include . 
Signed-off-by: Huang Weiyi Signed-off-by: Russell King --- arch/arm/plat-mxc/clock.c | 1 - 1 file changed, 1 deletion(-) diff --git a/arch/arm/plat-mxc/clock.c b/arch/arm/plat-mxc/clock.c index 2f862721883..0a38f0b396e 100644 --- a/arch/arm/plat-mxc/clock.c +++ b/arch/arm/plat-mxc/clock.c @@ -37,7 +37,6 @@ #include #include #include -#include #include -- cgit v1.2.3 From 65846909d684d75906269df4f5f3474e1fef568b Mon Sep 17 00:00:00 2001 From: Russell King Date: Wed, 3 Sep 2008 23:46:18 +0100 Subject: [ARM] omap: fix virtual vs physical address space confusions mcbsp is confused as to what takes a physical or virtual address. Fix the two instances where it gets it wrong. Acked-by: Tony Lindgren Signed-off-by: Russell King --- arch/arm/mach-omap1/mcbsp.c | 8 ++++++++ arch/arm/mach-omap2/mcbsp.c | 4 ++++ arch/arm/plat-omap/include/mach/mcbsp.h | 2 ++ arch/arm/plat-omap/mcbsp.c | 5 +++-- 4 files changed, 17 insertions(+), 2 deletions(-) diff --git a/arch/arm/mach-omap1/mcbsp.c b/arch/arm/mach-omap1/mcbsp.c index 826010d5d01..2baeaeb0c90 100644 --- a/arch/arm/mach-omap1/mcbsp.c +++ b/arch/arm/mach-omap1/mcbsp.c @@ -159,6 +159,7 @@ static struct omap_mcbsp_ops omap1_mcbsp_ops = { #ifdef CONFIG_ARCH_OMAP730 static struct omap_mcbsp_platform_data omap730_mcbsp_pdata[] = { { + .phys_base = OMAP730_MCBSP1_BASE, .virt_base = io_p2v(OMAP730_MCBSP1_BASE), .dma_rx_sync = OMAP_DMA_MCBSP1_RX, .dma_tx_sync = OMAP_DMA_MCBSP1_TX, @@ -167,6 +168,7 @@ static struct omap_mcbsp_platform_data omap730_mcbsp_pdata[] = { .ops = &omap1_mcbsp_ops, }, { + .phys_base = OMAP730_MCBSP2_BASE, .virt_base = io_p2v(OMAP730_MCBSP2_BASE), .dma_rx_sync = OMAP_DMA_MCBSP3_RX, .dma_tx_sync = OMAP_DMA_MCBSP3_TX, @@ -184,6 +186,7 @@ static struct omap_mcbsp_platform_data omap730_mcbsp_pdata[] = { #ifdef CONFIG_ARCH_OMAP15XX static struct omap_mcbsp_platform_data omap15xx_mcbsp_pdata[] = { { + .phys_base = OMAP1510_MCBSP1_BASE, .virt_base = OMAP1510_MCBSP1_BASE, .dma_rx_sync = OMAP_DMA_MCBSP1_RX, .dma_tx_sync = OMAP_DMA_MCBSP1_TX, @@ -193,6 +196,7 @@ static struct omap_mcbsp_platform_data omap15xx_mcbsp_pdata[] = { .clk_name = "mcbsp_clk", }, { + .phys_base = OMAP1510_MCBSP2_BASE, .virt_base = io_p2v(OMAP1510_MCBSP2_BASE), .dma_rx_sync = OMAP_DMA_MCBSP2_RX, .dma_tx_sync = OMAP_DMA_MCBSP2_TX, @@ -201,6 +205,7 @@ static struct omap_mcbsp_platform_data omap15xx_mcbsp_pdata[] = { .ops = &omap1_mcbsp_ops, }, { + .phys_base = OMAP1510_MCBSP3_BASE, .virt_base = OMAP1510_MCBSP3_BASE, .dma_rx_sync = OMAP_DMA_MCBSP3_RX, .dma_tx_sync = OMAP_DMA_MCBSP3_TX, @@ -219,6 +224,7 @@ static struct omap_mcbsp_platform_data omap15xx_mcbsp_pdata[] = { #ifdef CONFIG_ARCH_OMAP16XX static struct omap_mcbsp_platform_data omap16xx_mcbsp_pdata[] = { { + .phys_base = OMAP1610_MCBSP1_BASE, .virt_base = OMAP1610_MCBSP1_BASE, .dma_rx_sync = OMAP_DMA_MCBSP1_RX, .dma_tx_sync = OMAP_DMA_MCBSP1_TX, @@ -228,6 +234,7 @@ static struct omap_mcbsp_platform_data omap16xx_mcbsp_pdata[] = { .clk_name = "mcbsp_clk", }, { + .phys_base = OMAP1610_MCBSP2_BASE, .virt_base = io_p2v(OMAP1610_MCBSP2_BASE), .dma_rx_sync = OMAP_DMA_MCBSP2_RX, .dma_tx_sync = OMAP_DMA_MCBSP2_TX, @@ -236,6 +243,7 @@ static struct omap_mcbsp_platform_data omap16xx_mcbsp_pdata[] = { .ops = &omap1_mcbsp_ops, }, { + .phys_base = OMAP1610_MCBSP3_BASE, .virt_base = OMAP1610_MCBSP3_BASE, .dma_rx_sync = OMAP_DMA_MCBSP3_RX, .dma_tx_sync = OMAP_DMA_MCBSP3_TX, diff --git a/arch/arm/mach-omap2/mcbsp.c b/arch/arm/mach-omap2/mcbsp.c index 27eb6e3ca92..b261f1f80b5 100644 --- a/arch/arm/mach-omap2/mcbsp.c +++ 
b/arch/arm/mach-omap2/mcbsp.c @@ -134,6 +134,7 @@ static struct omap_mcbsp_ops omap2_mcbsp_ops = { #ifdef CONFIG_ARCH_OMAP24XX static struct omap_mcbsp_platform_data omap24xx_mcbsp_pdata[] = { { + .phys_base = OMAP24XX_MCBSP1_BASE, .virt_base = IO_ADDRESS(OMAP24XX_MCBSP1_BASE), .dma_rx_sync = OMAP24XX_DMA_MCBSP1_RX, .dma_tx_sync = OMAP24XX_DMA_MCBSP1_TX, @@ -143,6 +144,7 @@ static struct omap_mcbsp_platform_data omap24xx_mcbsp_pdata[] = { .clk_name = "mcbsp_clk", }, { + .phys_base = OMAP24XX_MCBSP2_BASE, .virt_base = IO_ADDRESS(OMAP24XX_MCBSP2_BASE), .dma_rx_sync = OMAP24XX_DMA_MCBSP2_RX, .dma_tx_sync = OMAP24XX_DMA_MCBSP2_TX, @@ -161,6 +163,7 @@ static struct omap_mcbsp_platform_data omap24xx_mcbsp_pdata[] = { #ifdef CONFIG_ARCH_OMAP34XX static struct omap_mcbsp_platform_data omap34xx_mcbsp_pdata[] = { { + .phys_base = OMAP34XX_MCBSP1_BASE, .virt_base = IO_ADDRESS(OMAP34XX_MCBSP1_BASE), .dma_rx_sync = OMAP24XX_DMA_MCBSP1_RX, .dma_tx_sync = OMAP24XX_DMA_MCBSP1_TX, @@ -170,6 +173,7 @@ static struct omap_mcbsp_platform_data omap34xx_mcbsp_pdata[] = { .clk_name = "mcbsp_clk", }, { + .phys_base = OMAP34XX_MCBSP2_BASE, .virt_base = IO_ADDRESS(OMAP34XX_MCBSP2_BASE), .dma_rx_sync = OMAP24XX_DMA_MCBSP2_RX, .dma_tx_sync = OMAP24XX_DMA_MCBSP2_TX, diff --git a/arch/arm/plat-omap/include/mach/mcbsp.h b/arch/arm/plat-omap/include/mach/mcbsp.h index 6eb44a92871..8fdb95e26fc 100644 --- a/arch/arm/plat-omap/include/mach/mcbsp.h +++ b/arch/arm/plat-omap/include/mach/mcbsp.h @@ -315,6 +315,7 @@ struct omap_mcbsp_ops { }; struct omap_mcbsp_platform_data { + unsigned long phys_base; u32 virt_base; u8 dma_rx_sync, dma_tx_sync; u16 rx_irq, tx_irq; @@ -324,6 +325,7 @@ struct omap_mcbsp_platform_data { struct omap_mcbsp { struct device *dev; + unsigned long phys_base; u32 io_base; u8 id; u8 free; diff --git a/arch/arm/plat-omap/mcbsp.c b/arch/arm/plat-omap/mcbsp.c index d0844050f2d..014d26574bb 100644 --- a/arch/arm/plat-omap/mcbsp.c +++ b/arch/arm/plat-omap/mcbsp.c @@ -651,7 +651,7 @@ int omap_mcbsp_xmit_buffer(unsigned int id, dma_addr_t buffer, omap_set_dma_dest_params(mcbsp[id].dma_tx_lch, src_port, OMAP_DMA_AMODE_CONSTANT, - mcbsp[id].io_base + OMAP_MCBSP_REG_DXR1, + mcbsp[id].phys_base + OMAP_MCBSP_REG_DXR1, 0, 0); omap_set_dma_src_params(mcbsp[id].dma_tx_lch, @@ -712,7 +712,7 @@ int omap_mcbsp_recv_buffer(unsigned int id, dma_addr_t buffer, omap_set_dma_src_params(mcbsp[id].dma_rx_lch, src_port, OMAP_DMA_AMODE_CONSTANT, - mcbsp[id].io_base + OMAP_MCBSP_REG_DRR1, + mcbsp[id].phys_base + OMAP_MCBSP_REG_DRR1, 0, 0); omap_set_dma_dest_params(mcbsp[id].dma_rx_lch, @@ -830,6 +830,7 @@ static int __init omap_mcbsp_probe(struct platform_device *pdev) mcbsp[id].dma_tx_lch = -1; mcbsp[id].dma_rx_lch = -1; + mcbsp[id].phys_base = pdata->phys_base; mcbsp[id].io_base = pdata->virt_base; /* Default I/O is IRQ based */ mcbsp[id].io_type = OMAP_MCBSP_IRQ_IO; -- cgit v1.2.3 From 9f43e3914dceb0f8191875b3cdf4325b48d0d70a Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Tue, 2 Sep 2008 11:57:09 +1000 Subject: powerpc/spufs: Fix multiple get_spu_context() Commit 8d5636fbca202f61fdb808fc9e20c0142291d802 introduced a reference count on SPU contexts during find_victim, but this may cause a leak in the reference count if we later find a better contender for a context to unschedule. Change the reference to after we've found our victim context, so we don't do the extra get_spu_context(). 
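
The leak pattern being fixed can be sketched in plain C with hypothetical types (this is only an illustration of why the reference is taken after the loop, not the spufs code itself):

struct ctx {
        int refcount;
        int prio;
};

static void get_ctx(struct ctx *c)
{
        c->refcount++;
}

/*
 * Taking the reference inside the loop would leak one count for every
 * candidate that is later displaced by a better one; taking it once,
 * after the final choice, keeps the count balanced.
 */
static struct ctx *pick_victim(struct ctx **candidates, int n)
{
        struct ctx *victim = NULL;
        int i;

        for (i = 0; i < n; i++)
                if (!victim || candidates[i]->prio > victim->prio)
                        victim = candidates[i];

        if (victim)
                get_ctx(victim);
        return victim;
}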
Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spufs/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 1c1b627ee84..9bb45c6b839 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -643,9 +643,10 @@ static struct spu *find_victim(struct spu_context *ctx) !(tmp->flags & SPU_CREATE_NOSCHED) && (!victim || tmp->prio > victim->prio)) { victim = spu->ctx; - get_spu_context(victim); } } + if (victim) + get_spu_context(victim); mutex_unlock(&cbe_spu_info[node].list_mutex); if (victim) { -- cgit v1.2.3 From b65fe0356b5b732d7e1e0224c6a1cf2eb5255984 Mon Sep 17 00:00:00 2001 From: Jeremy Kerr Date: Thu, 4 Sep 2008 15:02:47 +1000 Subject: powerpc/spufs: Fix race for a free SPU We currently have a race for a free SPE. With one thread doing a spu_yield(), and another doing a spu_activate(): thread 1 thread 2 spu_yield(oldctx) spu_activate(ctx) __spu_deactivate(oldctx) spu_unschedule(oldctx, spu) spu->alloc_state = SPU_FREE spu = spu_get_idle(ctx) - searches for a SPE in state SPU_FREE, gets the context just freed by thread 1 spu_schedule(ctx, spu) spu->alloc_state = SPU_USED spu_schedule(newctx, spu) - assumes spu is still free - tries to schedule context on already-used spu This change introduces a 'free_spu' flag to spu_unschedule, to indicate whether or not the function should free the spu after descheduling the context. We only set this flag if we're not going to re-schedule another context on this SPU. Add a comment to document this behaviour. Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spufs/sched.c | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 9bb45c6b839..897c7406116 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -732,13 +732,28 @@ static void spu_schedule(struct spu *spu, struct spu_context *ctx) spu_release(ctx); } -static void spu_unschedule(struct spu *spu, struct spu_context *ctx) +/** + * spu_unschedule - remove a context from a spu, and possibly release it. + * @spu: The SPU to unschedule from + * @ctx: The context currently scheduled on the SPU + * @free_spu Whether to free the SPU for other contexts + * + * Unbinds the context @ctx from the SPU @spu. If @free_spu is non-zero, the + * SPU is made available for other contexts (ie, may be returned by + * spu_get_idle). If this is zero, the caller is expected to schedule another + * context to this spu. + * + * Should be called with ctx->state_mutex held. 
+ */ +static void spu_unschedule(struct spu *spu, struct spu_context *ctx, + int free_spu) { int node = spu->node; mutex_lock(&cbe_spu_info[node].list_mutex); cbe_spu_info[node].nr_active--; - spu->alloc_state = SPU_FREE; + if (free_spu) + spu->alloc_state = SPU_FREE; spu_unbind_context(spu, ctx); ctx->stats.invol_ctx_switch++; spu->stats.invol_ctx_switch++; @@ -838,7 +853,7 @@ static int __spu_deactivate(struct spu_context *ctx, int force, int max_prio) if (spu) { new = grab_runnable_context(max_prio, spu->node); if (new || force) { - spu_unschedule(spu, ctx); + spu_unschedule(spu, ctx, new == NULL); if (new) { if (new->flags & SPU_CREATE_NOSCHED) wake_up(&new->stop_wq); @@ -911,7 +926,7 @@ static noinline void spusched_tick(struct spu_context *ctx) new = grab_runnable_context(ctx->prio + 1, spu->node); if (new) { - spu_unschedule(spu, ctx); + spu_unschedule(spu, ctx, 0); if (test_bit(SPU_SCHED_SPU_RUN, &ctx->sched_flags)) spu_add_to_rq(ctx); } else { -- cgit v1.2.3 From 158e0fb6028a2329425d8287b1b2402a12ed4f28 Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Thu, 4 Sep 2008 22:20:10 -0400 Subject: Input: bcm5974 - small formatting cleanup Signed-off-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/bcm5974.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index 2ec921bf3c6..ae78bb833f6 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -63,7 +63,7 @@ } /* table of devices that work with this driver */ -static const struct usb_device_id bcm5974_table [] = { +static const struct usb_device_id bcm5974_table[] = { /* MacbookAir1.1 */ BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING_ANSI), BCM5974_DEVICE(USB_DEVICE_ID_APPLE_WELLSPRING_ISO), -- cgit v1.2.3 From 75e21e3f3bb2b4a41bb0646a4d54eef27eb36ca5 Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Thu, 4 Sep 2008 22:28:23 -0400 Subject: Input: bcm5974 - improve finger tracking and counting The problem of finger tracking, i.e., when to switch focus from one finger to another on the trackpad, has been improved by utilizing more information from the bcm5974 chip output. This results in less pointer hopping when many fingers are on the trackpad. In addition, a finger counting method based on pressure information from all fingers is introduced. Together with a pressure hysteresis window, this yields a more stable counting of the number of fingers on the trackpad. Signed-off-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/bcm5974.c | 70 +++++++++++++++++++++++++++++++++---------- 1 file changed, 54 insertions(+), 16 deletions(-) diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index ae78bb833f6..8568211c556 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -105,7 +105,7 @@ struct tp_header { /* trackpad finger structure */ struct tp_finger { - __le16 origin; /* left/right origin? 
*/ + __le16 origin; /* zero when switching track finger */ __le16 abs_x; /* absolute x coodinate */ __le16 abs_y; /* absolute y coodinate */ __le16 rel_x; /* relative x coodinate */ @@ -159,6 +159,7 @@ struct bcm5974 { struct bt_data *bt_data; /* button transferred data */ struct urb *tp_urb; /* trackpad usb request block */ struct tp_data *tp_data; /* trackpad transferred data */ + int fingers; /* number of fingers on trackpad */ }; /* logical dimensions */ @@ -172,6 +173,10 @@ struct bcm5974 { #define SN_WIDTH 100 /* width signal-to-noise ratio */ #define SN_COORD 250 /* coordinate signal-to-noise ratio */ +/* pressure thresholds */ +#define PRESSURE_LOW (2 * DIM_PRESSURE / SN_PRESSURE) +#define PRESSURE_HIGH (3 * PRESSURE_LOW) + /* device constants */ static const struct bcm5974_config bcm5974_config_table[] = { { @@ -273,32 +278,65 @@ static int report_tp_state(struct bcm5974 *dev, int size) const struct tp_finger *f = dev->tp_data->finger; struct input_dev *input = dev->input; const int fingers = (size - 26) / 28; - int p = 0, w, x, y, n = 0; + int raw_p, raw_w, raw_x, raw_y; + int ptest = 0, origin = 0, nmin = 0, nmax = 0; + int abs_p = 0, abs_w = 0, abs_x = 0, abs_y = 0; if (size < 26 || (size - 26) % 28 != 0) return -EIO; + /* always track the first finger; when detached, start over */ if (fingers) { - p = raw2int(f->force_major); - w = raw2int(f->size_major); - x = raw2int(f->abs_x); - y = raw2int(f->abs_y); - n = p > 0 ? fingers : 0; + raw_p = raw2int(f->force_major); + raw_w = raw2int(f->size_major); + raw_x = raw2int(f->abs_x); + raw_y = raw2int(f->abs_y); dprintk(9, - "bcm5974: p: %+05d w: %+05d x: %+05d y: %+05d n: %d\n", - p, w, x, y, n); + "bcm5974: raw: p: %+05d w: %+05d x: %+05d y: %+05d\n", + raw_p, raw_w, raw_x, raw_y); + + ptest = int2bound(&c->p, raw_p); + origin = raw2int(f->origin); + } - input_report_abs(input, ABS_TOOL_WIDTH, int2bound(&c->w, w)); - input_report_abs(input, ABS_X, int2bound(&c->x, x - c->x.devmin)); - input_report_abs(input, ABS_Y, int2bound(&c->y, c->y.devmax - y)); + /* while tracking finger still valid, count all fingers */ + if (ptest > PRESSURE_LOW && origin) { + abs_p = ptest; + abs_w = int2bound(&c->w, raw_w); + abs_x = int2bound(&c->x, raw_x - c->x.devmin); + abs_y = int2bound(&c->y, c->y.devmax - raw_y); + for (; f != dev->tp_data->finger + fingers; f++) { + ptest = int2bound(&c->p, raw2int(f->force_major)); + if (ptest > PRESSURE_LOW) + nmax++; + if (ptest > PRESSURE_HIGH) + nmin++; + } } - input_report_abs(input, ABS_PRESSURE, int2bound(&c->p, p)); + if (dev->fingers < nmin) + dev->fingers = nmin; + if (dev->fingers > nmax) + dev->fingers = nmax; + + input_report_key(input, BTN_TOOL_FINGER, dev->fingers == 1); + input_report_key(input, BTN_TOOL_DOUBLETAP, dev->fingers == 2); + input_report_key(input, BTN_TOOL_TRIPLETAP, dev->fingers > 2); - input_report_key(input, BTN_TOOL_FINGER, n == 1); - input_report_key(input, BTN_TOOL_DOUBLETAP, n == 2); - input_report_key(input, BTN_TOOL_TRIPLETAP, n > 2); + input_report_abs(input, ABS_PRESSURE, abs_p); + input_report_abs(input, ABS_TOOL_WIDTH, abs_w); + + if (abs_p) { + input_report_abs(input, ABS_X, abs_x); + input_report_abs(input, ABS_Y, abs_y); + + dprintk(8, + "bcm5974: abs: p: %+05d w: %+05d x: %+05d y: %+05d " + "nmin: %d nmax: %d n: %d\n", + abs_p, abs_w, abs_x, abs_y, nmin, nmax, dev->fingers); + + } input_sync(input); -- cgit v1.2.3 From a6821f345fd508b17f5ce310b677b37aefb028dc Mon Sep 17 00:00:00 2001 From: Henrik Rydberg Date: Thu, 4 Sep 2008 22:28:31 -0400 Subject: Input: bcm5974 - 
add BTN_TOUCH event for mousedev benefit The mousedev driver requires the use of BTN_TOUCH events to process ABS_X and ABS_Y events properly, which is what is needed for the bcm5974-based apple computers to have a functional pointer out-of-the-box. Signed-off-by: Henrik Rydberg Signed-off-by: Dmitry Torokhov --- drivers/input/mouse/bcm5974.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/input/mouse/bcm5974.c b/drivers/input/mouse/bcm5974.c index 8568211c556..18f4d7f6ce6 100644 --- a/drivers/input/mouse/bcm5974.c +++ b/drivers/input/mouse/bcm5974.c @@ -253,6 +253,7 @@ static void setup_events_to_report(struct input_dev *input_dev, 0, cfg->y.dim, cfg->y.fuzz, 0); __set_bit(EV_KEY, input_dev->evbit); + __set_bit(BTN_TOUCH, input_dev->keybit); __set_bit(BTN_TOOL_FINGER, input_dev->keybit); __set_bit(BTN_TOOL_DOUBLETAP, input_dev->keybit); __set_bit(BTN_TOOL_TRIPLETAP, input_dev->keybit); @@ -320,6 +321,7 @@ static int report_tp_state(struct bcm5974 *dev, int size) if (dev->fingers > nmax) dev->fingers = nmax; + input_report_key(input, BTN_TOUCH, dev->fingers > 0); input_report_key(input, BTN_TOOL_FINGER, dev->fingers == 1); input_report_key(input, BTN_TOOL_DOUBLETAP, dev->fingers == 2); input_report_key(input, BTN_TOOL_TRIPLETAP, dev->fingers > 2); -- cgit v1.2.3 From 9ce1ca284a322ba6f9d691136a29c9cfe381e1fc Mon Sep 17 00:00:00 2001 From: Jiri Kosina Date: Thu, 4 Sep 2008 22:28:48 -0400 Subject: Input: i8042 - make Lenovo 3000 N100 blacklist entry more specific Apparently, there are more different versions of Lenovo 3000 N100, some of them working properly with active mux, and some of them requiring it being switched off. This patch applies 'nomux' only to the specific product name that is reported to behave badly unless 'nomux' is specified. Signed-off-by: Jiri Kosina Signed-off-by: Dmitry Torokhov --- drivers/input/serio/i8042-x86ia64io.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h index 3282b741e24..5aafe24984c 100644 --- a/drivers/input/serio/i8042-x86ia64io.h +++ b/drivers/input/serio/i8042-x86ia64io.h @@ -305,7 +305,7 @@ static struct dmi_system_id __initdata i8042_dmi_nomux_table[] = { .ident = "Lenovo 3000 n100", .matches = { DMI_MATCH(DMI_SYS_VENDOR, "LENOVO"), - DMI_MATCH(DMI_PRODUCT_VERSION, "3000 N100"), + DMI_MATCH(DMI_PRODUCT_NAME, "076804U"), }, }, { -- cgit v1.2.3 From 7c1e76897492d92b6a1c2d6892494d39ded9680c Mon Sep 17 00:00:00 2001 From: Venkatesh Pallipadi Date: Wed, 3 Sep 2008 21:36:50 +0000 Subject: clockevents: prevent clockevent event_handler ending up handler_noop There is a ordering related problem with clockevents code, due to which clockevents_register_device() called after tickless/highres switch will not work. The new clockevent ends up with clockevents_handle_noop as event handler, resulting in no timer activity. The problematic path seems to be * old device already has hrtimer_interrupt as the event_handler * new clockevent device registers with a higher rating * tick_check_new_device() is called * clockevents_exchange_device() gets called * old->event_handler is set to clockevents_handle_noop * tick_setup_device() is called for the new device * which sets new->event_handler using the old->event_handler which is noop. Change the ordering so that new device inherits the proper handler. This does not have any issue in normal case as most likely all the clockevent devices are setup before the highres switch. 
But it can potentially affect some corner case where HPET force detect happens after the highres switch. This was a problem with HPET in MSI mode code that we have been experimenting with. Signed-off-by: Venkatesh Pallipadi Signed-off-by: Shaohua Li Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- include/linux/clockchips.h | 2 ++ kernel/time/clockevents.c | 3 +-- kernel/time/tick-common.c | 1 + 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index c33b0dc28e4..ed3a5d473e5 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -127,6 +127,8 @@ extern int clockevents_register_notifier(struct notifier_block *nb); extern int clockevents_program_event(struct clock_event_device *dev, ktime_t expires, ktime_t now); +extern void clockevents_handle_noop(struct clock_event_device *dev); + #ifdef CONFIG_GENERIC_CLOCKEVENTS extern void clockevents_notify(unsigned long reason, void *arg); #else diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 3d1e3e1a197..1876b526c77 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -177,7 +177,7 @@ void clockevents_register_device(struct clock_event_device *dev) /* * Noop handler when we shut down an event device */ -static void clockevents_handle_noop(struct clock_event_device *dev) +void clockevents_handle_noop(struct clock_event_device *dev) { } @@ -199,7 +199,6 @@ void clockevents_exchange_device(struct clock_event_device *old, * released list and do a notify add later. */ if (old) { - old->event_handler = clockevents_handle_noop; clockevents_set_mode(old, CLOCK_EVT_MODE_UNUSED); list_del(&old->list); list_add(&old->list, &clockevents_released); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index 80c4336f418..c4777193d56 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -161,6 +161,7 @@ static void tick_setup_device(struct tick_device *td, } else { handler = td->evtdev->event_handler; next_event = td->evtdev->next_event; + td->evtdev->event_handler = clockevents_handle_noop; } td->evtdev = newdev; -- cgit v1.2.3 From d4496b39559c6d43f83e4c08b899984f8b8089b5 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:36:57 +0000 Subject: clockevents: prevent endless loop in periodic broadcast handler The reprogramming of the periodic broadcast handler was broken when the first programming returned -ETIME. The clockevents code stores the new expiry value in the clock events device next_event field only when the programmed time has not elapsed yet. The loop in question calculates the new expiry value from the next_event value and therefore never increases. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 31463d370b9..3044a88357f 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -175,6 +175,8 @@ static void tick_do_periodic_broadcast(void) */ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) { + ktime_t next; + tick_do_periodic_broadcast(); /* @@ -185,10 +187,13 @@ static void tick_handle_periodic_broadcast(struct clock_event_device *dev) /* * Setup the next period for devices, which do not have - * periodic mode: + * periodic mode. We read dev->next_event first and add to it + * when the event already expired.
clockevents_program_event() + * sets dev->next_event only when the event is really + * programmed to the device. */ - for (;;) { - ktime_t next = ktime_add(dev->next_event, tick_period); + for (next = dev->next_event; ;) { + next = ktime_add(next, tick_period); if (!clockevents_program_event(dev, next, ktime_get())) return; -- cgit v1.2.3 From 7205656ab48da29a95d7f55e43a81db755d3cb3a Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:37:03 +0000 Subject: clockevents: enforce reprogram in oneshot setup In tick_oneshot_setup we program the device to the given next_event, but we do not check the return value. We need to make sure that the device really gets programmed, so the interrupt handler engine starts working. Split out the reprogramming function from tick_program_event() and call it with the device, which was handed in to tick_setup_oneshot(). Set the force argument, so the device fires an interrupt. Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-oneshot.c | 18 ++++++++++++++---- 1 file changed, 14 insertions(+), 4 deletions(-) diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 450c04935b6..06595c64b0c 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -23,11 +23,11 @@ #include "tick-internal.h" /** - * tick_program_event + * tick_program_event internal worker function */ -int tick_program_event(ktime_t expires, int force) +static int __tick_program_event(struct clock_event_device *dev, + ktime_t expires, int force) { - struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; ktime_t now = ktime_get(); while (1) { @@ -40,6 +40,16 @@ int tick_program_event(ktime_t expires, int force) } } +/** + * tick_program_event + */ +int tick_program_event(ktime_t expires, int force) +{ + struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; + + return __tick_program_event(dev, expires, force); +} + /** * tick_resume_onshot - resume oneshot mode */ @@ -61,7 +71,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - clockevents_program_event(newdev, next_event, ktime_get()); + __tick_program_event(newdev, next_event, 1); } /** -- cgit v1.2.3 From 9c17bcda991000351cb2373f78be7e4b1c44caa3 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:37:08 +0000 Subject: clockevents: prevent multiple init/shutdown While chasing the C1E/HPET bug reports I went through the clock events code inch by inch and found that the broadcast device can be initialized and shut down multiple times. Multiple shutdowns are not critical, but a useless waste of time. Multiple initializations are simply broken. Another CPU might have the device in use already after the first initialization and the second init could just render it unusable again.
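
The shape of the fix can be sketched in a few lines of plain C (hypothetical names, kernel details omitted): the handler pointer doubles as an "already initialized" flag, so a second call becomes a no-op instead of reprogramming a device that may already be in use.

struct bc_dev {
        void (*event_handler)(struct bc_dev *dev);
};

static void oneshot_broadcast_handler(struct bc_dev *dev)
{
        (void)dev;      /* broadcast tick handling would go here */
}

static void setup_oneshot_broadcast(struct bc_dev *bc)
{
        if (bc->event_handler != oneshot_broadcast_handler) {
                bc->event_handler = oneshot_broadcast_handler;
                /* one-time mode switch and next_event reset go here */
        }
}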
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 3044a88357f..5744f40b269 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -210,7 +210,7 @@ static void tick_do_broadcast_on_off(void *why) struct clock_event_device *bc, *dev; struct tick_device *td; unsigned long flags, *reason = why; - int cpu; + int cpu, bc_stopped; spin_lock_irqsave(&tick_broadcast_lock, flags); @@ -228,6 +228,8 @@ static void tick_do_broadcast_on_off(void *why) if (!tick_device_is_functional(dev)) goto out; + bc_stopped = cpus_empty(tick_broadcast_mask); + switch (*reason) { case CLOCK_EVT_NOTIFY_BROADCAST_ON: case CLOCK_EVT_NOTIFY_BROADCAST_FORCE: @@ -250,9 +252,10 @@ static void tick_do_broadcast_on_off(void *why) break; } - if (cpus_empty(tick_broadcast_mask)) - clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); - else { + if (cpus_empty(tick_broadcast_mask)) { + if (!bc_stopped) + clockevents_set_mode(bc, CLOCK_EVT_MODE_SHUTDOWN); + } else if (bc_stopped) { if (tick_broadcast_device.mode == TICKDEV_MODE_PERIODIC) tick_broadcast_start_periodic(bc); else @@ -501,9 +504,12 @@ static void tick_broadcast_clear_oneshot(int cpu) */ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { - bc->event_handler = tick_handle_oneshot_broadcast; - clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; + /* Set it up only once ! */ + if (bc->event_handler != tick_handle_oneshot_broadcast) { + bc->event_handler = tick_handle_oneshot_broadcast; + clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); + bc->next_event.tv64 = KTIME_MAX; + } } /* -- cgit v1.2.3 From 1fb9b7d29d8e85ba3196eaa7ab871bf76fc98d36 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:37:14 +0000 Subject: clockevents: prevent endless loop lockup The C1E/HPET bug reports on AMDX2/RS690 systems were tracked down to a too small value of the HPET minimum delta for programming an event. The clockevents code needs to enforce an interrupt event on the clock event device in some cases. The enforcement code was stupid and naive, as it just added the minimum delta to the current time and tried to reprogram the device. When the minimum delta is too small, this loops forever. Add a sanity check. Allow reprogramming to fail 3 times, then print a warning and double the minimum delta value to make sure that this does not happen again. Use the same function for both tick-oneshot and tick-broadcast code.
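
The retry-and-double policy can be restated as a small standalone C sketch; the 5 us success threshold and the 30 ns starting value below are invented for the example and are not the kernel's numbers:

#include <stdio.h>

static long min_delta_ns = 30;                  /* assumed too-small floor */

static int program_event(long delta_ns)
{
        return delta_ns >= 5000 ? 0 : -1;       /* stand-in for the real device */
}

static int program_with_backoff(long delta_ns)
{
        int tries = 0;

        for (;;) {
                if (!program_event(delta_ns))
                        return 0;
                if (++tries > 2) {
                        printf("min_delta_ns %ld too small, doubling to %ld\n",
                               min_delta_ns, min_delta_ns << 1);
                        min_delta_ns = min_delta_ns ? min_delta_ns << 1 : 5000;
                        tries = 0;
                }
                delta_ns = min_delta_ns;
        }
}

int main(void)
{
        return program_with_backoff(min_delta_ns);
}

Because the delta doubles after every third failure, the loop is guaranteed to terminate once the value crosses whatever the hardware actually needs.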
Signed-off-by: Thomas Gleixner Signed-off-by: Ingo Molnar --- kernel/time/tick-broadcast.c | 12 ++---------- kernel/time/tick-internal.h | 2 ++ kernel/time/tick-oneshot.c | 36 ++++++++++++++++++++++++++++++------ 3 files changed, 34 insertions(+), 16 deletions(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 5744f40b269..2bc1f046151 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -372,16 +372,8 @@ cpumask_t *tick_get_broadcast_oneshot_mask(void) static int tick_broadcast_set_event(ktime_t expires, int force) { struct clock_event_device *bc = tick_broadcast_device.evtdev; - ktime_t now = ktime_get(); - int res; - - for(;;) { - res = clockevents_program_event(bc, expires, now); - if (!res || !force) - return res; - now = ktime_get(); - expires = ktime_add(now, ktime_set(0, bc->min_delta_ns)); - } + + return tick_dev_program_event(bc, expires, force); } int tick_resume_broadcast_oneshot(struct clock_event_device *bc) diff --git a/kernel/time/tick-internal.h b/kernel/time/tick-internal.h index f13f2b7f4fd..0ffc2918ea6 100644 --- a/kernel/time/tick-internal.h +++ b/kernel/time/tick-internal.h @@ -17,6 +17,8 @@ extern void tick_handle_periodic(struct clock_event_device *dev); extern void tick_setup_oneshot(struct clock_event_device *newdev, void (*handler)(struct clock_event_device *), ktime_t nextevt); +extern int tick_dev_program_event(struct clock_event_device *dev, + ktime_t expires, int force); extern int tick_program_event(ktime_t expires, int force); extern void tick_oneshot_notify(void); extern int tick_switch_to_oneshot(void (*handler)(struct clock_event_device *)); diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 06595c64b0c..2e35501e61d 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -25,18 +25,42 @@ /** * tick_program_event internal worker function */ -static int __tick_program_event(struct clock_event_device *dev, - ktime_t expires, int force) +int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, + int force) { ktime_t now = ktime_get(); + int i; - while (1) { + for (i = 0;;) { int ret = clockevents_program_event(dev, expires, now); if (!ret || !force) return ret; + + /* + * We tried 2 times to program the device with the given + * min_delta_ns. If that's not working then we double it + * and emit a warning. + */ + if (++i > 2) { + printk(KERN_WARNING "CE: __tick_program_event of %s is " + "stuck %llx %llx\n", dev->name ? dev->name : "?", + now.tv64, expires.tv64); + printk(KERN_WARNING + "CE: increasing min_delta_ns %ld to %ld nsec\n", + dev->min_delta_ns, dev->min_delta_ns << 1); + WARN_ON(1); + + /* Double the min. 
delta and try again */ + if (!dev->min_delta_ns) + dev->min_delta_ns = 5000; + else + dev->min_delta_ns <<= 1; + i = 0; + } + now = ktime_get(); - expires = ktime_add(now, ktime_set(0, dev->min_delta_ns)); + expires = ktime_add_ns(now, dev->min_delta_ns); } } @@ -47,7 +71,7 @@ int tick_program_event(ktime_t expires, int force) { struct clock_event_device *dev = __get_cpu_var(tick_cpu_device).evtdev; - return __tick_program_event(dev, expires, force); + return tick_dev_program_event(dev, expires, force); } /** @@ -71,7 +95,7 @@ void tick_setup_oneshot(struct clock_event_device *newdev, { newdev->event_handler = handler; clockevents_set_mode(newdev, CLOCK_EVT_MODE_ONESHOT); - __tick_program_event(newdev, next_event, 1); + tick_dev_program_event(newdev, next_event, 1); } /** -- cgit v1.2.3 From 7cfb0435330364f90f274a26ecdc5f47f738498c Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Wed, 3 Sep 2008 21:37:24 +0000 Subject: HPET: make minimum reprogramming delta useful The minimum reprogramming delta was hardcoded in HPET ticks, which is stupid as it does not work with faster running HPETs. The C1E idle patches made this prominent on AMD/RS690 chipsets, where the HPET runs with 25MHz. Set it to 5us which seems to be a reasonable value and fixes the problems on the bug reporters machines. We have a further sanity check now in the clock events, which increases the delta when it is not sufficient. Signed-off-by: Thomas Gleixner Tested-by: Luiz Fernando N. Capitulino Tested-by: Dmitry Nezhevenko Signed-off-by: Ingo Molnar --- arch/x86/kernel/hpet.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 59fd3b6b130..2256315416d 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -210,8 +210,8 @@ static void hpet_legacy_clockevent_register(void) /* Calculate the min / max delta */ hpet_clockevent.max_delta_ns = clockevent_delta2ns(0x7FFFFFFF, &hpet_clockevent); - hpet_clockevent.min_delta_ns = clockevent_delta2ns(0x30, - &hpet_clockevent); + /* 5 usec minimum reprogramming delta. */ + hpet_clockevent.min_delta_ns = 5000; /* * Start hpet with the boot cpu mask and make it -- cgit v1.2.3 From f171d4d769c8ccac6675892960e37f6485837fae Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 3 Sep 2008 16:17:14 +0300 Subject: UBIFS: fix division by zero If fanout is 3, we have division by zero in 'ubifs_read_superblock()': divide error: 0000 [#1] PREEMPT SMP Pid: 28744, comm: mount Not tainted (2.6.27-rc4-ubifs-2.6 #23) EIP: 0060:[] EFLAGS: 00010202 CPU: 0 EIP is at ubifs_reported_space+0x2d/0x69 [ubifs] EAX: 00000000 EBX: 00000000 ECX: 00000000 EDX: 00000000 ESI: 00000000 EDI: f0ae64b0 EBP: f1f9fcf4 ESP: f1f9fce0 DS: 007b ES: 007b FS: 00d8 GS: 0033 SS: 0068 Signed-off-by: Artem Bityutskiy --- fs/ubifs/budget.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/fs/ubifs/budget.c b/fs/ubifs/budget.c index 101d278c591..73db464cd08 100644 --- a/fs/ubifs/budget.c +++ b/fs/ubifs/budget.c @@ -723,24 +723,25 @@ void ubifs_release_dirty_inode_budget(struct ubifs_info *c, */ long long ubifs_reported_space(const struct ubifs_info *c, uint64_t free) { - int divisor, factor; + int divisor, factor, f; /* * Reported space size is @free * X, where X is UBIFS block size * divided by UBIFS block size + all overhead one data block * introduces. The overhead is the node header + indexing overhead. 
* - * Indexing overhead is calculations are based on the following - * formula: I = N/(f - 1) + 1, where I - number of indexing nodes, N - - * number of data nodes, f - fanout. Because effective UBIFS fanout is - * twice as less than maximum fanout, we assume that each data node + * Indexing overhead calculations are based on the following formula: + * I = N/(f - 1) + 1, where I - number of indexing nodes, N - number + * of data nodes, f - fanout. Because effective UBIFS fanout is twice + * as less than maximum fanout, we assume that each data node * introduces 3 * @c->max_idx_node_sz / (@c->fanout/2 - 1) bytes. * Note, the multiplier 3 is because UBIFS reseves thrice as more space * for the index. */ + f = c->fanout > 3 ? c->fanout >> 1 : 2; factor = UBIFS_BLOCK_SIZE; divisor = UBIFS_MAX_DATA_NODE_SZ; - divisor += (c->max_idx_node_sz * 3) / ((c->fanout >> 1) - 1); + divisor += (c->max_idx_node_sz * 3) / (f - 1); free *= factor; do_div(free, divisor); return free; -- cgit v1.2.3 From a5cb562d6977d9d7989c346b7b153cef31ec0228 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 3 Sep 2008 18:26:47 +0300 Subject: UBIFS: make minimum fanout 3 UBIFS does not really work correctly when fanout is 2, because of the way we manage the indexing tree. It may just become a list and UBIFS screws up. Signed-off-by: Artem Bityutskiy --- fs/ubifs/ubifs-media.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ubifs/ubifs-media.h b/fs/ubifs/ubifs-media.h index bd2121f3426..a9ecbd9af20 100644 --- a/fs/ubifs/ubifs-media.h +++ b/fs/ubifs/ubifs-media.h @@ -87,7 +87,7 @@ #define UBIFS_SK_LEN 8 /* Minimum index tree fanout */ -#define UBIFS_MIN_FANOUT 2 +#define UBIFS_MIN_FANOUT 3 /* Maximum number of levels in UBIFS indexing B-tree */ #define UBIFS_MAX_LEVELS 512 -- cgit v1.2.3 From 0510617b85758b6e66f3c602ceccea1826440470 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Mon, 4 Aug 2008 19:44:34 +0200 Subject: [MIPS] Fix data bus error recovery With -ffunction-section the entries in __dbe_table aren't no longer sorted, so the lookup of exception addresses in do_be() failed for some addresses. To avoid this we now sort __dbe_table. Signed-off-by: Thomas Bogendoerfer Signed-off-by: Ralf Baechle --- arch/mips/kernel/traps.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 426cced1e9d..1f579a8ea25 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -373,8 +373,8 @@ void __noreturn die(const char * str, const struct pt_regs * regs) do_exit(SIGSEGV); } -extern const struct exception_table_entry __start___dbe_table[]; -extern const struct exception_table_entry __stop___dbe_table[]; +extern struct exception_table_entry __start___dbe_table[]; +extern struct exception_table_entry __stop___dbe_table[]; __asm__( " .section __dbe_table, \"a\"\n" @@ -1682,4 +1682,6 @@ void __init trap_init(void) flush_icache_range(ebase, ebase + 0x400); flush_tlb_handlers(); + + sort_extable(__start___dbe_table, __stop___dbe_table); } -- cgit v1.2.3 From e0cee3eea7875800451739ae38f99edcf11c133d Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Mon, 4 Aug 2008 20:53:57 +0200 Subject: [MIPS] Fix WARNING: at kernel/smp.c:290 trap_init issues flush_icache_range(), which uses ipi functions to get icache flushing done on all cpus. But this is done before interrupts are enabled and caused WARN_ON messages. 
This changeset introduces a new local_flush_icache_range() and uses it before interrupts (and additional CPUs) are enabled to avoid this problem. Signed-off-by: Thomas Bogendoerfer Signed-off-by: Ralf Baechle --- arch/mips/kernel/traps.c | 12 +++++++----- arch/mips/mm/c-r3k.c | 1 + arch/mips/mm/c-r4k.c | 18 ++++++++++++------ arch/mips/mm/c-tx39.c | 1 + arch/mips/mm/cache.c | 1 + arch/mips/mm/tlbex.c | 6 +++--- include/asm-mips/cacheflush.h | 1 + 7 files changed, 26 insertions(+), 14 deletions(-) diff --git a/arch/mips/kernel/traps.c b/arch/mips/kernel/traps.c index 1f579a8ea25..6bee29097a5 100644 --- a/arch/mips/kernel/traps.c +++ b/arch/mips/kernel/traps.c @@ -1200,7 +1200,7 @@ void *set_except_vector(int n, void *addr) if (n == 0 && cpu_has_divec) { *(u32 *)(ebase + 0x200) = 0x08000000 | (0x03ffffff & (handler >> 2)); - flush_icache_range(ebase + 0x200, ebase + 0x204); + local_flush_icache_range(ebase + 0x200, ebase + 0x204); } return (void *)old_handler; } @@ -1283,7 +1283,8 @@ static void *set_vi_srs_handler(int n, vi_handler_t addr, int srs) *w = (*w & 0xffff0000) | (((u32)handler >> 16) & 0xffff); w = (u32 *)(b + ori_offset); *w = (*w & 0xffff0000) | ((u32)handler & 0xffff); - flush_icache_range((unsigned long)b, (unsigned long)(b+handler_len)); + local_flush_icache_range((unsigned long)b, + (unsigned long)(b+handler_len)); } else { /* @@ -1295,7 +1296,8 @@ static void *set_vi_srs_handler(int n, vi_handler_t addr, int srs) w = (u32 *)b; *w++ = 0x08000000 | (((u32)handler >> 2) & 0x03fffff); /* j handler */ *w = 0; - flush_icache_range((unsigned long)b, (unsigned long)(b+8)); + local_flush_icache_range((unsigned long)b, + (unsigned long)(b+8)); } return (void *)old_handler; @@ -1515,7 +1517,7 @@ void __cpuinit per_cpu_trap_init(void) void __init set_handler(unsigned long offset, void *addr, unsigned long size) { memcpy((void *)(ebase + offset), addr, size); - flush_icache_range(ebase + offset, ebase + offset + size); + local_flush_icache_range(ebase + offset, ebase + offset + size); } static char panic_null_cerr[] __cpuinitdata = @@ -1680,7 +1682,7 @@ void __init trap_init(void) signal32_init(); #endif - flush_icache_range(ebase, ebase + 0x400); + local_flush_icache_range(ebase, ebase + 0x400); flush_tlb_handlers(); sort_extable(__start___dbe_table, __stop___dbe_table); diff --git a/arch/mips/mm/c-r3k.c b/arch/mips/mm/c-r3k.c index 27a5b466c85..5500c20c79a 100644 --- a/arch/mips/mm/c-r3k.c +++ b/arch/mips/mm/c-r3k.c @@ -320,6 +320,7 @@ void __cpuinit r3k_cache_init(void) flush_cache_range = r3k_flush_cache_range; flush_cache_page = r3k_flush_cache_page; flush_icache_range = r3k_flush_icache_range; + local_flush_icache_range = r3k_flush_icache_range; flush_cache_sigtramp = r3k_flush_cache_sigtramp; local_flush_data_cache_page = local_r3k_flush_data_cache_page; diff --git a/arch/mips/mm/c-r4k.c b/arch/mips/mm/c-r4k.c index 71df3390c07..6e99665ae86 100644 --- a/arch/mips/mm/c-r4k.c +++ b/arch/mips/mm/c-r4k.c @@ -543,12 +543,8 @@ struct flush_icache_range_args { unsigned long end; }; -static inline void local_r4k_flush_icache_range(void *args) +static inline void local_r4k_flush_icache_range(unsigned long start, unsigned long end) { - struct flush_icache_range_args *fir_args = args; - unsigned long start = fir_args->start; - unsigned long end = fir_args->end; - if (!cpu_has_ic_fills_f_dc) { if (end - start >= dcache_size) { r4k_blast_dcache(); @@ -564,6 +560,15 @@ static inline void local_r4k_flush_icache_range(void *args) protected_blast_icache_range(start, end); } +static inline void 
local_r4k_flush_icache_range_ipi(void *args) +{ + struct flush_icache_range_args *fir_args = args; + unsigned long start = fir_args->start; + unsigned long end = fir_args->end; + + local_r4k_flush_icache_range(start, end); +} + static void r4k_flush_icache_range(unsigned long start, unsigned long end) { struct flush_icache_range_args args; @@ -571,7 +576,7 @@ static void r4k_flush_icache_range(unsigned long start, unsigned long end) args.start = start; args.end = end; - r4k_on_each_cpu(local_r4k_flush_icache_range, &args, 1); + r4k_on_each_cpu(local_r4k_flush_icache_range_ipi, &args, 1); instruction_hazard(); } @@ -1375,6 +1380,7 @@ void __cpuinit r4k_cache_init(void) local_flush_data_cache_page = local_r4k_flush_data_cache_page; flush_data_cache_page = r4k_flush_data_cache_page; flush_icache_range = r4k_flush_icache_range; + local_flush_icache_range = local_r4k_flush_icache_range; #if defined(CONFIG_DMA_NONCOHERENT) if (coherentio) { diff --git a/arch/mips/mm/c-tx39.c b/arch/mips/mm/c-tx39.c index a9f7f1f5e9b..39c81820ce8 100644 --- a/arch/mips/mm/c-tx39.c +++ b/arch/mips/mm/c-tx39.c @@ -362,6 +362,7 @@ void __cpuinit tx39_cache_init(void) flush_cache_range = (void *) tx39h_flush_icache_all; flush_cache_page = (void *) tx39h_flush_icache_all; flush_icache_range = (void *) tx39h_flush_icache_all; + local_flush_icache_range = (void *) tx39h_flush_icache_all; flush_cache_sigtramp = (void *) tx39h_flush_icache_all; local_flush_data_cache_page = (void *) tx39h_flush_icache_all; diff --git a/arch/mips/mm/cache.c b/arch/mips/mm/cache.c index 034e8506f6e..1eb7c71e3d6 100644 --- a/arch/mips/mm/cache.c +++ b/arch/mips/mm/cache.c @@ -29,6 +29,7 @@ void (*flush_cache_range)(struct vm_area_struct *vma, unsigned long start, void (*flush_cache_page)(struct vm_area_struct *vma, unsigned long page, unsigned long pfn); void (*flush_icache_range)(unsigned long start, unsigned long end); +void (*local_flush_icache_range)(unsigned long start, unsigned long end); void (*__flush_cache_vmap)(void); void (*__flush_cache_vunmap)(void); diff --git a/arch/mips/mm/tlbex.c b/arch/mips/mm/tlbex.c index 76da73a5ab3..979cf919728 100644 --- a/arch/mips/mm/tlbex.c +++ b/arch/mips/mm/tlbex.c @@ -1273,10 +1273,10 @@ void __cpuinit build_tlb_refill_handler(void) void __cpuinit flush_tlb_handlers(void) { - flush_icache_range((unsigned long)handle_tlbl, + local_flush_icache_range((unsigned long)handle_tlbl, (unsigned long)handle_tlbl + sizeof(handle_tlbl)); - flush_icache_range((unsigned long)handle_tlbs, + local_flush_icache_range((unsigned long)handle_tlbs, (unsigned long)handle_tlbs + sizeof(handle_tlbs)); - flush_icache_range((unsigned long)handle_tlbm, + local_flush_icache_range((unsigned long)handle_tlbm, (unsigned long)handle_tlbm + sizeof(handle_tlbm)); } diff --git a/include/asm-mips/cacheflush.h b/include/asm-mips/cacheflush.h index d5c0f2fda51..03b1d69b142 100644 --- a/include/asm-mips/cacheflush.h +++ b/include/asm-mips/cacheflush.h @@ -63,6 +63,7 @@ static inline void flush_icache_page(struct vm_area_struct *vma, } extern void (*flush_icache_range)(unsigned long start, unsigned long end); +extern void (*local_flush_icache_range)(unsigned long start, unsigned long end); extern void (*__flush_cache_vmap)(void); -- cgit v1.2.3 From 073828d078d3e55b23b2b5937a49dc1019c5907d Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Tue, 26 Aug 2008 21:29:58 +0900 Subject: [MIPS] TXx9: Fix txx9_pcode initialization The txx9_pcode variable was introduced in commit fe1c2bc64f65003b39f331a8e4b0d15b235a4afd ("TXx9: Add 64-bit 
support") but was not initialized properly. Signed-off-by: Atsushi Nemoto Signed-off-by: Ralf Baechle --- arch/mips/txx9/generic/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/txx9/generic/setup.c b/arch/mips/txx9/generic/setup.c index 0afe94c48fb..fe6bee09cec 100644 --- a/arch/mips/txx9/generic/setup.c +++ b/arch/mips/txx9/generic/setup.c @@ -53,6 +53,7 @@ txx9_reg_res_init(unsigned int pcode, unsigned long base, unsigned long size) txx9_ce_res[i].name = txx9_ce_res_name[i]; } + txx9_pcode = pcode; sprintf(txx9_pcode_str, "TX%x", pcode); if (base) { txx9_reg_res.start = base & 0xfffffffffULL; -- cgit v1.2.3 From 3885ec8ca29e5e33e9a5f0ae9dc849d798634ec9 Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Tue, 26 Aug 2008 22:30:41 +0900 Subject: [MIPS] TX39xx: Add missing local_flush_icache_range initialization Commmit 59e39ecd933ba49eb6efe84cbfa5597a6c9ef18a ("Fix WARNING: at kernel/smp.c:290") introduced local_flush_icache_range but lacks initialization for some TX39 case. Signed-off-by: Atsushi Nemoto Signed-off-by: Ralf Baechle --- arch/mips/mm/c-tx39.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/mips/mm/c-tx39.c b/arch/mips/mm/c-tx39.c index 39c81820ce8..f7c8f9ce39c 100644 --- a/arch/mips/mm/c-tx39.c +++ b/arch/mips/mm/c-tx39.c @@ -391,6 +391,7 @@ void __cpuinit tx39_cache_init(void) flush_cache_range = tx39_flush_cache_range; flush_cache_page = tx39_flush_cache_page; flush_icache_range = tx39_flush_icache_range; + local_flush_icache_range = tx39_flush_icache_range; flush_cache_sigtramp = tx39_flush_cache_sigtramp; local_flush_data_cache_page = local_tx39_flush_data_cache_page; -- cgit v1.2.3 From 0011036beeffeada15acd1936d67988de21ca65e Mon Sep 17 00:00:00 2001 From: Atsushi Nemoto Date: Tue, 26 Aug 2008 22:34:57 +0900 Subject: [MIPS] Probe initrd header only if explicitly specified Currently init_initrd() probes initrd header at the last page of kernel image, but it is valid only if addinitrd was used. If addinitrd was not used, the area contains garbage so probing there might misdetect initrd header (magic number is not strictly robust). This patch introduces CONFIG_PROBE_INITRD_HEADER to explicitly enable this probing. Signed-off-by: Atsushi Nemoto Signed-off-by: Ralf Baechle --- arch/mips/Kconfig | 9 +++++++++ arch/mips/kernel/setup.c | 33 ++++++++++++++++++--------------- 2 files changed, 27 insertions(+), 15 deletions(-) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index 4da736e2533..49896a2a1d7 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -1886,6 +1886,15 @@ config STACKTRACE_SUPPORT source "init/Kconfig" +config PROBE_INITRD_HEADER + bool "Probe initrd header created by addinitrd" + depends on BLK_DEV_INITRD + help + Probe initrd header at the last page of kernel image. + Say Y here if you are using arch/mips/boot/addinitrd.c to + add initrd or initramfs image to the kernel image. + Otherwise, say N. + menu "Bus options (PCI, PCMCIA, EISA, ISA, TC)" config HW_HAS_EISA diff --git a/arch/mips/kernel/setup.c b/arch/mips/kernel/setup.c index 2aae76bce29..16f8edfe5cd 100644 --- a/arch/mips/kernel/setup.c +++ b/arch/mips/kernel/setup.c @@ -160,30 +160,33 @@ early_param("rd_size", rd_size_early); static unsigned long __init init_initrd(void) { unsigned long end; - u32 *initrd_header; /* * Board specific code or command line parser should have * already set up initrd_start and initrd_end. In these cases * perfom sanity checks and use them if all looks good. 
*/ - if (initrd_start && initrd_end > initrd_start) - goto sanitize; + if (!initrd_start || initrd_end <= initrd_start) { +#ifdef CONFIG_PROBE_INITRD_HEADER + u32 *initrd_header; - /* - * See if initrd has been added to the kernel image by - * arch/mips/boot/addinitrd.c. In that case a header is - * prepended to initrd and is made up by 8 bytes. The fisrt - * word is a magic number and the second one is the size of - * initrd. Initrd start must be page aligned in any cases. - */ - initrd_header = __va(PAGE_ALIGN(__pa_symbol(&_end) + 8)) - 8; - if (initrd_header[0] != 0x494E5244) + /* + * See if initrd has been added to the kernel image by + * arch/mips/boot/addinitrd.c. In that case a header is + * prepended to initrd and is made up by 8 bytes. The first + * word is a magic number and the second one is the size of + * initrd. Initrd start must be page aligned in any cases. + */ + initrd_header = __va(PAGE_ALIGN(__pa_symbol(&_end) + 8)) - 8; + if (initrd_header[0] != 0x494E5244) + goto disable; + initrd_start = (unsigned long)(initrd_header + 2); + initrd_end = initrd_start + initrd_header[1]; +#else goto disable; - initrd_start = (unsigned long)(initrd_header + 2); - initrd_end = initrd_start + initrd_header[1]; +#endif + } -sanitize: if (initrd_start & ~PAGE_MASK) { pr_err("initrd start must be page aligned\n"); goto disable; -- cgit v1.2.3 From b74b06c5f6612a72298f37baa65460a59c26ca67 Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Fri, 15 Aug 2008 15:36:31 -0700 Subject: x86: boot: stub out unimplemented CPU feature words The CPU feature detection code in the boot code is somewhat minimal, and doesn't include all possible CPUID words. In particular, it doesn't contain the code for CPU feature words 2 (Transmeta), 3 (Linux-specific), 5 (VIA), or 7 (scattered). Zero them out, so we can still set those bits as known at compile time; in particular, this allows creating a Linux-specific NOPL flag and have it required (and therefore resolvable at compile time) in 64-bit mode. Signed-off-by: H. Peter Anvin --- arch/x86/boot/cpucheck.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/boot/cpucheck.c b/arch/x86/boot/cpucheck.c index 4b9ae7c5674..4d3ff037201 100644 --- a/arch/x86/boot/cpucheck.c +++ b/arch/x86/boot/cpucheck.c @@ -38,12 +38,12 @@ static const u32 req_flags[NCAPINTS] = { REQUIRED_MASK0, REQUIRED_MASK1, - REQUIRED_MASK2, - REQUIRED_MASK3, + 0, /* REQUIRED_MASK2 not implemented in this file */ + 0, /* REQUIRED_MASK3 not implemented in this file */ REQUIRED_MASK4, - REQUIRED_MASK5, + 0, /* REQUIRED_MASK5 not implemented in this file */ REQUIRED_MASK6, - REQUIRED_MASK7, + 0, /* REQUIRED_MASK7 not implemented in this file */ }; #define A32(a, b, c, d) (((d) << 24)+((c) << 16)+((b) << 8)+(a)) -- cgit v1.2.3 From b6734c35af028f06772c0b2c836c7d579e6d4dad Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Aug 2008 17:39:32 -0700 Subject: x86: add NOPL as a synthetic CPU feature bit The long noops ("NOPL") are supposed to be detected by family >= 6. Unfortunately, several non-Intel x86 implementations, both hardware and software, don't obey this dictum. Instead, probe for NOPL directly by executing a NOPL instruction and see if we get #UD. Signed-off-by: H. 
Peter Anvin --- arch/x86/kernel/cpu/common.c | 32 +++++++++++++++++++++++++++++++- arch/x86/kernel/cpu/common_64.c | 36 ++++++++++++++++++++++++++++++++++++ arch/x86/kernel/cpu/feature_names.c | 3 ++- include/asm-x86/cpufeature.h | 11 ++++++----- include/asm-x86/required-features.h | 8 +++++++- 5 files changed, 82 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 80ab20d4fa3..0785b3c8d04 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -13,6 +13,7 @@ #include #include #include +#include #ifdef CONFIG_X86_LOCAL_APIC #include #include @@ -341,6 +342,35 @@ static void __init early_cpu_detect(void) early_get_cap(c); } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit generic_identify(struct cpuinfo_x86 *c) { u32 tfms, xlvl; @@ -395,8 +425,8 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) } init_scattered_cpuid_features(c); + detect_nopl(c); } - } static void __cpuinit squash_the_stupid_serial_number(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index dd6e3f15017..c3afba5a81a 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -18,6 +18,7 @@ #include #include #include +#include #include #ifdef CONFIG_X86_LOCAL_APIC #include @@ -215,6 +216,39 @@ static void __init early_cpu_support_print(void) } } +/* + * The NOPL instruction is supposed to exist on all CPUs with + * family >= 6, unfortunately, that's not true in practice because + * of early VIA chips and (more importantly) broken virtualizers that + * are not easy to detect. Hence, probe for it based on first + * principles. + * + * Note: no 64-bit chip is known to lack these, but put the code here + * for consistency with 32 bits, and to make it utterly trivial to + * diagnose the problem should it ever surface. 
+ */ +static void __cpuinit detect_nopl(struct cpuinfo_x86 *c) +{ + const u32 nopl_signature = 0x888c53b1; /* Random number */ + u32 has_nopl = nopl_signature; + + clear_cpu_cap(c, X86_FEATURE_NOPL); + if (c->x86 >= 6) { + asm volatile("\n" + "1: .byte 0x0f,0x1f,0xc0\n" /* nopl %eax */ + "2:\n" + " .section .fixup,\"ax\"\n" + "3: xor %0,%0\n" + " jmp 2b\n" + " .previous\n" + _ASM_EXTABLE(1b,3b) + : "+a" (has_nopl)); + + if (has_nopl == nopl_signature) + set_cpu_cap(c, X86_FEATURE_NOPL); + } +} + static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c); void __init early_cpu_init(void) @@ -313,6 +347,8 @@ static void __cpuinit early_identify_cpu(struct cpuinfo_x86 *c) c->x86_phys_bits = eax & 0xff; } + detect_nopl(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); diff --git a/arch/x86/kernel/cpu/feature_names.c b/arch/x86/kernel/cpu/feature_names.c index e43ad4ad4cb..c9017799497 100644 --- a/arch/x86/kernel/cpu/feature_names.c +++ b/arch/x86/kernel/cpu/feature_names.c @@ -39,7 +39,8 @@ const char * const x86_cap_flags[NCAPINTS*32] = { NULL, NULL, NULL, NULL, "constant_tsc", "up", NULL, "arch_perfmon", "pebs", "bts", NULL, NULL, - "rep_good", NULL, NULL, NULL, NULL, NULL, NULL, NULL, + "rep_good", NULL, NULL, NULL, + "nopl", NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, NULL, /* Intel-defined (#2) */ diff --git a/include/asm-x86/cpufeature.h b/include/asm-x86/cpufeature.h index 762f6a6bc70..9489283a4bc 100644 --- a/include/asm-x86/cpufeature.h +++ b/include/asm-x86/cpufeature.h @@ -72,14 +72,15 @@ #define X86_FEATURE_UP (3*32+ 9) /* smp kernel running on up */ #define X86_FEATURE_FXSAVE_LEAK (3*32+10) /* FXSAVE leaks FOP/FIP/FOP */ #define X86_FEATURE_ARCH_PERFMON (3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 (3*32+14) /* syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 (3*32+15) /* sysenter in ia32 userspace */ +#define X86_FEATURE_PEBS (3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS (3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 (3*32+14) /* syscall in ia32 userspace */ +#define X86_FEATURE_SYSENTER32 (3*32+15) /* sysenter in ia32 userspace */ #define X86_FEATURE_REP_GOOD (3*32+16) /* rep microcode works well on this CPU */ #define X86_FEATURE_MFENCE_RDTSC (3*32+17) /* Mfence synchronizes RDTSC */ #define X86_FEATURE_LFENCE_RDTSC (3*32+18) /* Lfence synchronizes RDTSC */ -#define X86_FEATURE_11AP (3*32+19) /* Bad local APIC aka 11AP */ +#define X86_FEATURE_11AP (3*32+19) /* Bad local APIC aka 11AP */ +#define X86_FEATURE_NOPL (3*32+20) /* The NOPL (0F 1F) instructions */ /* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ #define X86_FEATURE_XMM3 (4*32+ 0) /* Streaming SIMD Extensions-3 */ diff --git a/include/asm-x86/required-features.h b/include/asm-x86/required-features.h index adec887dd7c..5c2ff4bc298 100644 --- a/include/asm-x86/required-features.h +++ b/include/asm-x86/required-features.h @@ -41,6 +41,12 @@ # define NEED_3DNOW 0 #endif +#if defined(CONFIG_X86_P6_NOP) || defined(CONFIG_X86_64) +# define NEED_NOPL (1<<(X86_FEATURE_NOPL & 31)) +#else +# define NEED_NOPL 0 +#endif + #ifdef CONFIG_X86_64 #define NEED_PSE 0 #define NEED_MSR (1<<(X86_FEATURE_MSR & 31)) @@ -67,7 +73,7 @@ #define REQUIRED_MASK1 (NEED_LM|NEED_3DNOW) #define REQUIRED_MASK2 0 -#define 
REQUIRED_MASK3 0 +#define REQUIRED_MASK3 (NEED_NOPL) #define REQUIRED_MASK4 0 #define REQUIRED_MASK5 0 #define REQUIRED_MASK6 0 -- cgit v1.2.3 From f31d731e4467e61de51d7f6d7115f3b712d9354c Mon Sep 17 00:00:00 2001 From: "H. Peter Anvin" Date: Mon, 18 Aug 2008 17:50:33 -0700 Subject: x86: use X86_FEATURE_NOPL in alternatives Use X86_FEATURE_NOPL to determine if it is safe to use P6 NOPs in alternatives. Also, replace table and loop with simple if statement. Signed-off-by: H. Peter Anvin --- arch/x86/kernel/alternative.c | 36 +++++++++++++----------------------- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 2763cb37b55..65a0c1b4869 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -145,35 +145,25 @@ static const unsigned char *const p6_nops[ASM_NOP_MAX+1] = { extern char __vsyscall_0; const unsigned char *const *find_nop_table(void) { - return boot_cpu_data.x86_vendor != X86_VENDOR_INTEL || - boot_cpu_data.x86 < 6 ? k8_nops : p6_nops; + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return k8_nops; } #else /* CONFIG_X86_64 */ -static const struct nop { - int cpuid; - const unsigned char *const *noptable; -} noptypes[] = { - { X86_FEATURE_K8, k8_nops }, - { X86_FEATURE_K7, k7_nops }, - { X86_FEATURE_P4, p6_nops }, - { X86_FEATURE_P3, p6_nops }, - { -1, NULL } -}; - const unsigned char *const *find_nop_table(void) { - const unsigned char *const *noptable = intel_nops; - int i; - - for (i = 0; noptypes[i].cpuid >= 0; i++) { - if (boot_cpu_has(noptypes[i].cpuid)) { - noptable = noptypes[i].noptable; - break; - } - } - return noptable; + if (boot_cpu_has(X86_FEATURE_K8)) + return k8_nops; + else if (boot_cpu_has(X86_FEATURE_K7)) + return k7_nops; + else if (boot_cpu_has(X86_FEATURE_NOPL)) + return p6_nops; + else + return intel_nops; } #endif /* CONFIG_X86_64 */ -- cgit v1.2.3 From 7300711e8c6824fcfbd42a126980ff50439d8dd0 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2008 03:01:45 +0200 Subject: clockevents: broadcast fixup possible waiters Until the C1E patches arrived there where no users of periodic broadcast before switching to oneshot mode. Now we need to trigger a possible waiter for a periodic broadcast when switching to oneshot mode. Otherwise we can starve them for ever. Signed-off-by: Thomas Gleixner --- kernel/time/tick-broadcast.c | 37 ++++++++++++++++++++++++++++++++++++- 1 file changed, 36 insertions(+), 1 deletion(-) diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index 2bc1f046151..2f5a38294bf 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -491,6 +491,18 @@ static void tick_broadcast_clear_oneshot(int cpu) cpu_clear(cpu, tick_broadcast_oneshot_mask); } +static void tick_broadcast_init_next_event(cpumask_t *mask, ktime_t expires) +{ + struct tick_device *td; + int cpu; + + for_each_cpu_mask_nr(cpu, *mask) { + td = &per_cpu(tick_cpu_device, cpu); + if (td->evtdev) + td->evtdev->next_event = expires; + } +} + /** * tick_broadcast_setup_oneshot - setup the broadcast device */ @@ -498,9 +510,32 @@ void tick_broadcast_setup_oneshot(struct clock_event_device *bc) { /* Set it up only once ! 
*/ if (bc->event_handler != tick_handle_oneshot_broadcast) { + int was_periodic = bc->mode == CLOCK_EVT_MODE_PERIODIC; + int cpu = smp_processor_id(); + cpumask_t mask; + bc->event_handler = tick_handle_oneshot_broadcast; clockevents_set_mode(bc, CLOCK_EVT_MODE_ONESHOT); - bc->next_event.tv64 = KTIME_MAX; + + /* Take the do_timer update */ + tick_do_timer_cpu = cpu; + + /* + * We must be careful here. There might be other CPUs + * waiting for periodic broadcast. We need to set the + * oneshot_mask bits for those and program the + * broadcast device to fire. + */ + mask = tick_broadcast_mask; + cpu_clear(cpu, mask); + cpus_or(tick_broadcast_oneshot_mask, + tick_broadcast_oneshot_mask, mask); + + if (was_periodic && !cpus_empty(mask)) { + tick_broadcast_init_next_event(&mask, tick_next_period); + tick_broadcast_set_event(tick_next_period, 1); + } else + bc->next_event.tv64 = KTIME_MAX; } } -- cgit v1.2.3 From f7676254f179eac6b5244a80195ec8ae0e9d4606 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2008 03:03:32 +0200 Subject: x86: HPET fix moronic 32/64bit thinko We use the HPET only in 32bit mode because: 1) some HPETs are 32bit only 2) on i386 there is no way to read/write the HPET atomic 64bit wide The HPET code unification done by the "moron of the year" did not take into account that unsigned long is different on 32 and 64 bit. This thinko results in a possible endless loop in the clockevents code, when the return comparison fails due to the 64bit/332bit unawareness. unsigned long cnt = (u32) hpet_read() + delta can wrap over 32bit. but the final compare will fail and return -ETIME causing endless loops. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 2256315416d..801497a16e0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -270,15 +270,15 @@ static void hpet_legacy_set_mode(enum clock_event_mode mode, } static int hpet_legacy_next_event(unsigned long delta, - struct clock_event_device *evt) + struct clock_event_device *evt) { - unsigned long cnt; + u32 cnt; cnt = hpet_readl(HPET_COUNTER); - cnt += delta; + cnt += (u32) delta; hpet_writel(cnt, HPET_T0_CMP); - return ((long)(hpet_readl(HPET_COUNTER) - cnt ) > 0) ? -ETIME : 0; + return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } /* -- cgit v1.2.3 From 72d43d9bc9210d24d09202eaf219eac09e17b339 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Sat, 6 Sep 2008 03:06:08 +0200 Subject: x86: HPET: read back compare register before reading counter After fixing the u32 thinko I sill had occasional hickups on ATI chipsets with small deltas. There seems to be a delay between writing the compare register and the transffer to the internal register which triggers the interrupt. Reading back the value makes sure, that it hit the internal match register befor we compare against the counter value. Signed-off-by: Thomas Gleixner --- arch/x86/kernel/hpet.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/x86/kernel/hpet.c b/arch/x86/kernel/hpet.c index 801497a16e0..73deaffadd0 100644 --- a/arch/x86/kernel/hpet.c +++ b/arch/x86/kernel/hpet.c @@ -278,6 +278,13 @@ static int hpet_legacy_next_event(unsigned long delta, cnt += (u32) delta; hpet_writel(cnt, HPET_T0_CMP); + /* + * We need to read back the CMP register to make sure that + * what we wrote hit the chip before we compare it to the + * counter. 
+ */ + WARN_ON((u32)hpet_readl(HPET_T0_CMP) != cnt); + return (s32)((u32)hpet_readl(HPET_COUNTER) - cnt) >= 0 ? -ETIME : 0; } -- cgit v1.2.3 From 8a656496b21efd95fd55b66e0601c5ad41f9b156 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Sat, 6 Sep 2008 11:43:41 +0200 Subject: Fix CONFIG_AC97_BUS dependency CONFIG_AC97_BUS is used from both sound and ucb1400 drivers. The recent change in Kconfig introduced the exclusive dependency on CONFIG_SOUND, and disabled the ucb1400 build without sound. This patch makes CONFIG_AC97_BUS independent. Signed-off-by: Takashi Iwai Tested-by: Randy Dunlap --- sound/Kconfig | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/sound/Kconfig b/sound/Kconfig index a37bee094eb..8ebf512ced6 100644 --- a/sound/Kconfig +++ b/sound/Kconfig @@ -91,6 +91,9 @@ endif # SOUND_PRIME endif # !M68K +endif # SOUND + +# AC97_BUS is used from both sound and ucb1400 config AC97_BUS tristate help @@ -99,4 +102,3 @@ config AC97_BUS sound although they're sharing the AC97 bus. Concerned drivers should "select" this. -endif # SOUND -- cgit v1.2.3 From 1ad77a876da48331451698cc4172c90ab9b6372f Mon Sep 17 00:00:00 2001 From: Lennert Buytenhek Date: Fri, 5 Sep 2008 13:17:11 +0100 Subject: [ARM] 5241/1: provide ioremap_wc() This patch provides an ARM implementation of ioremap_wc(). We use different page table attributes depending on which CPU we are running on: - Non-XScale ARMv5 and earlier systems: The ARMv5 ARM documents four possible mapping types (CB=00/01/10/11). We can't use any of the cached memory types (CB=10/11), since that breaks coherency with peripheral devices. Both CB=00 and CB=01 are suitable for _wc, and CB=01 (Uncached/Buffered) allows the hardware more freedom than CB=00, so we'll use that. (The ARMv5 ARM seems to suggest that CB=01 is allowed to delay stores but isn't allowed to merge them, but there is no other mapping type we can use that allows the hardware to delay and merge stores, so we'll go with CB=01.) - XScale v1/v2 (ARMv5): same as the ARMv5 case above, with the slight difference that on these platforms, CB=01 actually _does_ allow merging stores. (If you want noncoalescing bufferable behavior on Xscale v1/v2, you need to use XCB=101.) - Xscale v3 (ARMv5) and ARMv6+: on these systems, we use TEXCB=00100 mappings (Inner/Outer Uncacheable in xsc3 parlance, Uncached Normal in ARMv6 parlance). The ARMv6 ARM explicitly says that any accesses to Normal memory can be merged, which makes Normal memory more suitable for _wc mappings than Device or Strongly Ordered memory, as the latter two mapping types are guaranteed to maintain transaction number, size and order. We use the Uncached variety of Normal mappings for the same reason that we can't use C=1 mappings on ARMv5. The xsc3 Architecture Specification documents TEXCB=00100 as being Uncacheable and allowing coalescing of writes, which is also just what we need. 
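For context, a rough sketch of how a driver might use the new mapping type once this patch is in; the function name, the memset_io() fill and the phys/len parameters are illustrative placeholders, not part of the patch:

#include <linux/errno.h>
#include <linux/io.h>
#include <linux/types.h>

/* Map a linear framebuffer aperture write-combined and blank it.
 * With MT_DEVICE_WC the CPU may buffer and merge the stores issued
 * by memset_io(), which is exactly what we want here. */
static int example_clear_framebuffer(unsigned long phys, size_t len)
{
	void __iomem *base;

	base = ioremap_wc(phys, len);
	if (!base)
		return -ENOMEM;

	memset_io(base, 0, len);

	iounmap(base);
	return 0;
}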
Signed-off-by: Lennert Buytenhek Signed-off-by: Russell King --- arch/arm/include/asm/io.h | 5 ++++- arch/arm/include/asm/mach/map.h | 14 +++++++------- arch/arm/mm/mmu.c | 20 ++++++++++++++++++++ 3 files changed, 31 insertions(+), 8 deletions(-) diff --git a/arch/arm/include/asm/io.h b/arch/arm/include/asm/io.h index 94a95d7fafd..71934856fc2 100644 --- a/arch/arm/include/asm/io.h +++ b/arch/arm/include/asm/io.h @@ -61,8 +61,9 @@ extern void __raw_readsl(const void __iomem *addr, void *data, int longlen); #define MT_DEVICE_NONSHARED 1 #define MT_DEVICE_CACHED 2 #define MT_DEVICE_IXP2000 3 +#define MT_DEVICE_WC 4 /* - * types 4 onwards can be found in asm/mach/map.h and are undefined + * types 5 onwards can be found in asm/mach/map.h and are undefined * for ioremap */ @@ -215,11 +216,13 @@ extern void _memset_io(volatile void __iomem *, int, size_t); #define ioremap(cookie,size) __arm_ioremap(cookie, size, MT_DEVICE) #define ioremap_nocache(cookie,size) __arm_ioremap(cookie, size, MT_DEVICE) #define ioremap_cached(cookie,size) __arm_ioremap(cookie, size, MT_DEVICE_CACHED) +#define ioremap_wc(cookie,size) __arm_ioremap(cookie, size, MT_DEVICE_WC) #define iounmap(cookie) __iounmap(cookie) #else #define ioremap(cookie,size) __arch_ioremap((cookie), (size), MT_DEVICE) #define ioremap_nocache(cookie,size) __arch_ioremap((cookie), (size), MT_DEVICE) #define ioremap_cached(cookie,size) __arch_ioremap((cookie), (size), MT_DEVICE_CACHED) +#define ioremap_wc(cookie,size) __arch_ioremap((cookie), (size), MT_DEVICE_WC) #define iounmap(cookie) __arch_iounmap(cookie) #endif diff --git a/arch/arm/include/asm/mach/map.h b/arch/arm/include/asm/mach/map.h index 06f583b1399..9eb936e49cc 100644 --- a/arch/arm/include/asm/mach/map.h +++ b/arch/arm/include/asm/mach/map.h @@ -18,13 +18,13 @@ struct map_desc { unsigned int type; }; -/* types 0-3 are defined in asm/io.h */ -#define MT_CACHECLEAN 4 -#define MT_MINICLEAN 5 -#define MT_LOW_VECTORS 6 -#define MT_HIGH_VECTORS 7 -#define MT_MEMORY 8 -#define MT_ROM 9 +/* types 0-4 are defined in asm/io.h */ +#define MT_CACHECLEAN 5 +#define MT_MINICLEAN 6 +#define MT_LOW_VECTORS 7 +#define MT_HIGH_VECTORS 8 +#define MT_MEMORY 9 +#define MT_ROM 10 #define MT_NONSHARED_DEVICE MT_DEVICE_NONSHARED #define MT_IXP2000_DEVICE MT_DEVICE_IXP2000 diff --git a/arch/arm/mm/mmu.c b/arch/arm/mm/mmu.c index 25d9a11eb61..a713e40e1f1 100644 --- a/arch/arm/mm/mmu.c +++ b/arch/arm/mm/mmu.c @@ -211,6 +211,12 @@ static struct mem_type mem_types[] = { PMD_SECT_TEX(1), .domain = DOMAIN_IO, }, + [MT_DEVICE_WC] = { /* ioremap_wc */ + .prot_pte = PROT_PTE_DEVICE, + .prot_l1 = PMD_TYPE_TABLE, + .prot_sect = PROT_SECT_DEVICE, + .domain = DOMAIN_IO, + }, [MT_CACHECLEAN] = { .prot_sect = PMD_TYPE_SECT | PMD_SECT_XN, .domain = DOMAIN_KERNEL, @@ -272,6 +278,20 @@ static void __init build_mem_type_table(void) ecc_mask = 0; } + /* + * On non-Xscale3 ARMv5-and-older systems, use CB=01 + * (Uncached/Buffered) for ioremap_wc() mappings. On XScale3 + * and ARMv6+, use TEXCB=00100 mappings (Inner/Outer Uncacheable + * in xsc3 parlance, Uncached Normal in ARMv6 parlance). + */ + if (cpu_is_xsc3() || cpu_arch >= CPU_ARCH_ARMv6) { + mem_types[MT_DEVICE_WC].prot_pte_ext |= PTE_EXT_TEX(1); + mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_TEX(1); + } else { + mem_types[MT_DEVICE_WC].prot_pte |= L_PTE_BUFFERABLE; + mem_types[MT_DEVICE_WC].prot_sect |= PMD_SECT_BUFFERABLE; + } + /* * ARMv5 and lower, bit 4 must be set for page tables. 
* (was: cache "update-able on write" bit on ARM610) -- cgit v1.2.3 From 4ff4b9e19a80b73959ebeb28d1df40176686f0a8 Mon Sep 17 00:00:00 2001 From: "Maciej W. Rozycki" Date: Fri, 5 Sep 2008 14:05:31 -0700 Subject: ntp: fix calculation of the next jiffie to trigger RTC sync We have a bug in the calculation of the next jiffie to trigger the RTC synchronisation. The aim here is to run sync_cmos_clock() as close as possible to the middle of a second. Which means we want this function to be called less than or equal to half a jiffie away from when now.tv_nsec equals 5e8 (500000000). If this is not the case for a given call to the function, for this purpose instead of updating the RTC we calculate the offset in nanoseconds to the next point in time where now.tv_nsec will be equal 5e8. The calculated offset is then converted to jiffies as these are the unit used by the timer. Hovewer timespec_to_jiffies() used here uses a ceil()-type rounding mode, where the resulting value is rounded up. As a result the range of now.tv_nsec when the timer will trigger is from 5e8 to 5e8 + TICK_NSEC rather than the desired 5e8 - TICK_NSEC / 2 to 5e8 + TICK_NSEC / 2. As a result if for example sync_cmos_clock() happens to be called at the time when now.tv_nsec is between 5e8 + TICK_NSEC / 2 and 5e8 to 5e8 + TICK_NSEC, it will simply be rescheduled HZ jiffies later, falling in the same range of now.tv_nsec again. Similarly for cases offsetted by an integer multiple of TICK_NSEC. This change addresses the problem by subtracting TICK_NSEC / 2 from the nanosecond offset to the next point in time where now.tv_nsec will be equal 5e8, effectively shifting the following rounding in timespec_to_jiffies() so that it produces a rounded-to-nearest result. Signed-off-by: Maciej W. Rozycki Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- kernel/time/ntp.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/time/ntp.c b/kernel/time/ntp.c index 5125ddd8196..1ad46f3df6e 100644 --- a/kernel/time/ntp.c +++ b/kernel/time/ntp.c @@ -245,7 +245,7 @@ static void sync_cmos_clock(unsigned long dummy) if (abs(now.tv_nsec - (NSEC_PER_SEC / 2)) <= tick_nsec / 2) fail = update_persistent_clock(now); - next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec; + next.tv_nsec = (NSEC_PER_SEC / 2) - now.tv_nsec - (TICK_NSEC / 2); if (next.tv_nsec <= 0) next.tv_nsec += NSEC_PER_SEC; -- cgit v1.2.3 From dfdf748a61a21b7397b9f57c83de722de71dc56a Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Fri, 5 Sep 2008 14:05:33 -0700 Subject: clocksource, acpi_pm.c: use proper read function also in errata mode On all hardware (some Intel ICH4, PIIX4 and PIIX4E chipsets) affected by a hardware errata there's about a 4.2% chance that initialization of the ACPI PMTMR fails. On those chipsets, we need to read out the timer value at least three times to get a correct result, for every once in a while (i.e. within a 3 ns window every 69.8 ns) the read returns a bogus result. During normal operation we work around this issue, but during initialization reading a bogus value may lead to -EINVAL even though the hardware is usable. Thanks to Andreas Mohr for spotting this issue. 
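As background for the "read it at least three times" workaround mentioned above, a simplified sketch of what such an errata-mode read looks like: sample the timer three times and retry until the middle sample is consistent with its neighbours, so a single bogus read can never escape. 'port' stands in for pmtmr_ioport, and the function name and mask macro are placeholders:

#include <linux/io.h>
#include <linux/types.h>

#define EXAMPLE_PM_MASK	0xffffffU	/* the ACPI PM timer is 24 bits wide */

static u32 example_read_pmtmr_verified(unsigned int port)
{
	u32 v1, v2, v3;

	do {
		v1 = inl(port);
		v2 = inl(port);
		v3 = inl(port);
		/* loop while v2 does not lie between v1 and v3
		 * (modulo a single 24-bit wrap), i.e. while one of
		 * the three reads returned garbage */
	} while ((v1 > v2 && v1 < v3) || (v2 > v3 && v2 < v1) ||
		 (v3 > v1 && v3 < v2));

	return v2 & EXAMPLE_PM_MASK;
}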
Signed-off-by: Dominik Brodowski Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- drivers/clocksource/acpi_pm.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index 5ca1d80de18..860d033bc74 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -151,13 +151,13 @@ DECLARE_PCI_FIXUP_EARLY(PCI_VENDOR_ID_SERVERWORKS, PCI_DEVICE_ID_SERVERWORKS_LE, */ static int verify_pmtmr_rate(void) { - u32 value1, value2; + cycle_t value1, value2; unsigned long count, delta; mach_prepare_counter(); - value1 = read_pmtmr(); + value1 = clocksource_acpi_pm.read(); mach_countup(&count); - value2 = read_pmtmr(); + value2 = clocksource_acpi_pm.read(); delta = (value2 - value1) & ACPI_PM_MASK; /* Check that the PMTMR delta is within 5% of what we expect */ @@ -177,7 +177,7 @@ static int verify_pmtmr_rate(void) static int __init init_acpi_pm_clocksource(void) { - u32 value1, value2; + cycle_t value1, value2; unsigned int i; if (!pmtmr_ioport) @@ -187,9 +187,9 @@ static int __init init_acpi_pm_clocksource(void) clocksource_acpi_pm.shift); /* "verify" this timing source: */ - value1 = read_pmtmr(); + value1 = clocksource_acpi_pm.read(); for (i = 0; i < 10000; i++) { - value2 = read_pmtmr(); + value2 = clocksource_acpi_pm.read(); if (value2 == value1) continue; if (value2 > value1) @@ -197,11 +197,11 @@ static int __init init_acpi_pm_clocksource(void) if ((value2 < value1) && ((value2) < 0xFFF)) goto pm_good; printk(KERN_INFO "PM-Timer had inconsistent results:" - " 0x%#x, 0x%#x - aborting.\n", value1, value2); + " 0x%#llx, 0x%#llx - aborting.\n", value1, value2); return -EINVAL; } printk(KERN_INFO "PM-Timer had no reasonable result:" - " 0x%#x - aborting.\n", value1); + " 0x%#llx - aborting.\n", value1); return -ENODEV; pm_good: -- cgit v1.2.3 From 4ab6a219113197425ac112e35e1ec8062c69888e Mon Sep 17 00:00:00 2001 From: Dominik Brodowski Date: Fri, 5 Sep 2008 14:05:35 -0700 Subject: clocksource, acpi_pm.c: check for monotonicity The current check for monotonicity is way too weak: Andreas Mohr reports ( http://lkml.org/lkml/2008/8/10/77 ) that on one of his test systems the current check only triggers in 50% of all cases, leading to catastrophic timer behaviour. To fix this issue, expand the check for monotonicity by doing ten consecutive tests instead of one. 
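The expanded check boils down to: run the old single "does the counter move forward (or wrap)?" test ten times in a row and accept the PM timer only if every round passes. A standalone sketch of that idea, not the driver code itself, with read_counter() standing in for clocksource_acpi_pm.read() and the macro names as placeholders:

#include <linux/clocksource.h>

#define EXAMPLE_MONOTONICITY_CHECKS	10
#define EXAMPLE_PM_WRAP_LIMIT		0xFFF	/* tiny value => 24-bit wrap */

static int example_pmtmr_is_monotonic(cycle_t (*read_counter)(void))
{
	unsigned int round, good = 0;

	for (round = 0; round < EXAMPLE_MONOTONICITY_CHECKS; round++) {
		cycle_t v1 = read_counter(), v2;
		unsigned int i;

		for (i = 0; i < 10000; i++) {
			v2 = read_counter();
			if (v2 == v1)
				continue;	/* not ticked yet, poll again */
			if (v2 > v1 || v2 < EXAMPLE_PM_WRAP_LIMIT)
				good++;		/* advanced, or wrapped around */
			break;			/* one verdict per round */
		}
	}

	return good == EXAMPLE_MONOTONICITY_CHECKS;
}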
Signed-off-by: Dominik Brodowski Signed-off-by: Andrew Morton Signed-off-by: Ingo Molnar --- drivers/clocksource/acpi_pm.c | 46 +++++++++++++++++++++++++++---------------- 1 file changed, 29 insertions(+), 17 deletions(-) diff --git a/drivers/clocksource/acpi_pm.c b/drivers/clocksource/acpi_pm.c index 860d033bc74..4eee533f3f4 100644 --- a/drivers/clocksource/acpi_pm.c +++ b/drivers/clocksource/acpi_pm.c @@ -21,6 +21,7 @@ #include #include #include +#include #include /* @@ -175,10 +176,13 @@ static int verify_pmtmr_rate(void) #define verify_pmtmr_rate() (0) #endif +/* Number of monotonicity checks to perform during initialization */ +#define ACPI_PM_MONOTONICITY_CHECKS 10 + static int __init init_acpi_pm_clocksource(void) { cycle_t value1, value2; - unsigned int i; + unsigned int i, j, good = 0; if (!pmtmr_ioport) return -ENODEV; @@ -187,24 +191,32 @@ static int __init init_acpi_pm_clocksource(void) clocksource_acpi_pm.shift); /* "verify" this timing source: */ - value1 = clocksource_acpi_pm.read(); - for (i = 0; i < 10000; i++) { - value2 = clocksource_acpi_pm.read(); - if (value2 == value1) - continue; - if (value2 > value1) - goto pm_good; - if ((value2 < value1) && ((value2) < 0xFFF)) - goto pm_good; - printk(KERN_INFO "PM-Timer had inconsistent results:" - " 0x%#llx, 0x%#llx - aborting.\n", value1, value2); - return -EINVAL; + for (j = 0; j < ACPI_PM_MONOTONICITY_CHECKS; j++) { + value1 = clocksource_acpi_pm.read(); + for (i = 0; i < 10000; i++) { + value2 = clocksource_acpi_pm.read(); + if (value2 == value1) + continue; + if (value2 > value1) + good++; + break; + if ((value2 < value1) && ((value2) < 0xFFF)) + good++; + break; + printk(KERN_INFO "PM-Timer had inconsistent results:" + " 0x%#llx, 0x%#llx - aborting.\n", + value1, value2); + return -EINVAL; + } + udelay(300 * i); + } + + if (good != ACPI_PM_MONOTONICITY_CHECKS) { + printk(KERN_INFO "PM-Timer failed consistency check " + " (0x%#llx) - aborting.\n", value1); + return -ENODEV; } - printk(KERN_INFO "PM-Timer had no reasonable result:" - " 0x%#llx - aborting.\n", value1); - return -ENODEV; -pm_good: if (verify_pmtmr_rate() != 0) return -ENODEV; -- cgit v1.2.3 From 12cf105cd66d95cf32c73cfa847a50bd1b700f23 Mon Sep 17 00:00:00 2001 From: Krzysztof Helt Date: Thu, 4 Sep 2008 21:09:43 +0200 Subject: x86: delay early cpu initialization until cpuid is done Move early cpu initialization after cpu early get cap so the early cpu initialization can fix up cpu caps. Signed-off-by: Krzysztof Helt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 0785b3c8d04..8aab8517642 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -335,11 +335,11 @@ static void __init early_cpu_detect(void) get_cpu_vendor(c, 1); + early_get_cap(c); + if (c->x86_vendor != X86_VENDOR_UNKNOWN && cpu_devs[c->x86_vendor]->c_early_init) cpu_devs[c->x86_vendor]->c_early_init(c); - - early_get_cap(c); } /* -- cgit v1.2.3 From dd786dd12c99634055a9066f25ea957f29991c22 Mon Sep 17 00:00:00 2001 From: Yinghai Lu Date: Thu, 4 Sep 2008 21:09:43 +0200 Subject: x86: move mtrr cpu cap setting early in early_init_xxxx Krzysztof Helt found MTRR is not detected on k6-2 root cause: we moved mtrr_bp_init() early for mtrr trimming, and in early_detect we only read the CPU capability from cpuid, so some cpu doesn't have that bit in cpuid. 
So we need to add early_init_xxxx to preset those bit before mtrr_bp_init for those earlier cpus. this patch is for v2.6.27 Reported-by: Krzysztof Helt Signed-off-by: Yinghai Lu Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/amd.c | 9 +++++---- arch/x86/kernel/cpu/centaur.c | 11 +++++++++++ arch/x86/kernel/cpu/cyrix.c | 32 ++++++++++++++++++++++++++++---- 3 files changed, 44 insertions(+), 8 deletions(-) diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index cae9cabc303..18514ed2610 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -31,6 +31,11 @@ static void __cpuinit early_init_amd(struct cpuinfo_x86 *c) if (c->x86_power & (1<<8)) set_cpu_cap(c, X86_FEATURE_CONSTANT_TSC); } + + /* Set MTRR capability flag if appropriate */ + if (c->x86_model == 13 || c->x86_model == 9 || + (c->x86_model == 8 && c->x86_mask >= 8)) + set_cpu_cap(c, X86_FEATURE_K6_MTRR); } static void __cpuinit init_amd(struct cpuinfo_x86 *c) @@ -166,10 +171,6 @@ static void __cpuinit init_amd(struct cpuinfo_x86 *c) mbytes); } - /* Set MTRR capability flag if appropriate */ - if (c->x86_model == 13 || c->x86_model == 9 || - (c->x86_model == 8 && c->x86_mask >= 8)) - set_cpu_cap(c, X86_FEATURE_K6_MTRR); break; } diff --git a/arch/x86/kernel/cpu/centaur.c b/arch/x86/kernel/cpu/centaur.c index e0f45edd6a5..a0534c04d38 100644 --- a/arch/x86/kernel/cpu/centaur.c +++ b/arch/x86/kernel/cpu/centaur.c @@ -314,6 +314,16 @@ enum { EAMD3D = 1<<20, }; +static void __cpuinit early_init_centaur(struct cpuinfo_x86 *c) +{ + switch (c->x86) { + case 5: + /* Emulate MTRRs using Centaur's MCR. */ + set_cpu_cap(c, X86_FEATURE_CENTAUR_MCR); + break; + } +} + static void __cpuinit init_centaur(struct cpuinfo_x86 *c) { @@ -462,6 +472,7 @@ centaur_size_cache(struct cpuinfo_x86 *c, unsigned int size) static struct cpu_dev centaur_cpu_dev __cpuinitdata = { .c_vendor = "Centaur", .c_ident = { "CentaurHauls" }, + .c_early_init = early_init_centaur, .c_init = init_centaur, .c_size_cache = centaur_size_cache, }; diff --git a/arch/x86/kernel/cpu/cyrix.c b/arch/x86/kernel/cpu/cyrix.c index e710a21bb6e..898a5a2002e 100644 --- a/arch/x86/kernel/cpu/cyrix.c +++ b/arch/x86/kernel/cpu/cyrix.c @@ -15,13 +15,11 @@ /* * Read NSC/Cyrix DEVID registers (DIR) to get more detailed info. 
about the CPU */ -static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +static void __cpuinit __do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) { unsigned char ccr2, ccr3; - unsigned long flags; /* we test for DEVID by checking whether CCR3 is writable */ - local_irq_save(flags); ccr3 = getCx86(CX86_CCR3); setCx86(CX86_CCR3, ccr3 ^ 0x80); getCx86(0xc0); /* dummy to change bus */ @@ -44,9 +42,16 @@ static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) *dir0 = getCx86(CX86_DIR0); *dir1 = getCx86(CX86_DIR1); } - local_irq_restore(flags); } +static void __cpuinit do_cyrix_devid(unsigned char *dir0, unsigned char *dir1) +{ + unsigned long flags; + + local_irq_save(flags); + __do_cyrix_devid(dir0, dir1); + local_irq_restore(flags); +} /* * Cx86_dir0_msb is a HACK needed by check_cx686_cpuid/slop in bugs.h in * order to identify the Cyrix CPU model after we're out of setup.c @@ -161,6 +166,24 @@ static void __cpuinit geode_configure(void) local_irq_restore(flags); } +static void __cpuinit early_init_cyrix(struct cpuinfo_x86 *c) +{ + unsigned char dir0, dir0_msn, dir1 = 0; + + __do_cyrix_devid(&dir0, &dir1); + dir0_msn = dir0 >> 4; /* identifies CPU "family" */ + + switch (dir0_msn) { + case 3: /* 6x86/6x86L */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + case 5: /* 6x86MX/M II */ + /* Emulate MTRRs using Cyrix's ARRs. */ + set_cpu_cap(c, X86_FEATURE_CYRIX_ARR); + break; + } +} static void __cpuinit init_cyrix(struct cpuinfo_x86 *c) { @@ -416,6 +439,7 @@ static void __cpuinit cyrix_identify(struct cpuinfo_x86 *c) static struct cpu_dev cyrix_cpu_dev __cpuinitdata = { .c_vendor = "Cyrix", .c_ident = { "CyrixInstead" }, + .c_early_init = early_init_cyrix, .c_init = init_cyrix, .c_identify = cyrix_identify, }; -- cgit v1.2.3 From dfb512ec4834116124da61d6c1ee10fd0aa32bd6 Mon Sep 17 00:00:00 2001 From: Max Krasnyansky Date: Fri, 29 Aug 2008 13:11:41 -0700 Subject: sched: arch_reinit_sched_domains() must destroy domains to force rebuild What I realized recently is that calling rebuild_sched_domains() in arch_reinit_sched_domains() by itself is not enough when cpusets are enabled. partition_sched_domains() code is trying to avoid unnecessary domain rebuilds and will not actually rebuild anything if new domain masks match the old ones. What this means is that doing echo 1 > /sys/devices/system/cpu/sched_mc_power_savings on a system with cpusets enabled will not take affect untill something changes in the cpuset setup (ie new sets created or deleted). This patch fixes restore correct behaviour where domains must be rebuilt in order to enable MC powersaving flags. Test on quad-core Core2 box with both CONFIG_CPUSETS and !CONFIG_CPUSETS. Also tested on dual-core Core2 laptop. Lockdep is happy and things are working as expected. 
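To make the new calling convention concrete: a caller that wants to force a full rebuild first destroys the current domains with ndoms_new == 0 and then rebuilds from the online map, so the "new masks match the old ones" shortcut can never fire. A rough sketch, close to what arch_reinit_sched_domains() ends up doing when cpusets are not compiled in; the function name is a placeholder:

#include <linux/cpu.h>
#include <linux/sched.h>

static void example_force_domain_rebuild(void)
{
	get_online_cpus();

	/* ndoms_new == 0: tear down all current domains without
	 * installing the default fallback domain */
	partition_sched_domains(0, NULL, NULL);

	/* doms_new == NULL: build one domain from cpu_online_map
	 * (minus isolated cpus); with nothing left to match against,
	 * the domains are really rebuilt */
	partition_sched_domains(1, NULL, NULL);

	put_online_cpus();
}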
Signed-off-by: Max Krasnyansky Tested-by: Vaidyanathan Srinivasan Signed-off-by: Ingo Molnar --- include/linux/cpuset.h | 2 +- kernel/sched.c | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index e8f450c499b..2691926fb50 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -160,7 +160,7 @@ static inline int current_cpuset_is_being_rebound(void) static inline void rebuild_sched_domains(void) { - partition_sched_domains(0, NULL, NULL); + partition_sched_domains(1, NULL, NULL); } #endif /* !CONFIG_CPUSETS */ diff --git a/kernel/sched.c b/kernel/sched.c index d601fb0406c..d72ee9a0eac 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -7589,24 +7589,27 @@ static int dattrs_equal(struct sched_domain_attr *cur, int idx_cur, * and partition_sched_domains() will fallback to the single partition * 'fallback_doms', it also forces the domains to be rebuilt. * + * If doms_new==NULL it will be replaced with cpu_online_map. + * ndoms_new==0 is a special case for destroying existing domains. + * It will not create the default domain. + * * Call with hotplug lock held */ void partition_sched_domains(int ndoms_new, cpumask_t *doms_new, struct sched_domain_attr *dattr_new) { - int i, j; + int i, j, n; mutex_lock(&sched_domains_mutex); /* always unregister in case we don't destroy any domains */ unregister_sched_domain_sysctl(); - if (doms_new == NULL) - ndoms_new = 0; + n = doms_new ? ndoms_new : 0; /* Destroy deleted domains */ for (i = 0; i < ndoms_cur; i++) { - for (j = 0; j < ndoms_new; j++) { + for (j = 0; j < n; j++) { if (cpus_equal(doms_cur[i], doms_new[j]) && dattrs_equal(dattr_cur, i, dattr_new, j)) goto match1; @@ -7619,7 +7622,6 @@ match1: if (doms_new == NULL) { ndoms_cur = 0; - ndoms_new = 1; doms_new = &fallback_doms; cpus_andnot(doms_new[0], cpu_online_map, cpu_isolated_map); dattr_new = NULL; @@ -7656,8 +7658,13 @@ match2: int arch_reinit_sched_domains(void) { get_online_cpus(); + + /* Destroy domains first to force the rebuild */ + partition_sched_domains(0, NULL, NULL); + rebuild_sched_domains(); put_online_cpus(); + return 0; } @@ -7741,7 +7748,7 @@ static int update_sched_domains(struct notifier_block *nfb, case CPU_ONLINE_FROZEN: case CPU_DEAD: case CPU_DEAD_FROZEN: - partition_sched_domains(0, NULL, NULL); + partition_sched_domains(1, NULL, NULL); return NOTIFY_OK; default: -- cgit v1.2.3 From e4a6be4d2850da032a782b5296c07dfdf583af86 Mon Sep 17 00:00:00 2001 From: Eduardo Habkost Date: Thu, 24 Jul 2008 12:15:45 -0300 Subject: x86, xen: Use native_pte_flags instead of native_pte_val for .pte_flags Using native_pte_val triggers the BUG_ON() in the paravirt_ops version of pte_flags(). 
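For background: pte_flags() is only supposed to hand back the attribute bits of a pte, never the page frame number, and the debug check in the paravirt wrapper presumably trips as soon as pfn bits leak through. A schematic version of what a flags-only op looks like, assuming the 2.6.27-era native_pte_val()/PTE_FLAGS_MASK definitions; the function name is a placeholder:

#include <asm/page.h>
#include <asm/pgtable.h>

/* Keep the attribute bits, mask out the page frame number.
 * native_pte_val() would also return the pfn bits, which is why it
 * cannot be used as the .pte_flags op. */
static inline pteval_t example_pte_flags(pte_t pte)
{
	return native_pte_val(pte) & PTE_FLAGS_MASK;
}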
Signed-off-by: Eduardo Habkost Acked-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar --- arch/x86/xen/enlighten.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 9ff6e3cbf08..a4e201b47f6 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -1324,7 +1324,7 @@ static const struct pv_mmu_ops xen_mmu_ops __initdata = { .ptep_modify_prot_commit = __ptep_modify_prot_commit, .pte_val = xen_pte_val, - .pte_flags = native_pte_val, + .pte_flags = native_pte_flags, .pgd_val = xen_pgd_val, .make_pte = xen_make_pte, -- cgit v1.2.3 From d04ec773d7ca1bbc05a2768be95c1cebe2b07757 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 10:27:30 +0200 Subject: x86: pda_init(): fix memory leak when using CPU hotplug pda->irqstackptr is allocated whenever a CPU is set online. But it is never freed. This results in a memory leak of 16K for each CPU offline/online cycle. Fix is to allocate pda->irqstackptr only once. Signed-off-by: Andreas Herrmann Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index c3afba5a81a..4f2eeb5652e 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -529,17 +529,20 @@ void pda_init(int cpu) /* others are initialized in smpboot.c */ pda->pcurrent = &init_task; pda->irqstackptr = boot_cpu_stack; + pda->irqstackptr += IRQSTACKSIZE - 64; } else { - pda->irqstackptr = (char *) - __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); - if (!pda->irqstackptr) - panic("cannot allocate irqstack for cpu %d", cpu); + if (!pda->irqstackptr) { + pda->irqstackptr = (char *) + __get_free_pages(GFP_ATOMIC, IRQSTACK_ORDER); + if (!pda->irqstackptr) + panic("cannot allocate irqstack for cpu %d", + cpu); + pda->irqstackptr += IRQSTACKSIZE - 64; + } if (pda->nodenumber == 0 && cpu_to_node(cpu) != NUMA_NO_NODE) pda->nodenumber = cpu_to_node(cpu); } - - pda->irqstackptr += IRQSTACKSIZE-64; } char boot_exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + -- cgit v1.2.3 From 23952a96ae738277f3139b63d622e22984589031 Mon Sep 17 00:00:00 2001 From: Andreas Herrmann Date: Wed, 6 Aug 2008 10:29:37 +0200 Subject: x86: cpu_init(): fix memory leak when using CPU hotplug Exception stacks are allocated each time a CPU is set online. But the allocated space is never freed. Thus with one CPU hotplug offline/online cycle there is a memory leak of 24K (6 pages) for a CPU. Fix is to allocate exception stacks only once -- when the CPU is set online for the first time. Signed-off-by: Andreas Herrmann Cc: akpm@linux-foundation.org Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common_64.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/arch/x86/kernel/cpu/common_64.c b/arch/x86/kernel/cpu/common_64.c index 4f2eeb5652e..a11f5d4477c 100644 --- a/arch/x86/kernel/cpu/common_64.c +++ b/arch/x86/kernel/cpu/common_64.c @@ -640,19 +640,22 @@ void __cpuinit cpu_init(void) /* * set up and load the per-CPU TSS */ - for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (!orig_ist->ist[0]) { static const unsigned int order[N_EXCEPTION_STACKS] = { - [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, - [DEBUG_STACK - 1] = DEBUG_STACK_ORDER + [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STACK_ORDER, + [DEBUG_STACK - 1] = DEBUG_STACK_ORDER }; - if (cpu) { - estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); - if (!estacks) - panic("Cannot allocate exception stack %ld %d\n", - v, cpu); + for (v = 0; v < N_EXCEPTION_STACKS; v++) { + if (cpu) { + estacks = (char *)__get_free_pages(GFP_ATOMIC, order[v]); + if (!estacks) + panic("Cannot allocate exception " + "stack %ld %d\n", v, cpu); + } + estacks += PAGE_SIZE << order[v]; + orig_ist->ist[v] = t->x86_tss.ist[v] = + (unsigned long)estacks; } - estacks += PAGE_SIZE << order[v]; - orig_ist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; } t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); -- cgit v1.2.3 From b2e601d14deb2083e2a537b47869ab3895d23a28 Mon Sep 17 00:00:00 2001 From: Andre Detsch Date: Thu, 4 Sep 2008 21:16:27 +0000 Subject: powerpc/spufs: Fix possible scheduling of a context to multiple SPEs We currently have a race when scheduling a context to a SPE - after we have found a runnable context in spusched_tick, the same context may have been scheduled by spu_activate(). This may result in a panic if we try to unschedule a context that has been freed in the meantime. This change exits spu_schedule() if the context has already been scheduled, so we don't end up scheduling it twice. Signed-off-by: Andre Detsch Signed-off-by: Jeremy Kerr --- arch/powerpc/platforms/cell/spufs/sched.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c index 897c7406116..67595bc380d 100644 --- a/arch/powerpc/platforms/cell/spufs/sched.c +++ b/arch/powerpc/platforms/cell/spufs/sched.c @@ -728,7 +728,8 @@ static void spu_schedule(struct spu *spu, struct spu_context *ctx) /* not a candidate for interruptible because it's called either from the scheduler thread or from spu_deactivate */ mutex_lock(&ctx->state_mutex); - __spu_schedule(spu, ctx); + if (ctx->state == SPU_STATE_SAVED) + __spu_schedule(spu, ctx); spu_release(ctx); } -- cgit v1.2.3 From 66bf79182d6531c14c1f9a507b6bbf374a2ae4cd Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 7 Sep 2008 18:19:25 -0700 Subject: netfilter: nf_conntrack_sip: de-static helper pointers Helper's ->help hook can run concurrently with itself, so iterating over SIP helpers with static pointer won't work reliably. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/netfilter/nf_conntrack_sip.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index 2f9bbc058b4..1fa306be60f 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -1193,7 +1193,6 @@ static const struct sip_handler sip_handlers[] = { static int process_sip_response(struct sk_buff *skb, const char **dptr, unsigned int *datalen) { - static const struct sip_handler *handler; enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); unsigned int matchoff, matchlen; @@ -1214,6 +1213,8 @@ static int process_sip_response(struct sk_buff *skb, dataoff = matchoff + matchlen + 1; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + handler = &sip_handlers[i]; if (handler->response == NULL) continue; @@ -1228,13 +1229,14 @@ static int process_sip_response(struct sk_buff *skb, static int process_sip_request(struct sk_buff *skb, const char **dptr, unsigned int *datalen) { - static const struct sip_handler *handler; enum ip_conntrack_info ctinfo; struct nf_conn *ct = nf_ct_get(skb, &ctinfo); unsigned int matchoff, matchlen; unsigned int cseq, i; for (i = 0; i < ARRAY_SIZE(sip_handlers); i++) { + const struct sip_handler *handler; + handler = &sip_handlers[i]; if (handler->request == NULL) continue; -- cgit v1.2.3 From 887464a41fde7e9e1e11ca86748338033c502446 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 7 Sep 2008 18:20:08 -0700 Subject: netfilter: nf_conntrack_gre: more locking around keymap list gre_keymap_list should be protected in all places. (unless I'm misreading something) Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_proto_gre.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index 654a4f7f12c..b308bb4c12b 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -97,10 +97,14 @@ int nf_ct_gre_keymap_add(struct nf_conn *ct, enum ip_conntrack_dir dir, kmp = &help->help.ct_pptp_info.keymap[dir]; if (*kmp) { /* check whether it's a retransmission */ + read_lock_bh(&nf_ct_gre_lock); list_for_each_entry(km, &gre_keymap_list, list) { - if (gre_key_cmpfn(km, t) && km == *kmp) + if (gre_key_cmpfn(km, t) && km == *kmp) { + read_unlock_bh(&nf_ct_gre_lock); return 0; + } } + read_unlock_bh(&nf_ct_gre_lock); pr_debug("trying to override keymap_%s for ct %p\n", dir == IP_CT_DIR_REPLY ? "reply" : "orig", ct); return -EEXIST; -- cgit v1.2.3 From 51807e91a76a531d059ec7ce3395c435e4df52a8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Sun, 7 Sep 2008 18:20:36 -0700 Subject: netfilter: nf_conntrack_gre: nf_ct_gre_keymap_flush() fixlet It does "kfree(list_head)" which looks wrong because entity that was allocated is definitely not list_head. However, this all works because list_head is first item in struct nf_ct_gre_keymap. Signed-off-by: Alexey Dobriyan Signed-off-by: Patrick McHardy Signed-off-by: David S. 
Miller --- net/netfilter/nf_conntrack_proto_gre.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/net/netfilter/nf_conntrack_proto_gre.c b/net/netfilter/nf_conntrack_proto_gre.c index b308bb4c12b..9bd03967fea 100644 --- a/net/netfilter/nf_conntrack_proto_gre.c +++ b/net/netfilter/nf_conntrack_proto_gre.c @@ -45,12 +45,12 @@ static LIST_HEAD(gre_keymap_list); void nf_ct_gre_keymap_flush(void) { - struct list_head *pos, *n; + struct nf_ct_gre_keymap *km, *tmp; write_lock_bh(&nf_ct_gre_lock); - list_for_each_safe(pos, n, &gre_keymap_list) { - list_del(pos); - kfree(pos); + list_for_each_entry_safe(km, tmp, &gre_keymap_list, list) { + list_del(&km->list); + kfree(km); } write_unlock_bh(&nf_ct_gre_lock); } -- cgit v1.2.3 From e3b802ba885b54f4050164c3cfd9e0ba9c73173a Mon Sep 17 00:00:00 2001 From: Patrick McHardy Date: Sun, 7 Sep 2008 18:21:24 -0700 Subject: netfilter: nf_conntrack_irc: make sure string is terminated before calling simple_strtoul Alexey Dobriyan points out: 1. simple_strtoul() silently accepts all characters for given base even if result won't fit into unsigned long. This is amazing stupidity in itself, but 2. nf_conntrack_irc helper use simple_strtoul() for DCC request parsing. Data first copied into 64KB buffer, so theoretically nothing prevents reading past the end of it, since data comes from network given 1). This is not actually a problem currently since we're guaranteed to have a 0 byte in skb_shared_info or in the buffer the data is copied to, but to make this more robust, make sure the string is actually terminated. Signed-off-by: Patrick McHardy Signed-off-by: David S. Miller --- net/netfilter/nf_conntrack_irc.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 1b1226d6653..20633fdf7e6 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -68,11 +68,21 @@ static const char *const dccprotos[] = { static int parse_dcc(char *data, const char *data_end, u_int32_t *ip, u_int16_t *port, char **ad_beg_p, char **ad_end_p) { + char *tmp; + /* at least 12: "AAAAAAAA P\1\n" */ while (*data++ != ' ') if (data > data_end - 12) return -1; + /* Make sure we have a newline character within the packet boundaries + * because simple_strtoul parses until the first invalid character. */ + for (tmp = data; tmp <= data_end; tmp++) + if (*tmp == '\n') + break; + if (tmp > data_end || *tmp != '\n') + return -1; + *ad_beg_p = data; *ip = simple_strtoul(data, &data, 10); -- cgit v1.2.3 From e8a83e10d7dfe5d0841062780769b30f65417e15 Mon Sep 17 00:00:00 2001 From: Jarek Poplawski Date: Sun, 7 Sep 2008 18:41:21 -0700 Subject: pkt_sched: Fix qdisc state in net_tx_action() net_tx_action() can skip __QDISC_STATE_SCHED bit clearing while qdisc is neither ran nor rescheduled, which may cause endless loop in dev_deactivate(). Reported-by: Denys Fedoryshchenko Tested-by: Denys Fedoryshchenko Signed-off-by: Jarek Poplawski Acked-by: Herbert Xu Signed-off-by: David S. 
Miller --- net/core/dev.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/core/dev.c b/net/core/dev.c index 60c51f76588..e719ed29310 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -1991,8 +1991,13 @@ static void net_tx_action(struct softirq_action *h) spin_unlock(root_lock); } else { if (!test_bit(__QDISC_STATE_DEACTIVATED, - &q->state)) + &q->state)) { __netif_reschedule(q); + } else { + smp_mb__before_clear_bit(); + clear_bit(__QDISC_STATE_SCHED, + &q->state); + } } } } -- cgit v1.2.3 From f4c4cd6d14e3a3f638475f2f83e26765a7d3327b Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 8 Sep 2008 14:29:54 +1000 Subject: Revert "crypto: camellia - Use kernel-provided bitops, unaligned access helpers" This reverts commit bd699f2df6dbc2f4cba528fe598bd63a4d3702c5, which causes camellia to fail the included self-test vectors. It has also been confirmed that it breaks existing encrypted disks using camellia. Signed-off-by: Herbert Xu --- crypto/camellia.c | 84 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 48 insertions(+), 36 deletions(-) diff --git a/crypto/camellia.c b/crypto/camellia.c index b1cc4de6493..493fee7e0a8 100644 --- a/crypto/camellia.c +++ b/crypto/camellia.c @@ -35,8 +35,6 @@ #include #include #include -#include -#include static const u32 camellia_sp1110[256] = { 0x70707000,0x82828200,0x2c2c2c00,0xececec00, @@ -337,6 +335,20 @@ static const u32 camellia_sp4404[256] = { /* * macros */ +#define GETU32(v, pt) \ + do { \ + /* latest breed of gcc is clever enough to use move */ \ + memcpy(&(v), (pt), 4); \ + (v) = be32_to_cpu(v); \ + } while(0) + +/* rotation right shift 1byte */ +#define ROR8(x) (((x) >> 8) + ((x) << 24)) +/* rotation left shift 1bit */ +#define ROL1(x) (((x) << 1) + ((x) >> 31)) +/* rotation left shift 1byte */ +#define ROL8(x) (((x) << 8) + ((x) >> 24)) + #define ROLDQ(ll, lr, rl, rr, w0, w1, bits) \ do { \ w0 = ll; \ @@ -371,7 +383,7 @@ static const u32 camellia_sp4404[256] = { ^ camellia_sp3033[(u8)(il >> 8)] \ ^ camellia_sp4404[(u8)(il )]; \ yl ^= yr; \ - yr = ror32(yr, 8); \ + yr = ROR8(yr); \ yr ^= yl; \ } while(0) @@ -393,7 +405,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) subL[7] ^= subL[1]; subR[7] ^= subR[1]; subL[1] ^= subR[1] & ~subR[9]; dw = subL[1] & subL[9], - subR[1] ^= rol32(dw, 1); /* modified for FLinv(kl2) */ + subR[1] ^= ROL1(dw); /* modified for FLinv(kl2) */ /* round 8 */ subL[11] ^= subL[1]; subR[11] ^= subR[1]; /* round 10 */ @@ -402,7 +414,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) subL[15] ^= subL[1]; subR[15] ^= subR[1]; subL[1] ^= subR[1] & ~subR[17]; dw = subL[1] & subL[17], - subR[1] ^= rol32(dw, 1); /* modified for FLinv(kl4) */ + subR[1] ^= ROL1(dw); /* modified for FLinv(kl4) */ /* round 14 */ subL[19] ^= subL[1]; subR[19] ^= subR[1]; /* round 16 */ @@ -418,7 +430,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) } else { subL[1] ^= subR[1] & ~subR[25]; dw = subL[1] & subL[25], - subR[1] ^= rol32(dw, 1); /* modified for FLinv(kl6) */ + subR[1] ^= ROL1(dw); /* modified for FLinv(kl6) */ /* round 20 */ subL[27] ^= subL[1]; subR[27] ^= subR[1]; /* round 22 */ @@ -438,7 +450,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) subL[26] ^= kw4l; subR[26] ^= kw4r; kw4l ^= kw4r & ~subR[24]; dw = kw4l & subL[24], - kw4r ^= rol32(dw, 1); /* modified for FL(kl5) */ + kw4r ^= ROL1(dw); /* modified for FL(kl5) */ } /* round 17 */ subL[22] ^= kw4l; subR[22] ^= 
kw4r; @@ -448,7 +460,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) subL[18] ^= kw4l; subR[18] ^= kw4r; kw4l ^= kw4r & ~subR[16]; dw = kw4l & subL[16], - kw4r ^= rol32(dw, 1); /* modified for FL(kl3) */ + kw4r ^= ROL1(dw); /* modified for FL(kl3) */ /* round 11 */ subL[14] ^= kw4l; subR[14] ^= kw4r; /* round 9 */ @@ -457,7 +469,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) subL[10] ^= kw4l; subR[10] ^= kw4r; kw4l ^= kw4r & ~subR[8]; dw = kw4l & subL[8], - kw4r ^= rol32(dw, 1); /* modified for FL(kl1) */ + kw4r ^= ROL1(dw); /* modified for FL(kl1) */ /* round 5 */ subL[6] ^= kw4l; subR[6] ^= kw4r; /* round 3 */ @@ -482,7 +494,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) SUBKEY_R(6) = subR[5] ^ subR[7]; tl = subL[10] ^ (subR[10] & ~subR[8]); dw = tl & subL[8], /* FL(kl1) */ - tr = subR[10] ^ rol32(dw, 1); + tr = subR[10] ^ ROL1(dw); SUBKEY_L(7) = subL[6] ^ tl; /* round 6 */ SUBKEY_R(7) = subR[6] ^ tr; SUBKEY_L(8) = subL[8]; /* FL(kl1) */ @@ -491,7 +503,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) SUBKEY_R(9) = subR[9]; tl = subL[7] ^ (subR[7] & ~subR[9]); dw = tl & subL[9], /* FLinv(kl2) */ - tr = subR[7] ^ rol32(dw, 1); + tr = subR[7] ^ ROL1(dw); SUBKEY_L(10) = tl ^ subL[11]; /* round 7 */ SUBKEY_R(10) = tr ^ subR[11]; SUBKEY_L(11) = subL[10] ^ subL[12]; /* round 8 */ @@ -504,7 +516,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) SUBKEY_R(14) = subR[13] ^ subR[15]; tl = subL[18] ^ (subR[18] & ~subR[16]); dw = tl & subL[16], /* FL(kl3) */ - tr = subR[18] ^ rol32(dw, 1); + tr = subR[18] ^ ROL1(dw); SUBKEY_L(15) = subL[14] ^ tl; /* round 12 */ SUBKEY_R(15) = subR[14] ^ tr; SUBKEY_L(16) = subL[16]; /* FL(kl3) */ @@ -513,7 +525,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) SUBKEY_R(17) = subR[17]; tl = subL[15] ^ (subR[15] & ~subR[17]); dw = tl & subL[17], /* FLinv(kl4) */ - tr = subR[15] ^ rol32(dw, 1); + tr = subR[15] ^ ROL1(dw); SUBKEY_L(18) = tl ^ subL[19]; /* round 13 */ SUBKEY_R(18) = tr ^ subR[19]; SUBKEY_L(19) = subL[18] ^ subL[20]; /* round 14 */ @@ -532,7 +544,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) } else { tl = subL[26] ^ (subR[26] & ~subR[24]); dw = tl & subL[24], /* FL(kl5) */ - tr = subR[26] ^ rol32(dw, 1); + tr = subR[26] ^ ROL1(dw); SUBKEY_L(23) = subL[22] ^ tl; /* round 18 */ SUBKEY_R(23) = subR[22] ^ tr; SUBKEY_L(24) = subL[24]; /* FL(kl5) */ @@ -541,7 +553,7 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) SUBKEY_R(25) = subR[25]; tl = subL[23] ^ (subR[23] & ~subR[25]); dw = tl & subL[25], /* FLinv(kl6) */ - tr = subR[23] ^ rol32(dw, 1); + tr = subR[23] ^ ROL1(dw); SUBKEY_L(26) = tl ^ subL[27]; /* round 19 */ SUBKEY_R(26) = tr ^ subR[27]; SUBKEY_L(27) = subL[26] ^ subL[28]; /* round 20 */ @@ -561,17 +573,17 @@ static void camellia_setup_tail(u32 *subkey, u32 *subL, u32 *subR, int max) /* apply the inverse of the last half of P-function */ i = 2; do { - dw = SUBKEY_L(i + 0) ^ SUBKEY_R(i + 0); dw = rol32(dw, 8);/* round 1 */ + dw = SUBKEY_L(i + 0) ^ SUBKEY_R(i + 0); dw = ROL8(dw);/* round 1 */ SUBKEY_R(i + 0) = SUBKEY_L(i + 0) ^ dw; SUBKEY_L(i + 0) = dw; - dw = SUBKEY_L(i + 1) ^ SUBKEY_R(i + 1); dw = rol32(dw, 8);/* round 2 */ + dw = SUBKEY_L(i + 1) ^ SUBKEY_R(i + 1); dw = ROL8(dw);/* round 2 */ SUBKEY_R(i + 1) = SUBKEY_L(i + 1) ^ dw; SUBKEY_L(i + 1) = dw; - dw = SUBKEY_L(i + 2) ^ SUBKEY_R(i + 2); dw = 
rol32(dw, 8);/* round 3 */ + dw = SUBKEY_L(i + 2) ^ SUBKEY_R(i + 2); dw = ROL8(dw);/* round 3 */ SUBKEY_R(i + 2) = SUBKEY_L(i + 2) ^ dw; SUBKEY_L(i + 2) = dw; - dw = SUBKEY_L(i + 3) ^ SUBKEY_R(i + 3); dw = rol32(dw, 8);/* round 4 */ + dw = SUBKEY_L(i + 3) ^ SUBKEY_R(i + 3); dw = ROL8(dw);/* round 4 */ SUBKEY_R(i + 3) = SUBKEY_L(i + 3) ^ dw; SUBKEY_L(i + 3) = dw; - dw = SUBKEY_L(i + 4) ^ SUBKEY_R(i + 4); dw = rol32(dw, 9);/* round 5 */ + dw = SUBKEY_L(i + 4) ^ SUBKEY_R(i + 4); dw = ROL8(dw);/* round 5 */ SUBKEY_R(i + 4) = SUBKEY_L(i + 4) ^ dw; SUBKEY_L(i + 4) = dw; - dw = SUBKEY_L(i + 5) ^ SUBKEY_R(i + 5); dw = rol32(dw, 8);/* round 6 */ + dw = SUBKEY_L(i + 5) ^ SUBKEY_R(i + 5); dw = ROL8(dw);/* round 6 */ SUBKEY_R(i + 5) = SUBKEY_L(i + 5) ^ dw; SUBKEY_L(i + 5) = dw; i += 8; } while (i < max); @@ -587,10 +599,10 @@ static void camellia_setup128(const unsigned char *key, u32 *subkey) /** * k == kll || klr || krl || krr (|| is concatenation) */ - kll = get_unaligned_be32(key); - klr = get_unaligned_be32(key + 4); - krl = get_unaligned_be32(key + 8); - krr = get_unaligned_be32(key + 12); + GETU32(kll, key ); + GETU32(klr, key + 4); + GETU32(krl, key + 8); + GETU32(krr, key + 12); /* generate KL dependent subkeys */ /* kw1 */ @@ -695,14 +707,14 @@ static void camellia_setup256(const unsigned char *key, u32 *subkey) * key = (kll || klr || krl || krr || krll || krlr || krrl || krrr) * (|| is concatenation) */ - kll = get_unaligned_be32(key); - klr = get_unaligned_be32(key + 4); - krl = get_unaligned_be32(key + 8); - krr = get_unaligned_be32(key + 12); - krll = get_unaligned_be32(key + 16); - krlr = get_unaligned_be32(key + 20); - krrl = get_unaligned_be32(key + 24); - krrr = get_unaligned_be32(key + 28); + GETU32(kll, key ); + GETU32(klr, key + 4); + GETU32(krl, key + 8); + GETU32(krr, key + 12); + GETU32(krll, key + 16); + GETU32(krlr, key + 20); + GETU32(krrl, key + 24); + GETU32(krrr, key + 28); /* generate KL dependent subkeys */ /* kw1 */ @@ -858,13 +870,13 @@ static void camellia_setup192(const unsigned char *key, u32 *subkey) t0 &= ll; \ t2 |= rr; \ rl ^= t2; \ - lr ^= rol32(t0, 1); \ + lr ^= ROL1(t0); \ t3 = krl; \ t1 = klr; \ t3 &= rl; \ t1 |= lr; \ ll ^= t1; \ - rr ^= rol32(t3, 1); \ + rr ^= ROL1(t3); \ } while(0) #define CAMELLIA_ROUNDSM(xl, xr, kl, kr, yl, yr, il, ir) \ @@ -880,7 +892,7 @@ static void camellia_setup192(const unsigned char *key, u32 *subkey) il ^= kl; \ ir ^= il ^ kr; \ yl ^= ir; \ - yr ^= ror32(il, 8) ^ ir; \ + yr ^= ROR8(il) ^ ir; \ } while(0) /* max = 24: 128bit encrypt, max = 32: 256bit encrypt */ -- cgit v1.2.3 From 4ff23fa93011e2367fea056e72c92709178972d9 Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Sun, 7 Sep 2008 00:35:48 +0100 Subject: powerpc: Fix rare boot build breakage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A make -j20 powerpc kernel build broke a couple of months ago saying: In file included from arch/powerpc/boot/gunzip_util.h:13, from arch/powerpc/boot/prpmc2800.c:21: arch/powerpc/boot/zlib.h:85: error: expected ‘:’, ‘,’, ‘;’, ‘}’ or ‘__attribute__’ before ‘*’ token arch/powerpc/boot/zlib.h:630: warning: type defaults to ‘int’ in declaration of ‘Byte’ arch/powerpc/boot/zlib.h:630: error: expected ‘;’, ‘,’ or ‘)’ before ‘*’ token It happened again yesterday: too rare for me to confirm the fix, but it looks like the list of dependants on gunzip_util.h was incomplete. 
Signed-off-by: Hugh Dickins Signed-off-by: Paul Mackerras --- arch/powerpc/boot/Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/powerpc/boot/Makefile b/arch/powerpc/boot/Makefile index 14174aa2407..717a3bc1352 100644 --- a/arch/powerpc/boot/Makefile +++ b/arch/powerpc/boot/Makefile @@ -49,7 +49,7 @@ zlib := inffast.c inflate.c inftrees.c zlibheader := inffast.h inffixed.h inflate.h inftrees.h infutil.h zliblinuxheader := zlib.h zconf.h zutil.h -$(addprefix $(obj)/,$(zlib) gunzip_util.o main.o): \ +$(addprefix $(obj)/,$(zlib) cuboot-c2k.o gunzip_util.o main.o prpmc2800.o): \ $(addprefix $(obj)/,$(zliblinuxheader)) $(addprefix $(obj)/,$(zlibheader)) src-libfdt := fdt.c fdt_ro.c fdt_wip.c fdt_sw.c fdt_rw.c fdt_strerror.c -- cgit v1.2.3 From 5b66c829bf5c65663b2f68ee6b42f6e834cd39cd Mon Sep 17 00:00:00 2001 From: Alan Cox Date: Wed, 3 Sep 2008 14:48:34 +0100 Subject: ahci, pata_marvell: play nicely together I've been chasing Jeff about this for months. Jeff added the Marvell device identifiers to the ahci driver without making the AHCI driver handle the PATA port. This means a lot of users can't use current kernels and in most distro cases can't even install. This has been going on since March 2008 for the 6121 Marvell, and late 2007 for the 6145!!! This was all pointed out at the time and repeatedly ignored. Bugs assigned to Jeff about this are ignored also. To quote Jeff in email > "Just switch the order of 'ahci' and 'pata_marvell' in > /etc/modprobe.conf, then use Fedora's tools regenerate the initrd. > See? It's not rocket science, and the current configuration can be > easily made to work for Fedora users." (Which isn't trivial, isn't end user, shouldn't be needed, and as it usually breaks at install time is in fact impossible) To quote Jeff in August 2007 > " mv-ahci-pata > Marvell 6121/6141 PATA support. Needs fixing in the 'PATA controller > command' area before it is usable, and can go upstream." Only he add the ids anyway later and caused regressions, adding a further id in March causing more regresions. The actual fix for the moment is very simple. If the user has included the pata_marvell driver let it drive the ports. If they've only selected for SATA support give them the AHCI driver which will run the port a fraction faster. Allow the user to control this decision via ahci.marvell_enable as a module parameter so that distributions can ship 'it works' defaults and smarter users (or config tools) can then flip it over it desired. Signed-off-by: Alan Cox Signed-off-by: Jeff Garzik --- drivers/ata/Kconfig | 6 ++++-- drivers/ata/ahci.c | 17 ++++++++++++++++ drivers/ata/pata_marvell.c | 51 ++++++++++++++++++++++++++++++++++------------ 3 files changed, 59 insertions(+), 15 deletions(-) diff --git a/drivers/ata/Kconfig b/drivers/ata/Kconfig index ae8494944c4..11c8c19f0fb 100644 --- a/drivers/ata/Kconfig +++ b/drivers/ata/Kconfig @@ -448,8 +448,10 @@ config PATA_MARVELL tristate "Marvell PATA support via legacy mode" depends on PCI help - This option enables limited support for the Marvell 88SE6145 ATA - controller. + This option enables limited support for the Marvell 88SE61xx ATA + controllers. If you wish to use only the SATA ports then select + the AHCI driver alone. If you wish to the use the PATA port or + both SATA and PATA include this driver. If unsure, say N. 
diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index c729e6988bb..bce26ee3806 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -610,6 +610,15 @@ module_param(ahci_em_messages, int, 0444); MODULE_PARM_DESC(ahci_em_messages, "Set AHCI Enclosure Management Message type (0 = disabled, 1 = LED"); +#if defined(CONFIG_PATA_MARVELL) || defined(CONFIG_PATA_MARVELL_MODULE) +static int marvell_enable; +#else +static int marvell_enable = 1; +#endif +module_param(marvell_enable, int, 0644); +MODULE_PARM_DESC(marvell_enable, "Marvell SATA via AHCI (1 = enabled)"); + + static inline int ahci_nr_ports(u32 cap) { return (cap & 0x1f) + 1; @@ -732,6 +741,8 @@ static void ahci_save_initial_config(struct pci_dev *pdev, "MV_AHCI HACK: port_map %x -> %x\n", port_map, port_map & mv); + dev_printk(KERN_ERR, &pdev->dev, + "Disabling your PATA port. Use the boot option 'ahci.marvell_enable=0' to avoid this.\n"); port_map &= mv; } @@ -2533,6 +2544,12 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) if (!printed_version++) dev_printk(KERN_DEBUG, &pdev->dev, "version " DRV_VERSION "\n"); + /* The AHCI driver can only drive the SATA ports, the PATA driver + can drive them all so if both drivers are selected make sure + AHCI stays out of the way */ + if (pdev->vendor == PCI_VENDOR_ID_MARVELL && !marvell_enable) + return -ENODEV; + /* acquire resources */ rc = pcim_enable_device(pdev); if (rc) diff --git a/drivers/ata/pata_marvell.c b/drivers/ata/pata_marvell.c index 24a011b2502..0d87eec8496 100644 --- a/drivers/ata/pata_marvell.c +++ b/drivers/ata/pata_marvell.c @@ -20,29 +20,30 @@ #include #define DRV_NAME "pata_marvell" -#define DRV_VERSION "0.1.4" +#define DRV_VERSION "0.1.6" /** - * marvell_pre_reset - check for 40/80 pin - * @link: link - * @deadline: deadline jiffies for the operation + * marvell_pata_active - check if PATA is active + * @pdev: PCI device * - * Perform the PATA port setup we need. + * Returns 1 if the PATA port may be active. We know how to check this + * for the 6145 but not the other devices */ -static int marvell_pre_reset(struct ata_link *link, unsigned long deadline) +static int marvell_pata_active(struct pci_dev *pdev) { - struct ata_port *ap = link->ap; - struct pci_dev *pdev = to_pci_dev(ap->host->dev); + int i; u32 devices; void __iomem *barp; - int i; - /* Check if our port is enabled */ + /* We don't yet know how to do this for other devices */ + if (pdev->device != 0x6145) + return 1; barp = pci_iomap(pdev, 5, 0x10); if (barp == NULL) return -ENOMEM; + printk("BAR5:"); for(i = 0; i <= 0x0F; i++) printk("%02X:%02X ", i, ioread8(barp + i)); @@ -51,9 +52,27 @@ static int marvell_pre_reset(struct ata_link *link, unsigned long deadline) devices = ioread32(barp + 0x0C); pci_iounmap(pdev, barp); - if ((pdev->device == 0x6145) && (ap->port_no == 0) && - (!(devices & 0x10))) /* PATA enable ? */ - return -ENOENT; + if (devices & 0x10) + return 1; + return 0; +} + +/** + * marvell_pre_reset - check for 40/80 pin + * @link: link + * @deadline: deadline jiffies for the operation + * + * Perform the PATA port setup we need. + */ + +static int marvell_pre_reset(struct ata_link *link, unsigned long deadline) +{ + struct ata_port *ap = link->ap; + struct pci_dev *pdev = to_pci_dev(ap->host->dev); + + if (pdev->device == 0x6145 && ap->port_no == 0 && + !marvell_pata_active(pdev)) /* PATA enable ? 
*/ + return -ENOENT; return ata_sff_prereset(link, deadline); } @@ -128,6 +147,12 @@ static int marvell_init_one (struct pci_dev *pdev, const struct pci_device_id *i if (pdev->device == 0x6101) ppi[1] = &ata_dummy_port_info; +#if defined(CONFIG_AHCI) || defined(CONFIG_AHCI_MODULE) + if (!marvell_pata_active(pdev)) { + printk(KERN_INFO DRV_NAME ": PATA port not active, deferring to AHCI driver.\n"); + return -ENODEV; + } +#endif return ata_pci_sff_init_one(pdev, ppi, &marvell_sht, NULL); } -- cgit v1.2.3 From 46c5784c8fa736c2bb42fe681189b86e99abdc2e Mon Sep 17 00:00:00 2001 From: Mark Lord Date: Thu, 4 Sep 2008 18:21:07 -0400 Subject: sata_mv: add RocketRaid 1720 PCI ID to driver Signed-off-by: Petr Jelen Signed-off-by: Mark Lord Signed-off-by: Jeff Garzik --- drivers/ata/sata_mv.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/ata/sata_mv.c b/drivers/ata/sata_mv.c index 13c1d2af18a..c815f8ecf6e 100644 --- a/drivers/ata/sata_mv.c +++ b/drivers/ata/sata_mv.c @@ -667,7 +667,8 @@ static const struct pci_device_id mv_pci_tbl[] = { { PCI_VDEVICE(MARVELL, 0x5041), chip_504x }, { PCI_VDEVICE(MARVELL, 0x5080), chip_5080 }, { PCI_VDEVICE(MARVELL, 0x5081), chip_508x }, - /* RocketRAID 1740/174x have different identifiers */ + /* RocketRAID 1720/174x have different identifiers */ + { PCI_VDEVICE(TTI, 0x1720), chip_6042 }, { PCI_VDEVICE(TTI, 0x1740), chip_508x }, { PCI_VDEVICE(TTI, 0x1742), chip_508x }, -- cgit v1.2.3 From 17248461cb66103b87ff03bdee34aa61035cc93e Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Aug 2008 16:03:59 +0200 Subject: ahci: disable PMP for marvell ahcis Marvell ahcis don't play nicely with PMPs. Disable it. Reported by KueiHuan Chen in the following thread. http://thread.gmane.org/gmane.linux.ide/33296 Signed-off-by: Tejun Heo Cc: KueiHuan Chen Cc: Mark Lord Signed-off-by: Jeff Garzik --- drivers/ata/ahci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index bce26ee3806..66aa146bf4d 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -420,7 +420,7 @@ static const struct ata_port_info ahci_port_info[] = { /* board_ahci_mv */ { AHCI_HFLAGS (AHCI_HFLAG_NO_NCQ | AHCI_HFLAG_NO_MSI | - AHCI_HFLAG_MV_PATA), + AHCI_HFLAG_MV_PATA | AHCI_HFLAG_NO_PMP), .flags = ATA_FLAG_SATA | ATA_FLAG_NO_LEGACY | ATA_FLAG_MMIO | ATA_FLAG_PIO_DMA, .pio_mask = 0x1f, /* pio0-4 */ -- cgit v1.2.3 From 2fd673ecf0378ddeeeb87b3605e50212e0c0ddc6 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Aug 2008 16:13:12 +0200 Subject: sata_nv: disable hardreset for generic of them being unifying probing, hotplug and EH reset paths uniform. Previously, broken hardreset could go unnoticed as it wasn't used during probing but when something goes wrong or after hotplug the problem will surface and bite hard. OSDL bug 11195 reports that sata_nv generic flavor falls into this category. Hardreset itself succeeds but PHY stays offline after hardreset. I tried longer debounce timing but the result was the same. http://bugzilla.kernel.org/show_bug.cgi?id=11195 So, it seems we'll have to drop hardreset from the generic flavor. 
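The patch below drops it by pointing .hardreset at ATA_OP_NULL rather than leaving the slot NULL: in libata's inherited port-operations scheme a NULL slot inherits the parent's implementation, while the ATA_OP_NULL sentinel pins the slot to "no method at all". A simplified standalone model of that resolution (not the real libata code) follows; after the change the generic flavor simply has no hardreset method and error handling falls back to softreset.

#include <stdio.h>
#include <stddef.h>

typedef int (*reset_fn)(void);

/* Sentinel standing in for libata's ATA_OP_NULL: "explicitly no method",
 * as opposed to NULL, which means "inherit from the parent ops". */
static int op_none(void) { return -1; }
#define OP_NULL op_none

struct port_ops {
	const struct port_ops *inherits;
	reset_fn softreset;
	reset_fn hardreset;
};

static int base_softreset(void) { return 0; }
static int base_hardreset(void) { return 0; }

static const struct port_ops base_ops = {
	.softreset = base_softreset,
	.hardreset = base_hardreset,
};

/* NULL -> take the inherited method; OP_NULL -> end up with no method. */
static reset_fn resolve(reset_fn own, reset_fn inherited)
{
	if (own == OP_NULL)
		return NULL;
	return own ? own : inherited;
}

int main(void)
{
	struct port_ops nv_generic = {
		.inherits  = &base_ops,
		.hardreset = OP_NULL,	/* mirrors ".hardreset = ATA_OP_NULL" */
	};

	printf("softreset: %s\n",
	       resolve(nv_generic.softreset, base_ops.softreset) ? "inherited" : "none");
	printf("hardreset: %s\n",
	       resolve(nv_generic.hardreset, base_ops.hardreset) ? "present" : "disabled");
	return 0;
}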
Signed-off-by: Tejun Heo Cc: Peer Chen Signed-off-by: Jeff Garzik --- drivers/ata/sata_nv.c | 19 +------------------ 1 file changed, 1 insertion(+), 18 deletions(-) diff --git a/drivers/ata/sata_nv.c b/drivers/ata/sata_nv.c index 858f70610ed..1e1f3f3757a 100644 --- a/drivers/ata/sata_nv.c +++ b/drivers/ata/sata_nv.c @@ -309,8 +309,6 @@ static void nv_nf2_freeze(struct ata_port *ap); static void nv_nf2_thaw(struct ata_port *ap); static void nv_ck804_freeze(struct ata_port *ap); static void nv_ck804_thaw(struct ata_port *ap); -static int nv_hardreset(struct ata_link *link, unsigned int *class, - unsigned long deadline); static int nv_adma_slave_config(struct scsi_device *sdev); static int nv_adma_check_atapi_dma(struct ata_queued_cmd *qc); static void nv_adma_qc_prep(struct ata_queued_cmd *qc); @@ -407,7 +405,7 @@ static struct scsi_host_template nv_swncq_sht = { static struct ata_port_operations nv_generic_ops = { .inherits = &ata_bmdma_port_ops, - .hardreset = nv_hardreset, + .hardreset = ATA_OP_NULL, .scr_read = nv_scr_read, .scr_write = nv_scr_write, }; @@ -1588,21 +1586,6 @@ static void nv_mcp55_thaw(struct ata_port *ap) ata_sff_thaw(ap); } -static int nv_hardreset(struct ata_link *link, unsigned int *class, - unsigned long deadline) -{ - int rc; - - /* SATA hardreset fails to retrieve proper device signature on - * some controllers. Request follow up SRST. For more info, - * see http://bugzilla.kernel.org/show_bug.cgi?id=3352 - */ - rc = sata_sff_hardreset(link, class, deadline); - if (rc) - return rc; - return -EAGAIN; -} - static void nv_adma_error_handler(struct ata_port *ap) { struct nv_adma_port_priv *pp = ap->private_data; -- cgit v1.2.3 From 9c2676b61a5a4b6d99e65fb2f438fb3914302eda Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Fri, 29 Aug 2008 16:27:43 +0200 Subject: libata-sff: kill spurious WARN_ON() in ata_hsm_move() On HSM_ST_ERR, ata_hsm_move() triggers WARN_ON() if AC_ERR_DEV or AC_ERR_HSM is not set. PHY events may trigger HSM_ST_ERR with other error codes and, with or without it, there just isn't much reason to do WARN_ON() on it. Even if error code is not set there, core EH logic won't have any problem dealing with the error condition. OSDL bz#11065 reports this problem. Signed-off-by: Tejun Heo Signed-off-by: Jeff Garzik --- drivers/ata/libata-sff.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index 304fdc6f1dc..2a4c516894f 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -1315,11 +1315,6 @@ fsm_start: break; case HSM_ST_ERR: - /* make sure qc->err_mask is available to - * know what's wrong and recover - */ - WARN_ON(!(qc->err_mask & (AC_ERR_DEV | AC_ERR_HSM))); - ap->hsm_task_state = HSM_ST_IDLE; /* complete taskfile transaction */ -- cgit v1.2.3 From de058cdea65842ed4bf17da6b50d6fe6b120a6ef Mon Sep 17 00:00:00 2001 From: David Milburn Date: Fri, 29 Aug 2008 10:36:28 -0500 Subject: pata_sil680: remove duplicate pcim_enable_device Remove duplicate call to pcim_enable_device in sil680_init_one. 
Signed-off-by: David Milburn Signed-off-by: Jeff Garzik --- drivers/ata/pata_sil680.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/ata/pata_sil680.c b/drivers/ata/pata_sil680.c index 720b8645f58..e970b227fbc 100644 --- a/drivers/ata/pata_sil680.c +++ b/drivers/ata/pata_sil680.c @@ -322,9 +322,6 @@ static int __devinit sil680_init_one(struct pci_dev *pdev, /* Try to acquire MMIO resources and fallback to PIO if * that fails */ - rc = pcim_enable_device(pdev); - if (rc) - return rc; rc = pcim_iomap_regions(pdev, 1 << SIL680_MMIO_BAR, DRV_NAME); if (rc) goto use_ioports; -- cgit v1.2.3 From 8e48b6b307085ce8a747cf94294742f7b7a11b18 Mon Sep 17 00:00:00 2001 From: Seth Heasley Date: Wed, 27 Aug 2008 16:47:22 -0700 Subject: ahci: RAID mode SATA patch for Intel Ibex Peak DeviceIDs Add the Intel Ibex Peak (PCH) SATA RAID Controller DeviceIDs. Signed-off-by: Seth Heasley Signed-off-by: Jeff Garzik --- drivers/ata/ahci.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c index 66aa146bf4d..2e1a7cb2ed5 100644 --- a/drivers/ata/ahci.c +++ b/drivers/ata/ahci.c @@ -487,7 +487,9 @@ static const struct pci_device_id ahci_pci_tbl[] = { { PCI_VDEVICE(INTEL, 0x3a05), board_ahci }, /* ICH10 */ { PCI_VDEVICE(INTEL, 0x3a25), board_ahci }, /* ICH10 */ { PCI_VDEVICE(INTEL, 0x3b24), board_ahci }, /* PCH RAID */ + { PCI_VDEVICE(INTEL, 0x3b25), board_ahci }, /* PCH RAID */ { PCI_VDEVICE(INTEL, 0x3b2b), board_ahci }, /* PCH RAID */ + { PCI_VDEVICE(INTEL, 0x3b2c), board_ahci }, /* PCH RAID */ /* JMicron 360/1/3/5/6, match class to avoid IDE function */ { PCI_VENDOR_ID_JMICRON, PCI_ANY_ID, PCI_ANY_ID, PCI_ANY_ID, -- cgit v1.2.3 From 4a911b1efe219fa3c8af697be0054c72e13bdae4 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Thu, 4 Sep 2008 23:05:45 +0200 Subject: [MIPS] IP22: Fix detection of second HPC3 on Challenge S The second HPC3 could be found only on Guiness systems (Challenge-S), but not on fullhouse (Indigo2) systems. Signed-off-by: Thomas Bogendoerfer Signed-off-by: Ralf Baechle --- arch/mips/sgi-ip22/ip22-platform.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/sgi-ip22/ip22-platform.c b/arch/mips/sgi-ip22/ip22-platform.c index 60141235ec4..52486c4d2b0 100644 --- a/arch/mips/sgi-ip22/ip22-platform.c +++ b/arch/mips/sgi-ip22/ip22-platform.c @@ -150,7 +150,7 @@ static int __init sgiseeq_devinit(void) return res; /* Second HPC is missing? */ - if (!ip22_is_fullhouse() || + if (ip22_is_fullhouse() || get_dbe(tmp, (unsigned int *)&hpc3c1->pbdma[1])) return 0; -- cgit v1.2.3 From 0253398ca1df7e1d2bfbb452175c964a0862482c Mon Sep 17 00:00:00 2001 From: Jeremy Fitzhardinge Date: Mon, 8 Sep 2008 09:54:49 -0700 Subject: xen: fix 2.6.27-rc5 xen balloon driver warnings Set the class so it doesn't clash with the normal memory class. Signed-off-by: Jeremy Fitzhardinge Signed-off-by: Ingo Molnar =================================================================== --- drivers/xen/balloon.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/xen/balloon.c b/drivers/xen/balloon.c index d4427cb8697..2e15da5459c 100644 --- a/drivers/xen/balloon.c +++ b/drivers/xen/balloon.c @@ -60,7 +60,7 @@ #define PAGES2KB(_p) ((_p)<<(PAGE_SHIFT-10)) -#define BALLOON_CLASS_NAME "memory" +#define BALLOON_CLASS_NAME "xen_memory" struct balloon_stats { /* We aim for 'current allocation' == 'target allocation'. 
*/ -- cgit v1.2.3 From 14469a8dd23677921db5e7354a602c98d9c6300f Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Fri, 5 Sep 2008 09:30:14 -0700 Subject: x86: disable static NOPLs on 32 bits On 32-bit, at least the generic nops are fairly reasonable, but the default nops for 64-bit really look pretty sad, and the P6 nops really do look better. So I would suggest perhaps moving the static P6 nop selection into the CONFIG_X86_64 thing. The alternative is to just get rid of that static nop selection, and just have two cases: 32-bit and 64-bit, and just pick obviously safe cases for them. Signed-off-by: H. Peter Anvin --- arch/x86/Kconfig.cpu | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu index 2c518fbc52e..b225219c448 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu @@ -382,14 +382,17 @@ config X86_OOSTORE # P6_NOPs are a relatively minor optimization that require a family >= # 6 processor, except that it is broken on certain VIA chips. # Furthermore, AMD chips prefer a totally different sequence of NOPs -# (which work on all CPUs). As a result, disallow these if we're -# compiling X86_GENERIC but not X86_64 (these NOPs do work on all -# x86-64 capable chips); the list of processors in the right-hand clause -# are the cores that benefit from this optimization. +# (which work on all CPUs). In addition, it looks like Virtual PC +# does not understand them. +# +# As a result, disallow these if we're not compiling for X86_64 (these +# NOPs do work on all x86-64 capable chips); the list of processors in +# the right-hand clause are the cores that benefit from this optimization. # config X86_P6_NOP def_bool y - depends on (X86_64 || !X86_GENERIC) && (M686 || MPENTIUMII || MPENTIUMIII || MPENTIUMM || MCORE2 || MPENTIUM4 || MPSC) + depends on X86_64 + depends on (MCORE2 || MPENTIUM4 || MPSC) config X86_TSC def_bool y -- cgit v1.2.3 From d315492b1a6ba29da0fa2860759505ae1b2db857 Mon Sep 17 00:00:00 2001 From: Daniel Lezcano Date: Mon, 8 Sep 2008 13:17:27 -0700 Subject: netns : fix kernel panic in timewait socket destruction How to reproduce ? - create a network namespace - use tcp protocol and get timewait socket - exit the network namespace - after a moment (when the timewait socket is destroyed), the kernel panics. 
# BUG: unable to handle kernel NULL pointer dereference at 0000000000000007 IP: [] inet_twdr_do_twkill_work+0x6e/0xb8 PGD 119985067 PUD 11c5c0067 PMD 0 Oops: 0000 [1] SMP CPU 1 Modules linked in: ipv6 button battery ac loop dm_mod tg3 libphy ext3 jbd edd fan thermal processor thermal_sys sg sata_svw libata dock serverworks sd_mod scsi_mod ide_disk ide_core [last unloaded: freq_table] Pid: 0, comm: swapper Not tainted 2.6.27-rc2 #3 RIP: 0010:[] [] inet_twdr_do_twkill_work+0x6e/0xb8 RSP: 0018:ffff88011ff7fed0 EFLAGS: 00010246 RAX: ffffffffffffffff RBX: ffffffff82339420 RCX: ffff88011ff7ff30 RDX: 0000000000000001 RSI: ffff88011a4d03c0 RDI: ffff88011ac2fc00 RBP: ffffffff823392e0 R08: 0000000000000000 R09: ffff88002802a200 R10: ffff8800a5c4b000 R11: ffffffff823e4080 R12: ffff88011ac2fc00 R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000000 FS: 0000000041cbd940(0000) GS:ffff8800bff839c0(0000) knlGS:0000000000000000 CS: 0010 DS: 0018 ES: 0018 CR0: 000000008005003b CR2: 0000000000000007 CR3: 00000000bd87c000 CR4: 00000000000006e0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000ffff0ff0 DR7: 0000000000000400 Process swapper (pid: 0, threadinfo ffff8800bff9e000, task ffff88011ff76690) Stack: ffffffff823392e0 0000000000000100 ffffffff821e3a3a 0000000000000008 0000000000000000 ffffffff821e3a61 ffff8800bff7c000 ffffffff8203c7e7 ffff88011ff7ff10 ffff88011ff7ff10 0000000000000021 ffffffff82351108 Call Trace: [] ? inet_twdr_hangman+0x0/0x9e [] ? inet_twdr_hangman+0x27/0x9e [] ? run_timer_softirq+0x12c/0x193 [] ? __do_softirq+0x5e/0xcd [] ? call_softirq+0x1c/0x28 [] ? do_softirq+0x2c/0x68 [] ? smp_apic_timer_interrupt+0x8e/0xa9 [] ? apic_timer_interrupt+0x66/0x70 [] ? default_idle+0x27/0x3b [] ? cpu_idle+0x5f/0x7d Code: e8 01 00 00 4c 89 e7 41 ff c5 e8 8d fd ff ff 49 8b 44 24 38 4c 89 e7 65 8b 14 25 24 00 00 00 89 d2 48 8b 80 e8 00 00 00 48 f7 d0 <48> 8b 04 d0 48 ff 40 58 e8 fc fc ff ff 48 89 df e8 c0 5f 04 00 RIP [] inet_twdr_do_twkill_work+0x6e/0xb8 RSP CR2: 0000000000000007 This patch provides a function to purge all timewait sockets related to a network namespace. The timewait sockets life cycle is not tied with the network namespace, that means the timewait sockets stay alive while the network namespace dies. The timewait sockets are for avoiding to receive a duplicate packet from the network, if the network namespace is freed, the network stack is removed, so no chance to receive any packets from the outside world. Furthermore, having a pending destruction timer on these sockets with a network namespace freed is not safe and will lead to an oops if the timer callback which try to access data belonging to the namespace like for example in: inet_twdr_do_twkill_work -> NET_INC_STATS_BH(twsk_net(tw), LINUX_MIB_TIMEWAITED); Purging the timewait sockets at the network namespace destruction will: 1) speed up memory freeing for the namespace 2) fix kernel panic on asynchronous timewait destruction Signed-off-by: Daniel Lezcano Acked-by: Denis V. Lunev Acked-by: Eric W. Biederman Signed-off-by: David S. 
Miller --- include/net/inet_timewait_sock.h | 3 +++ net/ipv4/inet_timewait_sock.c | 35 +++++++++++++++++++++++++++++++++++ net/ipv4/tcp_ipv4.c | 1 + net/ipv6/tcp_ipv6.c | 1 + 4 files changed, 40 insertions(+) diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h index 95c660c9719..91324908fcc 100644 --- a/include/net/inet_timewait_sock.h +++ b/include/net/inet_timewait_sock.h @@ -208,6 +208,9 @@ extern void inet_twsk_schedule(struct inet_timewait_sock *tw, extern void inet_twsk_deschedule(struct inet_timewait_sock *tw, struct inet_timewait_death_row *twdr); +extern void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, + struct inet_timewait_death_row *twdr, int family); + static inline struct net *twsk_net(const struct inet_timewait_sock *twsk) { diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index d985bd613d2..743f011b9a8 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -409,3 +409,38 @@ out: } EXPORT_SYMBOL_GPL(inet_twdr_twcal_tick); + +void inet_twsk_purge(struct net *net, struct inet_hashinfo *hashinfo, + struct inet_timewait_death_row *twdr, int family) +{ + struct inet_timewait_sock *tw; + struct sock *sk; + struct hlist_node *node; + int h; + + local_bh_disable(); + for (h = 0; h < (hashinfo->ehash_size); h++) { + struct inet_ehash_bucket *head = + inet_ehash_bucket(hashinfo, h); + rwlock_t *lock = inet_ehash_lockp(hashinfo, h); +restart: + write_lock(lock); + sk_for_each(sk, node, &head->twchain) { + + tw = inet_twsk(sk); + if (!net_eq(twsk_net(tw), net) || + tw->tw_family != family) + continue; + + atomic_inc(&tw->tw_refcnt); + write_unlock(lock); + inet_twsk_deschedule(tw, twdr); + inet_twsk_put(tw); + + goto restart; + } + write_unlock(lock); + } + local_bh_enable(); +} +EXPORT_SYMBOL_GPL(inet_twsk_purge); diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 44c1e934824..1b4fee20fc9 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -2376,6 +2376,7 @@ static int __net_init tcp_sk_init(struct net *net) static void __net_exit tcp_sk_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv4.tcp_sock); + inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET); } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 5b90b369ccb..b585c850a89 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -2148,6 +2148,7 @@ static int tcpv6_net_init(struct net *net) static void tcpv6_net_exit(struct net *net) { inet_ctl_sock_destroy(net->ipv6.tcp_sk); + inet_twsk_purge(net, &tcp_hashinfo, &tcp_death_row, AF_INET6); } static struct pernet_operations tcpv6_net_ops = { -- cgit v1.2.3 From 8d4698f7a54a492a1b96c505b30fe750ae3e61d5 Mon Sep 17 00:00:00 2001 From: Stephen Hemminger Date: Mon, 8 Sep 2008 13:44:40 -0700 Subject: bridge: don't allow setting hello time to zero Dushan Tcholich reports that on his system ksoftirqd can consume between %6 to %10 of cpu time, and cause ~200 context switches per second. He then correlated this with a report by bdupree@techfinesse.com: http://marc.info/?l=linux-kernel&m=119613299024398&w=2 and the culprit cause seems to be starting the bridge interface. In particular, when starting the bridge interface, his scripts are specifying a hello timer interval of "0". The bridge hello time can't be safely set to values less than 1 second, otherwise it is possible to end up with a runaway timer. Signed-off-by: Stephen Hemminger Signed-off-by: David S. 
Miller --- net/bridge/br_ioctl.c | 8 +++++++- net/bridge/br_sysfs_br.c | 26 ++++++++++++++++++-------- 2 files changed, 25 insertions(+), 9 deletions(-) diff --git a/net/bridge/br_ioctl.c b/net/bridge/br_ioctl.c index eeee218eed8..5bbf0736217 100644 --- a/net/bridge/br_ioctl.c +++ b/net/bridge/br_ioctl.c @@ -188,15 +188,21 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd) return 0; case BRCTL_SET_BRIDGE_HELLO_TIME: + { + unsigned long t = clock_t_to_jiffies(args[1]); if (!capable(CAP_NET_ADMIN)) return -EPERM; + if (t < HZ) + return -EINVAL; + spin_lock_bh(&br->lock); - br->bridge_hello_time = clock_t_to_jiffies(args[1]); + br->bridge_hello_time = t; if (br_is_root_bridge(br)) br->hello_time = br->bridge_hello_time; spin_unlock_bh(&br->lock); return 0; + } case BRCTL_SET_BRIDGE_MAX_AGE: if (!capable(CAP_NET_ADMIN)) diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c index 27d6a511c8c..158dee8b496 100644 --- a/net/bridge/br_sysfs_br.c +++ b/net/bridge/br_sysfs_br.c @@ -29,11 +29,12 @@ */ static ssize_t store_bridge_parm(struct device *d, const char *buf, size_t len, - void (*set)(struct net_bridge *, unsigned long)) + int (*set)(struct net_bridge *, unsigned long)) { struct net_bridge *br = to_bridge(d); char *endp; unsigned long val; + int err; if (!capable(CAP_NET_ADMIN)) return -EPERM; @@ -43,9 +44,9 @@ static ssize_t store_bridge_parm(struct device *d, return -EINVAL; spin_lock_bh(&br->lock); - (*set)(br, val); + err = (*set)(br, val); spin_unlock_bh(&br->lock); - return len; + return err ? err : len; } @@ -56,12 +57,13 @@ static ssize_t show_forward_delay(struct device *d, return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->forward_delay)); } -static void set_forward_delay(struct net_bridge *br, unsigned long val) +static int set_forward_delay(struct net_bridge *br, unsigned long val) { unsigned long delay = clock_t_to_jiffies(val); br->forward_delay = delay; if (br_is_root_bridge(br)) br->bridge_forward_delay = delay; + return 0; } static ssize_t store_forward_delay(struct device *d, @@ -80,12 +82,17 @@ static ssize_t show_hello_time(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->hello_time)); } -static void set_hello_time(struct net_bridge *br, unsigned long val) +static int set_hello_time(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); + + if (t < HZ) + return -EINVAL; + br->hello_time = t; if (br_is_root_bridge(br)) br->bridge_hello_time = t; + return 0; } static ssize_t store_hello_time(struct device *d, @@ -104,12 +111,13 @@ static ssize_t show_max_age(struct device *d, struct device_attribute *attr, jiffies_to_clock_t(to_bridge(d)->max_age)); } -static void set_max_age(struct net_bridge *br, unsigned long val) +static int set_max_age(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); br->max_age = t; if (br_is_root_bridge(br)) br->bridge_max_age = t; + return 0; } static ssize_t store_max_age(struct device *d, struct device_attribute *attr, @@ -126,9 +134,10 @@ static ssize_t show_ageing_time(struct device *d, return sprintf(buf, "%lu\n", jiffies_to_clock_t(br->ageing_time)); } -static void set_ageing_time(struct net_bridge *br, unsigned long val) +static int set_ageing_time(struct net_bridge *br, unsigned long val) { br->ageing_time = clock_t_to_jiffies(val); + return 0; } static ssize_t store_ageing_time(struct device *d, @@ -180,9 +189,10 @@ static ssize_t show_priority(struct device *d, struct device_attribute *attr, 
(br->bridge_id.prio[0] << 8) | br->bridge_id.prio[1]); } -static void set_priority(struct net_bridge *br, unsigned long val) +static int set_priority(struct net_bridge *br, unsigned long val) { br_stp_set_bridge_priority(br, (u16) val); + return 0; } static ssize_t store_priority(struct device *d, struct device_attribute *attr, -- cgit v1.2.3 From af904deaf6da3f3285eb0a06a3dc6a1af0251030 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Mon, 8 Sep 2008 11:58:13 -0400 Subject: NFS: Restore missing hunk in NFS mount option parser Automounter maps can contain mount options valid for other NFS implementations but not for Linux. The Linux automounter uses the mount command's "-s" command line option ("s" for "sloppy") so that mount requests containing such options are not rejected. Commit f45663ce5fb30f76a3414ab3ac69f4dd320e760a attempted to address a known regression with text-based NFS mount option parsing. Unrecognized mount options would cause mount requests to fail, even if the "-s" option was used on the mount command line. Unfortunately, this commit was not complete as submitted. It adds a new mount option, "sloppy". But it is missing a hunk, so it now allows NFS mounts with unrecognized mount options, even if the "sloppy" option is not present. This could be a problem if a required critical mount option such as "sync" is misspelled, for example, and is considered a regression from 2.6.26. This patch restores the missing hunk. Now, the default behavior of text-based NFS mount options is as before: any unrecognized mount option will cause the mount to fail. Please include this in 2.6.27-rc. Thanks to Neil Brown for reporting this. Signed-off-by: Chuck Lever Acked-by: J. Bruce Fields Signed-off-by: Linus Torvalds --- fs/nfs/super.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9abcd2b329f..e9b20173fef 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1279,6 +1279,12 @@ static int nfs_parse_mount_options(char *raw, } } + if (errors > 0) { + dfprintk(MOUNT, "NFS: parsing encountered %d error%s\n", + errors, (errors == 1 ? "" : "s")); + if (!sloppy) + return 0; + } return 1; out_nomem: -- cgit v1.2.3 From 06dd881f59b3c07a430cdcbef2197f9b6dc79ae8 Mon Sep 17 00:00:00 2001 From: Jason Wessel Date: Mon, 8 Sep 2008 14:53:37 +0100 Subject: usb: fix null deferences in low level usb serial The hw interface drivers for the usb serial devices deference the tty structure to set up the parameters for the initial console. The tty structure should be passed as a parameter to the set_termios() call. Signed-off-by: Jason Wessel Signed-off-by: Alan Cox Signed-off-by: Linus Torvalds --- drivers/usb/serial/console.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c index 7b74238ad1c..e980766bb84 100644 --- a/drivers/usb/serial/console.c +++ b/drivers/usb/serial/console.c @@ -161,7 +161,7 @@ static int usb_console_setup(struct console *co, char *options) if (serial->type->set_termios) { termios->c_cflag = cflag; tty_termios_encode_baud_rate(termios, baud, baud); - serial->type->set_termios(NULL, port, &dummy); + serial->type->set_termios(tty, port, &dummy); port->port.tty = NULL; kfree(termios); -- cgit v1.2.3 From a46add72f79bb8196f07a860adddd312ca398eec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Uwe=20Kleine-K=C3=B6nig?= Date: Tue, 9 Sep 2008 00:11:39 +0200 Subject: Fix format of MAINTAINERS MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit ... 
one entry lacked a colon which broke one of my scripts. Signed-off-by: Uwe Kleine-König Signed-off-by: Linus Torvalds --- MAINTAINERS | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index af279458b61..b3e92fbe336 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -1593,7 +1593,7 @@ S: Supported EMBEDDED LINUX P: Paul Gortmaker M: paul.gortmaker@windriver.com -P David Woodhouse +P: David Woodhouse M: dwmw2@infradead.org L: linux-embedded@vger.kernel.org S: Maintained -- cgit v1.2.3 From 2eb2f77900d62796934bcd43c4089e444cf1179e Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Mon, 8 Sep 2008 17:21:07 -0700 Subject: sparc64: Disable timer interrupts in fixup_irqs(). When a CPU is offlined, we leave the timer interrupts disabled because fixup_irqs() does not explicitly take care of that case. Fix this by invoking tick_ops->disable_irq(). Based upon analysis done by Paul E. McKenney. Signed-off-by: David S. Miller --- arch/sparc64/kernel/irq.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sparc64/kernel/irq.c b/arch/sparc64/kernel/irq.c index 9b6689d9d57..23963882bc1 100644 --- a/arch/sparc64/kernel/irq.c +++ b/arch/sparc64/kernel/irq.c @@ -792,6 +792,8 @@ void fixup_irqs(void) } spin_unlock_irqrestore(&irq_desc[irq].lock, flags); } + + tick_ops->disable_irq(); } #endif -- cgit v1.2.3 From f1c08ca559387ab30992055596d54061dfa022b1 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 9 Sep 2008 07:19:19 +0200 Subject: [Bluetooth] Fix reference counting during ACL config stage The ACL config stage keeps holding a reference count on incoming connections when requesting the extended features. This results in keeping an ACL link up without any users. The problem here is that the Bluetooth specification doesn't define an ownership of the ACL link and thus it can happen that the implementation on the initiator side doesn't care about disconnecting unused links. In this case the acceptor needs to take care of this. Signed-off-by: Marcel Holtmann --- net/bluetooth/hci_event.c | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 0e3db289f4b..ad7a553d771 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -1605,14 +1605,11 @@ static inline void hci_remote_ext_features_evt(struct hci_dev *hdev, struct sk_b if (conn->state == BT_CONFIG) { if (!ev->status && hdev->ssp_mode > 0 && - conn->ssp_mode > 0) { - if (conn->out) { - struct hci_cp_auth_requested cp; - cp.handle = ev->handle; - hci_send_cmd(hdev, - HCI_OP_AUTH_REQUESTED, + conn->ssp_mode > 0 && conn->out) { + struct hci_cp_auth_requested cp; + cp.handle = ev->handle; + hci_send_cmd(hdev, HCI_OP_AUTH_REQUESTED, sizeof(cp), &cp); - } } else { conn->state = BT_CONNECTED; hci_proto_connect_cfm(conn, ev->status); -- cgit v1.2.3 From 09ab6f4c2376a0fc31abde1e2991513f900ea825 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 9 Sep 2008 07:19:20 +0200 Subject: [Bluetooth] Enforce correct authentication requirements With the introduction of Security Mode 4 and Simple Pairing from the Bluetooth 2.1 specification it became mandatory that the initiator requires authentication and encryption before any L2CAP channel can be established. The only exception here is PSM 1 for the service discovery protocol (SDP). It is meant to be used without any encryption since it contains only public information. This is how Bluetooth 2.0 and before handle connections on PSM 1. 
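The requirements described here and in the following paragraphs reduce to a small selection of the authentication requirement before hci_connect() is called. Below is a standalone sketch of that selection, reusing the HCI_AT_* names from the patch further down; the enum values and the example PSM in main() are placeholders for illustration, not the real header values.

#include <stdio.h>
#include <stdint.h>

/* Placeholder values; the real constants live in the Bluetooth headers. */
enum {
	HCI_AT_NO_BONDING,
	HCI_AT_NO_BONDING_MITM,
	HCI_AT_GENERAL_BONDING,
	HCI_AT_GENERAL_BONDING_MITM,
};

#define PSM_SDP		0x0001	/* service discovery: public data only */

/* Mirrors the decision l2cap_do_connect() makes in the patch below:
 * SDP never bonds, everything else uses general bonding, and MITM
 * protection is requested only when the socket asked for security. */
static uint8_t l2cap_auth_type(uint16_t psm, int wants_security)
{
	if (psm == PSM_SDP)
		return wants_security ? HCI_AT_NO_BONDING_MITM
				      : HCI_AT_NO_BONDING;
	return wants_security ? HCI_AT_GENERAL_BONDING_MITM
			      : HCI_AT_GENERAL_BONDING;
}

int main(void)
{
	printf("SDP, no security flags    -> %d\n", l2cap_auth_type(PSM_SDP, 0));
	printf("other PSM, AUTH requested -> %d\n", l2cap_auth_type(0x0003, 1));
	return 0;
}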
For Bluetooth 2.1 devices the pairing procedure differentiates between no bonding, general bonding and dedicated bonding. The L2CAP layer wrongly uses always general bonding when creating new connections, but it should not do this for SDP connections. In this case the authentication requirement should be no bonding and the just-works model should be used, but in case of non-SDP connection it is required to use general bonding. If the new connection requires man-in-the-middle (MITM) protection, it also first wrongly creates an unauthenticated link key and then later on requests an upgrade to an authenticated link key to provide full MITM protection. With Simple Pairing the link key generation is an expensive operation (compared to Bluetooth 2.0 and before) and doing this twice during a connection setup causes a noticeable delay when establishing a new connection. This should be avoided to not regress from the expected Bluetooth 2.0 connection times. The authentication requirements are known up-front and so enforce them. To fulfill these requirements the hci_connect() function has been extended with an authentication requirement parameter that will be stored inside the connection information and can be retrieved by userspace at any time. This allows the correct IO capabilities exchange and results in the expected behavior. Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 2 +- net/bluetooth/hci_conn.c | 8 +++++--- net/bluetooth/l2cap.c | 19 +++++++++++++++++-- net/bluetooth/sco.c | 2 +- 4 files changed, 24 insertions(+), 7 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index cbf75109468..5e785b968e7 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -325,7 +325,7 @@ int hci_conn_del(struct hci_conn *conn); void hci_conn_hash_flush(struct hci_dev *hdev); void hci_conn_check_pending(struct hci_dev *hdev); -struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *src); +struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 auth_type); int hci_conn_auth(struct hci_conn *conn); int hci_conn_encrypt(struct hci_conn *conn); int hci_conn_change_link_key(struct hci_conn *conn); diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index ca8d05245ca..a2f9efaa336 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -330,7 +330,7 @@ EXPORT_SYMBOL(hci_get_route); /* Create SCO or ACL connection. * Device _must_ be locked */ -struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst) +struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 auth_type) { struct hci_conn *acl; struct hci_conn *sco; @@ -344,8 +344,10 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst) hci_conn_hold(acl); - if (acl->state == BT_OPEN || acl->state == BT_CLOSED) + if (acl->state == BT_OPEN || acl->state == BT_CLOSED) { + acl->auth_type = auth_type; hci_acl_connect(acl); + } if (type == ACL_LINK) return acl; @@ -381,7 +383,7 @@ int hci_conn_auth(struct hci_conn *conn) if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0) { if (!(conn->auth_type & 0x01)) { - conn->auth_type = HCI_AT_GENERAL_BONDING_MITM; + conn->auth_type |= 0x01; conn->link_mode &= ~HCI_LM_AUTH; } } diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index 3396d5bdef1..a96d6de80d1 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -55,7 +55,7 @@ #define BT_DBG(D...) 
#endif -#define VERSION "2.10" +#define VERSION "2.11" static u32 l2cap_feat_mask = 0x0000; @@ -778,6 +778,7 @@ static int l2cap_do_connect(struct sock *sk) struct l2cap_conn *conn; struct hci_conn *hcon; struct hci_dev *hdev; + __u8 auth_type; int err = 0; BT_DBG("%s -> %s psm 0x%2.2x", batostr(src), batostr(dst), l2cap_pi(sk)->psm); @@ -789,7 +790,21 @@ static int l2cap_do_connect(struct sock *sk) err = -ENOMEM; - hcon = hci_connect(hdev, ACL_LINK, dst); + if (l2cap_pi(sk)->link_mode & L2CAP_LM_AUTH || + l2cap_pi(sk)->link_mode & L2CAP_LM_ENCRYPT || + l2cap_pi(sk)->link_mode & L2CAP_LM_SECURE) { + if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) + auth_type = HCI_AT_NO_BONDING_MITM; + else + auth_type = HCI_AT_GENERAL_BONDING_MITM; + } else { + if (l2cap_pi(sk)->psm == cpu_to_le16(0x0001)) + auth_type = HCI_AT_NO_BONDING; + else + auth_type = HCI_AT_GENERAL_BONDING; + } + + hcon = hci_connect(hdev, ACL_LINK, dst, auth_type); if (!hcon) goto done; diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c index a16011fedc1..0cc91e6da76 100644 --- a/net/bluetooth/sco.c +++ b/net/bluetooth/sco.c @@ -200,7 +200,7 @@ static int sco_connect(struct sock *sk) else type = SCO_LINK; - hcon = hci_connect(hdev, type, dst); + hcon = hci_connect(hdev, type, dst, HCI_AT_NO_BONDING); if (!hcon) goto done; -- cgit v1.2.3 From e7c29cb16c833441fd2160642bb13025f4e7ac70 Mon Sep 17 00:00:00 2001 From: Marcel Holtmann Date: Tue, 9 Sep 2008 07:19:20 +0200 Subject: [Bluetooth] Reject L2CAP connections on an insecure ACL link The Security Mode 4 of the Bluetooth 2.1 specification has strict authentication and encryption requirements. It is the initiators job to create a secure ACL link. However in case of malicious devices, the acceptor has to make sure that the ACL is encrypted before allowing any kind of L2CAP connection. The only exception here is the PSM 1 for the service discovery protocol, because that is allowed to run on an insecure ACL link. Previously it was enough to reject a L2CAP connection during the connection setup phase, but with Bluetooth 2.1 it is forbidden to do any L2CAP protocol exchange on an insecure link (except SDP). The new hci_conn_check_link_mode() function can be used to check the integrity of an ACL link. This functions also takes care of the cases where Security Mode 4 is disabled or one of the devices is based on an older specification. Signed-off-by: Marcel Holtmann --- include/net/bluetooth/hci_core.h | 1 + net/bluetooth/af_bluetooth.c | 2 +- net/bluetooth/hci_conn.c | 13 +++++++++++++ net/bluetooth/l2cap.c | 15 +++++++++++---- 4 files changed, 26 insertions(+), 5 deletions(-) diff --git a/include/net/bluetooth/hci_core.h b/include/net/bluetooth/hci_core.h index 5e785b968e7..46a43b721dd 100644 --- a/include/net/bluetooth/hci_core.h +++ b/include/net/bluetooth/hci_core.h @@ -326,6 +326,7 @@ void hci_conn_hash_flush(struct hci_dev *hdev); void hci_conn_check_pending(struct hci_dev *hdev); struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 auth_type); +int hci_conn_check_link_mode(struct hci_conn *conn); int hci_conn_auth(struct hci_conn *conn); int hci_conn_encrypt(struct hci_conn *conn); int hci_conn_change_link_key(struct hci_conn *conn); diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c index 1edfdf4c095..f6348e078aa 100644 --- a/net/bluetooth/af_bluetooth.c +++ b/net/bluetooth/af_bluetooth.c @@ -49,7 +49,7 @@ #define BT_DBG(D...) 
#endif -#define VERSION "2.12" +#define VERSION "2.13" /* Bluetooth sockets */ #define BT_MAX_PROTO 8 diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index a2f9efaa336..b7002429f15 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -376,6 +376,19 @@ struct hci_conn *hci_connect(struct hci_dev *hdev, int type, bdaddr_t *dst, __u8 } EXPORT_SYMBOL(hci_connect); +/* Check link security requirement */ +int hci_conn_check_link_mode(struct hci_conn *conn) +{ + BT_DBG("conn %p", conn); + + if (conn->ssp_mode > 0 && conn->hdev->ssp_mode > 0 && + !(conn->link_mode & HCI_LM_ENCRYPT)) + return 0; + + return 1; +} +EXPORT_SYMBOL(hci_conn_check_link_mode); + /* Authenticate remote device */ int hci_conn_auth(struct hci_conn *conn) { diff --git a/net/bluetooth/l2cap.c b/net/bluetooth/l2cap.c index a96d6de80d1..9610a9c85b9 100644 --- a/net/bluetooth/l2cap.c +++ b/net/bluetooth/l2cap.c @@ -1568,10 +1568,10 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd struct l2cap_conn_req *req = (struct l2cap_conn_req *) data; struct l2cap_conn_rsp rsp; struct sock *sk, *parent; - int result, status = 0; + int result, status = L2CAP_CS_NO_INFO; u16 dcid = 0, scid = __le16_to_cpu(req->scid); - __le16 psm = req->psm; + __le16 psm = req->psm; BT_DBG("psm 0x%2.2x scid 0x%4.4x", psm, scid); @@ -1582,6 +1582,13 @@ static inline int l2cap_connect_req(struct l2cap_conn *conn, struct l2cap_cmd_hd goto sendresp; } + /* Check if the ACL is secure enough (if not SDP) */ + if (psm != cpu_to_le16(0x0001) && + !hci_conn_check_link_mode(conn->hcon)) { + result = L2CAP_CR_SEC_BLOCK; + goto response; + } + result = L2CAP_CR_NO_MEM; /* Check for backlog size */ @@ -2239,7 +2246,7 @@ static int l2cap_auth_cfm(struct hci_conn *hcon, u8 status) rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); rsp.result = cpu_to_le16(result); - rsp.status = cpu_to_le16(0); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); } @@ -2311,7 +2318,7 @@ static int l2cap_encrypt_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) rsp.scid = cpu_to_le16(l2cap_pi(sk)->dcid); rsp.dcid = cpu_to_le16(l2cap_pi(sk)->scid); rsp.result = cpu_to_le16(result); - rsp.status = cpu_to_le16(0); + rsp.status = cpu_to_le16(L2CAP_CS_NO_INFO); l2cap_send_cmd(conn, l2cap_pi(sk)->ident, L2CAP_CONN_RSP, sizeof(rsp), &rsp); } -- cgit v1.2.3 From 3d6e48f43340343d97839eadb1ab7b6a3ea98797 Mon Sep 17 00:00:00 2001 From: Jarod Wilson Date: Tue, 9 Sep 2008 12:38:56 +0200 Subject: [S390] CVE-2008-1514: prevent ptrace padding area read/write in 31-bit mode When running a 31-bit ptrace, on either an s390 or s390x kernel, reads and writes into a padding area in struct user_regs_struct32 will result in a kernel panic. This is also known as CVE-2008-1514. Test case available here: http://sources.redhat.com/cgi-bin/cvsweb.cgi/~checkout~/tests/ptrace-tests/tests/user-area-padding.c?cvsroot=systemtap Steps to reproduce: 1) wget the above 2) gcc -o user-area-padding-31bit user-area-padding.c -Wall -ggdb2 -D_GNU_SOURCE -m31 3) ./user-area-padding-31bit Test status ----------- Without patch, both s390 and s390x kernels panic. With patch, the test case, as well as the gdb testsuite, pass without incident, padding area reads returning zero, writes ignored. 
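For readers wondering where the "padding area" comes from: a 4-byte member followed by an 8-byte-aligned member leaves a 4-byte hole on typical 64-bit ABIs. The user-space sketch below only mimics the shape of user_regs_struct32; the member names and types are stand-ins, not the real s390 layout.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct regs_like {
	uint32_t orig_gpr2;	/* 4-byte member ...                          */
	uint64_t fp_regs[2];	/* ... then an 8-byte-aligned block, so the   */
				/* compiler inserts a 4-byte hole in between  */
};

int main(void)
{
	size_t end_of_orig_gpr2 = offsetof(struct regs_like, orig_gpr2) + sizeof(uint32_t);
	size_t start_of_fp_regs = offsetof(struct regs_like, fp_regs);

	/* Prints 4 on typical LP64 targets (including s390x). */
	printf("hole between members: %zu bytes\n", start_of_fp_regs - end_of_orig_gpr2);
	return 0;
}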
Nb: original version returned -EINVAL on write attempts, which broke the gdb test and made the test case slightly unhappy, Jan Kratochvil suggested the change to return 0 on write attempts. Signed-off-by: Jarod Wilson Tested-by: Jan Kratochvil Signed-off-by: Martin Schwidefsky --- arch/s390/kernel/compat_ptrace.h | 1 + arch/s390/kernel/ptrace.c | 28 ++++++++++++++++++++++++++++ 2 files changed, 29 insertions(+) diff --git a/arch/s390/kernel/compat_ptrace.h b/arch/s390/kernel/compat_ptrace.h index cde81fa64f8..a2be3a978d5 100644 --- a/arch/s390/kernel/compat_ptrace.h +++ b/arch/s390/kernel/compat_ptrace.h @@ -42,6 +42,7 @@ struct user_regs_struct32 u32 gprs[NUM_GPRS]; u32 acrs[NUM_ACRS]; u32 orig_gpr2; + /* nb: there's a 4-byte hole here */ s390_fp_regs fp_regs; /* * These per registers are in here so that gdb can modify them diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c index 2815bfe348a..c8b08289eb8 100644 --- a/arch/s390/kernel/ptrace.c +++ b/arch/s390/kernel/ptrace.c @@ -170,6 +170,13 @@ static unsigned long __peek_user(struct task_struct *child, addr_t addr) */ tmp = (addr_t) task_pt_regs(child)->orig_gpr2; + } else if (addr < (addr_t) &dummy->regs.fp_regs) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure @@ -270,6 +277,13 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data) */ task_pt_regs(child)->orig_gpr2 = data; + } else if (addr < (addr_t) &dummy->regs.fp_regs) { + /* + * prevent writes of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + } else if (addr < (addr_t) (&dummy->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure @@ -428,6 +442,13 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr) */ tmp = *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4); + } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + /* + * prevent reads of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + tmp = 0; + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure @@ -514,6 +535,13 @@ static int __poke_user_compat(struct task_struct *child, */ *(__u32*)((addr_t) &task_pt_regs(child)->orig_gpr2 + 4) = tmp; + } else if (addr < (addr_t) &dummy32->regs.fp_regs) { + /* + * prevent writess of padding hole between + * orig_gpr2 and fp_regs on s390. + */ + return 0; + } else if (addr < (addr_t) (&dummy32->regs.fp_regs + 1)) { /* * floating point regs. are stored in the thread structure -- cgit v1.2.3 From a2164b8174f13b7315c3f45c0b48dec619285096 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 9 Sep 2008 12:38:57 +0200 Subject: [S390] cio: Correct cleanup on error. Fix cleanup on error in chp_new() and init_channel_subsystem() (must not call kfree() on structures that had been registered). 
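The rule being restored here ("once something has been registered, tear it down through its own unregister/put path, never kfree() it") and the staged goto-unwind it relies on can be shown with a toy user-space sketch. Every name below is invented and only loosely mirrors the chp_new()/init_channel_subsystem() error paths.

#include <stdio.h>
#include <stdlib.h>

struct widget { int registered; };

static int register_widget(struct widget *w, int fail)
{
	if (fail)
		return -1;
	w->registered = 1;
	return 0;
}

static void unregister_widget(struct widget *w)
{
	/* In the real code this drops the last reference, which frees w. */
	w->registered = 0;
	free(w);
}

static int setup(int fail_at)
{
	struct widget *w;

	w = calloc(1, sizeof(*w));
	if (!w)
		return -1;

	if (register_widget(w, fail_at == 1) < 0)
		goto out_free;		/* not registered yet: plain free() is fine */

	if (fail_at == 2)
		goto out_unregister;	/* registered: must NOT free(w) directly */

	printf("setup(%d) succeeded\n", fail_at);
	unregister_widget(w);		/* normal teardown for the demo */
	return 0;

out_unregister:
	unregister_widget(w);		/* tears down and frees */
	return -1;
out_free:
	free(w);
	return -1;
}

int main(void)
{
	setup(0);
	setup(1);
	setup(2);
	return 0;
}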
Signed-off-by: Cornelia Huck Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/chp.c | 7 ++++--- drivers/s390/cio/css.c | 32 ++++++++++++++++++-------------- 2 files changed, 22 insertions(+), 17 deletions(-) diff --git a/drivers/s390/cio/chp.c b/drivers/s390/cio/chp.c index db00b059173..f1216cf6fa8 100644 --- a/drivers/s390/cio/chp.c +++ b/drivers/s390/cio/chp.c @@ -423,7 +423,7 @@ int chp_new(struct chp_id chpid) ret = sysfs_create_group(&chp->dev.kobj, &chp_attr_group); if (ret) { device_unregister(&chp->dev); - goto out_free; + goto out; } mutex_lock(&channel_subsystems[chpid.cssid]->mutex); if (channel_subsystems[chpid.cssid]->cm_enabled) { @@ -432,14 +432,15 @@ int chp_new(struct chp_id chpid) sysfs_remove_group(&chp->dev.kobj, &chp_attr_group); device_unregister(&chp->dev); mutex_unlock(&channel_subsystems[chpid.cssid]->mutex); - goto out_free; + goto out; } } channel_subsystems[chpid.cssid]->chps[chpid.id] = chp; mutex_unlock(&channel_subsystems[chpid.cssid]->mutex); - return ret; + goto out; out_free: kfree(chp); +out: return ret; } diff --git a/drivers/s390/cio/css.c b/drivers/s390/cio/css.c index 51489eff6b0..1261e1a9e8c 100644 --- a/drivers/s390/cio/css.c +++ b/drivers/s390/cio/css.c @@ -633,6 +633,11 @@ channel_subsystem_release(struct device *dev) css = to_css(dev); mutex_destroy(&css->mutex); + if (css->pseudo_subchannel) { + /* Implies that it has been generated but never registered. */ + css_subchannel_release(&css->pseudo_subchannel->dev); + css->pseudo_subchannel = NULL; + } kfree(css); } @@ -785,11 +790,15 @@ init_channel_subsystem (void) } channel_subsystems[i] = css; ret = setup_css(i); - if (ret) - goto out_free; + if (ret) { + kfree(channel_subsystems[i]); + goto out_unregister; + } ret = device_register(&css->device); - if (ret) - goto out_free_all; + if (ret) { + put_device(&css->device); + goto out_unregister; + } if (css_chsc_characteristics.secm) { ret = device_create_file(&css->device, &dev_attr_cm_enable); @@ -802,7 +811,7 @@ init_channel_subsystem (void) } ret = register_reboot_notifier(&css_reboot_notifier); if (ret) - goto out_pseudo; + goto out_unregister; css_init_done = 1; /* Enable default isc for I/O subchannels. */ @@ -810,18 +819,12 @@ init_channel_subsystem (void) for_each_subchannel(__init_channel_subsystem, NULL); return 0; -out_pseudo: - device_unregister(&channel_subsystems[i]->pseudo_subchannel->dev); out_file: - device_remove_file(&channel_subsystems[i]->device, - &dev_attr_cm_enable); + if (css_chsc_characteristics.secm) + device_remove_file(&channel_subsystems[i]->device, + &dev_attr_cm_enable); out_device: device_unregister(&channel_subsystems[i]->device); -out_free_all: - kfree(channel_subsystems[i]->pseudo_subchannel->lock); - kfree(channel_subsystems[i]->pseudo_subchannel); -out_free: - kfree(channel_subsystems[i]); out_unregister: while (i > 0) { struct channel_subsystem *css; @@ -829,6 +832,7 @@ out_unregister: i--; css = channel_subsystems[i]; device_unregister(&css->pseudo_subchannel->dev); + css->pseudo_subchannel = NULL; if (css_chsc_characteristics.secm) device_remove_file(&css->device, &dev_attr_cm_enable); -- cgit v1.2.3 From c91ebe496120e05301465fff31094bfecf798e9f Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 9 Sep 2008 12:38:58 +0200 Subject: [S390] cio: handle ssch() return codes correctly. 
ssch() has two classes of return codes: - condition codes (0-3) which need to be translated to Linux error codes - Linux error codes (-EIO on exceptions) which should be passed to the caller (instead of erroneously being handled like condition code 3) Signed-off-by: Cornelia Huck Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/cio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/s390/cio/cio.c b/drivers/s390/cio/cio.c index 33bff8fec7d..5954b905e3c 100644 --- a/drivers/s390/cio/cio.c +++ b/drivers/s390/cio/cio.c @@ -208,8 +208,10 @@ cio_start_key (struct subchannel *sch, /* subchannel structure */ case 1: /* status pending */ case 2: /* busy */ return -EBUSY; - default: /* device/path not operational */ + case 3: /* device/path not operational */ return cio_start_handle_notoper(sch, lpm); + default: + return ccode; } } -- cgit v1.2.3 From b301ea8c81b13123761772f344faf606c76ba174 Mon Sep 17 00:00:00 2001 From: Peter Oberparleiter Date: Tue, 9 Sep 2008 12:38:59 +0200 Subject: [S390] cio: allow offline processing for disconnected devices When disconnected ccw devices are removed, the device has to be set offline, otherwise there will be side effects including a reference count imbalance. This patch modifies ccw_device_offline to work for devices in disconnected/not operational state. ccw_device_offline is called by cio for devices which are online during device removal. Signed-off-by: Peter Oberparleiter Signed-off-by: Martin Schwidefsky --- drivers/s390/cio/device_fsm.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/s390/cio/device_fsm.c b/drivers/s390/cio/device_fsm.c index 550508df952..84cc9ea346d 100644 --- a/drivers/s390/cio/device_fsm.c +++ b/drivers/s390/cio/device_fsm.c @@ -658,6 +658,13 @@ ccw_device_offline(struct ccw_device *cdev) { struct subchannel *sch; + /* Allow ccw_device_offline while disconnected. */ + if (cdev->private->state == DEV_STATE_DISCONNECTED || + cdev->private->state == DEV_STATE_NOT_OPER) { + cdev->private->flags.donotify = 0; + ccw_device_done(cdev, DEV_STATE_NOT_OPER); + return 0; + } if (ccw_device_is_orphan(cdev)) { ccw_device_done(cdev, DEV_STATE_OFFLINE); return 0; -- cgit v1.2.3 From 225f40055f779032974a9fce7b2f9c9eda04ff58 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Tue, 9 Sep 2008 05:23:37 -0700 Subject: ipsec: Restore larval states and socket policies in dump The commit 4c563f7669c10a12354b72b518c2287ffc6ebfb3 ("[XFRM]: Speed up xfrm_policy and xfrm_state walking") inadvertently removed larval states and socket policies from netlink dumps. This patch restores them. Signed-off-by: Herbert Xu Signed-off-by: David S.
Miller --- net/xfrm/xfrm_policy.c | 1 + net/xfrm/xfrm_state.c | 2 ++ 2 files changed, 3 insertions(+) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 46914b79d85..b7754b1b73a 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1077,6 +1077,7 @@ static void __xfrm_policy_link(struct xfrm_policy *pol, int dir) struct hlist_head *chain = policy_hash_bysel(&pol->selector, pol->family, dir); + list_add_tail(&pol->bytype, &xfrm_policy_bytype[pol->type]); hlist_add_head(&pol->bydst, chain); hlist_add_head(&pol->byidx, xfrm_policy_byidx+idx_hash(pol->index)); xfrm_policy_count[dir]++; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 7bd62f61593..0a8f09c3144 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -858,6 +858,7 @@ xfrm_state_find(xfrm_address_t *daddr, xfrm_address_t *saddr, if (km_query(x, tmpl, pol) == 0) { x->km.state = XFRM_STATE_ACQ; + list_add_tail(&x->all, &xfrm_state_all); hlist_add_head(&x->bydst, xfrm_state_bydst+h); h = xfrm_src_hash(daddr, saddr, family); hlist_add_head(&x->bysrc, xfrm_state_bysrc+h); @@ -1055,6 +1056,7 @@ static struct xfrm_state *__find_acq_core(unsigned short family, u8 mode, u32 re xfrm_state_hold(x); x->timer.expires = jiffies + sysctl_xfrm_acq_expires*HZ; add_timer(&x->timer); + list_add_tail(&x->all, &xfrm_state_all); hlist_add_head(&x->bydst, xfrm_state_bydst+h); h = xfrm_src_hash(daddr, saddr, family); hlist_add_head(&x->bysrc, xfrm_state_bysrc+h); -- cgit v1.2.3 From adaae7215e5130e5ce1ac3ee390e5a23101b09b2 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 9 Sep 2008 20:02:01 +0200 Subject: update Documentation/filesystems/Locking for 2.6.27 changes In the 2.6.27 circle ->fasync lost the BKL, and the last remaining ->open variant that takes the BKL is also gone. ->get_sb and ->kill_sb didn't have BKL forever, so updated the entries while we're at that. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- Documentation/filesystems/Locking | 15 ++++----------- 1 file changed, 4 insertions(+), 11 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index 680fb566b92..8362860e21a 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -144,8 +144,8 @@ prototypes: void (*kill_sb) (struct super_block *); locking rules: may block BKL -get_sb yes yes -kill_sb yes yes +get_sb yes no +kill_sb yes no ->get_sb() returns error or 0 with locked superblock attached to the vfsmount (exclusive on ->s_umount). @@ -409,12 +409,12 @@ ioctl: yes (see below) unlocked_ioctl: no (see below) compat_ioctl: no mmap: no -open: maybe (see below) +open: no flush: no release: no fsync: no (see below) aio_fsync: no -fasync: yes (see below) +fasync: no lock: yes readv: no writev: no @@ -431,13 +431,6 @@ For many filesystems, it is probably safe to acquire the inode semaphore. Note some filesystems (i.e. remote ones) provide no protection for i_size so you will need to use the BKL. -->open() locking is in-transit: big lock partially moved into the methods. -The only exception is ->open() in the instances of file_operations that never -end up in ->i_fop/->proc_fops, i.e. ones that belong to character devices -(chrdev_open() takes lock before replacing ->f_op and calling the secondary -method. As soon as we fix the handling of module reference counters all -instances of ->open() will be called without the BKL. 
- Note: ext2_release() was *the* source of contention on fs-intensive loads and dropping BKL on ->release() helps to get rid of that (we still grab BKL for cases when we close a file that had been opened r/w, but that -- cgit v1.2.3 From 7ae115b4f50d3c5824f1a15e572b5de9d1b06d35 Mon Sep 17 00:00:00 2001 From: Chris Snook Date: Tue, 9 Sep 2008 03:26:57 -0400 Subject: MAINTAINERS: add Atheros maintainer for atlx Jie Yang at Atheros is getting more directly involved with upstream work on the atl* drivers. This patch changes the ATL1 entry to ATLX (atl2 support posted to netdev today) and adds him as a maintainer. Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b3e92fbe336..186be3ba506 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -750,11 +750,13 @@ P: Ville Syrjala M: syrjala@sci.fi S: Maintained -ATL1 ETHERNET DRIVER +ATLX ETHERNET DRIVERS P: Jay Cliburn M: jcliburn@gmail.com P: Chris Snook M: csnook@redhat.com +P: Jie Yang +M: jie.yang@atheros.com L: atl1-devel@lists.sourceforge.net W: http://sourceforge.net/projects/atl1 W: http://atl1.sourceforge.net -- cgit v1.2.3 From deac93df26b20cf8438339b5935b5f5643bc30c9 Mon Sep 17 00:00:00 2001 From: James Bottomley Date: Wed, 3 Sep 2008 20:43:36 -0500 Subject: lib: Correct printk %pF to work on all architectures It was introduced by "vsprintf: add support for '%pS' and '%pF' pointer formats" in commit 0fe1ef24f7bd0020f29ffe287dfdb9ead33ca0b2. However, the current way its coded doesn't work on parisc64. For two reasons: 1) parisc isn't in the #ifdef and 2) parisc has a different format for function descriptors Make dereference_function_descriptor() more accommodating by allowing architecture overrides. I put the three overrides (for parisc64, ppc64 and ia64) in arch/kernel/module.c because that's where the kernel internal linker which knows how to deal with function descriptors sits. 
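The override mechanism added by the diff below is simply "generic identity macro, replaced per architecture by a real lookup". A stand-alone user-space sketch of the same pattern follows; fake_fdesc and deref_fake_descriptor are invented for illustration, and the real ia64/parisc/ppc64 code additionally guards the load with probe_kernel_address().

#include <stdio.h>

typedef void (*entry_fn)(void);

/* Invented stand-in for an OPD-style function descriptor. */
struct fake_fdesc {
	entry_fn ip;	/* entry point the descriptor points at */
	void *gp;	/* global pointer / TOC slot, unused here */
};

/* Generic fallback: the pointer already is the entry point. */
#ifndef dereference_function_descriptor
#define dereference_function_descriptor(p) (p)
#endif

/* What a descriptor architecture would supply instead. */
static entry_fn deref_fake_descriptor(struct fake_fdesc *desc)
{
	return desc->ip;
}

static void hello(void)
{
	printf("hello from %%pF-style lookup\n");
}

int main(void)
{
	struct fake_fdesc desc = { .ip = hello, .gp = NULL };

	/* Flat architecture: the macro is a no-op. */
	entry_fn f1 = dereference_function_descriptor(hello);
	/* Descriptor architecture: the entry point is read out of the descriptor. */
	entry_fn f2 = deref_fake_descriptor(&desc);

	f1();
	f2();
	return 0;
}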
Signed-off-by: James Bottomley Acked-by: Benjamin Herrenschmidt Acked-by: Tony Luck Acked-by: Kyle McMartin Signed-off-by: Linus Torvalds --- arch/ia64/include/asm/sections.h | 3 +++ arch/ia64/kernel/module.c | 12 ++++++++++++ arch/parisc/kernel/module.c | 14 ++++++++++++++ arch/powerpc/include/asm/sections.h | 3 +++ arch/powerpc/kernel/module_64.c | 13 ++++++++++++- include/asm-generic/sections.h | 6 ++++++ include/asm-parisc/sections.h | 5 +++++ lib/vsprintf.c | 11 +---------- 8 files changed, 56 insertions(+), 11 deletions(-) diff --git a/arch/ia64/include/asm/sections.h b/arch/ia64/include/asm/sections.h index 7286e4a9fe8..a7acad2bc2f 100644 --- a/arch/ia64/include/asm/sections.h +++ b/arch/ia64/include/asm/sections.h @@ -21,5 +21,8 @@ extern char __start_gate_brl_fsys_bubble_down_patchlist[], __end_gate_brl_fsys_b extern char __start_unwind[], __end_unwind[]; extern char __start_ivt_text[], __end_ivt_text[]; +#undef dereference_function_descriptor +void *dereference_function_descriptor(void *); + #endif /* _ASM_IA64_SECTIONS_H */ diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c index 29aad349e0c..545626f66a4 100644 --- a/arch/ia64/kernel/module.c +++ b/arch/ia64/kernel/module.c @@ -31,9 +31,11 @@ #include #include #include +#include #include #include +#include #include #define ARCH_MODULE_DEBUG 0 @@ -941,3 +943,13 @@ module_arch_cleanup (struct module *mod) if (mod->arch.core_unw_table) unw_remove_unwind_table(mod->arch.core_unw_table); } + +void *dereference_function_descriptor(void *ptr) +{ + struct fdesc *desc = ptr; + void *p; + + if (!probe_kernel_address(&desc->ip, p)) + ptr = p; + return ptr; +} diff --git a/arch/parisc/kernel/module.c b/arch/parisc/kernel/module.c index fdacdd4341c..44138c3e6ea 100644 --- a/arch/parisc/kernel/module.c +++ b/arch/parisc/kernel/module.c @@ -47,7 +47,9 @@ #include #include #include +#include +#include #include #if 0 @@ -860,3 +862,15 @@ void module_arch_cleanup(struct module *mod) deregister_unwind_table(mod); module_bug_cleanup(mod); } + +#ifdef CONFIG_64BIT +void *dereference_function_descriptor(void *ptr) +{ + Elf64_Fdesc *desc = ptr; + void *p; + + if (!probe_kernel_address(&desc->addr, p)) + ptr = p; + return ptr; +} +#endif diff --git a/arch/powerpc/include/asm/sections.h b/arch/powerpc/include/asm/sections.h index 916018e425c..7710e9e6660 100644 --- a/arch/powerpc/include/asm/sections.h +++ b/arch/powerpc/include/asm/sections.h @@ -16,6 +16,9 @@ static inline int in_kernel_text(unsigned long addr) return 0; } +#undef dereference_function_descriptor +void *dereference_function_descriptor(void *); + #endif #endif /* __KERNEL__ */ diff --git a/arch/powerpc/kernel/module_64.c b/arch/powerpc/kernel/module_64.c index ee6a2982d56..ad79de272ff 100644 --- a/arch/powerpc/kernel/module_64.c +++ b/arch/powerpc/kernel/module_64.c @@ -21,8 +21,9 @@ #include #include #include +#include #include -#include +#include #include #include #include @@ -451,3 +452,13 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, return 0; } + +void *dereference_function_descriptor(void *ptr) +{ + struct ppc64_opd_entry *desc = ptr; + void *p; + + if (!probe_kernel_address(&desc->funcaddr, p)) + ptr = p; + return ptr; +} diff --git a/include/asm-generic/sections.h b/include/asm-generic/sections.h index 8feeae1f236..79a7ff925bf 100644 --- a/include/asm-generic/sections.h +++ b/include/asm-generic/sections.h @@ -14,4 +14,10 @@ extern char __kprobes_text_start[], __kprobes_text_end[]; extern char __initdata_begin[], __initdata_end[]; extern char __start_rodata[], 
__end_rodata[]; +/* function descriptor handling (if any). Override + * in asm/sections.h */ +#ifndef dereference_function_descriptor +#define dereference_function_descriptor(p) (p) +#endif + #endif /* _ASM_GENERIC_SECTIONS_H_ */ diff --git a/include/asm-parisc/sections.h b/include/asm-parisc/sections.h index fdd43ec42ec..9d13c3507ad 100644 --- a/include/asm-parisc/sections.h +++ b/include/asm-parisc/sections.h @@ -4,4 +4,9 @@ /* nothing to see, move along */ #include +#ifdef CONFIG_64BIT +#undef dereference_function_descriptor +void *dereference_function_descriptor(void *); +#endif + #endif diff --git a/lib/vsprintf.c b/lib/vsprintf.c index d8d1d114224..c399bc1093c 100644 --- a/lib/vsprintf.c +++ b/lib/vsprintf.c @@ -27,6 +27,7 @@ #include /* for PAGE_SIZE */ #include +#include /* for dereference_function_descriptor() */ /* Works only for digits and letters, but small and fast */ #define TOLOWER(x) ((x) | 0x20) @@ -513,16 +514,6 @@ static char *string(char *buf, char *end, char *s, int field_width, int precisio return buf; } -static inline void *dereference_function_descriptor(void *ptr) -{ -#if defined(CONFIG_IA64) || defined(CONFIG_PPC64) - void *p; - if (!probe_kernel_address(ptr, p)) - ptr = p; -#endif - return ptr; -} - static char *symbol_string(char *buf, char *end, void *ptr, int field_width, int precision, int flags) { unsigned long value = (unsigned long) ptr; -- cgit v1.2.3 From d6be118a97ce51ca84035270f91c2bccecbfac5f Mon Sep 17 00:00:00 2001 From: Prarit Bhargava Date: Tue, 9 Sep 2008 09:56:08 -0400 Subject: x86: fix memmap=exactmap boot argument When using kdump modifying the e820 map is yielding strange results. For example starting with BIOS-provided physical RAM map: BIOS-e820: 0000000000000100 - 0000000000093400 (usable) BIOS-e820: 0000000000093400 - 00000000000a0000 (reserved) BIOS-e820: 0000000000100000 - 000000003fee0000 (usable) BIOS-e820: 000000003fee0000 - 000000003fef3000 (ACPI data) BIOS-e820: 000000003fef3000 - 000000003ff80000 (ACPI NVS) BIOS-e820: 000000003ff80000 - 0000000040000000 (reserved) BIOS-e820: 00000000e0000000 - 00000000f0000000 (reserved) BIOS-e820: 00000000fec00000 - 00000000fec10000 (reserved) BIOS-e820: 00000000fee00000 - 00000000fee01000 (reserved) BIOS-e820: 00000000ff000000 - 0000000100000000 (reserved) and booting with args memmap=exactmap memmap=640K@0K memmap=5228K@16384K memmap=125188K@22252K memmap=76K#1047424K memmap=564K#1047500K resulted in: user-defined physical RAM map: user: 0000000000000000 - 0000000000093400 (usable) user: 0000000000093400 - 00000000000a0000 (reserved) user: 0000000000100000 - 000000003fee0000 (usable) user: 000000003fee0000 - 000000003fef3000 (ACPI data) user: 000000003fef3000 - 000000003ff80000 (ACPI NVS) user: 000000003ff80000 - 0000000040000000 (reserved) user: 00000000e0000000 - 00000000f0000000 (reserved) user: 00000000fec00000 - 00000000fec10000 (reserved) user: 00000000fee00000 - 00000000fee01000 (reserved) user: 00000000ff000000 - 0000000100000000 (reserved) But should have resulted in: user-defined physical RAM map: user: 0000000000000000 - 00000000000a0000 (usable) user: 0000000001000000 - 000000000151b000 (usable) user: 00000000015bb000 - 0000000008ffc000 (usable) user: 000000003fee0000 - 000000003ff80000 (ACPI data) This is happening because of an improper usage of strcmp() in the e820 parsing code. The strcmp() always returns !0 and never resets the value for e820.nr_map and returns an incorrect user-defined map. This patch fixes the problem. 
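A quick user-space demonstration of the difference the fix makes. It assumes, as the failure above implies, that the string handed to the parser starts with "exactmap" but carries trailing text; the exact trailing contents here are made up.

#include <stdio.h>
#include <string.h>

int main(void)
{
	/* Assumed example value: "exactmap" followed by more option text. */
	const char *p = "exactmap memmap=640K@0K";

	printf("strcmp : %s\n", strcmp(p, "exactmap") == 0 ? "recognized" : "not recognized");
	printf("strncmp: %s\n", strncmp(p, "exactmap", 8) == 0 ? "recognized" : "not recognized");
	return 0;
}

Only the prefix comparison recognizes the keyword, which is the behaviour the strcmp()-to-strncmp() change in the hunk below provides.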
Signed-off-by: Prarit Bhargava Signed-off-by: Ingo Molnar --- arch/x86/kernel/e820.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 9af89078f7b..66e48aa2dd1 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -1203,7 +1203,7 @@ static int __init parse_memmap_opt(char *p) if (!p) return -EINVAL; - if (!strcmp(p, "exactmap")) { + if (!strncmp(p, "exactmap", 8)) { #ifdef CONFIG_CRASH_DUMP /* * If we are doing a crash dump, we still need to know -- cgit v1.2.3 From 61c22c34c6f80a8e89cff5ff717627c54cc14fd4 Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Tue, 9 Sep 2008 21:38:57 +0200 Subject: clockevents: remove WARN_ON which was used to gather information The issue of the endless reprogramming loop due to a too small min_delta_ns was fixed with the previous updates of the clock events code, but we had no information about the spread of this problem. I added a WARN_ON to get automated information via kerneloops.org and to get some direct reports, which allowed me to analyse the affected machines. The WARN_ON has served its purpose and would be annoying for a release kernel. Remove it and just keep the information about the increase of the min_delta_ns value. Signed-off-by: Thomas Gleixner --- kernel/time/tick-oneshot.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/kernel/time/tick-oneshot.c b/kernel/time/tick-oneshot.c index 2e35501e61d..2e8de678e76 100644 --- a/kernel/time/tick-oneshot.c +++ b/kernel/time/tick-oneshot.c @@ -43,19 +43,17 @@ int tick_dev_program_event(struct clock_event_device *dev, ktime_t expires, * and emit a warning. */ if (++i > 2) { - printk(KERN_WARNING "CE: __tick_program_event of %s is " - "stuck %llx %llx\n", dev->name ? dev->name : "?", - now.tv64, expires.tv64); - printk(KERN_WARNING - "CE: increasing min_delta_ns %ld to %ld nsec\n", - dev->min_delta_ns, dev->min_delta_ns << 1); - WARN_ON(1); - - /* Double the min. delta and try again */ + /* Increase the min. delta and try again */ if (!dev->min_delta_ns) dev->min_delta_ns = 5000; else - dev->min_delta_ns <<= 1; + dev->min_delta_ns += dev->min_delta_ns >> 1; + + printk(KERN_WARNING + "CE: %s increasing min_delta_ns to %lu nsec\n", + dev->name ? dev->name : "?", + dev->min_delta_ns << 1); + i = 0; } -- cgit v1.2.3 From e550dfb0c2c31b6363aa463a035fc9f8dcaa3c9b Mon Sep 17 00:00:00 2001 From: Neil Horman Date: Tue, 9 Sep 2008 13:51:35 -0700 Subject: ipv6: Fix OOPS in ip6_dst_lookup_tail(). This fixes kernel bugzilla 11469: "TUN with 1024 neighbours: ip6_dst_lookup_tail NULL crash" dst->neighbour is not necessarily hooked up at this point in the processing path, so blindly dereferencing it is the wrong thing to do. This NULL check exists in other similar paths and this case was just an oversight. Also fix the completely wrong and confusing indentation here while we're at it. Based upon a patch by Evgeniy Polyakov. Signed-off-by: Neil Horman Signed-off-by: David S. 
Miller --- net/ipv6/ip6_output.c | 64 +++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 32 deletions(-) diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index 0e844c2736a..3df2c442d90 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -943,39 +943,39 @@ static int ip6_dst_lookup_tail(struct sock *sk, } #ifdef CONFIG_IPV6_OPTIMISTIC_DAD - /* - * Here if the dst entry we've looked up - * has a neighbour entry that is in the INCOMPLETE - * state and the src address from the flow is - * marked as OPTIMISTIC, we release the found - * dst entry and replace it instead with the - * dst entry of the nexthop router - */ - if (!((*dst)->neighbour->nud_state & NUD_VALID)) { - struct inet6_ifaddr *ifp; - struct flowi fl_gw; - int redirect; - - ifp = ipv6_get_ifaddr(net, &fl->fl6_src, - (*dst)->dev, 1); - - redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); - if (ifp) - in6_ifa_put(ifp); - - if (redirect) { - /* - * We need to get the dst entry for the - * default router instead - */ - dst_release(*dst); - memcpy(&fl_gw, fl, sizeof(struct flowi)); - memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr)); - *dst = ip6_route_output(net, sk, &fl_gw); - if ((err = (*dst)->error)) - goto out_err_release; - } + /* + * Here if the dst entry we've looked up + * has a neighbour entry that is in the INCOMPLETE + * state and the src address from the flow is + * marked as OPTIMISTIC, we release the found + * dst entry and replace it instead with the + * dst entry of the nexthop router + */ + if ((*dst)->neighbour && !((*dst)->neighbour->nud_state & NUD_VALID)) { + struct inet6_ifaddr *ifp; + struct flowi fl_gw; + int redirect; + + ifp = ipv6_get_ifaddr(net, &fl->fl6_src, + (*dst)->dev, 1); + + redirect = (ifp && ifp->flags & IFA_F_OPTIMISTIC); + if (ifp) + in6_ifa_put(ifp); + + if (redirect) { + /* + * We need to get the dst entry for the + * default router instead + */ + dst_release(*dst); + memcpy(&fl_gw, fl, sizeof(struct flowi)); + memset(&fl_gw.fl6_dst, 0, sizeof(struct in6_addr)); + *dst = ip6_route_output(net, sk, &fl_gw); + if ((err = (*dst)->error)) + goto out_err_release; } + } #endif return 0; -- cgit v1.2.3 From adee14b2e1557d0a8559f29681732d05a89dfc35 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 9 Sep 2008 16:27:49 -0700 Subject: Linux 2.6.27-rc6 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f448e0082eb..4ff83ea36c1 100644 --- a/Makefile +++ b/Makefile @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 27 -EXTRAVERSION = -rc5 +EXTRAVERSION = -rc6 NAME = Rotary Wombat # *DOCUMENTATION* -- cgit v1.2.3