diff options
Diffstat (limited to 'drivers')
128 files changed, 4677 insertions, 1850 deletions
diff --git a/drivers/acpi/acpi_memhotplug.c b/drivers/acpi/acpi_memhotplug.c index 7a0f4aa4fa1..9a62224cc27 100644 --- a/drivers/acpi/acpi_memhotplug.c +++ b/drivers/acpi/acpi_memhotplug.c @@ -38,6 +38,9 @@ #define _COMPONENT ACPI_MEMORY_DEVICE_COMPONENT +#undef PREFIX +#define PREFIX "ACPI:memory_hp:" + ACPI_MODULE_NAME("acpi_memhotplug"); MODULE_AUTHOR("Naveen B S <naveen.b.s@intel.com>"); MODULE_DESCRIPTION("Hotplug Mem Driver"); @@ -153,6 +156,7 @@ acpi_memory_get_device(acpi_handle handle, acpi_handle phandle; struct acpi_device *device = NULL; struct acpi_device *pdevice = NULL; + int result; if (!acpi_bus_get_device(handle, &device) && device) @@ -165,9 +169,9 @@ acpi_memory_get_device(acpi_handle handle, } /* Get the parent device */ - status = acpi_bus_get_device(phandle, &pdevice); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, "Cannot get acpi bus device")); + result = acpi_bus_get_device(phandle, &pdevice); + if (result) { + printk(KERN_WARNING PREFIX "Cannot get acpi bus device"); return -EINVAL; } @@ -175,9 +179,9 @@ acpi_memory_get_device(acpi_handle handle, * Now add the notified device. This creates the acpi_device * and invokes .add function */ - status = acpi_bus_add(&device, pdevice, handle, ACPI_BUS_TYPE_DEVICE); - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, "Cannot add acpi bus")); + result = acpi_bus_add(&device, pdevice, handle, ACPI_BUS_TYPE_DEVICE); + if (result) { + printk(KERN_WARNING PREFIX "Cannot add acpi bus"); return -EINVAL; } @@ -238,7 +242,12 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) num_enabled++; continue; } - + /* + * If the memory block size is zero, please ignore it. + * Don't try to do the following memory hotplug flowchart. + */ + if (!info->length) + continue; if (node < 0) node = memory_add_physaddr_to_nid(info->start_addr); @@ -253,8 +262,15 @@ static int acpi_memory_enable_device(struct acpi_memory_device *mem_device) mem_device->state = MEMORY_INVALID_STATE; return -EINVAL; } - - return result; + /* + * Sometimes the memory device will contain several memory blocks. + * When one memory block is hot-added to the system memory, it will + * be regarded as a success. + * Otherwise if the last memory block can't be hot-added to the system + * memory, it will be failure and the memory device can't be bound with + * driver. + */ + return 0; } static int acpi_memory_powerdown_device(struct acpi_memory_device *mem_device) diff --git a/drivers/acpi/acpica/acobject.h b/drivers/acpi/acpica/acobject.h index 544dcf83492..eb6f038b03d 100644 --- a/drivers/acpi/acpica/acobject.h +++ b/drivers/acpi/acpica/acobject.h @@ -97,6 +97,7 @@ #define AOPOBJ_OBJECT_INITIALIZED 0x08 #define AOPOBJ_SETUP_COMPLETE 0x10 #define AOPOBJ_SINGLE_DATUM 0x20 +#define AOPOBJ_INVALID 0x40 /* Used if host OS won't allow an op_region address */ /****************************************************************************** * diff --git a/drivers/acpi/acpica/dsopcode.c b/drivers/acpi/acpica/dsopcode.c index 584d766e6f1..b79978f7bc7 100644 --- a/drivers/acpi/acpica/dsopcode.c +++ b/drivers/acpi/acpica/dsopcode.c @@ -397,6 +397,30 @@ acpi_status acpi_ds_get_region_arguments(union acpi_operand_object *obj_desc) status = acpi_ds_execute_arguments(node, acpi_ns_get_parent_node(node), extra_desc->extra.aml_length, extra_desc->extra.aml_start); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + /* Validate the region address/length via the host OS */ + + status = acpi_os_validate_address(obj_desc->region.space_id, + obj_desc->region.address, + (acpi_size) obj_desc->region.length, + acpi_ut_get_node_name(node)); + + if (ACPI_FAILURE(status)) { + /* + * Invalid address/length. We will emit an error message and mark + * the region as invalid, so that it will cause an additional error if + * it is ever used. Then return AE_OK. + */ + ACPI_EXCEPTION((AE_INFO, status, + "During address validation of OpRegion [%4.4s]", + node->name.ascii)); + obj_desc->common.flags |= AOPOBJ_INVALID; + status = AE_OK; + } + return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/exfldio.c b/drivers/acpi/acpica/exfldio.c index d4075b82102..6687be167f5 100644 --- a/drivers/acpi/acpica/exfldio.c +++ b/drivers/acpi/acpica/exfldio.c @@ -113,6 +113,12 @@ acpi_ex_setup_region(union acpi_operand_object *obj_desc, } } + /* Exit if Address/Length have been disallowed by the host OS */ + + if (rgn_desc->common.flags & AOPOBJ_INVALID) { + return_ACPI_STATUS(AE_AML_ILLEGAL_ADDRESS); + } + /* * Exit now for SMBus address space, it has a non-linear address space * and the request cannot be directly validated diff --git a/drivers/acpi/osl.c b/drivers/acpi/osl.c index 71670719d61..5691f165a95 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -189,11 +189,36 @@ acpi_status __init acpi_os_initialize(void) return AE_OK; } +static void bind_to_cpu0(struct work_struct *work) +{ + set_cpus_allowed(current, cpumask_of_cpu(0)); + kfree(work); +} + +static void bind_workqueue(struct workqueue_struct *wq) +{ + struct work_struct *work; + + work = kzalloc(sizeof(struct work_struct), GFP_KERNEL); + INIT_WORK(work, bind_to_cpu0); + queue_work(wq, work); +} + acpi_status acpi_os_initialize1(void) { + /* + * On some machines, a software-initiated SMI causes corruption unless + * the SMI runs on CPU 0. An SMI can be initiated by any AML, but + * typically it's done in GPE-related methods that are run via + * workqueues, so we can avoid the known corruption cases by binding + * the workqueues to CPU 0. + */ kacpid_wq = create_singlethread_workqueue("kacpid"); + bind_workqueue(kacpid_wq); kacpi_notify_wq = create_singlethread_workqueue("kacpi_notify"); + bind_workqueue(kacpi_notify_wq); kacpi_hotplug_wq = create_singlethread_workqueue("kacpi_hotplug"); + bind_workqueue(kacpi_hotplug_wq); BUG_ON(!kacpid_wq); BUG_ON(!kacpi_notify_wq); BUG_ON(!kacpi_hotplug_wq); diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index 0944daec064..9c61ab2177c 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -121,7 +121,7 @@ static void acpi_table_attr_init(struct acpi_table_attr *table_attr, table_attr->attr.size = 0; table_attr->attr.read = acpi_table_show; table_attr->attr.attr.name = table_attr->name; - table_attr->attr.attr.mode = 0444; + table_attr->attr.attr.mode = 0400; return; } diff --git a/drivers/block/mg_disk.c b/drivers/block/mg_disk.c index f703f547824..6d7fbaa9224 100644 --- a/drivers/block/mg_disk.c +++ b/drivers/block/mg_disk.c @@ -36,7 +36,6 @@ /* Register offsets */ #define MG_BUFF_OFFSET 0x8000 -#define MG_STORAGE_BUFFER_SIZE 0x200 #define MG_REG_OFFSET 0xC000 #define MG_REG_FEATURE (MG_REG_OFFSET + 2) /* write case */ #define MG_REG_ERROR (MG_REG_OFFSET + 2) /* read case */ @@ -219,6 +218,16 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) host->error = MG_ERR_NONE; expire = jiffies + msecs_to_jiffies(msec); + /* These 2 times dummy status read prevents reading invalid + * status. A very little time (3 times of mflash operating clk) + * is required for busy bit is set. Use dummy read instead of + * busy wait, because mflash's PLL is machine dependent. + */ + if (prv_data->use_polling) { + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); + } + status = inb((unsigned long)host->dev_base + MG_REG_STATUS); do { @@ -245,8 +254,6 @@ static unsigned int mg_wait(struct mg_host *host, u32 expect, u32 msec) mg_dump_status("not ready", status, host); return MG_ERR_INV_STAT; } - if (prv_data->use_polling) - msleep(1); status = inb((unsigned long)host->dev_base + MG_REG_STATUS); } while (time_before(cur_jiffies, expire)); @@ -469,9 +476,18 @@ static unsigned int mg_out(struct mg_host *host, return MG_ERR_NONE; } +static void mg_read_one(struct mg_host *host, struct request *req) +{ + u16 *buff = (u16 *)req->buffer; + u32 i; + + for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) + *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + + (i << 1)); +} + static void mg_read(struct request *req) { - u32 j; struct mg_host *host = req->rq_disk->private_data; if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), @@ -482,49 +498,65 @@ static void mg_read(struct request *req) blk_rq_sectors(req), blk_rq_pos(req), req->buffer); do { - u16 *buff = (u16 *)req->buffer; - if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_RD_DRQ) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } - for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) - *buff++ = inw((unsigned long)host->dev_base + - MG_BUFF_OFFSET + (j << 1)); + + mg_read_one(host, req); outb(MG_CMD_RD_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); } +static void mg_write_one(struct mg_host *host, struct request *req) +{ + u16 *buff = (u16 *)req->buffer; + u32 i; + + for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) + outw(*buff++, (unsigned long)host->dev_base + MG_BUFF_OFFSET + + (i << 1)); +} + static void mg_write(struct request *req) { - u32 j; struct mg_host *host = req->rq_disk->private_data; + unsigned int rem = blk_rq_sectors(req); - if (mg_out(host, blk_rq_pos(req), blk_rq_sectors(req), + if (mg_out(host, blk_rq_pos(req), rem, MG_CMD_WR, NULL) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } MG_DBG("requested %d sects (from %ld), buffer=0x%p\n", - blk_rq_sectors(req), blk_rq_pos(req), req->buffer); + rem, blk_rq_pos(req), req->buffer); + + if (mg_wait(host, ATA_DRQ, + MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } do { - u16 *buff = (u16 *)req->buffer; + mg_write_one(host, req); - if (mg_wait(host, ATA_DRQ, MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + + MG_REG_COMMAND); + + rem--; + if (rem > 1 && mg_wait(host, ATA_DRQ, + MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { + mg_bad_rw_intr(host); + return; + } else if (mg_wait(host, MG_STAT_READY, + MG_TMAX_WAIT_WR_DRQ) != MG_ERR_NONE) { mg_bad_rw_intr(host); return; } - for (j = 0; j < MG_SECTOR_SIZE >> 1; j++) - outw(*buff++, (unsigned long)host->dev_base + - MG_BUFF_OFFSET + (j << 1)); - - outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + - MG_REG_COMMAND); } while (mg_end_request(host, 0, MG_SECTOR_SIZE)); } @@ -532,7 +564,6 @@ static void mg_read_intr(struct mg_host *host) { struct request *req = host->req; u32 i; - u16 *buff; /* check status */ do { @@ -550,13 +581,7 @@ static void mg_read_intr(struct mg_host *host) return; ok_to_read: - /* get current segment of request */ - buff = (u16 *)req->buffer; - - /* read 1 sector */ - for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) - *buff++ = inw((unsigned long)host->dev_base + MG_BUFF_OFFSET + - (i << 1)); + mg_read_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", blk_rq_pos(req), blk_rq_sectors(req) - 1, req->buffer); @@ -575,8 +600,7 @@ ok_to_read: static void mg_write_intr(struct mg_host *host) { struct request *req = host->req; - u32 i, j; - u16 *buff; + u32 i; bool rem; /* check status */ @@ -597,12 +621,7 @@ static void mg_write_intr(struct mg_host *host) ok_to_write: if ((rem = mg_end_request(host, 0, MG_SECTOR_SIZE))) { /* write 1 sector and set handler if remains */ - buff = (u16 *)req->buffer; - for (j = 0; j < MG_STORAGE_BUFFER_SIZE >> 1; j++) { - outw(*buff, (unsigned long)host->dev_base + - MG_BUFF_OFFSET + (j << 1)); - buff++; - } + mg_write_one(host, req); MG_DBG("sector %ld, remaining=%ld, buffer=0x%p\n", blk_rq_pos(req), blk_rq_sectors(req), req->buffer); host->mg_do_intr = mg_write_intr; @@ -667,9 +686,6 @@ static unsigned int mg_issue_req(struct request *req, unsigned int sect_num, unsigned int sect_cnt) { - u16 *buff; - u32 i; - switch (rq_data_dir(req)) { case READ: if (mg_out(host, sect_num, sect_cnt, MG_CMD_RD, &mg_read_intr) @@ -693,12 +709,7 @@ static unsigned int mg_issue_req(struct request *req, mg_bad_rw_intr(host); return host->error; } - buff = (u16 *)req->buffer; - for (i = 0; i < MG_SECTOR_SIZE >> 1; i++) { - outw(*buff, (unsigned long)host->dev_base + - MG_BUFF_OFFSET + (i << 1)); - buff++; - } + mg_write_one(host, req); mod_timer(&host->timer, jiffies + 3 * HZ); outb(MG_CMD_WR_CONF, (unsigned long)host->dev_base + MG_REG_COMMAND); diff --git a/drivers/char/agp/parisc-agp.c b/drivers/char/agp/parisc-agp.c index f4bb43fb801..e077701ae3d 100644 --- a/drivers/char/agp/parisc-agp.c +++ b/drivers/char/agp/parisc-agp.c @@ -225,7 +225,7 @@ static const struct agp_bridge_driver parisc_agp_driver = { .configure = parisc_agp_configure, .fetch_size = parisc_agp_fetch_size, .tlb_flush = parisc_agp_tlbflush, - .mask_memory = parisc_agp_mask_memory, + .mask_memory = parisc_agp_page_mask_memory, .masks = parisc_agp_masks, .agp_enable = parisc_agp_enable, .cache_flush = global_cache_flush, diff --git a/drivers/char/tty_ldisc.c b/drivers/char/tty_ldisc.c index acd76b767d4..1733d3439ad 100644 --- a/drivers/char/tty_ldisc.c +++ b/drivers/char/tty_ldisc.c @@ -48,6 +48,41 @@ static DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait); /* Line disc dispatch table */ static struct tty_ldisc_ops *tty_ldiscs[NR_LDISCS]; +static inline struct tty_ldisc *get_ldisc(struct tty_ldisc *ld) +{ + if (ld) + atomic_inc(&ld->users); + return ld; +} + +static void put_ldisc(struct tty_ldisc *ld) +{ + unsigned long flags; + + if (WARN_ON_ONCE(!ld)) + return; + + /* + * If this is the last user, free the ldisc, and + * release the ldisc ops. + * + * We really want an "atomic_dec_and_lock_irqsave()", + * but we don't have it, so this does it by hand. + */ + local_irq_save(flags); + if (atomic_dec_and_lock(&ld->users, &tty_ldisc_lock)) { + struct tty_ldisc_ops *ldo = ld->ops; + + ldo->refcount--; + module_put(ldo->owner); + spin_unlock_irqrestore(&tty_ldisc_lock, flags); + + kfree(ld); + return; + } + local_irq_restore(flags); +} + /** * tty_register_ldisc - install a line discipline * @disc: ldisc number @@ -142,7 +177,7 @@ static struct tty_ldisc *tty_ldisc_try_get(int disc) /* lock it */ ldops->refcount++; ld->ops = ldops; - ld->refcount = 0; + atomic_set(&ld->users, 1); err = 0; } } @@ -181,35 +216,6 @@ static struct tty_ldisc *tty_ldisc_get(int disc) return ld; } -/** - * tty_ldisc_put - drop ldisc reference - * @ld: ldisc - * - * Drop a reference to a line discipline. Manage refcounts and - * module usage counts. Free the ldisc once the recount hits zero. - * - * Locking: - * takes tty_ldisc_lock to guard against ldisc races - */ - -static void tty_ldisc_put(struct tty_ldisc *ld) -{ - unsigned long flags; - int disc = ld->ops->num; - struct tty_ldisc_ops *ldo; - - BUG_ON(disc < N_TTY || disc >= NR_LDISCS); - - spin_lock_irqsave(&tty_ldisc_lock, flags); - ldo = tty_ldiscs[disc]; - BUG_ON(ldo->refcount == 0); - ldo->refcount--; - module_put(ldo->owner); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - WARN_ON(ld->refcount); - kfree(ld); -} - static void *tty_ldiscs_seq_start(struct seq_file *m, loff_t *pos) { return (*pos < NR_LDISCS) ? pos : NULL; @@ -234,7 +240,7 @@ static int tty_ldiscs_seq_show(struct seq_file *m, void *v) if (IS_ERR(ld)) return 0; seq_printf(m, "%-10s %2d\n", ld->ops->name ? ld->ops->name : "???", i); - tty_ldisc_put(ld); + put_ldisc(ld); return 0; } @@ -288,20 +294,17 @@ static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld) * Locking: takes tty_ldisc_lock */ -static int tty_ldisc_try(struct tty_struct *tty) +static struct tty_ldisc *tty_ldisc_try(struct tty_struct *tty) { unsigned long flags; struct tty_ldisc *ld; - int ret = 0; spin_lock_irqsave(&tty_ldisc_lock, flags); - ld = tty->ldisc; - if (test_bit(TTY_LDISC, &tty->flags)) { - ld->refcount++; - ret = 1; - } + ld = NULL; + if (test_bit(TTY_LDISC, &tty->flags)) + ld = get_ldisc(tty->ldisc); spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return ret; + return ld; } /** @@ -322,10 +325,11 @@ static int tty_ldisc_try(struct tty_struct *tty) struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) { + struct tty_ldisc *ld; + /* wait_event is a macro */ - wait_event(tty_ldisc_wait, tty_ldisc_try(tty)); - WARN_ON(tty->ldisc->refcount == 0); - return tty->ldisc; + wait_event(tty_ldisc_wait, (ld = tty_ldisc_try(tty)) != NULL); + return ld; } EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); @@ -342,9 +346,7 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty) { - if (tty_ldisc_try(tty)) - return tty->ldisc; - return NULL; + return tty_ldisc_try(tty); } EXPORT_SYMBOL_GPL(tty_ldisc_ref); @@ -360,21 +362,15 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref); void tty_ldisc_deref(struct tty_ldisc *ld) { - unsigned long flags; - - BUG_ON(ld == NULL); - - spin_lock_irqsave(&tty_ldisc_lock, flags); - if (ld->refcount == 0) - printk(KERN_ERR "tty_ldisc_deref: no references.\n"); - else - ld->refcount--; - if (ld->refcount == 0) - wake_up(&tty_ldisc_wait); - spin_unlock_irqrestore(&tty_ldisc_lock, flags); + put_ldisc(ld); } EXPORT_SYMBOL_GPL(tty_ldisc_deref); +static inline void tty_ldisc_put(struct tty_ldisc *ld) +{ + put_ldisc(ld); +} + /** * tty_ldisc_enable - allow ldisc use * @tty: terminal to activate ldisc on @@ -523,31 +519,6 @@ static int tty_ldisc_halt(struct tty_struct *tty) } /** - * tty_ldisc_wait_idle - wait for the ldisc to become idle - * @tty: tty to wait for - * - * Wait for the line discipline to become idle. The discipline must - * have been halted for this to guarantee it remains idle. - * - * tty_ldisc_lock protects the ref counts currently. - */ - -static int tty_ldisc_wait_idle(struct tty_struct *tty) -{ - unsigned long flags; - spin_lock_irqsave(&tty_ldisc_lock, flags); - while (tty->ldisc->refcount) { - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - if (wait_event_timeout(tty_ldisc_wait, - tty->ldisc->refcount == 0, 5 * HZ) == 0) - return -EBUSY; - spin_lock_irqsave(&tty_ldisc_lock, flags); - } - spin_unlock_irqrestore(&tty_ldisc_lock, flags); - return 0; -} - -/** * tty_set_ldisc - set line discipline * @tty: the terminal to set * @ldisc: the line discipline @@ -642,14 +613,6 @@ int tty_set_ldisc(struct tty_struct *tty, int ldisc) flush_scheduled_work(); - /* Let any existing reference holders finish */ - retval = tty_ldisc_wait_idle(tty); - if (retval < 0) { - clear_bit(TTY_LDISC_CHANGING, &tty->flags); - tty_ldisc_put(new_ldisc); - return retval; - } - mutex_lock(&tty->ldisc_mutex); if (test_bit(TTY_HUPPED, &tty->flags)) { /* We were raced by the hangup method. It will have stomped @@ -795,7 +758,6 @@ void tty_ldisc_hangup(struct tty_struct *tty) if (tty->ldisc) { /* Not yet closed */ /* Switch back to N_TTY */ tty_ldisc_halt(tty); - tty_ldisc_wait_idle(tty); tty_ldisc_reinit(tty); /* At this point we have a closed ldisc and we want to reopen it. We could defer this to the next open but @@ -860,14 +822,6 @@ void tty_ldisc_release(struct tty_struct *tty, struct tty_struct *o_tty) tty_ldisc_halt(tty); flush_scheduled_work(); - /* - * Wait for any short term users (we know they are just driver - * side waiters as the file is closing so user count on the file - * side is zero. - */ - - tty_ldisc_wait_idle(tty); - mutex_lock(&tty->ldisc_mutex); /* * Now kill off the ldisc diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index b90eda8b344..fd69086d08d 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -858,6 +858,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) /* Check for existing affected CPUs. * They may not be aware of it due to CPU Hotplug. + * cpufreq_cpu_put is called when the device is removed + * in __cpufreq_remove_dev() */ managed_policy = cpufreq_cpu_get(j); if (unlikely(managed_policy)) { @@ -884,7 +886,7 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) ret = sysfs_create_link(&sys_dev->kobj, &managed_policy->kobj, "cpufreq"); - if (!ret) + if (ret) cpufreq_cpu_put(managed_policy); /* * Success. We only needed to be added to the mask. @@ -924,6 +926,8 @@ static int cpufreq_add_dev(struct sys_device *sys_dev) spin_lock_irqsave(&cpufreq_driver_lock, flags); for_each_cpu(j, policy->cpus) { + if (!cpu_online(j)) + continue; per_cpu(cpufreq_cpu_data, j) = policy; per_cpu(policy_cpu, j) = policy->cpu; } @@ -1244,13 +1248,22 @@ EXPORT_SYMBOL(cpufreq_get); static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg) { - int cpu = sysdev->id; int ret = 0; + +#ifdef __powerpc__ + int cpu = sysdev->id; unsigned int cur_freq = 0; struct cpufreq_policy *cpu_policy; dprintk("suspending cpu %u\n", cpu); + /* + * This whole bogosity is here because Powerbooks are made of fail. + * No sane platform should need any of the code below to be run. + * (it's entirely the wrong thing to do, as driver->get may + * reenable interrupts on some architectures). + */ + if (!cpu_online(cpu)) return 0; @@ -1309,6 +1322,7 @@ static int cpufreq_suspend(struct sys_device *sysdev, pm_message_t pmsg) out: cpufreq_cpu_put(cpu_policy); +#endif /* __powerpc__ */ return ret; } @@ -1322,12 +1336,18 @@ out: */ static int cpufreq_resume(struct sys_device *sysdev) { - int cpu = sysdev->id; int ret = 0; + +#ifdef __powerpc__ + int cpu = sysdev->id; struct cpufreq_policy *cpu_policy; dprintk("resuming cpu %u\n", cpu); + /* As with the ->suspend method, all the code below is + * only necessary because Powerbooks suck. + * See commit 42d4dc3f4e1e for jokes. */ + if (!cpu_online(cpu)) return 0; @@ -1391,6 +1411,7 @@ out: schedule_work(&cpu_policy->update); fail: cpufreq_cpu_put(cpu_policy); +#endif /* __powerpc__ */ return ret; } diff --git a/drivers/cpufreq/cpufreq_conservative.c b/drivers/cpufreq/cpufreq_conservative.c index 57490502b21..bdea7e2f94b 100644 --- a/drivers/cpufreq/cpufreq_conservative.c +++ b/drivers/cpufreq/cpufreq_conservative.c @@ -63,6 +63,7 @@ struct cpu_dbs_info_s { unsigned int down_skip; unsigned int requested_freq; int cpu; + unsigned int enable:1; /* * percpu mutex that serializes governor limit change with * do_dbs_timer invocation. We do not want do_dbs_timer to run @@ -141,6 +142,9 @@ dbs_cpufreq_notifier(struct notifier_block *nb, unsigned long val, struct cpufreq_policy *policy; + if (!this_dbs_info->enable) + return 0; + policy = this_dbs_info->cur_policy; /* @@ -497,6 +501,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) int delay = usecs_to_jiffies(dbs_tuners_ins.sampling_rate); delay -= jiffies % delay; + dbs_info->enable = 1; INIT_DELAYED_WORK_DEFERRABLE(&dbs_info->work, do_dbs_timer); queue_delayed_work_on(dbs_info->cpu, kconservative_wq, &dbs_info->work, delay); @@ -504,6 +509,7 @@ static inline void dbs_timer_init(struct cpu_dbs_info_s *dbs_info) static inline void dbs_timer_exit(struct cpu_dbs_info_s *dbs_info) { + dbs_info->enable = 0; cancel_delayed_work_sync(&dbs_info->work); } diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 070357aaedb..81e1020fb51 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -4,7 +4,7 @@ menuconfig DMADEVICES bool "DMA Engine support" - depends on !HIGHMEM64G && HAS_DMA + depends on HAS_DMA help DMA engines can do asynchronous data transfers without involving the host CPU. Currently, this framework can be @@ -46,6 +46,14 @@ config DW_DMAC Support the Synopsys DesignWare AHB DMA controller. This can be integrated in chips such as the Atmel AT32ap7000. +config AT_HDMAC + tristate "Atmel AHB DMA support" + depends on ARCH_AT91SAM9RL + select DMA_ENGINE + help + Support the Atmel AHB DMA controller. This can be integrated in + chips such as the Atmel AT91SAM9RL. + config FSL_DMA tristate "Freescale Elo and Elo Plus DMA support" depends on FSL_SOC @@ -108,7 +116,7 @@ config NET_DMA config ASYNC_TX_DMA bool "Async_tx: Offload support for the async_tx api" - depends on DMA_ENGINE + depends on DMA_ENGINE && !HIGHMEM64G help This allows the async_tx api to take advantage of offload engines for memcpy, memset, xor, and raid6 p+q operations. If your platform has diff --git a/drivers/dma/Makefile b/drivers/dma/Makefile index a0b6564800c..40e1e008357 100644 --- a/drivers/dma/Makefile +++ b/drivers/dma/Makefile @@ -7,5 +7,6 @@ obj-$(CONFIG_INTEL_IOP_ADMA) += iop-adma.o obj-$(CONFIG_FSL_DMA) += fsldma.o obj-$(CONFIG_MV_XOR) += mv_xor.o obj-$(CONFIG_DW_DMAC) += dw_dmac.o +obj-$(CONFIG_AT_HDMAC) += at_hdmac.o obj-$(CONFIG_MX3_IPU) += ipu/ obj-$(CONFIG_TXX9_DMAC) += txx9dmac.o diff --git a/drivers/dma/at_hdmac.c b/drivers/dma/at_hdmac.c new file mode 100644 index 00000000000..9a1e5fb412e --- /dev/null +++ b/drivers/dma/at_hdmac.c @@ -0,0 +1,1213 @@ +/* + * Driver for the Atmel AHB DMA Controller (aka HDMA or DMAC on AT91 systems) + * + * Copyright (C) 2008 Atmel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + * + * + * This supports the Atmel AHB DMA Controller, + * + * The driver has currently been tested with the Atmel AT91SAM9RL + * and AT91SAM9G45 series. + */ + +#include <linux/clk.h> +#include <linux/dmaengine.h> +#include <linux/dma-mapping.h> +#include <linux/dmapool.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/platform_device.h> + +#include "at_hdmac_regs.h" + +/* + * Glossary + * -------- + * + * at_hdmac : Name of the ATmel AHB DMA Controller + * at_dma_ / atdma : ATmel DMA controller entity related + * atc_ / atchan : ATmel DMA Channel entity related + */ + +#define ATC_DEFAULT_CFG (ATC_FIFOCFG_HALFFIFO) +#define ATC_DEFAULT_CTRLA (0) +#define ATC_DEFAULT_CTRLB (ATC_SIF(0) \ + |ATC_DIF(1)) + +/* + * Initial number of descriptors to allocate for each channel. This could + * be increased during dma usage. + */ +static unsigned int init_nr_desc_per_channel = 64; +module_param(init_nr_desc_per_channel, uint, 0644); +MODULE_PARM_DESC(init_nr_desc_per_channel, + "initial descriptors per channel (default: 64)"); + + +/* prototypes */ +static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx); + + +/*----------------------------------------------------------------------*/ + +static struct at_desc *atc_first_active(struct at_dma_chan *atchan) +{ + return list_first_entry(&atchan->active_list, + struct at_desc, desc_node); +} + +static struct at_desc *atc_first_queued(struct at_dma_chan *atchan) +{ + return list_first_entry(&atchan->queue, + struct at_desc, desc_node); +} + +/** + * atc_alloc_descriptor - allocate and return an initilized descriptor + * @chan: the channel to allocate descriptors for + * @gfp_flags: GFP allocation flags + * + * Note: The ack-bit is positioned in the descriptor flag at creation time + * to make initial allocation more convenient. This bit will be cleared + * and control will be given to client at usage time (during + * preparation functions). + */ +static struct at_desc *atc_alloc_descriptor(struct dma_chan *chan, + gfp_t gfp_flags) +{ + struct at_desc *desc = NULL; + struct at_dma *atdma = to_at_dma(chan->device); + dma_addr_t phys; + + desc = dma_pool_alloc(atdma->dma_desc_pool, gfp_flags, &phys); + if (desc) { + memset(desc, 0, sizeof(struct at_desc)); + dma_async_tx_descriptor_init(&desc->txd, chan); + /* txd.flags will be overwritten in prep functions */ + desc->txd.flags = DMA_CTRL_ACK; + desc->txd.tx_submit = atc_tx_submit; + desc->txd.phys = phys; + } + + return desc; +} + +/** + * atc_desc_get - get a unsused descriptor from free_list + * @atchan: channel we want a new descriptor for + */ +static struct at_desc *atc_desc_get(struct at_dma_chan *atchan) +{ + struct at_desc *desc, *_desc; + struct at_desc *ret = NULL; + unsigned int i = 0; + LIST_HEAD(tmp_list); + + spin_lock_bh(&atchan->lock); + list_for_each_entry_safe(desc, _desc, &atchan->free_list, desc_node) { + i++; + if (async_tx_test_ack(&desc->txd)) { + list_del(&desc->desc_node); + ret = desc; + break; + } + dev_dbg(chan2dev(&atchan->chan_common), + "desc %p not ACKed\n", desc); + } + spin_unlock_bh(&atchan->lock); + dev_vdbg(chan2dev(&atchan->chan_common), + "scanned %u descriptors on freelist\n", i); + + /* no more descriptor available in initial pool: create one more */ + if (!ret) { + ret = atc_alloc_descriptor(&atchan->chan_common, GFP_ATOMIC); + if (ret) { + spin_lock_bh(&atchan->lock); + atchan->descs_allocated++; + spin_unlock_bh(&atchan->lock); + } else { + dev_err(chan2dev(&atchan->chan_common), + "not enough descriptors available\n"); + } + } + + return ret; +} + +/** + * atc_desc_put - move a descriptor, including any children, to the free list + * @atchan: channel we work on + * @desc: descriptor, at the head of a chain, to move to free list + */ +static void atc_desc_put(struct at_dma_chan *atchan, struct at_desc *desc) +{ + if (desc) { + struct at_desc *child; + + spin_lock_bh(&atchan->lock); + list_for_each_entry(child, &desc->txd.tx_list, desc_node) + dev_vdbg(chan2dev(&atchan->chan_common), + "moving child desc %p to freelist\n", + child); + list_splice_init(&desc->txd.tx_list, &atchan->free_list); + dev_vdbg(chan2dev(&atchan->chan_common), + "moving desc %p to freelist\n", desc); + list_add(&desc->desc_node, &atchan->free_list); + spin_unlock_bh(&atchan->lock); + } +} + +/** + * atc_assign_cookie - compute and assign new cookie + * @atchan: channel we work on + * @desc: descriptor to asign cookie for + * + * Called with atchan->lock held and bh disabled + */ +static dma_cookie_t +atc_assign_cookie(struct at_dma_chan *atchan, struct at_desc *desc) +{ + dma_cookie_t cookie = atchan->chan_common.cookie; + + if (++cookie < 0) + cookie = 1; + + atchan->chan_common.cookie = cookie; + desc->txd.cookie = cookie; + + return cookie; +} + +/** + * atc_dostart - starts the DMA engine for real + * @atchan: the channel we want to start + * @first: first descriptor in the list we want to begin with + * + * Called with atchan->lock held and bh disabled + */ +static void atc_dostart(struct at_dma_chan *atchan, struct at_desc *first) +{ + struct at_dma *atdma = to_at_dma(atchan->chan_common.device); + + /* ASSERT: channel is idle */ + if (atc_chan_is_enabled(atchan)) { + dev_err(chan2dev(&atchan->chan_common), + "BUG: Attempted to start non-idle channel\n"); + dev_err(chan2dev(&atchan->chan_common), + " channel: s0x%x d0x%x ctrl0x%x:0x%x l0x%x\n", + channel_readl(atchan, SADDR), + channel_readl(atchan, DADDR), + channel_readl(atchan, CTRLA), + channel_readl(atchan, CTRLB), + channel_readl(atchan, DSCR)); + + /* The tasklet will hopefully advance the queue... */ + return; + } + + vdbg_dump_regs(atchan); + + /* clear any pending interrupt */ + while (dma_readl(atdma, EBCISR)) + cpu_relax(); + + channel_writel(atchan, SADDR, 0); + channel_writel(atchan, DADDR, 0); + channel_writel(atchan, CTRLA, 0); + channel_writel(atchan, CTRLB, 0); + channel_writel(atchan, DSCR, first->txd.phys); + dma_writel(atdma, CHER, atchan->mask); + + vdbg_dump_regs(atchan); +} + +/** + * atc_chain_complete - finish work for one transaction chain + * @atchan: channel we work on + * @desc: descriptor at the head of the chain we want do complete + * + * Called with atchan->lock held and bh disabled */ +static void +atc_chain_complete(struct at_dma_chan *atchan, struct at_desc *desc) +{ + dma_async_tx_callback callback; + void *param; + struct dma_async_tx_descriptor *txd = &desc->txd; + + dev_vdbg(chan2dev(&atchan->chan_common), + "descriptor %u complete\n", txd->cookie); + + atchan->completed_cookie = txd->cookie; + callback = txd->callback; + param = txd->callback_param; + + /* move children to free_list */ + list_splice_init(&txd->tx_list, &atchan->free_list); + /* move myself to free_list */ + list_move(&desc->desc_node, &atchan->free_list); + + /* unmap dma addresses */ + if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) { + if (txd->flags & DMA_COMPL_DEST_UNMAP_SINGLE) + dma_unmap_single(chan2parent(&atchan->chan_common), + desc->lli.daddr, + desc->len, DMA_FROM_DEVICE); + else + dma_unmap_page(chan2parent(&atchan->chan_common), + desc->lli.daddr, + desc->len, DMA_FROM_DEVICE); + } + if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) { + if (txd->flags & DMA_COMPL_SRC_UNMAP_SINGLE) + dma_unmap_single(chan2parent(&atchan->chan_common), + desc->lli.saddr, + desc->len, DMA_TO_DEVICE); + else + dma_unmap_page(chan2parent(&atchan->chan_common), + desc->lli.saddr, + desc->len, DMA_TO_DEVICE); + } + + /* + * The API requires that no submissions are done from a + * callback, so we don't need to drop the lock here + */ + if (callback) + callback(param); + + dma_run_dependencies(txd); +} + +/** + * atc_complete_all - finish work for all transactions + * @atchan: channel to complete transactions for + * + * Eventually submit queued descriptors if any + * + * Assume channel is idle while calling this function + * Called with atchan->lock held and bh disabled + */ +static void atc_complete_all(struct at_dma_chan *atchan) +{ + struct at_desc *desc, *_desc; + LIST_HEAD(list); + + dev_vdbg(chan2dev(&atchan->chan_common), "complete all\n"); + + BUG_ON(atc_chan_is_enabled(atchan)); + + /* + * Submit queued descriptors ASAP, i.e. before we go through + * the completed ones. + */ + if (!list_empty(&atchan->queue)) + atc_dostart(atchan, atc_first_queued(atchan)); + /* empty active_list now it is completed */ + list_splice_init(&atchan->active_list, &list); + /* empty queue list by moving descriptors (if any) to active_list */ + list_splice_init(&atchan->queue, &atchan->active_list); + + list_for_each_entry_safe(desc, _desc, &list, desc_node) + atc_chain_complete(atchan, desc); +} + +/** + * atc_cleanup_descriptors - cleanup up finished descriptors in active_list + * @atchan: channel to be cleaned up + * + * Called with atchan->lock held and bh disabled + */ +static void atc_cleanup_descriptors(struct at_dma_chan *atchan) +{ + struct at_desc *desc, *_desc; + struct at_desc *child; + + dev_vdbg(chan2dev(&atchan->chan_common), "cleanup descriptors\n"); + + list_for_each_entry_safe(desc, _desc, &atchan->active_list, desc_node) { + if (!(desc->lli.ctrla & ATC_DONE)) + /* This one is currently in progress */ + return; + + list_for_each_entry(child, &desc->txd.tx_list, desc_node) + if (!(child->lli.ctrla & ATC_DONE)) + /* Currently in progress */ + return; + + /* + * No descriptors so far seem to be in progress, i.e. + * this chain must be done. + */ + atc_chain_complete(atchan, desc); + } +} + +/** + * atc_advance_work - at the end of a transaction, move forward + * @atchan: channel where the transaction ended + * + * Called with atchan->lock held and bh disabled + */ +static void atc_advance_work(struct at_dma_chan *atchan) +{ + dev_vdbg(chan2dev(&atchan->chan_common), "advance_work\n"); + + if (list_empty(&atchan->active_list) || + list_is_singular(&atchan->active_list)) { + atc_complete_all(atchan); + } else { + atc_chain_complete(atchan, atc_first_active(atchan)); + /* advance work */ + atc_dostart(atchan, atc_first_active(atchan)); + } +} + + +/** + * atc_handle_error - handle errors reported by DMA controller + * @atchan: channel where error occurs + * + * Called with atchan->lock held and bh disabled + */ +static void atc_handle_error(struct at_dma_chan *atchan) +{ + struct at_desc *bad_desc; + struct at_desc *child; + + /* + * The descriptor currently at the head of the active list is + * broked. Since we don't have any way to report errors, we'll + * just have to scream loudly and try to carry on. + */ + bad_desc = atc_first_active(atchan); + list_del_init(&bad_desc->desc_node); + + /* As we are stopped, take advantage to push queued descriptors + * in active_list */ + list_splice_init(&atchan->queue, atchan->active_list.prev); + + /* Try to restart the controller */ + if (!list_empty(&atchan->active_list)) + atc_dostart(atchan, atc_first_active(atchan)); + + /* + * KERN_CRITICAL may seem harsh, but since this only happens + * when someone submits a bad physical address in a + * descriptor, we should consider ourselves lucky that the + * controller flagged an error instead of scribbling over + * random memory locations. + */ + dev_crit(chan2dev(&atchan->chan_common), + "Bad descriptor submitted for DMA!\n"); + dev_crit(chan2dev(&atchan->chan_common), + " cookie: %d\n", bad_desc->txd.cookie); + atc_dump_lli(atchan, &bad_desc->lli); + list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) + atc_dump_lli(atchan, &child->lli); + + /* Pretend the descriptor completed successfully */ + atc_chain_complete(atchan, bad_desc); +} + + +/*-- IRQ & Tasklet ---------------------------------------------------*/ + +static void atc_tasklet(unsigned long data) +{ + struct at_dma_chan *atchan = (struct at_dma_chan *)data; + + /* Channel cannot be enabled here */ + if (atc_chan_is_enabled(atchan)) { + dev_err(chan2dev(&atchan->chan_common), + "BUG: channel enabled in tasklet\n"); + return; + } + + spin_lock(&atchan->lock); + if (test_and_clear_bit(0, &atchan->error_status)) + atc_handle_error(atchan); + else + atc_advance_work(atchan); + + spin_unlock(&atchan->lock); +} + +static irqreturn_t at_dma_interrupt(int irq, void *dev_id) +{ + struct at_dma *atdma = (struct at_dma *)dev_id; + struct at_dma_chan *atchan; + int i; + u32 status, pending, imr; + int ret = IRQ_NONE; + + do { + imr = dma_readl(atdma, EBCIMR); + status = dma_readl(atdma, EBCISR); + pending = status & imr; + + if (!pending) + break; + + dev_vdbg(atdma->dma_common.dev, + "interrupt: status = 0x%08x, 0x%08x, 0x%08x\n", + status, imr, pending); + + for (i = 0; i < atdma->dma_common.chancnt; i++) { + atchan = &atdma->chan[i]; + if (pending & (AT_DMA_CBTC(i) | AT_DMA_ERR(i))) { + if (pending & AT_DMA_ERR(i)) { + /* Disable channel on AHB error */ + dma_writel(atdma, CHDR, atchan->mask); + /* Give information to tasklet */ + set_bit(0, &atchan->error_status); + } + tasklet_schedule(&atchan->tasklet); + ret = IRQ_HANDLED; + } + } + + } while (pending); + + return ret; +} + + +/*-- DMA Engine API --------------------------------------------------*/ + +/** + * atc_tx_submit - set the prepared descriptor(s) to be executed by the engine + * @desc: descriptor at the head of the transaction chain + * + * Queue chain if DMA engine is working already + * + * Cookie increment and adding to active_list or queue must be atomic + */ +static dma_cookie_t atc_tx_submit(struct dma_async_tx_descriptor *tx) +{ + struct at_desc *desc = txd_to_at_desc(tx); + struct at_dma_chan *atchan = to_at_dma_chan(tx->chan); + dma_cookie_t cookie; + + spin_lock_bh(&atchan->lock); + cookie = atc_assign_cookie(atchan, desc); + + if (list_empty(&atchan->active_list)) { + dev_vdbg(chan2dev(tx->chan), "tx_submit: started %u\n", + desc->txd.cookie); + atc_dostart(atchan, desc); + list_add_tail(&desc->desc_node, &atchan->active_list); + } else { + dev_vdbg(chan2dev(tx->chan), "tx_submit: queued %u\n", + desc->txd.cookie); + list_add_tail(&desc->desc_node, &atchan->queue); + } + + spin_unlock_bh(&atchan->lock); + + return cookie; +} + +/** + * atc_prep_dma_memcpy - prepare a memcpy operation + * @chan: the channel to prepare operation on + * @dest: operation virtual destination address + * @src: operation virtual source address + * @len: operation length + * @flags: tx descriptor status flags + */ +static struct dma_async_tx_descriptor * +atc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, + size_t len, unsigned long flags) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_desc *desc = NULL; + struct at_desc *first = NULL; + struct at_desc *prev = NULL; + size_t xfer_count; + size_t offset; + unsigned int src_width; + unsigned int dst_width; + u32 ctrla; + u32 ctrlb; + + dev_vdbg(chan2dev(chan), "prep_dma_memcpy: d0x%x s0x%x l0x%zx f0x%lx\n", + dest, src, len, flags); + + if (unlikely(!len)) { + dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n"); + return NULL; + } + + ctrla = ATC_DEFAULT_CTRLA; + ctrlb = ATC_DEFAULT_CTRLB + | ATC_SRC_ADDR_MODE_INCR + | ATC_DST_ADDR_MODE_INCR + | ATC_FC_MEM2MEM; + + /* + * We can be a lot more clever here, but this should take care + * of the most common optimization. + */ + if (!((src | dest | len) & 3)) { + ctrla |= ATC_SRC_WIDTH_WORD | ATC_DST_WIDTH_WORD; + src_width = dst_width = 2; + } else if (!((src | dest | len) & 1)) { + ctrla |= ATC_SRC_WIDTH_HALFWORD | ATC_DST_WIDTH_HALFWORD; + src_width = dst_width = 1; + } else { + ctrla |= ATC_SRC_WIDTH_BYTE | ATC_DST_WIDTH_BYTE; + src_width = dst_width = 0; + } + + for (offset = 0; offset < len; offset += xfer_count << src_width) { + xfer_count = min_t(size_t, (len - offset) >> src_width, + ATC_BTSIZE_MAX); + + desc = atc_desc_get(atchan); + if (!desc) + goto err_desc_get; + + desc->lli.saddr = src + offset; + desc->lli.daddr = dest + offset; + desc->lli.ctrla = ctrla | xfer_count; + desc->lli.ctrlb = ctrlb; + + desc->txd.cookie = 0; + async_tx_ack(&desc->txd); + + if (!first) { + first = desc; + } else { + /* inform the HW lli about chaining */ + prev->lli.dscr = desc->txd.phys; + /* insert the link descriptor to the LD ring */ + list_add_tail(&desc->desc_node, + &first->txd.tx_list); + } + prev = desc; + } + + /* First descriptor of the chain embedds additional information */ + first->txd.cookie = -EBUSY; + first->len = len; + + /* set end-of-link to the last link descriptor of list*/ + set_desc_eol(desc); + + desc->txd.flags = flags; /* client is in control of this ack */ + + return &first->txd; + +err_desc_get: + atc_desc_put(atchan, first); + return NULL; +} + + +/** + * atc_prep_slave_sg - prepare descriptors for a DMA_SLAVE transaction + * @chan: DMA channel + * @sgl: scatterlist to transfer to/from + * @sg_len: number of entries in @scatterlist + * @direction: DMA direction + * @flags: tx descriptor status flags + */ +static struct dma_async_tx_descriptor * +atc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, + unsigned int sg_len, enum dma_data_direction direction, + unsigned long flags) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_dma_slave *atslave = chan->private; + struct at_desc *first = NULL; + struct at_desc *prev = NULL; + u32 ctrla; + u32 ctrlb; + dma_addr_t reg; + unsigned int reg_width; + unsigned int mem_width; + unsigned int i; + struct scatterlist *sg; + size_t total_len = 0; + + dev_vdbg(chan2dev(chan), "prep_slave_sg: %s f0x%lx\n", + direction == DMA_TO_DEVICE ? "TO DEVICE" : "FROM DEVICE", + flags); + + if (unlikely(!atslave || !sg_len)) { + dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n"); + return NULL; + } + + reg_width = atslave->reg_width; + + sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction); + + ctrla = ATC_DEFAULT_CTRLA | atslave->ctrla; + ctrlb = ATC_DEFAULT_CTRLB | ATC_IEN; + + switch (direction) { + case DMA_TO_DEVICE: + ctrla |= ATC_DST_WIDTH(reg_width); + ctrlb |= ATC_DST_ADDR_MODE_FIXED + | ATC_SRC_ADDR_MODE_INCR + | ATC_FC_MEM2PER; + reg = atslave->tx_reg; + for_each_sg(sgl, sg, sg_len, i) { + struct at_desc *desc; + u32 len; + u32 mem; + + desc = atc_desc_get(atchan); + if (!desc) + goto err_desc_get; + + mem = sg_phys(sg); + len = sg_dma_len(sg); + mem_width = 2; + if (unlikely(mem & 3 || len & 3)) + mem_width = 0; + + desc->lli.saddr = mem; + desc->lli.daddr = reg; + desc->lli.ctrla = ctrla + | ATC_SRC_WIDTH(mem_width) + | len >> mem_width; + desc->lli.ctrlb = ctrlb; + + if (!first) { + first = desc; + } else { + /* inform the HW lli about chaining */ + prev->lli.dscr = desc->txd.phys; + /* insert the link descriptor to the LD ring */ + list_add_tail(&desc->desc_node, + &first->txd.tx_list); + } + prev = desc; + total_len += len; + } + break; + case DMA_FROM_DEVICE: + ctrla |= ATC_SRC_WIDTH(reg_width); + ctrlb |= ATC_DST_ADDR_MODE_INCR + | ATC_SRC_ADDR_MODE_FIXED + | ATC_FC_PER2MEM; + + reg = atslave->rx_reg; + for_each_sg(sgl, sg, sg_len, i) { + struct at_desc *desc; + u32 len; + u32 mem; + + desc = atc_desc_get(atchan); + if (!desc) + goto err_desc_get; + + mem = sg_phys(sg); + len = sg_dma_len(sg); + mem_width = 2; + if (unlikely(mem & 3 || len & 3)) + mem_width = 0; + + desc->lli.saddr = reg; + desc->lli.daddr = mem; + desc->lli.ctrla = ctrla + | ATC_DST_WIDTH(mem_width) + | len >> mem_width; + desc->lli.ctrlb = ctrlb; + + if (!first) { + first = desc; + } else { + /* inform the HW lli about chaining */ + prev->lli.dscr = desc->txd.phys; + /* insert the link descriptor to the LD ring */ + list_add_tail(&desc->desc_node, + &first->txd.tx_list); + } + prev = desc; + total_len += len; + } + break; + default: + return NULL; + } + + /* set end-of-link to the last link descriptor of list*/ + set_desc_eol(prev); + + /* First descriptor of the chain embedds additional information */ + first->txd.cookie = -EBUSY; + first->len = total_len; + + /* last link descriptor of list is responsible of flags */ + prev->txd.flags = flags; /* client is in control of this ack */ + + return &first->txd; + +err_desc_get: + dev_err(chan2dev(chan), "not enough descriptors available\n"); + atc_desc_put(atchan, first); + return NULL; +} + +static void atc_terminate_all(struct dma_chan *chan) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_dma *atdma = to_at_dma(chan->device); + struct at_desc *desc, *_desc; + LIST_HEAD(list); + + /* + * This is only called when something went wrong elsewhere, so + * we don't really care about the data. Just disable the + * channel. We still have to poll the channel enable bit due + * to AHB/HSB limitations. + */ + spin_lock_bh(&atchan->lock); + + dma_writel(atdma, CHDR, atchan->mask); + + /* confirm that this channel is disabled */ + while (dma_readl(atdma, CHSR) & atchan->mask) + cpu_relax(); + + /* active_list entries will end up before queued entries */ + list_splice_init(&atchan->queue, &list); + list_splice_init(&atchan->active_list, &list); + + spin_unlock_bh(&atchan->lock); + + /* Flush all pending and queued descriptors */ + list_for_each_entry_safe(desc, _desc, &list, desc_node) + atc_chain_complete(atchan, desc); +} + +/** + * atc_is_tx_complete - poll for transaction completion + * @chan: DMA channel + * @cookie: transaction identifier to check status of + * @done: if not %NULL, updated with last completed transaction + * @used: if not %NULL, updated with last used transaction + * + * If @done and @used are passed in, upon return they reflect the driver + * internal state and can be used with dma_async_is_complete() to check + * the status of multiple cookies without re-checking hardware state. + */ +static enum dma_status +atc_is_tx_complete(struct dma_chan *chan, + dma_cookie_t cookie, + dma_cookie_t *done, dma_cookie_t *used) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + dma_cookie_t last_used; + dma_cookie_t last_complete; + enum dma_status ret; + + dev_vdbg(chan2dev(chan), "is_tx_complete: %d (d%d, u%d)\n", + cookie, done ? *done : 0, used ? *used : 0); + + spin_lock_bh(atchan->lock); + + last_complete = atchan->completed_cookie; + last_used = chan->cookie; + + ret = dma_async_is_complete(cookie, last_complete, last_used); + if (ret != DMA_SUCCESS) { + atc_cleanup_descriptors(atchan); + + last_complete = atchan->completed_cookie; + last_used = chan->cookie; + + ret = dma_async_is_complete(cookie, last_complete, last_used); + } + + spin_unlock_bh(atchan->lock); + + if (done) + *done = last_complete; + if (used) + *used = last_used; + + return ret; +} + +/** + * atc_issue_pending - try to finish work + * @chan: target DMA channel + */ +static void atc_issue_pending(struct dma_chan *chan) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + + dev_vdbg(chan2dev(chan), "issue_pending\n"); + + if (!atc_chan_is_enabled(atchan)) { + spin_lock_bh(&atchan->lock); + atc_advance_work(atchan); + spin_unlock_bh(&atchan->lock); + } +} + +/** + * atc_alloc_chan_resources - allocate resources for DMA channel + * @chan: allocate descriptor resources for this channel + * @client: current client requesting the channel be ready for requests + * + * return - the number of allocated descriptors + */ +static int atc_alloc_chan_resources(struct dma_chan *chan) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_dma *atdma = to_at_dma(chan->device); + struct at_desc *desc; + struct at_dma_slave *atslave; + int i; + u32 cfg; + LIST_HEAD(tmp_list); + + dev_vdbg(chan2dev(chan), "alloc_chan_resources\n"); + + /* ASSERT: channel is idle */ + if (atc_chan_is_enabled(atchan)) { + dev_dbg(chan2dev(chan), "DMA channel not idle ?\n"); + return -EIO; + } + + cfg = ATC_DEFAULT_CFG; + + atslave = chan->private; + if (atslave) { + /* + * We need controller-specific data to set up slave + * transfers. + */ + BUG_ON(!atslave->dma_dev || atslave->dma_dev != atdma->dma_common.dev); + + /* if cfg configuration specified take it instad of default */ + if (atslave->cfg) + cfg = atslave->cfg; + } + + /* have we already been set up? + * reconfigure channel but no need to reallocate descriptors */ + if (!list_empty(&atchan->free_list)) + return atchan->descs_allocated; + + /* Allocate initial pool of descriptors */ + for (i = 0; i < init_nr_desc_per_channel; i++) { + desc = atc_alloc_descriptor(chan, GFP_KERNEL); + if (!desc) { + dev_err(atdma->dma_common.dev, + "Only %d initial descriptors\n", i); + break; + } + list_add_tail(&desc->desc_node, &tmp_list); + } + + spin_lock_bh(&atchan->lock); + atchan->descs_allocated = i; + list_splice(&tmp_list, &atchan->free_list); + atchan->completed_cookie = chan->cookie = 1; + spin_unlock_bh(&atchan->lock); + + /* channel parameters */ + channel_writel(atchan, CFG, cfg); + + dev_dbg(chan2dev(chan), + "alloc_chan_resources: allocated %d descriptors\n", + atchan->descs_allocated); + + return atchan->descs_allocated; +} + +/** + * atc_free_chan_resources - free all channel resources + * @chan: DMA channel + */ +static void atc_free_chan_resources(struct dma_chan *chan) +{ + struct at_dma_chan *atchan = to_at_dma_chan(chan); + struct at_dma *atdma = to_at_dma(chan->device); + struct at_desc *desc, *_desc; + LIST_HEAD(list); + + dev_dbg(chan2dev(chan), "free_chan_resources: (descs allocated=%u)\n", + atchan->descs_allocated); + + /* ASSERT: channel is idle */ + BUG_ON(!list_empty(&atchan->active_list)); + BUG_ON(!list_empty(&atchan->queue)); + BUG_ON(atc_chan_is_enabled(atchan)); + + list_for_each_entry_safe(desc, _desc, &atchan->free_list, desc_node) { + dev_vdbg(chan2dev(chan), " freeing descriptor %p\n", desc); + list_del(&desc->desc_node); + /* free link descriptor */ + dma_pool_free(atdma->dma_desc_pool, desc, desc->txd.phys); + } + list_splice_init(&atchan->free_list, &list); + atchan->descs_allocated = 0; + + dev_vdbg(chan2dev(chan), "free_chan_resources: done\n"); +} + + +/*-- Module Management -----------------------------------------------*/ + +/** + * at_dma_off - disable DMA controller + * @atdma: the Atmel HDAMC device + */ +static void at_dma_off(struct at_dma *atdma) +{ + dma_writel(atdma, EN, 0); + + /* disable all interrupts */ + dma_writel(atdma, EBCIDR, -1L); + + /* confirm that all channels are disabled */ + while (dma_readl(atdma, CHSR) & atdma->all_chan_mask) + cpu_relax(); +} + +static int __init at_dma_probe(struct platform_device *pdev) +{ + struct at_dma_platform_data *pdata; + struct resource *io; + struct at_dma *atdma; + size_t size; + int irq; + int err; + int i; + + /* get DMA Controller parameters from platform */ + pdata = pdev->dev.platform_data; + if (!pdata || pdata->nr_channels > AT_DMA_MAX_NR_CHANNELS) + return -EINVAL; + + io = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!io) + return -EINVAL; + + irq = platform_get_irq(pdev, 0); + if (irq < 0) + return irq; + + size = sizeof(struct at_dma); + size += pdata->nr_channels * sizeof(struct at_dma_chan); + atdma = kzalloc(size, GFP_KERNEL); + if (!atdma) + return -ENOMEM; + + /* discover transaction capabilites from the platform data */ + atdma->dma_common.cap_mask = pdata->cap_mask; + atdma->all_chan_mask = (1 << pdata->nr_channels) - 1; + + size = io->end - io->start + 1; + if (!request_mem_region(io->start, size, pdev->dev.driver->name)) { + err = -EBUSY; + goto err_kfree; + } + + atdma->regs = ioremap(io->start, size); + if (!atdma->regs) { + err = -ENOMEM; + goto err_release_r; + } + + atdma->clk = clk_get(&pdev->dev, "dma_clk"); + if (IS_ERR(atdma->clk)) { + err = PTR_ERR(atdma->clk); + goto err_clk; + } + clk_enable(atdma->clk); + + /* force dma off, just in case */ + at_dma_off(atdma); + + err = request_irq(irq, at_dma_interrupt, 0, "at_hdmac", atdma); + if (err) + goto err_irq; + + platform_set_drvdata(pdev, atdma); + + /* create a pool of consistent memory blocks for hardware descriptors */ + atdma->dma_desc_pool = dma_pool_create("at_hdmac_desc_pool", + &pdev->dev, sizeof(struct at_desc), + 4 /* word alignment */, 0); + if (!atdma->dma_desc_pool) { + dev_err(&pdev->dev, "No memory for descriptors dma pool\n"); + err = -ENOMEM; + goto err_pool_create; + } + + /* clear any pending interrupt */ + while (dma_readl(atdma, EBCISR)) + cpu_relax(); + + /* initialize channels related values */ + INIT_LIST_HEAD(&atdma->dma_common.channels); + for (i = 0; i < pdata->nr_channels; i++, atdma->dma_common.chancnt++) { + struct at_dma_chan *atchan = &atdma->chan[i]; + + atchan->chan_common.device = &atdma->dma_common; + atchan->chan_common.cookie = atchan->completed_cookie = 1; + atchan->chan_common.chan_id = i; + list_add_tail(&atchan->chan_common.device_node, + &atdma->dma_common.channels); + + atchan->ch_regs = atdma->regs + ch_regs(i); + spin_lock_init(&atchan->lock); + atchan->mask = 1 << i; + + INIT_LIST_HEAD(&atchan->active_list); + INIT_LIST_HEAD(&atchan->queue); + INIT_LIST_HEAD(&atchan->free_list); + + tasklet_init(&atchan->tasklet, atc_tasklet, + (unsigned long)atchan); + atc_enable_irq(atchan); + } + + /* set base routines */ + atdma->dma_common.device_alloc_chan_resources = atc_alloc_chan_resources; + atdma->dma_common.device_free_chan_resources = atc_free_chan_resources; + atdma->dma_common.device_is_tx_complete = atc_is_tx_complete; + atdma->dma_common.device_issue_pending = atc_issue_pending; + atdma->dma_common.dev = &pdev->dev; + + /* set prep routines based on capability */ + if (dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask)) + atdma->dma_common.device_prep_dma_memcpy = atc_prep_dma_memcpy; + + if (dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask)) { + atdma->dma_common.device_prep_slave_sg = atc_prep_slave_sg; + atdma->dma_common.device_terminate_all = atc_terminate_all; + } + + dma_writel(atdma, EN, AT_DMA_ENABLE); + + dev_info(&pdev->dev, "Atmel AHB DMA Controller ( %s%s), %d channels\n", + dma_has_cap(DMA_MEMCPY, atdma->dma_common.cap_mask) ? "cpy " : "", + dma_has_cap(DMA_SLAVE, atdma->dma_common.cap_mask) ? "slave " : "", + atdma->dma_common.chancnt); + + dma_async_device_register(&atdma->dma_common); + + return 0; + +err_pool_create: + platform_set_drvdata(pdev, NULL); + free_irq(platform_get_irq(pdev, 0), atdma); +err_irq: + clk_disable(atdma->clk); + clk_put(atdma->clk); +err_clk: + iounmap(atdma->regs); + atdma->regs = NULL; +err_release_r: + release_mem_region(io->start, size); +err_kfree: + kfree(atdma); + return err; +} + +static int __exit at_dma_remove(struct platform_device *pdev) +{ + struct at_dma *atdma = platform_get_drvdata(pdev); + struct dma_chan *chan, *_chan; + struct resource *io; + + at_dma_off(atdma); + dma_async_device_unregister(&atdma->dma_common); + + dma_pool_destroy(atdma->dma_desc_pool); + platform_set_drvdata(pdev, NULL); + free_irq(platform_get_irq(pdev, 0), atdma); + + list_for_each_entry_safe(chan, _chan, &atdma->dma_common.channels, + device_node) { + struct at_dma_chan *atchan = to_at_dma_chan(chan); + + /* Disable interrupts */ + atc_disable_irq(atchan); + tasklet_disable(&atchan->tasklet); + + tasklet_kill(&atchan->tasklet); + list_del(&chan->device_node); + } + + clk_disable(atdma->clk); + clk_put(atdma->clk); + + iounmap(atdma->regs); + atdma->regs = NULL; + + io = platform_get_resource(pdev, IORESOURCE_MEM, 0); + release_mem_region(io->start, io->end - io->start + 1); + + kfree(atdma); + + return 0; +} + +static void at_dma_shutdown(struct platform_device *pdev) +{ + struct at_dma *atdma = platform_get_drvdata(pdev); + + at_dma_off(platform_get_drvdata(pdev)); + clk_disable(atdma->clk); +} + +static int at_dma_suspend_late(struct platform_device *pdev, pm_message_t mesg) +{ + struct at_dma *atdma = platform_get_drvdata(pdev); + + at_dma_off(platform_get_drvdata(pdev)); + clk_disable(atdma->clk); + return 0; +} + +static int at_dma_resume_early(struct platform_device *pdev) +{ + struct at_dma *atdma = platform_get_drvdata(pdev); + + clk_enable(atdma->clk); + dma_writel(atdma, EN, AT_DMA_ENABLE); + return 0; + +} + +static struct platform_driver at_dma_driver = { + .remove = __exit_p(at_dma_remove), + .shutdown = at_dma_shutdown, + .suspend_late = at_dma_suspend_late, + .resume_early = at_dma_resume_early, + .driver = { + .name = "at_hdmac", + }, +}; + +static int __init at_dma_init(void) +{ + return platform_driver_probe(&at_dma_driver, at_dma_probe); +} +module_init(at_dma_init); + +static void __exit at_dma_exit(void) +{ + platform_driver_unregister(&at_dma_driver); +} +module_exit(at_dma_exit); + +MODULE_DESCRIPTION("Atmel AHB DMA Controller driver"); +MODULE_AUTHOR("Nicolas Ferre <nicolas.ferre@atmel.com>"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:at_hdmac"); diff --git a/drivers/dma/at_hdmac_regs.h b/drivers/dma/at_hdmac_regs.h new file mode 100644 index 00000000000..4c972afc49e --- /dev/null +++ b/drivers/dma/at_hdmac_regs.h @@ -0,0 +1,353 @@ +/* + * Header file for the Atmel AHB DMA Controller driver + * + * Copyright (C) 2008 Atmel Corporation + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ +#ifndef AT_HDMAC_REGS_H +#define AT_HDMAC_REGS_H + +#include <mach/at_hdmac.h> + +#define AT_DMA_MAX_NR_CHANNELS 8 + + +#define AT_DMA_GCFG 0x00 /* Global Configuration Register */ +#define AT_DMA_IF_BIGEND(i) (0x1 << (i)) /* AHB-Lite Interface i in Big-endian mode */ +#define AT_DMA_ARB_CFG (0x1 << 4) /* Arbiter mode. */ +#define AT_DMA_ARB_CFG_FIXED (0x0 << 4) +#define AT_DMA_ARB_CFG_ROUND_ROBIN (0x1 << 4) + +#define AT_DMA_EN 0x04 /* Controller Enable Register */ +#define AT_DMA_ENABLE (0x1 << 0) + +#define AT_DMA_SREQ 0x08 /* Software Single Request Register */ +#define AT_DMA_SSREQ(x) (0x1 << ((x) << 1)) /* Request a source single transfer on channel x */ +#define AT_DMA_DSREQ(x) (0x1 << (1 + ((x) << 1))) /* Request a destination single transfer on channel x */ + +#define AT_DMA_CREQ 0x0C /* Software Chunk Transfer Request Register */ +#define AT_DMA_SCREQ(x) (0x1 << ((x) << 1)) /* Request a source chunk transfer on channel x */ +#define AT_DMA_DCREQ(x) (0x1 << (1 + ((x) << 1))) /* Request a destination chunk transfer on channel x */ + +#define AT_DMA_LAST 0x10 /* Software Last Transfer Flag Register */ +#define AT_DMA_SLAST(x) (0x1 << ((x) << 1)) /* This src rq is last tx of buffer on channel x */ +#define AT_DMA_DLAST(x) (0x1 << (1 + ((x) << 1))) /* This dst rq is last tx of buffer on channel x */ + +#define AT_DMA_SYNC 0x14 /* Request Synchronization Register */ +#define AT_DMA_SYR(h) (0x1 << (h)) /* Synchronize handshake line h */ + +/* Error, Chained Buffer transfer completed and Buffer transfer completed Interrupt registers */ +#define AT_DMA_EBCIER 0x18 /* Enable register */ +#define AT_DMA_EBCIDR 0x1C /* Disable register */ +#define AT_DMA_EBCIMR 0x20 /* Mask Register */ +#define AT_DMA_EBCISR 0x24 /* Status Register */ +#define AT_DMA_CBTC_OFFSET 8 +#define AT_DMA_ERR_OFFSET 16 +#define AT_DMA_BTC(x) (0x1 << (x)) +#define AT_DMA_CBTC(x) (0x1 << (AT_DMA_CBTC_OFFSET + (x))) +#define AT_DMA_ERR(x) (0x1 << (AT_DMA_ERR_OFFSET + (x))) + +#define AT_DMA_CHER 0x28 /* Channel Handler Enable Register */ +#define AT_DMA_ENA(x) (0x1 << (x)) +#define AT_DMA_SUSP(x) (0x1 << ( 8 + (x))) +#define AT_DMA_KEEP(x) (0x1 << (24 + (x))) + +#define AT_DMA_CHDR 0x2C /* Channel Handler Disable Register */ +#define AT_DMA_DIS(x) (0x1 << (x)) +#define AT_DMA_RES(x) (0x1 << ( 8 + (x))) + +#define AT_DMA_CHSR 0x30 /* Channel Handler Status Register */ +#define AT_DMA_EMPT(x) (0x1 << (16 + (x))) +#define AT_DMA_STAL(x) (0x1 << (24 + (x))) + + +#define AT_DMA_CH_REGS_BASE 0x3C /* Channel registers base address */ +#define ch_regs(x) (AT_DMA_CH_REGS_BASE + (x) * 0x28) /* Channel x base addr */ + +/* Hardware register offset for each channel */ +#define ATC_SADDR_OFFSET 0x00 /* Source Address Register */ +#define ATC_DADDR_OFFSET 0x04 /* Destination Address Register */ +#define ATC_DSCR_OFFSET 0x08 /* Descriptor Address Register */ +#define ATC_CTRLA_OFFSET 0x0C /* Control A Register */ +#define ATC_CTRLB_OFFSET 0x10 /* Control B Register */ +#define ATC_CFG_OFFSET 0x14 /* Configuration Register */ +#define ATC_SPIP_OFFSET 0x18 /* Src PIP Configuration Register */ +#define ATC_DPIP_OFFSET 0x1C /* Dst PIP Configuration Register */ + + +/* Bitfield definitions */ + +/* Bitfields in DSCR */ +#define ATC_DSCR_IF(i) (0x3 & (i)) /* Dsc feched via AHB-Lite Interface i */ + +/* Bitfields in CTRLA */ +#define ATC_BTSIZE_MAX 0xFFFFUL /* Maximum Buffer Transfer Size */ +#define ATC_BTSIZE(x) (ATC_BTSIZE_MAX & (x)) /* Buffer Transfer Size */ +/* Chunck Tranfer size definitions are in at_hdmac.h */ +#define ATC_SRC_WIDTH_MASK (0x3 << 24) /* Source Single Transfer Size */ +#define ATC_SRC_WIDTH(x) ((x) << 24) +#define ATC_SRC_WIDTH_BYTE (0x0 << 24) +#define ATC_SRC_WIDTH_HALFWORD (0x1 << 24) +#define ATC_SRC_WIDTH_WORD (0x2 << 24) +#define ATC_DST_WIDTH_MASK (0x3 << 28) /* Destination Single Transfer Size */ +#define ATC_DST_WIDTH(x) ((x) << 28) +#define ATC_DST_WIDTH_BYTE (0x0 << 28) +#define ATC_DST_WIDTH_HALFWORD (0x1 << 28) +#define ATC_DST_WIDTH_WORD (0x2 << 28) +#define ATC_DONE (0x1 << 31) /* Tx Done (only written back in descriptor) */ + +/* Bitfields in CTRLB */ +#define ATC_SIF(i) (0x3 & (i)) /* Src tx done via AHB-Lite Interface i */ +#define ATC_DIF(i) ((0x3 & (i)) << 4) /* Dst tx done via AHB-Lite Interface i */ +#define ATC_SRC_PIP (0x1 << 8) /* Source Picture-in-Picture enabled */ +#define ATC_DST_PIP (0x1 << 12) /* Destination Picture-in-Picture enabled */ +#define ATC_SRC_DSCR_DIS (0x1 << 16) /* Src Descriptor fetch disable */ +#define ATC_DST_DSCR_DIS (0x1 << 20) /* Dst Descriptor fetch disable */ +#define ATC_FC_MASK (0x7 << 21) /* Choose Flow Controller */ +#define ATC_FC_MEM2MEM (0x0 << 21) /* Mem-to-Mem (DMA) */ +#define ATC_FC_MEM2PER (0x1 << 21) /* Mem-to-Periph (DMA) */ +#define ATC_FC_PER2MEM (0x2 << 21) /* Periph-to-Mem (DMA) */ +#define ATC_FC_PER2PER (0x3 << 21) /* Periph-to-Periph (DMA) */ +#define ATC_FC_PER2MEM_PER (0x4 << 21) /* Periph-to-Mem (Peripheral) */ +#define ATC_FC_MEM2PER_PER (0x5 << 21) /* Mem-to-Periph (Peripheral) */ +#define ATC_FC_PER2PER_SRCPER (0x6 << 21) /* Periph-to-Periph (Src Peripheral) */ +#define ATC_FC_PER2PER_DSTPER (0x7 << 21) /* Periph-to-Periph (Dst Peripheral) */ +#define ATC_SRC_ADDR_MODE_MASK (0x3 << 24) +#define ATC_SRC_ADDR_MODE_INCR (0x0 << 24) /* Incrementing Mode */ +#define ATC_SRC_ADDR_MODE_DECR (0x1 << 24) /* Decrementing Mode */ +#define ATC_SRC_ADDR_MODE_FIXED (0x2 << 24) /* Fixed Mode */ +#define ATC_DST_ADDR_MODE_MASK (0x3 << 28) +#define ATC_DST_ADDR_MODE_INCR (0x0 << 28) /* Incrementing Mode */ +#define ATC_DST_ADDR_MODE_DECR (0x1 << 28) /* Decrementing Mode */ +#define ATC_DST_ADDR_MODE_FIXED (0x2 << 28) /* Fixed Mode */ +#define ATC_IEN (0x1 << 30) /* BTC interrupt enable (active low) */ +#define ATC_AUTO (0x1 << 31) /* Auto multiple buffer tx enable */ + +/* Bitfields in CFG */ +/* are in at_hdmac.h */ + +/* Bitfields in SPIP */ +#define ATC_SPIP_HOLE(x) (0xFFFFU & (x)) +#define ATC_SPIP_BOUNDARY(x) ((0x3FF & (x)) << 16) + +/* Bitfields in DPIP */ +#define ATC_DPIP_HOLE(x) (0xFFFFU & (x)) +#define ATC_DPIP_BOUNDARY(x) ((0x3FF & (x)) << 16) + + +/*-- descriptors -----------------------------------------------------*/ + +/* LLI == Linked List Item; aka DMA buffer descriptor */ +struct at_lli { + /* values that are not changed by hardware */ + dma_addr_t saddr; + dma_addr_t daddr; + /* value that may get written back: */ + u32 ctrla; + /* more values that are not changed by hardware */ + u32 ctrlb; + dma_addr_t dscr; /* chain to next lli */ +}; + +/** + * struct at_desc - software descriptor + * @at_lli: hardware lli structure + * @txd: support for the async_tx api + * @desc_node: node on the channed descriptors list + * @len: total transaction bytecount + */ +struct at_desc { + /* FIRST values the hardware uses */ + struct at_lli lli; + + /* THEN values for driver housekeeping */ + struct dma_async_tx_descriptor txd; + struct list_head desc_node; + size_t len; +}; + +static inline struct at_desc * +txd_to_at_desc(struct dma_async_tx_descriptor *txd) +{ + return container_of(txd, struct at_desc, txd); +} + + +/*-- Channels --------------------------------------------------------*/ + +/** + * struct at_dma_chan - internal representation of an Atmel HDMAC channel + * @chan_common: common dmaengine channel object members + * @device: parent device + * @ch_regs: memory mapped register base + * @mask: channel index in a mask + * @error_status: transmit error status information from irq handler + * to tasklet (use atomic operations) + * @tasklet: bottom half to finish transaction work + * @lock: serializes enqueue/dequeue operations to descriptors lists + * @completed_cookie: identifier for the most recently completed operation + * @active_list: list of descriptors dmaengine is being running on + * @queue: list of descriptors ready to be submitted to engine + * @free_list: list of descriptors usable by the channel + * @descs_allocated: records the actual size of the descriptor pool + */ +struct at_dma_chan { + struct dma_chan chan_common; + struct at_dma *device; + void __iomem *ch_regs; + u8 mask; + unsigned long error_status; + struct tasklet_struct tasklet; + + spinlock_t lock; + + /* these other elements are all protected by lock */ + dma_cookie_t completed_cookie; + struct list_head active_list; + struct list_head queue; + struct list_head free_list; + unsigned int descs_allocated; +}; + +#define channel_readl(atchan, name) \ + __raw_readl((atchan)->ch_regs + ATC_##name##_OFFSET) + +#define channel_writel(atchan, name, val) \ + __raw_writel((val), (atchan)->ch_regs + ATC_##name##_OFFSET) + +static inline struct at_dma_chan *to_at_dma_chan(struct dma_chan *dchan) +{ + return container_of(dchan, struct at_dma_chan, chan_common); +} + + +/*-- Controller ------------------------------------------------------*/ + +/** + * struct at_dma - internal representation of an Atmel HDMA Controller + * @chan_common: common dmaengine dma_device object members + * @ch_regs: memory mapped register base + * @clk: dma controller clock + * @all_chan_mask: all channels availlable in a mask + * @dma_desc_pool: base of DMA descriptor region (DMA address) + * @chan: channels table to store at_dma_chan structures + */ +struct at_dma { + struct dma_device dma_common; + void __iomem *regs; + struct clk *clk; + + u8 all_chan_mask; + + struct dma_pool *dma_desc_pool; + /* AT THE END channels table */ + struct at_dma_chan chan[0]; +}; + +#define dma_readl(atdma, name) \ + __raw_readl((atdma)->regs + AT_DMA_##name) +#define dma_writel(atdma, name, val) \ + __raw_writel((val), (atdma)->regs + AT_DMA_##name) + +static inline struct at_dma *to_at_dma(struct dma_device *ddev) +{ + return container_of(ddev, struct at_dma, dma_common); +} + + +/*-- Helper functions ------------------------------------------------*/ + +static struct device *chan2dev(struct dma_chan *chan) +{ + return &chan->dev->device; +} +static struct device *chan2parent(struct dma_chan *chan) +{ + return chan->dev->device.parent; +} + +#if defined(VERBOSE_DEBUG) +static void vdbg_dump_regs(struct at_dma_chan *atchan) +{ + struct at_dma *atdma = to_at_dma(atchan->chan_common.device); + + dev_err(chan2dev(&atchan->chan_common), + " channel %d : imr = 0x%x, chsr = 0x%x\n", + atchan->chan_common.chan_id, + dma_readl(atdma, EBCIMR), + dma_readl(atdma, CHSR)); + + dev_err(chan2dev(&atchan->chan_common), + " channel: s0x%x d0x%x ctrl0x%x:0x%x cfg0x%x l0x%x\n", + channel_readl(atchan, SADDR), + channel_readl(atchan, DADDR), + channel_readl(atchan, CTRLA), + channel_readl(atchan, CTRLB), + channel_readl(atchan, CFG), + channel_readl(atchan, DSCR)); +} +#else +static void vdbg_dump_regs(struct at_dma_chan *atchan) {} +#endif + +static void atc_dump_lli(struct at_dma_chan *atchan, struct at_lli *lli) +{ + dev_printk(KERN_CRIT, chan2dev(&atchan->chan_common), + " desc: s0x%x d0x%x ctrl0x%x:0x%x l0x%x\n", + lli->saddr, lli->daddr, + lli->ctrla, lli->ctrlb, lli->dscr); +} + + +static void atc_setup_irq(struct at_dma_chan *atchan, int on) +{ + struct at_dma *atdma = to_at_dma(atchan->chan_common.device); + u32 ebci; + + /* enable interrupts on buffer chain completion & error */ + ebci = AT_DMA_CBTC(atchan->chan_common.chan_id) + | AT_DMA_ERR(atchan->chan_common.chan_id); + if (on) + dma_writel(atdma, EBCIER, ebci); + else + dma_writel(atdma, EBCIDR, ebci); +} + +static inline void atc_enable_irq(struct at_dma_chan *atchan) +{ + atc_setup_irq(atchan, 1); +} + +static inline void atc_disable_irq(struct at_dma_chan *atchan) +{ + atc_setup_irq(atchan, 0); +} + + +/** + * atc_chan_is_enabled - test if given channel is enabled + * @atchan: channel we want to test status + */ +static inline int atc_chan_is_enabled(struct at_dma_chan *atchan) +{ + struct at_dma *atdma = to_at_dma(atchan->chan_common.device); + + return !!(dma_readl(atdma, CHSR) & atchan->mask); +} + + +/** + * set_desc_eol - set end-of-link to descriptor so it will end transfer + * @desc: descriptor, signle or at the end of a chain, to end chain on + */ +static void set_desc_eol(struct at_desc *desc) +{ + desc->lli.ctrlb |= ATC_SRC_DSCR_DIS | ATC_DST_DSCR_DIS; + desc->lli.dscr = 0; +} + +#endif /* AT_HDMAC_REGS_H */ diff --git a/drivers/dma/dmatest.c b/drivers/dma/dmatest.c index fb7da5141e9..d93017fc787 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -38,6 +38,11 @@ module_param(max_channels, uint, S_IRUGO); MODULE_PARM_DESC(max_channels, "Maximum number of channels to use (default: all)"); +static unsigned int iterations; +module_param(iterations, uint, S_IRUGO); +MODULE_PARM_DESC(iterations, + "Iterations before stopping test (default: infinite)"); + static unsigned int xor_sources = 3; module_param(xor_sources, uint, S_IRUGO); MODULE_PARM_DESC(xor_sources, @@ -114,7 +119,7 @@ static void dmatest_init_srcs(u8 **bufs, unsigned int start, unsigned int len) buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); for ( ; i < start + len; i++) buf[i] = PATTERN_SRC | PATTERN_COPY - | (~i & PATTERN_COUNT_MASK);; + | (~i & PATTERN_COUNT_MASK); for ( ; i < test_buf_size; i++) buf[i] = PATTERN_SRC | (~i & PATTERN_COUNT_MASK); buf++; @@ -270,7 +275,8 @@ static int dmatest_func(void *data) flags = DMA_CTRL_ACK | DMA_COMPL_SKIP_DEST_UNMAP | DMA_PREP_INTERRUPT; - while (!kthread_should_stop()) { + while (!kthread_should_stop() + && !(iterations && total_tests >= iterations)) { struct dma_device *dev = chan->device; struct dma_async_tx_descriptor *tx = NULL; dma_addr_t dma_srcs[src_cnt]; @@ -416,6 +422,13 @@ err_srcbuf: err_srcs: pr_notice("%s: terminating after %u tests, %u failures (status %d)\n", thread_name, total_tests, failed_tests, ret); + + if (iterations > 0) + while (!kthread_should_stop()) { + DECLARE_WAIT_QUEUE_HEAD(wait_dmatest_exit); + interruptible_sleep_on(&wait_dmatest_exit); + } + return ret; } @@ -495,11 +508,11 @@ static int dmatest_add_channel(struct dma_chan *chan) if (dma_has_cap(DMA_MEMCPY, dma_dev->cap_mask)) { cnt = dmatest_add_threads(dtc, DMA_MEMCPY); - thread_count += cnt > 0 ?: 0; + thread_count += cnt > 0 ? cnt : 0; } if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { cnt = dmatest_add_threads(dtc, DMA_XOR); - thread_count += cnt > 0 ?: 0; + thread_count += cnt > 0 ? cnt : 0; } pr_info("dmatest: Started %u threads using %s\n", diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index f18d1bde043..ef87a898414 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -12,6 +12,11 @@ * also fit for MPC8560, MPC8555, MPC8548, MPC8641, and etc. * The support for MPC8349 DMA contorller is also added. * + * This driver instructs the DMA controller to issue the PCI Read Multiple + * command for PCI read operations, instead of using the default PCI Read Line + * command. Please be aware that this setting may result in read pre-fetching + * on some platforms. + * * This is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License as published by * the Free Software Foundation; either version 2 of the License, or @@ -49,9 +54,10 @@ static void dma_init(struct fsl_dma_chan *fsl_chan) case FSL_DMA_IP_83XX: /* Set the channel to below modes: * EOTIE - End-of-transfer interrupt enable + * PRC_RM - PCI read multiple */ - DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, FSL_DMA_MR_EOTIE, - 32); + DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, FSL_DMA_MR_EOTIE + | FSL_DMA_MR_PRC_RM, 32); break; } @@ -136,15 +142,16 @@ static int dma_is_idle(struct fsl_dma_chan *fsl_chan) static void dma_start(struct fsl_dma_chan *fsl_chan) { - u32 mr_set = 0;; + u32 mr_set = 0; if (fsl_chan->feature & FSL_DMA_CHAN_PAUSE_EXT) { DMA_OUT(fsl_chan, &fsl_chan->reg_base->bcr, 0, 32); mr_set |= FSL_DMA_MR_EMP_EN; - } else + } else if ((fsl_chan->feature & FSL_DMA_IP_MASK) == FSL_DMA_IP_85XX) { DMA_OUT(fsl_chan, &fsl_chan->reg_base->mr, DMA_IN(fsl_chan, &fsl_chan->reg_base->mr, 32) & ~FSL_DMA_MR_EMP_EN, 32); + } if (fsl_chan->feature & FSL_DMA_CHAN_START_EXT) mr_set |= FSL_DMA_MR_EMS_EN; @@ -871,9 +878,9 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev, switch (new_fsl_chan->feature & FSL_DMA_IP_MASK) { case FSL_DMA_IP_85XX: - new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start; new_fsl_chan->toggle_ext_pause = fsl_chan_toggle_ext_pause; case FSL_DMA_IP_83XX: + new_fsl_chan->toggle_ext_start = fsl_chan_toggle_ext_start; new_fsl_chan->set_src_loop_size = fsl_chan_set_src_loop_size; new_fsl_chan->set_dest_loop_size = fsl_chan_set_dest_loop_size; } diff --git a/drivers/dma/fsldma.h b/drivers/dma/fsldma.h index 4f21a512d84..dc7f2686579 100644 --- a/drivers/dma/fsldma.h +++ b/drivers/dma/fsldma.h @@ -38,6 +38,7 @@ /* Special MR definition for MPC8349 */ #define FSL_DMA_MR_EOTIE 0x00000080 +#define FSL_DMA_MR_PRC_RM 0x00000800 #define FSL_DMA_SR_CH 0x00000020 #define FSL_DMA_SR_PE 0x00000010 diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index ddab94f5122..3f23eabe09f 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -1176,7 +1176,7 @@ static int __devinit mv_xor_probe(struct platform_device *pdev) if (dma_has_cap(DMA_MEMSET, dma_dev->cap_mask)) dma_dev->device_prep_dma_memset = mv_xor_prep_dma_memset; if (dma_has_cap(DMA_XOR, dma_dev->cap_mask)) { - dma_dev->max_xor = 8; ; + dma_dev->max_xor = 8; dma_dev->device_prep_dma_xor = mv_xor_prep_dma_xor; } diff --git a/drivers/edac/amd64_edac.c b/drivers/edac/amd64_edac.c index 24964c1d0af..e2a10bcba7a 100644 --- a/drivers/edac/amd64_edac.c +++ b/drivers/edac/amd64_edac.c @@ -868,6 +868,8 @@ static void amd64_read_dbam_reg(struct amd64_pvt *pvt) goto err_reg; } + return; + err_reg: debugf0("Error reading F2x%03x.\n", reg); } @@ -2634,6 +2636,8 @@ static void amd64_read_mc_registers(struct amd64_pvt *pvt) amd64_dump_misc_regs(pvt); + return; + err_reg: debugf0("Reading an MC register failed\n"); @@ -2977,6 +2981,9 @@ static int amd64_check_ecc_enabled(struct amd64_pvt *pvt) "ECC is enabled by BIOS, Proceeding " "with EDAC module initialization\n"); + /* Signal good ECC status */ + ret = 0; + /* CLEAR the override, since BIOS controlled it */ ecc_enable_override = 0; } diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c index 8fab7890a36..33be210d672 100644 --- a/drivers/gpu/drm/drm_crtc.c +++ b/drivers/gpu/drm/drm_crtc.c @@ -1461,7 +1461,7 @@ int drm_mode_setcrtc(struct drm_device *dev, void *data, goto out; } - if (crtc_req->count_connectors > 0 && !mode && !fb) { + if (crtc_req->count_connectors > 0 && (!mode || !fb)) { DRM_DEBUG("Count connectors is %d but no mode or fb set\n", crtc_req->count_connectors); ret = -EINVAL; diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c index 3da9cfa3184..6aaa2cb2336 100644 --- a/drivers/gpu/drm/drm_crtc_helper.c +++ b/drivers/gpu/drm/drm_crtc_helper.c @@ -706,8 +706,8 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set) struct drm_encoder **save_encoders, *new_encoder; struct drm_framebuffer *old_fb = NULL; bool save_enabled; - bool mode_changed = false; - bool fb_changed = false; + bool mode_changed = false; /* if true do a full mode set */ + bool fb_changed = false; /* if true and !mode_changed just do a flip */ struct drm_connector *connector; int count = 0, ro, fail = 0; struct drm_crtc_helper_funcs *crtc_funcs; @@ -758,6 +758,8 @@ int drm_crtc_helper_set_config(struct drm_mode_set *set) if (set->crtc->fb == NULL) { DRM_DEBUG("crtc has no fb, full mode set\n"); mode_changed = true; + } else if (set->fb == NULL) { + mode_changed = true; } else if ((set->fb->bits_per_pixel != set->crtc->fb->bits_per_pixel) || set->fb->depth != set->crtc->fb->depth) diff --git a/drivers/gpu/drm/radeon/r100.c b/drivers/gpu/drm/radeon/r100.c index 05a44896dff..f1ba8ff4113 100644 --- a/drivers/gpu/drm/radeon/r100.c +++ b/drivers/gpu/drm/radeon/r100.c @@ -722,13 +722,14 @@ int r100_cs_packet_parse(struct radeon_cs_parser *p, unsigned idx) { struct radeon_cs_chunk *ib_chunk = &p->chunks[p->chunk_ib_idx]; - uint32_t header = ib_chunk->kdata[idx]; + uint32_t header; if (idx >= ib_chunk->length_dw) { DRM_ERROR("Can not parse packet at %d after CS end %d !\n", idx, ib_chunk->length_dw); return -EINVAL; } + header = ib_chunk->kdata[idx]; pkt->idx = idx; pkt->type = CP_PACKET_GET_TYPE(header); pkt->count = CP_PACKET_GET_COUNT(header); diff --git a/drivers/gpu/drm/radeon/radeon_drv.c b/drivers/gpu/drm/radeon/radeon_drv.c index 3cfcee17dc5..0bd5879a495 100644 --- a/drivers/gpu/drm/radeon/radeon_drv.c +++ b/drivers/gpu/drm/radeon/radeon_drv.c @@ -318,6 +318,14 @@ static int __init radeon_init(void) driver = &driver_old; driver->num_ioctls = radeon_max_ioctl; #if defined(CONFIG_DRM_RADEON_KMS) +#ifdef CONFIG_VGA_CONSOLE + if (vgacon_text_force() && radeon_modeset == -1) { + DRM_INFO("VGACON disable radeon kernel modesetting.\n"); + driver = &driver_old; + driver->driver_features &= ~DRIVER_MODESET; + radeon_modeset = 0; + } +#endif /* if enabled by default */ if (radeon_modeset == -1) { DRM_INFO("radeon default to kernel modesetting.\n"); @@ -329,17 +337,8 @@ static int __init radeon_init(void) driver->driver_features |= DRIVER_MODESET; driver->num_ioctls = radeon_max_kms_ioctl; } - /* if the vga console setting is enabled still * let modprobe override it */ -#ifdef CONFIG_VGA_CONSOLE - if (vgacon_text_force() && radeon_modeset == -1) { - DRM_INFO("VGACON disable radeon kernel modesetting.\n"); - driver = &driver_old; - driver->driver_features &= ~DRIVER_MODESET; - radeon_modeset = 0; - } -#endif #endif return drm_init(driver); } diff --git a/drivers/gpu/drm/radeon/radeon_kms.c b/drivers/gpu/drm/radeon/radeon_kms.c index 937a2f1cdb4..3357110e30c 100644 --- a/drivers/gpu/drm/radeon/radeon_kms.c +++ b/drivers/gpu/drm/radeon/radeon_kms.c @@ -58,6 +58,8 @@ int radeon_driver_load_kms(struct drm_device *dev, unsigned long flags) if (r) { DRM_ERROR("Failed to initialize radeon, disabling IOCTL\n"); radeon_device_fini(rdev); + kfree(rdev); + dev->dev_private = NULL; return r; } return 0; diff --git a/drivers/gpu/drm/radeon/rv515.c b/drivers/gpu/drm/radeon/rv515.c index 551e608702e..fd8f3ca716e 100644 --- a/drivers/gpu/drm/radeon/rv515.c +++ b/drivers/gpu/drm/radeon/rv515.c @@ -370,6 +370,7 @@ void rv515_vram_info(struct radeon_device *rdev) rv515_vram_get_type(rdev); + r100_vram_init_sizes(rdev); /* FIXME: we should enforce default clock in case GPU is not in * default setup */ diff --git a/drivers/gpu/drm/ttm/ttm_bo.c b/drivers/gpu/drm/ttm/ttm_bo.c index 6538d423698..c2b0d710d10 100644 --- a/drivers/gpu/drm/ttm/ttm_bo.c +++ b/drivers/gpu/drm/ttm/ttm_bo.c @@ -1182,13 +1182,14 @@ static int ttm_bo_force_list_clean(struct ttm_bo_device *bdev, int ttm_bo_clean_mm(struct ttm_bo_device *bdev, unsigned mem_type) { - struct ttm_mem_type_manager *man = &bdev->man[mem_type]; + struct ttm_mem_type_manager *man; int ret = -EINVAL; if (mem_type >= TTM_NUM_MEM_TYPES) { printk(KERN_ERR TTM_PFX "Illegal memory type %d\n", mem_type); return ret; } + man = &bdev->man[mem_type]; if (!man->has_type) { printk(KERN_ERR TTM_PFX "Trying to take down uninitialized " @@ -1575,6 +1576,10 @@ int ttm_bo_wait(struct ttm_buffer_object *bo, driver->sync_obj_unref(&sync_obj); driver->sync_obj_unref(&tmp_obj); spin_lock(&bo->lock); + } else { + spin_unlock(&bo->lock); + driver->sync_obj_unref(&sync_obj); + spin_lock(&bo->lock); } } return 0; diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index ce2e6f38ea0..ad4ada07c6c 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -150,7 +150,7 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, #ifdef CONFIG_X86 dst = kmap_atomic_prot(d, KM_USER0, prot); #else - if (prot != PAGE_KERNEL) + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) dst = vmap(&d, 1, 0, prot); else dst = kmap(d); @@ -163,7 +163,7 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, #ifdef CONFIG_X86 kunmap_atomic(dst, KM_USER0); #else - if (prot != PAGE_KERNEL) + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) vunmap(dst); else kunmap(d); @@ -186,7 +186,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, #ifdef CONFIG_X86 src = kmap_atomic_prot(s, KM_USER0, prot); #else - if (prot != PAGE_KERNEL) + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) src = vmap(&s, 1, 0, prot); else src = kmap(s); @@ -199,7 +199,7 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, #ifdef CONFIG_X86 kunmap_atomic(src, KM_USER0); #else - if (prot != PAGE_KERNEL) + if (pgprot_val(prot) != pgprot_val(PAGE_KERNEL)) vunmap(src); else kunmap(s); diff --git a/drivers/input/serio/hp_sdc_mlc.c b/drivers/input/serio/hp_sdc_mlc.c index b587e2d576a..820e51673b2 100644 --- a/drivers/input/serio/hp_sdc_mlc.c +++ b/drivers/input/serio/hp_sdc_mlc.c @@ -296,7 +296,7 @@ static void hp_sdc_mlc_out(hil_mlc *mlc) priv->tseq[3] = 0; if (mlc->opacket & HIL_CTRL_APE) { priv->tseq[3] |= HP_SDC_LPC_APE_IPF; - down_trylock(&mlc->csem); + BUG_ON(down_trylock(&mlc->csem)); } enqueue: hp_sdc_enqueue_transaction(&priv->trans); diff --git a/drivers/isdn/mISDN/l1oip_core.c b/drivers/isdn/mISDN/l1oip_core.c index c3b661a666c..7e5f30dbc0a 100644 --- a/drivers/isdn/mISDN/l1oip_core.c +++ b/drivers/isdn/mISDN/l1oip_core.c @@ -1480,7 +1480,7 @@ l1oip_init(void) return -ENOMEM; l1oip_cnt = 0; - while (type[l1oip_cnt] && l1oip_cnt < MAX_CARDS) { + while (l1oip_cnt < MAX_CARDS && type[l1oip_cnt]) { switch (type[l1oip_cnt] & 0xff) { case 1: pri = 0; diff --git a/drivers/lguest/core.c b/drivers/lguest/core.c index a6974e9b8eb..1e2cb846b3c 100644 --- a/drivers/lguest/core.c +++ b/drivers/lguest/core.c @@ -1,6 +1,8 @@ -/*P:400 This contains run_guest() which actually calls into the Host<->Guest +/*P:400 + * This contains run_guest() which actually calls into the Host<->Guest * Switcher and analyzes the return, such as determining if the Guest wants the - * Host to do something. This file also contains useful helper routines. :*/ + * Host to do something. This file also contains useful helper routines. +:*/ #include <linux/module.h> #include <linux/stringify.h> #include <linux/stddef.h> @@ -24,7 +26,8 @@ static struct page **switcher_page; /* This One Big lock protects all inter-guest data structures. */ DEFINE_MUTEX(lguest_lock); -/*H:010 We need to set up the Switcher at a high virtual address. Remember the +/*H:010 + * We need to set up the Switcher at a high virtual address. Remember the * Switcher is a few hundred bytes of assembler code which actually changes the * CPU to run the Guest, and then changes back to the Host when a trap or * interrupt happens. @@ -33,7 +36,8 @@ DEFINE_MUTEX(lguest_lock); * Host since it will be running as the switchover occurs. * * Trying to map memory at a particular address is an unusual thing to do, so - * it's not a simple one-liner. */ + * it's not a simple one-liner. + */ static __init int map_switcher(void) { int i, err; @@ -47,8 +51,10 @@ static __init int map_switcher(void) * easy. */ - /* We allocate an array of struct page pointers. map_vm_area() wants - * this, rather than just an array of pages. */ + /* + * We allocate an array of struct page pointers. map_vm_area() wants + * this, rather than just an array of pages. + */ switcher_page = kmalloc(sizeof(switcher_page[0])*TOTAL_SWITCHER_PAGES, GFP_KERNEL); if (!switcher_page) { @@ -56,8 +62,10 @@ static __init int map_switcher(void) goto out; } - /* Now we actually allocate the pages. The Guest will see these pages, - * so we make sure they're zeroed. */ + /* + * Now we actually allocate the pages. The Guest will see these pages, + * so we make sure they're zeroed. + */ for (i = 0; i < TOTAL_SWITCHER_PAGES; i++) { unsigned long addr = get_zeroed_page(GFP_KERNEL); if (!addr) { @@ -67,19 +75,23 @@ static __init int map_switcher(void) switcher_page[i] = virt_to_page(addr); } - /* First we check that the Switcher won't overlap the fixmap area at + /* + * First we check that the Switcher won't overlap the fixmap area at * the top of memory. It's currently nowhere near, but it could have - * very strange effects if it ever happened. */ + * very strange effects if it ever happened. + */ if (SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1)*PAGE_SIZE > FIXADDR_START){ err = -ENOMEM; printk("lguest: mapping switcher would thwack fixmap\n"); goto free_pages; } - /* Now we reserve the "virtual memory area" we want: 0xFFC00000 + /* + * Now we reserve the "virtual memory area" we want: 0xFFC00000 * (SWITCHER_ADDR). We might not get it in theory, but in practice * it's worked so far. The end address needs +1 because __get_vm_area - * allocates an extra guard page, so we need space for that. */ + * allocates an extra guard page, so we need space for that. + */ switcher_vma = __get_vm_area(TOTAL_SWITCHER_PAGES * PAGE_SIZE, VM_ALLOC, SWITCHER_ADDR, SWITCHER_ADDR + (TOTAL_SWITCHER_PAGES+1) * PAGE_SIZE); @@ -89,11 +101,13 @@ static __init int map_switcher(void) goto free_pages; } - /* This code actually sets up the pages we've allocated to appear at + /* + * This code actually sets up the pages we've allocated to appear at * SWITCHER_ADDR. map_vm_area() takes the vma we allocated above, the * kind of pages we're mapping (kernel pages), and a pointer to our * array of struct pages. It increments that pointer, but we don't - * care. */ + * care. + */ pagep = switcher_page; err = map_vm_area(switcher_vma, PAGE_KERNEL_EXEC, &pagep); if (err) { @@ -101,8 +115,10 @@ static __init int map_switcher(void) goto free_vma; } - /* Now the Switcher is mapped at the right address, we can't fail! - * Copy in the compiled-in Switcher code (from <arch>_switcher.S). */ + /* + * Now the Switcher is mapped at the right address, we can't fail! + * Copy in the compiled-in Switcher code (from <arch>_switcher.S). + */ memcpy(switcher_vma->addr, start_switcher_text, end_switcher_text - start_switcher_text); @@ -124,8 +140,7 @@ out: } /*:*/ -/* Cleaning up the mapping when the module is unloaded is almost... - * too easy. */ +/* Cleaning up the mapping when the module is unloaded is almost... too easy. */ static void unmap_switcher(void) { unsigned int i; @@ -151,16 +166,19 @@ static void unmap_switcher(void) * But we can't trust the Guest: it might be trying to access the Launcher * code. We have to check that the range is below the pfn_limit the Launcher * gave us. We have to make sure that addr + len doesn't give us a false - * positive by overflowing, too. */ + * positive by overflowing, too. + */ bool lguest_address_ok(const struct lguest *lg, unsigned long addr, unsigned long len) { return (addr+len) / PAGE_SIZE < lg->pfn_limit && (addr+len >= addr); } -/* This routine copies memory from the Guest. Here we can see how useful the +/* + * This routine copies memory from the Guest. Here we can see how useful the * kill_lguest() routine we met in the Launcher can be: we return a random - * value (all zeroes) instead of needing to return an error. */ + * value (all zeroes) instead of needing to return an error. + */ void __lgread(struct lg_cpu *cpu, void *b, unsigned long addr, unsigned bytes) { if (!lguest_address_ok(cpu->lg, addr, bytes) @@ -181,9 +199,11 @@ void __lgwrite(struct lg_cpu *cpu, unsigned long addr, const void *b, } /*:*/ -/*H:030 Let's jump straight to the the main loop which runs the Guest. +/*H:030 + * Let's jump straight to the the main loop which runs the Guest. * Remember, this is called by the Launcher reading /dev/lguest, and we keep - * going around and around until something interesting happens. */ + * going around and around until something interesting happens. + */ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) { /* We stop running once the Guest is dead. */ @@ -195,10 +215,17 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (cpu->hcall) do_hypercalls(cpu); - /* It's possible the Guest did a NOTIFY hypercall to the - * Launcher, in which case we return from the read() now. */ + /* + * It's possible the Guest did a NOTIFY hypercall to the + * Launcher. + */ if (cpu->pending_notify) { + /* + * Does it just needs to write to a registered + * eventfd (ie. the appropriate virtqueue thread)? + */ if (!send_notify_to_eventfd(cpu)) { + /* OK, we tell the main Laucher. */ if (put_user(cpu->pending_notify, user)) return -EFAULT; return sizeof(cpu->pending_notify); @@ -209,29 +236,39 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) if (signal_pending(current)) return -ERESTARTSYS; - /* Check if there are any interrupts which can be delivered now: + /* + * Check if there are any interrupts which can be delivered now: * if so, this sets up the hander to be executed when we next - * run the Guest. */ + * run the Guest. + */ irq = interrupt_pending(cpu, &more); if (irq < LGUEST_IRQS) try_deliver_interrupt(cpu, irq, more); - /* All long-lived kernel loops need to check with this horrible + /* + * All long-lived kernel loops need to check with this horrible * thing called the freezer. If the Host is trying to suspend, - * it stops us. */ + * it stops us. + */ try_to_freeze(); - /* Just make absolutely sure the Guest is still alive. One of - * those hypercalls could have been fatal, for example. */ + /* + * Just make absolutely sure the Guest is still alive. One of + * those hypercalls could have been fatal, for example. + */ if (cpu->lg->dead) break; - /* If the Guest asked to be stopped, we sleep. The Guest's - * clock timer will wake us. */ + /* + * If the Guest asked to be stopped, we sleep. The Guest's + * clock timer will wake us. + */ if (cpu->halted) { set_current_state(TASK_INTERRUPTIBLE); - /* Just before we sleep, make sure no interrupt snuck in - * which we should be doing. */ + /* + * Just before we sleep, make sure no interrupt snuck in + * which we should be doing. + */ if (interrupt_pending(cpu, &more) < LGUEST_IRQS) set_current_state(TASK_RUNNING); else @@ -239,8 +276,10 @@ int run_guest(struct lg_cpu *cpu, unsigned long __user *user) continue; } - /* OK, now we're ready to jump into the Guest. First we put up - * the "Do Not Disturb" sign: */ + /* + * OK, now we're ready to jump into the Guest. First we put up + * the "Do Not Disturb" sign: + */ local_irq_disable(); /* Actually run the Guest until something happens. */ @@ -327,8 +366,10 @@ static void __exit fini(void) } /*:*/ -/* The Host side of lguest can be a module. This is a nice way for people to - * play with it. */ +/* + * The Host side of lguest can be a module. This is a nice way for people to + * play with it. + */ module_init(init); module_exit(fini); MODULE_LICENSE("GPL"); diff --git a/drivers/lguest/hypercalls.c b/drivers/lguest/hypercalls.c index c29ffa19cb7..83511eb0923 100644 --- a/drivers/lguest/hypercalls.c +++ b/drivers/lguest/hypercalls.c @@ -1,8 +1,10 @@ -/*P:500 Just as userspace programs request kernel operations through a system +/*P:500 + * Just as userspace programs request kernel operations through a system * call, the Guest requests Host operations through a "hypercall". You might * notice this nomenclature doesn't really follow any logic, but the name has * been around for long enough that we're stuck with it. As you'd expect, this - * code is basically a one big switch statement. :*/ + * code is basically a one big switch statement. +:*/ /* Copyright (C) 2006 Rusty Russell IBM Corporation @@ -28,30 +30,41 @@ #include <asm/pgtable.h> #include "lg.h" -/*H:120 This is the core hypercall routine: where the Guest gets what it wants. - * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. */ +/*H:120 + * This is the core hypercall routine: where the Guest gets what it wants. + * Or gets killed. Or, in the case of LHCALL_SHUTDOWN, both. + */ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) { switch (args->arg0) { case LHCALL_FLUSH_ASYNC: - /* This call does nothing, except by breaking out of the Guest - * it makes us process all the asynchronous hypercalls. */ + /* + * This call does nothing, except by breaking out of the Guest + * it makes us process all the asynchronous hypercalls. + */ break; case LHCALL_SEND_INTERRUPTS: - /* This call does nothing too, but by breaking out of the Guest - * it makes us process any pending interrupts. */ + /* + * This call does nothing too, but by breaking out of the Guest + * it makes us process any pending interrupts. + */ break; case LHCALL_LGUEST_INIT: - /* You can't get here unless you're already initialized. Don't - * do that. */ + /* + * You can't get here unless you're already initialized. Don't + * do that. + */ kill_guest(cpu, "already have lguest_data"); break; case LHCALL_SHUTDOWN: { - /* Shutdown is such a trivial hypercall that we do it in four - * lines right here. */ char msg[128]; - /* If the lgread fails, it will call kill_guest() itself; the - * kill_guest() with the message will be ignored. */ + /* + * Shutdown is such a trivial hypercall that we do it in five + * lines right here. + * + * If the lgread fails, it will call kill_guest() itself; the + * kill_guest() with the message will be ignored. + */ __lgread(cpu, msg, args->arg1, sizeof(msg)); msg[sizeof(msg)-1] = '\0'; kill_guest(cpu, "CRASH: %s", msg); @@ -60,16 +73,17 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) break; } case LHCALL_FLUSH_TLB: - /* FLUSH_TLB comes in two flavors, depending on the - * argument: */ + /* FLUSH_TLB comes in two flavors, depending on the argument: */ if (args->arg1) guest_pagetable_clear_all(cpu); else guest_pagetable_flush_user(cpu); break; - /* All these calls simply pass the arguments through to the right - * routines. */ + /* + * All these calls simply pass the arguments through to the right + * routines. + */ case LHCALL_NEW_PGTABLE: guest_new_pagetable(cpu, args->arg1); break; @@ -112,15 +126,16 @@ static void do_hcall(struct lg_cpu *cpu, struct hcall_args *args) kill_guest(cpu, "Bad hypercall %li\n", args->arg0); } } -/*:*/ -/*H:124 Asynchronous hypercalls are easy: we just look in the array in the +/*H:124 + * Asynchronous hypercalls are easy: we just look in the array in the * Guest's "struct lguest_data" to see if any new ones are marked "ready". * * We are careful to do these in order: obviously we respect the order the * Guest put them in the ring, but we also promise the Guest that they will * happen before any normal hypercall (which is why we check this before - * checking for a normal hcall). */ + * checking for a normal hcall). + */ static void do_async_hcalls(struct lg_cpu *cpu) { unsigned int i; @@ -133,22 +148,28 @@ static void do_async_hcalls(struct lg_cpu *cpu) /* We process "struct lguest_data"s hcalls[] ring once. */ for (i = 0; i < ARRAY_SIZE(st); i++) { struct hcall_args args; - /* We remember where we were up to from last time. This makes + /* + * We remember where we were up to from last time. This makes * sure that the hypercalls are done in the order the Guest - * places them in the ring. */ + * places them in the ring. + */ unsigned int n = cpu->next_hcall; /* 0xFF means there's no call here (yet). */ if (st[n] == 0xFF) break; - /* OK, we have hypercall. Increment the "next_hcall" cursor, - * and wrap back to 0 if we reach the end. */ + /* + * OK, we have hypercall. Increment the "next_hcall" cursor, + * and wrap back to 0 if we reach the end. + */ if (++cpu->next_hcall == LHCALL_RING_SIZE) cpu->next_hcall = 0; - /* Copy the hypercall arguments into a local copy of - * the hcall_args struct. */ + /* + * Copy the hypercall arguments into a local copy of the + * hcall_args struct. + */ if (copy_from_user(&args, &cpu->lg->lguest_data->hcalls[n], sizeof(struct hcall_args))) { kill_guest(cpu, "Fetching async hypercalls"); @@ -164,19 +185,25 @@ static void do_async_hcalls(struct lg_cpu *cpu) break; } - /* Stop doing hypercalls if they want to notify the Launcher: - * it needs to service this first. */ + /* + * Stop doing hypercalls if they want to notify the Launcher: + * it needs to service this first. + */ if (cpu->pending_notify) break; } } -/* Last of all, we look at what happens first of all. The very first time the - * Guest makes a hypercall, we end up here to set things up: */ +/* + * Last of all, we look at what happens first of all. The very first time the + * Guest makes a hypercall, we end up here to set things up: + */ static void initialize(struct lg_cpu *cpu) { - /* You can't do anything until you're initialized. The Guest knows the - * rules, so we're unforgiving here. */ + /* + * You can't do anything until you're initialized. The Guest knows the + * rules, so we're unforgiving here. + */ if (cpu->hcall->arg0 != LHCALL_LGUEST_INIT) { kill_guest(cpu, "hypercall %li before INIT", cpu->hcall->arg0); return; @@ -185,32 +212,44 @@ static void initialize(struct lg_cpu *cpu) if (lguest_arch_init_hypercalls(cpu)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* The Guest tells us where we're not to deliver interrupts by putting - * the range of addresses into "struct lguest_data". */ + /* + * The Guest tells us where we're not to deliver interrupts by putting + * the range of addresses into "struct lguest_data". + */ if (get_user(cpu->lg->noirq_start, &cpu->lg->lguest_data->noirq_start) || get_user(cpu->lg->noirq_end, &cpu->lg->lguest_data->noirq_end)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* We write the current time into the Guest's data page once so it can - * set its clock. */ + /* + * We write the current time into the Guest's data page once so it can + * set its clock. + */ write_timestamp(cpu); /* page_tables.c will also do some setup. */ page_table_guest_data_init(cpu); - /* This is the one case where the above accesses might have been the + /* + * This is the one case where the above accesses might have been the * first write to a Guest page. This may have caused a copy-on-write * fault, but the old page might be (read-only) in the Guest - * pagetable. */ + * pagetable. + */ guest_pagetable_clear_all(cpu); } /*:*/ -/*M:013 If a Guest reads from a page (so creates a mapping) that it has never +/*M:013 + * If a Guest reads from a page (so creates a mapping) that it has never * written to, and then the Launcher writes to it (ie. the output of a virtual * device), the Guest will still see the old page. In practice, this never * happens: why would the Guest read a page which it has never written to? But - * a similar scenario might one day bite us, so it's worth mentioning. :*/ + * a similar scenario might one day bite us, so it's worth mentioning. + * + * Note that if we used a shared anonymous mapping in the Launcher instead of + * mapping /dev/zero private, we wouldn't worry about cop-on-write. And we + * need that to switch the Launcher to processes (away from threads) anyway. +:*/ /*H:100 * Hypercalls @@ -229,17 +268,22 @@ void do_hypercalls(struct lg_cpu *cpu) return; } - /* The Guest has initialized. + /* + * The Guest has initialized. * - * Look in the hypercall ring for the async hypercalls: */ + * Look in the hypercall ring for the async hypercalls: + */ do_async_hcalls(cpu); - /* If we stopped reading the hypercall ring because the Guest did a + /* + * If we stopped reading the hypercall ring because the Guest did a * NOTIFY to the Launcher, we want to return now. Otherwise we do - * the hypercall. */ + * the hypercall. + */ if (!cpu->pending_notify) { do_hcall(cpu, cpu->hcall); - /* Tricky point: we reset the hcall pointer to mark the + /* + * Tricky point: we reset the hcall pointer to mark the * hypercall as "done". We use the hcall pointer rather than * the trap number to indicate a hypercall is pending. * Normally it doesn't matter: the Guest will run again and @@ -248,13 +292,16 @@ void do_hypercalls(struct lg_cpu *cpu) * However, if we are signalled or the Guest sends I/O to the * Launcher, the run_guest() loop will exit without running the * Guest. When it comes back it would try to re-run the - * hypercall. Finding that bug sucked. */ + * hypercall. Finding that bug sucked. + */ cpu->hcall = NULL; } } -/* This routine supplies the Guest with time: it's used for wallclock time at - * initial boot and as a rough time source if the TSC isn't available. */ +/* + * This routine supplies the Guest with time: it's used for wallclock time at + * initial boot and as a rough time source if the TSC isn't available. + */ void write_timestamp(struct lg_cpu *cpu) { struct timespec now; diff --git a/drivers/lguest/interrupts_and_traps.c b/drivers/lguest/interrupts_and_traps.c index 0e9067b0d50..18648180db0 100644 --- a/drivers/lguest/interrupts_and_traps.c +++ b/drivers/lguest/interrupts_and_traps.c @@ -1,4 +1,5 @@ -/*P:800 Interrupts (traps) are complicated enough to earn their own file. +/*P:800 + * Interrupts (traps) are complicated enough to earn their own file. * There are three classes of interrupts: * * 1) Real hardware interrupts which occur while we're running the Guest, @@ -10,7 +11,8 @@ * just like real hardware would deliver them. Traps from the Guest can be set * up to go directly back into the Guest, but sometimes the Host wants to see * them first, so we also have a way of "reflecting" them into the Guest as if - * they had been delivered to it directly. :*/ + * they had been delivered to it directly. +:*/ #include <linux/uaccess.h> #include <linux/interrupt.h> #include <linux/module.h> @@ -26,8 +28,10 @@ static unsigned long idt_address(u32 lo, u32 hi) return (lo & 0x0000FFFF) | (hi & 0xFFFF0000); } -/* The "type" of the interrupt handler is a 4 bit field: we only support a - * couple of types. */ +/* + * The "type" of the interrupt handler is a 4 bit field: we only support a + * couple of types. + */ static int idt_type(u32 lo, u32 hi) { return (hi >> 8) & 0xF; @@ -39,8 +43,10 @@ static bool idt_present(u32 lo, u32 hi) return (hi & 0x8000); } -/* We need a helper to "push" a value onto the Guest's stack, since that's a - * big part of what delivering an interrupt does. */ +/* + * We need a helper to "push" a value onto the Guest's stack, since that's a + * big part of what delivering an interrupt does. + */ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) { /* Stack grows upwards: move stack then write value. */ @@ -48,7 +54,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) lgwrite(cpu, *gstack, u32, val); } -/*H:210 The set_guest_interrupt() routine actually delivers the interrupt or +/*H:210 + * The set_guest_interrupt() routine actually delivers the interrupt or * trap. The mechanics of delivering traps and interrupts to the Guest are the * same, except some traps have an "error code" which gets pushed onto the * stack as well: the caller tells us if this is one. @@ -59,7 +66,8 @@ static void push_guest_stack(struct lg_cpu *cpu, unsigned long *gstack, u32 val) * * We set up the stack just like the CPU does for a real interrupt, so it's * identical for the Guest (and the standard "iret" instruction will undo - * it). */ + * it). + */ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, bool has_err) { @@ -67,20 +75,26 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, u32 eflags, ss, irq_enable; unsigned long virtstack; - /* There are two cases for interrupts: one where the Guest is already + /* + * There are two cases for interrupts: one where the Guest is already * in the kernel, and a more complex one where the Guest is in - * userspace. We check the privilege level to find out. */ + * userspace. We check the privilege level to find out. + */ if ((cpu->regs->ss&0x3) != GUEST_PL) { - /* The Guest told us their kernel stack with the SET_STACK - * hypercall: both the virtual address and the segment */ + /* + * The Guest told us their kernel stack with the SET_STACK + * hypercall: both the virtual address and the segment. + */ virtstack = cpu->esp1; ss = cpu->ss1; origstack = gstack = guest_pa(cpu, virtstack); - /* We push the old stack segment and pointer onto the new + /* + * We push the old stack segment and pointer onto the new * stack: when the Guest does an "iret" back from the interrupt * handler the CPU will notice they're dropping privilege - * levels and expect these here. */ + * levels and expect these here. + */ push_guest_stack(cpu, &gstack, cpu->regs->ss); push_guest_stack(cpu, &gstack, cpu->regs->esp); } else { @@ -91,18 +105,22 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, origstack = gstack = guest_pa(cpu, virtstack); } - /* Remember that we never let the Guest actually disable interrupts, so + /* + * Remember that we never let the Guest actually disable interrupts, so * the "Interrupt Flag" bit is always set. We copy that bit from the * Guest's "irq_enabled" field into the eflags word: we saw the Guest - * copy it back in "lguest_iret". */ + * copy it back in "lguest_iret". + */ eflags = cpu->regs->eflags; if (get_user(irq_enable, &cpu->lg->lguest_data->irq_enabled) == 0 && !(irq_enable & X86_EFLAGS_IF)) eflags &= ~X86_EFLAGS_IF; - /* An interrupt is expected to push three things on the stack: the old + /* + * An interrupt is expected to push three things on the stack: the old * "eflags" word, the old code segment, and the old instruction - * pointer. */ + * pointer. + */ push_guest_stack(cpu, &gstack, eflags); push_guest_stack(cpu, &gstack, cpu->regs->cs); push_guest_stack(cpu, &gstack, cpu->regs->eip); @@ -111,15 +129,19 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, if (has_err) push_guest_stack(cpu, &gstack, cpu->regs->errcode); - /* Now we've pushed all the old state, we change the stack, the code - * segment and the address to execute. */ + /* + * Now we've pushed all the old state, we change the stack, the code + * segment and the address to execute. + */ cpu->regs->ss = ss; cpu->regs->esp = virtstack + (gstack - origstack); cpu->regs->cs = (__KERNEL_CS|GUEST_PL); cpu->regs->eip = idt_address(lo, hi); - /* There are two kinds of interrupt handlers: 0xE is an "interrupt - * gate" which expects interrupts to be disabled on entry. */ + /* + * There are two kinds of interrupt handlers: 0xE is an "interrupt + * gate" which expects interrupts to be disabled on entry. + */ if (idt_type(lo, hi) == 0xE) if (put_user(0, &cpu->lg->lguest_data->irq_enabled)) kill_guest(cpu, "Disabling interrupts"); @@ -130,7 +152,8 @@ static void set_guest_interrupt(struct lg_cpu *cpu, u32 lo, u32 hi, * * interrupt_pending() returns the first pending interrupt which isn't blocked * by the Guest. It is called before every entry to the Guest, and just before - * we go to sleep when the Guest has halted itself. */ + * we go to sleep when the Guest has halted itself. + */ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) { unsigned int irq; @@ -140,8 +163,10 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) if (!cpu->lg->lguest_data) return LGUEST_IRQS; - /* Take our "irqs_pending" array and remove any interrupts the Guest - * wants blocked: the result ends up in "blk". */ + /* + * Take our "irqs_pending" array and remove any interrupts the Guest + * wants blocked: the result ends up in "blk". + */ if (copy_from_user(&blk, cpu->lg->lguest_data->blocked_interrupts, sizeof(blk))) return LGUEST_IRQS; @@ -154,16 +179,20 @@ unsigned int interrupt_pending(struct lg_cpu *cpu, bool *more) return irq; } -/* This actually diverts the Guest to running an interrupt handler, once an - * interrupt has been identified by interrupt_pending(). */ +/* + * This actually diverts the Guest to running an interrupt handler, once an + * interrupt has been identified by interrupt_pending(). + */ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) { struct desc_struct *idt; BUG_ON(irq >= LGUEST_IRQS); - /* They may be in the middle of an iret, where they asked us never to - * deliver interrupts. */ + /* + * They may be in the middle of an iret, where they asked us never to + * deliver interrupts. + */ if (cpu->regs->eip >= cpu->lg->noirq_start && (cpu->regs->eip < cpu->lg->noirq_end)) return; @@ -187,29 +216,37 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) } } - /* Look at the IDT entry the Guest gave us for this interrupt. The + /* + * Look at the IDT entry the Guest gave us for this interrupt. The * first 32 (FIRST_EXTERNAL_VECTOR) entries are for traps, so we skip - * over them. */ + * over them. + */ idt = &cpu->arch.idt[FIRST_EXTERNAL_VECTOR+irq]; /* If they don't have a handler (yet?), we just ignore it */ if (idt_present(idt->a, idt->b)) { /* OK, mark it no longer pending and deliver it. */ clear_bit(irq, cpu->irqs_pending); - /* set_guest_interrupt() takes the interrupt descriptor and a + /* + * set_guest_interrupt() takes the interrupt descriptor and a * flag to say whether this interrupt pushes an error code onto - * the stack as well: virtual interrupts never do. */ + * the stack as well: virtual interrupts never do. + */ set_guest_interrupt(cpu, idt->a, idt->b, false); } - /* Every time we deliver an interrupt, we update the timestamp in the + /* + * Every time we deliver an interrupt, we update the timestamp in the * Guest's lguest_data struct. It would be better for the Guest if we * did this more often, but it can actually be quite slow: doing it * here is a compromise which means at least it gets updated every - * timer interrupt. */ + * timer interrupt. + */ write_timestamp(cpu); - /* If there are no other interrupts we want to deliver, clear - * the pending flag. */ + /* + * If there are no other interrupts we want to deliver, clear + * the pending flag. + */ if (!more) put_user(0, &cpu->lg->lguest_data->irq_pending); } @@ -217,24 +254,29 @@ void try_deliver_interrupt(struct lg_cpu *cpu, unsigned int irq, bool more) /* And this is the routine when we want to set an interrupt for the Guest. */ void set_interrupt(struct lg_cpu *cpu, unsigned int irq) { - /* Next time the Guest runs, the core code will see if it can deliver - * this interrupt. */ + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_bit(irq, cpu->irqs_pending); - /* Make sure it sees it; it might be asleep (eg. halted), or - * running the Guest right now, in which case kick_process() - * will knock it out. */ + /* + * Make sure it sees it; it might be asleep (eg. halted), or running + * the Guest right now, in which case kick_process() will knock it out. + */ if (!wake_up_process(cpu->tsk)) kick_process(cpu->tsk); } /*:*/ -/* Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent +/* + * Linux uses trap 128 for system calls. Plan9 uses 64, and Ron Minnich sent * me a patch, so we support that too. It'd be a big step for lguest if half * the Plan 9 user base were to start using it. * * Actually now I think of it, it's possible that Ron *is* half the Plan 9 - * userbase. Oh well. */ + * userbase. Oh well. + */ static bool could_be_syscall(unsigned int num) { /* Normal Linux SYSCALL_VECTOR or reserved vector? */ @@ -274,9 +316,11 @@ void free_interrupts(void) clear_bit(syscall_vector, used_vectors); } -/*H:220 Now we've got the routines to deliver interrupts, delivering traps like +/*H:220 + * Now we've got the routines to deliver interrupts, delivering traps like * page fault is easy. The only trick is that Intel decided that some traps - * should have error codes: */ + * should have error codes: + */ static bool has_err(unsigned int trap) { return (trap == 8 || (trap >= 10 && trap <= 14) || trap == 17); @@ -285,13 +329,17 @@ static bool has_err(unsigned int trap) /* deliver_trap() returns true if it could deliver the trap. */ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) { - /* Trap numbers are always 8 bit, but we set an impossible trap number - * for traps inside the Switcher, so check that here. */ + /* + * Trap numbers are always 8 bit, but we set an impossible trap number + * for traps inside the Switcher, so check that here. + */ if (num >= ARRAY_SIZE(cpu->arch.idt)) return false; - /* Early on the Guest hasn't set the IDT entries (or maybe it put a - * bogus one in): if we fail here, the Guest will be killed. */ + /* + * Early on the Guest hasn't set the IDT entries (or maybe it put a + * bogus one in): if we fail here, the Guest will be killed. + */ if (!idt_present(cpu->arch.idt[num].a, cpu->arch.idt[num].b)) return false; set_guest_interrupt(cpu, cpu->arch.idt[num].a, @@ -299,7 +347,8 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) return true; } -/*H:250 Here's the hard part: returning to the Host every time a trap happens +/*H:250 + * Here's the hard part: returning to the Host every time a trap happens * and then calling deliver_trap() and re-entering the Guest is slow. * Particularly because Guest userspace system calls are traps (usually trap * 128). @@ -311,69 +360,87 @@ bool deliver_trap(struct lg_cpu *cpu, unsigned int num) * the other hypervisors would beat it up at lunchtime. * * This routine indicates if a particular trap number could be delivered - * directly. */ + * directly. + */ static bool direct_trap(unsigned int num) { - /* Hardware interrupts don't go to the Guest at all (except system - * call). */ + /* + * Hardware interrupts don't go to the Guest at all (except system + * call). + */ if (num >= FIRST_EXTERNAL_VECTOR && !could_be_syscall(num)) return false; - /* The Host needs to see page faults (for shadow paging and to save the + /* + * The Host needs to see page faults (for shadow paging and to save the * fault address), general protection faults (in/out emulation) and * device not available (TS handling), invalid opcode fault (kvm hcall), - * and of course, the hypercall trap. */ + * and of course, the hypercall trap. + */ return num != 14 && num != 13 && num != 7 && num != 6 && num != LGUEST_TRAP_ENTRY; } /*:*/ -/*M:005 The Guest has the ability to turn its interrupt gates into trap gates, +/*M:005 + * The Guest has the ability to turn its interrupt gates into trap gates, * if it is careful. The Host will let trap gates can go directly to the * Guest, but the Guest needs the interrupts atomically disabled for an * interrupt gate. It can do this by pointing the trap gate at instructions - * within noirq_start and noirq_end, where it can safely disable interrupts. */ + * within noirq_start and noirq_end, where it can safely disable interrupts. + */ -/*M:006 The Guests do not use the sysenter (fast system call) instruction, +/*M:006 + * The Guests do not use the sysenter (fast system call) instruction, * because it's hardcoded to enter privilege level 0 and so can't go direct. * It's about twice as fast as the older "int 0x80" system call, so it might * still be worthwhile to handle it in the Switcher and lcall down to the * Guest. The sysenter semantics are hairy tho: search for that keyword in - * entry.S :*/ + * entry.S +:*/ -/*H:260 When we make traps go directly into the Guest, we need to make sure +/*H:260 + * When we make traps go directly into the Guest, we need to make sure * the kernel stack is valid (ie. mapped in the page tables). Otherwise, the * CPU trying to deliver the trap will fault while trying to push the interrupt * words on the stack: this is called a double fault, and it forces us to kill * the Guest. * - * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. */ + * Which is deeply unfair, because (literally!) it wasn't the Guests' fault. + */ void pin_stack_pages(struct lg_cpu *cpu) { unsigned int i; - /* Depending on the CONFIG_4KSTACKS option, the Guest can have one or - * two pages of stack space. */ + /* + * Depending on the CONFIG_4KSTACKS option, the Guest can have one or + * two pages of stack space. + */ for (i = 0; i < cpu->lg->stack_pages; i++) - /* The stack grows *upwards*, so the address we're given is the + /* + * The stack grows *upwards*, so the address we're given is the * start of the page after the kernel stack. Subtract one to * get back onto the first stack page, and keep subtracting to - * get to the rest of the stack pages. */ + * get to the rest of the stack pages. + */ pin_page(cpu, cpu->esp1 - 1 - i * PAGE_SIZE); } -/* Direct traps also mean that we need to know whenever the Guest wants to use +/* + * Direct traps also mean that we need to know whenever the Guest wants to use * a different kernel stack, so we can change the IDT entries to use that * stack. The IDT entries expect a virtual address, so unlike most addresses * the Guest gives us, the "esp" (stack pointer) value here is virtual, not * physical. * * In Linux each process has its own kernel stack, so this happens a lot: we - * change stacks on each context switch. */ + * change stacks on each context switch. + */ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) { - /* You are not allowed have a stack segment with privilege level 0: bad - * Guest! */ + /* + * You're not allowed a stack segment with privilege level 0: bad Guest! + */ if ((seg & 0x3) != GUEST_PL) kill_guest(cpu, "bad stack segment %i", seg); /* We only expect one or two stack pages. */ @@ -387,11 +454,15 @@ void guest_set_stack(struct lg_cpu *cpu, u32 seg, u32 esp, unsigned int pages) pin_stack_pages(cpu); } -/* All this reference to mapping stacks leads us neatly into the other complex - * part of the Host: page table handling. */ +/* + * All this reference to mapping stacks leads us neatly into the other complex + * part of the Host: page table handling. + */ -/*H:235 This is the routine which actually checks the Guest's IDT entry and - * transfers it into the entry in "struct lguest": */ +/*H:235 + * This is the routine which actually checks the Guest's IDT entry and + * transfers it into the entry in "struct lguest": + */ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, unsigned int num, u32 lo, u32 hi) { @@ -407,30 +478,38 @@ static void set_trap(struct lg_cpu *cpu, struct desc_struct *trap, if (type != 0xE && type != 0xF) kill_guest(cpu, "bad IDT type %i", type); - /* We only copy the handler address, present bit, privilege level and + /* + * We only copy the handler address, present bit, privilege level and * type. The privilege level controls where the trap can be triggered * manually with an "int" instruction. This is usually GUEST_PL, - * except for system calls which userspace can use. */ + * except for system calls which userspace can use. + */ trap->a = ((__KERNEL_CS|GUEST_PL)<<16) | (lo&0x0000FFFF); trap->b = (hi&0xFFFFEF00); } -/*H:230 While we're here, dealing with delivering traps and interrupts to the +/*H:230 + * While we're here, dealing with delivering traps and interrupts to the * Guest, we might as well complete the picture: how the Guest tells us where * it wants them to go. This would be simple, except making traps fast * requires some tricks. * * We saw the Guest setting Interrupt Descriptor Table (IDT) entries with the - * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. */ + * LHCALL_LOAD_IDT_ENTRY hypercall before: that comes here. + */ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) { - /* Guest never handles: NMI, doublefault, spurious interrupt or - * hypercall. We ignore when it tries to set them. */ + /* + * Guest never handles: NMI, doublefault, spurious interrupt or + * hypercall. We ignore when it tries to set them. + */ if (num == 2 || num == 8 || num == 15 || num == LGUEST_TRAP_ENTRY) return; - /* Mark the IDT as changed: next time the Guest runs we'll know we have - * to copy this again. */ + /* + * Mark the IDT as changed: next time the Guest runs we'll know we have + * to copy this again. + */ cpu->changed |= CHANGED_IDT; /* Check that the Guest doesn't try to step outside the bounds. */ @@ -440,9 +519,11 @@ void load_guest_idt_entry(struct lg_cpu *cpu, unsigned int num, u32 lo, u32 hi) set_trap(cpu, &cpu->arch.idt[num], num, lo, hi); } -/* The default entry for each interrupt points into the Switcher routines which +/* + * The default entry for each interrupt points into the Switcher routines which * simply return to the Host. The run_guest() loop will then call - * deliver_trap() to bounce it back into the Guest. */ + * deliver_trap() to bounce it back into the Guest. + */ static void default_idt_entry(struct desc_struct *idt, int trap, const unsigned long handler, @@ -451,13 +532,17 @@ static void default_idt_entry(struct desc_struct *idt, /* A present interrupt gate. */ u32 flags = 0x8e00; - /* Set the privilege level on the entry for the hypercall: this allows - * the Guest to use the "int" instruction to trigger it. */ + /* + * Set the privilege level on the entry for the hypercall: this allows + * the Guest to use the "int" instruction to trigger it. + */ if (trap == LGUEST_TRAP_ENTRY) flags |= (GUEST_PL << 13); else if (base) - /* Copy priv. level from what Guest asked for. This allows - * debug (int 3) traps from Guest userspace, for example. */ + /* + * Copy privilege level from what Guest asked for. This allows + * debug (int 3) traps from Guest userspace, for example. + */ flags |= (base->b & 0x6000); /* Now pack it into the IDT entry in its weird format. */ @@ -475,16 +560,20 @@ void setup_default_idt_entries(struct lguest_ro_state *state, default_idt_entry(&state->guest_idt[i], i, def[i], NULL); } -/*H:240 We don't use the IDT entries in the "struct lguest" directly, instead +/*H:240 + * We don't use the IDT entries in the "struct lguest" directly, instead * we copy them into the IDT which we've set up for Guests on this CPU, just - * before we run the Guest. This routine does that copy. */ + * before we run the Guest. This routine does that copy. + */ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, const unsigned long *def) { unsigned int i; - /* We can simply copy the direct traps, otherwise we use the default - * ones in the Switcher: they will return to the Host. */ + /* + * We can simply copy the direct traps, otherwise we use the default + * ones in the Switcher: they will return to the Host. + */ for (i = 0; i < ARRAY_SIZE(cpu->arch.idt); i++) { const struct desc_struct *gidt = &cpu->arch.idt[i]; @@ -492,14 +581,16 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, if (!direct_trap(i)) continue; - /* Only trap gates (type 15) can go direct to the Guest. + /* + * Only trap gates (type 15) can go direct to the Guest. * Interrupt gates (type 14) disable interrupts as they are * entered, which we never let the Guest do. Not present * entries (type 0x0) also can't go direct, of course. * * If it can't go direct, we still need to copy the priv. level: * they might want to give userspace access to a software - * interrupt. */ + * interrupt. + */ if (idt_type(gidt->a, gidt->b) == 0xF) idt[i] = *gidt; else @@ -518,7 +609,8 @@ void copy_traps(const struct lg_cpu *cpu, struct desc_struct *idt, * the next timer interrupt (in nanoseconds). We use the high-resolution timer * infrastructure to set a callback at that time. * - * 0 means "turn off the clock". */ + * 0 means "turn off the clock". + */ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) { ktime_t expires; @@ -529,9 +621,11 @@ void guest_set_clockevent(struct lg_cpu *cpu, unsigned long delta) return; } - /* We use wallclock time here, so the Guest might not be running for + /* + * We use wallclock time here, so the Guest might not be running for * all the time between now and the timer interrupt it asked for. This - * is almost always the right thing to do. */ + * is almost always the right thing to do. + */ expires = ktime_add_ns(ktime_get_real(), delta); hrtimer_start(&cpu->hrt, expires, HRTIMER_MODE_ABS); } diff --git a/drivers/lguest/lg.h b/drivers/lguest/lg.h index 01c59192379..bc28745d05a 100644 --- a/drivers/lguest/lg.h +++ b/drivers/lguest/lg.h @@ -16,15 +16,13 @@ void free_pagetables(void); int init_pagetables(struct page **switcher_page, unsigned int pages); -struct pgdir -{ +struct pgdir { unsigned long gpgdir; pgd_t *pgdir; }; /* We have two pages shared with guests, per cpu. */ -struct lguest_pages -{ +struct lguest_pages { /* This is the stack page mapped rw in guest */ char spare[PAGE_SIZE - sizeof(struct lguest_regs)]; struct lguest_regs regs; @@ -54,13 +52,13 @@ struct lg_cpu { unsigned long pending_notify; /* pfn from LHCALL_NOTIFY */ - /* At end of a page shared mapped over lguest_pages in guest. */ + /* At end of a page shared mapped over lguest_pages in guest. */ unsigned long regs_page; struct lguest_regs *regs; struct lguest_pages *last_pages; - int cpu_pgd; /* which pgd this cpu is currently using */ + int cpu_pgd; /* Which pgd this cpu is currently using */ /* If a hypercall was asked for, this points to the arguments. */ struct hcall_args *hcall; @@ -89,15 +87,17 @@ struct lg_eventfd_map { }; /* The private info the thread maintains about the guest. */ -struct lguest -{ +struct lguest { struct lguest_data __user *lguest_data; struct lg_cpu cpus[NR_CPUS]; unsigned int nr_cpus; u32 pfn_limit; - /* This provides the offset to the base of guest-physical - * memory in the Launcher. */ + + /* + * This provides the offset to the base of guest-physical memory in the + * Launcher. + */ void __user *mem_base; unsigned long kernel_address; @@ -122,11 +122,13 @@ bool lguest_address_ok(const struct lguest *lg, void __lgread(struct lg_cpu *, void *, unsigned long, unsigned); void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); -/*H:035 Using memory-copy operations like that is usually inconvient, so we +/*H:035 + * Using memory-copy operations like that is usually inconvient, so we * have the following helper macros which read and write a specific type (often * an unsigned long). * - * This reads into a variable of the given type then returns that. */ + * This reads into a variable of the given type then returns that. + */ #define lgread(cpu, addr, type) \ ({ type _v; __lgread((cpu), &_v, (addr), sizeof(_v)); _v; }) @@ -140,9 +142,11 @@ void __lgwrite(struct lg_cpu *, unsigned long, const void *, unsigned); int run_guest(struct lg_cpu *cpu, unsigned long __user *user); -/* Helper macros to obtain the first 12 or the last 20 bits, this is only the +/* + * Helper macros to obtain the first 12 or the last 20 bits, this is only the * first step in the migration to the kernel types. pte_pfn is already defined - * in the kernel. */ + * in the kernel. + */ #define pgd_flags(x) (pgd_val(x) & ~PAGE_MASK) #define pgd_pfn(x) (pgd_val(x) >> PAGE_SHIFT) #define pmd_flags(x) (pmd_val(x) & ~PAGE_MASK) diff --git a/drivers/lguest/lguest_device.c b/drivers/lguest/lguest_device.c index e082cdac88b..b6200bc39b5 100644 --- a/drivers/lguest/lguest_device.c +++ b/drivers/lguest/lguest_device.c @@ -1,10 +1,12 @@ -/*P:050 Lguest guests use a very simple method to describe devices. It's a +/*P:050 + * Lguest guests use a very simple method to describe devices. It's a * series of device descriptors contained just above the top of normal Guest * memory. * * We use the standard "virtio" device infrastructure, which provides us with a * console, a network and a block driver. Each one expects some configuration - * information and a "virtqueue" or two to send and receive data. :*/ + * information and a "virtqueue" or two to send and receive data. +:*/ #include <linux/init.h> #include <linux/bootmem.h> #include <linux/lguest_launcher.h> @@ -20,8 +22,10 @@ /* The pointer to our (page) of device descriptions. */ static void *lguest_devices; -/* For Guests, device memory can be used as normal memory, so we cast away the - * __iomem to quieten sparse. */ +/* + * For Guests, device memory can be used as normal memory, so we cast away the + * __iomem to quieten sparse. + */ static inline void *lguest_map(unsigned long phys_addr, unsigned long pages) { return (__force void *)ioremap_cache(phys_addr, PAGE_SIZE*pages); @@ -32,8 +36,10 @@ static inline void lguest_unmap(void *addr) iounmap((__force void __iomem *)addr); } -/*D:100 Each lguest device is just a virtio device plus a pointer to its entry - * in the lguest_devices page. */ +/*D:100 + * Each lguest device is just a virtio device plus a pointer to its entry + * in the lguest_devices page. + */ struct lguest_device { struct virtio_device vdev; @@ -41,9 +47,11 @@ struct lguest_device { struct lguest_device_desc *desc; }; -/* Since the virtio infrastructure hands us a pointer to the virtio_device all +/* + * Since the virtio infrastructure hands us a pointer to the virtio_device all * the time, it helps to have a curt macro to get a pointer to the struct - * lguest_device it's enclosed in. */ + * lguest_device it's enclosed in. + */ #define to_lgdev(vd) container_of(vd, struct lguest_device, vdev) /*D:130 @@ -55,7 +63,8 @@ struct lguest_device { * the driver will look at them during setup. * * A convenient routine to return the device's virtqueue config array: - * immediately after the descriptor. */ + * immediately after the descriptor. + */ static struct lguest_vqconfig *lg_vq(const struct lguest_device_desc *desc) { return (void *)(desc + 1); @@ -98,10 +107,12 @@ static u32 lg_get_features(struct virtio_device *vdev) return features; } -/* The virtio core takes the features the Host offers, and copies the - * ones supported by the driver into the vdev->features array. Once - * that's all sorted out, this routine is called so we can tell the - * Host which features we understand and accept. */ +/* + * The virtio core takes the features the Host offers, and copies the ones + * supported by the driver into the vdev->features array. Once that's all + * sorted out, this routine is called so we can tell the Host which features we + * understand and accept. + */ static void lg_finalize_features(struct virtio_device *vdev) { unsigned int i, bits; @@ -112,10 +123,11 @@ static void lg_finalize_features(struct virtio_device *vdev) /* Give virtio_ring a chance to accept features. */ vring_transport_features(vdev); - /* The vdev->feature array is a Linux bitmask: this isn't the - * same as a the simple array of bits used by lguest devices - * for features. So we do this slow, manual conversion which is - * completely general. */ + /* + * The vdev->feature array is a Linux bitmask: this isn't the same as a + * the simple array of bits used by lguest devices for features. So we + * do this slow, manual conversion which is completely general. + */ memset(out_features, 0, desc->feature_len); bits = min_t(unsigned, desc->feature_len, sizeof(vdev->features)) * 8; for (i = 0; i < bits; i++) { @@ -146,15 +158,19 @@ static void lg_set(struct virtio_device *vdev, unsigned int offset, memcpy(lg_config(desc) + offset, buf, len); } -/* The operations to get and set the status word just access the status field - * of the device descriptor. */ +/* + * The operations to get and set the status word just access the status field + * of the device descriptor. + */ static u8 lg_get_status(struct virtio_device *vdev) { return to_lgdev(vdev)->desc->status; } -/* To notify on status updates, we (ab)use the NOTIFY hypercall, with the - * descriptor address of the device. A zero status means "reset". */ +/* + * To notify on status updates, we (ab)use the NOTIFY hypercall, with the + * descriptor address of the device. A zero status means "reset". + */ static void set_status(struct virtio_device *vdev, u8 status) { unsigned long offset = (void *)to_lgdev(vdev)->desc - lguest_devices; @@ -191,8 +207,7 @@ static void lg_reset(struct virtio_device *vdev) */ /*D:140 This is the information we remember about each virtqueue. */ -struct lguest_vq_info -{ +struct lguest_vq_info { /* A copy of the information contained in the device config. */ struct lguest_vqconfig config; @@ -200,13 +215,17 @@ struct lguest_vq_info void *pages; }; -/* When the virtio_ring code wants to prod the Host, it calls us here and we +/* + * When the virtio_ring code wants to prod the Host, it calls us here and we * make a hypercall. We hand the physical address of the virtqueue so the Host - * knows which virtqueue we're talking about. */ + * knows which virtqueue we're talking about. + */ static void lg_notify(struct virtqueue *vq) { - /* We store our virtqueue information in the "priv" pointer of the - * virtqueue structure. */ + /* + * We store our virtqueue information in the "priv" pointer of the + * virtqueue structure. + */ struct lguest_vq_info *lvq = vq->priv; kvm_hypercall1(LHCALL_NOTIFY, lvq->config.pfn << PAGE_SHIFT); @@ -215,7 +234,8 @@ static void lg_notify(struct virtqueue *vq) /* An extern declaration inside a C file is bad form. Don't do it. */ extern void lguest_setup_irq(unsigned int irq); -/* This routine finds the first virtqueue described in the configuration of +/* + * This routine finds the Nth virtqueue described in the configuration of * this device and sets it up. * * This is kind of an ugly duckling. It'd be nicer to have a standard @@ -223,9 +243,7 @@ extern void lguest_setup_irq(unsigned int irq); * everyone wants to do it differently. The KVM coders want the Guest to * allocate its own pages and tell the Host where they are, but for lguest it's * simpler for the Host to simply tell us where the pages are. - * - * So we provide drivers with a "find the Nth virtqueue and set it up" - * function. */ + */ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), @@ -244,9 +262,11 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, if (!lvq) return ERR_PTR(-ENOMEM); - /* Make a copy of the "struct lguest_vqconfig" entry, which sits after + /* + * Make a copy of the "struct lguest_vqconfig" entry, which sits after * the descriptor. We need a copy because the config space might not - * be aligned correctly. */ + * be aligned correctly. + */ memcpy(&lvq->config, lg_vq(ldev->desc)+index, sizeof(lvq->config)); printk("Mapping virtqueue %i addr %lx\n", index, @@ -261,8 +281,10 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, goto free_lvq; } - /* OK, tell virtio_ring.c to set up a virtqueue now we know its size - * and we've got a pointer to its pages. */ + /* + * OK, tell virtio_ring.c to set up a virtqueue now we know its size + * and we've got a pointer to its pages. + */ vq = vring_new_virtqueue(lvq->config.num, LGUEST_VRING_ALIGN, vdev, lvq->pages, lg_notify, callback, name); if (!vq) { @@ -273,18 +295,23 @@ static struct virtqueue *lg_find_vq(struct virtio_device *vdev, /* Make sure the interrupt is allocated. */ lguest_setup_irq(lvq->config.irq); - /* Tell the interrupt for this virtqueue to go to the virtio_ring - * interrupt handler. */ - /* FIXME: We used to have a flag for the Host to tell us we could use + /* + * Tell the interrupt for this virtqueue to go to the virtio_ring + * interrupt handler. + * + * FIXME: We used to have a flag for the Host to tell us we could use * the interrupt as a source of randomness: it'd be nice to have that - * back.. */ + * back. + */ err = request_irq(lvq->config.irq, vring_interrupt, IRQF_SHARED, dev_name(&vdev->dev), vq); if (err) goto destroy_vring; - /* Last of all we hook up our 'struct lguest_vq_info" to the - * virtqueue's priv pointer. */ + /* + * Last of all we hook up our 'struct lguest_vq_info" to the + * virtqueue's priv pointer. + */ vq->priv = lvq; return vq; @@ -358,11 +385,14 @@ static struct virtio_config_ops lguest_config_ops = { .del_vqs = lg_del_vqs, }; -/* The root device for the lguest virtio devices. This makes them appear as - * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. */ +/* + * The root device for the lguest virtio devices. This makes them appear as + * /sys/devices/lguest/0,1,2 not /sys/devices/0,1,2. + */ static struct device *lguest_root; -/*D:120 This is the core of the lguest bus: actually adding a new device. +/*D:120 + * This is the core of the lguest bus: actually adding a new device. * It's a separate function because it's neater that way, and because an * earlier version of the code supported hotplug and unplug. They were removed * early on because they were never used. @@ -371,14 +401,14 @@ static struct device *lguest_root; * * It's worth reading this carefully: we start with a pointer to the new device * descriptor in the "lguest_devices" page, and the offset into the device - * descriptor page so we can uniquely identify it if things go badly wrong. */ + * descriptor page so we can uniquely identify it if things go badly wrong. + */ static void add_lguest_device(struct lguest_device_desc *d, unsigned int offset) { struct lguest_device *ldev; - /* Start with zeroed memory; Linux's device layer seems to count on - * it. */ + /* Start with zeroed memory; Linux's device layer counts on it. */ ldev = kzalloc(sizeof(*ldev), GFP_KERNEL); if (!ldev) { printk(KERN_EMERG "Cannot allocate lguest dev %u type %u\n", @@ -388,17 +418,25 @@ static void add_lguest_device(struct lguest_device_desc *d, /* This devices' parent is the lguest/ dir. */ ldev->vdev.dev.parent = lguest_root; - /* We have a unique device index thanks to the dev_index counter. */ + /* + * The device type comes straight from the descriptor. There's also a + * device vendor field in the virtio_device struct, which we leave as + * 0. + */ ldev->vdev.id.device = d->type; - /* We have a simple set of routines for querying the device's - * configuration information and setting its status. */ + /* + * We have a simple set of routines for querying the device's + * configuration information and setting its status. + */ ldev->vdev.config = &lguest_config_ops; /* And we remember the device's descriptor for lguest_config_ops. */ ldev->desc = d; - /* register_virtio_device() sets up the generic fields for the struct + /* + * register_virtio_device() sets up the generic fields for the struct * virtio_device and calls device_register(). This makes the bus - * infrastructure look for a matching driver. */ + * infrastructure look for a matching driver. + */ if (register_virtio_device(&ldev->vdev) != 0) { printk(KERN_ERR "Failed to register lguest dev %u type %u\n", offset, d->type); @@ -406,8 +444,10 @@ static void add_lguest_device(struct lguest_device_desc *d, } } -/*D:110 scan_devices() simply iterates through the device page. The type 0 is - * reserved to mean "end of devices". */ +/*D:110 + * scan_devices() simply iterates through the device page. The type 0 is + * reserved to mean "end of devices". + */ static void scan_devices(void) { unsigned int i; @@ -426,7 +466,8 @@ static void scan_devices(void) } } -/*D:105 Fairly early in boot, lguest_devices_init() is called to set up the +/*D:105 + * Fairly early in boot, lguest_devices_init() is called to set up the * lguest device infrastructure. We check that we are a Guest by checking * pv_info.name: there are other ways of checking, but this seems most * obvious to me. @@ -437,7 +478,8 @@ static void scan_devices(void) * correct sysfs incantation). * * Finally we call scan_devices() which adds all the devices found in the - * lguest_devices page. */ + * lguest_devices page. + */ static int __init lguest_devices_init(void) { if (strcmp(pv_info.name, "lguest") != 0) @@ -456,11 +498,13 @@ static int __init lguest_devices_init(void) /* We do this after core stuff, but before the drivers. */ postcore_initcall(lguest_devices_init); -/*D:150 At this point in the journey we used to now wade through the lguest +/*D:150 + * At this point in the journey we used to now wade through the lguest * devices themselves: net, block and console. Since they're all now virtio * devices rather than lguest-specific, I've decided to ignore them. Mostly, * they're kind of boring. But this does mean you'll never experience the * thrill of reading the forbidden love scene buried deep in the block driver. * * "make Launcher" beckons, where we answer questions like "Where do Guests - * come from?", and "What do you do when someone asks for optimization?". */ + * come from?", and "What do you do when someone asks for optimization?". + */ diff --git a/drivers/lguest/lguest_user.c b/drivers/lguest/lguest_user.c index 9f9a2953b38..b4d3f7ca554 100644 --- a/drivers/lguest/lguest_user.c +++ b/drivers/lguest/lguest_user.c @@ -1,8 +1,9 @@ /*P:200 This contains all the /dev/lguest code, whereby the userspace launcher * controls and communicates with the Guest. For example, the first write will - * tell us the Guest's memory layout, pagetable, entry point and kernel address - * offset. A read will run the Guest until something happens, such as a signal - * or the Guest doing a NOTIFY out to the Launcher. :*/ + * tell us the Guest's memory layout and entry point. A read will run the + * Guest until something happens, such as a signal or the Guest doing a NOTIFY + * out to the Launcher. +:*/ #include <linux/uaccess.h> #include <linux/miscdevice.h> #include <linux/fs.h> @@ -11,14 +12,41 @@ #include <linux/file.h> #include "lg.h" +/*L:056 + * Before we move on, let's jump ahead and look at what the kernel does when + * it needs to look up the eventfds. That will complete our picture of how we + * use RCU. + * + * The notification value is in cpu->pending_notify: we return true if it went + * to an eventfd. + */ bool send_notify_to_eventfd(struct lg_cpu *cpu) { unsigned int i; struct lg_eventfd_map *map; - /* lg->eventfds is RCU-protected */ + /* + * This "rcu_read_lock()" helps track when someone is still looking at + * the (RCU-using) eventfds array. It's not actually a lock at all; + * indeed it's a noop in many configurations. (You didn't expect me to + * explain all the RCU secrets here, did you?) + */ rcu_read_lock(); + /* + * rcu_dereference is the counter-side of rcu_assign_pointer(); it + * makes sure we don't access the memory pointed to by + * cpu->lg->eventfds before cpu->lg->eventfds is set. Sounds crazy, + * but Alpha allows this! Paul McKenney points out that a really + * aggressive compiler could have the same effect: + * http://lists.ozlabs.org/pipermail/lguest/2009-July/001560.html + * + * So play safe, use rcu_dereference to get the rcu-protected pointer: + */ map = rcu_dereference(cpu->lg->eventfds); + /* + * Simple array search: even if they add an eventfd while we do this, + * we'll continue to use the old array and just won't see the new one. + */ for (i = 0; i < map->num; i++) { if (map->map[i].addr == cpu->pending_notify) { eventfd_signal(map->map[i].event, 1); @@ -26,19 +54,50 @@ bool send_notify_to_eventfd(struct lg_cpu *cpu) break; } } + /* We're done with the rcu-protected variable cpu->lg->eventfds. */ rcu_read_unlock(); + + /* If we cleared the notification, it's because we found a match. */ return cpu->pending_notify == 0; } +/*L:055 + * One of the more tricksy tricks in the Linux Kernel is a technique called + * Read Copy Update. Since one point of lguest is to teach lguest journeyers + * about kernel coding, I use it here. (In case you're curious, other purposes + * include learning about virtualization and instilling a deep appreciation for + * simplicity and puppies). + * + * We keep a simple array which maps LHCALL_NOTIFY values to eventfds, but we + * add new eventfds without ever blocking readers from accessing the array. + * The current Launcher only does this during boot, so that never happens. But + * Read Copy Update is cool, and adding a lock risks damaging even more puppies + * than this code does. + * + * We allocate a brand new one-larger array, copy the old one and add our new + * element. Then we make the lg eventfd pointer point to the new array. + * That's the easy part: now we need to free the old one, but we need to make + * sure no slow CPU somewhere is still looking at it. That's what + * synchronize_rcu does for us: waits until every CPU has indicated that it has + * moved on to know it's no longer using the old one. + * + * If that's unclear, see http://en.wikipedia.org/wiki/Read-copy-update. + */ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) { struct lg_eventfd_map *new, *old = lg->eventfds; + /* + * We don't allow notifications on value 0 anyway (pending_notify of + * 0 means "nothing pending"). + */ if (!addr) return -EINVAL; - /* Replace the old array with the new one, carefully: others can - * be accessing it at the same time */ + /* + * Replace the old array with the new one, carefully: others can + * be accessing it at the same time. + */ new = kmalloc(sizeof(*new) + sizeof(new->map[0]) * (old->num + 1), GFP_KERNEL); if (!new) @@ -52,22 +111,41 @@ static int add_eventfd(struct lguest *lg, unsigned long addr, int fd) new->map[new->num].addr = addr; new->map[new->num].event = eventfd_ctx_fdget(fd); if (IS_ERR(new->map[new->num].event)) { + int err = PTR_ERR(new->map[new->num].event); kfree(new); - return PTR_ERR(new->map[new->num].event); + return err; } new->num++; - /* Now put new one in place. */ + /* + * Now put new one in place: rcu_assign_pointer() is a fancy way of + * doing "lg->eventfds = new", but it uses memory barriers to make + * absolutely sure that the contents of "new" written above is nailed + * down before we actually do the assignment. + * + * We have to think about these kinds of things when we're operating on + * live data without locks. + */ rcu_assign_pointer(lg->eventfds, new); - /* We're not in a big hurry. Wait until noone's looking at old - * version, then delete it. */ + /* + * We're not in a big hurry. Wait until noone's looking at old + * version, then free it. + */ synchronize_rcu(); kfree(old); return 0; } +/*L:052 + * Receiving notifications from the Guest is usually done by attaching a + * particular LHCALL_NOTIFY value to an event filedescriptor. The eventfd will + * become readable when the Guest does an LHCALL_NOTIFY with that value. + * + * This is really convenient for processing each virtqueue in a separate + * thread. + */ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) { unsigned long addr, fd; @@ -79,15 +157,22 @@ static int attach_eventfd(struct lguest *lg, const unsigned long __user *input) if (get_user(fd, input) != 0) return -EFAULT; + /* + * Just make sure two callers don't add eventfds at once. We really + * only need to lock against callers adding to the same Guest, so using + * the Big Lguest Lock is overkill. But this is setup, not a fast path. + */ mutex_lock(&lguest_lock); err = add_eventfd(lg, addr, fd); mutex_unlock(&lguest_lock); - return 0; + return err; } -/*L:050 Sending an interrupt is done by writing LHREQ_IRQ and an interrupt - * number to /dev/lguest. */ +/*L:050 + * Sending an interrupt is done by writing LHREQ_IRQ and an interrupt + * number to /dev/lguest. + */ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) { unsigned long irq; @@ -97,12 +182,18 @@ static int user_send_irq(struct lg_cpu *cpu, const unsigned long __user *input) if (irq >= LGUEST_IRQS) return -EINVAL; + /* + * Next time the Guest runs, the core code will see if it can deliver + * this interrupt. + */ set_interrupt(cpu, irq); return 0; } -/*L:040 Once our Guest is initialized, the Launcher makes it run by reading - * from /dev/lguest. */ +/*L:040 + * Once our Guest is initialized, the Launcher makes it run by reading + * from /dev/lguest. + */ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) { struct lguest *lg = file->private_data; @@ -138,8 +229,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return len; } - /* If we returned from read() last time because the Guest sent I/O, - * clear the flag. */ + /* + * If we returned from read() last time because the Guest sent I/O, + * clear the flag. + */ if (cpu->pending_notify) cpu->pending_notify = 0; @@ -147,8 +240,10 @@ static ssize_t read(struct file *file, char __user *user, size_t size,loff_t*o) return run_guest(cpu, (unsigned long __user *)user); } -/*L:025 This actually initializes a CPU. For the moment, a Guest is only - * uniprocessor, so "id" is always 0. */ +/*L:025 + * This actually initializes a CPU. For the moment, a Guest is only + * uniprocessor, so "id" is always 0. + */ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) { /* We have a limited number the number of CPUs in the lguest struct. */ @@ -163,8 +258,10 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* Each CPU has a timer it can set. */ init_clockdev(cpu); - /* We need a complete page for the Guest registers: they are accessible - * to the Guest and we can only grant it access to whole pages. */ + /* + * We need a complete page for the Guest registers: they are accessible + * to the Guest and we can only grant it access to whole pages. + */ cpu->regs_page = get_zeroed_page(GFP_KERNEL); if (!cpu->regs_page) return -ENOMEM; @@ -172,29 +269,38 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) /* We actually put the registers at the bottom of the page. */ cpu->regs = (void *)cpu->regs_page + PAGE_SIZE - sizeof(*cpu->regs); - /* Now we initialize the Guest's registers, handing it the start - * address. */ + /* + * Now we initialize the Guest's registers, handing it the start + * address. + */ lguest_arch_setup_regs(cpu, start_ip); - /* We keep a pointer to the Launcher task (ie. current task) for when - * other Guests want to wake this one (eg. console input). */ + /* + * We keep a pointer to the Launcher task (ie. current task) for when + * other Guests want to wake this one (eg. console input). + */ cpu->tsk = current; - /* We need to keep a pointer to the Launcher's memory map, because if + /* + * We need to keep a pointer to the Launcher's memory map, because if * the Launcher dies we need to clean it up. If we don't keep a - * reference, it is destroyed before close() is called. */ + * reference, it is destroyed before close() is called. + */ cpu->mm = get_task_mm(cpu->tsk); - /* We remember which CPU's pages this Guest used last, for optimization - * when the same Guest runs on the same CPU twice. */ + /* + * We remember which CPU's pages this Guest used last, for optimization + * when the same Guest runs on the same CPU twice. + */ cpu->last_pages = NULL; /* No error == success. */ return 0; } -/*L:020 The initialization write supplies 3 pointer sized (32 or 64 bit) - * values (in addition to the LHREQ_INITIALIZE value). These are: +/*L:020 + * The initialization write supplies 3 pointer sized (32 or 64 bit) values (in + * addition to the LHREQ_INITIALIZE value). These are: * * base: The start of the Guest-physical memory inside the Launcher memory. * @@ -206,14 +312,15 @@ static int lg_cpu_start(struct lg_cpu *cpu, unsigned id, unsigned long start_ip) */ static int initialize(struct file *file, const unsigned long __user *input) { - /* "struct lguest" contains everything we (the Host) know about a - * Guest. */ + /* "struct lguest" contains all we (the Host) know about a Guest. */ struct lguest *lg; int err; unsigned long args[3]; - /* We grab the Big Lguest lock, which protects against multiple - * simultaneous initializations. */ + /* + * We grab the Big Lguest lock, which protects against multiple + * simultaneous initializations. + */ mutex_lock(&lguest_lock); /* You can't initialize twice! Close the device and start again... */ if (file->private_data) { @@ -248,8 +355,10 @@ static int initialize(struct file *file, const unsigned long __user *input) if (err) goto free_eventfds; - /* Initialize the Guest's shadow page tables, using the toplevel - * address the Launcher gave us. This allocates memory, so can fail. */ + /* + * Initialize the Guest's shadow page tables, using the toplevel + * address the Launcher gave us. This allocates memory, so can fail. + */ err = init_guest_pagetable(lg); if (err) goto free_regs; @@ -274,20 +383,24 @@ unlock: return err; } -/*L:010 The first operation the Launcher does must be a write. All writes +/*L:010 + * The first operation the Launcher does must be a write. All writes * start with an unsigned long number: for the first write this must be * LHREQ_INITIALIZE to set up the Guest. After that the Launcher can use - * writes of other values to send interrupts. + * writes of other values to send interrupts or set up receipt of notifications. * * Note that we overload the "offset" in the /dev/lguest file to indicate what - * CPU number we're dealing with. Currently this is always 0, since we only + * CPU number we're dealing with. Currently this is always 0 since we only * support uniprocessor Guests, but you can see the beginnings of SMP support - * here. */ + * here. + */ static ssize_t write(struct file *file, const char __user *in, size_t size, loff_t *off) { - /* Once the Guest is initialized, we hold the "struct lguest" in the - * file private data. */ + /* + * Once the Guest is initialized, we hold the "struct lguest" in the + * file private data. + */ struct lguest *lg = file->private_data; const unsigned long __user *input = (const unsigned long __user *)in; unsigned long req; @@ -322,13 +435,15 @@ static ssize_t write(struct file *file, const char __user *in, } } -/*L:060 The final piece of interface code is the close() routine. It reverses +/*L:060 + * The final piece of interface code is the close() routine. It reverses * everything done in initialize(). This is usually called because the * Launcher exited. * * Note that the close routine returns 0 or a negative error number: it can't * really fail, but it can whine. I blame Sun for this wart, and K&R C for - * letting them do it. :*/ + * letting them do it. +:*/ static int close(struct inode *inode, struct file *file) { struct lguest *lg = file->private_data; @@ -338,8 +453,10 @@ static int close(struct inode *inode, struct file *file) if (!lg) return 0; - /* We need the big lock, to protect from inter-guest I/O and other - * Launchers initializing guests. */ + /* + * We need the big lock, to protect from inter-guest I/O and other + * Launchers initializing guests. + */ mutex_lock(&lguest_lock); /* Free up the shadow page tables for the Guest. */ @@ -350,8 +467,10 @@ static int close(struct inode *inode, struct file *file) hrtimer_cancel(&lg->cpus[i].hrt); /* We can free up the register page we allocated. */ free_page(lg->cpus[i].regs_page); - /* Now all the memory cleanups are done, it's safe to release - * the Launcher's memory management structure. */ + /* + * Now all the memory cleanups are done, it's safe to release + * the Launcher's memory management structure. + */ mmput(lg->cpus[i].mm); } @@ -360,8 +479,10 @@ static int close(struct inode *inode, struct file *file) eventfd_ctx_put(lg->eventfds->map[i].event); kfree(lg->eventfds); - /* If lg->dead doesn't contain an error code it will be NULL or a - * kmalloc()ed string, either of which is ok to hand to kfree(). */ + /* + * If lg->dead doesn't contain an error code it will be NULL or a + * kmalloc()ed string, either of which is ok to hand to kfree(). + */ if (!IS_ERR(lg->dead)) kfree(lg->dead); /* Free the memory allocated to the lguest_struct */ @@ -385,7 +506,8 @@ static int close(struct inode *inode, struct file *file) * * We begin our understanding with the Host kernel interface which the Launcher * uses: reading and writing a character device called /dev/lguest. All the - * work happens in the read(), write() and close() routines: */ + * work happens in the read(), write() and close() routines: + */ static struct file_operations lguest_fops = { .owner = THIS_MODULE, .release = close, @@ -393,8 +515,10 @@ static struct file_operations lguest_fops = { .read = read, }; -/* This is a textbook example of a "misc" character device. Populate a "struct - * miscdevice" and register it with misc_register(). */ +/* + * This is a textbook example of a "misc" character device. Populate a "struct + * miscdevice" and register it with misc_register(). + */ static struct miscdevice lguest_dev = { .minor = MISC_DYNAMIC_MINOR, .name = "lguest", diff --git a/drivers/lguest/page_tables.c b/drivers/lguest/page_tables.c index a6fe1abda24..a8d0aee3bc0 100644 --- a/drivers/lguest/page_tables.c +++ b/drivers/lguest/page_tables.c @@ -1,9 +1,11 @@ -/*P:700 The pagetable code, on the other hand, still shows the scars of +/*P:700 + * The pagetable code, on the other hand, still shows the scars of * previous encounters. It's functional, and as neat as it can be in the * circumstances, but be wary, for these things are subtle and break easily. * The Guest provides a virtual to physical mapping, but we can neither trust * it nor use it: we verify and convert it here then point the CPU to the - * converted Guest pages when running the Guest. :*/ + * converted Guest pages when running the Guest. +:*/ /* Copyright (C) Rusty Russell IBM Corporation 2006. * GPL v2 and any later version */ @@ -17,18 +19,20 @@ #include <asm/bootparam.h> #include "lg.h" -/*M:008 We hold reference to pages, which prevents them from being swapped. +/*M:008 + * We hold reference to pages, which prevents them from being swapped. * It'd be nice to have a callback in the "struct mm_struct" when Linux wants * to swap out. If we had this, and a shrinker callback to trim PTE pages, we - * could probably consider launching Guests as non-root. :*/ + * could probably consider launching Guests as non-root. +:*/ /*H:300 * The Page Table Code * - * We use two-level page tables for the Guest. If you're not entirely - * comfortable with virtual addresses, physical addresses and page tables then - * I recommend you review arch/x86/lguest/boot.c's "Page Table Handling" (with - * diagrams!). + * We use two-level page tables for the Guest, or three-level with PAE. If + * you're not entirely comfortable with virtual addresses, physical addresses + * and page tables then I recommend you review arch/x86/lguest/boot.c's "Page + * Table Handling" (with diagrams!). * * The Guest keeps page tables, but we maintain the actual ones here: these are * called "shadow" page tables. Which is a very Guest-centric name: these are @@ -45,16 +49,18 @@ * (v) Flushing (throwing away) page tables, * (vi) Mapping the Switcher when the Guest is about to run, * (vii) Setting up the page tables initially. - :*/ +:*/ - -/* 1024 entries in a page table page maps 1024 pages: 4MB. The Switcher is - * conveniently placed at the top 4MB, so it uses a separate, complete PTE - * page. */ +/* + * The Switcher uses the complete top PTE page. That's 1024 PTE entries (4MB) + * or 512 PTE entries with PAE (2MB). + */ #define SWITCHER_PGD_INDEX (PTRS_PER_PGD - 1) -/* For PAE we need the PMD index as well. We use the last 2MB, so we - * will need the last pmd entry of the last pmd page. */ +/* + * For PAE we need the PMD index as well. We use the last 2MB, so we + * will need the last pmd entry of the last pmd page. + */ #ifdef CONFIG_X86_PAE #define SWITCHER_PMD_INDEX (PTRS_PER_PMD - 1) #define RESERVE_MEM 2U @@ -64,14 +70,18 @@ #define CHECK_GPGD_MASK _PAGE_TABLE #endif -/* We actually need a separate PTE page for each CPU. Remember that after the +/* + * We actually need a separate PTE page for each CPU. Remember that after the * Switcher code itself comes two pages for each CPU, and we don't want this - * CPU's guest to see the pages of any other CPU. */ + * CPU's guest to see the pages of any other CPU. + */ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); #define switcher_pte_page(cpu) per_cpu(switcher_pte_pages, cpu) -/*H:320 The page table code is curly enough to need helper functions to keep it - * clear and clean. +/*H:320 + * The page table code is curly enough to need helper functions to keep it + * clear and clean. The kernel itself provides many of them; one advantage + * of insisting that the Guest and Host use the same CONFIG_PAE setting. * * There are two functions which return pointers to the shadow (aka "real") * page tables. @@ -79,7 +89,8 @@ static DEFINE_PER_CPU(pte_t *, switcher_pte_pages); * spgd_addr() takes the virtual address and returns a pointer to the top-level * page directory entry (PGD) for that address. Since we keep track of several * page tables, the "i" argument tells us which one we're interested in (it's - * usually the current one). */ + * usually the current one). + */ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) { unsigned int index = pgd_index(vaddr); @@ -96,9 +107,11 @@ static pgd_t *spgd_addr(struct lg_cpu *cpu, u32 i, unsigned long vaddr) } #ifdef CONFIG_X86_PAE -/* This routine then takes the PGD entry given above, which contains the +/* + * This routine then takes the PGD entry given above, which contains the * address of the PMD page. It then returns a pointer to the PMD entry for the - * given address. */ + * given address. + */ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { unsigned int index = pmd_index(vaddr); @@ -119,9 +132,11 @@ static pmd_t *spmd_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) } #endif -/* This routine then takes the page directory entry returned above, which +/* + * This routine then takes the page directory entry returned above, which * contains the address of the page table entry (PTE) page. It then returns a - * pointer to the PTE entry for the given address. */ + * pointer to the PTE entry for the given address. + */ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) { #ifdef CONFIG_X86_PAE @@ -139,8 +154,10 @@ static pte_t *spte_addr(struct lg_cpu *cpu, pgd_t spgd, unsigned long vaddr) return &page[pte_index(vaddr)]; } -/* These two functions just like the above two, except they access the Guest - * page tables. Hence they return a Guest address. */ +/* + * These functions are just like the above two, except they access the Guest + * page tables. Hence they return a Guest address. + */ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) { unsigned int index = vaddr >> (PGDIR_SHIFT); @@ -148,6 +165,7 @@ static unsigned long gpgd_addr(struct lg_cpu *cpu, unsigned long vaddr) } #ifdef CONFIG_X86_PAE +/* Follow the PGD to the PMD. */ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) { unsigned long gpage = pgd_pfn(gpgd) << PAGE_SHIFT; @@ -155,6 +173,7 @@ static unsigned long gpmd_addr(pgd_t gpgd, unsigned long vaddr) return gpage + pmd_index(vaddr) * sizeof(pmd_t); } +/* Follow the PMD to the PTE. */ static unsigned long gpte_addr(struct lg_cpu *cpu, pmd_t gpmd, unsigned long vaddr) { @@ -164,6 +183,7 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, return gpage + pte_index(vaddr) * sizeof(pte_t); } #else +/* Follow the PGD to the PTE (no mid-level for !PAE). */ static unsigned long gpte_addr(struct lg_cpu *cpu, pgd_t gpgd, unsigned long vaddr) { @@ -175,17 +195,21 @@ static unsigned long gpte_addr(struct lg_cpu *cpu, #endif /*:*/ -/*M:014 get_pfn is slow: we could probably try to grab batches of pages here as - * an optimization (ie. pre-faulting). :*/ +/*M:014 + * get_pfn is slow: we could probably try to grab batches of pages here as + * an optimization (ie. pre-faulting). +:*/ -/*H:350 This routine takes a page number given by the Guest and converts it to +/*H:350 + * This routine takes a page number given by the Guest and converts it to * an actual, physical page number. It can fail for several reasons: the * virtual address might not be mapped by the Launcher, the write flag is set * and the page is read-only, or the write flag was set and the page was * shared so had to be copied, but we ran out of memory. * * This holds a reference to the page, so release_pte() is careful to put that - * back. */ + * back. + */ static unsigned long get_pfn(unsigned long virtpfn, int write) { struct page *page; @@ -198,33 +222,41 @@ static unsigned long get_pfn(unsigned long virtpfn, int write) return -1UL; } -/*H:340 Converting a Guest page table entry to a shadow (ie. real) page table +/*H:340 + * Converting a Guest page table entry to a shadow (ie. real) page table * entry can be a little tricky. The flags are (almost) the same, but the * Guest PTE contains a virtual page number: the CPU needs the real page - * number. */ + * number. + */ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) { unsigned long pfn, base, flags; - /* The Guest sets the global flag, because it thinks that it is using + /* + * The Guest sets the global flag, because it thinks that it is using * PGE. We only told it to use PGE so it would tell us whether it was * flushing a kernel mapping or a userspace mapping. We don't actually - * use the global bit, so throw it away. */ + * use the global bit, so throw it away. + */ flags = (pte_flags(gpte) & ~_PAGE_GLOBAL); /* The Guest's pages are offset inside the Launcher. */ base = (unsigned long)cpu->lg->mem_base / PAGE_SIZE; - /* We need a temporary "unsigned long" variable to hold the answer from + /* + * We need a temporary "unsigned long" variable to hold the answer from * get_pfn(), because it returns 0xFFFFFFFF on failure, which wouldn't * fit in spte.pfn. get_pfn() finds the real physical number of the - * page, given the virtual number. */ + * page, given the virtual number. + */ pfn = get_pfn(base + pte_pfn(gpte), write); if (pfn == -1UL) { kill_guest(cpu, "failed to get page %lu", pte_pfn(gpte)); - /* When we destroy the Guest, we'll go through the shadow page + /* + * When we destroy the Guest, we'll go through the shadow page * tables and release_pte() them. Make sure we don't think - * this one is valid! */ + * this one is valid! + */ flags = 0; } /* Now we assemble our shadow PTE from the page number and flags. */ @@ -234,8 +266,10 @@ static pte_t gpte_to_spte(struct lg_cpu *cpu, pte_t gpte, int write) /*H:460 And to complete the chain, release_pte() looks like this: */ static void release_pte(pte_t pte) { - /* Remember that get_user_pages_fast() took a reference to the page, in - * get_pfn()? We have to put it back now. */ + /* + * Remember that get_user_pages_fast() took a reference to the page, in + * get_pfn()? We have to put it back now. + */ if (pte_flags(pte) & _PAGE_PRESENT) put_page(pte_page(pte)); } @@ -273,7 +307,8 @@ static void check_gpmd(struct lg_cpu *cpu, pmd_t gpmd) * and return to the Guest without it knowing. * * If we fixed up the fault (ie. we mapped the address), this routine returns - * true. Otherwise, it was a real fault and we need to tell the Guest. */ + * true. Otherwise, it was a real fault and we need to tell the Guest. + */ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) { pgd_t gpgd; @@ -282,6 +317,7 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) pte_t gpte; pte_t *spte; + /* Mid level for PAE. */ #ifdef CONFIG_X86_PAE pmd_t *spmd; pmd_t gpmd; @@ -298,22 +334,26 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if (!(pgd_flags(*spgd) & _PAGE_PRESENT)) { /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; } /* We check that the Guest pgd is OK. */ check_gpgd(cpu, gpgd); - /* And we copy the flags to the shadow PGD entry. The page - * number in the shadow PGD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PGD entry. The page + * number in the shadow PGD is the page we just allocated. + */ set_pgd(spgd, __pgd(__pa(ptepage) | pgd_flags(gpgd))); } #ifdef CONFIG_X86_PAE gpmd = lgread(cpu, gpmd_addr(gpgd, vaddr), pmd_t); - /* middle level not present? We can't map it in. */ + /* Middle level not present? We can't map it in. */ if (!(pmd_flags(gpmd) & _PAGE_PRESENT)) return false; @@ -324,8 +364,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* No shadow entry: allocate a new shadow PTE page. */ unsigned long ptepage = get_zeroed_page(GFP_KERNEL); - /* This is not really the Guest's fault, but killing it is - * simple for this corner case. */ + /* + * This is not really the Guest's fault, but killing it is + * simple for this corner case. + */ if (!ptepage) { kill_guest(cpu, "out of memory allocating pte page"); return false; @@ -334,27 +376,37 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* We check that the Guest pmd is OK. */ check_gpmd(cpu, gpmd); - /* And we copy the flags to the shadow PMD entry. The page - * number in the shadow PMD is the page we just allocated. */ + /* + * And we copy the flags to the shadow PMD entry. The page + * number in the shadow PMD is the page we just allocated. + */ native_set_pmd(spmd, __pmd(__pa(ptepage) | pmd_flags(gpmd))); } - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpmd, vaddr); #else - /* OK, now we look at the lower level in the Guest page table: keep its - * address, because we might update it later. */ + /* + * OK, now we look at the lower level in the Guest page table: keep its + * address, because we might update it later. + */ gpte_ptr = gpte_addr(cpu, gpgd, vaddr); #endif + + /* Read the actual PTE value. */ gpte = lgread(cpu, gpte_ptr, pte_t); /* If this page isn't in the Guest page tables, we can't page it in. */ if (!(pte_flags(gpte) & _PAGE_PRESENT)) return false; - /* Check they're not trying to write to a page the Guest wants - * read-only (bit 2 of errcode == write). */ + /* + * Check they're not trying to write to a page the Guest wants + * read-only (bit 2 of errcode == write). + */ if ((errcode & 2) && !(pte_flags(gpte) & _PAGE_RW)) return false; @@ -362,8 +414,10 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) if ((errcode & 4) && !(pte_flags(gpte) & _PAGE_USER)) return false; - /* Check that the Guest PTE flags are OK, and the page number is below - * the pfn_limit (ie. not mapping the Launcher binary). */ + /* + * Check that the Guest PTE flags are OK, and the page number is below + * the pfn_limit (ie. not mapping the Launcher binary). + */ check_gpte(cpu, gpte); /* Add the _PAGE_ACCESSED and (for a write) _PAGE_DIRTY flag */ @@ -373,29 +427,40 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) /* Get the pointer to the shadow PTE entry we're going to set. */ spte = spte_addr(cpu, *spgd, vaddr); - /* If there was a valid shadow PTE entry here before, we release it. - * This can happen with a write to a previously read-only entry. */ + + /* + * If there was a valid shadow PTE entry here before, we release it. + * This can happen with a write to a previously read-only entry. + */ release_pte(*spte); - /* If this is a write, we insist that the Guest page is writable (the - * final arg to gpte_to_spte()). */ + /* + * If this is a write, we insist that the Guest page is writable (the + * final arg to gpte_to_spte()). + */ if (pte_dirty(gpte)) *spte = gpte_to_spte(cpu, gpte, 1); else - /* If this is a read, don't set the "writable" bit in the page + /* + * If this is a read, don't set the "writable" bit in the page * table entry, even if the Guest says it's writable. That way * we will come back here when a write does actually occur, so - * we can update the Guest's _PAGE_DIRTY flag. */ + * we can update the Guest's _PAGE_DIRTY flag. + */ native_set_pte(spte, gpte_to_spte(cpu, pte_wrprotect(gpte), 0)); - /* Finally, we write the Guest PTE entry back: we've set the - * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. */ + /* + * Finally, we write the Guest PTE entry back: we've set the + * _PAGE_ACCESSED and maybe the _PAGE_DIRTY flags. + */ lgwrite(cpu, gpte_ptr, pte_t, gpte); - /* The fault is fixed, the page table is populated, the mapping + /* + * The fault is fixed, the page table is populated, the mapping * manipulated, the result returned and the code complete. A small * delay and a trace of alliteration are the only indications the Guest - * has that a page fault occurred at all. */ + * has that a page fault occurred at all. + */ return true; } @@ -408,7 +473,8 @@ bool demand_page(struct lg_cpu *cpu, unsigned long vaddr, int errcode) * mapped, so it's overkill. * * This is a quick version which answers the question: is this virtual address - * mapped by the shadow page tables, and is it writable? */ + * mapped by the shadow page tables, and is it writable? + */ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) { pgd_t *spgd; @@ -428,21 +494,26 @@ static bool page_writable(struct lg_cpu *cpu, unsigned long vaddr) return false; #endif - /* Check the flags on the pte entry itself: it must be present and - * writable. */ + /* + * Check the flags on the pte entry itself: it must be present and + * writable. + */ flags = pte_flags(*(spte_addr(cpu, *spgd, vaddr))); return (flags & (_PAGE_PRESENT|_PAGE_RW)) == (_PAGE_PRESENT|_PAGE_RW); } -/* So, when pin_stack_pages() asks us to pin a page, we check if it's already +/* + * So, when pin_stack_pages() asks us to pin a page, we check if it's already * in the page tables, and if not, we call demand_page() with error code 2 - * (meaning "write"). */ + * (meaning "write"). + */ void pin_page(struct lg_cpu *cpu, unsigned long vaddr) { if (!page_writable(cpu, vaddr) && !demand_page(cpu, vaddr, 2)) kill_guest(cpu, "bad stack page %#lx", vaddr); } +/*:*/ #ifdef CONFIG_X86_PAE static void release_pmd(pmd_t *spmd) @@ -479,15 +550,21 @@ static void release_pgd(pgd_t *spgd) } #else /* !CONFIG_X86_PAE */ -/*H:450 If we chase down the release_pgd() code, it looks like this: */ +/*H:450 + * If we chase down the release_pgd() code, the non-PAE version looks like + * this. The PAE version is almost identical, but instead of calling + * release_pte it calls release_pmd(), which looks much like this. + */ static void release_pgd(pgd_t *spgd) { /* If the entry's not present, there's nothing to release. */ if (pgd_flags(*spgd) & _PAGE_PRESENT) { unsigned int i; - /* Converting the pfn to find the actual PTE page is easy: turn + /* + * Converting the pfn to find the actual PTE page is easy: turn * the page number into a physical address, then convert to a - * virtual address (easy for kernel pages like this one). */ + * virtual address (easy for kernel pages like this one). + */ pte_t *ptepage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); /* For each entry in the page, we might need to release it. */ for (i = 0; i < PTRS_PER_PTE; i++) @@ -499,9 +576,12 @@ static void release_pgd(pgd_t *spgd) } } #endif -/*H:445 We saw flush_user_mappings() twice: once from the flush_user_mappings() + +/*H:445 + * We saw flush_user_mappings() twice: once from the flush_user_mappings() * hypercall and once in new_pgdir() when we re-used a top-level pgdir page. - * It simply releases every PTE page from 0 up to the Guest's kernel address. */ + * It simply releases every PTE page from 0 up to the Guest's kernel address. + */ static void flush_user_mappings(struct lguest *lg, int idx) { unsigned int i; @@ -510,10 +590,12 @@ static void flush_user_mappings(struct lguest *lg, int idx) release_pgd(lg->pgdirs[idx].pgdir + i); } -/*H:440 (v) Flushing (throwing away) page tables, +/*H:440 + * (v) Flushing (throwing away) page tables, * * The Guest has a hypercall to throw away the page tables: it's used when a - * large number of mappings have been changed. */ + * large number of mappings have been changed. + */ void guest_pagetable_flush_user(struct lg_cpu *cpu) { /* Drop the userspace part of the current page table. */ @@ -551,9 +633,11 @@ unsigned long guest_pa(struct lg_cpu *cpu, unsigned long vaddr) return pte_pfn(gpte) * PAGE_SIZE | (vaddr & ~PAGE_MASK); } -/* We keep several page tables. This is a simple routine to find the page +/* + * We keep several page tables. This is a simple routine to find the page * table (if any) corresponding to this top-level address the Guest has given - * us. */ + * us. + */ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) { unsigned int i; @@ -563,9 +647,11 @@ static unsigned int find_pgdir(struct lguest *lg, unsigned long pgtable) return i; } -/*H:435 And this is us, creating the new page directory. If we really do +/*H:435 + * And this is us, creating the new page directory. If we really do * allocate a new one (and so the kernel parts are not there), we set - * blank_pgdir. */ + * blank_pgdir. + */ static unsigned int new_pgdir(struct lg_cpu *cpu, unsigned long gpgdir, int *blank_pgdir) @@ -575,8 +661,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, pmd_t *pmd_table; #endif - /* We pick one entry at random to throw out. Choosing the Least - * Recently Used might be better, but this is easy. */ + /* + * We pick one entry at random to throw out. Choosing the Least + * Recently Used might be better, but this is easy. + */ next = random32() % ARRAY_SIZE(cpu->lg->pgdirs); /* If it's never been allocated at all before, try now. */ if (!cpu->lg->pgdirs[next].pgdir) { @@ -587,8 +675,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, next = cpu->cpu_pgd; else { #ifdef CONFIG_X86_PAE - /* In PAE mode, allocate a pmd page and populate the - * last pgd entry. */ + /* + * In PAE mode, allocate a pmd page and populate the + * last pgd entry. + */ pmd_table = (pmd_t *)get_zeroed_page(GFP_KERNEL); if (!pmd_table) { free_page((long)cpu->lg->pgdirs[next].pgdir); @@ -598,8 +688,10 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, set_pgd(cpu->lg->pgdirs[next].pgdir + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); - /* This is a blank page, so there are no kernel - * mappings: caller must map the stack! */ + /* + * This is a blank page, so there are no kernel + * mappings: caller must map the stack! + */ *blank_pgdir = 1; } #else @@ -615,19 +707,23 @@ static unsigned int new_pgdir(struct lg_cpu *cpu, return next; } -/*H:430 (iv) Switching page tables +/*H:430 + * (iv) Switching page tables * * Now we've seen all the page table setting and manipulation, let's see * what happens when the Guest changes page tables (ie. changes the top-level - * pgdir). This occurs on almost every context switch. */ + * pgdir). This occurs on almost every context switch. + */ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) { int newpgdir, repin = 0; /* Look to see if we have this one already. */ newpgdir = find_pgdir(cpu->lg, pgtable); - /* If not, we allocate or mug an existing one: if it's a fresh one, - * repin gets set to 1. */ + /* + * If not, we allocate or mug an existing one: if it's a fresh one, + * repin gets set to 1. + */ if (newpgdir == ARRAY_SIZE(cpu->lg->pgdirs)) newpgdir = new_pgdir(cpu, pgtable, &repin); /* Change the current pgd index to the new one. */ @@ -637,9 +733,11 @@ void guest_new_pagetable(struct lg_cpu *cpu, unsigned long pgtable) pin_stack_pages(cpu); } -/*H:470 Finally, a routine which throws away everything: all PGD entries in all +/*H:470 + * Finally, a routine which throws away everything: all PGD entries in all * the shadow page tables, including the Guest's kernel mappings. This is used - * when we destroy the Guest. */ + * when we destroy the Guest. + */ static void release_all_pagetables(struct lguest *lg) { unsigned int i, j; @@ -656,8 +754,10 @@ static void release_all_pagetables(struct lguest *lg) spgd = lg->pgdirs[i].pgdir + SWITCHER_PGD_INDEX; pmdpage = __va(pgd_pfn(*spgd) << PAGE_SHIFT); - /* And release the pmd entries of that pmd page, - * except for the switcher pmd. */ + /* + * And release the pmd entries of that pmd page, + * except for the switcher pmd. + */ for (k = 0; k < SWITCHER_PMD_INDEX; k++) release_pmd(&pmdpage[k]); #endif @@ -667,10 +767,12 @@ static void release_all_pagetables(struct lguest *lg) } } -/* We also throw away everything when a Guest tells us it's changed a kernel +/* + * We also throw away everything when a Guest tells us it's changed a kernel * mapping. Since kernel mappings are in every page table, it's easiest to * throw them all away. This traps the Guest in amber for a while as - * everything faults back in, but it's rare. */ + * everything faults back in, but it's rare. + */ void guest_pagetable_clear_all(struct lg_cpu *cpu) { release_all_pagetables(cpu->lg); @@ -678,15 +780,19 @@ void guest_pagetable_clear_all(struct lg_cpu *cpu) pin_stack_pages(cpu); } /*:*/ -/*M:009 Since we throw away all mappings when a kernel mapping changes, our + +/*M:009 + * Since we throw away all mappings when a kernel mapping changes, our * performance sucks for guests using highmem. In fact, a guest with * PAGE_OFFSET 0xc0000000 (the default) and more than about 700MB of RAM is * usually slower than a Guest with less memory. * * This, of course, cannot be fixed. It would take some kind of... well, I - * don't know, but the term "puissant code-fu" comes to mind. :*/ + * don't know, but the term "puissant code-fu" comes to mind. +:*/ -/*H:420 This is the routine which actually sets the page table entry for then +/*H:420 + * This is the routine which actually sets the page table entry for then * "idx"'th shadow page table. * * Normally, we can just throw out the old entry and replace it with 0: if they @@ -715,31 +821,36 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, spmd = spmd_addr(cpu, *spgd, vaddr); if (pmd_flags(*spmd) & _PAGE_PRESENT) { #endif - /* Otherwise, we start by releasing - * the existing entry. */ + /* Otherwise, start by releasing the existing entry. */ pte_t *spte = spte_addr(cpu, *spgd, vaddr); release_pte(*spte); - /* If they're setting this entry as dirty or accessed, - * we might as well put that entry they've given us - * in now. This shaves 10% off a - * copy-on-write micro-benchmark. */ + /* + * If they're setting this entry as dirty or accessed, + * we might as well put that entry they've given us in + * now. This shaves 10% off a copy-on-write + * micro-benchmark. + */ if (pte_flags(gpte) & (_PAGE_DIRTY | _PAGE_ACCESSED)) { check_gpte(cpu, gpte); native_set_pte(spte, gpte_to_spte(cpu, gpte, pte_flags(gpte) & _PAGE_DIRTY)); - } else - /* Otherwise kill it and we can demand_page() - * it in later. */ + } else { + /* + * Otherwise kill it and we can demand_page() + * it in later. + */ native_set_pte(spte, __pte(0)); + } #ifdef CONFIG_X86_PAE } #endif } } -/*H:410 Updating a PTE entry is a little trickier. +/*H:410 + * Updating a PTE entry is a little trickier. * * We keep track of several different page tables (the Guest uses one for each * process, so it makes sense to cache at least a few). Each of these have @@ -748,12 +859,15 @@ static void do_set_pte(struct lg_cpu *cpu, int idx, * all the page tables, not just the current one. This is rare. * * The benefit is that when we have to track a new page table, we can keep all - * the kernel mappings. This speeds up context switch immensely. */ + * the kernel mappings. This speeds up context switch immensely. + */ void guest_set_pte(struct lg_cpu *cpu, unsigned long gpgdir, unsigned long vaddr, pte_t gpte) { - /* Kernel mappings must be changed on all top levels. Slow, but doesn't - * happen often. */ + /* + * Kernel mappings must be changed on all top levels. Slow, but doesn't + * happen often. + */ if (vaddr >= cpu->lg->kernel_address) { unsigned int i; for (i = 0; i < ARRAY_SIZE(cpu->lg->pgdirs); i++) @@ -795,19 +909,25 @@ void guest_set_pgd(struct lguest *lg, unsigned long gpgdir, u32 idx) /* ... throw it away. */ release_pgd(lg->pgdirs[pgdir].pgdir + idx); } + #ifdef CONFIG_X86_PAE +/* For setting a mid-level, we just throw everything away. It's easy. */ void guest_set_pmd(struct lguest *lg, unsigned long pmdp, u32 idx) { guest_pagetable_clear_all(&lg->cpus[0]); } #endif -/* Once we know how much memory we have we can construct simple identity - * (which set virtual == physical) and linear mappings - * which will get the Guest far enough into the boot to create its own. +/*H:505 + * To get through boot, we construct simple identity page mappings (which + * set virtual == physical) and linear mappings which will get the Guest far + * enough into the boot to create its own. The linear mapping means we + * simplify the Guest boot, but it makes assumptions about their PAGE_OFFSET, + * as you'll see. * * We lay them out of the way, just below the initrd (which is why we need to - * know its size here). */ + * know its size here). + */ static unsigned long setup_pagetables(struct lguest *lg, unsigned long mem, unsigned long initrd_size) @@ -825,8 +945,10 @@ static unsigned long setup_pagetables(struct lguest *lg, unsigned int phys_linear; #endif - /* We have mapped_pages frames to map, so we need - * linear_pages page tables to map them. */ + /* + * We have mapped_pages frames to map, so we need linear_pages page + * tables to map them. + */ mapped_pages = mem / PAGE_SIZE; linear_pages = (mapped_pages + PTRS_PER_PTE - 1) / PTRS_PER_PTE; @@ -837,10 +959,16 @@ static unsigned long setup_pagetables(struct lguest *lg, linear = (void *)pgdir - linear_pages * PAGE_SIZE; #ifdef CONFIG_X86_PAE + /* + * And the single mid page goes below that. We only use one, but + * that's enough to map 1G, which definitely gets us through boot. + */ pmds = (void *)linear - PAGE_SIZE; #endif - /* Linear mapping is easy: put every page's address into the - * mapping in order. */ + /* + * Linear mapping is easy: put every page's address into the + * mapping in order. + */ for (i = 0; i < mapped_pages; i++) { pte_t pte; pte = pfn_pte(i, __pgprot(_PAGE_PRESENT|_PAGE_RW|_PAGE_USER)); @@ -848,11 +976,14 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } - /* The top level points to the linear page table pages above. - * We setup the identity and linear mappings here. */ #ifdef CONFIG_X86_PAE + /* + * Make the Guest PMD entries point to the corresponding place in the + * linear mapping (up to one page worth of PMD). + */ for (i = j = 0; i < mapped_pages && j < PTRS_PER_PMD; i += PTRS_PER_PTE, j++) { + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&pmd, __pmd(((unsigned long)(linear + i) - mem_base) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); @@ -860,18 +991,36 @@ static unsigned long setup_pagetables(struct lguest *lg, return -EFAULT; } + /* One PGD entry, pointing to that PMD page. */ set_pgd(&pgd, __pgd(((u32)pmds - mem_base) | _PAGE_PRESENT)); + /* Copy it in as the first PGD entry (ie. addresses 0-1G). */ if (copy_to_user(&pgdir[0], &pgd, sizeof(pgd)) != 0) return -EFAULT; + /* + * And the third PGD entry (ie. addresses 3G-4G). + * + * FIXME: This assumes that PAGE_OFFSET for the Guest is 0xC0000000. + */ if (copy_to_user(&pgdir[3], &pgd, sizeof(pgd)) != 0) return -EFAULT; #else + /* + * The top level points to the linear page table pages above. + * We setup the identity and linear mappings here. + */ phys_linear = (unsigned long)linear - mem_base; for (i = 0; i < mapped_pages; i += PTRS_PER_PTE) { pgd_t pgd; + /* + * Create a PGD entry which points to the right part of the + * linear PTE pages. + */ pgd = __pgd((phys_linear + i * sizeof(pte_t)) | (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER)); + /* + * Copy it into the PGD page at 0 and PAGE_OFFSET. + */ if (copy_to_user(&pgdir[i / PTRS_PER_PTE], &pgd, sizeof(pgd)) || copy_to_user(&pgdir[pgd_index(PAGE_OFFSET) + i / PTRS_PER_PTE], @@ -880,15 +1029,19 @@ static unsigned long setup_pagetables(struct lguest *lg, } #endif - /* We return the top level (guest-physical) address: remember where - * this is. */ + /* + * We return the top level (guest-physical) address: we remember where + * this is to write it into lguest_data when the Guest initializes. + */ return (unsigned long)pgdir - mem_base; } -/*H:500 (vii) Setting up the page tables initially. +/*H:500 + * (vii) Setting up the page tables initially. * * When a Guest is first created, the Launcher tells us where the toplevel of - * its first page table is. We set some things up here: */ + * its first page table is. We set some things up here: + */ int init_guest_pagetable(struct lguest *lg) { u64 mem; @@ -898,21 +1051,27 @@ int init_guest_pagetable(struct lguest *lg) pgd_t *pgd; pmd_t *pmd_table; #endif - /* Get the Guest memory size and the ramdisk size from the boot header - * located at lg->mem_base (Guest address 0). */ + /* + * Get the Guest memory size and the ramdisk size from the boot header + * located at lg->mem_base (Guest address 0). + */ if (copy_from_user(&mem, &boot->e820_map[0].size, sizeof(mem)) || get_user(initrd_size, &boot->hdr.ramdisk_size)) return -EFAULT; - /* We start on the first shadow page table, and give it a blank PGD - * page. */ + /* + * We start on the first shadow page table, and give it a blank PGD + * page. + */ lg->pgdirs[0].gpgdir = setup_pagetables(lg, mem, initrd_size); if (IS_ERR_VALUE(lg->pgdirs[0].gpgdir)) return lg->pgdirs[0].gpgdir; lg->pgdirs[0].pgdir = (pgd_t *)get_zeroed_page(GFP_KERNEL); if (!lg->pgdirs[0].pgdir) return -ENOMEM; + #ifdef CONFIG_X86_PAE + /* For PAE, we also create the initial mid-level. */ pgd = lg->pgdirs[0].pgdir; pmd_table = (pmd_t *) get_zeroed_page(GFP_KERNEL); if (!pmd_table) @@ -921,27 +1080,33 @@ int init_guest_pagetable(struct lguest *lg) set_pgd(pgd + SWITCHER_PGD_INDEX, __pgd(__pa(pmd_table) | _PAGE_PRESENT)); #endif + + /* This is the current page table. */ lg->cpus[0].cpu_pgd = 0; return 0; } -/* When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ +/*H:508 When the Guest calls LHCALL_LGUEST_INIT we do more setup. */ void page_table_guest_data_init(struct lg_cpu *cpu) { /* We get the kernel address: above this is all kernel memory. */ if (get_user(cpu->lg->kernel_address, &cpu->lg->lguest_data->kernel_address) - /* We tell the Guest that it can't use the top 2 or 4 MB - * of virtual addresses used by the Switcher. */ + /* + * We tell the Guest that it can't use the top 2 or 4 MB + * of virtual addresses used by the Switcher. + */ || put_user(RESERVE_MEM * 1024 * 1024, &cpu->lg->lguest_data->reserve_mem) || put_user(cpu->lg->pgdirs[0].gpgdir, &cpu->lg->lguest_data->pgdir)) kill_guest(cpu, "bad guest page %p", cpu->lg->lguest_data); - /* In flush_user_mappings() we loop from 0 to + /* + * In flush_user_mappings() we loop from 0 to * "pgd_index(lg->kernel_address)". This assumes it won't hit the - * Switcher mappings, so check that now. */ + * Switcher mappings, so check that now. + */ #ifdef CONFIG_X86_PAE if (pgd_index(cpu->lg->kernel_address) == SWITCHER_PGD_INDEX && pmd_index(cpu->lg->kernel_address) == SWITCHER_PMD_INDEX) @@ -964,12 +1129,14 @@ void free_guest_pagetable(struct lguest *lg) free_page((long)lg->pgdirs[i].pgdir); } -/*H:480 (vi) Mapping the Switcher when the Guest is about to run. +/*H:480 + * (vi) Mapping the Switcher when the Guest is about to run. * * The Switcher and the two pages for this CPU need to be visible in the * Guest (and not the pages for other CPUs). We have the appropriate PTE pages * for each CPU already set up, we just need to hook them in now we know which - * Guest is about to run on this CPU. */ + * Guest is about to run on this CPU. + */ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) { pte_t *switcher_pte_page = __get_cpu_var(switcher_pte_pages); @@ -980,30 +1147,38 @@ void map_switcher_in_guest(struct lg_cpu *cpu, struct lguest_pages *pages) pmd_t switcher_pmd; pmd_t *pmd_table; + /* FIXME: native_set_pmd is overkill here. */ native_set_pmd(&switcher_pmd, pfn_pmd(__pa(switcher_pte_page) >> PAGE_SHIFT, PAGE_KERNEL_EXEC)); + /* Figure out where the pmd page is, by reading the PGD, and converting + * it to a virtual address. */ pmd_table = __va(pgd_pfn(cpu->lg-> pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX]) << PAGE_SHIFT); + /* Now write it into the shadow page table. */ native_set_pmd(&pmd_table[SWITCHER_PMD_INDEX], switcher_pmd); #else pgd_t switcher_pgd; - /* Make the last PGD entry for this Guest point to the Switcher's PTE - * page for this CPU (with appropriate flags). */ + /* + * Make the last PGD entry for this Guest point to the Switcher's PTE + * page for this CPU (with appropriate flags). + */ switcher_pgd = __pgd(__pa(switcher_pte_page) | __PAGE_KERNEL_EXEC); cpu->lg->pgdirs[cpu->cpu_pgd].pgdir[SWITCHER_PGD_INDEX] = switcher_pgd; #endif - /* We also change the Switcher PTE page. When we're running the Guest, + /* + * We also change the Switcher PTE page. When we're running the Guest, * we want the Guest's "regs" page to appear where the first Switcher * page for this CPU is. This is an optimization: when the Switcher * saves the Guest registers, it saves them into the first page of this * CPU's "struct lguest_pages": if we make sure the Guest's register * page is already mapped there, we don't have to copy them out - * again. */ + * again. + */ pfn = __pa(cpu->regs_page) >> PAGE_SHIFT; native_set_pte(®s_pte, pfn_pte(pfn, PAGE_KERNEL)); native_set_pte(&switcher_pte_page[pte_index((unsigned long)pages)], @@ -1019,10 +1194,12 @@ static void free_switcher_pte_pages(void) free_page((long)switcher_pte_page(i)); } -/*H:520 Setting up the Switcher PTE page for given CPU is fairly easy, given +/*H:520 + * Setting up the Switcher PTE page for given CPU is fairly easy, given * the CPU number and the "struct page"s for the Switcher code itself. * - * Currently the Switcher is less than a page long, so "pages" is always 1. */ + * Currently the Switcher is less than a page long, so "pages" is always 1. + */ static __init void populate_switcher_pte_page(unsigned int cpu, struct page *switcher_page[], unsigned int pages) @@ -1043,13 +1220,16 @@ static __init void populate_switcher_pte_page(unsigned int cpu, native_set_pte(&pte[i], pfn_pte(page_to_pfn(switcher_page[i]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED|_PAGE_RW))); - /* The second page contains the "struct lguest_ro_state", and is - * read-only. */ + /* + * The second page contains the "struct lguest_ro_state", and is + * read-only. + */ native_set_pte(&pte[i+1], pfn_pte(page_to_pfn(switcher_page[i+1]), __pgprot(_PAGE_PRESENT|_PAGE_ACCESSED))); } -/* We've made it through the page table code. Perhaps our tired brains are +/* + * We've made it through the page table code. Perhaps our tired brains are * still processing the details, or perhaps we're simply glad it's over. * * If nothing else, note that all this complexity in juggling shadow page tables @@ -1058,10 +1238,13 @@ static __init void populate_switcher_pte_page(unsigned int cpu, * uses exotic direct Guest pagetable manipulation, and why both Intel and AMD * have implemented shadow page table support directly into hardware. * - * There is just one file remaining in the Host. */ + * There is just one file remaining in the Host. + */ -/*H:510 At boot or module load time, init_pagetables() allocates and populates - * the Switcher PTE page for each CPU. */ +/*H:510 + * At boot or module load time, init_pagetables() allocates and populates + * the Switcher PTE page for each CPU. + */ __init int init_pagetables(struct page **switcher_page, unsigned int pages) { unsigned int i; diff --git a/drivers/lguest/segments.c b/drivers/lguest/segments.c index 482ed5a1875..951c57b0a7e 100644 --- a/drivers/lguest/segments.c +++ b/drivers/lguest/segments.c @@ -1,4 +1,5 @@ -/*P:600 The x86 architecture has segments, which involve a table of descriptors +/*P:600 + * The x86 architecture has segments, which involve a table of descriptors * which can be used to do funky things with virtual address interpretation. * We originally used to use segments so the Guest couldn't alter the * Guest<->Host Switcher, and then we had to trim Guest segments, and restore @@ -8,7 +9,8 @@ * * In these modern times, the segment handling code consists of simple sanity * checks, and the worst you'll experience reading this code is butterfly-rash - * from frolicking through its parklike serenity. :*/ + * from frolicking through its parklike serenity. +:*/ #include "lg.h" /*H:600 @@ -41,10 +43,12 @@ * begin. */ -/* There are several entries we don't let the Guest set. The TSS entry is the +/* + * There are several entries we don't let the Guest set. The TSS entry is the * "Task State Segment" which controls all kinds of delicate things. The * LGUEST_CS and LGUEST_DS entries are reserved for the Switcher, and the - * the Guest can't be trusted to deal with double faults. */ + * the Guest can't be trusted to deal with double faults. + */ static bool ignored_gdt(unsigned int num) { return (num == GDT_ENTRY_TSS @@ -53,42 +57,52 @@ static bool ignored_gdt(unsigned int num) || num == GDT_ENTRY_DOUBLEFAULT_TSS); } -/*H:630 Once the Guest gave us new GDT entries, we fix them up a little. We +/*H:630 + * Once the Guest gave us new GDT entries, we fix them up a little. We * don't care if they're invalid: the worst that can happen is a General * Protection Fault in the Switcher when it restores a Guest segment register * which tries to use that entry. Then we kill the Guest for causing such a - * mess: the message will be "unhandled trap 256". */ + * mess: the message will be "unhandled trap 256". + */ static void fixup_gdt_table(struct lg_cpu *cpu, unsigned start, unsigned end) { unsigned int i; for (i = start; i < end; i++) { - /* We never copy these ones to real GDT, so we don't care what - * they say */ + /* + * We never copy these ones to real GDT, so we don't care what + * they say + */ if (ignored_gdt(i)) continue; - /* Segment descriptors contain a privilege level: the Guest is + /* + * Segment descriptors contain a privilege level: the Guest is * sometimes careless and leaves this as 0, even though it's - * running at privilege level 1. If so, we fix it here. */ + * running at privilege level 1. If so, we fix it here. + */ if ((cpu->arch.gdt[i].b & 0x00006000) == 0) cpu->arch.gdt[i].b |= (GUEST_PL << 13); - /* Each descriptor has an "accessed" bit. If we don't set it + /* + * Each descriptor has an "accessed" bit. If we don't set it * now, the CPU will try to set it when the Guest first loads * that entry into a segment register. But the GDT isn't - * writable by the Guest, so bad things can happen. */ + * writable by the Guest, so bad things can happen. + */ cpu->arch.gdt[i].b |= 0x00000100; } } -/*H:610 Like the IDT, we never simply use the GDT the Guest gives us. We keep +/*H:610 + * Like the IDT, we never simply use the GDT the Guest gives us. We keep * a GDT for each CPU, and copy across the Guest's entries each time we want to * run the Guest on that CPU. * * This routine is called at boot or modprobe time for each CPU to set up the * constant GDT entries: the ones which are the same no matter what Guest we're - * running. */ + * running. + */ void setup_default_gdt_entries(struct lguest_ro_state *state) { struct desc_struct *gdt = state->guest_gdt; @@ -98,30 +112,37 @@ void setup_default_gdt_entries(struct lguest_ro_state *state) gdt[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; gdt[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; - /* The TSS segment refers to the TSS entry for this particular CPU. + /* + * The TSS segment refers to the TSS entry for this particular CPU. * Forgive the magic flags: the 0x8900 means the entry is Present, it's * privilege level 0 Available 386 TSS system segment, and the 0x67 - * means Saturn is eclipsed by Mercury in the twelfth house. */ + * means Saturn is eclipsed by Mercury in the twelfth house. + */ gdt[GDT_ENTRY_TSS].a = 0x00000067 | (tss << 16); gdt[GDT_ENTRY_TSS].b = 0x00008900 | (tss & 0xFF000000) | ((tss >> 16) & 0x000000FF); } -/* This routine sets up the initial Guest GDT for booting. All entries start - * as 0 (unusable). */ +/* + * This routine sets up the initial Guest GDT for booting. All entries start + * as 0 (unusable). + */ void setup_guest_gdt(struct lg_cpu *cpu) { - /* Start with full 0-4G segments... */ + /* + * Start with full 0-4G segments...except the Guest is allowed to use + * them, so set the privilege level appropriately in the flags. + */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS] = FULL_EXEC_SEGMENT; cpu->arch.gdt[GDT_ENTRY_KERNEL_DS] = FULL_SEGMENT; - /* ...except the Guest is allowed to use them, so set the privilege - * level appropriately in the flags. */ cpu->arch.gdt[GDT_ENTRY_KERNEL_CS].b |= (GUEST_PL << 13); cpu->arch.gdt[GDT_ENTRY_KERNEL_DS].b |= (GUEST_PL << 13); } -/*H:650 An optimization of copy_gdt(), for just the three "thead-local storage" - * entries. */ +/*H:650 + * An optimization of copy_gdt(), for just the three "thead-local storage" + * entries. + */ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; @@ -130,26 +151,34 @@ void copy_gdt_tls(const struct lg_cpu *cpu, struct desc_struct *gdt) gdt[i] = cpu->arch.gdt[i]; } -/*H:640 When the Guest is run on a different CPU, or the GDT entries have - * changed, copy_gdt() is called to copy the Guest's GDT entries across to this - * CPU's GDT. */ +/*H:640 + * When the Guest is run on a different CPU, or the GDT entries have changed, + * copy_gdt() is called to copy the Guest's GDT entries across to this CPU's + * GDT. + */ void copy_gdt(const struct lg_cpu *cpu, struct desc_struct *gdt) { unsigned int i; - /* The default entries from setup_default_gdt_entries() are not - * replaced. See ignored_gdt() above. */ + /* + * The default entries from setup_default_gdt_entries() are not + * replaced. See ignored_gdt() above. + */ for (i = 0; i < GDT_ENTRIES; i++) if (!ignored_gdt(i)) gdt[i] = cpu->arch.gdt[i]; } -/*H:620 This is where the Guest asks us to load a new GDT entry - * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. */ +/*H:620 + * This is where the Guest asks us to load a new GDT entry + * (LHCALL_LOAD_GDT_ENTRY). We tweak the entry and copy it in. + */ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) { - /* We assume the Guest has the same number of GDT entries as the - * Host, otherwise we'd have to dynamically allocate the Guest GDT. */ + /* + * We assume the Guest has the same number of GDT entries as the + * Host, otherwise we'd have to dynamically allocate the Guest GDT. + */ if (num >= ARRAY_SIZE(cpu->arch.gdt)) kill_guest(cpu, "too many gdt entries %i", num); @@ -157,15 +186,19 @@ void load_guest_gdt_entry(struct lg_cpu *cpu, u32 num, u32 lo, u32 hi) cpu->arch.gdt[num].a = lo; cpu->arch.gdt[num].b = hi; fixup_gdt_table(cpu, num, num+1); - /* Mark that the GDT changed so the core knows it has to copy it again, - * even if the Guest is run on the same CPU. */ + /* + * Mark that the GDT changed so the core knows it has to copy it again, + * even if the Guest is run on the same CPU. + */ cpu->changed |= CHANGED_GDT; } -/* This is the fast-track version for just changing the three TLS entries. +/* + * This is the fast-track version for just changing the three TLS entries. * Remember that this happens on every context switch, so it's worth * optimizing. But wouldn't it be neater to have a single hypercall to cover - * both cases? */ + * both cases? + */ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) { struct desc_struct *tls = &cpu->arch.gdt[GDT_ENTRY_TLS_MIN]; @@ -175,7 +208,6 @@ void guest_load_tls(struct lg_cpu *cpu, unsigned long gtls) /* Note that just the TLS entries have changed. */ cpu->changed |= CHANGED_GDT_TLS; } -/*:*/ /*H:660 * With this, we have finished the Host. diff --git a/drivers/lguest/x86/core.c b/drivers/lguest/x86/core.c index eaf722fe309..6ae388849a3 100644 --- a/drivers/lguest/x86/core.c +++ b/drivers/lguest/x86/core.c @@ -17,13 +17,15 @@ * along with this program; if not, write to the Free Software * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. */ -/*P:450 This file contains the x86-specific lguest code. It used to be all +/*P:450 + * This file contains the x86-specific lguest code. It used to be all * mixed in with drivers/lguest/core.c but several foolhardy code slashers * wrestled most of the dependencies out to here in preparation for porting * lguest to other architectures (see what I mean by foolhardy?). * * This also contains a couple of non-obvious setup and teardown pieces which - * were implemented after days of debugging pain. :*/ + * were implemented after days of debugging pain. +:*/ #include <linux/kernel.h> #include <linux/start_kernel.h> #include <linux/string.h> @@ -82,25 +84,33 @@ static DEFINE_PER_CPU(struct lg_cpu *, last_cpu); */ static void copy_in_guest_info(struct lg_cpu *cpu, struct lguest_pages *pages) { - /* Copying all this data can be quite expensive. We usually run the + /* + * Copying all this data can be quite expensive. We usually run the * same Guest we ran last time (and that Guest hasn't run anywhere else * meanwhile). If that's not the case, we pretend everything in the - * Guest has changed. */ + * Guest has changed. + */ if (__get_cpu_var(last_cpu) != cpu || cpu->last_pages != pages) { __get_cpu_var(last_cpu) = cpu; cpu->last_pages = pages; cpu->changed = CHANGED_ALL; } - /* These copies are pretty cheap, so we do them unconditionally: */ - /* Save the current Host top-level page directory. */ + /* + * These copies are pretty cheap, so we do them unconditionally: */ + /* Save the current Host top-level page directory. + */ pages->state.host_cr3 = __pa(current->mm->pgd); - /* Set up the Guest's page tables to see this CPU's pages (and no - * other CPU's pages). */ + /* + * Set up the Guest's page tables to see this CPU's pages (and no + * other CPU's pages). + */ map_switcher_in_guest(cpu, pages); - /* Set up the two "TSS" members which tell the CPU what stack to use + /* + * Set up the two "TSS" members which tell the CPU what stack to use * for traps which do directly into the Guest (ie. traps at privilege - * level 1). */ + * level 1). + */ pages->state.guest_tss.sp1 = cpu->esp1; pages->state.guest_tss.ss1 = cpu->ss1; @@ -125,97 +135,126 @@ static void run_guest_once(struct lg_cpu *cpu, struct lguest_pages *pages) /* This is a dummy value we need for GCC's sake. */ unsigned int clobber; - /* Copy the guest-specific information into this CPU's "struct - * lguest_pages". */ + /* + * Copy the guest-specific information into this CPU's "struct + * lguest_pages". + */ copy_in_guest_info(cpu, pages); - /* Set the trap number to 256 (impossible value). If we fault while + /* + * Set the trap number to 256 (impossible value). If we fault while * switching to the Guest (bad segment registers or bug), this will - * cause us to abort the Guest. */ + * cause us to abort the Guest. + */ cpu->regs->trapnum = 256; - /* Now: we push the "eflags" register on the stack, then do an "lcall". + /* + * Now: we push the "eflags" register on the stack, then do an "lcall". * This is how we change from using the kernel code segment to using * the dedicated lguest code segment, as well as jumping into the * Switcher. * * The lcall also pushes the old code segment (KERNEL_CS) onto the * stack, then the address of this call. This stack layout happens to - * exactly match the stack layout created by an interrupt... */ + * exactly match the stack layout created by an interrupt... + */ asm volatile("pushf; lcall *lguest_entry" - /* This is how we tell GCC that %eax ("a") and %ebx ("b") - * are changed by this routine. The "=" means output. */ + /* + * This is how we tell GCC that %eax ("a") and %ebx ("b") + * are changed by this routine. The "=" means output. + */ : "=a"(clobber), "=b"(clobber) - /* %eax contains the pages pointer. ("0" refers to the + /* + * %eax contains the pages pointer. ("0" refers to the * 0-th argument above, ie "a"). %ebx contains the * physical address of the Guest's top-level page - * directory. */ + * directory. + */ : "0"(pages), "1"(__pa(cpu->lg->pgdirs[cpu->cpu_pgd].pgdir)) - /* We tell gcc that all these registers could change, + /* + * We tell gcc that all these registers could change, * which means we don't have to save and restore them in - * the Switcher. */ + * the Switcher. + */ : "memory", "%edx", "%ecx", "%edi", "%esi"); } /*:*/ -/*M:002 There are hooks in the scheduler which we can register to tell when we +/*M:002 + * There are hooks in the scheduler which we can register to tell when we * get kicked off the CPU (preempt_notifier_register()). This would allow us * to lazily disable SYSENTER which would regain some performance, and should * also simplify copy_in_guest_info(). Note that we'd still need to restore * things when we exit to Launcher userspace, but that's fairly easy. * - * We could also try using this hooks for PGE, but that might be too expensive. + * We could also try using these hooks for PGE, but that might be too expensive. * - * The hooks were designed for KVM, but we can also put them to good use. :*/ + * The hooks were designed for KVM, but we can also put them to good use. +:*/ -/*H:040 This is the i386-specific code to setup and run the Guest. Interrupts - * are disabled: we own the CPU. */ +/*H:040 + * This is the i386-specific code to setup and run the Guest. Interrupts + * are disabled: we own the CPU. + */ void lguest_arch_run_guest(struct lg_cpu *cpu) { - /* Remember the awfully-named TS bit? If the Guest has asked to set it + /* + * Remember the awfully-named TS bit? If the Guest has asked to set it * we set it now, so we can trap and pass that trap to the Guest if it - * uses the FPU. */ + * uses the FPU. + */ if (cpu->ts) unlazy_fpu(current); - /* SYSENTER is an optimized way of doing system calls. We can't allow + /* + * SYSENTER is an optimized way of doing system calls. We can't allow * it because it always jumps to privilege level 0. A normal Guest * won't try it because we don't advertise it in CPUID, but a malicious * Guest (or malicious Guest userspace program) could, so we tell the - * CPU to disable it before running the Guest. */ + * CPU to disable it before running the Guest. + */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, 0, 0); - /* Now we actually run the Guest. It will return when something + /* + * Now we actually run the Guest. It will return when something * interesting happens, and we can examine its registers to see what it - * was doing. */ + * was doing. + */ run_guest_once(cpu, lguest_pages(raw_smp_processor_id())); - /* Note that the "regs" structure contains two extra entries which are + /* + * Note that the "regs" structure contains two extra entries which are * not really registers: a trap number which says what interrupt or * trap made the switcher code come back, and an error code which some - * traps set. */ + * traps set. + */ /* Restore SYSENTER if it's supposed to be on. */ if (boot_cpu_has(X86_FEATURE_SEP)) wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); - /* If the Guest page faulted, then the cr2 register will tell us the + /* + * If the Guest page faulted, then the cr2 register will tell us the * bad virtual address. We have to grab this now, because once we * re-enable interrupts an interrupt could fault and thus overwrite - * cr2, or we could even move off to a different CPU. */ + * cr2, or we could even move off to a different CPU. + */ if (cpu->regs->trapnum == 14) cpu->arch.last_pagefault = read_cr2(); - /* Similarly, if we took a trap because the Guest used the FPU, + /* + * Similarly, if we took a trap because the Guest used the FPU, * we have to restore the FPU it expects to see. * math_state_restore() may sleep and we may even move off to * a different CPU. So all the critical stuff should be done - * before this. */ + * before this. + */ else if (cpu->regs->trapnum == 7) math_state_restore(); } -/*H:130 Now we've examined the hypercall code; our Guest can make requests. +/*H:130 + * Now we've examined the hypercall code; our Guest can make requests. * Our Guest is usually so well behaved; it never tries to do things it isn't * allowed to, and uses hypercalls instead. Unfortunately, Linux's paravirtual * infrastructure isn't quite complete, because it doesn't contain replacements @@ -225,26 +264,33 @@ void lguest_arch_run_guest(struct lg_cpu *cpu) * * When the Guest uses one of these instructions, we get a trap (General * Protection Fault) and come here. We see if it's one of those troublesome - * instructions and skip over it. We return true if we did. */ + * instructions and skip over it. We return true if we did. + */ static int emulate_insn(struct lg_cpu *cpu) { u8 insn; unsigned int insnlen = 0, in = 0, shift = 0; - /* The eip contains the *virtual* address of the Guest's instruction: - * guest_pa just subtracts the Guest's page_offset. */ + /* + * The eip contains the *virtual* address of the Guest's instruction: + * guest_pa just subtracts the Guest's page_offset. + */ unsigned long physaddr = guest_pa(cpu, cpu->regs->eip); - /* This must be the Guest kernel trying to do something, not userspace! + /* + * This must be the Guest kernel trying to do something, not userspace! * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return 0; /* Decoding x86 instructions is icky. */ insn = lgread(cpu, physaddr, u8); - /* 0x66 is an "operand prefix". It means it's using the upper 16 bits - of the eax register. */ + /* + * 0x66 is an "operand prefix". It means it's using the upper 16 bits + * of the eax register. + */ if (insn == 0x66) { shift = 16; /* The instruction is 1 byte so far, read the next byte. */ @@ -252,8 +298,10 @@ static int emulate_insn(struct lg_cpu *cpu) insn = lgread(cpu, physaddr + insnlen, u8); } - /* We can ignore the lower bit for the moment and decode the 4 opcodes - * we need to emulate. */ + /* + * We can ignore the lower bit for the moment and decode the 4 opcodes + * we need to emulate. + */ switch (insn & 0xFE) { case 0xE4: /* in <next byte>,%al */ insnlen += 2; @@ -274,9 +322,11 @@ static int emulate_insn(struct lg_cpu *cpu) return 0; } - /* If it was an "IN" instruction, they expect the result to be read + /* + * If it was an "IN" instruction, they expect the result to be read * into %eax, so we change %eax. We always return all-ones, which - * traditionally means "there's nothing there". */ + * traditionally means "there's nothing there". + */ if (in) { /* Lower bit tells is whether it's a 16 or 32 bit access */ if (insn & 0x1) @@ -290,7 +340,8 @@ static int emulate_insn(struct lg_cpu *cpu) return 1; } -/* Our hypercalls mechanism used to be based on direct software interrupts. +/* + * Our hypercalls mechanism used to be based on direct software interrupts. * After Anthony's "Refactor hypercall infrastructure" kvm patch, we decided to * change over to using kvm hypercalls. * @@ -318,16 +369,20 @@ static int emulate_insn(struct lg_cpu *cpu) */ static void rewrite_hypercall(struct lg_cpu *cpu) { - /* This are the opcodes we use to patch the Guest. The opcode for "int + /* + * This are the opcodes we use to patch the Guest. The opcode for "int * $0x1f" is "0xcd 0x1f" but vmcall instruction is 3 bytes long, so we - * complete the sequence with a NOP (0x90). */ + * complete the sequence with a NOP (0x90). + */ u8 insn[3] = {0xcd, 0x1f, 0x90}; __lgwrite(cpu, guest_pa(cpu, cpu->regs->eip), insn, sizeof(insn)); - /* The above write might have caused a copy of that page to be made + /* + * The above write might have caused a copy of that page to be made * (if it was read-only). We need to make sure the Guest has * up-to-date pagetables. As this doesn't happen often, we can just - * drop them all. */ + * drop them all. + */ guest_pagetable_clear_all(cpu); } @@ -335,9 +390,11 @@ static bool is_hypercall(struct lg_cpu *cpu) { u8 insn[3]; - /* This must be the Guest kernel trying to do something. + /* + * This must be the Guest kernel trying to do something. * The bottom two bits of the CS segment register are the privilege - * level. */ + * level. + */ if ((cpu->regs->cs & 3) != GUEST_PL) return false; @@ -351,86 +408,105 @@ void lguest_arch_handle_trap(struct lg_cpu *cpu) { switch (cpu->regs->trapnum) { case 13: /* We've intercepted a General Protection Fault. */ - /* Check if this was one of those annoying IN or OUT + /* + * Check if this was one of those annoying IN or OUT * instructions which we need to emulate. If so, we just go - * back into the Guest after we've done it. */ + * back into the Guest after we've done it. + */ if (cpu->regs->errcode == 0) { if (emulate_insn(cpu)) return; } - /* If KVM is active, the vmcall instruction triggers a - * General Protection Fault. Normally it triggers an - * invalid opcode fault (6): */ + /* + * If KVM is active, the vmcall instruction triggers a General + * Protection Fault. Normally it triggers an invalid opcode + * fault (6): + */ case 6: - /* We need to check if ring == GUEST_PL and - * faulting instruction == vmcall. */ + /* + * We need to check if ring == GUEST_PL and faulting + * instruction == vmcall. + */ if (is_hypercall(cpu)) { rewrite_hypercall(cpu); return; } break; case 14: /* We've intercepted a Page Fault. */ - /* The Guest accessed a virtual address that wasn't mapped. + /* + * The Guest accessed a virtual address that wasn't mapped. * This happens a lot: we don't actually set up most of the page * tables for the Guest at all when we start: as it runs it asks * for more and more, and we set them up as required. In this * case, we don't even tell the Guest that the fault happened. * * The errcode tells whether this was a read or a write, and - * whether kernel or userspace code. */ + * whether kernel or userspace code. + */ if (demand_page(cpu, cpu->arch.last_pagefault, cpu->regs->errcode)) return; - /* OK, it's really not there (or not OK): the Guest needs to + /* + * OK, it's really not there (or not OK): the Guest needs to * know. We write out the cr2 value so it knows where the * fault occurred. * * Note that if the Guest were really messed up, this could * happen before it's done the LHCALL_LGUEST_INIT hypercall, so - * lg->lguest_data could be NULL */ + * lg->lguest_data could be NULL + */ if (cpu->lg->lguest_data && put_user(cpu->arch.last_pagefault, &cpu->lg->lguest_data->cr2)) kill_guest(cpu, "Writing cr2"); break; case 7: /* We've intercepted a Device Not Available fault. */ - /* If the Guest doesn't want to know, we already restored the - * Floating Point Unit, so we just continue without telling - * it. */ + /* + * If the Guest doesn't want to know, we already restored the + * Floating Point Unit, so we just continue without telling it. + */ if (!cpu->ts) return; break; case 32 ... 255: - /* These values mean a real interrupt occurred, in which case + /* + * These values mean a real interrupt occurred, in which case * the Host handler has already been run. We just do a * friendly check if another process should now be run, then - * return to run the Guest again */ + * return to run the Guest again + */ cond_resched(); return; case LGUEST_TRAP_ENTRY: - /* Our 'struct hcall_args' maps directly over our regs: we set - * up the pointer now to indicate a hypercall is pending. */ + /* + * Our 'struct hcall_args' maps directly over our regs: we set + * up the pointer now to indicate a hypercall is pending. + */ cpu->hcall = (struct hcall_args *)cpu->regs; return; } /* We didn't handle the trap, so it needs to go to the Guest. */ if (!deliver_trap(cpu, cpu->regs->trapnum)) - /* If the Guest doesn't have a handler (either it hasn't + /* + * If the Guest doesn't have a handler (either it hasn't * registered any yet, or it's one of the faults we don't let - * it handle), it dies with this cryptic error message. */ + * it handle), it dies with this cryptic error message. + */ kill_guest(cpu, "unhandled trap %li at %#lx (%#lx)", cpu->regs->trapnum, cpu->regs->eip, cpu->regs->trapnum == 14 ? cpu->arch.last_pagefault : cpu->regs->errcode); } -/* Now we can look at each of the routines this calls, in increasing order of +/* + * Now we can look at each of the routines this calls, in increasing order of * complexity: do_hypercalls(), emulate_insn(), maybe_do_interrupt(), * deliver_trap() and demand_page(). After all those, we'll be ready to * examine the Switcher, and our philosophical understanding of the Host/Guest - * duality will be complete. :*/ + * duality will be complete. +:*/ static void adjust_pge(void *on) { if (on) @@ -439,13 +515,16 @@ static void adjust_pge(void *on) write_cr4(read_cr4() & ~X86_CR4_PGE); } -/*H:020 Now the Switcher is mapped and every thing else is ready, we need to do - * some more i386-specific initialization. */ +/*H:020 + * Now the Switcher is mapped and every thing else is ready, we need to do + * some more i386-specific initialization. + */ void __init lguest_arch_host_init(void) { int i; - /* Most of the i386/switcher.S doesn't care that it's been moved; on + /* + * Most of the i386/switcher.S doesn't care that it's been moved; on * Intel, jumps are relative, and it doesn't access any references to * external code or data. * @@ -453,7 +532,8 @@ void __init lguest_arch_host_init(void) * addresses are placed in a table (default_idt_entries), so we need to * update the table with the new addresses. switcher_offset() is a * convenience function which returns the distance between the - * compiled-in switcher code and the high-mapped copy we just made. */ + * compiled-in switcher code and the high-mapped copy we just made. + */ for (i = 0; i < IDT_ENTRIES; i++) default_idt_entries[i] += switcher_offset(); @@ -468,63 +548,81 @@ void __init lguest_arch_host_init(void) for_each_possible_cpu(i) { /* lguest_pages() returns this CPU's two pages. */ struct lguest_pages *pages = lguest_pages(i); - /* This is a convenience pointer to make the code fit one - * statement to a line. */ + /* This is a convenience pointer to make the code neater. */ struct lguest_ro_state *state = &pages->state; - /* The Global Descriptor Table: the Host has a different one + /* + * The Global Descriptor Table: the Host has a different one * for each CPU. We keep a descriptor for the GDT which says * where it is and how big it is (the size is actually the last - * byte, not the size, hence the "-1"). */ + * byte, not the size, hence the "-1"). + */ state->host_gdt_desc.size = GDT_SIZE-1; state->host_gdt_desc.address = (long)get_cpu_gdt_table(i); - /* All CPUs on the Host use the same Interrupt Descriptor + /* + * All CPUs on the Host use the same Interrupt Descriptor * Table, so we just use store_idt(), which gets this CPU's IDT - * descriptor. */ + * descriptor. + */ store_idt(&state->host_idt_desc); - /* The descriptors for the Guest's GDT and IDT can be filled + /* + * The descriptors for the Guest's GDT and IDT can be filled * out now, too. We copy the GDT & IDT into ->guest_gdt and - * ->guest_idt before actually running the Guest. */ + * ->guest_idt before actually running the Guest. + */ state->guest_idt_desc.size = sizeof(state->guest_idt)-1; state->guest_idt_desc.address = (long)&state->guest_idt; state->guest_gdt_desc.size = sizeof(state->guest_gdt)-1; state->guest_gdt_desc.address = (long)&state->guest_gdt; - /* We know where we want the stack to be when the Guest enters + /* + * We know where we want the stack to be when the Guest enters * the Switcher: in pages->regs. The stack grows upwards, so - * we start it at the end of that structure. */ + * we start it at the end of that structure. + */ state->guest_tss.sp0 = (long)(&pages->regs + 1); - /* And this is the GDT entry to use for the stack: we keep a - * couple of special LGUEST entries. */ + /* + * And this is the GDT entry to use for the stack: we keep a + * couple of special LGUEST entries. + */ state->guest_tss.ss0 = LGUEST_DS; - /* x86 can have a finegrained bitmap which indicates what I/O + /* + * x86 can have a finegrained bitmap which indicates what I/O * ports the process can use. We set it to the end of our - * structure, meaning "none". */ + * structure, meaning "none". + */ state->guest_tss.io_bitmap_base = sizeof(state->guest_tss); - /* Some GDT entries are the same across all Guests, so we can - * set them up now. */ + /* + * Some GDT entries are the same across all Guests, so we can + * set them up now. + */ setup_default_gdt_entries(state); /* Most IDT entries are the same for all Guests, too.*/ setup_default_idt_entries(state, default_idt_entries); - /* The Host needs to be able to use the LGUEST segments on this - * CPU, too, so put them in the Host GDT. */ + /* + * The Host needs to be able to use the LGUEST segments on this + * CPU, too, so put them in the Host GDT. + */ get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_CS] = FULL_EXEC_SEGMENT; get_cpu_gdt_table(i)[GDT_ENTRY_LGUEST_DS] = FULL_SEGMENT; } - /* In the Switcher, we want the %cs segment register to use the + /* + * In the Switcher, we want the %cs segment register to use the * LGUEST_CS GDT entry: we've put that in the Host and Guest GDTs, so * it will be undisturbed when we switch. To change %cs and jump we - * need this structure to feed to Intel's "lcall" instruction. */ + * need this structure to feed to Intel's "lcall" instruction. + */ lguest_entry.offset = (long)switch_to_guest + switcher_offset(); lguest_entry.segment = LGUEST_CS; - /* Finally, we need to turn off "Page Global Enable". PGE is an + /* + * Finally, we need to turn off "Page Global Enable". PGE is an * optimization where page table entries are specially marked to show * they never change. The Host kernel marks all the kernel pages this * way because it's always present, even when userspace is running. @@ -534,16 +632,21 @@ void __init lguest_arch_host_init(void) * you'll get really weird bugs that you'll chase for two days. * * I used to turn PGE off every time we switched to the Guest and back - * on when we return, but that slowed the Switcher down noticibly. */ + * on when we return, but that slowed the Switcher down noticibly. + */ - /* We don't need the complexity of CPUs coming and going while we're - * doing this. */ + /* + * We don't need the complexity of CPUs coming and going while we're + * doing this. + */ get_online_cpus(); if (cpu_has_pge) { /* We have a broader idea of "global". */ /* Remember that this was originally set (for cleanup). */ cpu_had_pge = 1; - /* adjust_pge is a helper function which sets or unsets the PGE - * bit on its CPU, depending on the argument (0 == unset). */ + /* + * adjust_pge is a helper function which sets or unsets the PGE + * bit on its CPU, depending on the argument (0 == unset). + */ on_each_cpu(adjust_pge, (void *)0, 1); /* Turn off the feature in the global feature set. */ clear_cpu_cap(&boot_cpu_data, X86_FEATURE_PGE); @@ -590,26 +693,32 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) { u32 tsc_speed; - /* The pointer to the Guest's "struct lguest_data" is the only argument. - * We check that address now. */ + /* + * The pointer to the Guest's "struct lguest_data" is the only argument. + * We check that address now. + */ if (!lguest_address_ok(cpu->lg, cpu->hcall->arg1, sizeof(*cpu->lg->lguest_data))) return -EFAULT; - /* Having checked it, we simply set lg->lguest_data to point straight + /* + * Having checked it, we simply set lg->lguest_data to point straight * into the Launcher's memory at the right place and then use * copy_to_user/from_user from now on, instead of lgread/write. I put * this in to show that I'm not immune to writing stupid - * optimizations. */ + * optimizations. + */ cpu->lg->lguest_data = cpu->lg->mem_base + cpu->hcall->arg1; - /* We insist that the Time Stamp Counter exist and doesn't change with + /* + * We insist that the Time Stamp Counter exist and doesn't change with * cpu frequency. Some devious chip manufacturers decided that TSC * changes could be handled in software. I decided that time going * backwards might be good for benchmarks, but it's bad for users. * * We also insist that the TSC be stable: the kernel detects unreliable - * TSCs for its own purposes, and we use that here. */ + * TSCs for its own purposes, and we use that here. + */ if (boot_cpu_has(X86_FEATURE_CONSTANT_TSC) && !check_tsc_unstable()) tsc_speed = tsc_khz; else @@ -625,38 +734,47 @@ int lguest_arch_init_hypercalls(struct lg_cpu *cpu) } /*:*/ -/*L:030 lguest_arch_setup_regs() +/*L:030 + * lguest_arch_setup_regs() * * Most of the Guest's registers are left alone: we used get_zeroed_page() to - * allocate the structure, so they will be 0. */ + * allocate the structure, so they will be 0. + */ void lguest_arch_setup_regs(struct lg_cpu *cpu, unsigned long start) { struct lguest_regs *regs = cpu->regs; - /* There are four "segment" registers which the Guest needs to boot: + /* + * There are four "segment" registers which the Guest needs to boot: * The "code segment" register (cs) refers to the kernel code segment * __KERNEL_CS, and the "data", "extra" and "stack" segment registers * refer to the kernel data segment __KERNEL_DS. * * The privilege level is packed into the lower bits. The Guest runs - * at privilege level 1 (GUEST_PL).*/ + * at privilege level 1 (GUEST_PL). + */ regs->ds = regs->es = regs->ss = __KERNEL_DS|GUEST_PL; regs->cs = __KERNEL_CS|GUEST_PL; - /* The "eflags" register contains miscellaneous flags. Bit 1 (0x002) + /* + * The "eflags" register contains miscellaneous flags. Bit 1 (0x002) * is supposed to always be "1". Bit 9 (0x200) controls whether * interrupts are enabled. We always leave interrupts enabled while - * running the Guest. */ + * running the Guest. + */ regs->eflags = X86_EFLAGS_IF | 0x2; - /* The "Extended Instruction Pointer" register says where the Guest is - * running. */ + /* + * The "Extended Instruction Pointer" register says where the Guest is + * running. + */ regs->eip = start; - /* %esi points to our boot information, at physical address 0, so don't - * touch it. */ + /* + * %esi points to our boot information, at physical address 0, so don't + * touch it. + */ - /* There are a couple of GDT entries the Guest expects when first - * booting. */ + /* There are a couple of GDT entries the Guest expects at boot. */ setup_guest_gdt(cpu); } diff --git a/drivers/lguest/x86/switcher_32.S b/drivers/lguest/x86/switcher_32.S index 3fc15318a80..40634b0db9f 100644 --- a/drivers/lguest/x86/switcher_32.S +++ b/drivers/lguest/x86/switcher_32.S @@ -1,12 +1,15 @@ -/*P:900 This is the Switcher: code which sits at 0xFFC00000 astride both the - * Host and Guest to do the low-level Guest<->Host switch. It is as simple as - * it can be made, but it's naturally very specific to x86. +/*P:900 + * This is the Switcher: code which sits at 0xFFC00000 (or 0xFFE00000) astride + * both the Host and Guest to do the low-level Guest<->Host switch. It is as + * simple as it can be made, but it's naturally very specific to x86. * * You have now completed Preparation. If this has whet your appetite; if you * are feeling invigorated and refreshed then the next, more challenging stage - * can be found in "make Guest". :*/ + * can be found in "make Guest". + :*/ -/*M:012 Lguest is meant to be simple: my rule of thumb is that 1% more LOC must +/*M:012 + * Lguest is meant to be simple: my rule of thumb is that 1% more LOC must * gain at least 1% more performance. Since neither LOC nor performance can be * measured beforehand, it generally means implementing a feature then deciding * if it's worth it. And once it's implemented, who can say no? @@ -31,11 +34,14 @@ * Host (which is actually really easy). * * Two questions remain. Would the performance gain outweigh the complexity? - * And who would write the verse documenting it? :*/ + * And who would write the verse documenting it? +:*/ -/*M:011 Lguest64 handles NMI. This gave me NMI envy (until I looked at their +/*M:011 + * Lguest64 handles NMI. This gave me NMI envy (until I looked at their * code). It's worth doing though, since it would let us use oprofile in the - * Host when a Guest is running. :*/ + * Host when a Guest is running. +:*/ /*S:100 * Welcome to the Switcher itself! diff --git a/drivers/md/linear.c b/drivers/md/linear.c index 5810fa906af..5fe39c2a3d2 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -220,6 +220,7 @@ static int linear_run (mddev_t *mddev) mddev->queue->unplug_fn = linear_unplug; mddev->queue->backing_dev_info.congested_fn = linear_congested; mddev->queue->backing_dev_info.congested_data = mddev; + md_integrity_register(mddev); return 0; } @@ -256,6 +257,7 @@ static int linear_add(mddev_t *mddev, mdk_rdev_t *rdev) rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); + revalidate_disk(mddev->gendisk); call_rcu(&oldconf->rcu, free_conf); return 0; } diff --git a/drivers/md/md.c b/drivers/md/md.c index d4351ff0849..5b98bea4ff9 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1308,7 +1308,12 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev) } if (mddev->level != LEVEL_MULTIPATH) { int role; - role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); + if (rdev->desc_nr < 0 || + rdev->desc_nr >= le32_to_cpu(sb->max_dev)) { + role = 0xffff; + rdev->desc_nr = -1; + } else + role = le16_to_cpu(sb->dev_roles[rdev->desc_nr]); switch(role) { case 0xffff: /* spare */ break; @@ -1394,8 +1399,14 @@ static void super_1_sync(mddev_t *mddev, mdk_rdev_t *rdev) if (rdev2->desc_nr+1 > max_dev) max_dev = rdev2->desc_nr+1; - if (max_dev > le32_to_cpu(sb->max_dev)) + if (max_dev > le32_to_cpu(sb->max_dev)) { + int bmask; sb->max_dev = cpu_to_le32(max_dev); + rdev->sb_size = max_dev * 2 + 256; + bmask = queue_logical_block_size(rdev->bdev->bd_disk->queue)-1; + if (rdev->sb_size & bmask) + rdev->sb_size = (rdev->sb_size | bmask) + 1; + } for (i=0; i<max_dev;i++) sb->dev_roles[i] = cpu_to_le16(0xfffe); @@ -1487,37 +1498,76 @@ static int match_mddev_units(mddev_t *mddev1, mddev_t *mddev2) static LIST_HEAD(pending_raid_disks); -static void md_integrity_check(mdk_rdev_t *rdev, mddev_t *mddev) +/* + * Try to register data integrity profile for an mddev + * + * This is called when an array is started and after a disk has been kicked + * from the array. It only succeeds if all working and active component devices + * are integrity capable with matching profiles. + */ +int md_integrity_register(mddev_t *mddev) +{ + mdk_rdev_t *rdev, *reference = NULL; + + if (list_empty(&mddev->disks)) + return 0; /* nothing to do */ + if (blk_get_integrity(mddev->gendisk)) + return 0; /* already registered */ + list_for_each_entry(rdev, &mddev->disks, same_set) { + /* skip spares and non-functional disks */ + if (test_bit(Faulty, &rdev->flags)) + continue; + if (rdev->raid_disk < 0) + continue; + /* + * If at least one rdev is not integrity capable, we can not + * enable data integrity for the md device. + */ + if (!bdev_get_integrity(rdev->bdev)) + return -EINVAL; + if (!reference) { + /* Use the first rdev as the reference */ + reference = rdev; + continue; + } + /* does this rdev's profile match the reference profile? */ + if (blk_integrity_compare(reference->bdev->bd_disk, + rdev->bdev->bd_disk) < 0) + return -EINVAL; + } + /* + * All component devices are integrity capable and have matching + * profiles, register the common profile for the md device. + */ + if (blk_integrity_register(mddev->gendisk, + bdev_get_integrity(reference->bdev)) != 0) { + printk(KERN_ERR "md: failed to register integrity for %s\n", + mdname(mddev)); + return -EINVAL; + } + printk(KERN_NOTICE "md: data integrity on %s enabled\n", + mdname(mddev)); + return 0; +} +EXPORT_SYMBOL(md_integrity_register); + +/* Disable data integrity if non-capable/non-matching disk is being added */ +void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev) { - struct mdk_personality *pers = mddev->pers; - struct gendisk *disk = mddev->gendisk; struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(disk); + struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); - /* Data integrity passthrough not supported on RAID 4, 5 and 6 */ - if (pers && pers->level >= 4 && pers->level <= 6) + if (!bi_mddev) /* nothing to do */ return; - - /* If rdev is integrity capable, register profile for mddev */ - if (!bi_mddev && bi_rdev) { - if (blk_integrity_register(disk, bi_rdev)) - printk(KERN_ERR "%s: %s Could not register integrity!\n", - __func__, disk->disk_name); - else - printk(KERN_NOTICE "Enabling data integrity on %s\n", - disk->disk_name); + if (rdev->raid_disk < 0) /* skip spares */ return; - } - - /* Check that mddev and rdev have matching profiles */ - if (blk_integrity_compare(disk, rdev->bdev->bd_disk) < 0) { - printk(KERN_ERR "%s: %s/%s integrity mismatch!\n", __func__, - disk->disk_name, rdev->bdev->bd_disk->disk_name); - printk(KERN_NOTICE "Disabling data integrity on %s\n", - disk->disk_name); - blk_integrity_unregister(disk); - } + if (bi_rdev && blk_integrity_compare(mddev->gendisk, + rdev->bdev->bd_disk) >= 0) + return; + printk(KERN_NOTICE "disabling data integrity on %s\n", mdname(mddev)); + blk_integrity_unregister(mddev->gendisk); } +EXPORT_SYMBOL(md_integrity_add_rdev); static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) { @@ -1591,7 +1641,6 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev) /* May as well allow recovery to be retried once */ mddev->recovery_disabled = 0; - md_integrity_check(rdev, mddev); return 0; fail: @@ -2657,6 +2706,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len) ssize_t rv = len; struct mdk_personality *pers; void *priv; + mdk_rdev_t *rdev; if (mddev->pers == NULL) { if (len == 0) @@ -2736,6 +2786,12 @@ level_store(mddev_t *mddev, const char *buf, size_t len) mddev_suspend(mddev); mddev->pers->stop(mddev); module_put(mddev->pers->owner); + /* Invalidate devices that are now superfluous */ + list_for_each_entry(rdev, &mddev->disks, same_set) + if (rdev->raid_disk >= mddev->raid_disks) { + rdev->raid_disk = -1; + clear_bit(In_sync, &rdev->flags); + } mddev->pers = pers; mddev->private = priv; strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); @@ -3685,17 +3741,8 @@ array_size_store(mddev_t *mddev, const char *buf, size_t len) mddev->array_sectors = sectors; set_capacity(mddev->gendisk, mddev->array_sectors); - if (mddev->pers) { - struct block_device *bdev = bdget_disk(mddev->gendisk, 0); - - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (mddev->pers) + revalidate_disk(mddev->gendisk); return len; } @@ -4048,10 +4095,6 @@ static int do_md_run(mddev_t * mddev) } strlcpy(mddev->clevel, pers->name, sizeof(mddev->clevel)); - if (pers->level >= 4 && pers->level <= 6) - /* Cannot support integrity (yet) */ - blk_integrity_unregister(mddev->gendisk); - if (mddev->reshape_position != MaxSector && pers->start_reshape == NULL) { /* This personality cannot handle reshaping... */ @@ -4189,6 +4232,7 @@ static int do_md_run(mddev_t * mddev) md_wakeup_thread(mddev->thread); md_wakeup_thread(mddev->sync_thread); /* possibly kick off a reshape */ + revalidate_disk(mddev->gendisk); mddev->changed = 1; md_new_event(mddev); sysfs_notify_dirent(mddev->sysfs_state); @@ -5087,18 +5131,8 @@ static int update_size(mddev_t *mddev, sector_t num_sectors) return -ENOSPC; } rv = mddev->pers->resize(mddev, num_sectors); - if (!rv) { - struct block_device *bdev; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } - } + if (!rv) + revalidate_disk(mddev->gendisk); return rv; } diff --git a/drivers/md/md.h b/drivers/md/md.h index 9430a110db9..78f03168baf 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -431,5 +431,7 @@ extern int md_allow_write(mddev_t *mddev); extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev); extern void md_set_array_sectors(mddev_t *mddev, sector_t array_sectors); extern int md_check_no_bitmap(mddev_t *mddev); +extern int md_integrity_register(mddev_t *mddev); +void md_integrity_add_rdev(mdk_rdev_t *rdev, mddev_t *mddev); #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 237fe3fd235..7140909f666 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -313,6 +313,7 @@ static int multipath_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) set_bit(In_sync, &rdev->flags); rcu_assign_pointer(p->rdev, rdev); err = 0; + md_integrity_add_rdev(rdev, mddev); break; } @@ -345,7 +346,9 @@ static int multipath_remove_disk(mddev_t *mddev, int number) /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; + goto abort; } + md_integrity_register(mddev); } abort: @@ -519,7 +522,7 @@ static int multipath_run (mddev_t *mddev) mddev->queue->unplug_fn = multipath_unplug; mddev->queue->backing_dev_info.congested_fn = multipath_congested; mddev->queue->backing_dev_info.congested_data = mddev; - + md_integrity_register(mddev); return 0; out_free_conf: diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index 335f490dcad..898e2bdfee4 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -351,6 +351,7 @@ static int raid0_run(mddev_t *mddev) blk_queue_merge_bvec(mddev->queue, raid0_mergeable_bvec); dump_zones(mddev); + md_integrity_register(mddev); return 0; } diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 0569efba0c0..8726fd7ebce 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1144,7 +1144,7 @@ static int raid1_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) rcu_assign_pointer(p->rdev, rdev); break; } - + md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; } @@ -1178,7 +1178,9 @@ static int raid1_remove_disk(mddev_t *mddev, int number) /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; + goto abort; } + md_integrity_register(mddev); } abort: @@ -2067,7 +2069,7 @@ static int run(mddev_t *mddev) mddev->queue->unplug_fn = raid1_unplug; mddev->queue->backing_dev_info.congested_fn = raid1_congested; mddev->queue->backing_dev_info.congested_data = mddev; - + md_integrity_register(mddev); return 0; out_no_mem: @@ -2132,6 +2134,7 @@ static int raid1_resize(mddev_t *mddev, sector_t sectors) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; + revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->dev_sectors; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 7298a5e5a18..3d9020cf6f6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1170,6 +1170,7 @@ static int raid10_add_disk(mddev_t *mddev, mdk_rdev_t *rdev) break; } + md_integrity_add_rdev(rdev, mddev); print_conf(conf); return err; } @@ -1203,7 +1204,9 @@ static int raid10_remove_disk(mddev_t *mddev, int number) /* lost the race, try later */ err = -EBUSY; p->rdev = rdev; + goto abort; } + md_integrity_register(mddev); } abort: @@ -2225,6 +2228,7 @@ static int run(mddev_t *mddev) if (conf->near_copies < mddev->raid_disks) blk_queue_merge_bvec(mddev->queue, raid10_mergeable_bvec); + md_integrity_register(mddev); return 0; out_free_conf: diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 37835538b58..2b521ee67df 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -3999,6 +3999,9 @@ static inline sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *ski return 0; } + /* Allow raid5_quiesce to complete */ + wait_event(conf->wait_for_overlap, conf->quiesce != 2); + if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery)) return reshape_request(mddev, sector_nr, skipped); @@ -4316,6 +4319,15 @@ raid5_size(mddev_t *mddev, sector_t sectors, int raid_disks) return sectors * (raid_disks - conf->max_degraded); } +static void free_conf(raid5_conf_t *conf) +{ + shrink_stripes(conf); + safe_put_page(conf->spare_page); + kfree(conf->disks); + kfree(conf->stripe_hashtbl); + kfree(conf); +} + static raid5_conf_t *setup_conf(mddev_t *mddev) { raid5_conf_t *conf; @@ -4447,11 +4459,7 @@ static raid5_conf_t *setup_conf(mddev_t *mddev) abort: if (conf) { - shrink_stripes(conf); - safe_put_page(conf->spare_page); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); + free_conf(conf); return ERR_PTR(-EIO); } else return ERR_PTR(-ENOMEM); @@ -4629,12 +4637,8 @@ abort: md_unregister_thread(mddev->thread); mddev->thread = NULL; if (conf) { - shrink_stripes(conf); print_raid5_conf(conf); - safe_put_page(conf->spare_page); - kfree(conf->disks); - kfree(conf->stripe_hashtbl); - kfree(conf); + free_conf(conf); } mddev->private = NULL; printk(KERN_ALERT "raid5: failed to run raid set %s\n", mdname(mddev)); @@ -4649,13 +4653,10 @@ static int stop(mddev_t *mddev) md_unregister_thread(mddev->thread); mddev->thread = NULL; - shrink_stripes(conf); - kfree(conf->stripe_hashtbl); mddev->queue->backing_dev_info.congested_fn = NULL; blk_sync_queue(mddev->queue); /* the unplug fn references 'conf'*/ sysfs_remove_group(&mddev->kobj, &raid5_attrs_group); - kfree(conf->disks); - kfree(conf); + free_conf(conf); mddev->private = NULL; return 0; } @@ -4857,6 +4858,7 @@ static int raid5_resize(mddev_t *mddev, sector_t sectors) return -EINVAL; set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; + revalidate_disk(mddev->gendisk); if (sectors > mddev->dev_sectors && mddev->recovery_cp == MaxSector) { mddev->recovery_cp = mddev->dev_sectors; set_bit(MD_RECOVERY_NEEDED, &mddev->recovery); @@ -5002,7 +5004,7 @@ static int raid5_start_reshape(mddev_t *mddev) spin_unlock_irqrestore(&conf->device_lock, flags); } mddev->raid_disks = conf->raid_disks; - mddev->reshape_position = 0; + mddev->reshape_position = conf->reshape_progress; set_bit(MD_CHANGE_DEVS, &mddev->flags); clear_bit(MD_RECOVERY_SYNC, &mddev->recovery); @@ -5057,7 +5059,6 @@ static void end_reshape(raid5_conf_t *conf) */ static void raid5_finish_reshape(mddev_t *mddev) { - struct block_device *bdev; raid5_conf_t *conf = mddev->private; if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery)) { @@ -5066,15 +5067,7 @@ static void raid5_finish_reshape(mddev_t *mddev) md_set_array_sectors(mddev, raid5_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev->changed = 1; - - bdev = bdget_disk(mddev->gendisk, 0); - if (bdev) { - mutex_lock(&bdev->bd_inode->i_mutex); - i_size_write(bdev->bd_inode, - (loff_t)mddev->array_sectors << 9); - mutex_unlock(&bdev->bd_inode->i_mutex); - bdput(bdev); - } + revalidate_disk(mddev->gendisk); } else { int d; mddev->degraded = conf->raid_disks; @@ -5106,12 +5099,18 @@ static void raid5_quiesce(mddev_t *mddev, int state) case 1: /* stop all writes */ spin_lock_irq(&conf->device_lock); - conf->quiesce = 1; + /* '2' tells resync/reshape to pause so that all + * active stripes can drain + */ + conf->quiesce = 2; wait_event_lock_irq(conf->wait_for_stripe, atomic_read(&conf->active_stripes) == 0 && atomic_read(&conf->active_aligned_reads) == 0, conf->device_lock, /* nothing */); + conf->quiesce = 1; spin_unlock_irq(&conf->device_lock); + /* allow reshape to continue */ + wake_up(&conf->wait_for_overlap); break; case 0: /* re-enable writes */ diff --git a/drivers/mfd/twl4030-irq.c b/drivers/mfd/twl4030-irq.c index bae61b22501..7d430835655 100644 --- a/drivers/mfd/twl4030-irq.c +++ b/drivers/mfd/twl4030-irq.c @@ -180,14 +180,9 @@ static struct completion irq_event; static int twl4030_irq_thread(void *data) { long irq = (long)data; - struct irq_desc *desc = irq_to_desc(irq); static unsigned i2c_errors; static const unsigned max_i2c_errors = 100; - if (!desc) { - pr_err("twl4030: Invalid IRQ: %ld\n", irq); - return -EINVAL; - } current->flags |= PF_NOFREEZE; @@ -240,7 +235,7 @@ static int twl4030_irq_thread(void *data) } local_irq_enable(); - desc->chip->unmask(irq); + enable_irq(irq); } return 0; @@ -255,25 +250,13 @@ static int twl4030_irq_thread(void *data) * thread. All we do here is acknowledge and mask the interrupt and wakeup * the kernel thread. */ -static void handle_twl4030_pih(unsigned int irq, struct irq_desc *desc) +static irqreturn_t handle_twl4030_pih(int irq, void *devid) { /* Acknowledge, clear *AND* mask the interrupt... */ - desc->chip->ack(irq); - complete(&irq_event); -} - -static struct task_struct *start_twl4030_irq_thread(long irq) -{ - struct task_struct *thread; - - init_completion(&irq_event); - thread = kthread_run(twl4030_irq_thread, (void *)irq, "twl4030-irq"); - if (!thread) - pr_err("twl4030: could not create irq %ld thread!\n", irq); - - return thread; + disable_irq_nosync(irq); + complete(devid); + return IRQ_HANDLED; } - /*----------------------------------------------------------------------*/ /* @@ -734,18 +717,28 @@ int twl_init_irq(int irq_num, unsigned irq_base, unsigned irq_end) } /* install an irq handler to demultiplex the TWL4030 interrupt */ - task = start_twl4030_irq_thread(irq_num); - if (!task) { - pr_err("twl4030: irq thread FAIL\n"); - status = -ESRCH; - goto fail; - } - set_irq_data(irq_num, task); - set_irq_chained_handler(irq_num, handle_twl4030_pih); - return status; + init_completion(&irq_event); + status = request_irq(irq_num, handle_twl4030_pih, IRQF_DISABLED, + "TWL4030-PIH", &irq_event); + if (status < 0) { + pr_err("twl4030: could not claim irq%d: %d\n", irq_num, status); + goto fail_rqirq; + } + + task = kthread_run(twl4030_irq_thread, (void *)irq_num, "twl4030-irq"); + if (IS_ERR(task)) { + pr_err("twl4030: could not create irq %d thread!\n", irq_num); + status = PTR_ERR(task); + goto fail_kthread; + } + return status; +fail_kthread: + free_irq(irq_num, &irq_event); +fail_rqirq: + /* clean up twl4030_sih_setup */ fail: for (i = irq_base; i < irq_end; i++) set_irq_chip_and_handler(i, NULL, NULL); diff --git a/drivers/misc/cb710/sgbuf2.c b/drivers/misc/cb710/sgbuf2.c index d38a7acdb6e..d019746551f 100644 --- a/drivers/misc/cb710/sgbuf2.c +++ b/drivers/misc/cb710/sgbuf2.c @@ -114,7 +114,6 @@ static void sg_dwiter_write_slow(struct sg_mapping_iter *miter, uint32_t data) if (!left) return; addr += len; - flush_kernel_dcache_page(miter->page); } while (sg_dwiter_next(miter)); } @@ -142,9 +141,6 @@ void cb710_sg_dwiter_write_next_block(struct sg_mapping_iter *miter, uint32_t da return; } else sg_dwiter_write_slow(miter, data); - - if (miter->length == miter->consumed) - flush_kernel_dcache_page(miter->page); } EXPORT_SYMBOL_GPL(cb710_sg_dwiter_write_next_block); diff --git a/drivers/mmc/host/cb710-mmc.c b/drivers/mmc/host/cb710-mmc.c index 11efefb1af5..4e72964a7b4 100644 --- a/drivers/mmc/host/cb710-mmc.c +++ b/drivers/mmc/host/cb710-mmc.c @@ -278,7 +278,7 @@ static int cb710_mmc_receive(struct cb710_slot *slot, struct mmc_data *data) if (unlikely(data->blksz & 15 && (data->blocks != 1 || data->blksz != 8))) return -EINVAL; - sg_miter_start(&miter, data->sg, data->sg_len, 0); + sg_miter_start(&miter, data->sg, data->sg_len, SG_MITER_TO_SG); cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, 15, CB710_MMC_C2_READ_PIO_SIZE_MASK); @@ -307,7 +307,7 @@ static int cb710_mmc_receive(struct cb710_slot *slot, struct mmc_data *data) goto out; } out: - cb710_sg_miter_stop_writing(&miter); + sg_miter_stop(&miter); return err; } @@ -322,7 +322,7 @@ static int cb710_mmc_send(struct cb710_slot *slot, struct mmc_data *data) if (unlikely(data->blocks > 1 && data->blksz & 15)) return -EINVAL; - sg_miter_start(&miter, data->sg, data->sg_len, 0); + sg_miter_start(&miter, data->sg, data->sg_len, SG_MITER_FROM_SG); cb710_modify_port_8(slot, CB710_MMC_CONFIG2_PORT, 0, CB710_MMC_C2_READ_PIO_SIZE_MASK); diff --git a/drivers/mmc/host/imxmmc.c b/drivers/mmc/host/imxmmc.c index e0be21a4a69..bf98d7cc928 100644 --- a/drivers/mmc/host/imxmmc.c +++ b/drivers/mmc/host/imxmmc.c @@ -652,7 +652,7 @@ static irqreturn_t imxmci_irq(int irq, void *devid) set_bit(IMXMCI_PEND_STARTED_b, &host->pending_events); tasklet_schedule(&host->tasklet); - return IRQ_RETVAL(handled);; + return IRQ_RETVAL(handled); } static void imxmci_tasklet_fnc(unsigned long data) diff --git a/drivers/mmc/host/sdhci.c b/drivers/mmc/host/sdhci.c index 62041c7e924..fc96f8cb9c0 100644 --- a/drivers/mmc/host/sdhci.c +++ b/drivers/mmc/host/sdhci.c @@ -773,8 +773,14 @@ static void sdhci_prepare_data(struct sdhci_host *host, struct mmc_data *data) } if (!(host->flags & SDHCI_REQ_USE_DMA)) { - sg_miter_start(&host->sg_miter, - data->sg, data->sg_len, SG_MITER_ATOMIC); + int flags; + + flags = SG_MITER_ATOMIC; + if (host->data->flags & MMC_DATA_READ) + flags |= SG_MITER_TO_SG; + else + flags |= SG_MITER_FROM_SG; + sg_miter_start(&host->sg_miter, data->sg, data->sg_len, flags); host->blocks = data->blocks; } diff --git a/drivers/net/3c515.c b/drivers/net/3c515.c index 3e00fa8ea65..4a7c32895be 100644 --- a/drivers/net/3c515.c +++ b/drivers/net/3c515.c @@ -832,7 +832,9 @@ static int corkscrew_open(struct net_device *dev) skb_reserve(skb, 2); /* Align IP on 16 byte boundaries */ vp->rx_ring[i].addr = isa_virt_to_bus(skb->data); } - vp->rx_ring[i - 1].next = isa_virt_to_bus(&vp->rx_ring[0]); /* Wrap the ring. */ + if (i != 0) + vp->rx_ring[i - 1].next = + isa_virt_to_bus(&vp->rx_ring[0]); /* Wrap the ring. */ outl(isa_virt_to_bus(&vp->rx_ring[0]), ioaddr + UpListPtr); } if (vp->full_bus_master_tx) { /* Boomerang bus master Tx. */ diff --git a/drivers/net/3c59x.c b/drivers/net/3c59x.c index c34aee91250..c2041685094 100644 --- a/drivers/net/3c59x.c +++ b/drivers/net/3c59x.c @@ -2721,13 +2721,15 @@ dump_tx_ring(struct net_device *dev) &vp->tx_ring[vp->dirty_tx % TX_RING_SIZE]); issue_and_wait(dev, DownStall); for (i = 0; i < TX_RING_SIZE; i++) { - pr_err(" %d: @%p length %8.8x status %8.8x\n", i, - &vp->tx_ring[i], + unsigned int length; + #if DO_ZEROCOPY - le32_to_cpu(vp->tx_ring[i].frag[0].length), + length = le32_to_cpu(vp->tx_ring[i].frag[0].length); #else - le32_to_cpu(vp->tx_ring[i].length), + length = le32_to_cpu(vp->tx_ring[i].length); #endif + pr_err(" %d: @%p length %8.8x status %8.8x\n", + i, &vp->tx_ring[i], length, le32_to_cpu(vp->tx_ring[i].status)); } if (!stalled) diff --git a/drivers/net/eexpress.c b/drivers/net/eexpress.c index 1686dca2874..1f016d66684 100644 --- a/drivers/net/eexpress.c +++ b/drivers/net/eexpress.c @@ -1474,13 +1474,13 @@ static void eexp_hw_init586(struct net_device *dev) outw(0x0000, ioaddr + 0x800c); outw(0x0000, ioaddr + 0x800e); - for (i = 0; i < (sizeof(start_code)); i+=32) { + for (i = 0; i < ARRAY_SIZE(start_code) * 2; i+=32) { int j; outw(i, ioaddr + SM_PTR); - for (j = 0; j < 16; j+=2) + for (j = 0; j < 16 && (i+j)/2 < ARRAY_SIZE(start_code); j+=2) outw(start_code[(i+j)/2], ioaddr+0x4000+j); - for (j = 0; j < 16; j+=2) + for (j = 0; j < 16 && (i+j+16)/2 < ARRAY_SIZE(start_code); j+=2) outw(start_code[(i+j+16)/2], ioaddr+0x8000+j); } diff --git a/drivers/net/ehea/ehea.h b/drivers/net/ehea/ehea.h index 78952f8324e..fa311a95099 100644 --- a/drivers/net/ehea/ehea.h +++ b/drivers/net/ehea/ehea.h @@ -40,7 +40,7 @@ #include <asm/io.h> #define DRV_NAME "ehea" -#define DRV_VERSION "EHEA_0101" +#define DRV_VERSION "EHEA_0102" /* eHEA capability flags */ #define DLPAR_PORT_ADD_REM 1 diff --git a/drivers/net/ehea/ehea_main.c b/drivers/net/ehea/ehea_main.c index e8d46cc1bec..977c3d35827 100644 --- a/drivers/net/ehea/ehea_main.c +++ b/drivers/net/ehea/ehea_main.c @@ -1545,6 +1545,9 @@ static int ehea_clean_portres(struct ehea_port *port, struct ehea_port_res *pr) { int ret, i; + if (pr->qp) + netif_napi_del(&pr->napi); + ret = ehea_destroy_qp(pr->qp); if (!ret) { diff --git a/drivers/net/gianfar_ethtool.c b/drivers/net/gianfar_ethtool.c index dbf06e9313c..2234118eedb 100644 --- a/drivers/net/gianfar_ethtool.c +++ b/drivers/net/gianfar_ethtool.c @@ -366,9 +366,8 @@ static int gfar_scoalesce(struct net_device *dev, struct ethtool_coalesce *cvals return -EINVAL; } - priv->rxic = mk_ic_value( - gfar_usecs2ticks(priv, cvals->rx_coalesce_usecs), - cvals->rx_max_coalesced_frames); + priv->rxic = mk_ic_value(cvals->rx_max_coalesced_frames, + gfar_usecs2ticks(priv, cvals->rx_coalesce_usecs)); /* Set up tx coalescing */ if ((cvals->tx_coalesce_usecs == 0) || @@ -390,9 +389,8 @@ static int gfar_scoalesce(struct net_device *dev, struct ethtool_coalesce *cvals return -EINVAL; } - priv->txic = mk_ic_value( - gfar_usecs2ticks(priv, cvals->tx_coalesce_usecs), - cvals->tx_max_coalesced_frames); + priv->txic = mk_ic_value(cvals->tx_max_coalesced_frames, + gfar_usecs2ticks(priv, cvals->tx_coalesce_usecs)); gfar_write(&priv->regs->rxic, 0); if (priv->rxcoalescing) diff --git a/drivers/net/igbvf/vf.c b/drivers/net/igbvf/vf.c index 2a4faf9ade6..a9a61efa964 100644 --- a/drivers/net/igbvf/vf.c +++ b/drivers/net/igbvf/vf.c @@ -274,6 +274,8 @@ static s32 e1000_set_vfta_vf(struct e1000_hw *hw, u16 vid, bool set) err = mbx->ops.read_posted(hw, msgbuf, 2); + msgbuf[0] &= ~E1000_VT_MSGTYPE_CTS; + /* if nacked the vlan was rejected */ if (!err && (msgbuf[0] == (E1000_VF_SET_VLAN | E1000_VT_MSGTYPE_NACK))) err = -E1000_ERR_MAC_INIT; @@ -317,6 +319,8 @@ static void e1000_rar_set_vf(struct e1000_hw *hw, u8 * addr, u32 index) if (!ret_val) ret_val = mbx->ops.read_posted(hw, msgbuf, 3); + msgbuf[0] &= ~E1000_VT_MSGTYPE_CTS; + /* if nacked the address was rejected, use "perm_addr" */ if (!ret_val && (msgbuf[0] == (E1000_VF_SET_MAC_ADDR | E1000_VT_MSGTYPE_NACK))) diff --git a/drivers/net/ixgbe/ixgbe.h b/drivers/net/ixgbe/ixgbe.h index 1b12c7ba275..e11d83d5852 100644 --- a/drivers/net/ixgbe/ixgbe.h +++ b/drivers/net/ixgbe/ixgbe.h @@ -96,6 +96,8 @@ #define IXGBE_TX_FLAGS_VLAN_PRIO_MASK 0x0000e000 #define IXGBE_TX_FLAGS_VLAN_SHIFT 16 +#define IXGBE_MAX_RSC_INT_RATE 162760 + /* wrapper around a pointer to a socket buffer, * so a DMA handle can be stored along with the buffer */ struct ixgbe_tx_buffer { diff --git a/drivers/net/ixgbe/ixgbe_82598.c b/drivers/net/ixgbe/ixgbe_82598.c index b9923047ce1..522c03bc1da 100644 --- a/drivers/net/ixgbe/ixgbe_82598.c +++ b/drivers/net/ixgbe/ixgbe_82598.c @@ -50,6 +50,51 @@ static s32 ixgbe_read_i2c_eeprom_82598(struct ixgbe_hw *hw, u8 byte_offset, u8 *eeprom_data); /** + * ixgbe_set_pcie_completion_timeout - set pci-e completion timeout + * @hw: pointer to the HW structure + * + * The defaults for 82598 should be in the range of 50us to 50ms, + * however the hardware default for these parts is 500us to 1ms which is less + * than the 10ms recommended by the pci-e spec. To address this we need to + * increase the value to either 10ms to 250ms for capability version 1 config, + * or 16ms to 55ms for version 2. + **/ +void ixgbe_set_pcie_completion_timeout(struct ixgbe_hw *hw) +{ + struct ixgbe_adapter *adapter = hw->back; + u32 gcr = IXGBE_READ_REG(hw, IXGBE_GCR); + u16 pcie_devctl2; + + /* only take action if timeout value is defaulted to 0 */ + if (gcr & IXGBE_GCR_CMPL_TMOUT_MASK) + goto out; + + /* + * if capababilities version is type 1 we can write the + * timeout of 10ms to 250ms through the GCR register + */ + if (!(gcr & IXGBE_GCR_CAP_VER2)) { + gcr |= IXGBE_GCR_CMPL_TMOUT_10ms; + goto out; + } + + /* + * for version 2 capabilities we need to write the config space + * directly in order to set the completion timeout value for + * 16ms to 55ms + */ + pci_read_config_word(adapter->pdev, + IXGBE_PCI_DEVICE_CONTROL2, &pcie_devctl2); + pcie_devctl2 |= IXGBE_PCI_DEVICE_CONTROL2_16ms; + pci_write_config_word(adapter->pdev, + IXGBE_PCI_DEVICE_CONTROL2, pcie_devctl2); +out: + /* disable completion timeout resend */ + gcr &= ~IXGBE_GCR_CMPL_TMOUT_RESEND; + IXGBE_WRITE_REG(hw, IXGBE_GCR, gcr); +} + +/** * ixgbe_get_pcie_msix_count_82598 - Gets MSI-X vector count * @hw: pointer to hardware structure * @@ -153,6 +198,26 @@ out: } /** + * ixgbe_start_hw_82598 - Prepare hardware for Tx/Rx + * @hw: pointer to hardware structure + * + * Starts the hardware using the generic start_hw function. + * Then set pcie completion timeout + **/ +s32 ixgbe_start_hw_82598(struct ixgbe_hw *hw) +{ + s32 ret_val = 0; + + ret_val = ixgbe_start_hw_generic(hw); + + /* set the completion timeout for interface */ + if (ret_val == 0) + ixgbe_set_pcie_completion_timeout(hw); + + return ret_val; +} + +/** * ixgbe_get_link_capabilities_82598 - Determines link capabilities * @hw: pointer to hardware structure * @speed: pointer to link speed @@ -1085,7 +1150,7 @@ out: static struct ixgbe_mac_operations mac_ops_82598 = { .init_hw = &ixgbe_init_hw_generic, .reset_hw = &ixgbe_reset_hw_82598, - .start_hw = &ixgbe_start_hw_generic, + .start_hw = &ixgbe_start_hw_82598, .clear_hw_cntrs = &ixgbe_clear_hw_cntrs_generic, .get_media_type = &ixgbe_get_media_type_82598, .get_supported_physical_layer = &ixgbe_get_supported_physical_layer_82598, diff --git a/drivers/net/ixgbe/ixgbe_ethtool.c b/drivers/net/ixgbe/ixgbe_ethtool.c index 2a978008fd6..79144e950a3 100644 --- a/drivers/net/ixgbe/ixgbe_ethtool.c +++ b/drivers/net/ixgbe/ixgbe_ethtool.c @@ -1975,7 +1975,10 @@ static int ixgbe_set_coalesce(struct net_device *netdev, * any other value means disable eitr, which is best * served by setting the interrupt rate very high */ - adapter->eitr_param = IXGBE_MAX_INT_RATE; + if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) + adapter->eitr_param = IXGBE_MAX_RSC_INT_RATE; + else + adapter->eitr_param = IXGBE_MAX_INT_RATE; adapter->itr_setting = 0; } @@ -1999,13 +2002,13 @@ static int ixgbe_set_flags(struct net_device *netdev, u32 data) ethtool_op_set_flags(netdev, data); - if (!(adapter->flags & IXGBE_FLAG2_RSC_CAPABLE)) + if (!(adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE)) return 0; /* if state changes we need to update adapter->flags and reset */ if ((!!(data & ETH_FLAG_LRO)) != - (!!(adapter->flags & IXGBE_FLAG2_RSC_ENABLED))) { - adapter->flags ^= IXGBE_FLAG2_RSC_ENABLED; + (!!(adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED))) { + adapter->flags2 ^= IXGBE_FLAG2_RSC_ENABLED; if (netif_running(netdev)) ixgbe_reinit_locked(adapter); else diff --git a/drivers/net/ixgbe/ixgbe_main.c b/drivers/net/ixgbe/ixgbe_main.c index 200454f30f6..110c65ab5cb 100644 --- a/drivers/net/ixgbe/ixgbe_main.c +++ b/drivers/net/ixgbe/ixgbe_main.c @@ -780,7 +780,7 @@ static bool ixgbe_clean_rx_irq(struct ixgbe_q_vector *q_vector, prefetch(next_rxd); cleaned_count++; - if (adapter->flags & IXGBE_FLAG2_RSC_CAPABLE) + if (adapter->flags2 & IXGBE_FLAG2_RSC_CAPABLE) rsc_count = ixgbe_get_rsc_count(rx_desc); if (rsc_count) { @@ -2036,7 +2036,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter) IXGBE_WRITE_REG(hw, IXGBE_PSRTYPE(0), psrtype); } } else { - if (!(adapter->flags & IXGBE_FLAG2_RSC_ENABLED) && + if (!(adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) && (netdev->mtu <= ETH_DATA_LEN)) rx_buf_len = MAXIMUM_ETHERNET_VLAN_SIZE; else @@ -2165,7 +2165,7 @@ static void ixgbe_configure_rx(struct ixgbe_adapter *adapter) IXGBE_WRITE_REG(hw, IXGBE_RDRXCTL, rdrxctl); } - if (adapter->flags & IXGBE_FLAG2_RSC_ENABLED) { + if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) { /* Enable 82599 HW-RSC */ for (i = 0; i < adapter->num_rx_queues; i++) { j = adapter->rx_ring[i].reg_idx; @@ -3812,8 +3812,8 @@ static int __devinit ixgbe_sw_init(struct ixgbe_adapter *adapter) adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82598; } else if (hw->mac.type == ixgbe_mac_82599EB) { adapter->max_msix_q_vectors = MAX_MSIX_Q_VECTORS_82599; - adapter->flags |= IXGBE_FLAG2_RSC_CAPABLE; - adapter->flags |= IXGBE_FLAG2_RSC_ENABLED; + adapter->flags2 |= IXGBE_FLAG2_RSC_CAPABLE; + adapter->flags2 |= IXGBE_FLAG2_RSC_ENABLED; adapter->flags |= IXGBE_FLAG_FDIR_HASH_CAPABLE; adapter->ring_feature[RING_F_FDIR].indices = IXGBE_MAX_FDIR_INDICES; @@ -5360,12 +5360,19 @@ static int ixgbe_del_sanmac_netdev(struct net_device *dev) static void ixgbe_netpoll(struct net_device *netdev) { struct ixgbe_adapter *adapter = netdev_priv(netdev); + int i; - disable_irq(adapter->pdev->irq); adapter->flags |= IXGBE_FLAG_IN_NETPOLL; - ixgbe_intr(adapter->pdev->irq, netdev); + if (adapter->flags & IXGBE_FLAG_MSIX_ENABLED) { + int num_q_vectors = adapter->num_msix_vectors - NON_Q_VECTORS; + for (i = 0; i < num_q_vectors; i++) { + struct ixgbe_q_vector *q_vector = adapter->q_vector[i]; + ixgbe_msix_clean_many(0, q_vector); + } + } else { + ixgbe_intr(adapter->pdev->irq, netdev); + } adapter->flags &= ~IXGBE_FLAG_IN_NETPOLL; - enable_irq(adapter->pdev->irq); } #endif @@ -5611,7 +5618,7 @@ static int __devinit ixgbe_probe(struct pci_dev *pdev, if (pci_using_dac) netdev->features |= NETIF_F_HIGHDMA; - if (adapter->flags & IXGBE_FLAG2_RSC_ENABLED) + if (adapter->flags2 & IXGBE_FLAG2_RSC_ENABLED) netdev->features |= NETIF_F_LRO; /* make sure the EEPROM is good */ diff --git a/drivers/net/ixgbe/ixgbe_type.h b/drivers/net/ixgbe/ixgbe_type.h index fa87309dc08..be90eb4575f 100644 --- a/drivers/net/ixgbe/ixgbe_type.h +++ b/drivers/net/ixgbe/ixgbe_type.h @@ -718,6 +718,12 @@ #define IXGBE_ECC_STATUS_82599 0x110E0 #define IXGBE_BAR_CTRL_82599 0x110F4 +/* PCI Express Control */ +#define IXGBE_GCR_CMPL_TMOUT_MASK 0x0000F000 +#define IXGBE_GCR_CMPL_TMOUT_10ms 0x00001000 +#define IXGBE_GCR_CMPL_TMOUT_RESEND 0x00010000 +#define IXGBE_GCR_CAP_VER2 0x00040000 + /* Time Sync Registers */ #define IXGBE_TSYNCRXCTL 0x05188 /* Rx Time Sync Control register - RW */ #define IXGBE_TSYNCTXCTL 0x08C00 /* Tx Time Sync Control register - RW */ @@ -1521,6 +1527,7 @@ /* PCI Bus Info */ #define IXGBE_PCI_LINK_STATUS 0xB2 +#define IXGBE_PCI_DEVICE_CONTROL2 0xC8 #define IXGBE_PCI_LINK_WIDTH 0x3F0 #define IXGBE_PCI_LINK_WIDTH_1 0x10 #define IXGBE_PCI_LINK_WIDTH_2 0x20 @@ -1531,6 +1538,7 @@ #define IXGBE_PCI_LINK_SPEED_5000 0x2 #define IXGBE_PCI_HEADER_TYPE_REGISTER 0x0E #define IXGBE_PCI_HEADER_TYPE_MULTIFUNC 0x80 +#define IXGBE_PCI_DEVICE_CONTROL2_16ms 0x0005 /* Number of 100 microseconds we wait for PCI Express master disable */ #define IXGBE_PCI_MASTER_DISABLE_TIMEOUT 800 diff --git a/drivers/net/mlx4/en_tx.c b/drivers/net/mlx4/en_tx.c index 08c43f2ae72..5a88b3f5769 100644 --- a/drivers/net/mlx4/en_tx.c +++ b/drivers/net/mlx4/en_tx.c @@ -249,6 +249,7 @@ static u32 mlx4_en_free_tx_desc(struct mlx4_en_priv *priv, pci_unmap_page(mdev->pdev, (dma_addr_t) be64_to_cpu(data->addr), frag->size, PCI_DMA_TODEVICE); + ++data; } } /* Stamp the freed descriptor */ diff --git a/drivers/net/netxen/netxen_nic_main.c b/drivers/net/netxen/netxen_nic_main.c index 637ac8b89ba..3cd8cfcf627 100644 --- a/drivers/net/netxen/netxen_nic_main.c +++ b/drivers/net/netxen/netxen_nic_main.c @@ -221,7 +221,7 @@ netxen_napi_disable(struct netxen_adapter *adapter) } } -static int nx_set_dma_mask(struct netxen_adapter *adapter, uint8_t revision_id) +static int nx_set_dma_mask(struct netxen_adapter *adapter) { struct pci_dev *pdev = adapter->pdev; uint64_t mask, cmask; @@ -229,19 +229,17 @@ static int nx_set_dma_mask(struct netxen_adapter *adapter, uint8_t revision_id) adapter->pci_using_dac = 0; mask = DMA_BIT_MASK(32); - /* - * Consistent DMA mask is set to 32 bit because it cannot be set to - * 35 bits. For P3 also leave it at 32 bits for now. Only the rings - * come off this pool. - */ cmask = DMA_BIT_MASK(32); + if (NX_IS_REVISION_P2(adapter->ahw.revision_id)) { #ifndef CONFIG_IA64 - if (revision_id >= NX_P3_B0) - mask = DMA_BIT_MASK(39); - else if (revision_id == NX_P2_C1) mask = DMA_BIT_MASK(35); #endif + } else { + mask = DMA_BIT_MASK(39); + cmask = mask; + } + if (pci_set_dma_mask(pdev, mask) == 0 && pci_set_consistent_dma_mask(pdev, cmask) == 0) { adapter->pci_using_dac = 1; @@ -256,7 +254,7 @@ static int nx_update_dma_mask(struct netxen_adapter *adapter) { int change, shift, err; - uint64_t mask, old_mask; + uint64_t mask, old_mask, old_cmask; struct pci_dev *pdev = adapter->pdev; change = 0; @@ -272,14 +270,29 @@ nx_update_dma_mask(struct netxen_adapter *adapter) if (change) { old_mask = pdev->dma_mask; + old_cmask = pdev->dev.coherent_dma_mask; + mask = (1ULL<<(32+shift)) - 1; err = pci_set_dma_mask(pdev, mask); if (err) - return pci_set_dma_mask(pdev, old_mask); + goto err_out; + + if (NX_IS_REVISION_P3(adapter->ahw.revision_id)) { + + err = pci_set_consistent_dma_mask(pdev, mask); + if (err) + goto err_out; + } + dev_info(&pdev->dev, "using %d-bit dma mask\n", 32+shift); } return 0; + +err_out: + pci_set_dma_mask(pdev, old_mask); + pci_set_consistent_dma_mask(pdev, old_cmask); + return err; } static void netxen_check_options(struct netxen_adapter *adapter) @@ -1006,7 +1019,7 @@ netxen_nic_probe(struct pci_dev *pdev, const struct pci_device_id *ent) revision_id = pdev->revision; adapter->ahw.revision_id = revision_id; - err = nx_set_dma_mask(adapter, revision_id); + err = nx_set_dma_mask(adapter); if (err) goto err_out_free_netdev; diff --git a/drivers/net/pcnet32.c b/drivers/net/pcnet32.c index 28368157dac..a646a445fda 100644 --- a/drivers/net/pcnet32.c +++ b/drivers/net/pcnet32.c @@ -1611,8 +1611,11 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) if (pcnet32_dwio_read_csr(ioaddr, 0) == 4 && pcnet32_dwio_check(ioaddr)) { a = &pcnet32_dwio; - } else + } else { + if (pcnet32_debug & NETIF_MSG_PROBE) + printk(KERN_ERR PFX "No access methods\n"); goto err_release_region; + } } chip_version = @@ -1719,7 +1722,9 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) ret = -ENOMEM; goto err_release_region; } - SET_NETDEV_DEV(dev, &pdev->dev); + + if (pdev) + SET_NETDEV_DEV(dev, &pdev->dev); if (pcnet32_debug & NETIF_MSG_PROBE) printk(KERN_INFO PFX "%s at %#3lx,", chipname, ioaddr); @@ -1818,7 +1823,6 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) spin_lock_init(&lp->lock); - SET_NETDEV_DEV(dev, &pdev->dev); lp->name = chipname; lp->shared_irq = shared; lp->tx_ring_size = TX_RING_SIZE; /* default tx ring size */ @@ -1852,12 +1856,6 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) ((cards_found >= MAX_UNITS) || full_duplex[cards_found])) lp->options |= PCNET32_PORT_FD; - if (!a) { - if (pcnet32_debug & NETIF_MSG_PROBE) - printk(KERN_ERR PFX "No access methods\n"); - ret = -ENODEV; - goto err_free_consistent; - } lp->a = *a; /* prior to register_netdev, dev->name is not yet correct */ @@ -1973,14 +1971,13 @@ pcnet32_probe1(unsigned long ioaddr, int shared, struct pci_dev *pdev) return 0; - err_free_ring: +err_free_ring: pcnet32_free_ring(dev); - err_free_consistent: pci_free_consistent(lp->pci_dev, sizeof(*lp->init_block), lp->init_block, lp->init_dma_addr); - err_free_netdev: +err_free_netdev: free_netdev(dev); - err_release_region: +err_release_region: release_region(ioaddr, PCNET32_TOTAL_SIZE); return ret; } @@ -2089,6 +2086,7 @@ static void pcnet32_free_ring(struct net_device *dev) static int pcnet32_open(struct net_device *dev) { struct pcnet32_private *lp = netdev_priv(dev); + struct pci_dev *pdev = lp->pci_dev; unsigned long ioaddr = dev->base_addr; u16 val; int i; @@ -2149,9 +2147,9 @@ static int pcnet32_open(struct net_device *dev) lp->a.write_csr(ioaddr, 124, val); /* Allied Telesyn AT 2700/2701 FX are 100Mbit only and do not negotiate */ - if (lp->pci_dev->subsystem_vendor == PCI_VENDOR_ID_AT && - (lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2700FX || - lp->pci_dev->subsystem_device == PCI_SUBDEVICE_ID_AT_2701FX)) { + if (pdev && pdev->subsystem_vendor == PCI_VENDOR_ID_AT && + (pdev->subsystem_device == PCI_SUBDEVICE_ID_AT_2700FX || + pdev->subsystem_device == PCI_SUBDEVICE_ID_AT_2701FX)) { if (lp->options & PCNET32_PORT_ASEL) { lp->options = PCNET32_PORT_FD | PCNET32_PORT_100; if (netif_msg_link(lp)) diff --git a/drivers/net/ppp_generic.c b/drivers/net/ppp_generic.c index 639d11bc444..cd37d739ac7 100644 --- a/drivers/net/ppp_generic.c +++ b/drivers/net/ppp_generic.c @@ -1384,7 +1384,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) /* create a fragment for each channel */ bits = B; - while (nfree > 0 && len > 0) { + while (len > 0) { list = list->next; if (list == &ppp->channels) { i = 0; @@ -1431,29 +1431,31 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) *otherwise divide it according to the speed *of the channel we are going to transmit on */ - if (pch->speed == 0) { - flen = totlen/nfree ; - if (nbigger > 0) { - flen++; - nbigger--; - } - } else { - flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) / - ((totspeed*totfree)/pch->speed)) - hdrlen; - if (nbigger > 0) { - flen += ((totfree - nzero)*pch->speed)/totspeed; - nbigger -= ((totfree - nzero)*pch->speed)/ + if (nfree > 0) { + if (pch->speed == 0) { + flen = totlen/nfree ; + if (nbigger > 0) { + flen++; + nbigger--; + } + } else { + flen = (((totfree - nzero)*(totlen + hdrlen*totfree)) / + ((totspeed*totfree)/pch->speed)) - hdrlen; + if (nbigger > 0) { + flen += ((totfree - nzero)*pch->speed)/totspeed; + nbigger -= ((totfree - nzero)*pch->speed)/ totspeed; + } } + nfree--; } - nfree--; /* *check if we are on the last channel or *we exceded the lenght of the data to *fragment */ - if ((nfree == 0) || (flen > len)) + if ((nfree <= 0) || (flen > len)) flen = len; /* *it is not worth to tx on slow channels: @@ -1467,7 +1469,7 @@ static int ppp_mp_explode(struct ppp *ppp, struct sk_buff *skb) continue; } - mtu = pch->chan->mtu + 2 - hdrlen; + mtu = pch->chan->mtu - hdrlen; if (mtu < 4) mtu = 4; if (flen > mtu) diff --git a/drivers/net/pppoe.c b/drivers/net/pppoe.c index f0031f1f97e..5f2090233d7 100644 --- a/drivers/net/pppoe.c +++ b/drivers/net/pppoe.c @@ -1063,6 +1063,7 @@ static void *pppoe_seq_next(struct seq_file *seq, void *v, loff_t *pos) else { int hash = hash_item(po->pppoe_pa.sid, po->pppoe_pa.remote); + po = NULL; while (++hash < PPPOE_HASH_SIZE) { po = pn->hash_table[hash]; if (po) diff --git a/drivers/net/pppol2tp.c b/drivers/net/pppol2tp.c index e7935d09c89..e0f9219a0ae 100644 --- a/drivers/net/pppol2tp.c +++ b/drivers/net/pppol2tp.c @@ -2680,6 +2680,7 @@ out_unregister_pppol2tp_proto: static void __exit pppol2tp_exit(void) { unregister_pppox_proto(PX_PROTO_OL2TP); + unregister_pernet_gen_device(pppol2tp_net_id, &pppol2tp_net_ops); proto_unregister(&pppol2tp_sk_proto); } diff --git a/drivers/net/s6gmac.c b/drivers/net/s6gmac.c index 5345e47b35a..4525cbe8dd6 100644 --- a/drivers/net/s6gmac.c +++ b/drivers/net/s6gmac.c @@ -793,7 +793,7 @@ static inline int s6gmac_phy_start(struct net_device *dev) struct s6gmac *pd = netdev_priv(dev); int i = 0; struct phy_device *p = NULL; - while ((!(p = pd->mii.bus->phy_map[i])) && (i < PHY_MAX_ADDR)) + while ((i < PHY_MAX_ADDR) && (!(p = pd->mii.bus->phy_map[i]))) i++; p = phy_connect(dev, dev_name(&p->dev), &s6gmac_adjust_link, 0, PHY_INTERFACE_MODE_RGMII); diff --git a/drivers/net/sky2.c b/drivers/net/sky2.c index 3550c5dcd93..0a551d8f5d9 100644 --- a/drivers/net/sky2.c +++ b/drivers/net/sky2.c @@ -1488,6 +1488,8 @@ static int sky2_up(struct net_device *dev) sky2_set_vlan_mode(hw, port, sky2->vlgrp != NULL); #endif + sky2->restarting = 0; + err = sky2_rx_start(sky2); if (err) goto err_out; @@ -1500,6 +1502,9 @@ static int sky2_up(struct net_device *dev) sky2_set_multicast(dev); + /* wake queue incase we are restarting */ + netif_wake_queue(dev); + if (netif_msg_ifup(sky2)) printk(KERN_INFO PFX "%s: enabling interface\n", dev->name); return 0; @@ -1533,6 +1538,8 @@ static inline int tx_dist(unsigned tail, unsigned head) /* Number of list elements available for next tx */ static inline int tx_avail(const struct sky2_port *sky2) { + if (unlikely(sky2->restarting)) + return 0; return sky2->tx_pending - tx_dist(sky2->tx_cons, sky2->tx_prod); } @@ -1818,6 +1825,10 @@ static int sky2_down(struct net_device *dev) if (netif_msg_ifdown(sky2)) printk(KERN_INFO PFX "%s: disabling interface\n", dev->name); + /* explicitly shut off tx incase we're restarting */ + sky2->restarting = 1; + netif_tx_disable(dev); + /* Force flow control off */ sky2_write8(hw, SK_REG(port, GMAC_CTRL), GMC_PAUSE_OFF); @@ -2359,7 +2370,7 @@ static inline void sky2_tx_done(struct net_device *dev, u16 last) { struct sky2_port *sky2 = netdev_priv(dev); - if (netif_running(dev)) { + if (likely(netif_running(dev) && !sky2->restarting)) { netif_tx_lock(dev); sky2_tx_complete(sky2, last); netif_tx_unlock(dev); @@ -4283,6 +4294,7 @@ static __devinit struct net_device *sky2_init_netdev(struct sky2_hw *hw, spin_lock_init(&sky2->phy_lock); sky2->tx_pending = TX_DEF_PENDING; sky2->rx_pending = RX_DEF_PENDING; + sky2->restarting = 0; hw->dev[port] = dev; diff --git a/drivers/net/sky2.h b/drivers/net/sky2.h index b5549c9e510..4486b066b43 100644 --- a/drivers/net/sky2.h +++ b/drivers/net/sky2.h @@ -2051,6 +2051,7 @@ struct sky2_port { u8 duplex; /* DUPLEX_HALF, DUPLEX_FULL */ u8 rx_csum; u8 wol; + u8 restarting; enum flow_control flow_mode; enum flow_control flow_status; diff --git a/drivers/net/tulip/de4x5.c b/drivers/net/tulip/de4x5.c index eb72d2e9ab3..acfdccd4456 100644 --- a/drivers/net/tulip/de4x5.c +++ b/drivers/net/tulip/de4x5.c @@ -5059,7 +5059,7 @@ mii_get_phy(struct net_device *dev) if ((id == 0) || (id == 65535)) continue; /* Valid ID? */ for (j=0; j<limit; j++) { /* Search PHY table */ if (id != phy_info[j].id) continue; /* ID match? */ - for (k=0; lp->phy[k].id && (k < DE4X5_MAX_PHY); k++); + for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++); if (k < DE4X5_MAX_PHY) { memcpy((char *)&lp->phy[k], (char *)&phy_info[j], sizeof(struct phy_table)); @@ -5072,7 +5072,7 @@ mii_get_phy(struct net_device *dev) break; } if ((j == limit) && (i < DE4X5_MAX_MII)) { - for (k=0; lp->phy[k].id && (k < DE4X5_MAX_PHY); k++); + for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++); lp->phy[k].addr = i; lp->phy[k].id = id; lp->phy[k].spd.reg = GENERIC_REG; /* ANLPA register */ @@ -5091,7 +5091,7 @@ mii_get_phy(struct net_device *dev) purgatory: lp->active = 0; if (lp->phy[0].id) { /* Reset the PHY devices */ - for (k=0; lp->phy[k].id && (k < DE4X5_MAX_PHY); k++) { /*For each PHY*/ + for (k=0; k < DE4X5_MAX_PHY && lp->phy[k].id; k++) { /*For each PHY*/ mii_wr(MII_CR_RST, MII_CR, lp->phy[k].addr, DE4X5_MII); while (mii_rd(MII_CR, lp->phy[k].addr, DE4X5_MII) & MII_CR_RST); diff --git a/drivers/net/wireless/airo.c b/drivers/net/wireless/airo.c index c70604f0329..8ce5e4cee16 100644 --- a/drivers/net/wireless/airo.c +++ b/drivers/net/wireless/airo.c @@ -5918,20 +5918,19 @@ static int airo_set_essid(struct net_device *dev, readSsidRid(local, &SSID_rid); /* Check if we asked for `any' */ - if(dwrq->flags == 0) { + if (dwrq->flags == 0) { /* Just send an empty SSID list */ memset(&SSID_rid, 0, sizeof(SSID_rid)); } else { - int index = (dwrq->flags & IW_ENCODE_INDEX) - 1; + unsigned index = (dwrq->flags & IW_ENCODE_INDEX) - 1; /* Check the size of the string */ - if(dwrq->length > IW_ESSID_MAX_SIZE) { + if (dwrq->length > IW_ESSID_MAX_SIZE) return -E2BIG ; - } + /* Check if index is valid */ - if((index < 0) || (index >= 4)) { + if (index >= ARRAY_SIZE(SSID_rid.ssids)) return -EINVAL; - } /* Set the SSID */ memset(SSID_rid.ssids[index].ssid, 0, @@ -6819,7 +6818,7 @@ static int airo_set_txpow(struct net_device *dev, return -EINVAL; } clear_bit (FLAG_RADIO_OFF, &local->flags); - for (i = 0; cap_rid.txPowerLevels[i] && (i < 8); i++) + for (i = 0; i < 8 && cap_rid.txPowerLevels[i]; i++) if (v == cap_rid.txPowerLevels[i]) { readConfigRid(local, 1); local->config.txPower = v; diff --git a/drivers/net/wireless/ath/ath9k/eeprom.c b/drivers/net/wireless/ath/ath9k/eeprom.c index a2fda702b62..ce0e86c36a8 100644 --- a/drivers/net/wireless/ath/ath9k/eeprom.c +++ b/drivers/net/wireless/ath/ath9k/eeprom.c @@ -460,7 +460,7 @@ static int ath9k_hw_4k_check_eeprom(struct ath_hw *ah) integer = swab32(eep->modalHeader.antCtrlCommon); eep->modalHeader.antCtrlCommon = integer; - for (i = 0; i < AR5416_MAX_CHAINS; i++) { + for (i = 0; i < AR5416_EEP4K_MAX_CHAINS; i++) { integer = swab32(eep->modalHeader.antCtrlChain[i]); eep->modalHeader.antCtrlChain[i] = integer; } @@ -914,7 +914,7 @@ static void ath9k_hw_set_4k_power_per_rate_table(struct ath_hw *ah, ctlMode, numCtlModes, isHt40CtlMode, (pCtlMode[ctlMode] & EXT_ADDITIVE)); - for (i = 0; (i < AR5416_NUM_CTLS) && + for (i = 0; (i < AR5416_EEP4K_NUM_CTLS) && pEepData->ctlIndex[i]; i++) { DPRINTF(ah->ah_sc, ATH_DBG_EEPROM, " LOOP-Ctlidx %d: cfgCtl 0x%2.2x " diff --git a/drivers/net/wireless/iwlwifi/iwl-3945.h b/drivers/net/wireless/iwlwifi/iwl-3945.h index fbb3a573463..2de6471d4be 100644 --- a/drivers/net/wireless/iwlwifi/iwl-3945.h +++ b/drivers/net/wireless/iwlwifi/iwl-3945.h @@ -112,7 +112,7 @@ enum iwl3945_antenna { #define IWL_TX_FIFO_NONE 7 /* Minimum number of queues. MAX_NUM is defined in hw specific files */ -#define IWL_MIN_NUM_QUEUES 4 +#define IWL39_MIN_NUM_QUEUES 4 #define IEEE80211_DATA_LEN 2304 #define IEEE80211_4ADDR_LEN 30 diff --git a/drivers/net/wireless/iwlwifi/iwl-core.c b/drivers/net/wireless/iwlwifi/iwl-core.c index 6ab07165ea2..18b135f510e 100644 --- a/drivers/net/wireless/iwlwifi/iwl-core.c +++ b/drivers/net/wireless/iwlwifi/iwl-core.c @@ -1332,6 +1332,9 @@ int iwl_setup_mac(struct iwl_priv *priv) hw->wiphy->custom_regulatory = true; + /* Firmware does not support this */ + hw->wiphy->disable_beacon_hints = true; + hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX; /* we create the 802.11 header and a zero-length SSID element */ hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2; diff --git a/drivers/net/wireless/iwlwifi/iwl-debugfs.c b/drivers/net/wireless/iwlwifi/iwl-debugfs.c index 11e08c06891..ca00cc8ad4c 100644 --- a/drivers/net/wireless/iwlwifi/iwl-debugfs.c +++ b/drivers/net/wireless/iwlwifi/iwl-debugfs.c @@ -308,18 +308,18 @@ static ssize_t iwl_dbgfs_nvm_read(struct file *file, return -ENODATA; } + ptr = priv->eeprom; + if (!ptr) { + IWL_ERR(priv, "Invalid EEPROM/OTP memory\n"); + return -ENOMEM; + } + /* 4 characters for byte 0xYY */ buf = kzalloc(buf_size, GFP_KERNEL); if (!buf) { IWL_ERR(priv, "Can not allocate Buffer\n"); return -ENOMEM; } - - ptr = priv->eeprom; - if (!ptr) { - IWL_ERR(priv, "Invalid EEPROM/OTP memory\n"); - return -ENOMEM; - } pos += scnprintf(buf + pos, buf_size - pos, "NVM Type: %s\n", (priv->nvm_device_type == NVM_DEVICE_TYPE_OTP) ? "OTP" : "EEPROM"); diff --git a/drivers/net/wireless/iwlwifi/iwl-dev.h b/drivers/net/wireless/iwlwifi/iwl-dev.h index e2d620f0b6e..650e20af20f 100644 --- a/drivers/net/wireless/iwlwifi/iwl-dev.h +++ b/drivers/net/wireless/iwlwifi/iwl-dev.h @@ -258,8 +258,10 @@ struct iwl_channel_info { #define IWL_TX_FIFO_HCCA_2 6 #define IWL_TX_FIFO_NONE 7 -/* Minimum number of queues. MAX_NUM is defined in hw specific files */ -#define IWL_MIN_NUM_QUEUES 4 +/* Minimum number of queues. MAX_NUM is defined in hw specific files. + * Set the minimum to accommodate the 4 standard TX queues, 1 command + * queue, 2 (unused) HCCA queues, and 4 HT queues (one for each AC) */ +#define IWL_MIN_NUM_QUEUES 10 /* Power management (not Tx power) structures */ diff --git a/drivers/net/wireless/iwlwifi/iwl-sta.c b/drivers/net/wireless/iwlwifi/iwl-sta.c index 2addf735b19..ffd5c61a755 100644 --- a/drivers/net/wireless/iwlwifi/iwl-sta.c +++ b/drivers/net/wireless/iwlwifi/iwl-sta.c @@ -566,6 +566,8 @@ int iwl_remove_default_wep_key(struct iwl_priv *priv, unsigned long flags; spin_lock_irqsave(&priv->sta_lock, flags); + IWL_DEBUG_WEP(priv, "Removing default WEP key: idx=%d\n", + keyconf->keyidx); if (!test_and_clear_bit(keyconf->keyidx, &priv->ucode_key_table)) IWL_ERR(priv, "index %d not used in uCode key table.\n", @@ -573,6 +575,11 @@ int iwl_remove_default_wep_key(struct iwl_priv *priv, priv->default_wep_key--; memset(&priv->wep_keys[keyconf->keyidx], 0, sizeof(priv->wep_keys[0])); + if (iwl_is_rfkill(priv)) { + IWL_DEBUG_WEP(priv, "Not sending REPLY_WEPKEY command due to RFKILL.\n"); + spin_unlock_irqrestore(&priv->sta_lock, flags); + return 0; + } ret = iwl_send_static_wepkey_cmd(priv, 1); IWL_DEBUG_WEP(priv, "Remove default WEP key: idx=%d ret=%d\n", keyconf->keyidx, ret); @@ -853,6 +860,11 @@ int iwl_remove_dynamic_key(struct iwl_priv *priv, priv->stations[sta_id].sta.sta.modify_mask = STA_MODIFY_KEY_MASK; priv->stations[sta_id].sta.mode = STA_CONTROL_MODIFY_MSK; + if (iwl_is_rfkill(priv)) { + IWL_DEBUG_WEP(priv, "Not sending REPLY_ADD_STA command because RFKILL enabled. \n"); + spin_unlock_irqrestore(&priv->sta_lock, flags); + return 0; + } ret = iwl_send_add_sta(priv, &priv->stations[sta_id].sta, CMD_ASYNC); spin_unlock_irqrestore(&priv->sta_lock, flags); return ret; diff --git a/drivers/net/wireless/iwlwifi/iwl-tx.c b/drivers/net/wireless/iwlwifi/iwl-tx.c index 9bbeec9427f..2e89040e63b 100644 --- a/drivers/net/wireless/iwlwifi/iwl-tx.c +++ b/drivers/net/wireless/iwlwifi/iwl-tx.c @@ -720,8 +720,6 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) goto drop_unlock; } - spin_unlock_irqrestore(&priv->lock, flags); - hdr_len = ieee80211_hdrlen(fc); /* Find (or create) index into station table for destination station */ @@ -729,7 +727,7 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) if (sta_id == IWL_INVALID_STATION) { IWL_DEBUG_DROP(priv, "Dropping - INVALID STATION: %pM\n", hdr->addr1); - goto drop; + goto drop_unlock; } IWL_DEBUG_TX(priv, "station Id %d\n", sta_id); @@ -750,14 +748,17 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) txq_id = priv->stations[sta_id].tid[tid].agg.txq_id; swq_id = iwl_virtual_agg_queue_num(swq_id, txq_id); } - priv->stations[sta_id].tid[tid].tfds_in_queue++; } txq = &priv->txq[txq_id]; q = &txq->q; txq->swq_id = swq_id; - spin_lock_irqsave(&priv->lock, flags); + if (unlikely(iwl_queue_space(q) < q->high_mark)) + goto drop_unlock; + + if (ieee80211_is_data_qos(fc)) + priv->stations[sta_id].tid[tid].tfds_in_queue++; /* Set up driver data for this TFD */ memset(&(txq->txb[q->write_ptr]), 0, sizeof(struct iwl_tx_info)); @@ -902,7 +903,6 @@ int iwl_tx_skb(struct iwl_priv *priv, struct sk_buff *skb) drop_unlock: spin_unlock_irqrestore(&priv->lock, flags); -drop: return -1; } EXPORT_SYMBOL(iwl_tx_skb); @@ -1171,6 +1171,8 @@ int iwl_tx_agg_start(struct iwl_priv *priv, const u8 *ra, u16 tid, u16 *ssn) IWL_ERR(priv, "Start AGG on invalid station\n"); return -ENXIO; } + if (unlikely(tid >= MAX_TID_COUNT)) + return -EINVAL; if (priv->stations[sta_id].tid[tid].agg.state != IWL_AGG_OFF) { IWL_ERR(priv, "Start AGG when state is not IWL_AGG_OFF !\n"); diff --git a/drivers/net/wireless/iwlwifi/iwl3945-base.c b/drivers/net/wireless/iwlwifi/iwl3945-base.c index 956798f2c80..523843369ca 100644 --- a/drivers/net/wireless/iwlwifi/iwl3945-base.c +++ b/drivers/net/wireless/iwlwifi/iwl3945-base.c @@ -3968,6 +3968,9 @@ static int iwl3945_setup_mac(struct iwl_priv *priv) hw->wiphy->custom_regulatory = true; + /* Firmware does not support this */ + hw->wiphy->disable_beacon_hints = true; + hw->wiphy->max_scan_ssids = PROBE_OPTION_MAX_3945; /* we create the 802.11 header and a zero-length SSID element */ hw->wiphy->max_scan_ie_len = IWL_MAX_PROBE_REQUEST - 24 - 2; @@ -4018,10 +4021,10 @@ static int iwl3945_pci_probe(struct pci_dev *pdev, const struct pci_device_id *e SET_IEEE80211_DEV(hw, &pdev->dev); if ((iwl3945_mod_params.num_of_queues > IWL39_MAX_NUM_QUEUES) || - (iwl3945_mod_params.num_of_queues < IWL_MIN_NUM_QUEUES)) { + (iwl3945_mod_params.num_of_queues < IWL39_MIN_NUM_QUEUES)) { IWL_ERR(priv, "invalid queues_num, should be between %d and %d\n", - IWL_MIN_NUM_QUEUES, IWL39_MAX_NUM_QUEUES); + IWL39_MIN_NUM_QUEUES, IWL39_MAX_NUM_QUEUES); err = -EINVAL; goto out_ieee80211_free_hw; } diff --git a/drivers/net/wireless/iwmc3200wifi/commands.c b/drivers/net/wireless/iwmc3200wifi/commands.c index 834a7f544e5..e2334d12359 100644 --- a/drivers/net/wireless/iwmc3200wifi/commands.c +++ b/drivers/net/wireless/iwmc3200wifi/commands.c @@ -220,6 +220,7 @@ int iwm_store_rxiq_calib_result(struct iwm_priv *iwm) eeprom_rxiq = iwm_eeprom_access(iwm, IWM_EEPROM_CALIB_RXIQ); if (IS_ERR(eeprom_rxiq)) { IWM_ERR(iwm, "Couldn't access EEPROM RX IQ entry\n"); + kfree(rxiq); return PTR_ERR(eeprom_rxiq); } diff --git a/drivers/net/wireless/iwmc3200wifi/netdev.c b/drivers/net/wireless/iwmc3200wifi/netdev.c index aea5ccf24cc..bf294e41753 100644 --- a/drivers/net/wireless/iwmc3200wifi/netdev.c +++ b/drivers/net/wireless/iwmc3200wifi/netdev.c @@ -106,10 +106,8 @@ void *iwm_if_alloc(int sizeof_bus, struct device *dev, int ret = 0; wdev = iwm_wdev_alloc(sizeof_bus, dev); - if (!wdev) { - dev_err(dev, "no memory for wireless device instance\n"); - return ERR_PTR(-ENOMEM); - } + if (IS_ERR(wdev)) + return wdev; iwm = wdev_to_iwm(wdev); iwm->bus_ops = if_ops; diff --git a/drivers/net/wireless/libertas/11d.c b/drivers/net/wireless/libertas/11d.c index 9a5408e7d94..5c6968101f0 100644 --- a/drivers/net/wireless/libertas/11d.c +++ b/drivers/net/wireless/libertas/11d.c @@ -47,7 +47,7 @@ static u8 lbs_region_2_code(u8 *region) { u8 i; - for (i = 0; region[i] && i < COUNTRY_CODE_LEN; i++) + for (i = 0; i < COUNTRY_CODE_LEN && region[i]; i++) region[i] = toupper(region[i]); for (i = 0; i < ARRAY_SIZE(region_code_mapping); i++) { diff --git a/drivers/net/wireless/libertas/assoc.c b/drivers/net/wireless/libertas/assoc.c index b9b37411903..d6997371c27 100644 --- a/drivers/net/wireless/libertas/assoc.c +++ b/drivers/net/wireless/libertas/assoc.c @@ -1,6 +1,7 @@ /* Copyright (C) 2006, Red Hat, Inc. */ #include <linux/types.h> +#include <linux/kernel.h> #include <linux/etherdevice.h> #include <linux/ieee80211.h> #include <linux/if_arp.h> @@ -43,21 +44,21 @@ static int get_common_rates(struct lbs_private *priv, u16 *rates_size) { u8 *card_rates = lbs_bg_rates; - size_t num_card_rates = sizeof(lbs_bg_rates); int ret = 0, i, j; - u8 tmp[30]; + u8 tmp[(ARRAY_SIZE(lbs_bg_rates) - 1) * (*rates_size - 1)]; size_t tmp_size = 0; /* For each rate in card_rates that exists in rate1, copy to tmp */ - for (i = 0; card_rates[i] && (i < num_card_rates); i++) { - for (j = 0; rates[j] && (j < *rates_size); j++) { + for (i = 0; i < ARRAY_SIZE(lbs_bg_rates) && card_rates[i]; i++) { + for (j = 0; j < *rates_size && rates[j]; j++) { if (rates[j] == card_rates[i]) tmp[tmp_size++] = card_rates[i]; } } lbs_deb_hex(LBS_DEB_JOIN, "AP rates ", rates, *rates_size); - lbs_deb_hex(LBS_DEB_JOIN, "card rates ", card_rates, num_card_rates); + lbs_deb_hex(LBS_DEB_JOIN, "card rates ", card_rates, + ARRAY_SIZE(lbs_bg_rates)); lbs_deb_hex(LBS_DEB_JOIN, "common rates", tmp, tmp_size); lbs_deb_join("TX data rate 0x%02x\n", priv->cur_rate); @@ -69,10 +70,7 @@ static int get_common_rates(struct lbs_private *priv, lbs_pr_alert("Previously set fixed data rate %#x isn't " "compatible with the network.\n", priv->cur_rate); ret = -1; - goto done; } - ret = 0; - done: memset(rates, 0, *rates_size); *rates_size = min_t(int, tmp_size, *rates_size); @@ -322,7 +320,7 @@ static int lbs_associate(struct lbs_private *priv, rates = (struct mrvl_ie_rates_param_set *) pos; rates->header.type = cpu_to_le16(TLV_TYPE_RATES); memcpy(&rates->rates, &bss->rates, MAX_RATES); - tmplen = MAX_RATES; + tmplen = min_t(u16, ARRAY_SIZE(rates->rates), MAX_RATES); if (get_common_rates(priv, rates->rates, &tmplen)) { ret = -1; goto done; @@ -598,7 +596,7 @@ static int lbs_adhoc_join(struct lbs_private *priv, /* Copy Data rates from the rates recorded in scan response */ memset(cmd.bss.rates, 0, sizeof(cmd.bss.rates)); - ratesize = min_t(u16, sizeof(cmd.bss.rates), MAX_RATES); + ratesize = min_t(u16, ARRAY_SIZE(cmd.bss.rates), MAX_RATES); memcpy(cmd.bss.rates, bss->rates, ratesize); if (get_common_rates(priv, cmd.bss.rates, &ratesize)) { lbs_deb_join("ADHOC_JOIN: get_common_rates returned error.\n"); diff --git a/drivers/net/wireless/libertas/scan.c b/drivers/net/wireless/libertas/scan.c index 601b5424967..6c95af3023c 100644 --- a/drivers/net/wireless/libertas/scan.c +++ b/drivers/net/wireless/libertas/scan.c @@ -5,6 +5,7 @@ * for sending scan commands to the firmware. */ #include <linux/types.h> +#include <linux/kernel.h> #include <linux/etherdevice.h> #include <linux/if_arp.h> #include <asm/unaligned.h> @@ -876,7 +877,7 @@ static inline char *lbs_translate_scan(struct lbs_private *priv, iwe.u.bitrate.disabled = 0; iwe.u.bitrate.value = 0; - for (j = 0; bss->rates[j] && (j < sizeof(bss->rates)); j++) { + for (j = 0; j < ARRAY_SIZE(bss->rates) && bss->rates[j]; j++) { /* Bit rate given in 500 kb/s units */ iwe.u.bitrate.value = bss->rates[j] * 500000; current_val = iwe_stream_add_value(info, start, current_val, diff --git a/drivers/net/wireless/zd1211rw/zd_mac.c b/drivers/net/wireless/zd1211rw/zd_mac.c index 40b07b98822..3bd3c779fff 100644 --- a/drivers/net/wireless/zd1211rw/zd_mac.c +++ b/drivers/net/wireless/zd1211rw/zd_mac.c @@ -698,7 +698,7 @@ int zd_mac_rx(struct ieee80211_hw *hw, const u8 *buffer, unsigned int length) && !mac->pass_ctrl) return 0; - fc = *(__le16 *)buffer; + fc = get_unaligned((__le16*)buffer); need_padding = ieee80211_is_data_qos(fc) ^ ieee80211_has_a4(fc); skb = dev_alloc_skb(length + (need_padding ? 2 : 0)); diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index 0f0e0b919ef..a45b0c0d574 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -70,7 +70,6 @@ #undef CCIO_COLLECT_STATS #endif -#include <linux/proc_fs.h> #include <asm/runway.h> /* for proc_runway_root */ #ifdef DEBUG_CCIO_INIT diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index c590974e981..d69bde6a234 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -614,7 +614,7 @@ dino_fixup_bus(struct pci_bus *bus) dev_name(&bus->self->dev), i, bus->self->resource[i].start, bus->self->resource[i].end); - pci_assign_resource(bus->self, i); + WARN_ON(pci_assign_resource(bus->self, i)); DBG("DEBUG %s after assign %d [0x%lx,0x%lx]\n", dev_name(&bus->self->dev), i, bus->self->resource[i].start, diff --git a/drivers/parisc/eisa_eeprom.c b/drivers/parisc/eisa_eeprom.c index 685d94e69d4..8c0b26e9b98 100644 --- a/drivers/parisc/eisa_eeprom.c +++ b/drivers/parisc/eisa_eeprom.c @@ -55,7 +55,7 @@ static ssize_t eisa_eeprom_read(struct file * file, ssize_t ret; int i; - if (*ppos >= HPEE_MAX_LENGTH) + if (*ppos < 0 || *ppos >= HPEE_MAX_LENGTH) return 0; count = *ppos + count < HPEE_MAX_LENGTH ? count : HPEE_MAX_LENGTH - *ppos; diff --git a/drivers/parisc/hppb.c b/drivers/parisc/hppb.c index 13856415b43..815db175d42 100644 --- a/drivers/parisc/hppb.c +++ b/drivers/parisc/hppb.c @@ -62,7 +62,8 @@ static int hppb_probe(struct parisc_device *dev) } card = card->next; } - printk(KERN_INFO "Found GeckoBoa at 0x%x\n", dev->hpa.start); + printk(KERN_INFO "Found GeckoBoa at 0x%llx\n", + (unsigned long long) dev->hpa.start); card->hpa = dev->hpa.start; card->mmio_region.name = "HP-PB Bus"; @@ -73,8 +74,10 @@ static int hppb_probe(struct parisc_device *dev) status = ccio_request_resource(dev, &card->mmio_region); if(status < 0) { - printk(KERN_ERR "%s: failed to claim HP-PB bus space (%08x, %08x)\n", - __FILE__, card->mmio_region.start, card->mmio_region.end); + printk(KERN_ERR "%s: failed to claim HP-PB " + "bus space (0x%08llx, 0x%08llx)\n", + __FILE__, (unsigned long long) card->mmio_region.start, + (unsigned long long) card->mmio_region.end); } return 0; diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c index ede614616f8..3aeb3279c92 100644 --- a/drivers/parisc/lba_pci.c +++ b/drivers/parisc/lba_pci.c @@ -992,7 +992,7 @@ lba_pat_resources(struct parisc_device *pa_dev, struct lba_device *lba_dev) return; io_pdc_cell = kzalloc(sizeof(pdc_pat_cell_mod_maddr_block_t), GFP_KERNEL); - if (!pa_pdc_cell) { + if (!io_pdc_cell) { kfree(pa_pdc_cell); return; } diff --git a/drivers/parisc/pdc_stable.c b/drivers/parisc/pdc_stable.c index f9f9a5f1bbd..13a64bc081b 100644 --- a/drivers/parisc/pdc_stable.c +++ b/drivers/parisc/pdc_stable.c @@ -370,7 +370,7 @@ pdcspath_layer_read(struct pdcspath_entry *entry, char *buf) if (!i) /* entry is not ready */ return -ENODATA; - for (i = 0; devpath->layers[i] && (likely(i < 6)); i++) + for (i = 0; i < 6 && devpath->layers[i]; i++) out += sprintf(out, "%u ", devpath->layers[i]); out += sprintf(out, "\n"); diff --git a/drivers/pci/setup-res.c b/drivers/pci/setup-res.c index b711fb7181e..1898c7b4790 100644 --- a/drivers/pci/setup-res.c +++ b/drivers/pci/setup-res.c @@ -100,16 +100,16 @@ int pci_claim_resource(struct pci_dev *dev, int resource) { struct resource *res = &dev->resource[resource]; struct resource *root; - char *dtype = resource < PCI_BRIDGE_RESOURCES ? "device" : "bridge"; int err; root = pci_find_parent_resource(dev, res); err = -EINVAL; if (root != NULL) - err = insert_resource(root, res); + err = request_resource(root, res); if (err) { + const char *dtype = resource < PCI_BRIDGE_RESOURCES ? "device" : "bridge"; dev_err(&dev->dev, "BAR %d: %s of %s %pR\n", resource, root ? "address space collision on" : diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig index 46dad12f952..77c6097ced8 100644 --- a/drivers/platform/x86/Kconfig +++ b/drivers/platform/x86/Kconfig @@ -277,31 +277,6 @@ config THINKPAD_ACPI_UNSAFE_LEDS Say N here, unless you are building a kernel for your own use, and need to control the important firmware LEDs. -config THINKPAD_ACPI_DOCK - bool "Legacy Docking Station Support" - depends on THINKPAD_ACPI - depends on ACPI_DOCK=n - default n - ---help--- - Allows the thinkpad_acpi driver to handle docking station events. - This support was made obsolete by the generic ACPI docking station - support (CONFIG_ACPI_DOCK). It will allow locking and removing the - laptop from the docking station, but will not properly connect PCI - devices. - - If you are not sure, say N here. - -config THINKPAD_ACPI_BAY - bool "Legacy Removable Bay Support" - depends on THINKPAD_ACPI - default y - ---help--- - Allows the thinkpad_acpi driver to handle removable bays. It will - electrically disable the device in the bay, and also generate - notifications when the bay lever is ejected or inserted. - - If you are not sure, say Y here. - config THINKPAD_ACPI_VIDEO bool "Video output control support" depends on THINKPAD_ACPI diff --git a/drivers/platform/x86/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c index ec560f16d72..222ffb892f2 100644 --- a/drivers/platform/x86/eeepc-laptop.c +++ b/drivers/platform/x86/eeepc-laptop.c @@ -143,6 +143,7 @@ struct eeepc_hotk { struct rfkill *bluetooth_rfkill; struct rfkill *wwan3g_rfkill; struct hotplug_slot *hotplug_slot; + struct work_struct hotplug_work; }; /* The actual device the driver binds to */ @@ -660,7 +661,7 @@ static int eeepc_get_adapter_status(struct hotplug_slot *hotplug_slot, return 0; } -static void eeepc_rfkill_hotplug(void) +static void eeepc_hotplug_work(struct work_struct *work) { struct pci_dev *dev; struct pci_bus *bus = pci_find_bus(0, 1); @@ -701,7 +702,7 @@ static void eeepc_rfkill_notify(acpi_handle handle, u32 event, void *data) if (event != ACPI_NOTIFY_BUS_CHECK) return; - eeepc_rfkill_hotplug(); + schedule_work(&ehotk->hotplug_work); } static void eeepc_hotk_notify(struct acpi_device *device, u32 event) @@ -892,7 +893,7 @@ static int eeepc_hotk_resume(struct acpi_device *device) rfkill_set_sw_state(ehotk->wlan_rfkill, wlan != 1); - eeepc_rfkill_hotplug(); + schedule_work(&ehotk->hotplug_work); } if (ehotk->bluetooth_rfkill) @@ -1093,6 +1094,8 @@ static int eeepc_rfkill_init(struct device *dev) { int result = 0; + INIT_WORK(&ehotk->hotplug_work, eeepc_hotplug_work); + eeepc_register_rfkill_notifier("\\_SB.PCI0.P0P6"); eeepc_register_rfkill_notifier("\\_SB.PCI0.P0P7"); diff --git a/drivers/platform/x86/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index ca508564a18..a2ad53e1587 100644 --- a/drivers/platform/x86/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c @@ -520,11 +520,13 @@ static int hp_wmi_resume_handler(struct platform_device *device) * the input layer will only actually pass it on if the state * changed. */ - - input_report_switch(hp_wmi_input_dev, SW_DOCK, hp_wmi_dock_state()); - input_report_switch(hp_wmi_input_dev, SW_TABLET_MODE, - hp_wmi_tablet_state()); - input_sync(hp_wmi_input_dev); + if (hp_wmi_input_dev) { + input_report_switch(hp_wmi_input_dev, SW_DOCK, + hp_wmi_dock_state()); + input_report_switch(hp_wmi_input_dev, SW_TABLET_MODE, + hp_wmi_tablet_state()); + input_sync(hp_wmi_input_dev); + } return 0; } diff --git a/drivers/platform/x86/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index a463fd72c49..e8560085250 100644 --- a/drivers/platform/x86/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -239,12 +239,6 @@ struct ibm_init_struct { }; static struct { -#ifdef CONFIG_THINKPAD_ACPI_BAY - u32 bay_status:1; - u32 bay_eject:1; - u32 bay_status2:1; - u32 bay_eject2:1; -#endif u32 bluetooth:1; u32 hotkey:1; u32 hotkey_mask:1; @@ -589,18 +583,6 @@ static int acpi_ec_write(int i, u8 v) return 1; } -#if defined(CONFIG_THINKPAD_ACPI_DOCK) || defined(CONFIG_THINKPAD_ACPI_BAY) -static int _sta(acpi_handle handle) -{ - int status; - - if (!handle || !acpi_evalf(handle, &status, "_STA", "d")) - status = 0; - - return status; -} -#endif - static int issue_thinkpad_cmos_command(int cmos_cmd) { if (!cmos_handle) @@ -784,6 +766,8 @@ static int dispatch_procfs_write(struct file *file, if (!ibm || !ibm->write) return -EINVAL; + if (count > PAGE_SIZE - 2) + return -EINVAL; kernbuf = kmalloc(count + 2, GFP_KERNEL); if (!kernbuf) @@ -4442,293 +4426,6 @@ static struct ibm_struct light_driver_data = { }; /************************************************************************* - * Dock subdriver - */ - -#ifdef CONFIG_THINKPAD_ACPI_DOCK - -static void dock_notify(struct ibm_struct *ibm, u32 event); -static int dock_read(char *p); -static int dock_write(char *buf); - -TPACPI_HANDLE(dock, root, "\\_SB.GDCK", /* X30, X31, X40 */ - "\\_SB.PCI0.DOCK", /* 600e/x,770e,770x,A2xm/p,T20-22,X20-21 */ - "\\_SB.PCI0.PCI1.DOCK", /* all others */ - "\\_SB.PCI.ISA.SLCE", /* 570 */ - ); /* A21e,G4x,R30,R31,R32,R40,R40e,R50e */ - -/* don't list other alternatives as we install a notify handler on the 570 */ -TPACPI_HANDLE(pci, root, "\\_SB.PCI"); /* 570 */ - -static const struct acpi_device_id ibm_pci_device_ids[] = { - {PCI_ROOT_HID_STRING, 0}, - {"", 0}, -}; - -static struct tp_acpi_drv_struct ibm_dock_acpidriver[2] = { - { - .notify = dock_notify, - .handle = &dock_handle, - .type = ACPI_SYSTEM_NOTIFY, - }, - { - /* THIS ONE MUST NEVER BE USED FOR DRIVER AUTOLOADING. - * We just use it to get notifications of dock hotplug - * in very old thinkpads */ - .hid = ibm_pci_device_ids, - .notify = dock_notify, - .handle = &pci_handle, - .type = ACPI_SYSTEM_NOTIFY, - }, -}; - -static struct ibm_struct dock_driver_data[2] = { - { - .name = "dock", - .read = dock_read, - .write = dock_write, - .acpi = &ibm_dock_acpidriver[0], - }, - { - .name = "dock", - .acpi = &ibm_dock_acpidriver[1], - }, -}; - -#define dock_docked() (_sta(dock_handle) & 1) - -static int __init dock_init(struct ibm_init_struct *iibm) -{ - vdbg_printk(TPACPI_DBG_INIT, "initializing dock subdriver\n"); - - TPACPI_ACPIHANDLE_INIT(dock); - - vdbg_printk(TPACPI_DBG_INIT, "dock is %s\n", - str_supported(dock_handle != NULL)); - - return (dock_handle)? 0 : 1; -} - -static int __init dock_init2(struct ibm_init_struct *iibm) -{ - int dock2_needed; - - vdbg_printk(TPACPI_DBG_INIT, "initializing dock subdriver part 2\n"); - - if (dock_driver_data[0].flags.acpi_driver_registered && - dock_driver_data[0].flags.acpi_notify_installed) { - TPACPI_ACPIHANDLE_INIT(pci); - dock2_needed = (pci_handle != NULL); - vdbg_printk(TPACPI_DBG_INIT, - "dock PCI handler for the TP 570 is %s\n", - str_supported(dock2_needed)); - } else { - vdbg_printk(TPACPI_DBG_INIT, - "dock subdriver part 2 not required\n"); - dock2_needed = 0; - } - - return (dock2_needed)? 0 : 1; -} - -static void dock_notify(struct ibm_struct *ibm, u32 event) -{ - int docked = dock_docked(); - int pci = ibm->acpi->hid && ibm->acpi->device && - acpi_match_device_ids(ibm->acpi->device, ibm_pci_device_ids); - int data; - - if (event == 1 && !pci) /* 570 */ - data = 1; /* button */ - else if (event == 1 && pci) /* 570 */ - data = 3; /* dock */ - else if (event == 3 && docked) - data = 1; /* button */ - else if (event == 3 && !docked) - data = 2; /* undock */ - else if (event == 0 && docked) - data = 3; /* dock */ - else { - printk(TPACPI_ERR "unknown dock event %d, status %d\n", - event, _sta(dock_handle)); - data = 0; /* unknown */ - } - acpi_bus_generate_proc_event(ibm->acpi->device, event, data); - acpi_bus_generate_netlink_event(ibm->acpi->device->pnp.device_class, - dev_name(&ibm->acpi->device->dev), - event, data); -} - -static int dock_read(char *p) -{ - int len = 0; - int docked = dock_docked(); - - if (!dock_handle) - len += sprintf(p + len, "status:\t\tnot supported\n"); - else if (!docked) - len += sprintf(p + len, "status:\t\tundocked\n"); - else { - len += sprintf(p + len, "status:\t\tdocked\n"); - len += sprintf(p + len, "commands:\tdock, undock\n"); - } - - return len; -} - -static int dock_write(char *buf) -{ - char *cmd; - - if (!dock_docked()) - return -ENODEV; - - while ((cmd = next_cmd(&buf))) { - if (strlencmp(cmd, "undock") == 0) { - if (!acpi_evalf(dock_handle, NULL, "_DCK", "vd", 0) || - !acpi_evalf(dock_handle, NULL, "_EJ0", "vd", 1)) - return -EIO; - } else if (strlencmp(cmd, "dock") == 0) { - if (!acpi_evalf(dock_handle, NULL, "_DCK", "vd", 1)) - return -EIO; - } else - return -EINVAL; - } - - return 0; -} - -#endif /* CONFIG_THINKPAD_ACPI_DOCK */ - -/************************************************************************* - * Bay subdriver - */ - -#ifdef CONFIG_THINKPAD_ACPI_BAY - -TPACPI_HANDLE(bay, root, "\\_SB.PCI.IDE.SECN.MAST", /* 570 */ - "\\_SB.PCI0.IDE0.IDES.IDSM", /* 600e/x, 770e, 770x */ - "\\_SB.PCI0.SATA.SCND.MSTR", /* T60, X60, Z60 */ - "\\_SB.PCI0.IDE0.SCND.MSTR", /* all others */ - ); /* A21e, R30, R31 */ -TPACPI_HANDLE(bay_ej, bay, "_EJ3", /* 600e/x, A2xm/p, A3x */ - "_EJ0", /* all others */ - ); /* 570,A21e,G4x,R30,R31,R32,R40e,R50e */ -TPACPI_HANDLE(bay2, root, "\\_SB.PCI0.IDE0.PRIM.SLAV", /* A3x, R32 */ - "\\_SB.PCI0.IDE0.IDEP.IDPS", /* 600e/x, 770e, 770x */ - ); /* all others */ -TPACPI_HANDLE(bay2_ej, bay2, "_EJ3", /* 600e/x, 770e, A3x */ - "_EJ0", /* 770x */ - ); /* all others */ - -static int __init bay_init(struct ibm_init_struct *iibm) -{ - vdbg_printk(TPACPI_DBG_INIT, "initializing bay subdriver\n"); - - TPACPI_ACPIHANDLE_INIT(bay); - if (bay_handle) - TPACPI_ACPIHANDLE_INIT(bay_ej); - TPACPI_ACPIHANDLE_INIT(bay2); - if (bay2_handle) - TPACPI_ACPIHANDLE_INIT(bay2_ej); - - tp_features.bay_status = bay_handle && - acpi_evalf(bay_handle, NULL, "_STA", "qv"); - tp_features.bay_status2 = bay2_handle && - acpi_evalf(bay2_handle, NULL, "_STA", "qv"); - - tp_features.bay_eject = bay_handle && bay_ej_handle && - (strlencmp(bay_ej_path, "_EJ0") == 0 || experimental); - tp_features.bay_eject2 = bay2_handle && bay2_ej_handle && - (strlencmp(bay2_ej_path, "_EJ0") == 0 || experimental); - - vdbg_printk(TPACPI_DBG_INIT, - "bay 1: status %s, eject %s; bay 2: status %s, eject %s\n", - str_supported(tp_features.bay_status), - str_supported(tp_features.bay_eject), - str_supported(tp_features.bay_status2), - str_supported(tp_features.bay_eject2)); - - return (tp_features.bay_status || tp_features.bay_eject || - tp_features.bay_status2 || tp_features.bay_eject2)? 0 : 1; -} - -static void bay_notify(struct ibm_struct *ibm, u32 event) -{ - acpi_bus_generate_proc_event(ibm->acpi->device, event, 0); - acpi_bus_generate_netlink_event(ibm->acpi->device->pnp.device_class, - dev_name(&ibm->acpi->device->dev), - event, 0); -} - -#define bay_occupied(b) (_sta(b##_handle) & 1) - -static int bay_read(char *p) -{ - int len = 0; - int occupied = bay_occupied(bay); - int occupied2 = bay_occupied(bay2); - int eject, eject2; - - len += sprintf(p + len, "status:\t\t%s\n", - tp_features.bay_status ? - (occupied ? "occupied" : "unoccupied") : - "not supported"); - if (tp_features.bay_status2) - len += sprintf(p + len, "status2:\t%s\n", occupied2 ? - "occupied" : "unoccupied"); - - eject = tp_features.bay_eject && occupied; - eject2 = tp_features.bay_eject2 && occupied2; - - if (eject && eject2) - len += sprintf(p + len, "commands:\teject, eject2\n"); - else if (eject) - len += sprintf(p + len, "commands:\teject\n"); - else if (eject2) - len += sprintf(p + len, "commands:\teject2\n"); - - return len; -} - -static int bay_write(char *buf) -{ - char *cmd; - - if (!tp_features.bay_eject && !tp_features.bay_eject2) - return -ENODEV; - - while ((cmd = next_cmd(&buf))) { - if (tp_features.bay_eject && strlencmp(cmd, "eject") == 0) { - if (!acpi_evalf(bay_ej_handle, NULL, NULL, "vd", 1)) - return -EIO; - } else if (tp_features.bay_eject2 && - strlencmp(cmd, "eject2") == 0) { - if (!acpi_evalf(bay2_ej_handle, NULL, NULL, "vd", 1)) - return -EIO; - } else - return -EINVAL; - } - - return 0; -} - -static struct tp_acpi_drv_struct ibm_bay_acpidriver = { - .notify = bay_notify, - .handle = &bay_handle, - .type = ACPI_SYSTEM_NOTIFY, -}; - -static struct ibm_struct bay_driver_data = { - .name = "bay", - .read = bay_read, - .write = bay_write, - .acpi = &ibm_bay_acpidriver, -}; - -#endif /* CONFIG_THINKPAD_ACPI_BAY */ - -/************************************************************************* * CMOS subdriver */ @@ -5945,14 +5642,48 @@ static struct backlight_ops ibm_backlight_data = { /* --------------------------------------------------------------------- */ +/* + * These are only useful for models that have only one possibility + * of GPU. If the BIOS model handles both ATI and Intel, don't use + * these quirks. + */ +#define TPACPI_BRGHT_Q_NOEC 0x0001 /* Must NOT use EC HBRV */ +#define TPACPI_BRGHT_Q_EC 0x0002 /* Should or must use EC HBRV */ +#define TPACPI_BRGHT_Q_ASK 0x8000 /* Ask for user report */ + +static const struct tpacpi_quirk brightness_quirk_table[] __initconst = { + /* Models with ATI GPUs known to require ECNVRAM mode */ + TPACPI_Q_IBM('1', 'Y', TPACPI_BRGHT_Q_EC), /* T43/p ATI */ + + /* Models with ATI GPUs (waiting confirmation) */ + TPACPI_Q_IBM('1', 'R', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC), + TPACPI_Q_IBM('1', 'Q', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC), + TPACPI_Q_IBM('7', '6', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC), + TPACPI_Q_IBM('7', '8', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_EC), + + /* Models with Intel Extreme Graphics 2 (waiting confirmation) */ + TPACPI_Q_IBM('1', 'V', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_NOEC), + TPACPI_Q_IBM('1', 'W', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_NOEC), + TPACPI_Q_IBM('1', 'U', TPACPI_BRGHT_Q_ASK|TPACPI_BRGHT_Q_NOEC), + + /* Models with Intel GMA900 */ + TPACPI_Q_IBM('7', '0', TPACPI_BRGHT_Q_NOEC), /* T43, R52 */ + TPACPI_Q_IBM('7', '4', TPACPI_BRGHT_Q_NOEC), /* X41 */ + TPACPI_Q_IBM('7', '5', TPACPI_BRGHT_Q_NOEC), /* X41 Tablet */ +}; + static int __init brightness_init(struct ibm_init_struct *iibm) { int b; + unsigned long quirks; vdbg_printk(TPACPI_DBG_INIT, "initializing brightness subdriver\n"); mutex_init(&brightness_mutex); + quirks = tpacpi_check_quirks(brightness_quirk_table, + ARRAY_SIZE(brightness_quirk_table)); + /* * We always attempt to detect acpi support, so as to switch * Lenovo Vista BIOS to ACPI brightness mode even if we are not @@ -6009,23 +5740,13 @@ static int __init brightness_init(struct ibm_init_struct *iibm) /* TPACPI_BRGHT_MODE_AUTO not implemented yet, just use default */ if (brightness_mode == TPACPI_BRGHT_MODE_AUTO || brightness_mode == TPACPI_BRGHT_MODE_MAX) { - if (thinkpad_id.vendor == PCI_VENDOR_ID_IBM) { - /* - * IBM models that define HBRV probably have - * EC-based backlight level control - */ - if (acpi_evalf(ec_handle, NULL, "HBRV", "qd")) - /* T40-T43, R50-R52, R50e, R51e, X31-X41 */ - brightness_mode = TPACPI_BRGHT_MODE_ECNVRAM; - else - /* all other IBM ThinkPads */ - brightness_mode = TPACPI_BRGHT_MODE_UCMS_STEP; - } else - /* All Lenovo ThinkPads */ + if (quirks & TPACPI_BRGHT_Q_EC) + brightness_mode = TPACPI_BRGHT_MODE_ECNVRAM; + else brightness_mode = TPACPI_BRGHT_MODE_UCMS_STEP; dbg_printk(TPACPI_DBG_BRGHT, - "selected brightness_mode=%d\n", + "driver auto-selected brightness_mode=%d\n", brightness_mode); } @@ -6052,6 +5773,15 @@ static int __init brightness_init(struct ibm_init_struct *iibm) vdbg_printk(TPACPI_DBG_INIT | TPACPI_DBG_BRGHT, "brightness is supported\n"); + if (quirks & TPACPI_BRGHT_Q_ASK) { + printk(TPACPI_NOTICE + "brightness: will use unverified default: " + "brightness_mode=%d\n", brightness_mode); + printk(TPACPI_NOTICE + "brightness: please report to %s whether it works well " + "or not on your ThinkPad\n", TPACPI_MAIL); + } + ibm_backlight_device->props.max_brightness = (tp_features.bright_16levels)? 15 : 7; ibm_backlight_device->props.brightness = b & TP_EC_BACKLIGHT_LVLMSK; @@ -7854,22 +7584,6 @@ static struct ibm_init_struct ibms_init[] __initdata = { .init = light_init, .data = &light_driver_data, }, -#ifdef CONFIG_THINKPAD_ACPI_DOCK - { - .init = dock_init, - .data = &dock_driver_data[0], - }, - { - .init = dock_init2, - .data = &dock_driver_data[1], - }, -#endif -#ifdef CONFIG_THINKPAD_ACPI_BAY - { - .init = bay_init, - .data = &bay_driver_data, - }, -#endif { .init = cmos_init, .data = &cmos_driver_data, @@ -7968,12 +7682,6 @@ TPACPI_PARAM(hotkey); TPACPI_PARAM(bluetooth); TPACPI_PARAM(video); TPACPI_PARAM(light); -#ifdef CONFIG_THINKPAD_ACPI_DOCK -TPACPI_PARAM(dock); -#endif -#ifdef CONFIG_THINKPAD_ACPI_BAY -TPACPI_PARAM(bay); -#endif /* CONFIG_THINKPAD_ACPI_BAY */ TPACPI_PARAM(cmos); TPACPI_PARAM(led); TPACPI_PARAM(beep); diff --git a/drivers/power/Kconfig b/drivers/power/Kconfig index 7eda34838bf..bdbc4f73fcd 100644 --- a/drivers/power/Kconfig +++ b/drivers/power/Kconfig @@ -43,6 +43,13 @@ config BATTERY_DS2760 help Say Y here to enable support for batteries with ds2760 chip. +config BATTERY_DS2782 + tristate "DS2782 standalone gas-gauge" + depends on I2C + help + Say Y here to enable support for the DS2782 standalone battery + gas-gauge. + config BATTERY_PMU tristate "Apple PMU battery" depends on PPC32 && ADB_PMU diff --git a/drivers/power/Makefile b/drivers/power/Makefile index daf3179689a..380d17c9ae2 100644 --- a/drivers/power/Makefile +++ b/drivers/power/Makefile @@ -19,6 +19,7 @@ obj-$(CONFIG_APM_POWER) += apm_power.o obj-$(CONFIG_WM8350_POWER) += wm8350_power.o obj-$(CONFIG_BATTERY_DS2760) += ds2760_battery.o +obj-$(CONFIG_BATTERY_DS2782) += ds2782_battery.o obj-$(CONFIG_BATTERY_PMU) += pmu_battery.o obj-$(CONFIG_BATTERY_OLPC) += olpc_battery.o obj-$(CONFIG_BATTERY_TOSA) += tosa_battery.o diff --git a/drivers/power/ds2782_battery.c b/drivers/power/ds2782_battery.c new file mode 100644 index 00000000000..da14f374cb6 --- /dev/null +++ b/drivers/power/ds2782_battery.c @@ -0,0 +1,330 @@ +/* + * I2C client/driver for the Maxim/Dallas DS2782 Stand-Alone Fuel Gauge IC + * + * Copyright (C) 2009 Bluewater Systems Ltd + * + * Author: Ryan Mallon <ryan@bluewatersys.com> + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/types.h> +#include <linux/errno.h> +#include <linux/swab.h> +#include <linux/i2c.h> +#include <linux/idr.h> +#include <linux/power_supply.h> + +#define DS2782_REG_RARC 0x06 /* Remaining active relative capacity */ + +#define DS2782_REG_VOLT_MSB 0x0c +#define DS2782_REG_TEMP_MSB 0x0a +#define DS2782_REG_CURRENT_MSB 0x0e + +/* EEPROM Block */ +#define DS2782_REG_RSNSP 0x69 /* Sense resistor value */ + +/* Current unit measurement in uA for a 1 milli-ohm sense resistor */ +#define DS2782_CURRENT_UNITS 1563 + +#define to_ds2782_info(x) container_of(x, struct ds2782_info, battery) + +struct ds2782_info { + struct i2c_client *client; + struct power_supply battery; + int id; +}; + +static DEFINE_IDR(battery_id); +static DEFINE_MUTEX(battery_lock); + +static inline int ds2782_read_reg(struct ds2782_info *info, int reg, u8 *val) +{ + int ret; + + ret = i2c_smbus_read_byte_data(info->client, reg); + if (ret < 0) { + dev_err(&info->client->dev, "register read failed\n"); + return ret; + } + + *val = ret; + return 0; +} + +static inline int ds2782_read_reg16(struct ds2782_info *info, int reg_msb, + s16 *val) +{ + int ret; + + ret = swab16(i2c_smbus_read_word_data(info->client, reg_msb)); + if (ret < 0) { + dev_err(&info->client->dev, "register read failed\n"); + return ret; + } + + *val = ret; + return 0; +} + +static int ds2782_get_temp(struct ds2782_info *info, int *temp) +{ + s16 raw; + int err; + + /* + * Temperature is measured in units of 0.125 degrees celcius, the + * power_supply class measures temperature in tenths of degrees + * celsius. The temperature value is stored as a 10 bit number, plus + * sign in the upper bits of a 16 bit register. + */ + err = ds2782_read_reg16(info, DS2782_REG_TEMP_MSB, &raw); + if (err) + return err; + *temp = ((raw / 32) * 125) / 100; + return 0; +} + +static int ds2782_get_current(struct ds2782_info *info, int *current_uA) +{ + int sense_res; + int err; + u8 sense_res_raw; + s16 raw; + + /* + * The units of measurement for current are dependent on the value of + * the sense resistor. + */ + err = ds2782_read_reg(info, DS2782_REG_RSNSP, &sense_res_raw); + if (err) + return err; + if (sense_res_raw == 0) { + dev_err(&info->client->dev, "sense resistor value is 0\n"); + return -ENXIO; + } + sense_res = 1000 / sense_res_raw; + + dev_dbg(&info->client->dev, "sense resistor = %d milli-ohms\n", + sense_res); + err = ds2782_read_reg16(info, DS2782_REG_CURRENT_MSB, &raw); + if (err) + return err; + *current_uA = raw * (DS2782_CURRENT_UNITS / sense_res); + return 0; +} + +static int ds2782_get_voltage(struct ds2782_info *info, int *voltage_uA) +{ + s16 raw; + int err; + + /* + * Voltage is measured in units of 4.88mV. The voltage is stored as + * a 10-bit number plus sign, in the upper bits of a 16-bit register + */ + err = ds2782_read_reg16(info, DS2782_REG_VOLT_MSB, &raw); + if (err) + return err; + *voltage_uA = (raw / 32) * 4800; + return 0; +} + +static int ds2782_get_capacity(struct ds2782_info *info, int *capacity) +{ + int err; + u8 raw; + + err = ds2782_read_reg(info, DS2782_REG_RARC, &raw); + if (err) + return err; + *capacity = raw; + return raw; +} + +static int ds2782_get_status(struct ds2782_info *info, int *status) +{ + int err; + int current_uA; + int capacity; + + err = ds2782_get_current(info, ¤t_uA); + if (err) + return err; + + err = ds2782_get_capacity(info, &capacity); + if (err) + return err; + + if (capacity == 100) + *status = POWER_SUPPLY_STATUS_FULL; + else if (current_uA == 0) + *status = POWER_SUPPLY_STATUS_NOT_CHARGING; + else if (current_uA < 0) + *status = POWER_SUPPLY_STATUS_DISCHARGING; + else + *status = POWER_SUPPLY_STATUS_CHARGING; + + return 0; +} + +static int ds2782_battery_get_property(struct power_supply *psy, + enum power_supply_property prop, + union power_supply_propval *val) +{ + struct ds2782_info *info = to_ds2782_info(psy); + int ret; + + switch (prop) { + case POWER_SUPPLY_PROP_STATUS: + ret = ds2782_get_status(info, &val->intval); + break; + + case POWER_SUPPLY_PROP_CAPACITY: + ret = ds2782_get_capacity(info, &val->intval); + break; + + case POWER_SUPPLY_PROP_VOLTAGE_NOW: + ret = ds2782_get_voltage(info, &val->intval); + break; + + case POWER_SUPPLY_PROP_CURRENT_NOW: + ret = ds2782_get_current(info, &val->intval); + break; + + case POWER_SUPPLY_PROP_TEMP: + ret = ds2782_get_temp(info, &val->intval); + break; + + default: + ret = -EINVAL; + } + + return ret; +} + +static enum power_supply_property ds2782_battery_props[] = { + POWER_SUPPLY_PROP_STATUS, + POWER_SUPPLY_PROP_CAPACITY, + POWER_SUPPLY_PROP_VOLTAGE_NOW, + POWER_SUPPLY_PROP_CURRENT_NOW, + POWER_SUPPLY_PROP_TEMP, +}; + +static void ds2782_power_supply_init(struct power_supply *battery) +{ + battery->type = POWER_SUPPLY_TYPE_BATTERY; + battery->properties = ds2782_battery_props; + battery->num_properties = ARRAY_SIZE(ds2782_battery_props); + battery->get_property = ds2782_battery_get_property; + battery->external_power_changed = NULL; +} + +static int ds2782_battery_remove(struct i2c_client *client) +{ + struct ds2782_info *info = i2c_get_clientdata(client); + + power_supply_unregister(&info->battery); + kfree(info->battery.name); + + mutex_lock(&battery_lock); + idr_remove(&battery_id, info->id); + mutex_unlock(&battery_lock); + + i2c_set_clientdata(client, info); + + kfree(info); + return 0; +} + +static int ds2782_battery_probe(struct i2c_client *client, + const struct i2c_device_id *id) +{ + struct ds2782_info *info; + int ret; + int num; + + /* Get an ID for this battery */ + ret = idr_pre_get(&battery_id, GFP_KERNEL); + if (ret == 0) { + ret = -ENOMEM; + goto fail_id; + } + + mutex_lock(&battery_lock); + ret = idr_get_new(&battery_id, client, &num); + mutex_unlock(&battery_lock); + if (ret < 0) + goto fail_id; + + info = kzalloc(sizeof(*info), GFP_KERNEL); + if (!info) { + ret = -ENOMEM; + goto fail_info; + } + + info->battery.name = kasprintf(GFP_KERNEL, "ds2782-%d", num); + if (!info->battery.name) { + ret = -ENOMEM; + goto fail_name; + } + + i2c_set_clientdata(client, info); + info->client = client; + ds2782_power_supply_init(&info->battery); + + ret = power_supply_register(&client->dev, &info->battery); + if (ret) { + dev_err(&client->dev, "failed to register battery\n"); + goto fail_register; + } + + return 0; + +fail_register: + kfree(info->battery.name); +fail_name: + i2c_set_clientdata(client, info); + kfree(info); +fail_info: + mutex_lock(&battery_lock); + idr_remove(&battery_id, num); + mutex_unlock(&battery_lock); +fail_id: + return ret; +} + +static const struct i2c_device_id ds2782_id[] = { + {"ds2782", 0}, + {}, +}; + +static struct i2c_driver ds2782_battery_driver = { + .driver = { + .name = "ds2782-battery", + }, + .probe = ds2782_battery_probe, + .remove = ds2782_battery_remove, + .id_table = ds2782_id, +}; + +static int __init ds2782_init(void) +{ + return i2c_add_driver(&ds2782_battery_driver); +} +module_init(ds2782_init); + +static void __exit ds2782_exit(void) +{ + i2c_del_driver(&ds2782_battery_driver); +} +module_exit(ds2782_exit); + +MODULE_AUTHOR("Ryan Mallon <ryan@bluewatersys.com>"); +MODULE_DESCRIPTION("Maxim/Dallas DS2782 Stand-Alone Fuel Gauage IC driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/power/olpc_battery.c b/drivers/power/olpc_battery.c index 5fbca2681ba..58e419299cd 100644 --- a/drivers/power/olpc_battery.c +++ b/drivers/power/olpc_battery.c @@ -8,6 +8,7 @@ * published by the Free Software Foundation. */ +#include <linux/kernel.h> #include <linux/module.h> #include <linux/err.h> #include <linux/platform_device.h> @@ -35,6 +36,7 @@ #define BAT_STAT_AC 0x10 #define BAT_STAT_CHARGING 0x20 #define BAT_STAT_DISCHARGING 0x40 +#define BAT_STAT_TRICKLE 0x80 #define BAT_ERR_INFOFAIL 0x02 #define BAT_ERR_OVERVOLTAGE 0x04 @@ -89,7 +91,7 @@ static char bat_serial[17]; /* Ick */ static int olpc_bat_get_status(union power_supply_propval *val, uint8_t ec_byte) { if (olpc_platform_info.ecver > 0x44) { - if (ec_byte & BAT_STAT_CHARGING) + if (ec_byte & (BAT_STAT_CHARGING | BAT_STAT_TRICKLE)) val->intval = POWER_SUPPLY_STATUS_CHARGING; else if (ec_byte & BAT_STAT_DISCHARGING) val->intval = POWER_SUPPLY_STATUS_DISCHARGING; @@ -219,7 +221,8 @@ static int olpc_bat_get_property(struct power_supply *psy, It doesn't matter though -- the EC will return the last-known information, and it's as if we just ran that _little_ bit faster and managed to read it out before the battery went away. */ - if (!(ec_byte & BAT_STAT_PRESENT) && psp != POWER_SUPPLY_PROP_PRESENT) + if (!(ec_byte & (BAT_STAT_PRESENT | BAT_STAT_TRICKLE)) && + psp != POWER_SUPPLY_PROP_PRESENT) return -ENODEV; switch (psp) { @@ -229,7 +232,8 @@ static int olpc_bat_get_property(struct power_supply *psy, return ret; break; case POWER_SUPPLY_PROP_PRESENT: - val->intval = !!(ec_byte & BAT_STAT_PRESENT); + val->intval = !!(ec_byte & (BAT_STAT_PRESENT | + BAT_STAT_TRICKLE)); break; case POWER_SUPPLY_PROP_HEALTH: @@ -334,21 +338,21 @@ static ssize_t olpc_bat_eeprom_read(struct kobject *kobj, struct bin_attribute *attr, char *buf, loff_t off, size_t count) { uint8_t ec_byte; - int ret, end; + int ret; + int i; if (off >= EEPROM_SIZE) return 0; if (off + count > EEPROM_SIZE) count = EEPROM_SIZE - off; - end = EEPROM_START + off + count; - for (ec_byte = EEPROM_START + off; ec_byte < end; ec_byte++) { - ret = olpc_ec_cmd(EC_BAT_EEPROM, &ec_byte, 1, - &buf[ec_byte - EEPROM_START], 1); + for (i = 0; i < count; i++) { + ec_byte = EEPROM_START + off + i; + ret = olpc_ec_cmd(EC_BAT_EEPROM, &ec_byte, 1, &buf[i], 1); if (ret) { - printk(KERN_ERR "olpc-battery: EC command " - "EC_BAT_EEPROM @ 0x%x failed -" - " %d!\n", ec_byte, ret); + pr_err("olpc-battery: " + "EC_BAT_EEPROM cmd @ 0x%x failed - %d!\n", + ec_byte, ret); return -EIO; } } diff --git a/drivers/s390/scsi/zfcp_erp.c b/drivers/s390/scsi/zfcp_erp.c index 8030e25152f..c75d6f35cb5 100644 --- a/drivers/s390/scsi/zfcp_erp.c +++ b/drivers/s390/scsi/zfcp_erp.c @@ -553,40 +553,35 @@ static void _zfcp_erp_unit_reopen_all(struct zfcp_port *port, int clear, _zfcp_erp_unit_reopen(unit, clear, id, ref); } -static void zfcp_erp_strategy_followup_actions(struct zfcp_erp_action *act) +static void zfcp_erp_strategy_followup_failed(struct zfcp_erp_action *act) { - struct zfcp_adapter *adapter = act->adapter; - struct zfcp_port *port = act->port; - struct zfcp_unit *unit = act->unit; - u32 status = act->status; - - /* initiate follow-up actions depending on success of finished action */ switch (act->action) { - case ZFCP_ERP_ACTION_REOPEN_ADAPTER: - if (status == ZFCP_ERP_SUCCEEDED) - _zfcp_erp_port_reopen_all(adapter, 0, "ersfa_1", NULL); - else - _zfcp_erp_adapter_reopen(adapter, 0, "ersfa_2", NULL); + _zfcp_erp_adapter_reopen(act->adapter, 0, "ersff_1", NULL); break; - case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED: - if (status == ZFCP_ERP_SUCCEEDED) - _zfcp_erp_port_reopen(port, 0, "ersfa_3", NULL); - else - _zfcp_erp_adapter_reopen(adapter, 0, "ersfa_4", NULL); + _zfcp_erp_port_forced_reopen(act->port, 0, "ersff_2", NULL); break; - case ZFCP_ERP_ACTION_REOPEN_PORT: - if (status == ZFCP_ERP_SUCCEEDED) - _zfcp_erp_unit_reopen_all(port, 0, "ersfa_5", NULL); - else - _zfcp_erp_port_forced_reopen(port, 0, "ersfa_6", NULL); + _zfcp_erp_port_reopen(act->port, 0, "ersff_3", NULL); break; - case ZFCP_ERP_ACTION_REOPEN_UNIT: - if (status != ZFCP_ERP_SUCCEEDED) - _zfcp_erp_port_reopen(unit->port, 0, "ersfa_7", NULL); + _zfcp_erp_unit_reopen(act->unit, 0, "ersff_4", NULL); + break; + } +} + +static void zfcp_erp_strategy_followup_success(struct zfcp_erp_action *act) +{ + switch (act->action) { + case ZFCP_ERP_ACTION_REOPEN_ADAPTER: + _zfcp_erp_port_reopen_all(act->adapter, 0, "ersfs_1", NULL); + break; + case ZFCP_ERP_ACTION_REOPEN_PORT_FORCED: + _zfcp_erp_port_reopen(act->port, 0, "ersfs_2", NULL); + break; + case ZFCP_ERP_ACTION_REOPEN_PORT: + _zfcp_erp_unit_reopen_all(act->port, 0, "ersfs_3", NULL); break; } } @@ -801,7 +796,7 @@ static int zfcp_erp_port_forced_strategy(struct zfcp_erp_action *erp_action) return ZFCP_ERP_FAILED; case ZFCP_ERP_STEP_PHYS_PORT_CLOSING: - if (status & ZFCP_STATUS_PORT_PHYS_OPEN) + if (!(status & ZFCP_STATUS_PORT_PHYS_OPEN)) return ZFCP_ERP_SUCCEEDED; } return ZFCP_ERP_FAILED; @@ -853,11 +848,17 @@ void zfcp_erp_port_strategy_open_lookup(struct work_struct *work) gid_pn_work); retval = zfcp_fc_ns_gid_pn(&port->erp_action); - if (retval == -ENOMEM) - zfcp_erp_notify(&port->erp_action, ZFCP_ERP_NOMEM); - port->erp_action.step = ZFCP_ERP_STEP_NAMESERVER_LOOKUP; - if (retval) - zfcp_erp_notify(&port->erp_action, ZFCP_ERP_FAILED); + if (!retval) { + port->erp_action.step = ZFCP_ERP_STEP_NAMESERVER_LOOKUP; + goto out; + } + if (retval == -ENOMEM) { + zfcp_erp_notify(&port->erp_action, ZFCP_STATUS_ERP_LOWMEM); + goto out; + } + /* all other error condtions */ + zfcp_erp_notify(&port->erp_action, 0); +out: zfcp_port_put(port); } @@ -1289,7 +1290,10 @@ static int zfcp_erp_strategy(struct zfcp_erp_action *erp_action) retval = zfcp_erp_strategy_statechange(erp_action, retval); if (retval == ZFCP_ERP_EXIT) goto unlock; - zfcp_erp_strategy_followup_actions(erp_action); + if (retval == ZFCP_ERP_SUCCEEDED) + zfcp_erp_strategy_followup_success(erp_action); + if (retval == ZFCP_ERP_FAILED) + zfcp_erp_strategy_followup_failed(erp_action); unlock: write_unlock(&adapter->erp_lock); diff --git a/drivers/s390/scsi/zfcp_fc.c b/drivers/s390/scsi/zfcp_fc.c index 2f0705d76b7..47daebfa7e5 100644 --- a/drivers/s390/scsi/zfcp_fc.c +++ b/drivers/s390/scsi/zfcp_fc.c @@ -79,11 +79,9 @@ static int zfcp_wka_port_get(struct zfcp_wka_port *wka_port) mutex_unlock(&wka_port->mutex); - wait_event_timeout( - wka_port->completion_wq, - wka_port->status == ZFCP_WKA_PORT_ONLINE || - wka_port->status == ZFCP_WKA_PORT_OFFLINE, - HZ >> 1); + wait_event(wka_port->completion_wq, + wka_port->status == ZFCP_WKA_PORT_ONLINE || + wka_port->status == ZFCP_WKA_PORT_OFFLINE); if (wka_port->status == ZFCP_WKA_PORT_ONLINE) { atomic_inc(&wka_port->refcount); diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index c57658f3d34..47795fbf081 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -670,8 +670,11 @@ static int zfcp_fsf_req_sbal_get(struct zfcp_adapter *adapter) zfcp_fsf_sbal_check(adapter), 5 * HZ); if (ret > 0) return 0; - if (!ret) + if (!ret) { atomic_inc(&adapter->qdio_outb_full); + /* assume hanging outbound queue, try queue recovery */ + zfcp_erp_adapter_reopen(adapter, 0, "fsrsg_1", NULL); + } spin_lock_bh(&adapter->req_q_lock); return -EIO; @@ -722,7 +725,7 @@ static struct zfcp_fsf_req *zfcp_fsf_req_create(struct zfcp_adapter *adapter, req = zfcp_fsf_alloc_qtcb(pool); if (unlikely(!req)) - return ERR_PTR(-EIO); + return ERR_PTR(-ENOMEM); if (adapter->req_no == 0) adapter->req_no++; @@ -1010,6 +1013,23 @@ skip_fsfstatus: send_ct->handler(send_ct->handler_data); } +static void zfcp_fsf_setup_ct_els_unchained(struct qdio_buffer_element *sbale, + struct scatterlist *sg_req, + struct scatterlist *sg_resp) +{ + sbale[0].flags |= SBAL_FLAGS0_TYPE_WRITE_READ; + sbale[2].addr = sg_virt(sg_req); + sbale[2].length = sg_req->length; + sbale[3].addr = sg_virt(sg_resp); + sbale[3].length = sg_resp->length; + sbale[3].flags |= SBAL_FLAGS_LAST_ENTRY; +} + +static int zfcp_fsf_one_sbal(struct scatterlist *sg) +{ + return sg_is_last(sg) && sg->length <= PAGE_SIZE; +} + static int zfcp_fsf_setup_ct_els_sbals(struct zfcp_fsf_req *req, struct scatterlist *sg_req, struct scatterlist *sg_resp, @@ -1020,30 +1040,30 @@ static int zfcp_fsf_setup_ct_els_sbals(struct zfcp_fsf_req *req, int bytes; if (!(feat & FSF_FEATURE_ELS_CT_CHAINED_SBALS)) { - if (sg_req->length > PAGE_SIZE || sg_resp->length > PAGE_SIZE || - !sg_is_last(sg_req) || !sg_is_last(sg_resp)) + if (!zfcp_fsf_one_sbal(sg_req) || !zfcp_fsf_one_sbal(sg_resp)) return -EOPNOTSUPP; - sbale[0].flags |= SBAL_FLAGS0_TYPE_WRITE_READ; - sbale[2].addr = sg_virt(sg_req); - sbale[2].length = sg_req->length; - sbale[3].addr = sg_virt(sg_resp); - sbale[3].length = sg_resp->length; - sbale[3].flags |= SBAL_FLAGS_LAST_ENTRY; + zfcp_fsf_setup_ct_els_unchained(sbale, sg_req, sg_resp); + return 0; + } + + /* use single, unchained SBAL if it can hold the request */ + if (zfcp_fsf_one_sbal(sg_req) && zfcp_fsf_one_sbal(sg_resp)) { + zfcp_fsf_setup_ct_els_unchained(sbale, sg_req, sg_resp); return 0; } bytes = zfcp_qdio_sbals_from_sg(req, SBAL_FLAGS0_TYPE_WRITE_READ, sg_req, max_sbals); if (bytes <= 0) - return -ENOMEM; + return -EIO; req->qtcb->bottom.support.req_buf_length = bytes; req->sbale_curr = ZFCP_LAST_SBALE_PER_SBAL; bytes = zfcp_qdio_sbals_from_sg(req, SBAL_FLAGS0_TYPE_WRITE_READ, sg_resp, max_sbals); if (bytes <= 0) - return -ENOMEM; + return -EIO; req->qtcb->bottom.support.resp_buf_length = bytes; return 0; @@ -1607,10 +1627,10 @@ static void zfcp_fsf_open_wka_port_handler(struct zfcp_fsf_req *req) case FSF_ACCESS_DENIED: wka_port->status = ZFCP_WKA_PORT_OFFLINE; break; - case FSF_PORT_ALREADY_OPEN: - break; case FSF_GOOD: wka_port->handle = header->port_handle; + /* fall through */ + case FSF_PORT_ALREADY_OPEN: wka_port->status = ZFCP_WKA_PORT_ONLINE; } out: @@ -1731,15 +1751,16 @@ static void zfcp_fsf_close_physical_port_handler(struct zfcp_fsf_req *req) zfcp_fsf_access_denied_port(req, port); break; case FSF_PORT_BOXED: - zfcp_erp_port_boxed(port, "fscpph2", req); - req->status |= ZFCP_STATUS_FSFREQ_ERROR | - ZFCP_STATUS_FSFREQ_RETRY; /* can't use generic zfcp_erp_modify_port_status because * ZFCP_STATUS_COMMON_OPEN must not be reset for the port */ atomic_clear_mask(ZFCP_STATUS_PORT_PHYS_OPEN, &port->status); list_for_each_entry(unit, &port->unit_list_head, list) atomic_clear_mask(ZFCP_STATUS_COMMON_OPEN, &unit->status); + zfcp_erp_port_boxed(port, "fscpph2", req); + req->status |= ZFCP_STATUS_FSFREQ_ERROR | + ZFCP_STATUS_FSFREQ_RETRY; + break; case FSF_ADAPTER_STATUS_AVAILABLE: switch (header->fsf_status_qual.word[0]) { @@ -2541,7 +2562,6 @@ struct zfcp_fsf_req *zfcp_fsf_control_file(struct zfcp_adapter *adapter, bytes = zfcp_qdio_sbals_from_sg(req, direction, fsf_cfdc->sg, FSF_MAX_SBALS_PER_REQ); if (bytes != ZFCP_CFDC_MAX_SIZE) { - retval = -ENOMEM; zfcp_fsf_req_free(req); goto out; } diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index 967ede73f4c..6925a178468 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -167,20 +167,21 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt) struct zfcp_unit *unit = scpnt->device->hostdata; struct zfcp_fsf_req *old_req, *abrt_req; unsigned long flags; - unsigned long old_req_id = (unsigned long) scpnt->host_scribble; + unsigned long old_reqid = (unsigned long) scpnt->host_scribble; int retval = SUCCESS; int retry = 3; + char *dbf_tag; /* avoid race condition between late normal completion and abort */ write_lock_irqsave(&adapter->abort_lock, flags); spin_lock(&adapter->req_list_lock); - old_req = zfcp_reqlist_find(adapter, old_req_id); + old_req = zfcp_reqlist_find(adapter, old_reqid); spin_unlock(&adapter->req_list_lock); if (!old_req) { write_unlock_irqrestore(&adapter->abort_lock, flags); zfcp_scsi_dbf_event_abort("lte1", adapter, scpnt, NULL, - old_req_id); + old_reqid); return FAILED; /* completion could be in progress */ } old_req->data = NULL; @@ -189,7 +190,7 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt) write_unlock_irqrestore(&adapter->abort_lock, flags); while (retry--) { - abrt_req = zfcp_fsf_abort_fcp_command(old_req_id, unit); + abrt_req = zfcp_fsf_abort_fcp_command(old_reqid, unit); if (abrt_req) break; @@ -197,7 +198,7 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt) if (!(atomic_read(&adapter->status) & ZFCP_STATUS_COMMON_RUNNING)) { zfcp_scsi_dbf_event_abort("nres", adapter, scpnt, NULL, - old_req_id); + old_reqid); return SUCCESS; } } @@ -208,13 +209,14 @@ static int zfcp_scsi_eh_abort_handler(struct scsi_cmnd *scpnt) abrt_req->status & ZFCP_STATUS_FSFREQ_COMPLETED); if (abrt_req->status & ZFCP_STATUS_FSFREQ_ABORTSUCCEEDED) - zfcp_scsi_dbf_event_abort("okay", adapter, scpnt, abrt_req, 0); + dbf_tag = "okay"; else if (abrt_req->status & ZFCP_STATUS_FSFREQ_ABORTNOTNEEDED) - zfcp_scsi_dbf_event_abort("lte2", adapter, scpnt, abrt_req, 0); + dbf_tag = "lte2"; else { - zfcp_scsi_dbf_event_abort("fail", adapter, scpnt, abrt_req, 0); + dbf_tag = "fail"; retval = FAILED; } + zfcp_scsi_dbf_event_abort(dbf_tag, adapter, scpnt, abrt_req, old_reqid); zfcp_fsf_req_free(abrt_req); return retval; } @@ -534,6 +536,9 @@ static void zfcp_scsi_rport_register(struct zfcp_port *port) struct fc_rport_identifiers ids; struct fc_rport *rport; + if (port->rport) + return; + ids.node_name = port->wwnn; ids.port_name = port->wwpn; ids.port_id = port->d_id; @@ -557,8 +562,10 @@ static void zfcp_scsi_rport_block(struct zfcp_port *port) { struct fc_rport *rport = port->rport; - if (rport) + if (rport) { fc_remote_port_delete(rport); + port->rport = NULL; + } } void zfcp_scsi_schedule_rport_register(struct zfcp_port *port) diff --git a/drivers/s390/scsi/zfcp_sysfs.c b/drivers/s390/scsi/zfcp_sysfs.c index 3e51e64d110..0fe5cce818c 100644 --- a/drivers/s390/scsi/zfcp_sysfs.c +++ b/drivers/s390/scsi/zfcp_sysfs.c @@ -494,9 +494,14 @@ static ssize_t zfcp_sysfs_adapter_q_full_show(struct device *dev, struct Scsi_Host *scsi_host = class_to_shost(dev); struct zfcp_adapter *adapter = (struct zfcp_adapter *) scsi_host->hostdata[0]; + u64 util; + + spin_lock_bh(&adapter->qdio_stat_lock); + util = adapter->req_q_util; + spin_unlock_bh(&adapter->qdio_stat_lock); return sprintf(buf, "%d %llu\n", atomic_read(&adapter->qdio_outb_full), - (unsigned long long)adapter->req_q_util); + (unsigned long long)util); } static DEVICE_ATTR(queue_full, S_IRUGO, zfcp_sysfs_adapter_q_full_show, NULL); diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index 2bc22be5f84..145ab9ba55e 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -415,9 +415,9 @@ static void fc_exch_timeout(struct work_struct *work) e_stat = ep->esb_stat; if (e_stat & ESB_ST_COMPLETE) { ep->esb_stat = e_stat & ~ESB_ST_REC_QUAL; + spin_unlock_bh(&ep->ex_lock); if (e_stat & ESB_ST_REC_QUAL) fc_exch_rrq(ep); - spin_unlock_bh(&ep->ex_lock); goto done; } else { resp = ep->resp; @@ -1624,14 +1624,14 @@ static void fc_exch_rrq(struct fc_exch *ep) struct fc_lport *lp; struct fc_els_rrq *rrq; struct fc_frame *fp; - struct fc_seq *rrq_sp; u32 did; lp = ep->lp; fp = fc_frame_alloc(lp, sizeof(*rrq)); if (!fp) - return; + goto retry; + rrq = fc_frame_payload_get(fp, sizeof(*rrq)); memset(rrq, 0, sizeof(*rrq)); rrq->rrq_cmd = ELS_RRQ; @@ -1647,13 +1647,20 @@ static void fc_exch_rrq(struct fc_exch *ep) fc_host_port_id(lp->host), FC_TYPE_ELS, FC_FC_FIRST_SEQ | FC_FC_END_SEQ | FC_FC_SEQ_INIT, 0); - rrq_sp = fc_exch_seq_send(lp, fp, fc_exch_rrq_resp, NULL, ep, - lp->e_d_tov); - if (!rrq_sp) { - ep->esb_stat |= ESB_ST_REC_QUAL; - fc_exch_timer_set_locked(ep, ep->r_a_tov); + if (fc_exch_seq_send(lp, fp, fc_exch_rrq_resp, NULL, ep, lp->e_d_tov)) + return; + +retry: + spin_lock_bh(&ep->ex_lock); + if (ep->state & (FC_EX_RST_CLEANUP | FC_EX_DONE)) { + spin_unlock_bh(&ep->ex_lock); + /* drop hold for rec qual */ + fc_exch_release(ep); return; } + ep->esb_stat |= ESB_ST_REC_QUAL; + fc_exch_timer_set_locked(ep, ep->r_a_tov); + spin_unlock_bh(&ep->ex_lock); } diff --git a/drivers/scsi/libiscsi.c b/drivers/scsi/libiscsi.c index 716cc344c5d..a751f6230c2 100644 --- a/drivers/scsi/libiscsi.c +++ b/drivers/scsi/libiscsi.c @@ -1974,10 +1974,10 @@ int iscsi_eh_abort(struct scsi_cmnd *sc) * good and have never sent us a successful tmf response * then sent more data for the cmd. */ - spin_lock(&session->lock); + spin_lock_bh(&session->lock); fail_scsi_task(task, DID_ABORT); conn->tmf_state = TMF_INITIAL; - spin_unlock(&session->lock); + spin_unlock_bh(&session->lock); iscsi_start_tx(conn); goto success_unlocked; case TMF_TIMEDOUT: diff --git a/drivers/scsi/libsas/sas_expander.c b/drivers/scsi/libsas/sas_expander.c index 54fa1e42dc4..b3381959acc 100644 --- a/drivers/scsi/libsas/sas_expander.c +++ b/drivers/scsi/libsas/sas_expander.c @@ -766,6 +766,7 @@ static int sas_ex_join_wide_port(struct domain_device *parent, int phy_id) if (!memcmp(phy->attached_sas_addr, ephy->attached_sas_addr, SAS_ADDR_SIZE) && ephy->port) { sas_port_add_phy(ephy->port, phy->phy); + phy->port = ephy->port; phy->phy_state = PHY_DEVICE_DISCOVERED; return 0; } @@ -945,11 +946,21 @@ static int sas_ex_discover_dev(struct domain_device *dev, int phy_id) if (ex->ex_phy[i].phy_state == PHY_VACANT || ex->ex_phy[i].phy_state == PHY_NOT_PRESENT) continue; - + /* + * Due to races, the phy might not get added to the + * wide port, so we add the phy to the wide port here. + */ if (SAS_ADDR(ex->ex_phy[i].attached_sas_addr) == - SAS_ADDR(child->sas_addr)) + SAS_ADDR(child->sas_addr)) { ex->ex_phy[i].phy_state= PHY_DEVICE_DISCOVERED; + res = sas_ex_join_wide_port(dev, i); + if (!res) + SAS_DPRINTK("Attaching ex phy%d to wide port %016llx\n", + i, SAS_ADDR(ex->ex_phy[i].attached_sas_addr)); + + } } + res = 0; } return res; @@ -1598,7 +1609,7 @@ static int sas_get_phy_attached_sas_addr(struct domain_device *dev, } static int sas_find_bcast_phy(struct domain_device *dev, int *phy_id, - int from_phy) + int from_phy, bool update) { struct expander_device *ex = &dev->ex_dev; int res = 0; @@ -1611,7 +1622,9 @@ static int sas_find_bcast_phy(struct domain_device *dev, int *phy_id, if (res) goto out; else if (phy_change_count != ex->ex_phy[i].phy_change_count) { - ex->ex_phy[i].phy_change_count = phy_change_count; + if (update) + ex->ex_phy[i].phy_change_count = + phy_change_count; *phy_id = i; return 0; } @@ -1653,31 +1666,52 @@ out: kfree(rg_req); return res; } +/** + * sas_find_bcast_dev - find the device issue BROADCAST(CHANGE). + * @dev:domain device to be detect. + * @src_dev: the device which originated BROADCAST(CHANGE). + * + * Add self-configuration expander suport. Suppose two expander cascading, + * when the first level expander is self-configuring, hotplug the disks in + * second level expander, BROADCAST(CHANGE) will not only be originated + * in the second level expander, but also be originated in the first level + * expander (see SAS protocol SAS 2r-14, 7.11 for detail), it is to say, + * expander changed count in two level expanders will all increment at least + * once, but the phy which chang count has changed is the source device which + * we concerned. + */ static int sas_find_bcast_dev(struct domain_device *dev, struct domain_device **src_dev) { struct expander_device *ex = &dev->ex_dev; int ex_change_count = -1; + int phy_id = -1; int res; + struct domain_device *ch; res = sas_get_ex_change_count(dev, &ex_change_count); if (res) goto out; - if (ex_change_count != -1 && - ex_change_count != ex->ex_change_count) { - *src_dev = dev; - ex->ex_change_count = ex_change_count; - } else { - struct domain_device *ch; - - list_for_each_entry(ch, &ex->children, siblings) { - if (ch->dev_type == EDGE_DEV || - ch->dev_type == FANOUT_DEV) { - res = sas_find_bcast_dev(ch, src_dev); - if (src_dev) - return res; - } + if (ex_change_count != -1 && ex_change_count != ex->ex_change_count) { + /* Just detect if this expander phys phy change count changed, + * in order to determine if this expander originate BROADCAST, + * and do not update phy change count field in our structure. + */ + res = sas_find_bcast_phy(dev, &phy_id, 0, false); + if (phy_id != -1) { + *src_dev = dev; + ex->ex_change_count = ex_change_count; + SAS_DPRINTK("Expander phy change count has changed\n"); + return res; + } else + SAS_DPRINTK("Expander phys DID NOT change\n"); + } + list_for_each_entry(ch, &ex->children, siblings) { + if (ch->dev_type == EDGE_DEV || ch->dev_type == FANOUT_DEV) { + res = sas_find_bcast_dev(ch, src_dev); + if (src_dev) + return res; } } out: @@ -1700,24 +1734,26 @@ static void sas_unregister_ex_tree(struct domain_device *dev) } static void sas_unregister_devs_sas_addr(struct domain_device *parent, - int phy_id) + int phy_id, bool last) { struct expander_device *ex_dev = &parent->ex_dev; struct ex_phy *phy = &ex_dev->ex_phy[phy_id]; struct domain_device *child, *n; - - list_for_each_entry_safe(child, n, &ex_dev->children, siblings) { - if (SAS_ADDR(child->sas_addr) == - SAS_ADDR(phy->attached_sas_addr)) { - if (child->dev_type == EDGE_DEV || - child->dev_type == FANOUT_DEV) - sas_unregister_ex_tree(child); - else - sas_unregister_dev(child); - break; + if (last) { + list_for_each_entry_safe(child, n, + &ex_dev->children, siblings) { + if (SAS_ADDR(child->sas_addr) == + SAS_ADDR(phy->attached_sas_addr)) { + if (child->dev_type == EDGE_DEV || + child->dev_type == FANOUT_DEV) + sas_unregister_ex_tree(child); + else + sas_unregister_dev(child); + break; + } } + sas_disable_routing(parent, phy->attached_sas_addr); } - sas_disable_routing(parent, phy->attached_sas_addr); memset(phy->attached_sas_addr, 0, SAS_ADDR_SIZE); sas_port_delete_phy(phy->port, phy->phy); if (phy->port->num_phys == 0) @@ -1770,15 +1806,31 @@ static int sas_discover_new(struct domain_device *dev, int phy_id) { struct ex_phy *ex_phy = &dev->ex_dev.ex_phy[phy_id]; struct domain_device *child; - int res; + bool found = false; + int res, i; SAS_DPRINTK("ex %016llx phy%d new device attached\n", SAS_ADDR(dev->sas_addr), phy_id); res = sas_ex_phy_discover(dev, phy_id); if (res) goto out; + /* to support the wide port inserted */ + for (i = 0; i < dev->ex_dev.num_phys; i++) { + struct ex_phy *ex_phy_temp = &dev->ex_dev.ex_phy[i]; + if (i == phy_id) + continue; + if (SAS_ADDR(ex_phy_temp->attached_sas_addr) == + SAS_ADDR(ex_phy->attached_sas_addr)) { + found = true; + break; + } + } + if (found) { + sas_ex_join_wide_port(dev, phy_id); + return 0; + } res = sas_ex_discover_devices(dev, phy_id); - if (res) + if (!res) goto out; list_for_each_entry(child, &dev->ex_dev.children, siblings) { if (SAS_ADDR(child->sas_addr) == @@ -1793,7 +1845,7 @@ out: return res; } -static int sas_rediscover_dev(struct domain_device *dev, int phy_id) +static int sas_rediscover_dev(struct domain_device *dev, int phy_id, bool last) { struct expander_device *ex = &dev->ex_dev; struct ex_phy *phy = &ex->ex_phy[phy_id]; @@ -1804,11 +1856,11 @@ static int sas_rediscover_dev(struct domain_device *dev, int phy_id) switch (res) { case SMP_RESP_NO_PHY: phy->phy_state = PHY_NOT_PRESENT; - sas_unregister_devs_sas_addr(dev, phy_id); + sas_unregister_devs_sas_addr(dev, phy_id, last); goto out; break; case SMP_RESP_PHY_VACANT: phy->phy_state = PHY_VACANT; - sas_unregister_devs_sas_addr(dev, phy_id); + sas_unregister_devs_sas_addr(dev, phy_id, last); goto out; break; case SMP_RESP_FUNC_ACC: break; @@ -1816,7 +1868,7 @@ static int sas_rediscover_dev(struct domain_device *dev, int phy_id) if (SAS_ADDR(attached_sas_addr) == 0) { phy->phy_state = PHY_EMPTY; - sas_unregister_devs_sas_addr(dev, phy_id); + sas_unregister_devs_sas_addr(dev, phy_id, last); } else if (SAS_ADDR(attached_sas_addr) == SAS_ADDR(phy->attached_sas_addr)) { SAS_DPRINTK("ex %016llx phy 0x%x broadcast flutter\n", @@ -1828,12 +1880,27 @@ out: return res; } +/** + * sas_rediscover - revalidate the domain. + * @dev:domain device to be detect. + * @phy_id: the phy id will be detected. + * + * NOTE: this process _must_ quit (return) as soon as any connection + * errors are encountered. Connection recovery is done elsewhere. + * Discover process only interrogates devices in order to discover the + * domain.For plugging out, we un-register the device only when it is + * the last phy in the port, for other phys in this port, we just delete it + * from the port.For inserting, we do discovery when it is the + * first phy,for other phys in this port, we add it to the port to + * forming the wide-port. + */ static int sas_rediscover(struct domain_device *dev, const int phy_id) { struct expander_device *ex = &dev->ex_dev; struct ex_phy *changed_phy = &ex->ex_phy[phy_id]; int res = 0; int i; + bool last = true; /* is this the last phy of the port */ SAS_DPRINTK("ex %016llx phy%d originated BROADCAST(CHANGE)\n", SAS_ADDR(dev->sas_addr), phy_id); @@ -1848,13 +1915,13 @@ static int sas_rediscover(struct domain_device *dev, const int phy_id) SAS_ADDR(changed_phy->attached_sas_addr)) { SAS_DPRINTK("phy%d part of wide port with " "phy%d\n", phy_id, i); - goto out; + last = false; + break; } } - res = sas_rediscover_dev(dev, phy_id); + res = sas_rediscover_dev(dev, phy_id, last); } else res = sas_discover_new(dev, phy_id); -out: return res; } @@ -1881,7 +1948,7 @@ int sas_ex_revalidate_domain(struct domain_device *port_dev) do { phy_id = -1; - res = sas_find_bcast_phy(dev, &phy_id, i); + res = sas_find_bcast_phy(dev, &phy_id, i, true); if (phy_id == -1) break; res = sas_rediscover(dev, phy_id); diff --git a/drivers/scsi/libsas/sas_port.c b/drivers/scsi/libsas/sas_port.c index e6ac59c023f..fe8b74c706d 100644 --- a/drivers/scsi/libsas/sas_port.c +++ b/drivers/scsi/libsas/sas_port.c @@ -56,7 +56,7 @@ static void sas_form_port(struct asd_sas_phy *phy) } } - /* find a port */ + /* see if the phy should be part of a wide port */ spin_lock_irqsave(&sas_ha->phy_port_lock, flags); for (i = 0; i < sas_ha->num_phys; i++) { port = sas_ha->sas_port[i]; @@ -69,12 +69,23 @@ static void sas_form_port(struct asd_sas_phy *phy) SAS_DPRINTK("phy%d matched wide port%d\n", phy->id, port->id); break; - } else if (*(u64 *) port->sas_addr == 0 && port->num_phys==0) { - memcpy(port->sas_addr, phy->sas_addr, SAS_ADDR_SIZE); - break; } spin_unlock(&port->phy_list_lock); } + /* The phy does not match any existing port, create a new one */ + if (i == sas_ha->num_phys) { + for (i = 0; i < sas_ha->num_phys; i++) { + port = sas_ha->sas_port[i]; + spin_lock(&port->phy_list_lock); + if (*(u64 *)port->sas_addr == 0 + && port->num_phys == 0) { + memcpy(port->sas_addr, phy->sas_addr, + SAS_ADDR_SIZE); + break; + } + spin_unlock(&port->phy_list_lock); + } + } if (i >= sas_ha->num_phys) { printk(KERN_NOTICE "%s: couldn't find a free port, bug?\n", diff --git a/drivers/scsi/qla4xxx/ql4_dbg.c b/drivers/scsi/qla4xxx/ql4_dbg.c index fcc184cd066..cbceb0ebabf 100644 --- a/drivers/scsi/qla4xxx/ql4_dbg.c +++ b/drivers/scsi/qla4xxx/ql4_dbg.c @@ -15,19 +15,18 @@ void qla4xxx_dump_buffer(void *b, uint32_t size) uint32_t cnt; uint8_t *c = b; - printk(" 0 1 2 3 4 5 6 7 8 9 Ah Bh Ch Dh Eh " + printk(" 0 1 2 3 4 5 6 7 8 9 Ah Bh Ch Dh Eh " "Fh\n"); printk("------------------------------------------------------------" "--\n"); - for (cnt = 0; cnt < size; cnt++, c++) { - printk(KERN_DEBUG "%02x", *c); - if (!(cnt % 16)) - printk(KERN_DEBUG "\n"); + for (cnt = 0; cnt < size; c++) { + printk(KERN_INFO "%02x", *c); + if (!(++cnt % 16)) + printk(KERN_INFO "\n"); else - printk(KERN_DEBUG " "); + printk(KERN_INFO " "); } - if (cnt % 16) - printk(KERN_DEBUG "\n"); + printk(KERN_INFO "\n"); } diff --git a/drivers/scsi/qla4xxx/ql4_def.h b/drivers/scsi/qla4xxx/ql4_def.h index b586f27c3bd..81b5f29254e 100644 --- a/drivers/scsi/qla4xxx/ql4_def.h +++ b/drivers/scsi/qla4xxx/ql4_def.h @@ -100,7 +100,6 @@ #define MAX_SRBS MAX_CMDS_TO_RISC #define MBOX_AEN_REG_COUNT 5 #define MAX_INIT_RETRIES 5 -#define IOCB_HIWAT_CUSHION 16 /* * Buffer sizes @@ -184,6 +183,11 @@ struct srb { uint16_t cc_stat; u_long r_start; /* Time we recieve a cmd from OS */ u_long u_start; /* Time when we handed the cmd to F/W */ + + /* Used for extended sense / status continuation */ + uint8_t *req_sense_ptr; + uint16_t req_sense_len; + uint16_t reserved2; }; /* @@ -302,7 +306,6 @@ struct scsi_qla_host { uint32_t tot_ddbs; uint16_t iocb_cnt; - uint16_t iocb_hiwat; /* SRB cache. */ #define SRB_MIN_REQ 128 @@ -436,6 +439,8 @@ struct scsi_qla_host { /* Map ddb_list entry by FW ddb index */ struct ddb_entry *fw_ddb_index_map[MAX_DDB_ENTRIES]; + /* Saved srb for status continuation entry processing */ + struct srb *status_srb; }; static inline int is_qla4010(struct scsi_qla_host *ha) diff --git a/drivers/scsi/qla4xxx/ql4_fw.h b/drivers/scsi/qla4xxx/ql4_fw.h index 1b667a70cff..9cd7a608df3 100644 --- a/drivers/scsi/qla4xxx/ql4_fw.h +++ b/drivers/scsi/qla4xxx/ql4_fw.h @@ -572,6 +572,7 @@ struct conn_event_log_entry { *************************************************************************/ #define IOCB_MAX_CDB_LEN 16 /* Bytes in a CBD */ #define IOCB_MAX_SENSEDATA_LEN 32 /* Bytes of sense data */ +#define IOCB_MAX_EXT_SENSEDATA_LEN 60 /* Bytes of extended sense data */ /* IOCB header structure */ struct qla4_header { @@ -733,6 +734,12 @@ struct status_entry { }; +/* Status Continuation entry */ +struct status_cont_entry { + struct qla4_header hdr; /* 00-03 */ + uint8_t ext_sense_data[IOCB_MAX_EXT_SENSEDATA_LEN]; /* 04-63 */ +}; + struct passthru0 { struct qla4_header hdr; /* 00-03 */ uint32_t handle; /* 04-07 */ diff --git a/drivers/scsi/qla4xxx/ql4_iocb.c b/drivers/scsi/qla4xxx/ql4_iocb.c index 912a67494ad..e0c32159749 100644 --- a/drivers/scsi/qla4xxx/ql4_iocb.c +++ b/drivers/scsi/qla4xxx/ql4_iocb.c @@ -10,9 +10,42 @@ #include "ql4_dbg.h" #include "ql4_inline.h" - #include <scsi/scsi_tcq.h> +static int +qla4xxx_space_in_req_ring(struct scsi_qla_host *ha, uint16_t req_cnt) +{ + uint16_t cnt; + + /* Calculate number of free request entries. */ + if ((req_cnt + 2) >= ha->req_q_count) { + cnt = (uint16_t) le32_to_cpu(ha->shadow_regs->req_q_out); + if (ha->request_in < cnt) + ha->req_q_count = cnt - ha->request_in; + else + ha->req_q_count = REQUEST_QUEUE_DEPTH - + (ha->request_in - cnt); + } + + /* Check if room for request in request ring. */ + if ((req_cnt + 2) < ha->req_q_count) + return 1; + else + return 0; +} + +static void qla4xxx_advance_req_ring_ptr(struct scsi_qla_host *ha) +{ + /* Advance request queue pointer */ + if (ha->request_in == (REQUEST_QUEUE_DEPTH - 1)) { + ha->request_in = 0; + ha->request_ptr = ha->request_ring; + } else { + ha->request_in++; + ha->request_ptr++; + } +} + /** * qla4xxx_get_req_pkt - returns a valid entry in request queue. * @ha: Pointer to host adapter structure. @@ -26,35 +59,18 @@ static int qla4xxx_get_req_pkt(struct scsi_qla_host *ha, struct queue_entry **queue_entry) { - uint16_t request_in; - uint8_t status = QLA_SUCCESS; - - *queue_entry = ha->request_ptr; + uint16_t req_cnt = 1; - /* get the latest request_in and request_out index */ - request_in = ha->request_in; - ha->request_out = (uint16_t) le32_to_cpu(ha->shadow_regs->req_q_out); - - /* Advance request queue pointer and check for queue full */ - if (request_in == (REQUEST_QUEUE_DEPTH - 1)) { - request_in = 0; - ha->request_ptr = ha->request_ring; - } else { - request_in++; - ha->request_ptr++; - } - - /* request queue is full, try again later */ - if ((ha->iocb_cnt + 1) >= ha->iocb_hiwat) { - /* restore request pointer */ - ha->request_ptr = *queue_entry; - status = QLA_ERROR; - } else { - ha->request_in = request_in; + if (qla4xxx_space_in_req_ring(ha, req_cnt)) { + *queue_entry = ha->request_ptr; memset(*queue_entry, 0, sizeof(**queue_entry)); + + qla4xxx_advance_req_ring_ptr(ha); + ha->req_q_count -= req_cnt; + return QLA_SUCCESS; } - return status; + return QLA_ERROR; } /** @@ -100,21 +116,14 @@ exit_send_marker: return status; } -static struct continuation_t1_entry* qla4xxx_alloc_cont_entry( - struct scsi_qla_host *ha) +static struct continuation_t1_entry * +qla4xxx_alloc_cont_entry(struct scsi_qla_host *ha) { struct continuation_t1_entry *cont_entry; cont_entry = (struct continuation_t1_entry *)ha->request_ptr; - /* Advance request queue pointer */ - if (ha->request_in == (REQUEST_QUEUE_DEPTH - 1)) { - ha->request_in = 0; - ha->request_ptr = ha->request_ring; - } else { - ha->request_in++; - ha->request_ptr++; - } + qla4xxx_advance_req_ring_ptr(ha); /* Load packet defaults */ cont_entry->hdr.entryType = ET_CONTINUE; @@ -197,13 +206,10 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb) struct scsi_cmnd *cmd = srb->cmd; struct ddb_entry *ddb_entry; struct command_t3_entry *cmd_entry; - int nseg; uint16_t tot_dsds; uint16_t req_cnt; - unsigned long flags; - uint16_t cnt; uint32_t index; char tag[2]; @@ -217,6 +223,19 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb) index = (uint32_t)cmd->request->tag; + /* + * Check to see if adapter is online before placing request on + * request queue. If a reset occurs and a request is in the queue, + * the firmware will still attempt to process the request, retrieving + * garbage for pointers. + */ + if (!test_bit(AF_ONLINE, &ha->flags)) { + DEBUG2(printk("scsi%ld: %s: Adapter OFFLINE! " + "Do not issue command.\n", + ha->host_no, __func__)); + goto queuing_error; + } + /* Calculate the number of request entries needed. */ nseg = scsi_dma_map(cmd); if (nseg < 0) @@ -224,17 +243,7 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb) tot_dsds = nseg; req_cnt = qla4xxx_calc_request_entries(tot_dsds); - - if (ha->req_q_count < (req_cnt + 2)) { - cnt = (uint16_t) le32_to_cpu(ha->shadow_regs->req_q_out); - if (ha->request_in < cnt) - ha->req_q_count = cnt - ha->request_in; - else - ha->req_q_count = REQUEST_QUEUE_DEPTH - - (ha->request_in - cnt); - } - - if (ha->req_q_count < (req_cnt + 2)) + if (!qla4xxx_space_in_req_ring(ha, req_cnt)) goto queuing_error; /* total iocbs active */ @@ -286,32 +295,10 @@ int qla4xxx_send_command_to_isp(struct scsi_qla_host *ha, struct srb * srb) break; } - - /* Advance request queue pointer */ - ha->request_in++; - if (ha->request_in == REQUEST_QUEUE_DEPTH) { - ha->request_in = 0; - ha->request_ptr = ha->request_ring; - } else - ha->request_ptr++; - - + qla4xxx_advance_req_ring_ptr(ha); qla4xxx_build_scsi_iocbs(srb, cmd_entry, tot_dsds); wmb(); - /* - * Check to see if adapter is online before placing request on - * request queue. If a reset occurs and a request is in the queue, - * the firmware will still attempt to process the request, retrieving - * garbage for pointers. - */ - if (!test_bit(AF_ONLINE, &ha->flags)) { - DEBUG2(printk("scsi%ld: %s: Adapter OFFLINE! " - "Do not issue command.\n", - ha->host_no, __func__)); - goto queuing_error; - } - srb->cmd->host_scribble = (unsigned char *)srb; /* update counters */ diff --git a/drivers/scsi/qla4xxx/ql4_isr.c b/drivers/scsi/qla4xxx/ql4_isr.c index 799120fcb9b..8025ee16588 100644 --- a/drivers/scsi/qla4xxx/ql4_isr.c +++ b/drivers/scsi/qla4xxx/ql4_isr.c @@ -11,6 +11,98 @@ #include "ql4_inline.h" /** + * qla4xxx_copy_sense - copy sense data into cmd sense buffer + * @ha: Pointer to host adapter structure. + * @sts_entry: Pointer to status entry structure. + * @srb: Pointer to srb structure. + **/ +static void qla4xxx_copy_sense(struct scsi_qla_host *ha, + struct status_entry *sts_entry, + struct srb *srb) +{ + struct scsi_cmnd *cmd = srb->cmd; + uint16_t sense_len; + + memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); + sense_len = le16_to_cpu(sts_entry->senseDataByteCnt); + if (sense_len == 0) + return; + + /* Save total available sense length, + * not to exceed cmd's sense buffer size */ + sense_len = min_t(uint16_t, sense_len, SCSI_SENSE_BUFFERSIZE); + srb->req_sense_ptr = cmd->sense_buffer; + srb->req_sense_len = sense_len; + + /* Copy sense from sts_entry pkt */ + sense_len = min_t(uint16_t, sense_len, IOCB_MAX_SENSEDATA_LEN); + memcpy(cmd->sense_buffer, sts_entry->senseData, sense_len); + + DEBUG2(printk(KERN_INFO "scsi%ld:%d:%d:%d: %s: sense key = %x, " + "ASL= %02x, ASC/ASCQ = %02x/%02x\n", ha->host_no, + cmd->device->channel, cmd->device->id, + cmd->device->lun, __func__, + sts_entry->senseData[2] & 0x0f, + sts_entry->senseData[7], + sts_entry->senseData[12], + sts_entry->senseData[13])); + + DEBUG5(qla4xxx_dump_buffer(cmd->sense_buffer, sense_len)); + srb->flags |= SRB_GOT_SENSE; + + /* Update srb, in case a sts_cont pkt follows */ + srb->req_sense_ptr += sense_len; + srb->req_sense_len -= sense_len; + if (srb->req_sense_len != 0) + ha->status_srb = srb; + else + ha->status_srb = NULL; +} + +/** + * qla4xxx_status_cont_entry - Process a Status Continuations entry. + * @ha: SCSI driver HA context + * @sts_cont: Entry pointer + * + * Extended sense data. + */ +static void +qla4xxx_status_cont_entry(struct scsi_qla_host *ha, + struct status_cont_entry *sts_cont) +{ + struct srb *srb = ha->status_srb; + struct scsi_cmnd *cmd; + uint8_t sense_len; + + if (srb == NULL) + return; + + cmd = srb->cmd; + if (cmd == NULL) { + DEBUG2(printk(KERN_INFO "scsi%ld: %s: Cmd already returned " + "back to OS srb=%p srb->state:%d\n", ha->host_no, + __func__, srb, srb->state)); + ha->status_srb = NULL; + return; + } + + /* Copy sense data. */ + sense_len = min_t(uint16_t, srb->req_sense_len, + IOCB_MAX_EXT_SENSEDATA_LEN); + memcpy(srb->req_sense_ptr, sts_cont->ext_sense_data, sense_len); + DEBUG5(qla4xxx_dump_buffer(srb->req_sense_ptr, sense_len)); + + srb->req_sense_ptr += sense_len; + srb->req_sense_len -= sense_len; + + /* Place command on done queue. */ + if (srb->req_sense_len == 0) { + qla4xxx_srb_compl(ha, srb); + ha->status_srb = NULL; + } +} + +/** * qla4xxx_status_entry - processes status IOCBs * @ha: Pointer to host adapter structure. * @sts_entry: Pointer to status entry structure. @@ -23,7 +115,6 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha, struct srb *srb; struct ddb_entry *ddb_entry; uint32_t residual; - uint16_t sensebytecnt; srb = qla4xxx_del_from_active_array(ha, le32_to_cpu(sts_entry->handle)); if (!srb) { @@ -92,24 +183,7 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha, break; /* Copy Sense Data into sense buffer. */ - memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); - - sensebytecnt = le16_to_cpu(sts_entry->senseDataByteCnt); - if (sensebytecnt == 0) - break; - - memcpy(cmd->sense_buffer, sts_entry->senseData, - min_t(uint16_t, sensebytecnt, SCSI_SENSE_BUFFERSIZE)); - - DEBUG2(printk("scsi%ld:%d:%d:%d: %s: sense key = %x, " - "ASC/ASCQ = %02x/%02x\n", ha->host_no, - cmd->device->channel, cmd->device->id, - cmd->device->lun, __func__, - sts_entry->senseData[2] & 0x0f, - sts_entry->senseData[12], - sts_entry->senseData[13])); - - srb->flags |= SRB_GOT_SENSE; + qla4xxx_copy_sense(ha, sts_entry, srb); break; case SCS_INCOMPLETE: @@ -176,23 +250,7 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha, break; /* Copy Sense Data into sense buffer. */ - memset(cmd->sense_buffer, 0, SCSI_SENSE_BUFFERSIZE); - - sensebytecnt = - le16_to_cpu(sts_entry->senseDataByteCnt); - if (sensebytecnt == 0) - break; - - memcpy(cmd->sense_buffer, sts_entry->senseData, - min_t(uint16_t, sensebytecnt, SCSI_SENSE_BUFFERSIZE)); - - DEBUG2(printk("scsi%ld:%d:%d:%d: %s: sense key = %x, " - "ASC/ASCQ = %02x/%02x\n", ha->host_no, - cmd->device->channel, cmd->device->id, - cmd->device->lun, __func__, - sts_entry->senseData[2] & 0x0f, - sts_entry->senseData[12], - sts_entry->senseData[13])); + qla4xxx_copy_sense(ha, sts_entry, srb); } else { /* * If RISC reports underrun and target does not @@ -268,9 +326,10 @@ static void qla4xxx_status_entry(struct scsi_qla_host *ha, status_entry_exit: - /* complete the request */ + /* complete the request, if not waiting for status_continuation pkt */ srb->cc_stat = sts_entry->completionStatus; - qla4xxx_srb_compl(ha, srb); + if (ha->status_srb == NULL) + qla4xxx_srb_compl(ha, srb); } /** @@ -305,10 +364,7 @@ static void qla4xxx_process_response_queue(struct scsi_qla_host * ha) /* process entry */ switch (sts_entry->hdr.entryType) { case ET_STATUS: - /* - * Common status - Single completion posted in single - * IOSB. - */ + /* Common status */ qla4xxx_status_entry(ha, sts_entry); break; @@ -316,9 +372,8 @@ static void qla4xxx_process_response_queue(struct scsi_qla_host * ha) break; case ET_STATUS_CONTINUATION: - /* Just throw away the status continuation entries */ - DEBUG2(printk("scsi%ld: %s: Status Continuation entry " - "- ignoring\n", ha->host_no, __func__)); + qla4xxx_status_cont_entry(ha, + (struct status_cont_entry *) sts_entry); break; case ET_COMMAND: diff --git a/drivers/scsi/qla4xxx/ql4_mbx.c b/drivers/scsi/qla4xxx/ql4_mbx.c index 051b0f5e8c8..09d6d4b76f3 100644 --- a/drivers/scsi/qla4xxx/ql4_mbx.c +++ b/drivers/scsi/qla4xxx/ql4_mbx.c @@ -385,16 +385,6 @@ int qla4xxx_get_firmware_status(struct scsi_qla_host * ha) mbox_sts[0])); return QLA_ERROR; } - - /* High-water mark of IOCBs */ - ha->iocb_hiwat = mbox_sts[2]; - if (ha->iocb_hiwat > IOCB_HIWAT_CUSHION) - ha->iocb_hiwat -= IOCB_HIWAT_CUSHION; - else - dev_info(&ha->pdev->dev, "WARNING!!! You have less than %d " - "firmware IOCBs available (%d).\n", - IOCB_HIWAT_CUSHION, ha->iocb_hiwat); - return QLA_SUCCESS; } diff --git a/drivers/scsi/qla4xxx/ql4_os.c b/drivers/scsi/qla4xxx/ql4_os.c index ec9da6ce848..40e3cafb3a9 100644 --- a/drivers/scsi/qla4xxx/ql4_os.c +++ b/drivers/scsi/qla4xxx/ql4_os.c @@ -66,6 +66,7 @@ static int qla4xxx_sess_get_param(struct iscsi_cls_session *sess, static int qla4xxx_host_get_param(struct Scsi_Host *shost, enum iscsi_host_param param, char *buf); static void qla4xxx_recovery_timedout(struct iscsi_cls_session *session); +static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc); /* * SCSI host template entry points @@ -89,6 +90,7 @@ static struct scsi_host_template qla4xxx_driver_template = { .eh_device_reset_handler = qla4xxx_eh_device_reset, .eh_target_reset_handler = qla4xxx_eh_target_reset, .eh_host_reset_handler = qla4xxx_eh_host_reset, + .eh_timed_out = qla4xxx_eh_cmd_timed_out, .slave_configure = qla4xxx_slave_configure, .slave_alloc = qla4xxx_slave_alloc, @@ -124,6 +126,21 @@ static struct iscsi_transport qla4xxx_iscsi_transport = { static struct scsi_transport_template *qla4xxx_scsi_transport; +static enum blk_eh_timer_return qla4xxx_eh_cmd_timed_out(struct scsi_cmnd *sc) +{ + struct iscsi_cls_session *session; + struct ddb_entry *ddb_entry; + + session = starget_to_session(scsi_target(sc->device)); + ddb_entry = session->dd_data; + + /* if we are not logged in then the LLD is going to clean up the cmd */ + if (atomic_read(&ddb_entry->state) != DDB_STATE_ONLINE) + return BLK_EH_RESET_TIMER; + else + return BLK_EH_NOT_HANDLED; +} + static void qla4xxx_recovery_timedout(struct iscsi_cls_session *session) { struct ddb_entry *ddb_entry = session->dd_data; @@ -904,18 +921,17 @@ static int qla4xxx_recover_adapter(struct scsi_qla_host *ha, /* Flush any pending ddb changed AENs */ qla4xxx_process_aen(ha, FLUSH_DDB_CHANGED_AENS); + qla4xxx_flush_active_srbs(ha); + /* Reset the firmware. If successful, function * returns with ISP interrupts enabled. */ - if (status == QLA_SUCCESS) { - DEBUG2(printk("scsi%ld: %s - Performing soft reset..\n", - ha->host_no, __func__)); - qla4xxx_flush_active_srbs(ha); - if (ql4xxx_lock_drvr_wait(ha) == QLA_SUCCESS) - status = qla4xxx_soft_reset(ha); - else - status = QLA_ERROR; - } + DEBUG2(printk("scsi%ld: %s - Performing soft reset..\n", + ha->host_no, __func__)); + if (ql4xxx_lock_drvr_wait(ha) == QLA_SUCCESS) + status = qla4xxx_soft_reset(ha); + else + status = QLA_ERROR; /* Flush any pending ddb changed AENs */ qla4xxx_process_aen(ha, FLUSH_DDB_CHANGED_AENS); @@ -1527,11 +1543,9 @@ static int qla4xxx_eh_device_reset(struct scsi_cmnd *cmd) { struct scsi_qla_host *ha = to_qla_host(cmd->device->host); struct ddb_entry *ddb_entry = cmd->device->hostdata; - struct srb *sp; int ret = FAILED, stat; - sp = (struct srb *) cmd->SCp.ptr; - if (!sp || !ddb_entry) + if (!ddb_entry) return ret; dev_info(&ha->pdev->dev, @@ -1644,7 +1658,7 @@ static int qla4xxx_eh_host_reset(struct scsi_cmnd *cmd) ha = (struct scsi_qla_host *) cmd->device->host->hostdata; dev_info(&ha->pdev->dev, - "scsi(%ld:%d:%d:%d): ADAPTER RESET ISSUED.\n", ha->host_no, + "scsi(%ld:%d:%d:%d): HOST RESET ISSUED.\n", ha->host_no, cmd->device->channel, cmd->device->id, cmd->device->lun); if (qla4xxx_wait_for_hba_online(ha) != QLA_SUCCESS) { diff --git a/drivers/scsi/qla4xxx/ql4_version.h b/drivers/scsi/qla4xxx/ql4_version.h index ab984cb89ce..6980cb279c8 100644 --- a/drivers/scsi/qla4xxx/ql4_version.h +++ b/drivers/scsi/qla4xxx/ql4_version.h @@ -5,5 +5,5 @@ * See LICENSE.qla4xxx for copyright and licensing details. */ -#define QLA4XXX_DRIVER_VERSION "5.01.00-k8" +#define QLA4XXX_DRIVER_VERSION "5.01.00-k9" diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index 783e33c65eb..b47240ca4b1 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -990,7 +990,7 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, struct iscsi_uevent *ev; int len = NLMSG_SPACE(sizeof(*ev) + data_size); - skb = alloc_skb(len, GFP_NOIO); + skb = alloc_skb(len, GFP_ATOMIC); if (!skb) { printk(KERN_ERR "can not deliver iscsi offload message:OOM\n"); return -ENOMEM; @@ -1012,7 +1012,7 @@ int iscsi_offload_mesg(struct Scsi_Host *shost, memcpy((char *)ev + sizeof(*ev), data, data_size); - return iscsi_multicast_skb(skb, ISCSI_NL_GRP_UIP, GFP_NOIO); + return iscsi_multicast_skb(skb, ISCSI_NL_GRP_UIP, GFP_ATOMIC); } EXPORT_SYMBOL_GPL(iscsi_offload_mesg); diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 5616cd780ff..b7b9fec67a9 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -1840,6 +1840,18 @@ static void sd_read_block_characteristics(struct scsi_disk *sdkp) kfree(buffer); } +static int sd_try_extended_inquiry(struct scsi_device *sdp) +{ + /* + * Although VPD inquiries can go to SCSI-2 type devices, + * some USB ones crash on receiving them, and the pages + * we currently ask for are for SPC-3 and beyond + */ + if (sdp->scsi_level > SCSI_SPC_2) + return 1; + return 0; +} + /** * sd_revalidate_disk - called the first time a new disk is seen, * performs disk spin up, read_capacity, etc. @@ -1877,8 +1889,12 @@ static int sd_revalidate_disk(struct gendisk *disk) */ if (sdkp->media_present) { sd_read_capacity(sdkp, buffer); - sd_read_block_limits(sdkp); - sd_read_block_characteristics(sdkp); + + if (sd_try_extended_inquiry(sdp)) { + sd_read_block_limits(sdkp); + sd_read_block_characteristics(sdkp); + } + sd_read_write_protect_flag(sdkp, buffer); sd_read_cache_type(sdkp, buffer); sd_read_app_tag_own(sdkp, buffer); diff --git a/drivers/serial/cpm_uart/cpm_uart_cpm2.c b/drivers/serial/cpm_uart/cpm_uart_cpm2.c index 141c0a3333a..a9802e76b5f 100644 --- a/drivers/serial/cpm_uart/cpm_uart_cpm2.c +++ b/drivers/serial/cpm_uart/cpm_uart_cpm2.c @@ -132,7 +132,7 @@ int cpm_uart_allocbuf(struct uart_cpm_port *pinfo, unsigned int is_con) memsz = L1_CACHE_ALIGN(pinfo->rx_nrfifos * pinfo->rx_fifosize) + L1_CACHE_ALIGN(pinfo->tx_nrfifos * pinfo->tx_fifosize); if (is_con) { - mem_addr = alloc_bootmem(memsz); + mem_addr = kzalloc(memsz, GFP_NOWAIT); dma_addr = virt_to_bus(mem_addr); } else diff --git a/drivers/video/console/sticore.c b/drivers/video/console/sticore.c index ef7870f5ea0..857b3668b3b 100644 --- a/drivers/video/console/sticore.c +++ b/drivers/video/console/sticore.c @@ -957,9 +957,14 @@ static int __devinit sticore_pci_init(struct pci_dev *pd, #ifdef CONFIG_PCI unsigned long fb_base, rom_base; unsigned int fb_len, rom_len; + int err; struct sti_struct *sti; - pci_enable_device(pd); + err = pci_enable_device(pd); + if (err < 0) { + dev_err(&pd->dev, "Cannot enable PCI device\n"); + return err; + } fb_base = pci_resource_start(pd, 0); fb_len = pci_resource_len(pd, 0); @@ -1048,7 +1053,7 @@ static void __devinit sti_init_roms(void) /* Register drivers for native & PCI cards */ register_parisc_driver(&pa_sti_driver); - pci_register_driver(&pci_sti_driver); + WARN_ON(pci_register_driver(&pci_sti_driver)); /* if we didn't find the given default sti, take the first one */ if (!default_sti) diff --git a/drivers/virtio/virtio_pci.c b/drivers/virtio/virtio_pci.c index bcec78ffc76..248e00ec4dc 100644 --- a/drivers/virtio/virtio_pci.c +++ b/drivers/virtio/virtio_pci.c @@ -52,8 +52,10 @@ struct virtio_pci_device char (*msix_names)[256]; /* Number of available vectors */ unsigned msix_vectors; - /* Vectors allocated */ + /* Vectors allocated, excluding per-vq vectors if any */ unsigned msix_used_vectors; + /* Whether we have vector per vq */ + bool per_vq_vectors; }; /* Constants for MSI-X */ @@ -258,7 +260,6 @@ static void vp_free_vectors(struct virtio_device *vdev) for (i = 0; i < vp_dev->msix_used_vectors; ++i) free_irq(vp_dev->msix_entries[i].vector, vp_dev); - vp_dev->msix_used_vectors = 0; if (vp_dev->msix_enabled) { /* Disable the vector used for configuration */ @@ -267,80 +268,77 @@ static void vp_free_vectors(struct virtio_device *vdev) /* Flush the write out to device */ ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - vp_dev->msix_enabled = 0; pci_disable_msix(vp_dev->pci_dev); + vp_dev->msix_enabled = 0; + vp_dev->msix_vectors = 0; } -} -static int vp_enable_msix(struct pci_dev *dev, struct msix_entry *entries, - int *options, int noptions) -{ - int i; - for (i = 0; i < noptions; ++i) - if (!pci_enable_msix(dev, entries, options[i])) - return options[i]; - return -EBUSY; + vp_dev->msix_used_vectors = 0; + kfree(vp_dev->msix_names); + vp_dev->msix_names = NULL; + kfree(vp_dev->msix_entries); + vp_dev->msix_entries = NULL; } -static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) +static int vp_request_vectors(struct virtio_device *vdev, int nvectors, + bool per_vq_vectors) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); const char *name = dev_name(&vp_dev->vdev.dev); unsigned i, v; int err = -ENOMEM; - /* We want at most one vector per queue and one for config changes. - * Fallback to separate vectors for config and a shared for queues. - * Finally fall back to regular interrupts. */ - int options[] = { max_vqs + 1, 2 }; - int nvectors = max(options[0], options[1]); + + if (!nvectors) { + /* Can't allocate MSI-X vectors, use regular interrupt */ + vp_dev->msix_vectors = 0; + err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, + IRQF_SHARED, name, vp_dev); + if (err) + return err; + vp_dev->intx_enabled = 1; + return 0; + } vp_dev->msix_entries = kmalloc(nvectors * sizeof *vp_dev->msix_entries, GFP_KERNEL); if (!vp_dev->msix_entries) - goto error_entries; + goto error; vp_dev->msix_names = kmalloc(nvectors * sizeof *vp_dev->msix_names, GFP_KERNEL); if (!vp_dev->msix_names) - goto error_names; + goto error; for (i = 0; i < nvectors; ++i) vp_dev->msix_entries[i].entry = i; - err = vp_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, - options, ARRAY_SIZE(options)); - if (err < 0) { - /* Can't allocate enough MSI-X vectors, use regular interrupt */ - vp_dev->msix_vectors = 0; - err = request_irq(vp_dev->pci_dev->irq, vp_interrupt, - IRQF_SHARED, name, vp_dev); - if (err) - goto error_irq; - vp_dev->intx_enabled = 1; - } else { - vp_dev->msix_vectors = err; - vp_dev->msix_enabled = 1; - - /* Set the vector used for configuration */ - v = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, - "%s-config", name); - err = request_irq(vp_dev->msix_entries[v].vector, - vp_config_changed, 0, vp_dev->msix_names[v], - vp_dev); - if (err) - goto error_irq; - ++vp_dev->msix_used_vectors; - - iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - /* Verify we had enough resources to assign the vector */ - v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); - if (v == VIRTIO_MSI_NO_VECTOR) { - err = -EBUSY; - goto error_irq; - } + err = pci_enable_msix(vp_dev->pci_dev, vp_dev->msix_entries, nvectors); + if (err > 0) + err = -ENOSPC; + if (err) + goto error; + vp_dev->msix_vectors = nvectors; + vp_dev->msix_enabled = 1; + + /* Set the vector used for configuration */ + v = vp_dev->msix_used_vectors; + snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, + "%s-config", name); + err = request_irq(vp_dev->msix_entries[v].vector, + vp_config_changed, 0, vp_dev->msix_names[v], + vp_dev); + if (err) + goto error; + ++vp_dev->msix_used_vectors; + + iowrite16(v, vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + /* Verify we had enough resources to assign the vector */ + v = ioread16(vp_dev->ioaddr + VIRTIO_MSI_CONFIG_VECTOR); + if (v == VIRTIO_MSI_NO_VECTOR) { + err = -EBUSY; + goto error; } - if (vp_dev->msix_vectors && vp_dev->msix_vectors != max_vqs + 1) { + if (!per_vq_vectors) { /* Shared vector for all VQs */ v = vp_dev->msix_used_vectors; snprintf(vp_dev->msix_names[v], sizeof *vp_dev->msix_names, @@ -349,28 +347,25 @@ static int vp_request_vectors(struct virtio_device *vdev, unsigned max_vqs) vp_vring_interrupt, 0, vp_dev->msix_names[v], vp_dev); if (err) - goto error_irq; + goto error; ++vp_dev->msix_used_vectors; } return 0; -error_irq: +error: vp_free_vectors(vdev); - kfree(vp_dev->msix_names); -error_names: - kfree(vp_dev->msix_entries); -error_entries: return err; } static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, void (*callback)(struct virtqueue *vq), - const char *name) + const char *name, + u16 vector) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtio_pci_vq_info *info; struct virtqueue *vq; unsigned long flags, size; - u16 num, vector; + u16 num; int err; /* Select the queue we're interested in */ @@ -389,7 +384,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, info->queue_index = index; info->num = num; - info->vector = VIRTIO_MSI_NO_VECTOR; + info->vector = vector; size = PAGE_ALIGN(vring_size(num, VIRTIO_PCI_VRING_ALIGN)); info->queue = alloc_pages_exact(size, GFP_KERNEL|__GFP_ZERO); @@ -413,22 +408,7 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, vq->priv = info; info->vq = vq; - /* allocate per-vq vector if available and necessary */ - if (callback && vp_dev->msix_used_vectors < vp_dev->msix_vectors) { - vector = vp_dev->msix_used_vectors; - snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, - "%s-%s", dev_name(&vp_dev->vdev.dev), name); - err = request_irq(vp_dev->msix_entries[vector].vector, - vring_interrupt, 0, - vp_dev->msix_names[vector], vq); - if (err) - goto out_request_irq; - info->vector = vector; - ++vp_dev->msix_used_vectors; - } else - vector = VP_MSIX_VQ_VECTOR; - - if (callback && vp_dev->msix_enabled) { + if (vector != VIRTIO_MSI_NO_VECTOR) { iowrite16(vector, vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); vector = ioread16(vp_dev->ioaddr + VIRTIO_MSI_QUEUE_VECTOR); if (vector == VIRTIO_MSI_NO_VECTOR) { @@ -444,11 +424,6 @@ static struct virtqueue *vp_find_vq(struct virtio_device *vdev, unsigned index, return vq; out_assign: - if (info->vector != VIRTIO_MSI_NO_VECTOR) { - free_irq(vp_dev->msix_entries[info->vector].vector, vq); - --vp_dev->msix_used_vectors; - } -out_request_irq: vring_del_virtqueue(vq); out_activate_queue: iowrite32(0, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_PFN); @@ -462,12 +437,13 @@ static void vp_del_vq(struct virtqueue *vq) { struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); struct virtio_pci_vq_info *info = vq->priv; - unsigned long size; + unsigned long flags, size; - iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); + spin_lock_irqsave(&vp_dev->lock, flags); + list_del(&info->node); + spin_unlock_irqrestore(&vp_dev->lock, flags); - if (info->vector != VIRTIO_MSI_NO_VECTOR) - free_irq(vp_dev->msix_entries[info->vector].vector, vq); + iowrite16(info->queue_index, vp_dev->ioaddr + VIRTIO_PCI_QUEUE_SEL); if (vp_dev->msix_enabled) { iowrite16(VIRTIO_MSI_NO_VECTOR, @@ -489,36 +465,62 @@ static void vp_del_vq(struct virtqueue *vq) /* the config->del_vqs() implementation */ static void vp_del_vqs(struct virtio_device *vdev) { + struct virtio_pci_device *vp_dev = to_vp_device(vdev); struct virtqueue *vq, *n; + struct virtio_pci_vq_info *info; - list_for_each_entry_safe(vq, n, &vdev->vqs, list) + list_for_each_entry_safe(vq, n, &vdev->vqs, list) { + info = vq->priv; + if (vp_dev->per_vq_vectors) + free_irq(vp_dev->msix_entries[info->vector].vector, vq); vp_del_vq(vq); + } + vp_dev->per_vq_vectors = false; vp_free_vectors(vdev); } -/* the config->find_vqs() implementation */ -static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, - struct virtqueue *vqs[], - vq_callback_t *callbacks[], - const char *names[]) +static int vp_try_to_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[], + int nvectors, + bool per_vq_vectors) { - int vectors = 0; - int i, err; - - /* How many vectors would we like? */ - for (i = 0; i < nvqs; ++i) - if (callbacks[i]) - ++vectors; + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + u16 vector; + int i, err, allocated_vectors; - err = vp_request_vectors(vdev, vectors); + err = vp_request_vectors(vdev, nvectors, per_vq_vectors); if (err) goto error_request; + vp_dev->per_vq_vectors = per_vq_vectors; + allocated_vectors = vp_dev->msix_used_vectors; for (i = 0; i < nvqs; ++i) { - vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i]); - if (IS_ERR(vqs[i])) + if (!callbacks[i] || !vp_dev->msix_enabled) + vector = VIRTIO_MSI_NO_VECTOR; + else if (vp_dev->per_vq_vectors) + vector = allocated_vectors++; + else + vector = VP_MSIX_VQ_VECTOR; + vqs[i] = vp_find_vq(vdev, i, callbacks[i], names[i], vector); + if (IS_ERR(vqs[i])) { + err = PTR_ERR(vqs[i]); goto error_find; + } + /* allocate per-vq irq if available and necessary */ + if (vp_dev->per_vq_vectors && vector != VIRTIO_MSI_NO_VECTOR) { + snprintf(vp_dev->msix_names[vector], sizeof *vp_dev->msix_names, + "%s-%s", dev_name(&vp_dev->vdev.dev), names[i]); + err = request_irq(vp_dev->msix_entries[vector].vector, + vring_interrupt, 0, + vp_dev->msix_names[vector], vqs[i]); + if (err) { + vp_del_vq(vqs[i]); + goto error_find; + } + } } return 0; @@ -526,7 +528,37 @@ error_find: vp_del_vqs(vdev); error_request: - return PTR_ERR(vqs[i]); + return err; +} + +/* the config->find_vqs() implementation */ +static int vp_find_vqs(struct virtio_device *vdev, unsigned nvqs, + struct virtqueue *vqs[], + vq_callback_t *callbacks[], + const char *names[]) +{ + int vectors = 0; + int i, uninitialized_var(err); + + /* How many vectors would we like? */ + for (i = 0; i < nvqs; ++i) + if (callbacks[i]) + ++vectors; + + /* We want at most one vector per queue and one for config changes. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + vectors + 1, true); + if (!err) + return 0; + /* Fallback to separate vectors for config and a shared for queues. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + 2, false); + if (!err) + return 0; + /* Finally fall back to regular interrupts. */ + err = vp_try_to_find_vqs(vdev, nvqs, vqs, callbacks, names, + 0, false); + return err; } static struct virtio_config_ops virtio_pci_config_ops = { diff --git a/drivers/watchdog/coh901327_wdt.c b/drivers/watchdog/coh901327_wdt.c index fecb307d28e..aec7cefdef2 100644 --- a/drivers/watchdog/coh901327_wdt.c +++ b/drivers/watchdog/coh901327_wdt.c @@ -18,6 +18,7 @@ #include <linux/bitops.h> #include <linux/uaccess.h> #include <linux/clk.h> +#include <linux/delay.h> #define DRV_NAME "WDOG COH 901 327" @@ -92,6 +93,8 @@ static struct clk *clk; static void coh901327_enable(u16 timeout) { u16 val; + unsigned long freq; + unsigned long delay_ns; clk_enable(clk); /* Restart timer if it is disabled */ @@ -102,6 +105,14 @@ static void coh901327_enable(u16 timeout) /* Acknowledge any pending interrupt so it doesn't just fire off */ writew(U300_WDOG_IER_WILL_BARK_IRQ_ACK_ENABLE, virtbase + U300_WDOG_IER); + /* + * The interrupt is cleared in the 32 kHz clock domain. + * Wait 3 32 kHz cycles for it to take effect + */ + freq = clk_get_rate(clk); + delay_ns = (1000000000 + freq - 1) / freq; /* Freq to ns and round up */ + delay_ns = 3 * delay_ns; /* Wait 3 cycles */ + ndelay(delay_ns); /* Enable the watchdog interrupt */ writew(U300_WDOG_IMR_WILL_BARK_IRQ_ENABLE, virtbase + U300_WDOG_IMR); /* Activate the watchdog timer */ |