486 files changed, 56575 insertions, 7414 deletions
diff --git a/Documentation/crypto/async-tx-api.txt b/Documentation/crypto/async-tx-api.txt index c1e9545c59b..9f59fcbf5d8 100644 --- a/Documentation/crypto/async-tx-api.txt +++ b/Documentation/crypto/async-tx-api.txt @@ -13,9 +13,9 @@ 3.6 Constraints 3.7 Example -4 DRIVER DEVELOPER NOTES +4 DMAENGINE DRIVER DEVELOPER NOTES 4.1 Conformance points -4.2 "My application needs finer control of hardware channels" +4.2 "My application needs exclusive control of hardware channels" 5 SOURCE @@ -150,6 +150,7 @@ ops_run_* and ops_complete_* routines in drivers/md/raid5.c for more implementation examples. 4 DRIVER DEVELOPMENT NOTES + 4.1 Conformance points: There are a few conformance points required in dmaengine drivers to accommodate assumptions made by applications using the async_tx API: @@ -158,58 +159,49 @@ accommodate assumptions made by applications using the async_tx API: 3/ Use async_tx_run_dependencies() in the descriptor clean up path to handle submission of dependent operations -4.2 "My application needs finer control of hardware channels" -This requirement seems to arise from cases where a DMA engine driver is -trying to support device-to-memory DMA. The dmaengine and async_tx -implementations were designed for offloading memory-to-memory -operations; however, there are some capabilities of the dmaengine layer -that can be used for platform-specific channel management. -Platform-specific constraints can be handled by registering the -application as a 'dma_client' and implementing a 'dma_event_callback' to -apply a filter to the available channels in the system. Before showing -how to implement a custom dma_event callback some background of -dmaengine's client support is required. - -The following routines in dmaengine support multiple clients requesting -use of a channel: -- dma_async_client_register(struct dma_client *client) -- dma_async_client_chan_request(struct dma_client *client) - -dma_async_client_register takes a pointer to an initialized dma_client -structure. It expects that the 'event_callback' and 'cap_mask' fields -are already initialized. - -dma_async_client_chan_request triggers dmaengine to notify the client of -all channels that satisfy the capability mask. It is up to the client's -event_callback routine to track how many channels the client needs and -how many it is currently using. The dma_event_callback routine returns a -dma_state_client code to let dmaengine know the status of the -allocation. - -Below is the example of how to extend this functionality for -platform-specific filtering of the available channels beyond the -standard capability mask: - -static enum dma_state_client -my_dma_client_callback(struct dma_client *client, - struct dma_chan *chan, enum dma_state state) -{ - struct dma_device *dma_dev; - struct my_platform_specific_dma *plat_dma_dev; - - dma_dev = chan->device; - plat_dma_dev = container_of(dma_dev, - struct my_platform_specific_dma, - dma_dev); - - if (!plat_dma_dev->platform_specific_capability) - return DMA_DUP; - - . . . -} +4.2 "My application needs exclusive control of hardware channels" +Primarily this requirement arises from cases where a DMA engine driver +is being used to support device-to-memory operations. A channel that is +performing these operations cannot, for many platform specific reasons, +be shared. For these cases the dma_request_channel() interface is +provided. 
+ +The interface is: +struct dma_chan *dma_request_channel(dma_cap_mask_t mask, + dma_filter_fn filter_fn, + void *filter_param); + +Where dma_filter_fn is defined as: +typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); + +When the optional 'filter_fn' parameter is set to NULL +dma_request_channel simply returns the first channel that satisfies the +capability mask. Otherwise, when the mask parameter is insufficient for +specifying the necessary channel, the filter_fn routine can be used to +disposition the available channels in the system. The filter_fn routine +is called once for each free channel in the system. Upon seeing a +suitable channel filter_fn returns DMA_ACK which flags that channel to +be the return value from dma_request_channel. A channel allocated via +this interface is exclusive to the caller, until dma_release_channel() +is called. + +The DMA_PRIVATE capability flag is used to tag dma devices that should +not be used by the general-purpose allocator. It can be set at +initialization time if it is known that a channel will always be +private. Alternatively, it is set when dma_request_channel() finds an +unused "public" channel. + +A couple caveats to note when implementing a driver and consumer: +1/ Once a channel has been privately allocated it will no longer be + considered by the general-purpose allocator even after a call to + dma_release_channel(). +2/ Since capabilities are specified at the device level a dma_device + with multiple channels will either have all channels public, or all + channels private. 5 SOURCE -include/linux/dmaengine.h: core header file for DMA drivers and clients + +include/linux/dmaengine.h: core header file for DMA drivers and api users drivers/dma/dmaengine.c: offload engine channel management routines drivers/dma/: location for offload engine drivers include/linux/async_tx.h: core header file for the async_tx api diff --git a/Documentation/dmaengine.txt b/Documentation/dmaengine.txt new file mode 100644 index 00000000000..0c1c2f63c0a --- /dev/null +++ b/Documentation/dmaengine.txt @@ -0,0 +1 @@ +See Documentation/crypto/async-tx-api.txt diff --git a/Documentation/filesystems/btrfs.txt b/Documentation/filesystems/btrfs.txt new file mode 100644 index 00000000000..64087c34327 --- /dev/null +++ b/Documentation/filesystems/btrfs.txt @@ -0,0 +1,91 @@ + + BTRFS + ===== + +Btrfs is a new copy on write filesystem for Linux aimed at +implementing advanced features while focusing on fault tolerance, +repair and easy administration. Initially developed by Oracle, Btrfs +is licensed under the GPL and open for contribution from anyone. + +Linux has a wealth of filesystems to choose from, but we are facing a +number of challenges with scaling to the large storage subsystems that +are becoming common in today's data centers. Filesystems need to scale +in their ability to address and manage large storage, and also in +their ability to detect, repair and tolerate errors in the data stored +on disk. Btrfs is under heavy development, and is not suitable for +any uses other than benchmarking and review. The Btrfs disk format is +not yet finalized. 
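Tying together the dma_request_channel()/dma_filter_fn description in the async-tx-api.txt hunk above, a minimal consumer sketch might look as follows. The capability bit, the filter criterion (matching one controller's struct device) and the helper names are illustrative assumptions, not part of this patch:

    #include <linux/dmaengine.h>

    /* Illustrative filter: accept only channels owned by one controller. */
    static bool my_filter(struct dma_chan *chan, void *filter_param)
    {
            return chan->device->dev == filter_param;  /* assumed criterion */
    }

    static struct dma_chan *my_request_channel(struct device *dma_ctrl)
    {
            dma_cap_mask_t mask;

            dma_cap_zero(mask);
            dma_cap_set(DMA_SLAVE, mask);  /* capability chosen for illustration */

            /* Exclusive ownership on success; NULL if no free channel matched. */
            return dma_request_channel(mask, my_filter, dma_ctrl);
    }

    /* ... and when the consumer is done with it: dma_release_channel(chan); */

Because the channel stays private until dma_release_channel() is called, a consumer would typically request it once (e.g. at probe time) rather than per transfer.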
+ +The main Btrfs features include: + + * Extent based file storage (2^64 max file size) + * Space efficient packing of small files + * Space efficient indexed directories + * Dynamic inode allocation + * Writable snapshots + * Subvolumes (separate internal filesystem roots) + * Object level mirroring and striping + * Checksums on data and metadata (multiple algorithms available) + * Compression + * Integrated multiple device support, with several raid algorithms + * Online filesystem check (not yet implemented) + * Very fast offline filesystem check + * Efficient incremental backup and FS mirroring (not yet implemented) + * Online filesystem defragmentation + + + + MAILING LIST + ============ + +There is a Btrfs mailing list hosted on vger.kernel.org. You can +find details on how to subscribe here: + +http://vger.kernel.org/vger-lists.html#linux-btrfs + +Mailing list archives are available from gmane: + +http://dir.gmane.org/gmane.comp.file-systems.btrfs + + + + IRC + === + +Discussion of Btrfs also occurs on the #btrfs channel of the Freenode +IRC network. + + + + UTILITIES + ========= + +Userspace tools for creating and manipulating Btrfs file systems are +available from the git repository at the following location: + + http://git.kernel.org/?p=linux/kernel/git/mason/btrfs-progs-unstable.git + git://git.kernel.org/pub/scm/linux/kernel/git/mason/btrfs-progs-unstable.git + +These include the following tools: + +mkfs.btrfs: create a filesystem + +btrfsctl: control program to create snapshots and subvolumes: + + mount /dev/sda2 /mnt + btrfsctl -s new_subvol_name /mnt + btrfsctl -s snapshot_of_default /mnt/default + btrfsctl -s snapshot_of_new_subvol /mnt/new_subvol_name + btrfsctl -s snapshot_of_a_snapshot /mnt/snapshot_of_new_subvol + ls /mnt + default snapshot_of_a_snapshot snapshot_of_new_subvol + new_subvol_name snapshot_of_default + + Snapshots and subvolumes cannot be deleted right now, but you can + rm -rf all the files and directories inside them. + +btrfsck: do a limited check of the FS extent trees. + +btrfs-debug-tree: print all of the FS metadata in text form. Example: + + btrfs-debug-tree /dev/sda2 >& big_output_file diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index ed0a72442cf..8511d3532c2 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -141,6 +141,7 @@ and is between 256 and 4096 characters. It is defined in the file ht -- run only enough ACPI to enable Hyper Threading strict -- Be less tolerant of platforms that are not strictly ACPI specification compliant. + rsdt -- prefer RSDT over (default) XSDT See also Documentation/power/pm.txt, pci=noacpi @@ -151,16 +152,20 @@ and is between 256 and 4096 characters. It is defined in the file default: 0 acpi_sleep= [HW,ACPI] Sleep options - Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, old_ordering } - See Documentation/power/video.txt for s3_bios and s3_mode. + Format: { s3_bios, s3_mode, s3_beep, s4_nohwsig, + old_ordering, s4_nonvs } + See Documentation/power/video.txt for information on + s3_bios and s3_mode. s3_beep is for debugging; it makes the PC's speaker beep as soon as the kernel's real-mode entry point is called. s4_nohwsig prevents ACPI hardware signature from being used during resume from hibernation. old_ordering causes the ACPI 1.0 ordering of the _PTS - control method, wrt putting devices into low power - states, to be enforced (the ACPI 2.0 ordering of _PTS is - used by default). 
+ control method, with respect to putting devices into + low power states, to be enforced (the ACPI 2.0 ordering + of _PTS is used by default). + s4_nonvs prevents the kernel from saving/restoring the + ACPI NVS memory during hibernation. acpi_sci= [HW,ACPI] ACPI System Control Interrupt trigger mode Format: { level | edge | high | low } @@ -195,7 +200,7 @@ and is between 256 and 4096 characters. It is defined in the file acpi_skip_timer_override [HW,ACPI] Recognize and ignore IRQ0/pin2 Interrupt Override. For broken nForce2 BIOS resulting in XT-PIC timer. - acpi_use_timer_override [HW,ACPI} + acpi_use_timer_override [HW,ACPI] Use timer override. For some broken Nvidia NF5 boards that require a timer override, but don't have HPET @@ -878,17 +883,19 @@ and is between 256 and 4096 characters. It is defined in the file See Documentation/ide/ide.txt. idle= [X86] - Format: idle=poll or idle=mwait, idle=halt, idle=nomwait - Poll forces a polling idle loop that can slightly improves the performance - of waking up a idle CPU, but will use a lot of power and make the system - run hot. Not recommended. - idle=mwait. On systems which support MONITOR/MWAIT but the kernel chose - to not use it because it doesn't save as much power as a normal idle - loop use the MONITOR/MWAIT idle loop anyways. Performance should be the same - as idle=poll. - idle=halt. Halt is forced to be used for CPU idle. + Format: idle=poll, idle=mwait, idle=halt, idle=nomwait + Poll forces a polling idle loop that can slightly + improve the performance of waking up a idle CPU, but + will use a lot of power and make the system run hot. + Not recommended. + idle=mwait: On systems which support MONITOR/MWAIT but + the kernel chose to not use it because it doesn't save + as much power as a normal idle loop, use the + MONITOR/MWAIT idle loop anyways. Performance should be + the same as idle=poll. + idle=halt: Halt is forced to be used for CPU idle. In such case C2/C3 won't be used again. - idle=nomwait. Disable mwait for CPU C-states + idle=nomwait: Disable mwait for CPU C-states ide-pci-generic.all-generic-ide [HW] (E)IDE subsystem Claim all unknown PCI IDE storage controllers. @@ -1074,8 +1081,8 @@ and is between 256 and 4096 characters. It is defined in the file lapic [X86-32,APIC] Enable the local APIC even if BIOS disabled it. - lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer in - C2 power state. + lapic_timer_c2_ok [X86-32,x86-64,APIC] trust the local apic timer + in C2 power state. libata.dma= [LIBATA] DMA control libata.dma=0 Disable all PATA and SATA DMA @@ -2303,7 +2310,8 @@ and is between 256 and 4096 characters. It is defined in the file thermal.psv= [HW,ACPI] -1: disable all passive trip points - <degrees C>: override all passive trip points to this value + <degrees C>: override all passive trip points to this + value thermal.tzp= [HW,ACPI] Specify global default ACPI thermal zone polling rate diff --git a/Documentation/powerpc/dts-bindings/4xx/ndfc.txt b/Documentation/powerpc/dts-bindings/4xx/ndfc.txt new file mode 100644 index 00000000000..869f0b5f16e --- /dev/null +++ b/Documentation/powerpc/dts-bindings/4xx/ndfc.txt @@ -0,0 +1,39 @@ +AMCC NDFC (NanD Flash Controller) + +Required properties: +- compatible : "ibm,ndfc". +- reg : should specify chip select and size used for the chip (0x2000). + +Optional properties: +- ccr : NDFC config and control register value (default 0). +- bank-settings : NDFC bank configuration register value (default 0). 
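As a driver-side illustration of the two optional NDFC properties above (a hedged sketch; the node pointer, the function name and the direct dereference are assumptions, not part of this binding document):

    #include <linux/of.h>

    /* Read the optional NDFC registers, falling back to the binding's defaults. */
    static void ndfc_read_optional_props(struct device_node *np,
                                         u32 *ccr, u32 *bank_settings)
    {
            const u32 *prop;

            *ccr = 0;            /* default 0 per the binding */
            *bank_settings = 0;  /* default 0 per the binding */

            prop = of_get_property(np, "ccr", NULL);
            if (prop)
                    *ccr = *prop;

            prop = of_get_property(np, "bank-settings", NULL);
            if (prop)
                    *bank_settings = *prop;
    }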
+ +Notes: +- partition(s) - follows the OF MTD standard for partitions + +Example: + +ndfc@1,0 { + compatible = "ibm,ndfc"; + reg = <0x00000001 0x00000000 0x00002000>; + ccr = <0x00001000>; + bank-settings = <0x80002222>; + #address-cells = <1>; + #size-cells = <1>; + + nand { + #address-cells = <1>; + #size-cells = <1>; + + partition@0 { + label = "kernel"; + reg = <0x00000000 0x00200000>; + }; + partition@200000 { + label = "root"; + reg = <0x00200000 0x03E00000>; + }; + }; +}; + + diff --git a/arch/arm/mach-pxa/corgi.c b/arch/arm/mach-pxa/corgi.c index c5e28a46b29..a8d91b6c136 100644 --- a/arch/arm/mach-pxa/corgi.c +++ b/arch/arm/mach-pxa/corgi.c @@ -27,6 +27,7 @@ #include <linux/spi/spi.h> #include <linux/spi/ads7846.h> #include <linux/spi/corgi_lcd.h> +#include <linux/mtd/sharpsl.h> #include <video/w100fb.h> #include <asm/setup.h> @@ -542,6 +543,55 @@ err_free_1: static inline void corgi_init_spi(void) {} #endif +static struct mtd_partition sharpsl_nand_partitions[] = { + { + .name = "System Area", + .offset = 0, + .size = 7 * 1024 * 1024, + }, + { + .name = "Root Filesystem", + .offset = 7 * 1024 * 1024, + .size = 25 * 1024 * 1024, + }, + { + .name = "Home Filesystem", + .offset = MTDPART_OFS_APPEND, + .size = MTDPART_SIZ_FULL, + }, +}; + +static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; + +static struct nand_bbt_descr sharpsl_bbt = { + .options = 0, + .offs = 4, + .len = 2, + .pattern = scan_ff_pattern +}; + +static struct sharpsl_nand_platform_data sharpsl_nand_platform_data = { + .badblock_pattern = &sharpsl_bbt, + .partitions = sharpsl_nand_partitions, + .nr_partitions = ARRAY_SIZE(sharpsl_nand_partitions), +}; + +static struct resource sharpsl_nand_resources[] = { + { + .start = 0x0C000000, + .end = 0x0C000FFF, + .flags = IORESOURCE_MEM, + }, +}; + +static struct platform_device sharpsl_nand_device = { + .name = "sharpsl-nand", + .id = -1, + .resource = sharpsl_nand_resources, + .num_resources = ARRAY_SIZE(sharpsl_nand_resources), + .dev.platform_data = &sharpsl_nand_platform_data, +}; + static struct mtd_partition sharpsl_rom_parts[] = { { .name ="Boot PROM Filesystem", @@ -577,6 +627,7 @@ static struct platform_device *devices[] __initdata = { &corgifb_device, &corgikbd_device, &corgiled_device, + &sharpsl_nand_device, &sharpsl_rom_device, }; @@ -617,6 +668,9 @@ static void __init corgi_init(void) platform_scoop_config = &corgi_pcmcia_config; + if (machine_is_husky()) + sharpsl_nand_partitions[1].size = 53 * 1024 * 1024; + platform_add_devices(devices, ARRAY_SIZE(devices)); } diff --git a/arch/arm/mach-pxa/poodle.c b/arch/arm/mach-pxa/poodle.c index ae88855bf97..f9093beba75 100644 --- a/arch/arm/mach-pxa/poodle.c +++ b/arch/arm/mach-pxa/poodle.c @@ -24,6 +24,7 @@ #include <linux/gpio.h> #include <linux/spi/spi.h> #include <linux/spi/ads7846.h> +#include <linux/mtd/sharpsl.h> #include <mach/hardware.h> #include <asm/mach-types.h> @@ -414,6 +415,55 @@ static struct pxafb_mach_info poodle_fb_info = { .lcd_conn = LCD_COLOR_TFT_16BPP, }; +static struct mtd_partition sharpsl_nand_partitions[] = { + { + .name = "System Area", + .offset = 0, + .size = 7 * 1024 * 1024, + }, + { + .name = "Root Filesystem", + .offset = 7 * 1024 * 1024, + .size = 22 * 1024 * 1024, + }, + { + .name = "Home Filesystem", + .offset = MTDPART_OFS_APPEND, + .size = MTDPART_SIZ_FULL, + }, +}; + +static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; + +static struct nand_bbt_descr sharpsl_bbt = { + .options = 0, + .offs = 4, + .len = 2, + .pattern = scan_ff_pattern +}; + +static struct sharpsl_nand_platform_data 
sharpsl_nand_platform_data = { + .badblock_pattern = &sharpsl_bbt, + .partitions = sharpsl_nand_partitions, + .nr_partitions = ARRAY_SIZE(sharpsl_nand_partitions), +}; + +static struct resource sharpsl_nand_resources[] = { + { + .start = 0x0C000000, + .end = 0x0C000FFF, + .flags = IORESOURCE_MEM, + }, +}; + +static struct platform_device sharpsl_nand_device = { + .name = "sharpsl-nand", + .id = -1, + .resource = sharpsl_nand_resources, + .num_resources = ARRAY_SIZE(sharpsl_nand_resources), + .dev.platform_data = &sharpsl_nand_platform_data, +}; + static struct mtd_partition sharpsl_rom_parts[] = { { .name ="Boot PROM Filesystem", @@ -447,6 +497,7 @@ static struct platform_device sharpsl_rom_device = { static struct platform_device *devices[] __initdata = { &poodle_locomo_device, &poodle_scoop_device, + &sharpsl_nand_device, &sharpsl_rom_device, }; diff --git a/arch/arm/mach-pxa/spitz.c b/arch/arm/mach-pxa/spitz.c index 7299d87a1cb..6d447c9ce8a 100644 --- a/arch/arm/mach-pxa/spitz.c +++ b/arch/arm/mach-pxa/spitz.c @@ -31,6 +31,7 @@ #include <linux/spi/spi.h> #include <linux/spi/ads7846.h> #include <linux/spi/corgi_lcd.h> +#include <linux/mtd/sharpsl.h> #include <asm/setup.h> #include <asm/memory.h> @@ -613,6 +614,54 @@ static struct pxafb_mach_info spitz_pxafb_info = { .lcd_conn = LCD_COLOR_TFT_16BPP | LCD_ALTERNATE_MAPPING, }; +static struct mtd_partition sharpsl_nand_partitions[] = { + { + .name = "System Area", + .offset = 0, + .size = 7 * 1024 * 1024, + }, + { + .name = "Root Filesystem", + .offset = 7 * 1024 * 1024, + }, + { + .name = "Home Filesystem", + .offset = MTDPART_OFS_APPEND, + .size = MTDPART_SIZ_FULL, + }, +}; + +static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; + +static struct nand_bbt_descr sharpsl_bbt = { + .options = 0, + .offs = 4, + .len = 2, + .pattern = scan_ff_pattern +}; + +static struct sharpsl_nand_platform_data sharpsl_nand_platform_data = { + .badblock_pattern = &sharpsl_bbt, + .partitions = sharpsl_nand_partitions, + .nr_partitions = ARRAY_SIZE(sharpsl_nand_partitions), +}; + +static struct resource sharpsl_nand_resources[] = { + { + .start = 0x0C000000, + .end = 0x0C000FFF, + .flags = IORESOURCE_MEM, + }, +}; + +static struct platform_device sharpsl_nand_device = { + .name = "sharpsl-nand", + .id = -1, + .resource = sharpsl_nand_resources, + .num_resources = ARRAY_SIZE(sharpsl_nand_resources), + .dev.platform_data = &sharpsl_nand_platform_data, +}; + static struct mtd_partition sharpsl_rom_parts[] = { { @@ -648,6 +697,7 @@ static struct platform_device *devices[] __initdata = { &spitzscoop_device, &spitzkbd_device, &spitzled_device, + &sharpsl_nand_device, &sharpsl_rom_device, }; @@ -671,6 +721,14 @@ static void __init common_init(void) pm_power_off = spitz_poweroff; arm_pm_restart = spitz_restart; + if (machine_is_spitz()) { + sharpsl_nand_partitions[1].size = 5 * 1024 * 1024; + } else if (machine_is_akita()) { + sharpsl_nand_partitions[1].size = 58 * 1024 * 1024; + } else if (machine_is_borzoi()) { + sharpsl_nand_partitions[1].size = 32 * 1024 * 1024; + } + PMCR = 0x00; /* Stop 3.6MHz and drive HIGH to PCMCIA and CS */ @@ -715,10 +773,29 @@ static struct i2c_board_info akita_i2c_board_info[] = { }, }; +static struct nand_bbt_descr sharpsl_akita_bbt = { + .options = 0, + .offs = 4, + .len = 1, + .pattern = scan_ff_pattern +}; + +static struct nand_ecclayout akita_oobinfo = { + .eccbytes = 24, + .eccpos = { + 0x5, 0x1, 0x2, 0x3, 0x6, 0x7, 0x15, 0x11, + 0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23, + 0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37}, + 
.oobfree = {{0x08, 0x09}} +}; + static void __init akita_init(void) { spitz_ficp_platform_data.transceiver_mode = akita_irda_transceiver_mode; + sharpsl_nand_platform_data.badblock_pattern = &sharpsl_akita_bbt; + sharpsl_nand_platform_data.ecc_layout = &akita_oobinfo; + /* We just pretend the second element of the array doesn't exist */ spitz_pcmcia_config.num_devs = 1; platform_scoop_config = &spitz_pcmcia_config; diff --git a/arch/avr32/mach-at32ap/at32ap700x.c b/arch/avr32/mach-at32ap/at32ap700x.c index ea7bc1e8562..3fbfd1e32a9 100644 --- a/arch/avr32/mach-at32ap/at32ap700x.c +++ b/arch/avr32/mach-at32ap/at32ap700x.c @@ -1305,7 +1305,7 @@ struct platform_device *__init at32_add_device_mci(unsigned int id, struct mci_platform_data *data) { struct platform_device *pdev; - struct dw_dma_slave *dws; + struct dw_dma_slave *dws = &data->dma_slave; u32 pioa_mask; u32 piob_mask; @@ -1324,22 +1324,13 @@ at32_add_device_mci(unsigned int id, struct mci_platform_data *data) ARRAY_SIZE(atmel_mci0_resource))) goto fail; - if (data->dma_slave) - dws = kmemdup(to_dw_dma_slave(data->dma_slave), - sizeof(struct dw_dma_slave), GFP_KERNEL); - else - dws = kzalloc(sizeof(struct dw_dma_slave), GFP_KERNEL); - - dws->slave.dev = &pdev->dev; - dws->slave.dma_dev = &dw_dmac0_device.dev; - dws->slave.reg_width = DMA_SLAVE_WIDTH_32BIT; + dws->dma_dev = &dw_dmac0_device.dev; + dws->reg_width = DW_DMA_SLAVE_WIDTH_32BIT; dws->cfg_hi = (DWC_CFGH_SRC_PER(0) | DWC_CFGH_DST_PER(1)); dws->cfg_lo &= ~(DWC_CFGL_HS_DST_POL | DWC_CFGL_HS_SRC_POL); - data->dma_slave = &dws->slave; - if (platform_device_add_data(pdev, data, sizeof(struct mci_platform_data))) goto fail; diff --git a/arch/ia64/include/asm/acpi-ext.h b/arch/ia64/include/asm/acpi-ext.h index 734d137dda6..7f8362b379e 100644 --- a/arch/ia64/include/asm/acpi-ext.h +++ b/arch/ia64/include/asm/acpi-ext.h @@ -14,7 +14,6 @@ #define _ASM_IA64_ACPI_EXT_H #include <linux/types.h> -#include <acpi/actypes.h> extern acpi_status hp_acpi_csr_space (acpi_handle, u64 *base, u64 *length); diff --git a/arch/ia64/include/asm/sn/acpi.h b/arch/ia64/include/asm/sn/acpi.h index 9ce2801cbd5..fd480db2556 100644 --- a/arch/ia64/include/asm/sn/acpi.h +++ b/arch/ia64/include/asm/sn/acpi.h @@ -9,8 +9,6 @@ #ifndef _ASM_IA64_SN_ACPI_H #define _ASM_IA64_SN_ACPI_H -#include "acpi/acglobal.h" - extern int sn_acpi_rev; #define SN_ACPI_BASE_SUPPORT() (sn_acpi_rev >= 0x20101) diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c index 0553648b759..d541671caf4 100644 --- a/arch/ia64/kernel/acpi.c +++ b/arch/ia64/kernel/acpi.c @@ -65,6 +65,7 @@ EXPORT_SYMBOL(pm_idle); void (*pm_power_off) (void); EXPORT_SYMBOL(pm_power_off); +u32 acpi_rsdt_forced; unsigned int acpi_cpei_override; unsigned int acpi_cpei_phys_cpuid; diff --git a/arch/ia64/sn/kernel/io_acpi_init.c b/arch/ia64/sn/kernel/io_acpi_init.c index bc610a6c785..c5a214026a7 100644 --- a/arch/ia64/sn/kernel/io_acpi_init.c +++ b/arch/ia64/sn/kernel/io_acpi_init.c @@ -13,7 +13,6 @@ #include <asm/sn/sn_sal.h> #include "xtalk/hubdev.h" #include <linux/acpi.h> -#include <acpi/acnamesp.h> /* @@ -64,6 +63,7 @@ static acpi_status __init sn_acpi_hubdev_init(acpi_handle handle, u32 depth, void *context, void **ret) { struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; u64 addr; struct hubdev_info *hubdev; struct hubdev_info *hubdev_ptr; @@ -77,11 +77,12 @@ sn_acpi_hubdev_init(acpi_handle handle, u32 depth, void *context, void **ret) status = acpi_get_vendor_resource(handle, 
METHOD_NAME__CRS, &sn_uuid, &buffer); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR "sn_acpi_hubdev_init: acpi_get_vendor_resource() " - "(0x%x) failed for: ", status); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "(0x%x) failed for: %s\n", status, + (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return AE_OK; /* Continue walking namespace */ } @@ -89,11 +90,12 @@ sn_acpi_hubdev_init(acpi_handle handle, u32 depth, void *context, void **ret) vendor = &resource->data.vendor_typed; if ((vendor->byte_length - sizeof(struct acpi_vendor_uuid)) != sizeof(struct hubdev_info *)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "sn_acpi_hubdev_init: Invalid vendor data length: %d for: ", - vendor->byte_length); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "sn_acpi_hubdev_init: Invalid vendor data length: " + "%d for: %s\n", + vendor->byte_length, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); goto exit; } @@ -120,6 +122,7 @@ sn_get_bussoft_ptr(struct pci_bus *bus) { u64 addr; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; acpi_handle handle; struct pcibus_bussoft *prom_bussoft_ptr; struct acpi_resource *resource; @@ -131,11 +134,11 @@ sn_get_bussoft_ptr(struct pci_bus *bus) status = acpi_get_vendor_resource(handle, METHOD_NAME__CRS, &sn_uuid, &buffer); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR "%s: " - "acpi_get_vendor_resource() failed (0x%x) for: ", - __func__, status); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "acpi_get_vendor_resource() failed (0x%x) for: %s\n", + __func__, status, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return NULL; } resource = buffer.pointer; @@ -168,6 +171,7 @@ sn_extract_device_info(acpi_handle handle, struct pcidev_info **pcidev_info, { u64 addr; struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; struct sn_irq_info *irq_info, *irq_info_prom; struct pcidev_info *pcidev_ptr, *pcidev_prom_ptr; struct acpi_resource *resource; @@ -182,11 +186,11 @@ sn_extract_device_info(acpi_handle handle, struct pcidev_info **pcidev_info, status = acpi_get_vendor_resource(handle, METHOD_NAME__CRS, &sn_uuid, &buffer); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "%s: acpi_get_vendor_resource() failed (0x%x) for: ", - __func__, status); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "%s: acpi_get_vendor_resource() failed (0x%x) for: %s\n", + __func__, status, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return 1; } @@ -194,11 +198,12 @@ sn_extract_device_info(acpi_handle handle, struct pcidev_info **pcidev_info, vendor = &resource->data.vendor_typed; if ((vendor->byte_length - sizeof(struct acpi_vendor_uuid)) != sizeof(struct pci_devdev_info *)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "%s: Invalid vendor data length: %d for: ", - __func__, vendor->byte_length); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "%s: Invalid vendor data length: %d for: %s\n", + __func__, vendor->byte_length, + (char *)name_buffer.pointer); + kfree(name_buffer.pointer); ret = 1; goto exit; } @@ -239,6 +244,9 @@ get_host_devfn(acpi_handle device_handle, acpi_handle 
rootbus_handle) acpi_handle parent; int slot; acpi_status status; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + + acpi_get_name(device_handle, ACPI_FULL_PATHNAME, &name_buffer); /* * Do an upward search to find the root bus device, and @@ -249,9 +257,8 @@ get_host_devfn(acpi_handle device_handle, acpi_handle rootbus_handle) status = acpi_get_parent(child, &parent); if (ACPI_FAILURE(status)) { printk(KERN_ERR "%s: acpi_get_parent() failed " - "(0x%x) for: ", __func__, status); - acpi_ns_print_node_pathname(child, NULL); - printk("\n"); + "(0x%x) for: %s\n", __func__, status, + (char *)name_buffer.pointer); panic("%s: Unable to find host devfn\n", __func__); } if (parent == rootbus_handle) @@ -259,22 +266,20 @@ get_host_devfn(acpi_handle device_handle, acpi_handle rootbus_handle) child = parent; } if (!child) { - printk(KERN_ERR "%s: Unable to find root bus for: ", - __func__); - acpi_ns_print_node_pathname(device_handle, NULL); - printk("\n"); + printk(KERN_ERR "%s: Unable to find root bus for: %s\n", + __func__, (char *)name_buffer.pointer); BUG(); } status = acpi_evaluate_integer(child, METHOD_NAME__ADR, NULL, &adr); if (ACPI_FAILURE(status)) { - printk(KERN_ERR "%s: Unable to get _ADR (0x%x) for: ", - __func__, status); - acpi_ns_print_node_pathname(child, NULL); - printk("\n"); + printk(KERN_ERR "%s: Unable to get _ADR (0x%x) for: %s\n", + __func__, status, (char *)name_buffer.pointer); panic("%s: Unable to find host devfn\n", __func__); } + kfree(name_buffer.pointer); + slot = (adr >> 16) & 0xffff; function = adr & 0xffff; devfn = PCI_DEVFN(slot, function); @@ -300,27 +305,28 @@ find_matching_device(acpi_handle handle, u32 lvl, void *context, void **rv) int function; int slot; struct sn_pcidev_match *info = context; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; status = acpi_evaluate_integer(handle, METHOD_NAME__ADR, NULL, &adr); if (ACPI_SUCCESS(status)) { status = acpi_get_parent(handle, &parent); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "%s: acpi_get_parent() failed (0x%x) for: ", - __func__, status); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "%s: acpi_get_parent() failed (0x%x) for: %s\n", + __func__, status, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return AE_OK; } status = acpi_evaluate_integer(parent, METHOD_NAME__BBN, NULL, &bbn); if (ACPI_FAILURE(status)) { + acpi_get_name(handle, ACPI_FULL_PATHNAME, &name_buffer); printk(KERN_ERR - "%s: Failed to find _BBN in parent of: ", - __func__); - acpi_ns_print_node_pathname(handle, NULL); - printk("\n"); + "%s: Failed to find _BBN in parent of: %s\n", + __func__, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return AE_OK; } @@ -350,24 +356,27 @@ sn_acpi_get_pcidev_info(struct pci_dev *dev, struct pcidev_info **pcidev_info, acpi_handle rootbus_handle; unsigned long long segment; acpi_status status; + struct acpi_buffer name_buffer = { ACPI_ALLOCATE_BUFFER, NULL }; rootbus_handle = PCI_CONTROLLER(dev)->acpi_handle; status = acpi_evaluate_integer(rootbus_handle, METHOD_NAME__SEG, NULL, &segment); if (ACPI_SUCCESS(status)) { if (segment != pci_domain_nr(dev)) { + acpi_get_name(rootbus_handle, ACPI_FULL_PATHNAME, + &name_buffer); printk(KERN_ERR - "%s: Segment number mismatch, 0x%llx vs 0x%x for: ", - __func__, segment, pci_domain_nr(dev)); - acpi_ns_print_node_pathname(rootbus_handle, NULL); - printk("\n"); + "%s: Segment number mismatch, 0x%llx vs 0x%x for: %s\n", + __func__, 
segment, pci_domain_nr(dev), + (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return 1; } } else { - printk(KERN_ERR "%s: Unable to get __SEG from: ", - __func__); - acpi_ns_print_node_pathname(rootbus_handle, NULL); - printk("\n"); + acpi_get_name(rootbus_handle, ACPI_FULL_PATHNAME, &name_buffer); + printk(KERN_ERR "%s: Unable to get __SEG from: %s\n", + __func__, (char *)name_buffer.pointer); + kfree(name_buffer.pointer); return 1; } diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c index 8a924a5661d..0d4ffa4da1d 100644 --- a/arch/ia64/sn/kernel/io_common.c +++ b/arch/ia64/sn/kernel/io_common.c @@ -26,7 +26,6 @@ #include <linux/acpi.h> #include <asm/sn/sn2/sn_hwperf.h> #include <asm/sn/acpi.h> -#include "acpi/acglobal.h" extern void sn_init_cpei_timer(void); extern void register_sn_procfs(void); @@ -473,7 +472,7 @@ sn_io_early_init(void) { struct acpi_table_header *header = NULL; - acpi_get_table_by_index(ACPI_TABLE_INDEX_DSDT, &header); + acpi_get_table(ACPI_SIG_DSDT, 1, &header); BUG_ON(header == NULL); sn_acpi_rev = header->oem_revision; } @@ -505,7 +504,7 @@ sn_io_early_init(void) { struct acpi_table_header *header; - (void)acpi_get_table_by_index(ACPI_TABLE_INDEX_DSDT, &header); + (void)acpi_get_table(ACPI_SIG_DSDT, 1, &header); printk(KERN_INFO "ACPI DSDT OEM Rev 0x%x\n", header->oem_revision); } diff --git a/arch/parisc/Makefile b/arch/parisc/Makefile index 5ddad7bd60a..0d428278356 100644 --- a/arch/parisc/Makefile +++ b/arch/parisc/Makefile @@ -77,7 +77,7 @@ libs-y += arch/parisc/lib/ `$(CC) -print-libgcc-file-name` drivers-$(CONFIG_OPROFILE) += arch/parisc/oprofile/ -PALO := $(shell if which palo; then : ; \ +PALO := $(shell if (which palo 2>&1); then : ; \ elif [ -x /sbin/palo ]; then echo /sbin/palo; \ fi) diff --git a/arch/parisc/include/asm/Kbuild b/arch/parisc/include/asm/Kbuild index f88b252e419..2121d99f836 100644 --- a/arch/parisc/include/asm/Kbuild +++ b/arch/parisc/include/asm/Kbuild @@ -1,3 +1,4 @@ include include/asm-generic/Kbuild.asm unifdef-y += pdc.h +unifdef-y += swab.h diff --git a/arch/parisc/include/asm/byteorder.h b/arch/parisc/include/asm/byteorder.h index db148313de5..da66029c4cb 100644 --- a/arch/parisc/include/asm/byteorder.h +++ b/arch/parisc/include/asm/byteorder.h @@ -1,82 +1,7 @@ #ifndef _PARISC_BYTEORDER_H #define _PARISC_BYTEORDER_H -#include <asm/types.h> -#include <linux/compiler.h> - -#ifdef __GNUC__ - -static __inline__ __attribute_const__ __u16 ___arch__swab16(__u16 x) -{ - __asm__("dep %0, 15, 8, %0\n\t" /* deposit 00ab -> 0bab */ - "shd %%r0, %0, 8, %0" /* shift 000000ab -> 00ba */ - : "=r" (x) - : "0" (x)); - return x; -} - -static __inline__ __attribute_const__ __u32 ___arch__swab24(__u32 x) -{ - __asm__("shd %0, %0, 8, %0\n\t" /* shift xabcxabc -> cxab */ - "dep %0, 15, 8, %0\n\t" /* deposit cxab -> cbab */ - "shd %%r0, %0, 8, %0" /* shift 0000cbab -> 0cba */ - : "=r" (x) - : "0" (x)); - return x; -} - -static __inline__ __attribute_const__ __u32 ___arch__swab32(__u32 x) -{ - unsigned int temp; - __asm__("shd %0, %0, 16, %1\n\t" /* shift abcdabcd -> cdab */ - "dep %1, 15, 8, %1\n\t" /* deposit cdab -> cbab */ - "shd %0, %1, 8, %0" /* shift abcdcbab -> dcba */ - : "=r" (x), "=&r" (temp) - : "0" (x)); - return x; -} - - -#if BITS_PER_LONG > 32 -/* -** From "PA-RISC 2.0 Architecture", HP Professional Books. -** See Appendix I page 8 , "Endian Byte Swapping". 
-** -** Pretty cool algorithm: (* == zero'd bits) -** PERMH 01234567 -> 67452301 into %0 -** HSHL 67452301 -> 7*5*3*1* into %1 -** HSHR 67452301 -> *6*4*2*0 into %0 -** OR %0 | %1 -> 76543210 into %0 (all done!) -*/ -static __inline__ __attribute_const__ __u64 ___arch__swab64(__u64 x) { - __u64 temp; - __asm__("permh,3210 %0, %0\n\t" - "hshl %0, 8, %1\n\t" - "hshr,u %0, 8, %0\n\t" - "or %1, %0, %0" - : "=r" (x), "=&r" (temp) - : "0" (x)); - return x; -} -#define __arch__swab64(x) ___arch__swab64(x) -#define __BYTEORDER_HAS_U64__ -#elif !defined(__STRICT_ANSI__) -static __inline__ __attribute_const__ __u64 ___arch__swab64(__u64 x) -{ - __u32 t1 = ___arch__swab32((__u32) x); - __u32 t2 = ___arch__swab32((__u32) (x >> 32)); - return (((__u64) t1 << 32) | t2); -} -#define __arch__swab64(x) ___arch__swab64(x) -#define __BYTEORDER_HAS_U64__ -#endif - -#define __arch__swab16(x) ___arch__swab16(x) -#define __arch__swab24(x) ___arch__swab24(x) -#define __arch__swab32(x) ___arch__swab32(x) - -#endif /* __GNUC__ */ - +#include <asm/swab.h> #include <linux/byteorder/big_endian.h> #endif /* _PARISC_BYTEORDER_H */ diff --git a/arch/parisc/include/asm/checksum.h b/arch/parisc/include/asm/checksum.h index e9639ccc3fc..c84b2fcb18a 100644 --- a/arch/parisc/include/asm/checksum.h +++ b/arch/parisc/include/asm/checksum.h @@ -182,7 +182,7 @@ static __inline__ __sum16 csum_ipv6_magic(const struct in6_addr *saddr, #endif : "=r" (sum), "=r" (saddr), "=r" (daddr), "=r" (len) : "0" (sum), "1" (saddr), "2" (daddr), "3" (len), "r" (proto) - : "r19", "r20", "r21", "r22"); + : "r19", "r20", "r21", "r22", "memory"); return csum_fold(sum); } diff --git a/arch/parisc/include/asm/io.h b/arch/parisc/include/asm/io.h index 55ddb184210..d3031d1f9d0 100644 --- a/arch/parisc/include/asm/io.h +++ b/arch/parisc/include/asm/io.h @@ -4,12 +4,6 @@ #include <linux/types.h> #include <asm/pgtable.h> -extern unsigned long parisc_vmerge_boundary; -extern unsigned long parisc_vmerge_max_size; - -#define BIO_VMERGE_BOUNDARY parisc_vmerge_boundary -#define BIO_VMERGE_MAX_SIZE parisc_vmerge_max_size - #define virt_to_phys(a) ((unsigned long)__pa(a)) #define phys_to_virt(a) __va(a) #define virt_to_bus virt_to_phys @@ -182,9 +176,9 @@ static inline void __raw_writeq(unsigned long long b, volatile void __iomem *add /* readb can never be const, so use __fswab instead of le*_to_cpu */ #define readb(addr) __raw_readb(addr) -#define readw(addr) __fswab16(__raw_readw(addr)) -#define readl(addr) __fswab32(__raw_readl(addr)) -#define readq(addr) __fswab64(__raw_readq(addr)) +#define readw(addr) le16_to_cpu(__raw_readw(addr)) +#define readl(addr) le32_to_cpu(__raw_readl(addr)) +#define readq(addr) le64_to_cpu(__raw_readq(addr)) #define writeb(b, addr) __raw_writeb(b, addr) #define writew(b, addr) __raw_writew(cpu_to_le16(b), addr) #define writel(b, addr) __raw_writel(cpu_to_le32(b), addr) diff --git a/arch/parisc/include/asm/mmu_context.h b/arch/parisc/include/asm/mmu_context.h index 85856c74ad1..354b2aca990 100644 --- a/arch/parisc/include/asm/mmu_context.h +++ b/arch/parisc/include/asm/mmu_context.h @@ -34,16 +34,21 @@ destroy_context(struct mm_struct *mm) mm->context = 0; } -static inline void load_context(mm_context_t context) +static inline unsigned long __space_to_prot(mm_context_t context) { - mtsp(context, 3); #if SPACEID_SHIFT == 0 - mtctl(context << 1,8); + return context << 1; #else - mtctl(context >> (SPACEID_SHIFT - 1),8); + return context >> (SPACEID_SHIFT - 1); #endif } +static inline void load_context(mm_context_t context) +{ + 
mtsp(context, 3); + mtctl(__space_to_prot(context), 8); +} + static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) { diff --git a/arch/parisc/include/asm/processor.h b/arch/parisc/include/asm/processor.h index 3c9d34844c8..9d64df8754b 100644 --- a/arch/parisc/include/asm/processor.h +++ b/arch/parisc/include/asm/processor.h @@ -17,6 +17,7 @@ #include <asm/ptrace.h> #include <asm/types.h> #include <asm/system.h> +#include <asm/percpu.h> #endif /* __ASSEMBLY__ */ #define KERNEL_STACK_SIZE (4*PAGE_SIZE) @@ -109,8 +110,7 @@ struct cpuinfo_parisc { }; extern struct system_cpuinfo_parisc boot_cpu_data; -extern struct cpuinfo_parisc cpu_data[NR_CPUS]; -#define current_cpu_data cpu_data[smp_processor_id()] +DECLARE_PER_CPU(struct cpuinfo_parisc, cpu_data); #define CPU_HVERSION ((boot_cpu_data.hversion >> 4) & 0x0FFF) diff --git a/arch/parisc/include/asm/swab.h b/arch/parisc/include/asm/swab.h new file mode 100644 index 00000000000..3ff16c5a335 --- /dev/null +++ b/arch/parisc/include/asm/swab.h @@ -0,0 +1,66 @@ +#ifndef _PARISC_SWAB_H +#define _PARISC_SWAB_H + +#include <asm/types.h> +#include <linux/compiler.h> + +#define __SWAB_64_THRU_32__ + +static inline __attribute_const__ __u16 __arch_swab16(__u16 x) +{ + __asm__("dep %0, 15, 8, %0\n\t" /* deposit 00ab -> 0bab */ + "shd %%r0, %0, 8, %0" /* shift 000000ab -> 00ba */ + : "=r" (x) + : "0" (x)); + return x; +} +#define __arch_swab16 __arch_swab16 + +static inline __attribute_const__ __u32 __arch_swab24(__u32 x) +{ + __asm__("shd %0, %0, 8, %0\n\t" /* shift xabcxabc -> cxab */ + "dep %0, 15, 8, %0\n\t" /* deposit cxab -> cbab */ + "shd %%r0, %0, 8, %0" /* shift 0000cbab -> 0cba */ + : "=r" (x) + : "0" (x)); + return x; +} + +static inline __attribute_const__ __u32 __arch_swab32(__u32 x) +{ + unsigned int temp; + __asm__("shd %0, %0, 16, %1\n\t" /* shift abcdabcd -> cdab */ + "dep %1, 15, 8, %1\n\t" /* deposit cdab -> cbab */ + "shd %0, %1, 8, %0" /* shift abcdcbab -> dcba */ + : "=r" (x), "=&r" (temp) + : "0" (x)); + return x; +} +#define __arch_swab32 __arch_swab32 + +#if BITS_PER_LONG > 32 +/* +** From "PA-RISC 2.0 Architecture", HP Professional Books. +** See Appendix I page 8 , "Endian Byte Swapping". +** +** Pretty cool algorithm: (* == zero'd bits) +** PERMH 01234567 -> 67452301 into %0 +** HSHL 67452301 -> 7*5*3*1* into %1 +** HSHR 67452301 -> *6*4*2*0 into %0 +** OR %0 | %1 -> 76543210 into %0 (all done!) 
+*/ +static inline __attribute_const__ __u64 __arch_swab64(__u64 x) +{ + __u64 temp; + __asm__("permh,3210 %0, %0\n\t" + "hshl %0, 8, %1\n\t" + "hshr,u %0, 8, %0\n\t" + "or %1, %0, %0" + : "=r" (x), "=&r" (temp) + : "0" (x)); + return x; +} +#define __arch_swab64 __arch_swab64 +#endif /* BITS_PER_LONG > 32 */ + +#endif /* _PARISC_SWAB_H */ diff --git a/arch/parisc/include/asm/uaccess.h b/arch/parisc/include/asm/uaccess.h index 4878b9501f2..1c6dbb6f6e5 100644 --- a/arch/parisc/include/asm/uaccess.h +++ b/arch/parisc/include/asm/uaccess.h @@ -241,4 +241,6 @@ unsigned long copy_in_user(void __user *dst, const void __user *src, unsigned lo #define __copy_to_user_inatomic __copy_to_user #define __copy_from_user_inatomic __copy_from_user +int fixup_exception(struct pt_regs *regs); + #endif /* __PARISC_UACCESS_H */ diff --git a/arch/parisc/kernel/drivers.c b/arch/parisc/kernel/drivers.c index 884b7ce16a3..994bcd98090 100644 --- a/arch/parisc/kernel/drivers.c +++ b/arch/parisc/kernel/drivers.c @@ -549,6 +549,38 @@ static int parisc_generic_match(struct device *dev, struct device_driver *drv) return match_device(to_parisc_driver(drv), to_parisc_device(dev)); } +static ssize_t make_modalias(struct device *dev, char *buf) +{ + const struct parisc_device *padev = to_parisc_device(dev); + const struct parisc_device_id *id = &padev->id; + + return sprintf(buf, "parisc:t%02Xhv%04Xrev%02Xsv%08X\n", + (u8)id->hw_type, (u16)id->hversion, (u8)id->hversion_rev, + (u32)id->sversion); +} + +static int parisc_uevent(struct device *dev, struct kobj_uevent_env *env) +{ + const struct parisc_device *padev; + char modalias[40]; + + if (!dev) + return -ENODEV; + + padev = to_parisc_device(dev); + if (!padev) + return -ENODEV; + + if (add_uevent_var(env, "PARISC_NAME=%s", padev->name)) + return -ENOMEM; + + make_modalias(dev, modalias); + if (add_uevent_var(env, "MODALIAS=%s", modalias)) + return -ENOMEM; + + return 0; +} + #define pa_dev_attr(name, field, format_string) \ static ssize_t name##_show(struct device *dev, struct device_attribute *attr, char *buf) \ { \ @@ -566,12 +598,7 @@ pa_dev_attr_id(sversion, "0x%05x\n"); static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct parisc_device *padev = to_parisc_device(dev); - struct parisc_device_id *id = &padev->id; - - return sprintf(buf, "parisc:t%02Xhv%04Xrev%02Xsv%08X\n", - (u8)id->hw_type, (u16)id->hversion, (u8)id->hversion_rev, - (u32)id->sversion); + return make_modalias(dev, buf); } static struct device_attribute parisc_device_attrs[] = { @@ -587,6 +614,7 @@ static struct device_attribute parisc_device_attrs[] = { struct bus_type parisc_bus_type = { .name = "parisc", .match = parisc_generic_match, + .uevent = parisc_uevent, .dev_attrs = parisc_device_attrs, .probe = parisc_driver_probe, .remove = parisc_driver_remove, diff --git a/arch/parisc/kernel/hpmc.S b/arch/parisc/kernel/hpmc.S index 2cbf13b3ef1..5595a2f3118 100644 --- a/arch/parisc/kernel/hpmc.S +++ b/arch/parisc/kernel/hpmc.S @@ -80,6 +80,7 @@ END(hpmc_pim_data) .import intr_save, code ENTRY(os_hpmc) +.os_hpmc: /* * registers modified: @@ -295,5 +296,10 @@ os_hpmc_6: b . 
nop ENDPROC(os_hpmc) -ENTRY(os_hpmc_end) /* this label used to compute os_hpmc checksum */ +.os_hpmc_end: nop +.data +.align 4 + .export os_hpmc_size +os_hpmc_size: + .word .os_hpmc_end-.os_hpmc diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c index 4cea935e2f9..ac2c822928c 100644 --- a/arch/parisc/kernel/irq.c +++ b/arch/parisc/kernel/irq.c @@ -298,7 +298,7 @@ unsigned long txn_affinity_addr(unsigned int irq, int cpu) irq_desc[irq].affinity = cpumask_of_cpu(cpu); #endif - return cpu_data[cpu].txn_addr; + return per_cpu(cpu_data, cpu).txn_addr; } @@ -309,8 +309,9 @@ unsigned long txn_alloc_addr(unsigned int virt_irq) next_cpu++; /* assign to "next" CPU we want this bugger on */ /* validate entry */ - while ((next_cpu < NR_CPUS) && (!cpu_data[next_cpu].txn_addr || - !cpu_online(next_cpu))) + while ((next_cpu < NR_CPUS) && + (!per_cpu(cpu_data, next_cpu).txn_addr || + !cpu_online(next_cpu))) next_cpu++; if (next_cpu >= NR_CPUS) @@ -359,7 +360,7 @@ void do_cpu_irq_mask(struct pt_regs *regs) printk(KERN_DEBUG "redirecting irq %d from CPU %d to %d\n", irq, smp_processor_id(), cpu); gsc_writel(irq + CPU_IRQ_BASE, - cpu_data[cpu].hpa); + per_cpu(cpu_data, cpu).hpa); goto set_out; } #endif @@ -421,5 +422,5 @@ void __init init_IRQ(void) void ack_bad_irq(unsigned int irq) { - printk("unexpected IRQ %d\n", irq); + printk(KERN_WARNING "unexpected IRQ %d\n", irq); } diff --git a/arch/parisc/kernel/pdc_cons.c b/arch/parisc/kernel/pdc_cons.c index ccb68090781..1ff366cb968 100644 --- a/arch/parisc/kernel/pdc_cons.c +++ b/arch/parisc/kernel/pdc_cons.c @@ -52,7 +52,7 @@ #include <linux/tty.h> #include <asm/pdc.h> /* for iodc_call() proto and friends */ -static spinlock_t pdc_console_lock = SPIN_LOCK_UNLOCKED; +static DEFINE_SPINLOCK(pdc_console_lock); static void pdc_console_write(struct console *co, const char *s, unsigned count) { diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index f696f57faa1..75099efb3bf 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -541,9 +541,9 @@ static int __init perf_init(void) spin_lock_init(&perf_lock); /* TODO: this only lets us access the first cpu.. what to do for SMP? */ - cpu_device = cpu_data[0].dev; + cpu_device = per_cpu(cpu_data, 0).dev; printk("Performance monitoring counters enabled for %s\n", - cpu_data[0].dev->name); + per_cpu(cpu_data, 0).dev->name); return 0; } diff --git a/arch/parisc/kernel/processor.c b/arch/parisc/kernel/processor.c index 370086fb833..ecb609342fe 100644 --- a/arch/parisc/kernel/processor.c +++ b/arch/parisc/kernel/processor.c @@ -3,7 +3,7 @@ * Initial setup-routines for HP 9000 based hardware. * * Copyright (C) 1991, 1992, 1995 Linus Torvalds - * Modifications for PA-RISC (C) 1999 Helge Deller <deller@gmx.de> + * Modifications for PA-RISC (C) 1999-2008 Helge Deller <deller@gmx.de> * Modifications copyright 1999 SuSE GmbH (Philipp Rumpf) * Modifications copyright 2000 Martin K. Petersen <mkp@mkp.net> * Modifications copyright 2000 Philipp Rumpf <prumpf@tux.org> @@ -46,7 +46,7 @@ struct system_cpuinfo_parisc boot_cpu_data __read_mostly; EXPORT_SYMBOL(boot_cpu_data); -struct cpuinfo_parisc cpu_data[NR_CPUS] __read_mostly; +DEFINE_PER_CPU(struct cpuinfo_parisc, cpu_data); extern int update_cr16_clocksource(void); /* from time.c */ @@ -69,6 +69,23 @@ extern int update_cr16_clocksource(void); /* from time.c */ */ /** + * init_cpu_profiler - enable/setup per cpu profiling hooks. + * @cpunum: The processor instance. + * + * FIXME: doesn't do much yet... 
+ */ +static void __cpuinit +init_percpu_prof(unsigned long cpunum) +{ + struct cpuinfo_parisc *p; + + p = &per_cpu(cpu_data, cpunum); + p->prof_counter = 1; + p->prof_multiplier = 1; +} + + +/** * processor_probe - Determine if processor driver should claim this device. * @dev: The device which has been found. * @@ -147,7 +164,7 @@ static int __cpuinit processor_probe(struct parisc_device *dev) } #endif - p = &cpu_data[cpuid]; + p = &per_cpu(cpu_data, cpuid); boot_cpu_data.cpu_count++; /* initialize counters - CPU 0 gets it_value set in time_init() */ @@ -162,12 +179,9 @@ static int __cpuinit processor_probe(struct parisc_device *dev) #ifdef CONFIG_SMP /* ** FIXME: review if any other initialization is clobbered - ** for boot_cpu by the above memset(). + ** for boot_cpu by the above memset(). */ - - /* stolen from init_percpu_prof() */ - cpu_data[cpuid].prof_counter = 1; - cpu_data[cpuid].prof_multiplier = 1; + init_percpu_prof(cpuid); #endif /* @@ -261,19 +275,6 @@ void __init collect_boot_cpu_data(void) } -/** - * init_cpu_profiler - enable/setup per cpu profiling hooks. - * @cpunum: The processor instance. - * - * FIXME: doesn't do much yet... - */ -static inline void __init -init_percpu_prof(int cpunum) -{ - cpu_data[cpunum].prof_counter = 1; - cpu_data[cpunum].prof_multiplier = 1; -} - /** * init_per_cpu - Handle individual processor initializations. @@ -293,7 +294,7 @@ init_percpu_prof(int cpunum) * * o Enable CPU profiling hooks. */ -int __init init_per_cpu(int cpunum) +int __cpuinit init_per_cpu(int cpunum) { int ret; struct pdc_coproc_cfg coproc_cfg; @@ -307,8 +308,8 @@ int __init init_per_cpu(int cpunum) /* FWIW, FP rev/model is a more accurate way to determine ** CPU type. CPU rev/model has some ambiguous cases. */ - cpu_data[cpunum].fp_rev = coproc_cfg.revision; - cpu_data[cpunum].fp_model = coproc_cfg.model; + per_cpu(cpu_data, cpunum).fp_rev = coproc_cfg.revision; + per_cpu(cpu_data, cpunum).fp_model = coproc_cfg.model; printk(KERN_INFO "FP[%d] enabled: Rev %ld Model %ld\n", cpunum, coproc_cfg.revision, coproc_cfg.model); @@ -344,16 +345,17 @@ int __init init_per_cpu(int cpunum) int show_cpuinfo (struct seq_file *m, void *v) { - int n; + unsigned long cpu; - for(n=0; n<boot_cpu_data.cpu_count; n++) { + for_each_online_cpu(cpu) { + const struct cpuinfo_parisc *cpuinfo = &per_cpu(cpu_data, cpu); #ifdef CONFIG_SMP - if (0 == cpu_data[n].hpa) + if (0 == cpuinfo->hpa) continue; #endif - seq_printf(m, "processor\t: %d\n" + seq_printf(m, "processor\t: %lu\n" "cpu family\t: PA-RISC %s\n", - n, boot_cpu_data.family_name); + cpu, boot_cpu_data.family_name); seq_printf(m, "cpu\t\t: %s\n", boot_cpu_data.cpu_name ); @@ -365,8 +367,8 @@ show_cpuinfo (struct seq_file *m, void *v) seq_printf(m, "model\t\t: %s\n" "model name\t: %s\n", boot_cpu_data.pdc.sys_model_name, - cpu_data[n].dev ? - cpu_data[n].dev->name : "Unknown" ); + cpuinfo->dev ? 
+ cpuinfo->dev->name : "Unknown"); seq_printf(m, "hversion\t: 0x%08x\n" "sversion\t: 0x%08x\n", @@ -377,8 +379,8 @@ show_cpuinfo (struct seq_file *m, void *v) show_cache_info(m); seq_printf(m, "bogomips\t: %lu.%02lu\n", - cpu_data[n].loops_per_jiffy / (500000 / HZ), - (cpu_data[n].loops_per_jiffy / (5000 / HZ)) % 100); + cpuinfo->loops_per_jiffy / (500000 / HZ), + (cpuinfo->loops_per_jiffy / (5000 / HZ)) % 100); seq_printf(m, "software id\t: %ld\n\n", boot_cpu_data.pdc.model.sw_id); diff --git a/arch/parisc/kernel/setup.c b/arch/parisc/kernel/setup.c index 7d27853ff8c..82131ca8e05 100644 --- a/arch/parisc/kernel/setup.c +++ b/arch/parisc/kernel/setup.c @@ -58,11 +58,6 @@ int parisc_bus_is_phys __read_mostly = 1; /* Assume no IOMMU is present */ EXPORT_SYMBOL(parisc_bus_is_phys); #endif -/* This sets the vmerge boundary and size, it's here because it has to - * be available on all platforms (zero means no-virtual merging) */ -unsigned long parisc_vmerge_boundary = 0; -unsigned long parisc_vmerge_max_size = 0; - void __init setup_cmdline(char **cmdline_p) { extern unsigned int boot_args[]; @@ -321,7 +316,7 @@ static int __init parisc_init(void) processor_init(); printk(KERN_INFO "CPU(s): %d x %s at %d.%06d MHz\n", - boot_cpu_data.cpu_count, + num_present_cpus(), boot_cpu_data.cpu_name, boot_cpu_data.cpu_hz / 1000000, boot_cpu_data.cpu_hz % 1000000 ); @@ -387,8 +382,8 @@ void start_parisc(void) if (ret >= 0 && coproc_cfg.ccr_functional) { mtctl(coproc_cfg.ccr_functional, 10); - cpu_data[cpunum].fp_rev = coproc_cfg.revision; - cpu_data[cpunum].fp_model = coproc_cfg.model; + per_cpu(cpu_data, cpunum).fp_rev = coproc_cfg.revision; + per_cpu(cpu_data, cpunum).fp_model = coproc_cfg.model; asm volatile ("fstd %fr0,8(%sp)"); } else { diff --git a/arch/parisc/kernel/smp.c b/arch/parisc/kernel/smp.c index 80bc000523f..9995d7ed581 100644 --- a/arch/parisc/kernel/smp.c +++ b/arch/parisc/kernel/smp.c @@ -56,16 +56,17 @@ static int smp_debug_lvl = 0; if (lvl >= smp_debug_lvl) \ printk(printargs); #else -#define smp_debug(lvl, ...) +#define smp_debug(lvl, ...) 
do { } while(0) #endif /* DEBUG_SMP */ DEFINE_SPINLOCK(smp_lock); volatile struct task_struct *smp_init_current_idle_task; -static volatile int cpu_now_booting __read_mostly = 0; /* track which CPU is booting */ +/* track which CPU is booting */ +static volatile int cpu_now_booting __cpuinitdata; -static int parisc_max_cpus __read_mostly = 1; +static int parisc_max_cpus __cpuinitdata = 1; DEFINE_PER_CPU(spinlock_t, ipi_lock) = SPIN_LOCK_UNLOCKED; @@ -123,7 +124,7 @@ irqreturn_t ipi_interrupt(int irq, void *dev_id) { int this_cpu = smp_processor_id(); - struct cpuinfo_parisc *p = &cpu_data[this_cpu]; + struct cpuinfo_parisc *p = &per_cpu(cpu_data, this_cpu); unsigned long ops; unsigned long flags; @@ -202,13 +203,13 @@ ipi_interrupt(int irq, void *dev_id) static inline void ipi_send(int cpu, enum ipi_message_type op) { - struct cpuinfo_parisc *p = &cpu_data[cpu]; + struct cpuinfo_parisc *p = &per_cpu(cpu_data, cpu); spinlock_t *lock = &per_cpu(ipi_lock, cpu); unsigned long flags; spin_lock_irqsave(lock, flags); p->pending_ipi |= 1 << op; - gsc_writel(IPI_IRQ - CPU_IRQ_BASE, cpu_data[cpu].hpa); + gsc_writel(IPI_IRQ - CPU_IRQ_BASE, p->hpa); spin_unlock_irqrestore(lock, flags); } @@ -224,10 +225,7 @@ send_IPI_mask(cpumask_t mask, enum ipi_message_type op) static inline void send_IPI_single(int dest_cpu, enum ipi_message_type op) { - if (dest_cpu == NO_PROC_ID) { - BUG(); - return; - } + BUG_ON(dest_cpu == NO_PROC_ID); ipi_send(dest_cpu, op); } @@ -309,8 +307,7 @@ smp_cpu_init(int cpunum) /* Initialise the idle task for this CPU */ atomic_inc(&init_mm.mm_count); current->active_mm = &init_mm; - if(current->mm) - BUG(); + BUG_ON(current->mm); enter_lazy_tlb(&init_mm, current); init_IRQ(); /* make sure no IRQs are enabled or pending */ @@ -345,6 +342,7 @@ void __init smp_callin(void) */ int __cpuinit smp_boot_one_cpu(int cpuid) { + const struct cpuinfo_parisc *p = &per_cpu(cpu_data, cpuid); struct task_struct *idle; long timeout; @@ -376,7 +374,7 @@ int __cpuinit smp_boot_one_cpu(int cpuid) smp_init_current_idle_task = idle ; mb(); - printk("Releasing cpu %d now, hpa=%lx\n", cpuid, cpu_data[cpuid].hpa); + printk(KERN_INFO "Releasing cpu %d now, hpa=%lx\n", cpuid, p->hpa); /* ** This gets PDC to release the CPU from a very tight loop. @@ -387,7 +385,7 @@ int __cpuinit smp_boot_one_cpu(int cpuid) ** EIR{0}). MEM_RENDEZ is valid only when it is nonzero and the ** contents of memory are valid." 
*/ - gsc_writel(TIMER_IRQ - CPU_IRQ_BASE, cpu_data[cpuid].hpa); + gsc_writel(TIMER_IRQ - CPU_IRQ_BASE, p->hpa); mb(); /* @@ -419,12 +417,12 @@ alive: return 0; } -void __devinit smp_prepare_boot_cpu(void) +void __init smp_prepare_boot_cpu(void) { - int bootstrap_processor=cpu_data[0].cpuid; /* CPU ID of BSP */ + int bootstrap_processor = per_cpu(cpu_data, 0).cpuid; /* Setup BSP mappings */ - printk("SMP: bootstrap CPU ID is %d\n",bootstrap_processor); + printk(KERN_INFO "SMP: bootstrap CPU ID is %d\n", bootstrap_processor); cpu_set(bootstrap_processor, cpu_online_map); cpu_set(bootstrap_processor, cpu_present_map); diff --git a/arch/parisc/kernel/time.c b/arch/parisc/kernel/time.c index 4d09203bc69..9d46c43a415 100644 --- a/arch/parisc/kernel/time.c +++ b/arch/parisc/kernel/time.c @@ -60,7 +60,7 @@ irqreturn_t timer_interrupt(int irq, void *dev_id) unsigned long cycles_elapsed, ticks_elapsed; unsigned long cycles_remainder; unsigned int cpu = smp_processor_id(); - struct cpuinfo_parisc *cpuinfo = &cpu_data[cpu]; + struct cpuinfo_parisc *cpuinfo = &per_cpu(cpu_data, cpu); /* gcc can optimize for "read-only" case with a local clocktick */ unsigned long cpt = clocktick; @@ -213,7 +213,7 @@ void __init start_cpu_itimer(void) mtctl(next_tick, 16); /* kick off Interval Timer (CR16) */ - cpu_data[cpu].it_value = next_tick; + per_cpu(cpu_data, cpu).it_value = next_tick; } struct platform_device rtc_parisc_dev = { diff --git a/arch/parisc/kernel/topology.c b/arch/parisc/kernel/topology.c index d71cb018a21..f5159381fdd 100644 --- a/arch/parisc/kernel/topology.c +++ b/arch/parisc/kernel/topology.c @@ -22,14 +22,14 @@ #include <linux/cpu.h> #include <linux/cache.h> -static struct cpu cpu_devices[NR_CPUS] __read_mostly; +static DEFINE_PER_CPU(struct cpu, cpu_devices); static int __init topology_init(void) { int num; for_each_present_cpu(num) { - register_cpu(&cpu_devices[num], num); + register_cpu(&per_cpu(cpu_devices, num), num); } return 0; } diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c index 4c771cd580e..ba658d2086f 100644 --- a/arch/parisc/kernel/traps.c +++ b/arch/parisc/kernel/traps.c @@ -745,6 +745,10 @@ void handle_interruption(int code, struct pt_regs *regs) /* Fall Through */ case 27: /* Data memory protection ID trap */ + if (code == 27 && !user_mode(regs) && + fixup_exception(regs)) + return; + die_if_kernel("Protection id trap", regs, code); si.si_code = SEGV_MAPERR; si.si_signo = SIGSEGV; @@ -821,8 +825,8 @@ void handle_interruption(int code, struct pt_regs *regs) int __init check_ivt(void *iva) { + extern u32 os_hpmc_size; extern const u32 os_hpmc[]; - extern const u32 os_hpmc_end[]; int i; u32 check = 0; @@ -839,8 +843,7 @@ int __init check_ivt(void *iva) *ivap++ = 0; /* Compute Checksum for HPMC handler */ - - length = os_hpmc_end - os_hpmc; + length = os_hpmc_size; ivap[7] = length; hpmcp = (u32 *)os_hpmc; diff --git a/arch/parisc/kernel/unwind.c b/arch/parisc/kernel/unwind.c index 6773c582e45..69dad5a850a 100644 --- a/arch/parisc/kernel/unwind.c +++ b/arch/parisc/kernel/unwind.c @@ -372,7 +372,7 @@ void unwind_frame_init_from_blocked_task(struct unwind_frame_info *info, struct struct pt_regs *r = &t->thread.regs; struct pt_regs *r2; - r2 = kmalloc(sizeof(struct pt_regs), GFP_KERNEL); + r2 = kmalloc(sizeof(struct pt_regs), GFP_ATOMIC); if (!r2) return; *r2 = *r; diff --git a/arch/parisc/lib/iomap.c b/arch/parisc/lib/iomap.c index 9abed07db7f..5069e8b2ca7 100644 --- a/arch/parisc/lib/iomap.c +++ b/arch/parisc/lib/iomap.c @@ -261,7 +261,7 @@ static const struct 
iomap_ops iomem_ops = { iomem_write32r, }; -const struct iomap_ops *iomap_ops[8] = { +static const struct iomap_ops *iomap_ops[8] = { [0] = &ioport_ops, [7] = &iomem_ops }; diff --git a/arch/parisc/lib/memcpy.c b/arch/parisc/lib/memcpy.c index 2d68431fc22..bbda909c866 100644 --- a/arch/parisc/lib/memcpy.c +++ b/arch/parisc/lib/memcpy.c @@ -275,7 +275,7 @@ handle_store_error: /* Returns 0 for success, otherwise, returns number of bytes not transferred. */ -unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) +static unsigned long pa_memcpy(void *dstp, const void *srcp, unsigned long len) { register unsigned long src, dst, t1, t2, t3; register unsigned char *pcs, *pcd; diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c index b2e3e9a8cec..92c7fa4ecc3 100644 --- a/arch/parisc/mm/fault.c +++ b/arch/parisc/mm/fault.c @@ -139,13 +139,41 @@ parisc_acctyp(unsigned long code, unsigned int inst) } #endif +int fixup_exception(struct pt_regs *regs) +{ + const struct exception_table_entry *fix; + + fix = search_exception_tables(regs->iaoq[0]); + if (fix) { + struct exception_data *d; + d = &__get_cpu_var(exception_data); + d->fault_ip = regs->iaoq[0]; + d->fault_space = regs->isr; + d->fault_addr = regs->ior; + + regs->iaoq[0] = ((fix->fixup) & ~3); + /* + * NOTE: In some cases the faulting instruction + * may be in the delay slot of a branch. We + * don't want to take the branch, so we don't + * increment iaoq[1], instead we set it to be + * iaoq[0]+4, and clear the B bit in the PSW + */ + regs->iaoq[1] = regs->iaoq[0] + 4; + regs->gr[0] &= ~PSW_B; /* IPSW in gr[0] */ + + return 1; + } + + return 0; +} + void do_page_fault(struct pt_regs *regs, unsigned long code, unsigned long address) { struct vm_area_struct *vma, *prev_vma; struct task_struct *tsk = current; struct mm_struct *mm = tsk->mm; - const struct exception_table_entry *fix; unsigned long acc_type; int fault; @@ -229,32 +257,8 @@ bad_area: no_context: - if (!user_mode(regs)) { - fix = search_exception_tables(regs->iaoq[0]); - - if (fix) { - struct exception_data *d; - - d = &__get_cpu_var(exception_data); - d->fault_ip = regs->iaoq[0]; - d->fault_space = regs->isr; - d->fault_addr = regs->ior; - - regs->iaoq[0] = ((fix->fixup) & ~3); - - /* - * NOTE: In some cases the faulting instruction - * may be in the delay slot of a branch. We - * don't want to take the branch, so we don't - * increment iaoq[1], instead we set it to be - * iaoq[0]+4, and clear the B bit in the PSW - */ - - regs->iaoq[1] = regs->iaoq[0] + 4; - regs->gr[0] &= ~PSW_B; /* IPSW in gr[0] */ - - return; - } + if (!user_mode(regs) && fixup_exception(regs)) { + return; } parisc_terminate("Bad Address (null pointer deref?)", regs, code, address); diff --git a/arch/powerpc/include/asm/cell-pmu.h b/arch/powerpc/include/asm/cell-pmu.h index 8066eede3a0..b4b7338ad79 100644 --- a/arch/powerpc/include/asm/cell-pmu.h +++ b/arch/powerpc/include/asm/cell-pmu.h @@ -37,9 +37,11 @@ #define CBE_PM_STOP_AT_MAX 0x40000000 #define CBE_PM_TRACE_MODE_GET(pm_control) (((pm_control) >> 28) & 0x3) #define CBE_PM_TRACE_MODE_SET(mode) (((mode) & 0x3) << 28) +#define CBE_PM_TRACE_BUF_OVFLW(bit) (((bit) & 0x1) << 17) #define CBE_PM_COUNT_MODE_SET(count) (((count) & 0x3) << 18) #define CBE_PM_FREEZE_ALL_CTRS 0x00100000 #define CBE_PM_ENABLE_EXT_TRACE 0x00008000 +#define CBE_PM_SPU_ADDR_TRACE_SET(msk) (((msk) & 0x3) << 9) /* Macros for the trace_address register. 
*/ #define CBE_PM_TRACE_BUF_FULL 0x00000800 diff --git a/arch/powerpc/include/asm/oprofile_impl.h b/arch/powerpc/include/asm/oprofile_impl.h index 95035c602ba..639dc96077a 100644 --- a/arch/powerpc/include/asm/oprofile_impl.h +++ b/arch/powerpc/include/asm/oprofile_impl.h @@ -32,6 +32,12 @@ struct op_system_config { unsigned long mmcr0; unsigned long mmcr1; unsigned long mmcra; +#ifdef CONFIG_OPROFILE_CELL + /* Register for oprofile user tool to check cell kernel profiling + * suport. + */ + unsigned long cell_support; +#endif #endif unsigned long enable_kernel; unsigned long enable_user; diff --git a/arch/powerpc/oprofile/cell/pr_util.h b/arch/powerpc/oprofile/cell/pr_util.h index dfdbffa0681..964b93974d8 100644 --- a/arch/powerpc/oprofile/cell/pr_util.h +++ b/arch/powerpc/oprofile/cell/pr_util.h @@ -30,6 +30,10 @@ extern struct delayed_work spu_work; extern int spu_prof_running; +#define TRACE_ARRAY_SIZE 1024 + +extern spinlock_t oprof_spu_smpl_arry_lck; + struct spu_overlay_info { /* map of sections within an SPU overlay */ unsigned int vma; /* SPU virtual memory address from elf */ unsigned int size; /* size of section from elf */ @@ -89,10 +93,11 @@ void vma_map_free(struct vma_to_fileoffset_map *map); * Entry point for SPU profiling. * cycles_reset is the SPU_CYCLES count value specified by the user. */ -int start_spu_profiling(unsigned int cycles_reset); - -void stop_spu_profiling(void); +int start_spu_profiling_cycles(unsigned int cycles_reset); +void start_spu_profiling_events(void); +void stop_spu_profiling_cycles(void); +void stop_spu_profiling_events(void); /* add the necessary profiling hooks */ int spu_sync_start(void); diff --git a/arch/powerpc/oprofile/cell/spu_profiler.c b/arch/powerpc/oprofile/cell/spu_profiler.c index 83faa958b9d..9305ddaac51 100644 --- a/arch/powerpc/oprofile/cell/spu_profiler.c +++ b/arch/powerpc/oprofile/cell/spu_profiler.c @@ -18,11 +18,21 @@ #include <asm/cell-pmu.h> #include "pr_util.h" -#define TRACE_ARRAY_SIZE 1024 #define SCALE_SHIFT 14 static u32 *samples; +/* spu_prof_running is a flag used to indicate if spu profiling is enabled + * or not. It is set by the routines start_spu_profiling_cycles() and + * start_spu_profiling_events(). The flag is cleared by the routines + * stop_spu_profiling_cycles() and stop_spu_profiling_events(). These + * routines are called via global_start() and global_stop() which are called in + * op_powerpc_start() and op_powerpc_stop(). These routines are called once + * per system as a result of the user starting/stopping oprofile. Hence, only + * one CPU per user at a time will be changing the value of spu_prof_running. + * In general, OProfile does not protect against multiple users trying to run + * OProfile at a time. + */ int spu_prof_running; static unsigned int profiling_interval; @@ -31,8 +41,8 @@ static unsigned int profiling_interval; #define SPU_PC_MASK 0xFFFF -static DEFINE_SPINLOCK(sample_array_lock); -unsigned long sample_array_lock_flags; +DEFINE_SPINLOCK(oprof_spu_smpl_arry_lck); +unsigned long oprof_spu_smpl_arry_lck_flags; void set_spu_profiling_frequency(unsigned int freq_khz, unsigned int cycles_reset) { @@ -145,13 +155,13 @@ static enum hrtimer_restart profile_spus(struct hrtimer *timer) * sample array must be loaded and then processed for a given * cpu. The sample array is not per cpu. 
*/ - spin_lock_irqsave(&sample_array_lock, - sample_array_lock_flags); + spin_lock_irqsave(&oprof_spu_smpl_arry_lck, + oprof_spu_smpl_arry_lck_flags); num_samples = cell_spu_pc_collection(cpu); if (num_samples == 0) { - spin_unlock_irqrestore(&sample_array_lock, - sample_array_lock_flags); + spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck, + oprof_spu_smpl_arry_lck_flags); continue; } @@ -162,8 +172,8 @@ static enum hrtimer_restart profile_spus(struct hrtimer *timer) num_samples); } - spin_unlock_irqrestore(&sample_array_lock, - sample_array_lock_flags); + spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck, + oprof_spu_smpl_arry_lck_flags); } smp_wmb(); /* insure spu event buffer updates are written */ @@ -182,13 +192,13 @@ static enum hrtimer_restart profile_spus(struct hrtimer *timer) static struct hrtimer timer; /* - * Entry point for SPU profiling. + * Entry point for SPU cycle profiling. * NOTE: SPU profiling is done system-wide, not per-CPU. * * cycles_reset is the count value specified by the user when * setting up OProfile to count SPU_CYCLES. */ -int start_spu_profiling(unsigned int cycles_reset) +int start_spu_profiling_cycles(unsigned int cycles_reset) { ktime_t kt; @@ -212,10 +222,30 @@ int start_spu_profiling(unsigned int cycles_reset) return 0; } -void stop_spu_profiling(void) +/* + * Entry point for SPU event profiling. + * NOTE: SPU profiling is done system-wide, not per-CPU. + * + * cycles_reset is the count value specified by the user when + * setting up OProfile to count SPU_CYCLES. + */ +void start_spu_profiling_events(void) +{ + spu_prof_running = 1; + schedule_delayed_work(&spu_work, DEFAULT_TIMER_EXPIRE); + + return; +} + +void stop_spu_profiling_cycles(void) { spu_prof_running = 0; hrtimer_cancel(&timer); kfree(samples); - pr_debug("SPU_PROF: stop_spu_profiling issued\n"); + pr_debug("SPU_PROF: stop_spu_profiling_cycles issued\n"); +} + +void stop_spu_profiling_events(void) +{ + spu_prof_running = 0; } diff --git a/arch/powerpc/oprofile/common.c b/arch/powerpc/oprofile/common.c index 17807acb05d..21f16edf6c8 100644 --- a/arch/powerpc/oprofile/common.c +++ b/arch/powerpc/oprofile/common.c @@ -132,6 +132,28 @@ static int op_powerpc_create_files(struct super_block *sb, struct dentry *root) oprofilefs_create_ulong(sb, root, "mmcr0", &sys.mmcr0); oprofilefs_create_ulong(sb, root, "mmcr1", &sys.mmcr1); oprofilefs_create_ulong(sb, root, "mmcra", &sys.mmcra); +#ifdef CONFIG_OPROFILE_CELL + /* create a file the user tool can check to see what level of profiling + * support exits with this kernel. Initialize bit mask to indicate + * what support the kernel has: + * bit 0 - Supports SPU event profiling in addition to PPU + * event and cycles; and SPU cycle profiling + * bits 1-31 - Currently unused. + * + * If the file does not exist, then the kernel only supports SPU + * cycle profiling, PPU event and cycle profiling. + */ + oprofilefs_create_ulong(sb, root, "cell_support", &sys.cell_support); + sys.cell_support = 0x1; /* Note, the user OProfile tool must check + * that this bit is set before attempting to + * user SPU event profiling. Older kernels + * will not have this file, hence the user + * tool is not allowed to do SPU event + * profiling on older kernels. Older kernels + * will accept SPU events but collected data + * is garbage. 
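The cell_support bitmask created above is meant to be read by the user-space OProfile tool before it requests SPU event profiling. A minimal user-space sketch of that check follows; the /dev/oprofile mount point and the decimal formatting of the oprofilefs ulong file are assumptions, only the meaning of bit 0 comes from the patch.

    #include <stdio.h>

    /* Hypothetical check: returns nonzero if the running kernel advertises
     * SPU event profiling support via the cell_support oprofilefs file.
     * Assumes oprofilefs is mounted at /dev/oprofile and that the file
     * prints a decimal value; a missing file means an older kernel that
     * only supports SPU cycle profiling. */
    static int spu_event_profiling_supported(void)
    {
            unsigned long mask = 0;
            FILE *f = fopen("/dev/oprofile/cell_support", "r");

            if (!f)
                    return 0;
            if (fscanf(f, "%lu", &mask) != 1)
                    mask = 0;
            fclose(f);
            return (mask & 0x1) != 0;
    }

    int main(void)
    {
            printf("SPU event profiling %ssupported\n",
                   spu_event_profiling_supported() ? "" : "not ");
            return 0;
    }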
+ */ +#endif #endif for (i = 0; i < model->num_counters; ++i) { diff --git a/arch/powerpc/oprofile/op_model_cell.c b/arch/powerpc/oprofile/op_model_cell.c index 25a4ec2514a..ae06c6236d9 100644 --- a/arch/powerpc/oprofile/op_model_cell.c +++ b/arch/powerpc/oprofile/op_model_cell.c @@ -40,14 +40,15 @@ #include "../platforms/cell/interrupt.h" #include "cell/pr_util.h" -static void cell_global_stop_spu(void); +#define PPU_PROFILING 0 +#define SPU_PROFILING_CYCLES 1 +#define SPU_PROFILING_EVENTS 2 -/* - * spu_cycle_reset is the number of cycles between samples. - * This variable is used for SPU profiling and should ONLY be set - * at the beginning of cell_reg_setup; otherwise, it's read-only. - */ -static unsigned int spu_cycle_reset; +#define SPU_EVENT_NUM_START 4100 +#define SPU_EVENT_NUM_STOP 4399 +#define SPU_PROFILE_EVENT_ADDR 4363 /* spu, address trace, decimal */ +#define SPU_PROFILE_EVENT_ADDR_MASK_A 0x146 /* sub unit set to zero */ +#define SPU_PROFILE_EVENT_ADDR_MASK_B 0x186 /* sub unit set to zero */ #define NUM_SPUS_PER_NODE 8 #define SPU_CYCLES_EVENT_NUM 2 /* event number for SPU_CYCLES */ @@ -66,6 +67,21 @@ static unsigned int spu_cycle_reset; #define MAX_SPU_COUNT 0xFFFFFF /* maximum 24 bit LFSR value */ +/* Minumum HW interval timer setting to send value to trace buffer is 10 cycle. + * To configure counter to send value every N cycles set counter to + * 2^32 - 1 - N. + */ +#define NUM_INTERVAL_CYC 0xFFFFFFFF - 10 + +/* + * spu_cycle_reset is the number of cycles between samples. + * This variable is used for SPU profiling and should ONLY be set + * at the beginning of cell_reg_setup; otherwise, it's read-only. + */ +static unsigned int spu_cycle_reset; +static unsigned int profiling_mode; +static int spu_evnt_phys_spu_indx; + struct pmc_cntrl_data { unsigned long vcntr; unsigned long evnts; @@ -105,6 +121,8 @@ struct pm_cntrl { u16 trace_mode; u16 freeze; u16 count_mode; + u16 spu_addr_trace; + u8 trace_buf_ovflw; }; static struct { @@ -122,7 +140,7 @@ static struct { #define GET_INPUT_CONTROL(x) ((x & 0x00000004) >> 2) static DEFINE_PER_CPU(unsigned long[NR_PHYS_CTRS], pmc_values); - +static unsigned long spu_pm_cnt[MAX_NUMNODES * NUM_SPUS_PER_NODE]; static struct pmc_cntrl_data pmc_cntrl[NUM_THREADS][NR_PHYS_CTRS]; /* @@ -152,6 +170,7 @@ static u32 hdw_thread; static u32 virt_cntr_inter_mask; static struct timer_list timer_virt_cntr; +static struct timer_list timer_spu_event_swap; /* * pm_signal needs to be global since it is initialized in @@ -165,7 +184,7 @@ static int spu_rtas_token; /* token for SPU cycle profiling */ static u32 reset_value[NR_PHYS_CTRS]; static int num_counters; static int oprofile_running; -static DEFINE_SPINLOCK(virt_cntr_lock); +static DEFINE_SPINLOCK(cntr_lock); static u32 ctr_enabled; @@ -336,13 +355,13 @@ static void set_pm_event(u32 ctr, int event, u32 unit_mask) for (i = 0; i < NUM_DEBUG_BUS_WORDS; i++) { if (bus_word & (1 << i)) { pm_regs.debug_bus_control |= - (bus_type << (30 - (2 * i))); + (bus_type << (30 - (2 * i))); for (j = 0; j < NUM_INPUT_BUS_WORDS; j++) { if (input_bus[j] == 0xff) { input_bus[j] = i; pm_regs.group_control |= - (i << (30 - (2 * j))); + (i << (30 - (2 * j))); break; } @@ -367,12 +386,16 @@ static void write_pm_cntrl(int cpu) if (pm_regs.pm_cntrl.stop_at_max == 1) val |= CBE_PM_STOP_AT_MAX; - if (pm_regs.pm_cntrl.trace_mode == 1) + if (pm_regs.pm_cntrl.trace_mode != 0) val |= CBE_PM_TRACE_MODE_SET(pm_regs.pm_cntrl.trace_mode); + if (pm_regs.pm_cntrl.trace_buf_ovflw == 1) + val |= 
CBE_PM_TRACE_BUF_OVFLW(pm_regs.pm_cntrl.trace_buf_ovflw); if (pm_regs.pm_cntrl.freeze == 1) val |= CBE_PM_FREEZE_ALL_CTRS; + val |= CBE_PM_SPU_ADDR_TRACE_SET(pm_regs.pm_cntrl.spu_addr_trace); + /* * Routine set_count_mode must be called previously to set * the count mode based on the user selection of user and kernel. @@ -441,7 +464,7 @@ static void cell_virtual_cntr(unsigned long data) * not both playing with the counters on the same node. */ - spin_lock_irqsave(&virt_cntr_lock, flags); + spin_lock_irqsave(&cntr_lock, flags); prev_hdw_thread = hdw_thread; @@ -480,7 +503,7 @@ static void cell_virtual_cntr(unsigned long data) cbe_disable_pm_interrupts(cpu); for (i = 0; i < num_counters; i++) { per_cpu(pmc_values, cpu + prev_hdw_thread)[i] - = cbe_read_ctr(cpu, i); + = cbe_read_ctr(cpu, i); if (per_cpu(pmc_values, cpu + next_hdw_thread)[i] == 0xFFFFFFFF) @@ -527,7 +550,7 @@ static void cell_virtual_cntr(unsigned long data) cbe_enable_pm(cpu); } - spin_unlock_irqrestore(&virt_cntr_lock, flags); + spin_unlock_irqrestore(&cntr_lock, flags); mod_timer(&timer_virt_cntr, jiffies + HZ / 10); } @@ -541,38 +564,146 @@ static void start_virt_cntrs(void) add_timer(&timer_virt_cntr); } -/* This function is called once for all cpus combined */ -static int cell_reg_setup(struct op_counter_config *ctr, +static int cell_reg_setup_spu_cycles(struct op_counter_config *ctr, struct op_system_config *sys, int num_ctrs) { - int i, j, cpu; - spu_cycle_reset = 0; + spu_cycle_reset = ctr[0].count; - if (ctr[0].event == SPU_CYCLES_EVENT_NUM) { - spu_cycle_reset = ctr[0].count; + /* + * Each node will need to make the rtas call to start + * and stop SPU profiling. Get the token once and store it. + */ + spu_rtas_token = rtas_token("ibm,cbe-spu-perftools"); + + if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) { + printk(KERN_ERR + "%s: rtas token ibm,cbe-spu-perftools unknown\n", + __func__); + return -EIO; + } + return 0; +} + +/* Unfortunately, the hardware will only support event profiling + * on one SPU per node at a time. Therefore, we must time slice + * the profiling across all SPUs in the node. Note, we do this + * in parallel for each node. The following routine is called + * periodically based on kernel timer to switch which SPU is + * being monitored in a round robbin fashion. + */ +static void spu_evnt_swap(unsigned long data) +{ + int node; + int cur_phys_spu, nxt_phys_spu, cur_spu_evnt_phys_spu_indx; + unsigned long flags; + int cpu; + int ret; + u32 interrupt_mask; + + + /* enable interrupts on cntr 0 */ + interrupt_mask = CBE_PM_CTR_OVERFLOW_INTR(0); + + hdw_thread = 0; + + /* Make sure spu event interrupt handler and spu event swap + * don't access the counters simultaneously. + */ + spin_lock_irqsave(&cntr_lock, flags); + + cur_spu_evnt_phys_spu_indx = spu_evnt_phys_spu_indx; + + if (++(spu_evnt_phys_spu_indx) == NUM_SPUS_PER_NODE) + spu_evnt_phys_spu_indx = 0; + + pm_signal[0].sub_unit = spu_evnt_phys_spu_indx; + pm_signal[1].sub_unit = spu_evnt_phys_spu_indx; + pm_signal[2].sub_unit = spu_evnt_phys_spu_indx; + + /* switch the SPU being profiled on each node */ + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + node = cbe_cpu_to_node(cpu); + cur_phys_spu = (node * NUM_SPUS_PER_NODE) + + cur_spu_evnt_phys_spu_indx; + nxt_phys_spu = (node * NUM_SPUS_PER_NODE) + + spu_evnt_phys_spu_indx; /* - * Each node will need to make the rtas call to start - * and stop SPU profiling. Get the token once and store it. 
+ * stop counters, save counter values, restore counts + * for previous physical SPU */ - spu_rtas_token = rtas_token("ibm,cbe-spu-perftools"); + cbe_disable_pm(cpu); + cbe_disable_pm_interrupts(cpu); - if (unlikely(spu_rtas_token == RTAS_UNKNOWN_SERVICE)) { - printk(KERN_ERR - "%s: rtas token ibm,cbe-spu-perftools unknown\n", - __func__); - return -EIO; - } + spu_pm_cnt[cur_phys_spu] + = cbe_read_ctr(cpu, 0); + + /* restore previous count for the next spu to sample */ + /* NOTE, hardware issue, counter will not start if the + * counter value is at max (0xFFFFFFFF). + */ + if (spu_pm_cnt[nxt_phys_spu] >= 0xFFFFFFFF) + cbe_write_ctr(cpu, 0, 0xFFFFFFF0); + else + cbe_write_ctr(cpu, 0, spu_pm_cnt[nxt_phys_spu]); + + pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + + /* setup the debug bus measure the one event and + * the two events to route the next SPU's PC on + * the debug bus + */ + ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu), 3); + if (ret) + printk(KERN_ERR "%s: pm_rtas_activate_signals failed, " + "SPU event swap\n", __func__); + + /* clear the trace buffer, don't want to take PC for + * previous SPU*/ + cbe_write_pm(cpu, trace_address, 0); + + enable_ctr(cpu, 0, pm_regs.pm07_cntrl); + + /* Enable interrupts on the CPU thread that is starting */ + cbe_enable_pm_interrupts(cpu, hdw_thread, + interrupt_mask); + cbe_enable_pm(cpu); } - pm_rtas_token = rtas_token("ibm,cbe-perftools"); + spin_unlock_irqrestore(&cntr_lock, flags); + /* swap approximately every 0.1 seconds */ + mod_timer(&timer_spu_event_swap, jiffies + HZ / 25); +} + +static void start_spu_event_swap(void) +{ + init_timer(&timer_spu_event_swap); + timer_spu_event_swap.function = spu_evnt_swap; + timer_spu_event_swap.data = 0UL; + timer_spu_event_swap.expires = jiffies + HZ / 25; + add_timer(&timer_spu_event_swap); +} + +static int cell_reg_setup_spu_events(struct op_counter_config *ctr, + struct op_system_config *sys, int num_ctrs) +{ + int i; + + /* routine is called once for all nodes */ + + spu_evnt_phys_spu_indx = 0; /* - * For all events excetp PPU CYCLEs, each node will need to make + * For all events except PPU CYCLEs, each node will need to make * the rtas cbe-perftools call to setup and reset the debug bus. * Make the token lookup call once and store it in the global * variable pm_rtas_token. */ + pm_rtas_token = rtas_token("ibm,cbe-perftools"); + if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) { printk(KERN_ERR "%s: rtas token ibm,cbe-perftools unknown\n", @@ -580,6 +711,58 @@ static int cell_reg_setup(struct op_counter_config *ctr, return -EIO; } + /* setup the pm_control register settings, + * settings will be written per node by the + * cell_cpu_setup() function. + */ + pm_regs.pm_cntrl.trace_buf_ovflw = 1; + + /* Use the occurrence trace mode to have SPU PC saved + * to the trace buffer. Occurrence data in trace buffer + * is not used. Bit 2 must be set to store SPU addresses. + */ + pm_regs.pm_cntrl.trace_mode = 2; + + pm_regs.pm_cntrl.spu_addr_trace = 0x1; /* using debug bus + event 2 & 3 */ + + /* setup the debug bus event array with the SPU PC routing events. + * Note, pm_signal[0] will be filled in by set_pm_event() call below. 
+ */ + pm_signal[1].signal_group = SPU_PROFILE_EVENT_ADDR / 100; + pm_signal[1].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_A); + pm_signal[1].bit = SPU_PROFILE_EVENT_ADDR % 100; + pm_signal[1].sub_unit = spu_evnt_phys_spu_indx; + + pm_signal[2].signal_group = SPU_PROFILE_EVENT_ADDR / 100; + pm_signal[2].bus_word = GET_BUS_WORD(SPU_PROFILE_EVENT_ADDR_MASK_B); + pm_signal[2].bit = SPU_PROFILE_EVENT_ADDR % 100; + pm_signal[2].sub_unit = spu_evnt_phys_spu_indx; + + /* Set the user selected spu event to profile on, + * note, only one SPU profiling event is supported + */ + num_counters = 1; /* Only support one SPU event at a time */ + set_pm_event(0, ctr[0].event, ctr[0].unit_mask); + + reset_value[0] = 0xFFFFFFFF - ctr[0].count; + + /* global, used by cell_cpu_setup */ + ctr_enabled |= 1; + + /* Initialize the count for each SPU to the reset value */ + for (i=0; i < MAX_NUMNODES * NUM_SPUS_PER_NODE; i++) + spu_pm_cnt[i] = reset_value[0]; + + return 0; +} + +static int cell_reg_setup_ppu(struct op_counter_config *ctr, + struct op_system_config *sys, int num_ctrs) +{ + /* routine is called once for all nodes */ + int i, j, cpu; + num_counters = num_ctrs; if (unlikely(num_ctrs > NR_PHYS_CTRS)) { @@ -589,14 +772,6 @@ static int cell_reg_setup(struct op_counter_config *ctr, __func__); return -EIO; } - pm_regs.group_control = 0; - pm_regs.debug_bus_control = 0; - - /* setup the pm_control register */ - memset(&pm_regs.pm_cntrl, 0, sizeof(struct pm_cntrl)); - pm_regs.pm_cntrl.stop_at_max = 1; - pm_regs.pm_cntrl.trace_mode = 0; - pm_regs.pm_cntrl.freeze = 1; set_count_mode(sys->enable_kernel, sys->enable_user); @@ -665,6 +840,63 @@ static int cell_reg_setup(struct op_counter_config *ctr, } +/* This function is called once for all cpus combined */ +static int cell_reg_setup(struct op_counter_config *ctr, + struct op_system_config *sys, int num_ctrs) +{ + int ret=0; + spu_cycle_reset = 0; + + /* initialize the spu_arr_trace value, will be reset if + * doing spu event profiling. + */ + pm_regs.group_control = 0; + pm_regs.debug_bus_control = 0; + pm_regs.pm_cntrl.stop_at_max = 1; + pm_regs.pm_cntrl.trace_mode = 0; + pm_regs.pm_cntrl.freeze = 1; + pm_regs.pm_cntrl.trace_buf_ovflw = 0; + pm_regs.pm_cntrl.spu_addr_trace = 0; + + /* + * For all events except PPU CYCLEs, each node will need to make + * the rtas cbe-perftools call to setup and reset the debug bus. + * Make the token lookup call once and store it in the global + * variable pm_rtas_token. + */ + pm_rtas_token = rtas_token("ibm,cbe-perftools"); + + if (unlikely(pm_rtas_token == RTAS_UNKNOWN_SERVICE)) { + printk(KERN_ERR + "%s: rtas token ibm,cbe-perftools unknown\n", + __func__); + return -EIO; + } + + if (ctr[0].event == SPU_CYCLES_EVENT_NUM) { + profiling_mode = SPU_PROFILING_CYCLES; + ret = cell_reg_setup_spu_cycles(ctr, sys, num_ctrs); + } else if ((ctr[0].event >= SPU_EVENT_NUM_START) && + (ctr[0].event <= SPU_EVENT_NUM_STOP)) { + profiling_mode = SPU_PROFILING_EVENTS; + spu_cycle_reset = ctr[0].count; + + /* for SPU event profiling, need to setup the + * pm_signal array with the events to route the + * SPU PC before making the FW call. Note, only + * one SPU event for profiling can be specified + * at a time. 
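Two of the calculations above are easy to miss: the sample period N chosen by the user is programmed as 0xFFFFFFFF - N so that the 32-bit counter overflows (and interrupts) after N SPU events, and the saved per-SPU counts live in spu_pm_cnt[] indexed by node * NUM_SPUS_PER_NODE + spu. A small stand-alone sketch, with assumed example values:

    #include <stdio.h>

    #define NUM_SPUS_PER_NODE 8

    /* Sketch only; N and the node/SPU indices are assumed example values. */
    int main(void)
    {
            unsigned int N = 100000;               /* events between samples */
            unsigned int reset_value = 0xFFFFFFFF - N;
            int node = 1, spu = 3;

            printf("counter reset value = 0x%08x\n", reset_value);
            printf("spu_pm_cnt[] slot for node %d, SPU %d = %d\n",
                   node, spu, node * NUM_SPUS_PER_NODE + spu);
            return 0;
    }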
+ */ + cell_reg_setup_spu_events(ctr, sys, num_ctrs); + } else { + profiling_mode = PPU_PROFILING; + ret = cell_reg_setup_ppu(ctr, sys, num_ctrs); + } + + return ret; +} + + /* This function is called once for each cpu */ static int cell_cpu_setup(struct op_counter_config *cntr) @@ -672,8 +904,13 @@ static int cell_cpu_setup(struct op_counter_config *cntr) u32 cpu = smp_processor_id(); u32 num_enabled = 0; int i; + int ret; - if (spu_cycle_reset) + /* Cycle based SPU profiling does not use the performance + * counters. The trace array is configured to collect + * the data. + */ + if (profiling_mode == SPU_PROFILING_CYCLES) return 0; /* There is one performance monitor per processor chip (i.e. node), @@ -686,7 +923,6 @@ static int cell_cpu_setup(struct op_counter_config *cntr) cbe_disable_pm(cpu); cbe_disable_pm_interrupts(cpu); - cbe_write_pm(cpu, pm_interval, 0); cbe_write_pm(cpu, pm_start_stop, 0); cbe_write_pm(cpu, group_control, pm_regs.group_control); cbe_write_pm(cpu, debug_bus_control, pm_regs.debug_bus_control); @@ -703,7 +939,20 @@ static int cell_cpu_setup(struct op_counter_config *cntr) * The pm_rtas_activate_signals will return -EIO if the FW * call failed. */ - return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), num_enabled); + if (profiling_mode == SPU_PROFILING_EVENTS) { + /* For SPU event profiling also need to setup the + * pm interval timer + */ + ret = pm_rtas_activate_signals(cbe_cpu_to_node(cpu), + num_enabled+2); + /* store PC from debug bus to Trace buffer as often + * as possible (every 10 cycles) + */ + cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC); + return ret; + } else + return pm_rtas_activate_signals(cbe_cpu_to_node(cpu), + num_enabled); } #define ENTRIES 303 @@ -885,7 +1134,122 @@ static struct notifier_block cpu_freq_notifier_block = { }; #endif -static int cell_global_start_spu(struct op_counter_config *ctr) +/* + * Note the generic OProfile stop calls do not support returning + * an error on stop. Hence, will not return an error if the FW + * calls fail on stop. Failure to reset the debug bus is not an issue. + * Failure to disable the SPU profiling is not an issue. The FW calls + * to enable the performance counters and debug bus will work even if + * the hardware was not cleanly reset. 
+ */ +static void cell_global_stop_spu_cycles(void) +{ + int subfunc, rtn_value; + unsigned int lfsr_value; + int cpu; + + oprofile_running = 0; + smp_wmb(); + +#ifdef CONFIG_CPU_FREQ + cpufreq_unregister_notifier(&cpu_freq_notifier_block, + CPUFREQ_TRANSITION_NOTIFIER); +#endif + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + subfunc = 3; /* + * 2 - activate SPU tracing, + * 3 - deactivate + */ + lfsr_value = 0x8f100000; + + rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL, + subfunc, cbe_cpu_to_node(cpu), + lfsr_value); + + if (unlikely(rtn_value != 0)) { + printk(KERN_ERR + "%s: rtas call ibm,cbe-spu-perftools " \ + "failed, return = %d\n", + __func__, rtn_value); + } + + /* Deactivate the signals */ + pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + } + + stop_spu_profiling_cycles(); +} + +static void cell_global_stop_spu_events(void) +{ + int cpu; + oprofile_running = 0; + + stop_spu_profiling_events(); + smp_wmb(); + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + cbe_sync_irq(cbe_cpu_to_node(cpu)); + /* Stop the counters */ + cbe_disable_pm(cpu); + cbe_write_pm07_control(cpu, 0, 0); + + /* Deactivate the signals */ + pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + + /* Deactivate interrupts */ + cbe_disable_pm_interrupts(cpu); + } + del_timer_sync(&timer_spu_event_swap); +} + +static void cell_global_stop_ppu(void) +{ + int cpu; + + /* + * This routine will be called once for the system. + * There is one performance monitor per node, so we + * only need to perform this function once per node. + */ + del_timer_sync(&timer_virt_cntr); + oprofile_running = 0; + smp_wmb(); + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + cbe_sync_irq(cbe_cpu_to_node(cpu)); + /* Stop the counters */ + cbe_disable_pm(cpu); + + /* Deactivate the signals */ + pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + + /* Deactivate interrupts */ + cbe_disable_pm_interrupts(cpu); + } +} + +static void cell_global_stop(void) +{ + if (profiling_mode == PPU_PROFILING) + cell_global_stop_ppu(); + else if (profiling_mode == SPU_PROFILING_EVENTS) + cell_global_stop_spu_events(); + else + cell_global_stop_spu_cycles(); +} + +static int cell_global_start_spu_cycles(struct op_counter_config *ctr) { int subfunc; unsigned int lfsr_value; @@ -951,18 +1315,18 @@ static int cell_global_start_spu(struct op_counter_config *ctr) /* start profiling */ ret = rtas_call(spu_rtas_token, 3, 1, NULL, subfunc, - cbe_cpu_to_node(cpu), lfsr_value); + cbe_cpu_to_node(cpu), lfsr_value); if (unlikely(ret != 0)) { printk(KERN_ERR - "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n", - __func__, ret); + "%s: rtas call ibm,cbe-spu-perftools failed, " \ + "return = %d\n", __func__, ret); rtas_error = -EIO; goto out; } } - rtas_error = start_spu_profiling(spu_cycle_reset); + rtas_error = start_spu_profiling_cycles(spu_cycle_reset); if (rtas_error) goto out_stop; @@ -970,11 +1334,74 @@ static int cell_global_start_spu(struct op_counter_config *ctr) return 0; out_stop: - cell_global_stop_spu(); /* clean up the PMU/debug bus */ + cell_global_stop_spu_cycles(); /* clean up the PMU/debug bus */ out: return rtas_error; } +static int cell_global_start_spu_events(struct op_counter_config *ctr) +{ + int cpu; + u32 interrupt_mask = 0; + int rtn = 0; + + hdw_thread = 0; + + /* spu event profiling, uses the performance counters to generate + * an interrupt. The hardware is setup to store the SPU program + * counter into the trace array. 
The occurrence mode is used to + * enable storing data to the trace buffer. The bits are set + * to send/store the SPU address in the trace buffer. The debug + * bus must be setup to route the SPU program counter onto the + * debug bus. The occurrence data in the trace buffer is not used. + */ + + /* This routine gets called once for the system. + * There is one performance monitor per node, so we + * only need to perform this function once per node. + */ + + for_each_online_cpu(cpu) { + if (cbe_get_hw_thread_id(cpu)) + continue; + + /* + * Setup SPU event-based profiling. + * Set perf_mon_control bit 0 to a zero before + * enabling spu collection hardware. + * + * Only support one SPU event on one SPU per node. + */ + if (ctr_enabled & 1) { + cbe_write_ctr(cpu, 0, reset_value[0]); + enable_ctr(cpu, 0, pm_regs.pm07_cntrl); + interrupt_mask |= + CBE_PM_CTR_OVERFLOW_INTR(0); + } else { + /* Disable counter */ + cbe_write_pm07_control(cpu, 0, 0); + } + + cbe_get_and_clear_pm_interrupts(cpu); + cbe_enable_pm_interrupts(cpu, hdw_thread, interrupt_mask); + cbe_enable_pm(cpu); + + /* clear the trace buffer */ + cbe_write_pm(cpu, trace_address, 0); + } + + /* Start the timer to time slice collecting the event profile + * on each of the SPUs. Note, can collect profile on one SPU + * per node at a time. + */ + start_spu_event_swap(); + start_spu_profiling_events(); + oprofile_running = 1; + smp_wmb(); + + return rtn; +} + static int cell_global_start_ppu(struct op_counter_config *ctr) { u32 cpu, i; @@ -994,8 +1421,7 @@ static int cell_global_start_ppu(struct op_counter_config *ctr) if (ctr_enabled & (1 << i)) { cbe_write_ctr(cpu, i, reset_value[i]); enable_ctr(cpu, i, pm_regs.pm07_cntrl); - interrupt_mask |= - CBE_PM_CTR_OVERFLOW_INTR(i); + interrupt_mask |= CBE_PM_CTR_OVERFLOW_INTR(i); } else { /* Disable counter */ cbe_write_pm07_control(cpu, i, 0); @@ -1024,99 +1450,162 @@ static int cell_global_start_ppu(struct op_counter_config *ctr) static int cell_global_start(struct op_counter_config *ctr) { - if (spu_cycle_reset) - return cell_global_start_spu(ctr); + if (profiling_mode == SPU_PROFILING_CYCLES) + return cell_global_start_spu_cycles(ctr); + else if (profiling_mode == SPU_PROFILING_EVENTS) + return cell_global_start_spu_events(ctr); else return cell_global_start_ppu(ctr); } -/* - * Note the generic OProfile stop calls do not support returning - * an error on stop. Hence, will not return an error if the FW - * calls fail on stop. Failure to reset the debug bus is not an issue. - * Failure to disable the SPU profiling is not an issue. The FW calls - * to enable the performance counters and debug bus will work even if - * the hardware was not cleanly reset. + +/* The SPU interrupt handler + * + * SPU event profiling works as follows: + * The pm_signal[0] holds the one SPU event to be measured. It is routed on + * the debug bus using word 0 or 1. The value of pm_signal[1] and + * pm_signal[2] contain the necessary events to route the SPU program + * counter for the selected SPU onto the debug bus using words 2 and 3. + * The pm_interval register is setup to write the SPU PC value into the + * trace buffer at the maximum rate possible. The trace buffer is configured + * to store the PCs, wrapping when it is full. The performance counter is + * intialized to the max hardware count minus the number of events, N, between + * samples. 
Once the N events have occured, a HW counter overflow occurs + * causing the generation of a HW counter interrupt which also stops the + * writing of the SPU PC values to the trace buffer. Hence the last PC + * written to the trace buffer is the SPU PC that we want. Unfortunately, + * we have to read from the beginning of the trace buffer to get to the + * last value written. We just hope the PPU has nothing better to do then + * service this interrupt. The PC for the specific SPU being profiled is + * extracted from the trace buffer processed and stored. The trace buffer + * is cleared, interrupts are cleared, the counter is reset to max - N. + * A kernel timer is used to periodically call the routine spu_evnt_swap() + * to switch to the next physical SPU in the node to profile in round robbin + * order. This way data is collected for all SPUs on the node. It does mean + * that we need to use a relatively small value of N to ensure enough samples + * on each SPU are collected each SPU is being profiled 1/8 of the time. + * It may also be necessary to use a longer sample collection period. */ -static void cell_global_stop_spu(void) +static void cell_handle_interrupt_spu(struct pt_regs *regs, + struct op_counter_config *ctr) { - int subfunc, rtn_value; - unsigned int lfsr_value; - int cpu; + u32 cpu, cpu_tmp; + u64 trace_entry; + u32 interrupt_mask; + u64 trace_buffer[2]; + u64 last_trace_buffer; + u32 sample; + u32 trace_addr; + unsigned long sample_array_lock_flags; + int spu_num; + unsigned long flags; - oprofile_running = 0; + /* Make sure spu event interrupt handler and spu event swap + * don't access the counters simultaneously. + */ + cpu = smp_processor_id(); + spin_lock_irqsave(&cntr_lock, flags); -#ifdef CONFIG_CPU_FREQ - cpufreq_unregister_notifier(&cpu_freq_notifier_block, - CPUFREQ_TRANSITION_NOTIFIER); -#endif + cpu_tmp = cpu; + cbe_disable_pm(cpu); - for_each_online_cpu(cpu) { - if (cbe_get_hw_thread_id(cpu)) - continue; + interrupt_mask = cbe_get_and_clear_pm_interrupts(cpu); - subfunc = 3; /* - * 2 - activate SPU tracing, - * 3 - deactivate - */ - lfsr_value = 0x8f100000; + sample = 0xABCDEF; + trace_entry = 0xfedcba; + last_trace_buffer = 0xdeadbeaf; - rtn_value = rtas_call(spu_rtas_token, 3, 1, NULL, - subfunc, cbe_cpu_to_node(cpu), - lfsr_value); + if ((oprofile_running == 1) && (interrupt_mask != 0)) { + /* disable writes to trace buff */ + cbe_write_pm(cpu, pm_interval, 0); - if (unlikely(rtn_value != 0)) { - printk(KERN_ERR - "%s: rtas call ibm,cbe-spu-perftools failed, return = %d\n", - __func__, rtn_value); + /* only have one perf cntr being used, cntr 0 */ + if ((interrupt_mask & CBE_PM_CTR_OVERFLOW_INTR(0)) + && ctr[0].enabled) + /* The SPU PC values will be read + * from the trace buffer, reset counter + */ + + cbe_write_ctr(cpu, 0, reset_value[0]); + + trace_addr = cbe_read_pm(cpu, trace_address); + + while (!(trace_addr & CBE_PM_TRACE_BUF_EMPTY)) { + /* There is data in the trace buffer to process + * Read the buffer until you get to the last + * entry. This is the value we want. 
+ */ + + cbe_read_trace_buffer(cpu, trace_buffer); + trace_addr = cbe_read_pm(cpu, trace_address); } - /* Deactivate the signals */ - pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); - } + /* SPU Address 16 bit count format for 128 bit + * HW trace buffer is used for the SPU PC storage + * HDR bits 0:15 + * SPU Addr 0 bits 16:31 + * SPU Addr 1 bits 32:47 + * unused bits 48:127 + * + * HDR: bit4 = 1 SPU Address 0 valid + * HDR: bit5 = 1 SPU Address 1 valid + * - unfortunately, the valid bits don't seem to work + * + * Note trace_buffer[0] holds bits 0:63 of the HW + * trace buffer, trace_buffer[1] holds bits 64:127 + */ - stop_spu_profiling(); -} + trace_entry = trace_buffer[0] + & 0x00000000FFFF0000; -static void cell_global_stop_ppu(void) -{ - int cpu; + /* only top 16 of the 18 bit SPU PC address + * is stored in trace buffer, hence shift right + * by 16 -2 bits */ + sample = trace_entry >> 14; + last_trace_buffer = trace_buffer[0]; - /* - * This routine will be called once for the system. - * There is one performance monitor per node, so we - * only need to perform this function once per node. - */ - del_timer_sync(&timer_virt_cntr); - oprofile_running = 0; - smp_wmb(); + spu_num = spu_evnt_phys_spu_indx + + (cbe_cpu_to_node(cpu) * NUM_SPUS_PER_NODE); - for_each_online_cpu(cpu) { - if (cbe_get_hw_thread_id(cpu)) - continue; + /* make sure only one process at a time is calling + * spu_sync_buffer() + */ + spin_lock_irqsave(&oprof_spu_smpl_arry_lck, + sample_array_lock_flags); + spu_sync_buffer(spu_num, &sample, 1); + spin_unlock_irqrestore(&oprof_spu_smpl_arry_lck, + sample_array_lock_flags); - cbe_sync_irq(cbe_cpu_to_node(cpu)); - /* Stop the counters */ - cbe_disable_pm(cpu); + smp_wmb(); /* insure spu event buffer updates are written + * don't want events intermingled... */ - /* Deactivate the signals */ - pm_rtas_reset_signals(cbe_cpu_to_node(cpu)); + /* The counters were frozen by the interrupt. + * Reenable the interrupt and restart the counters. + */ + cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC); + cbe_enable_pm_interrupts(cpu, hdw_thread, + virt_cntr_inter_mask); - /* Deactivate interrupts */ - cbe_disable_pm_interrupts(cpu); - } -} + /* clear the trace buffer, re-enable writes to trace buff */ + cbe_write_pm(cpu, trace_address, 0); + cbe_write_pm(cpu, pm_interval, NUM_INTERVAL_CYC); -static void cell_global_stop(void) -{ - if (spu_cycle_reset) - cell_global_stop_spu(); - else - cell_global_stop_ppu(); + /* The writes to the various performance counters only writes + * to a latch. The new values (interrupt setting bits, reset + * counter value etc.) are not copied to the actual registers + * until the performance monitor is enabled. In order to get + * this to work as desired, the permormance monitor needs to + * be disabled while writing to the latches. This is a + * HW design issue. + */ + write_pm_cntrl(cpu); + cbe_enable_pm(cpu); + } + spin_unlock_irqrestore(&cntr_lock, flags); } -static void cell_handle_interrupt(struct pt_regs *regs, - struct op_counter_config *ctr) +static void cell_handle_interrupt_ppu(struct pt_regs *regs, + struct op_counter_config *ctr) { u32 cpu; u64 pc; @@ -1132,7 +1621,7 @@ static void cell_handle_interrupt(struct pt_regs *regs, * routine are not running at the same time. See the * cell_virtual_cntr() routine for additional comments. 
*/ - spin_lock_irqsave(&virt_cntr_lock, flags); + spin_lock_irqsave(&cntr_lock, flags); /* * Need to disable and reenable the performance counters @@ -1185,7 +1674,16 @@ static void cell_handle_interrupt(struct pt_regs *regs, */ cbe_enable_pm(cpu); } - spin_unlock_irqrestore(&virt_cntr_lock, flags); + spin_unlock_irqrestore(&cntr_lock, flags); +} + +static void cell_handle_interrupt(struct pt_regs *regs, + struct op_counter_config *ctr) +{ + if (profiling_mode == PPU_PROFILING) + cell_handle_interrupt_ppu(regs, ctr); + else + cell_handle_interrupt_spu(regs, ctr); } /* @@ -1195,7 +1693,8 @@ static void cell_handle_interrupt(struct pt_regs *regs, */ static int cell_sync_start(void) { - if (spu_cycle_reset) + if ((profiling_mode == SPU_PROFILING_CYCLES) || + (profiling_mode == SPU_PROFILING_EVENTS)) return spu_sync_start(); else return DO_GENERIC_SYNC; @@ -1203,7 +1702,8 @@ static int cell_sync_start(void) static int cell_sync_stop(void) { - if (spu_cycle_reset) + if ((profiling_mode == SPU_PROFILING_CYCLES) || + (profiling_mode == SPU_PROFILING_EVENTS)) return spu_sync_stop(); else return 1; diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h index 9fa9dcdf344..e02a359d2aa 100644 --- a/arch/x86/include/asm/bitops.h +++ b/arch/x86/include/asm/bitops.h @@ -300,7 +300,7 @@ static inline int test_and_change_bit(int nr, volatile unsigned long *addr) return oldbit; } -static inline int constant_test_bit(int nr, const volatile unsigned long *addr) +static inline int constant_test_bit(unsigned int nr, const volatile unsigned long *addr) { return ((1UL << (nr % BITS_PER_LONG)) & (((unsigned long *)addr)[nr / BITS_PER_LONG])) != 0; diff --git a/arch/x86/kernel/acpi/boot.c b/arch/x86/kernel/acpi/boot.c index 29dc0c89d4a..d37593c2f43 100644 --- a/arch/x86/kernel/acpi/boot.c +++ b/arch/x86/kernel/acpi/boot.c @@ -47,7 +47,7 @@ #endif static int __initdata acpi_force = 0; - +u32 acpi_rsdt_forced; #ifdef CONFIG_ACPI int acpi_disabled = 0; #else @@ -1374,6 +1374,17 @@ static void __init acpi_process_madt(void) "Invalid BIOS MADT, disabling ACPI\n"); disable_acpi(); } + } else { + /* + * ACPI found no MADT, and so ACPI wants UP PIC mode. + * In the event an MPS table was found, forget it. + * Boot with "acpi=off" to use MPS on such a system. 
+ */ + if (smp_found_config) { + printk(KERN_WARNING PREFIX + "No APIC-table, disabling MPS\n"); + smp_found_config = 0; + } } /* @@ -1809,6 +1820,10 @@ static int __init parse_acpi(char *arg) disable_acpi(); acpi_ht = 1; } + /* acpi=rsdt use RSDT instead of XSDT */ + else if (strcmp(arg, "rsdt") == 0) { + acpi_rsdt_forced = 1; + } /* "acpi=noirq" disables ACPI interrupt routing */ else if (strcmp(arg, "noirq") == 0) { acpi_noirq_set(); diff --git a/arch/x86/kernel/acpi/cstate.c b/arch/x86/kernel/acpi/cstate.c index c2502eb9aa8..a4805b3b409 100644 --- a/arch/x86/kernel/acpi/cstate.c +++ b/arch/x86/kernel/acpi/cstate.c @@ -56,6 +56,7 @@ static struct cstate_entry *cpu_cstate_entry; /* per CPU ptr */ static short mwait_supported[ACPI_PROCESSOR_MAX_POWER]; #define MWAIT_SUBSTATE_MASK (0xf) +#define MWAIT_CSTATE_MASK (0xf) #define MWAIT_SUBSTATE_SIZE (4) #define CPUID_MWAIT_LEAF (5) @@ -98,7 +99,8 @@ int acpi_processor_ffh_cstate_probe(unsigned int cpu, cpuid(CPUID_MWAIT_LEAF, &eax, &ebx, &ecx, &edx); /* Check whether this particular cx_type (in CST) is supported or not */ - cstate_type = (cx->address >> MWAIT_SUBSTATE_SIZE) + 1; + cstate_type = ((cx->address >> MWAIT_SUBSTATE_SIZE) & + MWAIT_CSTATE_MASK) + 1; edx_part = edx >> (cstate_type * MWAIT_SUBSTATE_SIZE); num_cstate_subtype = edx_part & MWAIT_SUBSTATE_MASK; diff --git a/arch/x86/kernel/acpi/sleep.c b/arch/x86/kernel/acpi/sleep.c index 806b4e9051b..707c1f6f95f 100644 --- a/arch/x86/kernel/acpi/sleep.c +++ b/arch/x86/kernel/acpi/sleep.c @@ -159,6 +159,8 @@ static int __init acpi_sleep_setup(char *str) #endif if (strncmp(str, "old_ordering", 12) == 0) acpi_old_suspend_ordering(); + if (strncmp(str, "s4_nonvs", 8) == 0) + acpi_s4_no_nvs(); str = strchr(str, ','); if (str != NULL) str += strspn(str, ", \t"); diff --git a/arch/x86/kernel/e820.c b/arch/x86/kernel/e820.c index 65a13943e09..e85826829cf 100644 --- a/arch/x86/kernel/e820.c +++ b/arch/x86/kernel/e820.c @@ -665,6 +665,27 @@ void __init e820_mark_nosave_regions(unsigned long limit_pfn) } #endif +#ifdef CONFIG_HIBERNATION +/** + * Mark ACPI NVS memory region, so that we can save/restore it during + * hibernation and the subsequent resume. + */ +static int __init e820_mark_nvs_memory(void) +{ + int i; + + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_NVS) + hibernate_nvs_register(ei->addr, ei->size); + } + + return 0; +} +core_initcall(e820_mark_nvs_memory); +#endif + /* * Early reserved memory areas. */ diff --git a/arch/x86/kernel/early-quirks.c b/arch/x86/kernel/early-quirks.c index 744aa7fc49d..76b8cd953de 100644 --- a/arch/x86/kernel/early-quirks.c +++ b/arch/x86/kernel/early-quirks.c @@ -201,6 +201,12 @@ struct chipset { void (*f)(int num, int slot, int func); }; +/* + * Only works for devices on the root bus. If you add any devices + * not on bus 0 readd another loop level in early_quirks(). But + * be careful because at least the Nvidia quirk here relies on + * only matching on bus 0. 
+ */ static struct chipset early_qrk[] __initdata = { { PCI_VENDOR_ID_NVIDIA, PCI_ANY_ID, PCI_CLASS_BRIDGE_PCI, PCI_ANY_ID, QFLAG_APPLY_ONCE, nvidia_bugs }, @@ -267,17 +273,17 @@ static int __init check_dev_quirk(int num, int slot, int func) void __init early_quirks(void) { - int num, slot, func; + int slot, func; if (!early_pci_allowed()) return; /* Poor man's PCI discovery */ - for (num = 0; num < 32; num++) - for (slot = 0; slot < 32; slot++) - for (func = 0; func < 8; func++) { - /* Only probe function 0 on single fn devices */ - if (check_dev_quirk(num, slot, func)) - break; - } + /* Only scan the root bus */ + for (slot = 0; slot < 32; slot++) + for (func = 0; func < 8; func++) { + /* Only probe function 0 on single fn devices */ + if (check_dev_quirk(0, slot, func)) + break; + } } diff --git a/arch/x86/oprofile/op_model_amd.c b/arch/x86/oprofile/op_model_amd.c index 98658f25f54..8fdf06e4edf 100644 --- a/arch/x86/oprofile/op_model_amd.c +++ b/arch/x86/oprofile/op_model_amd.c @@ -2,7 +2,7 @@ * @file op_model_amd.c * athlon / K7 / K8 / Family 10h model-specific MSR operations * - * @remark Copyright 2002-2008 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon @@ -10,7 +10,7 @@ * @author Graydon Hoare * @author Robert Richter <robert.richter@amd.com> * @author Barry Kasindorf -*/ + */ #include <linux/oprofile.h> #include <linux/device.h> @@ -60,53 +60,10 @@ static unsigned long reset_value[NUM_COUNTERS]; #define IBS_OP_LOW_VALID_BIT (1ULL<<18) /* bit 18 */ #define IBS_OP_LOW_ENABLE (1ULL<<17) /* bit 17 */ -/* Codes used in cpu_buffer.c */ -/* This produces duplicate code, need to be fixed */ -#define IBS_FETCH_BEGIN 3 -#define IBS_OP_BEGIN 4 - -/* - * The function interface needs to be fixed, something like add - * data. Should then be added to linux/oprofile.h. 
- */ -extern void -oprofile_add_ibs_sample(struct pt_regs * const regs, - unsigned int * const ibs_sample, int ibs_code); - -struct ibs_fetch_sample { - /* MSRC001_1031 IBS Fetch Linear Address Register */ - unsigned int ibs_fetch_lin_addr_low; - unsigned int ibs_fetch_lin_addr_high; - /* MSRC001_1030 IBS Fetch Control Register */ - unsigned int ibs_fetch_ctl_low; - unsigned int ibs_fetch_ctl_high; - /* MSRC001_1032 IBS Fetch Physical Address Register */ - unsigned int ibs_fetch_phys_addr_low; - unsigned int ibs_fetch_phys_addr_high; -}; - -struct ibs_op_sample { - /* MSRC001_1034 IBS Op Logical Address Register (IbsRIP) */ - unsigned int ibs_op_rip_low; - unsigned int ibs_op_rip_high; - /* MSRC001_1035 IBS Op Data Register */ - unsigned int ibs_op_data1_low; - unsigned int ibs_op_data1_high; - /* MSRC001_1036 IBS Op Data 2 Register */ - unsigned int ibs_op_data2_low; - unsigned int ibs_op_data2_high; - /* MSRC001_1037 IBS Op Data 3 Register */ - unsigned int ibs_op_data3_low; - unsigned int ibs_op_data3_high; - /* MSRC001_1038 IBS DC Linear Address Register (IbsDcLinAd) */ - unsigned int ibs_dc_linear_low; - unsigned int ibs_dc_linear_high; - /* MSRC001_1039 IBS DC Physical Address Register (IbsDcPhysAd) */ - unsigned int ibs_dc_phys_low; - unsigned int ibs_dc_phys_high; -}; +#define IBS_FETCH_SIZE 6 +#define IBS_OP_SIZE 12 -static int ibs_allowed; /* AMD Family10h and later */ +static int has_ibs; /* AMD Family10h and later */ struct op_ibs_config { unsigned long op_enabled; @@ -197,31 +154,29 @@ static inline int op_amd_handle_ibs(struct pt_regs * const regs, struct op_msrs const * const msrs) { - unsigned int low, high; - struct ibs_fetch_sample ibs_fetch; - struct ibs_op_sample ibs_op; + u32 low, high; + u64 msr; + struct op_entry entry; - if (!ibs_allowed) + if (!has_ibs) return 1; if (ibs_config.fetch_enabled) { rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); if (high & IBS_FETCH_HIGH_VALID_BIT) { - ibs_fetch.ibs_fetch_ctl_high = high; - ibs_fetch.ibs_fetch_ctl_low = low; - rdmsr(MSR_AMD64_IBSFETCHLINAD, low, high); - ibs_fetch.ibs_fetch_lin_addr_high = high; - ibs_fetch.ibs_fetch_lin_addr_low = low; - rdmsr(MSR_AMD64_IBSFETCHPHYSAD, low, high); - ibs_fetch.ibs_fetch_phys_addr_high = high; - ibs_fetch.ibs_fetch_phys_addr_low = low; - - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_fetch, - IBS_FETCH_BEGIN); + rdmsrl(MSR_AMD64_IBSFETCHLINAD, msr); + oprofile_write_reserve(&entry, regs, msr, + IBS_FETCH_CODE, IBS_FETCH_SIZE); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_add_data(&entry, low); + oprofile_add_data(&entry, high); + rdmsrl(MSR_AMD64_IBSFETCHPHYSAD, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_write_commit(&entry); /* reenable the IRQ */ - rdmsr(MSR_AMD64_IBSFETCHCTL, low, high); high &= ~IBS_FETCH_HIGH_VALID_BIT; high |= IBS_FETCH_HIGH_ENABLE; low &= IBS_FETCH_LOW_MAX_CNT_MASK; @@ -232,30 +187,29 @@ op_amd_handle_ibs(struct pt_regs * const regs, if (ibs_config.op_enabled) { rdmsr(MSR_AMD64_IBSOPCTL, low, high); if (low & IBS_OP_LOW_VALID_BIT) { - rdmsr(MSR_AMD64_IBSOPRIP, low, high); - ibs_op.ibs_op_rip_low = low; - ibs_op.ibs_op_rip_high = high; - rdmsr(MSR_AMD64_IBSOPDATA, low, high); - ibs_op.ibs_op_data1_low = low; - ibs_op.ibs_op_data1_high = high; - rdmsr(MSR_AMD64_IBSOPDATA2, low, high); - ibs_op.ibs_op_data2_low = low; - ibs_op.ibs_op_data2_high = high; - rdmsr(MSR_AMD64_IBSOPDATA3, low, high); - ibs_op.ibs_op_data3_low = low; - ibs_op.ibs_op_data3_high = 
high; - rdmsr(MSR_AMD64_IBSDCLINAD, low, high); - ibs_op.ibs_dc_linear_low = low; - ibs_op.ibs_dc_linear_high = high; - rdmsr(MSR_AMD64_IBSDCPHYSAD, low, high); - ibs_op.ibs_dc_phys_low = low; - ibs_op.ibs_dc_phys_high = high; + rdmsrl(MSR_AMD64_IBSOPRIP, msr); + oprofile_write_reserve(&entry, regs, msr, + IBS_OP_CODE, IBS_OP_SIZE); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA2, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSOPDATA3, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSDCLINAD, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + rdmsrl(MSR_AMD64_IBSDCPHYSAD, msr); + oprofile_add_data(&entry, (u32)msr); + oprofile_add_data(&entry, (u32)(msr >> 32)); + oprofile_write_commit(&entry); /* reenable the IRQ */ - oprofile_add_ibs_sample(regs, - (unsigned int *)&ibs_op, - IBS_OP_BEGIN); - rdmsr(MSR_AMD64_IBSOPCTL, low, high); high = 0; low &= ~IBS_OP_LOW_VALID_BIT; low |= IBS_OP_LOW_ENABLE; @@ -305,14 +259,14 @@ static void op_amd_start(struct op_msrs const * const msrs) } #ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && ibs_config.fetch_enabled) { + if (has_ibs && ibs_config.fetch_enabled) { low = (ibs_config.max_cnt_fetch >> 4) & 0xFFFF; high = ((ibs_config.rand_en & 0x1) << 25) /* bit 57 */ + IBS_FETCH_HIGH_ENABLE; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } - if (ibs_allowed && ibs_config.op_enabled) { + if (has_ibs && ibs_config.op_enabled) { low = ((ibs_config.max_cnt_op >> 4) & 0xFFFF) + ((ibs_config.dispatched_ops & 0x1) << 19) /* bit 19 */ + IBS_OP_LOW_ENABLE; @@ -341,14 +295,14 @@ static void op_amd_stop(struct op_msrs const * const msrs) } #ifdef CONFIG_OPROFILE_IBS - if (ibs_allowed && ibs_config.fetch_enabled) { + if (has_ibs && ibs_config.fetch_enabled) { /* clear max count and enable */ low = 0; high = 0; wrmsr(MSR_AMD64_IBSFETCHCTL, low, high); } - if (ibs_allowed && ibs_config.op_enabled) { + if (has_ibs && ibs_config.op_enabled) { /* clear max count and enable */ low = 0; high = 0; @@ -409,6 +363,7 @@ static int init_ibs_nmi(void) | IBSCTL_LVTOFFSETVAL); pci_read_config_dword(cpu_cfg, IBSCTL, &value); if (value != (ibs_eilvt_off | IBSCTL_LVTOFFSETVAL)) { + pci_dev_put(cpu_cfg); printk(KERN_DEBUG "Failed to setup IBS LVT offset, " "IBSCTL = 0x%08x", value); return 1; @@ -436,20 +391,20 @@ static int init_ibs_nmi(void) /* uninitialize the APIC for the IBS interrupts if needed */ static void clear_ibs_nmi(void) { - if (ibs_allowed) + if (has_ibs) on_each_cpu(apic_clear_ibs_nmi_per_cpu, NULL, 1); } /* initialize the APIC for the IBS interrupts if available */ static void ibs_init(void) { - ibs_allowed = boot_cpu_has(X86_FEATURE_IBS); + has_ibs = boot_cpu_has(X86_FEATURE_IBS); - if (!ibs_allowed) + if (!has_ibs) return; if (init_ibs_nmi()) { - ibs_allowed = 0; + has_ibs = 0; return; } @@ -458,7 +413,7 @@ static void ibs_init(void) static void ibs_exit(void) { - if (!ibs_allowed) + if (!has_ibs) return; clear_ibs_nmi(); @@ -478,7 +433,7 @@ static int setup_ibs_files(struct super_block *sb, struct dentry *root) if (ret) return ret; - if (!ibs_allowed) + if (!has_ibs) return ret; /* model specific files */ diff --git a/crypto/async_tx/async_tx.c b/crypto/async_tx/async_tx.c index dcbf1be149f..f21147f3626 100644 --- 
a/crypto/async_tx/async_tx.c +++ b/crypto/async_tx/async_tx.c @@ -28,351 +28,18 @@ #include <linux/async_tx.h> #ifdef CONFIG_DMA_ENGINE -static enum dma_state_client -dma_channel_add_remove(struct dma_client *client, - struct dma_chan *chan, enum dma_state state); - -static struct dma_client async_tx_dma = { - .event_callback = dma_channel_add_remove, - /* .cap_mask == 0 defaults to all channels */ -}; - -/** - * dma_cap_mask_all - enable iteration over all operation types - */ -static dma_cap_mask_t dma_cap_mask_all; - -/** - * chan_ref_percpu - tracks channel allocations per core/opertion - */ -struct chan_ref_percpu { - struct dma_chan_ref *ref; -}; - -static int channel_table_initialized; -static struct chan_ref_percpu *channel_table[DMA_TX_TYPE_END]; - -/** - * async_tx_lock - protect modification of async_tx_master_list and serialize - * rebalance operations - */ -static spinlock_t async_tx_lock; - -static LIST_HEAD(async_tx_master_list); - -/* async_tx_issue_pending_all - start all transactions on all channels */ -void async_tx_issue_pending_all(void) -{ - struct dma_chan_ref *ref; - - rcu_read_lock(); - list_for_each_entry_rcu(ref, &async_tx_master_list, node) - ref->chan->device->device_issue_pending(ref->chan); - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(async_tx_issue_pending_all); - -/* dma_wait_for_async_tx - spin wait for a transcation to complete - * @tx: transaction to wait on - */ -enum dma_status -dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) -{ - enum dma_status status; - struct dma_async_tx_descriptor *iter; - struct dma_async_tx_descriptor *parent; - - if (!tx) - return DMA_SUCCESS; - - /* poll through the dependency chain, return when tx is complete */ - do { - iter = tx; - - /* find the root of the unsubmitted dependency chain */ - do { - parent = iter->parent; - if (!parent) - break; - else - iter = parent; - } while (parent); - - /* there is a small window for ->parent == NULL and - * ->cookie == -EBUSY - */ - while (iter->cookie == -EBUSY) - cpu_relax(); - - status = dma_sync_wait(iter->chan, iter->cookie); - } while (status == DMA_IN_PROGRESS || (iter != tx)); - - return status; -} -EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); - -/* async_tx_run_dependencies - helper routine for dma drivers to process - * (start) dependent operations on their target channel - * @tx: transaction with dependencies - */ -void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx) -{ - struct dma_async_tx_descriptor *dep = tx->next; - struct dma_async_tx_descriptor *dep_next; - struct dma_chan *chan; - - if (!dep) - return; - - chan = dep->chan; - - /* keep submitting up until a channel switch is detected - * in that case we will be called again as a result of - * processing the interrupt from async_tx_channel_switch - */ - for (; dep; dep = dep_next) { - spin_lock_bh(&dep->lock); - dep->parent = NULL; - dep_next = dep->next; - if (dep_next && dep_next->chan == chan) - dep->next = NULL; /* ->next will be submitted */ - else - dep_next = NULL; /* submit current dep and terminate */ - spin_unlock_bh(&dep->lock); - - dep->tx_submit(dep); - } - - chan->device->device_issue_pending(chan); -} -EXPORT_SYMBOL_GPL(async_tx_run_dependencies); - -static void -free_dma_chan_ref(struct rcu_head *rcu) -{ - struct dma_chan_ref *ref; - ref = container_of(rcu, struct dma_chan_ref, rcu); - kfree(ref); -} - -static void -init_dma_chan_ref(struct dma_chan_ref *ref, struct dma_chan *chan) -{ - INIT_LIST_HEAD(&ref->node); - INIT_RCU_HEAD(&ref->rcu); - ref->chan = chan; - 
atomic_set(&ref->count, 0); -} - -/** - * get_chan_ref_by_cap - returns the nth channel of the given capability - * defaults to returning the channel with the desired capability and the - * lowest reference count if the index can not be satisfied - * @cap: capability to match - * @index: nth channel desired, passing -1 has the effect of forcing the - * default return value - */ -static struct dma_chan_ref * -get_chan_ref_by_cap(enum dma_transaction_type cap, int index) -{ - struct dma_chan_ref *ret_ref = NULL, *min_ref = NULL, *ref; - - rcu_read_lock(); - list_for_each_entry_rcu(ref, &async_tx_master_list, node) - if (dma_has_cap(cap, ref->chan->device->cap_mask)) { - if (!min_ref) - min_ref = ref; - else if (atomic_read(&ref->count) < - atomic_read(&min_ref->count)) - min_ref = ref; - - if (index-- == 0) { - ret_ref = ref; - break; - } - } - rcu_read_unlock(); - - if (!ret_ref) - ret_ref = min_ref; - - if (ret_ref) - atomic_inc(&ret_ref->count); - - return ret_ref; -} - -/** - * async_tx_rebalance - redistribute the available channels, optimize - * for cpu isolation in the SMP case, and opertaion isolation in the - * uniprocessor case - */ -static void async_tx_rebalance(void) -{ - int cpu, cap, cpu_idx = 0; - unsigned long flags; - - if (!channel_table_initialized) - return; - - spin_lock_irqsave(&async_tx_lock, flags); - - /* undo the last distribution */ - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_possible_cpu(cpu) { - struct dma_chan_ref *ref = - per_cpu_ptr(channel_table[cap], cpu)->ref; - if (ref) { - atomic_set(&ref->count, 0); - per_cpu_ptr(channel_table[cap], cpu)->ref = - NULL; - } - } - - for_each_dma_cap_mask(cap, dma_cap_mask_all) - for_each_online_cpu(cpu) { - struct dma_chan_ref *new; - if (NR_CPUS > 1) - new = get_chan_ref_by_cap(cap, cpu_idx++); - else - new = get_chan_ref_by_cap(cap, -1); - - per_cpu_ptr(channel_table[cap], cpu)->ref = new; - } - - spin_unlock_irqrestore(&async_tx_lock, flags); -} - -static enum dma_state_client -dma_channel_add_remove(struct dma_client *client, - struct dma_chan *chan, enum dma_state state) -{ - unsigned long found, flags; - struct dma_chan_ref *master_ref, *ref; - enum dma_state_client ack = DMA_DUP; /* default: take no action */ - - switch (state) { - case DMA_RESOURCE_AVAILABLE: - found = 0; - rcu_read_lock(); - list_for_each_entry_rcu(ref, &async_tx_master_list, node) - if (ref->chan == chan) { - found = 1; - break; - } - rcu_read_unlock(); - - pr_debug("async_tx: dma resource available [%s]\n", - found ? 
"old" : "new"); - - if (!found) - ack = DMA_ACK; - else - break; - - /* add the channel to the generic management list */ - master_ref = kmalloc(sizeof(*master_ref), GFP_KERNEL); - if (master_ref) { - /* keep a reference until async_tx is unloaded */ - dma_chan_get(chan); - init_dma_chan_ref(master_ref, chan); - spin_lock_irqsave(&async_tx_lock, flags); - list_add_tail_rcu(&master_ref->node, - &async_tx_master_list); - spin_unlock_irqrestore(&async_tx_lock, - flags); - } else { - printk(KERN_WARNING "async_tx: unable to create" - " new master entry in response to" - " a DMA_RESOURCE_ADDED event" - " (-ENOMEM)\n"); - return 0; - } - - async_tx_rebalance(); - break; - case DMA_RESOURCE_REMOVED: - found = 0; - spin_lock_irqsave(&async_tx_lock, flags); - list_for_each_entry(ref, &async_tx_master_list, node) - if (ref->chan == chan) { - /* permit backing devices to go away */ - dma_chan_put(ref->chan); - list_del_rcu(&ref->node); - call_rcu(&ref->rcu, free_dma_chan_ref); - found = 1; - break; - } - spin_unlock_irqrestore(&async_tx_lock, flags); - - pr_debug("async_tx: dma resource removed [%s]\n", - found ? "ours" : "not ours"); - - if (found) - ack = DMA_ACK; - else - break; - - async_tx_rebalance(); - break; - case DMA_RESOURCE_SUSPEND: - case DMA_RESOURCE_RESUME: - printk(KERN_WARNING "async_tx: does not support dma channel" - " suspend/resume\n"); - break; - default: - BUG(); - } - - return ack; -} - -static int __init -async_tx_init(void) +static int __init async_tx_init(void) { - enum dma_transaction_type cap; - - spin_lock_init(&async_tx_lock); - bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); - - /* an interrupt will never be an explicit operation type. - * clearing this bit prevents allocation to a slot in 'channel_table' - */ - clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); - - for_each_dma_cap_mask(cap, dma_cap_mask_all) { - channel_table[cap] = alloc_percpu(struct chan_ref_percpu); - if (!channel_table[cap]) - goto err; - } - - channel_table_initialized = 1; - dma_async_client_register(&async_tx_dma); - dma_async_client_chan_request(&async_tx_dma); + dmaengine_get(); printk(KERN_INFO "async_tx: api initialized (async)\n"); return 0; -err: - printk(KERN_ERR "async_tx: initialization failure\n"); - - while (--cap >= 0) - free_percpu(channel_table[cap]); - - return 1; } static void __exit async_tx_exit(void) { - enum dma_transaction_type cap; - - channel_table_initialized = 0; - - for_each_dma_cap_mask(cap, dma_cap_mask_all) - if (channel_table[cap]) - free_percpu(channel_table[cap]); - - dma_async_client_unregister(&async_tx_dma); + dmaengine_put(); } /** @@ -387,16 +54,9 @@ __async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, { /* see if we can keep the chain on one channel */ if (depend_tx && - dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) + dma_has_cap(tx_type, depend_tx->chan->device->cap_mask)) return depend_tx->chan; - else if (likely(channel_table_initialized)) { - struct dma_chan_ref *ref; - int cpu = get_cpu(); - ref = per_cpu_ptr(channel_table[tx_type], cpu)->ref; - put_cpu(); - return ref ? 
ref->chan : NULL; - } else - return NULL; + return dma_find_channel(tx_type); } EXPORT_SYMBOL_GPL(__async_tx_find_channel); #else diff --git a/drivers/Kconfig b/drivers/Kconfig index 2f557f570ad..00cf9553f74 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -107,4 +107,6 @@ source "drivers/uio/Kconfig" source "drivers/xen/Kconfig" source "drivers/staging/Kconfig" + +source "drivers/platform/Kconfig" endmenu diff --git a/drivers/Makefile b/drivers/Makefile index 6326f4dbbda..c1bf4173793 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -105,3 +105,4 @@ obj-$(CONFIG_OF) += of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_VIRTIO) += virtio/ obj-$(CONFIG_STAGING) += staging/ +obj-y += platform/ diff --git a/drivers/acpi/Kconfig b/drivers/acpi/Kconfig index b0243fd55ac..d7f9839ba26 100644 --- a/drivers/acpi/Kconfig +++ b/drivers/acpi/Kconfig @@ -196,90 +196,6 @@ config ACPI_NUMA depends on (X86 || IA64) default y if IA64_GENERIC || IA64_SGI_SN2 -config ACPI_WMI - tristate "WMI (EXPERIMENTAL)" - depends on X86 - depends on EXPERIMENTAL - help - This driver adds support for the ACPI-WMI (Windows Management - Instrumentation) mapper device (PNP0C14) found on some systems. - - ACPI-WMI is a proprietary extension to ACPI to expose parts of the - ACPI firmware to userspace - this is done through various vendor - defined methods and data blocks in a PNP0C14 device, which are then - made available for userspace to call. - - The implementation of this in Linux currently only exposes this to - other kernel space drivers. - - This driver is a required dependency to build the firmware specific - drivers needed on many machines, including Acer and HP laptops. - - It is safe to enable this driver even if your DSDT doesn't define - any ACPI-WMI devices. - -config ACPI_ASUS - tristate "ASUS/Medion Laptop Extras" - depends on X86 - select BACKLIGHT_CLASS_DEVICE - ---help--- - This driver provides support for extra features of ACPI-compatible - ASUS laptops. As some of Medion laptops are made by ASUS, it may also - support some Medion laptops (such as 9675 for example). It makes all - the extra buttons generate standard ACPI events that go through - /proc/acpi/events, and (on some models) adds support for changing the - display brightness and output, switching the LCD backlight on and off, - and most importantly, allows you to blink those fancy LEDs intended - for reporting mail and wireless status. - - Note: display switching code is currently considered EXPERIMENTAL, - toying with these values may even lock your machine. - - All settings are changed via /proc/acpi/asus directory entries. Owner - and group for these entries can be set with asus_uid and asus_gid - parameters. - - More information and a userspace daemon for handling the extra buttons - at <http://sourceforge.net/projects/acpi4asus/>. - - If you have an ACPI-compatible ASUS laptop, say Y or M here. This - driver is still under development, so if your laptop is unsupported or - something works not quite as expected, please use the mailing list - available on the above page (acpi4asus-user@lists.sourceforge.net). - - NOTE: This driver is deprecated and will probably be removed soon, - use asus-laptop instead. - -config ACPI_TOSHIBA - tristate "Toshiba Laptop Extras" - depends on X86 && INPUT - select INPUT_POLLDEV - select NET - select RFKILL - select BACKLIGHT_CLASS_DEVICE - ---help--- - This driver adds support for access to certain system settings - on "legacy free" Toshiba laptops. 
These laptops can be recognized by - their lack of a BIOS setup menu and APM support. - - On these machines, all system configuration is handled through the - ACPI. This driver is required for access to controls not covered - by the general ACPI drivers, such as LCD brightness, video output, - etc. - - This driver differs from the non-ACPI Toshiba laptop driver (located - under "Processor type and features") in several aspects. - Configuration is accessed by reading and writing text files in the - /proc tree instead of by program interface to /dev. Furthermore, no - power management functions are exposed, as those are handled by the - general ACPI drivers. - - More information about this driver is available at - <http://memebeam.org/toys/ToshibaAcpiDriver>. - - If you have a legacy free Toshiba laptop (such as the Libretto L1 - series), say Y. - config ACPI_CUSTOM_DSDT_FILE string "Custom DSDT Table file to include" default "" diff --git a/drivers/acpi/Makefile b/drivers/acpi/Makefile index 3c0c93300f1..d80f4cc2e0d 100644 --- a/drivers/acpi/Makefile +++ b/drivers/acpi/Makefile @@ -2,15 +2,8 @@ # Makefile for the Linux ACPI interpreter # -export ACPI_CFLAGS - -ACPI_CFLAGS := -Os - -ifdef CONFIG_ACPI_DEBUG - ACPI_CFLAGS += -DACPI_DEBUG_OUTPUT -endif - -EXTRA_CFLAGS += $(ACPI_CFLAGS) +ccflags-y := -Os +ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT # # ACPI Boot-Time Table Parsing @@ -22,9 +15,13 @@ obj-$(CONFIG_X86) += blacklist.o # ACPI Core Subsystem (Interpreter) # obj-y += osl.o utils.o reboot.o\ - dispatcher/ events/ executer/ hardware/ \ - namespace/ parser/ resources/ tables/ \ - utilities/ + acpica/ + +# sleep related files +obj-y += wakeup.o +obj-y += main.o +obj-$(CONFIG_ACPI_SLEEP) += proc.o + # # ACPI Bus and Device Drivers @@ -35,7 +32,6 @@ ifdef CONFIG_CPU_FREQ processor-objs += processor_perflib.o endif -obj-y += sleep/ obj-y += bus.o glue.o obj-y += scan.o # Keep EC driver first. Initialization of others depend on it. 
@@ -59,9 +55,6 @@ obj-y += power.o obj-$(CONFIG_ACPI_SYSTEM) += system.o event.o obj-$(CONFIG_ACPI_DEBUG) += debug.o obj-$(CONFIG_ACPI_NUMA) += numa.o -obj-$(CONFIG_ACPI_WMI) += wmi.o -obj-$(CONFIG_ACPI_ASUS) += asus_acpi.o -obj-$(CONFIG_ACPI_TOSHIBA) += toshiba_acpi.o obj-$(CONFIG_ACPI_HOTPLUG_MEMORY) += acpi_memhotplug.o obj-$(CONFIG_ACPI_PROCFS_POWER) += cm_sbs.o obj-$(CONFIG_ACPI_SBS) += sbshc.o diff --git a/drivers/acpi/acpica/Makefile b/drivers/acpi/acpica/Makefile new file mode 100644 index 00000000000..3f23298ee3f --- /dev/null +++ b/drivers/acpi/acpica/Makefile @@ -0,0 +1,44 @@ +# +# Makefile for ACPICA Core interpreter +# + +ccflags-y := -Os +ccflags-$(CONFIG_ACPI_DEBUG) += -DACPI_DEBUG_OUTPUT + +obj-y := dsfield.o dsmthdat.o dsopcode.o dswexec.o dswscope.o \ + dsmethod.o dsobject.o dsutils.o dswload.o dswstate.o \ + dsinit.o + +obj-y += evevent.o evregion.o evsci.o evxfevnt.o \ + evmisc.o evrgnini.o evxface.o evxfregn.o \ + evgpe.o evgpeblk.o + +obj-y += exconfig.o exfield.o exnames.o exoparg6.o exresolv.o exstorob.o\ + exconvrt.o exfldio.o exoparg1.o exprep.o exresop.o exsystem.o\ + excreate.o exmisc.o exoparg2.o exregion.o exstore.o exutils.o \ + exdump.o exmutex.o exoparg3.o exresnte.o exstoren.o + +obj-y += hwacpi.o hwgpe.o hwregs.o hwsleep.o hwxface.o + +obj-$(ACPI_FUTURE_USAGE) += hwtimer.o + +obj-y += nsaccess.o nsload.o nssearch.o nsxfeval.o \ + nsalloc.o nseval.o nsnames.o nsutils.o nsxfname.o \ + nsdump.o nsinit.o nsobject.o nswalk.o nsxfobj.o \ + nsparse.o nspredef.o + +obj-$(ACPI_FUTURE_USAGE) += nsdumpdv.o + +obj-y += psargs.o psparse.o psloop.o pstree.o pswalk.o \ + psopcode.o psscope.o psutils.o psxface.o + +obj-y += rsaddr.o rscreate.o rsinfo.o rsio.o rslist.o rsmisc.o rsxface.o \ + rscalc.o rsirq.o rsmemory.o rsutils.o + +obj-$(ACPI_FUTURE_USAGE) += rsdump.o + +obj-y += tbxface.o tbinstal.o tbutils.o tbfind.o tbfadt.o tbxfroot.o + +obj-y += utalloc.o utdebug.o uteval.o utinit.o utmisc.o utxface.o \ + utcopy.o utdelete.o utglobal.o utmath.o utobject.o \ + utstate.o utmutex.o utobject.o utresrc.o diff --git a/drivers/acpi/acpica/accommon.h b/drivers/acpi/acpica/accommon.h new file mode 100644 index 00000000000..3b20786cbb0 --- /dev/null +++ b/drivers/acpi/acpica/accommon.h @@ -0,0 +1,63 @@ +/****************************************************************************** + * + * Name: accommon.h - Common include files for generation of ACPICA source + * + *****************************************************************************/ + +/* + * Copyright (C) 2000 - 2008, Intel Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. 
+ * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + */ + +#ifndef __ACCOMMON_H__ +#define __ACCOMMON_H__ + +/* + * Common set of includes for all ACPICA source files. + * We put them here because we don't want to duplicate them + * in the the source code again and again. + * + * Note: The order of these include files is important. + */ +#include "acconfig.h" /* Global configuration constants */ +#include "acmacros.h" /* C macros */ +#include "aclocal.h" /* Internal data types */ +#include "acobject.h" /* ACPI internal object */ +#include "acstruct.h" /* Common structures */ +#include "acglobal.h" /* All global variables */ +#include "achware.h" /* Hardware defines and interfaces */ +#include "acutils.h" /* Utility interfaces */ + +#endif /* __ACCOMMON_H__ */ diff --git a/include/acpi/acconfig.h b/drivers/acpi/acpica/acconfig.h index 29feee27f0e..e6777fb883d 100644 --- a/include/acpi/acconfig.h +++ b/drivers/acpi/acpica/acconfig.h @@ -61,10 +61,6 @@ * */ -/* Current ACPICA subsystem version in YYYYMMDD format */ - -#define ACPI_CA_VERSION 0x20080926 - /* * OS name, used for the _OS object. 
The _OS object is essentially obsolete, * but there is a large base of ASL/AML code in existing machines that check @@ -119,6 +115,10 @@ #define ACPI_ROOT_TABLE_SIZE_INCREMENT 4 +/* Maximum number of While() loop iterations before forced abort */ + +#define ACPI_MAX_LOOP_ITERATIONS 0xFFFF + /****************************************************************************** * * ACPI Specification constants (Do not change unless the specification changes) diff --git a/include/acpi/acdebug.h b/drivers/acpi/acpica/acdebug.h index 62c59df3b86..62c59df3b86 100644 --- a/include/acpi/acdebug.h +++ b/drivers/acpi/acpica/acdebug.h diff --git a/include/acpi/acdispat.h b/drivers/acpi/acpica/acdispat.h index 6291904be01..6291904be01 100644 --- a/include/acpi/acdispat.h +++ b/drivers/acpi/acpica/acdispat.h diff --git a/include/acpi/acevents.h b/drivers/acpi/acpica/acevents.h index d5d099bf349..07e20135f01 100644 --- a/include/acpi/acevents.h +++ b/drivers/acpi/acpica/acevents.h @@ -93,11 +93,13 @@ struct acpi_gpe_event_info *acpi_ev_get_gpe_event_info(acpi_handle gpe_device, */ u8 acpi_ev_valid_gpe_event(struct acpi_gpe_event_info *gpe_event_info); -acpi_status acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback); +acpi_status +acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback, void *context); acpi_status acpi_ev_delete_gpe_handlers(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, + void *context); acpi_status acpi_ev_create_gpe_block(struct acpi_namespace_node *gpe_device, diff --git a/include/acpi/acglobal.h b/drivers/acpi/acpica/acglobal.h index 15dda46b70d..ddb40f5c68f 100644 --- a/include/acpi/acglobal.h +++ b/drivers/acpi/acpica/acglobal.h @@ -102,6 +102,12 @@ ACPI_EXTERN u8 ACPI_INIT_GLOBAL(acpi_gbl_create_osi_method, TRUE); */ ACPI_EXTERN u8 ACPI_INIT_GLOBAL(acpi_gbl_leave_wake_gpes_disabled, TRUE); +/* + * Optionally use default values for the ACPI register widths. Set this to + * TRUE to use the defaults, if an FADT contains incorrect widths/lengths. 
+ */ +ACPI_EXTERN u8 ACPI_INIT_GLOBAL(acpi_gbl_use_default_register_widths, TRUE); + /***************************************************************************** * * Debug support @@ -140,7 +146,7 @@ ACPI_EXTERN u32 acpi_gbl_trace_flags; */ ACPI_EXTERN struct acpi_internal_rsdt acpi_gbl_root_table_list; ACPI_EXTERN struct acpi_table_fadt acpi_gbl_FADT; -extern u8 acpi_gbl_permanent_mmap; +ACPI_EXTERN struct acpi_table_facs *acpi_gbl_FACS; /* These addresses are calculated from FADT address values */ @@ -326,6 +332,7 @@ ACPI_EXTERN struct acpi_fixed_event_handler ACPI_EXTERN struct acpi_gpe_xrupt_info *acpi_gbl_gpe_xrupt_list_head; ACPI_EXTERN struct acpi_gpe_block_info *acpi_gbl_gpe_fadt_blocks[ACPI_MAX_GPE_BLOCKS]; +ACPI_EXTERN u32 acpi_current_gpe_count; /***************************************************************************** * diff --git a/include/acpi/achware.h b/drivers/acpi/acpica/achware.h index 97a72b19327..58c69dc49ab 100644 --- a/include/acpi/achware.h +++ b/drivers/acpi/acpica/achware.h @@ -44,11 +44,7 @@ #ifndef __ACHWARE_H__ #define __ACHWARE_H__ -/* PM Timer ticks per second (HZ) */ - -#define PM_TIMER_FREQUENCY 3579545 - -/* Values for the _SST reserved method */ +/* Values for the _SST predefined method */ #define ACPI_SST_INDICATOR_OFF 0 #define ACPI_SST_WORKING 1 @@ -56,8 +52,6 @@ #define ACPI_SST_SLEEPING 3 #define ACPI_SST_SLEEP_CONTEXT 4 -/* Prototypes */ - /* * hwacpi - high level functions */ @@ -75,13 +69,6 @@ acpi_hw_register_read(u32 register_id, u32 * return_value); acpi_status acpi_hw_register_write(u32 register_id, u32 value); -acpi_status -acpi_hw_low_level_read(u32 width, - u32 * value, struct acpi_generic_address *reg); - -acpi_status -acpi_hw_low_level_write(u32 width, u32 value, struct acpi_generic_address *reg); - acpi_status acpi_hw_clear_acpi_status(void); /* @@ -94,13 +81,13 @@ acpi_hw_write_gpe_enable_reg(struct acpi_gpe_event_info *gpe_event_info); acpi_status acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, void *context); acpi_status acpi_hw_clear_gpe(struct acpi_gpe_event_info *gpe_event_info); acpi_status acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, void *context); acpi_status acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info, @@ -114,7 +101,8 @@ acpi_status acpi_hw_enable_all_wakeup_gpes(void); acpi_status acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, + void *context); #ifdef ACPI_FUTURE_USAGE /* diff --git a/include/acpi/acinterp.h b/drivers/acpi/acpica/acinterp.h index e8db7a3143a..e8db7a3143a 100644 --- a/include/acpi/acinterp.h +++ b/drivers/acpi/acpica/acinterp.h diff --git a/include/acpi/aclocal.h b/drivers/acpi/acpica/aclocal.h index ecab527cf78..492d02761bb 100644 --- a/include/acpi/aclocal.h +++ b/drivers/acpi/acpica/aclocal.h @@ -46,8 +46,6 @@ /* acpisrc:struct_defs -- for acpisrc conversion */ -#define ACPI_WAIT_FOREVER 0xFFFF /* u16, as per ACPI spec */ -#define ACPI_DO_NOT_WAIT 0 #define ACPI_SERIALIZED 0xFF typedef u32 acpi_mutex_handle; @@ -120,11 +118,6 @@ static char *acpi_gbl_mutex_names[ACPI_NUM_MUTEX] = { #define ACPI_MAX_LOCK 1 #define ACPI_NUM_LOCK ACPI_MAX_LOCK+1 -/* Owner IDs are used to track namespace nodes for selective deletion */ - -typedef u8 acpi_owner_id; -#define 
ACPI_OWNER_ID_MAX 0xFF - /* This Thread ID means that the mutex is not in use (unlocked) */ #define ACPI_MUTEX_NOT_ACQUIRED (acpi_thread_id) 0 @@ -165,11 +158,6 @@ typedef enum { ACPI_IMODE_EXECUTE = 0x03 } acpi_interpreter_mode; -union acpi_name_union { - u32 integer; - char ascii[4]; -}; - /* * The Namespace Node describes a named object that appears in the AML. * descriptor_type is used to differentiate between internal descriptors. @@ -216,26 +204,6 @@ struct acpi_namespace_node { #define ANOBJ_IS_BIT_OFFSET 0x40 /* i_aSL only: Reference is a bit offset */ #define ANOBJ_IS_REFERENCED 0x80 /* i_aSL only: Object was referenced */ -/* - * ACPI Table Descriptor. One per ACPI table - */ -struct acpi_table_desc { - acpi_physical_address address; - struct acpi_table_header *pointer; - u32 length; /* Length fixed at 32 bits */ - union acpi_name_union signature; - acpi_owner_id owner_id; - u8 flags; -}; - -/* Flags for above */ - -#define ACPI_TABLE_ORIGIN_UNKNOWN (0) -#define ACPI_TABLE_ORIGIN_MAPPED (1) -#define ACPI_TABLE_ORIGIN_ALLOCATED (2) -#define ACPI_TABLE_ORIGIN_MASK (3) -#define ACPI_TABLE_IS_LOADED (4) - /* One internal RSDT for table management */ struct acpi_internal_rsdt { @@ -266,15 +234,6 @@ struct acpi_ns_search_data { struct acpi_namespace_node *node; }; -/* - * Predefined Namespace items - */ -struct acpi_predefined_names { - char *name; - u8 type; - char *val; -}; - /* Object types used during package copies */ #define ACPI_COPY_TYPE_SIMPLE 0 @@ -487,10 +446,15 @@ struct acpi_gpe_walk_info { struct acpi_gpe_block_info *gpe_block; }; -typedef acpi_status(*acpi_gpe_callback) (struct acpi_gpe_xrupt_info * - gpe_xrupt_info, - struct acpi_gpe_block_info * - gpe_block); +struct acpi_gpe_device_info { + u32 index; + u32 next_block_base_index; + acpi_status status; + struct acpi_namespace_node *gpe_device; +}; + +typedef acpi_status(*acpi_gpe_callback) (struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context); /* Information about each particular fixed event */ @@ -566,6 +530,7 @@ struct acpi_control_state { union acpi_parse_object *predicate_op; u8 *aml_predicate_start; /* Start of if/while predicate */ u8 *package_end; /* End of if/while block */ + u32 loop_count; /* While() loop counter */ }; /* @@ -671,6 +636,12 @@ union acpi_parse_value { union acpi_parse_object *arg; /* arguments and contained ops */ }; +#ifdef ACPI_DISASSEMBLER +#define ACPI_DISASM_ONLY_MEMBERS(a) a; +#else +#define ACPI_DISASM_ONLY_MEMBERS(a) +#endif + #define ACPI_PARSE_COMMON \ union acpi_parse_object *parent; /* Parent op */\ u8 descriptor_type; /* To differentiate various internal objs */\ @@ -790,9 +761,6 @@ struct acpi_parse_state { * ****************************************************************************/ -#define PCI_ROOT_HID_STRING "PNP0A03" -#define PCI_EXPRESS_ROOT_HID_STRING "PNP0A08" - struct acpi_bit_register_info { u8 parent_register; u8 bit_position; @@ -1019,26 +987,4 @@ struct acpi_debug_mem_block { #define ACPI_MEM_LIST_MAX 1 #define ACPI_NUM_MEM_LISTS 2 -struct acpi_memory_list { - char *list_name; - void *list_head; - u16 object_size; - u16 max_depth; - u16 current_depth; - u16 link_offset; - -#ifdef ACPI_DBG_TRACK_ALLOCATIONS - - /* Statistics for debug memory tracking only */ - - u32 total_allocated; - u32 total_freed; - u32 max_occupied; - u32 total_size; - u32 current_total_size; - u32 requests; - u32 hits; -#endif -}; - #endif /* __ACLOCAL_H__ */ diff --git a/include/acpi/acmacros.h b/drivers/acpi/acpica/acmacros.h index 
1954c9d1d01..9c127e8e2d6 100644 --- a/include/acpi/acmacros.h +++ b/drivers/acpi/acpica/acmacros.h @@ -45,23 +45,6 @@ #define __ACMACROS_H__ /* - * Data manipulation macros - */ -#define ACPI_LOWORD(l) ((u16)(u32)(l)) -#define ACPI_HIWORD(l) ((u16)((((u32)(l)) >> 16) & 0xFFFF)) -#define ACPI_LOBYTE(l) ((u8)(u16)(l)) -#define ACPI_HIBYTE(l) ((u8)((((u16)(l)) >> 8) & 0xFF)) - -#define ACPI_SET_BIT(target,bit) ((target) |= (bit)) -#define ACPI_CLEAR_BIT(target,bit) ((target) &= ~(bit)) -#define ACPI_MIN(a,b) (((a)<(b))?(a):(b)) -#define ACPI_MAX(a,b) (((a)>(b))?(a):(b)) - -/* Size calculation */ - -#define ACPI_ARRAY_LENGTH(x) (sizeof(x) / sizeof((x)[0])) - -/* * Extract data using a pointer. Any more than a byte and we * get into potential aligment issues -- see the STORE macros below. * Use with care. @@ -76,39 +59,6 @@ #define ACPI_SET64(ptr) *ACPI_CAST_PTR (u64, ptr) /* - * Pointer manipulation - */ -#define ACPI_CAST_PTR(t, p) ((t *) (acpi_uintptr_t) (p)) -#define ACPI_CAST_INDIRECT_PTR(t, p) ((t **) (acpi_uintptr_t) (p)) -#define ACPI_ADD_PTR(t, a, b) ACPI_CAST_PTR (t, (ACPI_CAST_PTR (u8, (a)) + (acpi_size)(b))) -#define ACPI_PTR_DIFF(a, b) (acpi_size) (ACPI_CAST_PTR (u8, (a)) - ACPI_CAST_PTR (u8, (b))) - -/* Pointer/Integer type conversions */ - -#define ACPI_TO_POINTER(i) ACPI_ADD_PTR (void, (void *) NULL, (acpi_size) i) -#define ACPI_TO_INTEGER(p) ACPI_PTR_DIFF (p, (void *) NULL) -#define ACPI_OFFSET(d, f) (acpi_size) ACPI_PTR_DIFF (&(((d *)0)->f), (void *) NULL) -#define ACPI_PHYSADDR_TO_PTR(i) ACPI_TO_POINTER(i) -#define ACPI_PTR_TO_PHYSADDR(i) ACPI_TO_INTEGER(i) - -#ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED -#define ACPI_COMPARE_NAME(a, b) (*ACPI_CAST_PTR (u32, (a)) == *ACPI_CAST_PTR (u32, (b))) -#else -#define ACPI_COMPARE_NAME(a, b) (!ACPI_STRNCMP (ACPI_CAST_PTR (char, (a)), ACPI_CAST_PTR (char, (b)), ACPI_NAME_SIZE)) -#endif - -/* - * Full 64-bit integer must be available on both 32-bit and 64-bit platforms - */ -struct acpi_integer_overlay { - u32 lo_dword; - u32 hi_dword; -}; - -#define ACPI_LODWORD(integer) (ACPI_CAST_PTR (struct acpi_integer_overlay, &integer)->lo_dword) -#define ACPI_HIDWORD(integer) (ACPI_CAST_PTR (struct acpi_integer_overlay, &integer)->hi_dword) - -/* * printf() format helpers */ @@ -209,7 +159,7 @@ struct acpi_integer_overlay { /* * The hardware does not support unaligned transfers. We must move the * data one byte at a time. These macros work whether the source or - * the destination (or both) is/are unaligned. (Little-endian move) + * the destination (or both) is/are unaligned. (Little-endian move) */ /* 16-bit source, 16/32/64 destination */ @@ -357,12 +307,6 @@ struct acpi_integer_overlay { {(u32)(Pargs), (u32)(Iargs), (u32)(flags), obj_type, class, type} #endif -#ifdef ACPI_DISASSEMBLER -#define ACPI_DISASM_ONLY_MEMBERS(a) a; -#else -#define ACPI_DISASM_ONLY_MEMBERS(a) -#endif - #define ARG_TYPE_WIDTH 5 #define ARG_1(x) ((u32)(x)) #define ARG_2(x) ((u32)(x) << (1 * ARG_TYPE_WIDTH)) @@ -388,32 +332,16 @@ struct acpi_integer_overlay { #define GET_CURRENT_ARG_TYPE(list) (list & ((u32) 0x1F)) #define INCREMENT_ARG_LIST(list) (list >>= ((u32) ARG_TYPE_WIDTH)) -#if defined (ACPI_DEBUG_OUTPUT) || !defined (ACPI_NO_ERROR_MESSAGES) -/* - * Module name is include in both debug and non-debug versions primarily for - * error messages. 
The __FILE__ macro is not very useful for this, because it - * often includes the entire pathname to the module - */ -#define ACPI_MODULE_NAME(name) static const char ACPI_UNUSED_VAR _acpi_module_name[] = name; -#else -#define ACPI_MODULE_NAME(name) -#endif - /* * Ascii error messages can be configured out */ #ifndef ACPI_NO_ERROR_MESSAGES -#define AE_INFO _acpi_module_name, __LINE__ /* * Error reporting. Callers module and line number are inserted by AE_INFO, * the plist contains a set of parens to allow variable-length lists. * These macros are used for both the debug and non-debug versions of the code. */ -#define ACPI_INFO(plist) acpi_ut_info plist -#define ACPI_WARNING(plist) acpi_ut_warning plist -#define ACPI_EXCEPTION(plist) acpi_ut_exception plist -#define ACPI_ERROR(plist) acpi_ut_error plist #define ACPI_ERROR_NAMESPACE(s, e) acpi_ns_report_error (AE_INFO, s, e); #define ACPI_ERROR_METHOD(s, n, p, e) acpi_ns_report_method_error (AE_INFO, s, n, p, e); @@ -421,13 +349,9 @@ struct acpi_integer_overlay { /* No error messages */ -#define ACPI_INFO(plist) -#define ACPI_WARNING(plist) -#define ACPI_EXCEPTION(plist) -#define ACPI_ERROR(plist) #define ACPI_ERROR_NAMESPACE(s, e) #define ACPI_ERROR_METHOD(s, n, p, e) -#endif +#endif /* ACPI_NO_ERROR_MESSAGES */ /* * Debug macros that are conditionally compiled @@ -435,36 +359,8 @@ struct acpi_integer_overlay { #ifdef ACPI_DEBUG_OUTPUT /* - * Common parameters used for debug output functions: - * line number, function name, module(file) name, component ID - */ -#define ACPI_DEBUG_PARAMETERS __LINE__, ACPI_GET_FUNCTION_NAME, _acpi_module_name, _COMPONENT - -/* * Function entry tracing */ - -/* - * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header, - * define it now. This is the case where there the compiler does not support - * a __func__ macro or equivalent. - */ -#ifndef ACPI_GET_FUNCTION_NAME -#define ACPI_GET_FUNCTION_NAME _acpi_function_name -/* - * The Name parameter should be the procedure name as a quoted string. - * The function name is also used by the function exit macros below. - * Note: (const char) is used to be compatible with the debug interfaces - * and macros such as __func__. 
- */ -#define ACPI_FUNCTION_NAME(name) static const char _acpi_function_name[] = #name; - -#else -/* Compiler supports __func__ (or equivalent) -- Ignore this macro */ - -#define ACPI_FUNCTION_NAME(name) -#endif - #ifdef CONFIG_ACPI_DEBUG_FUNC_TRACE #define ACPI_FUNCTION_TRACE(a) ACPI_FUNCTION_NAME(a) \ @@ -584,15 +480,6 @@ struct acpi_integer_overlay { #define ACPI_DUMP_RESOURCE_LIST(a) acpi_rs_dump_resource_list(a) #define ACPI_DUMP_BUFFER(a, b) acpi_ut_dump_buffer((u8 *) a, b, DB_BYTE_DISPLAY, _COMPONENT) -/* - * Master debug print macros - * Print iff: - * 1) Debug print for the current component is enabled - * 2) Debug error level or trace level for the print statement is enabled - */ -#define ACPI_DEBUG_PRINT(plist) acpi_ut_debug_print plist -#define ACPI_DEBUG_PRINT_RAW(plist) acpi_ut_debug_print_raw plist - #else /* * This is the non-debug case -- make everything go away, @@ -603,7 +490,6 @@ struct acpi_integer_overlay { #define ACPI_DEBUG_DEFINE(a) do { } while(0) #define ACPI_DEBUG_ONLY_MEMBERS(a) do { } while(0) -#define ACPI_FUNCTION_NAME(a) do { } while(0) #define ACPI_FUNCTION_TRACE(a) do { } while(0) #define ACPI_FUNCTION_TRACE_PTR(a, b) do { } while(0) #define ACPI_FUNCTION_TRACE_U32(a, b) do { } while(0) @@ -619,8 +505,6 @@ struct acpi_integer_overlay { #define ACPI_DUMP_PATHNAME(a, b, c, d) do { } while(0) #define ACPI_DUMP_RESOURCE_LIST(a) do { } while(0) #define ACPI_DUMP_BUFFER(a, b) do { } while(0) -#define ACPI_DEBUG_PRINT(pl) do { } while(0) -#define ACPI_DEBUG_PRINT_RAW(pl) do { } while(0) #define return_VOID return #define return_ACPI_STATUS(s) return(s) @@ -629,7 +513,7 @@ struct acpi_integer_overlay { #define return_UINT32(s) return(s) #define return_PTR(s) return(s) -#endif +#endif /* ACPI_DEBUG_OUTPUT */ /* * Some code only gets executed when the debugger is built in. 
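The acmacros.h hunk above strips the module-name and debug-print macro definitions out of this header, yet the macros are still used by every ACPICA source file touched later in this series, so the definitions presumably move to another header not included in this excerpt. For orientation only, the consumer-side pattern as it appears in the ACPICA files below looks roughly like this; the module name, function and message are made up for illustration and are not part of the patch:

#include <acpi/acpi.h>
#include "accommon.h"

#define _COMPONENT          ACPI_DISPATCHER
ACPI_MODULE_NAME("dsexample")	/* hypothetical module, illustration only */

static acpi_status acpi_ds_example_op(struct acpi_walk_state *walk_state)
{
	ACPI_FUNCTION_TRACE(ds_example_op);

	/* _COMPONENT and the module name feed the conditional debug output */
	ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "Walk state %p\n", walk_state));

	return_ACPI_STATUS(AE_OK);
}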
diff --git a/include/acpi/acnamesp.h b/drivers/acpi/acpica/acnamesp.h index db4e6f67785..46cb5b46d28 100644 --- a/include/acpi/acnamesp.h +++ b/drivers/acpi/acpica/acnamesp.h @@ -182,7 +182,9 @@ acpi_status acpi_ns_evaluate(struct acpi_evaluate_info *info); */ acpi_status acpi_ns_check_predefined_names(struct acpi_namespace_node *node, - union acpi_operand_object *return_object); + u32 user_param_count, + acpi_status return_status, + union acpi_operand_object **return_object); const union acpi_predefined_info *acpi_ns_check_for_predefined_name(struct acpi_namespace_node @@ -191,6 +193,7 @@ const union acpi_predefined_info *acpi_ns_check_for_predefined_name(struct void acpi_ns_check_parameter_count(char *pathname, struct acpi_namespace_node *node, + u32 user_param_count, const union acpi_predefined_info *info); /* diff --git a/include/acpi/acobject.h b/drivers/acpi/acpica/acobject.h index eb6f038b03d..eb6f038b03d 100644 --- a/include/acpi/acobject.h +++ b/drivers/acpi/acpica/acobject.h diff --git a/include/acpi/acopcode.h b/drivers/acpi/acpica/acopcode.h index dfdf6332788..dfdf6332788 100644 --- a/include/acpi/acopcode.h +++ b/drivers/acpi/acpica/acopcode.h diff --git a/include/acpi/acparser.h b/drivers/acpi/acpica/acparser.h index 23ee0fbf561..23ee0fbf561 100644 --- a/include/acpi/acparser.h +++ b/drivers/acpi/acpica/acparser.h diff --git a/include/acpi/acpredef.h b/drivers/acpi/acpica/acpredef.h index 16a9ca9a66e..16a9ca9a66e 100644 --- a/include/acpi/acpredef.h +++ b/drivers/acpi/acpica/acpredef.h diff --git a/include/acpi/acresrc.h b/drivers/acpi/acpica/acresrc.h index eef5bd7a59f..eef5bd7a59f 100644 --- a/include/acpi/acresrc.h +++ b/drivers/acpi/acpica/acresrc.h diff --git a/include/acpi/acstruct.h b/drivers/acpi/acpica/acstruct.h index 7980a26bad3..7980a26bad3 100644 --- a/include/acpi/acstruct.h +++ b/drivers/acpi/acpica/acstruct.h diff --git a/include/acpi/actables.h b/drivers/acpi/acpica/actables.h index 0cbe1b9ab52..7ce6e33c7f7 100644 --- a/include/acpi/actables.h +++ b/drivers/acpi/acpica/actables.h @@ -94,6 +94,8 @@ void acpi_tb_set_table_loaded_flag(u32 table_index, u8 is_loaded); /* * tbutils - table manager utilities */ +acpi_status acpi_tb_initialize_facs(void); + u8 acpi_tb_tables_loaded(void); void diff --git a/include/acpi/acutils.h b/drivers/acpi/acpica/acutils.h index d8307b2987e..80d8813484f 100644 --- a/include/acpi/acutils.h +++ b/drivers/acpi/acpica/acutils.h @@ -297,42 +297,6 @@ void acpi_ut_report_info(char *module_name, u32 line_number); void acpi_ut_report_warning(char *module_name, u32 line_number); -/* Error and message reporting interfaces */ - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_debug_print(u32 requested_debug_level, - u32 line_number, - const char *function_name, - const char *module_name, - u32 component_id, - const char *format, ...) ACPI_PRINTF_LIKE(6); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_debug_print_raw(u32 requested_debug_level, - u32 line_number, - const char *function_name, - const char *module_name, - u32 component_id, - const char *format, ...) ACPI_PRINTF_LIKE(6); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_error(const char *module_name, - u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_exception(const char *module_name, - u32 line_number, - acpi_status status, - const char *format, ...) ACPI_PRINTF_LIKE(4); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_warning(const char *module_name, - u32 line_number, const char *format, ...) 
ACPI_PRINTF_LIKE(3); - -void ACPI_INTERNAL_VAR_XFACE -acpi_ut_info(const char *module_name, - u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); - /* * utdelete - Object deletion and reference counts */ diff --git a/include/acpi/amlcode.h b/drivers/acpi/acpica/amlcode.h index ff851c5df69..ff851c5df69 100644 --- a/include/acpi/amlcode.h +++ b/drivers/acpi/acpica/amlcode.h diff --git a/include/acpi/amlresrc.h b/drivers/acpi/acpica/amlresrc.h index 7b070e42b7c..7b070e42b7c 100644 --- a/include/acpi/amlresrc.h +++ b/drivers/acpi/acpica/amlresrc.h diff --git a/drivers/acpi/dispatcher/dsfield.c b/drivers/acpi/acpica/dsfield.c index f988a5e7d2b..53e27bc5a73 100644 --- a/drivers/acpi/dispatcher/dsfield.c +++ b/drivers/acpi/acpica/dsfield.c @@ -42,11 +42,12 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> +#include "accommon.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acparser.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsfield") diff --git a/drivers/acpi/dispatcher/dsinit.c b/drivers/acpi/acpica/dsinit.c index 949f7c75029..eb144b13d8f 100644 --- a/drivers/acpi/dispatcher/dsinit.c +++ b/drivers/acpi/acpica/dsinit.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acdispat.h" +#include "acnamesp.h" +#include "actables.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsinit") diff --git a/drivers/acpi/dispatcher/dsmethod.c b/drivers/acpi/acpica/dsmethod.c index 279a5a60a0d..14b8b8ed802 100644 --- a/drivers/acpi/dispatcher/dsmethod.c +++ b/drivers/acpi/acpica/dsmethod.c @@ -42,11 +42,14 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#ifdef ACPI_DISASSEMBLER #include <acpi/acdisasm.h> +#endif #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsmethod") @@ -412,6 +415,9 @@ acpi_ds_call_control_method(struct acpi_thread_state *thread, if (obj_desc->method.method_flags & AML_METHOD_INTERNAL_ONLY) { status = obj_desc->method.implementation(next_walk_state); + if (status == AE_OK) { + status = AE_CTRL_TERMINATE; + } } return_ACPI_STATUS(status); diff --git a/drivers/acpi/dispatcher/dsmthdat.c b/drivers/acpi/acpica/dsmthdat.c index d03f81bd1bc..da0f5468184 100644 --- a/drivers/acpi/dispatcher/dsmthdat.c +++ b/drivers/acpi/acpica/dsmthdat.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acdispat.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsmthdat") diff --git a/drivers/acpi/dispatcher/dsobject.c b/drivers/acpi/acpica/dsobject.c index 4f08e599d07..15c628e6aa0 100644 --- a/drivers/acpi/dispatcher/dsobject.c +++ b/drivers/acpi/acpica/dsobject.c @@ -42,11 +42,12 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_DISPATCHER 
ACPI_MODULE_NAME("dsobject") diff --git a/drivers/acpi/dispatcher/dsopcode.c b/drivers/acpi/acpica/dsopcode.c index 69fae5905bb..0c3b4dd60e8 100644 --- a/drivers/acpi/dispatcher/dsopcode.c +++ b/drivers/acpi/acpica/dsopcode.c @@ -43,13 +43,14 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acevents.h" +#include "actables.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsopcode") @@ -1140,10 +1141,29 @@ acpi_ds_exec_begin_control_op(struct acpi_walk_state *walk_state, op->common.aml_opcode, walk_state)); switch (op->common.aml_opcode) { - case AML_IF_OP: case AML_WHILE_OP: /* + * If this is an additional iteration of a while loop, continue. + * There is no need to allocate a new control state. + */ + if (walk_state->control_state) { + if (walk_state->control_state->control.aml_predicate_start + == (walk_state->parser_state.aml - 1)) { + + /* Reset the state to start-of-loop */ + + walk_state->control_state->common.state = + ACPI_CONTROL_CONDITIONAL_EXECUTING; + break; + } + } + + /*lint -fallthrough */ + + case AML_IF_OP: + + /* * IF/WHILE: Create a new control state to manage these * constructs. We need to manage these as a stack, in order * to handle nesting. @@ -1243,13 +1263,36 @@ acpi_ds_exec_end_control_op(struct acpi_walk_state * walk_state, ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "[WHILE_OP] Op=%p\n", op)); - if (walk_state->control_state->common.value) { + control_state = walk_state->control_state; + if (control_state->common.value) { - /* Predicate was true, go back and evaluate it again! */ + /* Predicate was true, the body of the loop was just executed */ + /* + * This loop counter mechanism allows the interpreter to escape + * possibly infinite loops. This can occur in poorly written AML + * when the hardware does not respond within a while loop and the + * loop does not implement a timeout. + */ + control_state->control.loop_count++; + if (control_state->control.loop_count > + ACPI_MAX_LOOP_ITERATIONS) { + status = AE_AML_INFINITE_LOOP; + break; + } + + /* + * Go back and evaluate the predicate and maybe execute the loop + * another time + */ status = AE_CTRL_PENDING; + walk_state->aml_last_while = + control_state->control.aml_predicate_start; + break; } + /* Predicate was false, terminate this while loop */ + ACPI_DEBUG_PRINT((ACPI_DB_DISPATCH, "[WHILE_OP] termination! 
Op=%p\n", op)); @@ -1257,9 +1300,6 @@ acpi_ds_exec_end_control_op(struct acpi_walk_state * walk_state, control_state = acpi_ut_pop_generic_state(&walk_state->control_state); - - walk_state->aml_last_while = - control_state->control.aml_predicate_start; acpi_ut_delete_generic_state(control_state); break; diff --git a/drivers/acpi/dispatcher/dsutils.c b/drivers/acpi/acpica/dsutils.c index b398982f0d8..dabc23a4617 100644 --- a/drivers/acpi/dispatcher/dsutils.c +++ b/drivers/acpi/acpica/dsutils.c @@ -42,12 +42,13 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acdebug.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acdebug.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dsutils") diff --git a/drivers/acpi/dispatcher/dswexec.c b/drivers/acpi/acpica/dswexec.c index 396fe12078c..350e6656bc8 100644 --- a/drivers/acpi/dispatcher/dswexec.c +++ b/drivers/acpi/acpica/dswexec.c @@ -43,12 +43,13 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acdebug.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acdebug.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dswexec") diff --git a/drivers/acpi/dispatcher/dswload.c b/drivers/acpi/acpica/dswload.c index dff7a3e445a..3023ceaa8d5 100644 --- a/drivers/acpi/dispatcher/dswload.c +++ b/drivers/acpi/acpica/dswload.c @@ -42,12 +42,13 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acevents.h" #ifdef ACPI_ASL_COMPILER #include <acpi/acdisasm.h> diff --git a/drivers/acpi/dispatcher/dswscope.c b/drivers/acpi/acpica/dswscope.c index 9e607326587..908645e72f0 100644 --- a/drivers/acpi/dispatcher/dswscope.c +++ b/drivers/acpi/acpica/dswscope.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "acdispat.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dswscope") diff --git a/drivers/acpi/dispatcher/dswstate.c b/drivers/acpi/acpica/dswstate.c index b00d4af791a..40f92bf7dce 100644 --- a/drivers/acpi/dispatcher/dswstate.c +++ b/drivers/acpi/acpica/dswstate.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "acnamesp.h" #define _COMPONENT ACPI_DISPATCHER ACPI_MODULE_NAME("dswstate") diff --git a/drivers/acpi/events/evevent.c b/drivers/acpi/acpica/evevent.c index c56c5c6ea77..803edd9e3f6 100644 --- a/drivers/acpi/events/evevent.c +++ b/drivers/acpi/acpica/evevent.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acevents.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evevent") @@ -72,8 +73,8 @@ acpi_status acpi_ev_initialize_events(void) /* * Initialize the Fixed and General Purpose Events. 
This is done prior to - * enabling SCIs to prevent interrupts from occurring before the handlers are - * installed. + * enabling SCIs to prevent interrupts from occurring before the handlers + * are installed. */ status = acpi_ev_fixed_event_initialize(); if (ACPI_FAILURE(status)) { @@ -192,8 +193,8 @@ static acpi_status acpi_ev_fixed_event_initialize(void) acpi_status status; /* - * Initialize the structure that keeps track of fixed event handlers - * and enable the fixed events. + * Initialize the structure that keeps track of fixed event handlers and + * enable the fixed events. */ for (i = 0; i < ACPI_NUM_FIXED_EVENTS; i++) { acpi_gbl_fixed_event_handlers[i].handler = NULL; @@ -237,7 +238,7 @@ u32 acpi_ev_fixed_event_detect(void) /* * Read the fixed feature status and enable registers, as all the cases - * depend on their values. Ignore errors here. + * depend on their values. Ignore errors here. */ (void)acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, &fixed_status); (void)acpi_hw_register_read(ACPI_REGISTER_PM1_ENABLE, &fixed_enable); @@ -291,8 +292,8 @@ static u32 acpi_ev_fixed_event_dispatch(u32 event) status_register_id, 1); /* - * Make sure we've got a handler. If not, report an error. - * The event is disabled to prevent further interrupts. + * Make sure we've got a handler. If not, report an error. The event is + * disabled to prevent further interrupts. */ if (NULL == acpi_gbl_fixed_event_handlers[event].handler) { (void)acpi_set_register(acpi_gbl_fixed_event_info[event]. diff --git a/drivers/acpi/events/evgpe.c b/drivers/acpi/acpica/evgpe.c index f45c74fe745..f345ced3647 100644 --- a/drivers/acpi/events/evgpe.c +++ b/drivers/acpi/acpica/evgpe.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evgpe") @@ -125,7 +126,7 @@ acpi_ev_update_gpe_enable_masks(struct acpi_gpe_event_info *gpe_event_info, (1 << (gpe_event_info->gpe_number - gpe_register_info->base_gpe_number)); - /* 1) Disable case. Simply clear all enable bits */ + /* 1) Disable case. Simply clear all enable bits */ if (type == ACPI_GPE_DISABLE) { ACPI_CLEAR_BIT(gpe_register_info->enable_for_wake, @@ -134,7 +135,7 @@ acpi_ev_update_gpe_enable_masks(struct acpi_gpe_event_info *gpe_event_info, return_ACPI_STATUS(AE_OK); } - /* 2) Enable case. Set/Clear the appropriate enable bits */ + /* 2) Enable case. Set/Clear the appropriate enable bits */ switch (gpe_event_info->flags & ACPI_GPE_TYPE_MASK) { case ACPI_GPE_TYPE_WAKE: @@ -295,7 +296,7 @@ acpi_status acpi_ev_disable_gpe(struct acpi_gpe_event_info *gpe_event_info) * * FUNCTION: acpi_ev_get_gpe_event_info * - * PARAMETERS: gpe_device - Device node. NULL for GPE0/GPE1 + * PARAMETERS: gpe_device - Device node. NULL for GPE0/GPE1 * gpe_number - Raw GPE number * * RETURN: A GPE event_info struct. NULL if not a valid GPE @@ -372,7 +373,7 @@ struct acpi_gpe_event_info *acpi_ev_get_gpe_event_info(acpi_handle gpe_device, * * RETURN: INTERRUPT_HANDLED or INTERRUPT_NOT_HANDLED * - * DESCRIPTION: Detect if any GP events have occurred. This function is + * DESCRIPTION: Detect if any GP events have occurred. This function is * executed at interrupt level. 
* ******************************************************************************/ @@ -400,8 +401,8 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) /* * We need to obtain the GPE lock for both the data structs and registers - * Note: Not necessary to obtain the hardware lock, since the GPE registers - * are owned by the gpe_lock. + * Note: Not necessary to obtain the hardware lock, since the GPE + * registers are owned by the gpe_lock. */ flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); @@ -410,9 +411,8 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) gpe_block = gpe_xrupt_list->gpe_block_list_head; while (gpe_block) { /* - * Read all of the 8-bit GPE status and enable registers - * in this GPE block, saving all of them. - * Find all currently active GP events. + * Read all of the 8-bit GPE status and enable registers in this GPE + * block, saving all of them. Find all currently active GP events. */ for (i = 0; i < gpe_block->register_count; i++) { @@ -423,10 +423,8 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) /* Read the Status Register */ status = - acpi_hw_low_level_read(ACPI_GPE_REGISTER_WIDTH, - &status_reg, - &gpe_register_info-> - status_address); + acpi_read(&status_reg, + &gpe_register_info->status_address); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } @@ -434,10 +432,8 @@ u32 acpi_ev_gpe_detect(struct acpi_gpe_xrupt_info * gpe_xrupt_list) /* Read the Enable Register */ status = - acpi_hw_low_level_read(ACPI_GPE_REGISTER_WIDTH, - &enable_reg, - &gpe_register_info-> - enable_address); + acpi_read(&enable_reg, + &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } @@ -527,8 +523,8 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context) (void)acpi_ev_enable_gpe(gpe_event_info, FALSE); /* - * Take a snapshot of the GPE info for this level - we copy the - * info to prevent a race condition with remove_handler/remove_block. + * Take a snapshot of the GPE info for this level - we copy the info to + * prevent a race condition with remove_handler/remove_block. */ ACPI_MEMCPY(&local_gpe_event_info, gpe_event_info, sizeof(struct acpi_gpe_event_info)); @@ -539,8 +535,8 @@ static void ACPI_SYSTEM_XFACE acpi_ev_asynch_execute_gpe_method(void *context) } /* - * Must check for control method type dispatch one more - * time to avoid race with ev_gpe_install_handler + * Must check for control method type dispatch one more time to avoid a + * race with ev_gpe_install_handler */ if ((local_gpe_event_info.flags & ACPI_GPE_DISPATCH_MASK) == ACPI_GPE_DISPATCH_METHOD) { @@ -584,8 +580,8 @@ static void acpi_ev_asynch_enable_gpe(void *context) if ((gpe_event_info->flags & ACPI_GPE_XRUPT_TYPE_MASK) == ACPI_GPE_LEVEL_TRIGGERED) { /* - * GPE is level-triggered, we clear the GPE status bit after - * handling the event. + * GPE is level-triggered, we clear the GPE status bit after handling + * the event. */ status = acpi_hw_clear_gpe(gpe_event_info); if (ACPI_FAILURE(status)) { @@ -624,7 +620,7 @@ acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) acpi_os_gpe_count(gpe_number); /* - * If edge-triggered, clear the GPE status bit now. Note that + * If edge-triggered, clear the GPE status bit now. Note that * level-triggered events are cleared after the GPE is serviced. 
*/ if ((gpe_event_info->flags & ACPI_GPE_XRUPT_TYPE_MASK) == @@ -650,7 +646,8 @@ acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) /* * Invoke the installed handler (at interrupt level) - * Ignore return status for now. TBD: leave GPE disabled on error? + * Ignore return status for now. + * TBD: leave GPE disabled on error? */ (void)gpe_event_info->dispatch.handler->address(gpe_event_info-> dispatch. @@ -708,7 +705,7 @@ acpi_ev_gpe_dispatch(struct acpi_gpe_event_info *gpe_event_info, u32 gpe_number) gpe_number)); /* - * Disable the GPE. The GPE will remain disabled until the ACPI + * Disable the GPE. The GPE will remain disabled until the ACPICA * Core Subsystem is restarted, or a handler is installed. */ status = acpi_ev_disable_gpe(gpe_event_info); diff --git a/drivers/acpi/events/evgpeblk.c b/drivers/acpi/acpica/evgpeblk.c index 73c058e2f5c..484cc0565d5 100644 --- a/drivers/acpi/events/evgpeblk.c +++ b/drivers/acpi/acpica/evgpeblk.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evgpeblk") @@ -124,6 +125,7 @@ u8 acpi_ev_valid_gpe_event(struct acpi_gpe_event_info *gpe_event_info) * FUNCTION: acpi_ev_walk_gpe_list * * PARAMETERS: gpe_walk_callback - Routine called for each GPE block + * Context - Value passed to callback * * RETURN: Status * @@ -131,7 +133,8 @@ u8 acpi_ev_valid_gpe_event(struct acpi_gpe_event_info *gpe_event_info) * ******************************************************************************/ -acpi_status acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback) +acpi_status +acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback, void *context) { struct acpi_gpe_block_info *gpe_block; struct acpi_gpe_xrupt_info *gpe_xrupt_info; @@ -154,8 +157,13 @@ acpi_status acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback) /* One callback per GPE block */ - status = gpe_walk_callback(gpe_xrupt_info, gpe_block); + status = + gpe_walk_callback(gpe_xrupt_info, gpe_block, + context); if (ACPI_FAILURE(status)) { + if (status == AE_CTRL_END) { /* Callback abort */ + status = AE_OK; + } goto unlock_and_exit; } @@ -186,7 +194,8 @@ acpi_status acpi_ev_walk_gpe_list(acpi_gpe_callback gpe_walk_callback) acpi_status acpi_ev_delete_gpe_handlers(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block) + struct acpi_gpe_block_info *gpe_block, + void *context) { struct acpi_gpe_event_info *gpe_event_info; u32 i; @@ -309,17 +318,17 @@ acpi_ev_save_method_info(acpi_handle obj_handle, (gpe_block->block_base_number + (gpe_block->register_count * 8)))) { /* - * Not valid for this GPE block, just ignore it - * However, it may be valid for a different GPE block, since GPE0 and GPE1 - * methods both appear under \_GPE. + * Not valid for this GPE block, just ignore it. However, it may be + * valid for a different GPE block, since GPE0 and GPE1 methods both + * appear under \_GPE. */ return_ACPI_STATUS(AE_OK); } /* - * Now we can add this information to the gpe_event_info block - * for use during dispatch of this GPE. Default type is RUNTIME, although - * this may change when the _PRW methods are executed later. + * Now we can add this information to the gpe_event_info block for use + * during dispatch of this GPE. Default type is RUNTIME, although this may + * change when the _PRW methods are executed later. 
*/ gpe_event_info = &gpe_block->event_info[gpe_number - gpe_block->block_base_number]; @@ -394,8 +403,8 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, gpe_block = gpe_info->gpe_block; /* - * The _PRW object must return a package, we are only interested - * in the first element + * The _PRW object must return a package, we are only interested in the + * first element */ obj_desc = pkg_desc->package.elements[0]; @@ -434,7 +443,7 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, /* * Is this GPE within this block? * - * TRUE iff these conditions are true: + * TRUE if and only if these conditions are true: * 1) The GPE devices match. * 2) The GPE index(number) is within the range of the Gpe Block * associated with the GPE device. @@ -457,6 +466,7 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, if (ACPI_FAILURE(status)) { goto cleanup; } + status = acpi_ev_update_gpe_enable_masks(gpe_event_info, ACPI_GPE_DISABLE); @@ -476,9 +486,9 @@ acpi_ev_match_prw_and_gpe(acpi_handle obj_handle, * RETURN: A GPE interrupt block * * DESCRIPTION: Get or Create a GPE interrupt block. There is one interrupt - * block per unique interrupt level used for GPEs. - * Should be called only when the GPE lists are semaphore locked - * and not subject to change. + * block per unique interrupt level used for GPEs. Should be + * called only when the GPE lists are semaphore locked and not + * subject to change. * ******************************************************************************/ @@ -608,8 +618,9 @@ acpi_ev_delete_gpe_xrupt(struct acpi_gpe_xrupt_info *gpe_xrupt) * * FUNCTION: acpi_ev_install_gpe_block * - * PARAMETERS: gpe_block - New GPE block - * interrupt_number - Xrupt to be associated with this GPE block + * PARAMETERS: gpe_block - New GPE block + * interrupt_number - Xrupt to be associated with this + * GPE block * * RETURN: Status * @@ -666,7 +677,7 @@ acpi_ev_install_gpe_block(struct acpi_gpe_block_info *gpe_block, * * FUNCTION: acpi_ev_delete_gpe_block * - * PARAMETERS: gpe_block - Existing GPE block + * PARAMETERS: gpe_block - Existing GPE block * * RETURN: Status * @@ -688,7 +699,8 @@ acpi_status acpi_ev_delete_gpe_block(struct acpi_gpe_block_info *gpe_block) /* Disable all GPEs in this block */ - status = acpi_hw_disable_gpe_block(gpe_block->xrupt_block, gpe_block); + status = + acpi_hw_disable_gpe_block(gpe_block->xrupt_block, gpe_block, NULL); if (!gpe_block->previous && !gpe_block->next) { @@ -715,6 +727,9 @@ acpi_status acpi_ev_delete_gpe_block(struct acpi_gpe_block_info *gpe_block) acpi_os_release_lock(acpi_gbl_gpe_lock, flags); } + acpi_current_gpe_count -= + gpe_block->register_count * ACPI_GPE_REGISTER_WIDTH; + /* Free the gpe_block */ ACPI_FREE(gpe_block->register_info); @@ -786,9 +801,9 @@ acpi_ev_create_gpe_info_blocks(struct acpi_gpe_block_info *gpe_block) /* * Initialize the GPE Register and Event structures. A goal of these - * tables is to hide the fact that there are two separate GPE register sets - * in a given GPE hardware block, the status registers occupy the first half, - * and the enable registers occupy the second half. + * tables is to hide the fact that there are two separate GPE register + * sets in a given GPE hardware block, the status registers occupy the + * first half, and the enable registers occupy the second half. 
*/ this_register = gpe_register_info; this_event = gpe_event_info; @@ -816,10 +831,8 @@ acpi_ev_create_gpe_info_blocks(struct acpi_gpe_block_info *gpe_block) ACPI_GPE_REGISTER_WIDTH; this_register->enable_address.bit_width = ACPI_GPE_REGISTER_WIDTH; - this_register->status_address.bit_offset = - ACPI_GPE_REGISTER_WIDTH; - this_register->enable_address.bit_offset = - ACPI_GPE_REGISTER_WIDTH; + this_register->status_address.bit_offset = 0; + this_register->enable_address.bit_offset = 0; /* Init the event_info for each GPE within this register */ @@ -832,18 +845,14 @@ acpi_ev_create_gpe_info_blocks(struct acpi_gpe_block_info *gpe_block) /* Disable all GPEs within this register */ - status = acpi_hw_low_level_write(ACPI_GPE_REGISTER_WIDTH, 0x00, - &this_register-> - enable_address); + status = acpi_write(0x00, &this_register->enable_address); if (ACPI_FAILURE(status)) { goto error_exit; } /* Clear any pending GPE events within this register */ - status = acpi_hw_low_level_write(ACPI_GPE_REGISTER_WIDTH, 0xFF, - &this_register-> - status_address); + status = acpi_write(0xFF, &this_register->status_address); if (ACPI_FAILURE(status)) { goto error_exit; } @@ -956,6 +965,9 @@ acpi_ev_create_gpe_block(struct acpi_namespace_node *gpe_device, gpe_device->name.ascii, gpe_block->register_count, interrupt_number)); + /* Update global count of currently available GPEs */ + + acpi_current_gpe_count += register_count * ACPI_GPE_REGISTER_WIDTH; return_ACPI_STATUS(AE_OK); } @@ -1055,7 +1067,7 @@ acpi_ev_initialize_gpe_block(struct acpi_namespace_node *gpe_device, /* Enable all valid runtime GPEs found above */ - status = acpi_hw_enable_runtime_gpe_block(NULL, gpe_block); + status = acpi_hw_enable_runtime_gpe_block(NULL, gpe_block, NULL); if (ACPI_FAILURE(status)) { ACPI_ERROR((AE_INFO, "Could not enable GPEs in GpeBlock %p", gpe_block)); diff --git a/drivers/acpi/events/evmisc.c b/drivers/acpi/acpica/evmisc.c index 1d5670be729..5f893057bcc 100644 --- a/drivers/acpi/events/evmisc.c +++ b/drivers/acpi/acpica/evmisc.c @@ -42,18 +42,15 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evmisc") -/* Pointer to FACS needed for the Global Lock */ -static struct acpi_table_facs *facs = NULL; - /* Local prototypes */ - static void ACPI_SYSTEM_XFACE acpi_ev_notify_dispatch(void *context); static u32 acpi_ev_global_lock_handler(void *context); @@ -152,7 +149,9 @@ acpi_ev_queue_notify_request(struct acpi_namespace_node * node, break; default: + /* All other types are not supported */ + return (AE_TYPE); } } @@ -193,9 +192,8 @@ acpi_ev_queue_notify_request(struct acpi_namespace_node * node, acpi_ut_delete_generic_state(notify_info); } } else { - /* - * There is no notify handler (per-device or system) for this device. - */ + /* There is no notify handler (per-device or system) for this device */ + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No notify handler for Notify (%4.4s, %X) node %p\n", acpi_ut_get_node_name(node), notify_value, @@ -229,9 +227,8 @@ static void ACPI_SYSTEM_XFACE acpi_ev_notify_dispatch(void *context) ACPI_FUNCTION_ENTRY(); /* - * We will invoke a global notify handler if installed. - * This is done _before_ we invoke the per-device handler attached - * to the device. + * We will invoke a global notify handler if installed. This is done + * _before_ we invoke the per-device handler attached to the device. 
*/ if (notify_info->notify.value <= ACPI_MAX_SYS_NOTIFY) { @@ -299,7 +296,7 @@ static u32 acpi_ev_global_lock_handler(void *context) * If we don't get it now, it will be marked pending and we will * take another interrupt when it becomes free. */ - ACPI_ACQUIRE_GLOBAL_LOCK(facs, acquired); + ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired); if (acquired) { /* Got the lock, now wake all threads waiting for it */ @@ -336,34 +333,27 @@ acpi_status acpi_ev_init_global_lock_handler(void) ACPI_FUNCTION_TRACE(ev_init_global_lock_handler); - status = acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, - ACPI_CAST_INDIRECT_PTR(struct - acpi_table_header, - &facs)); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } + /* Attempt installation of the global lock handler */ - acpi_gbl_global_lock_present = TRUE; status = acpi_install_fixed_event_handler(ACPI_EVENT_GLOBAL, acpi_ev_global_lock_handler, NULL); /* - * If the global lock does not exist on this platform, the attempt - * to enable GBL_STATUS will fail (the GBL_ENABLE bit will not stick) - * Map to AE_OK, but mark global lock as not present. - * Any attempt to actually use the global lock will be flagged - * with an error. + * If the global lock does not exist on this platform, the attempt to + * enable GBL_STATUS will fail (the GBL_ENABLE bit will not stick). + * Map to AE_OK, but mark global lock as not present. Any attempt to + * actually use the global lock will be flagged with an error. */ if (status == AE_NO_HARDWARE_RESPONSE) { ACPI_ERROR((AE_INFO, "No response from Global Lock hardware, disabling lock")); acpi_gbl_global_lock_present = FALSE; - status = AE_OK; + return_ACPI_STATUS(AE_OK); } + acpi_gbl_global_lock_present = TRUE; return_ACPI_STATUS(status); } @@ -462,8 +452,8 @@ acpi_status acpi_ev_acquire_global_lock(u16 timeout) } /* - * Make sure that a global lock actually exists. If not, just treat - * the lock as a standard mutex. + * Make sure that a global lock actually exists. If not, just treat the + * lock as a standard mutex. */ if (!acpi_gbl_global_lock_present) { acpi_gbl_global_lock_acquired = TRUE; @@ -472,7 +462,7 @@ acpi_status acpi_ev_acquire_global_lock(u16 timeout) /* Attempt to acquire the actual hardware lock */ - ACPI_ACQUIRE_GLOBAL_LOCK(facs, acquired); + ACPI_ACQUIRE_GLOBAL_LOCK(acpi_gbl_FACS, acquired); if (acquired) { /* We got the lock */ @@ -536,7 +526,7 @@ acpi_status acpi_ev_release_global_lock(void) /* Allow any thread to release the lock */ - ACPI_RELEASE_GLOBAL_LOCK(facs, pending); + ACPI_RELEASE_GLOBAL_LOCK(acpi_gbl_FACS, pending); /* * If the pending bit was set, we must write GBL_RLS to the control @@ -582,8 +572,8 @@ void acpi_ev_terminate(void) if (acpi_gbl_events_initialized) { /* - * Disable all event-related functionality. - * In all cases, on error, print a message but obviously we don't abort. + * Disable all event-related functionality. In all cases, on error, + * print a message but obviously we don't abort. 
*/ /* Disable all fixed events */ @@ -599,7 +589,7 @@ void acpi_ev_terminate(void) /* Disable all GPEs in all GPE blocks */ - status = acpi_ev_walk_gpe_list(acpi_hw_disable_gpe_block); + status = acpi_ev_walk_gpe_list(acpi_hw_disable_gpe_block, NULL); /* Remove SCI handler */ @@ -617,7 +607,7 @@ void acpi_ev_terminate(void) /* Deallocate all handler objects installed within GPE info structs */ - status = acpi_ev_walk_gpe_list(acpi_ev_delete_gpe_handlers); + status = acpi_ev_walk_gpe_list(acpi_ev_delete_gpe_handlers, NULL); /* Return to original mode if necessary */ diff --git a/drivers/acpi/events/evregion.c b/drivers/acpi/acpica/evregion.c index 236fbd1ca43..665c0887ab4 100644 --- a/drivers/acpi/events/evregion.c +++ b/drivers/acpi/acpica/evregion.c @@ -42,22 +42,15 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evregion") -#define ACPI_NUM_DEFAULT_SPACES 4 -static u8 acpi_gbl_default_address_spaces[ACPI_NUM_DEFAULT_SPACES] = { - ACPI_ADR_SPACE_SYSTEM_MEMORY, - ACPI_ADR_SPACE_SYSTEM_IO, - ACPI_ADR_SPACE_PCI_CONFIG, - ACPI_ADR_SPACE_DATA_TABLE -}; /* Local prototypes */ - static acpi_status acpi_ev_reg_run(acpi_handle obj_handle, u32 level, void *context, void **return_value); @@ -66,6 +59,17 @@ static acpi_status acpi_ev_install_handler(acpi_handle obj_handle, u32 level, void *context, void **return_value); +/* These are the address spaces that will get default handlers */ + +#define ACPI_NUM_DEFAULT_SPACES 4 + +static u8 acpi_gbl_default_address_spaces[ACPI_NUM_DEFAULT_SPACES] = { + ACPI_ADR_SPACE_SYSTEM_MEMORY, + ACPI_ADR_SPACE_SYSTEM_IO, + ACPI_ADR_SPACE_PCI_CONFIG, + ACPI_ADR_SPACE_DATA_TABLE +}; + /******************************************************************************* * * FUNCTION: acpi_ev_install_region_handlers @@ -91,18 +95,19 @@ acpi_status acpi_ev_install_region_handlers(void) } /* - * All address spaces (PCI Config, EC, SMBus) are scope dependent - * and registration must occur for a specific device. + * All address spaces (PCI Config, EC, SMBus) are scope dependent and + * registration must occur for a specific device. * - * In the case of the system memory and IO address spaces there is currently - * no device associated with the address space. For these we use the root. + * In the case of the system memory and IO address spaces there is + * currently no device associated with the address space. For these we + * use the root. * - * We install the default PCI config space handler at the root so - * that this space is immediately available even though the we have - * not enumerated all the PCI Root Buses yet. This is to conform - * to the ACPI specification which states that the PCI config - * space must be always available -- even though we are nowhere - * near ready to find the PCI root buses at this point. + * We install the default PCI config space handler at the root so that + * this space is immediately available even though the we have not + * enumerated all the PCI Root Buses yet. This is to conform to the ACPI + * specification which states that the PCI config space must be always + * available -- even though we are nowhere near ready to find the PCI root + * buses at this point. * * NOTE: We ignore AE_ALREADY_EXISTS because this means that a handler * has already been installed (via acpi_install_address_space_handler). 
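The NOTE above refers to the public acpi_install_address_space_handler() interface. As a minimal sketch (the wrapper function and its error handling are illustrative assumptions, not part of this patch), this is how a caller outside ACPICA would request the default SystemMemory handler at the namespace root, which is the same operation acpi_ev_install_region_handlers() performs internally:

#include <acpi/acpi.h>

/*
 * Hypothetical helper: request the default SystemMemory operation region
 * handler at the namespace root. ACPI_DEFAULT_HANDLER tells ACPICA to use
 * its built-in handler/setup pair for this space ID, so the Setup and
 * Context arguments can be NULL. AE_ALREADY_EXISTS (or AE_SAME_HANDLER)
 * only means a handler was installed earlier, per the NOTE above, and is
 * not treated as a failure here.
 */
static acpi_status my_install_default_memory_handler(void)
{
	acpi_status status;

	status = acpi_install_address_space_handler(ACPI_ROOT_OBJECT,
						    ACPI_ADR_SPACE_SYSTEM_MEMORY,
						    ACPI_DEFAULT_HANDLER,
						    NULL,	/* Setup */
						    NULL);	/* Context */
	if (status == AE_ALREADY_EXISTS || status == AE_SAME_HANDLER)
		return AE_OK;

	return status;
}
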
@@ -160,12 +165,11 @@ acpi_status acpi_ev_initialize_op_regions(void) return_ACPI_STATUS(status); } - /* - * Run the _REG methods for op_regions in each default address space - */ - for (i = 0; i < ACPI_NUM_DEFAULT_SPACES; i++) { + /* Run the _REG methods for op_regions in each default address space */ - /* TBD: Make sure handler is the DEFAULT handler, otherwise + for (i = 0; i < ACPI_NUM_DEFAULT_SPACES; i++) { + /* + * TBD: Make sure handler is the DEFAULT handler, otherwise * _REG will have already been run. */ status = acpi_ev_execute_reg_methods(acpi_gbl_root_node, @@ -318,13 +322,13 @@ acpi_ev_address_space_dispatch(union acpi_operand_object *region_obj, } /* - * It may be the case that the region has never been initialized + * It may be the case that the region has never been initialized. * Some types of regions require special init code */ if (!(region_obj->region.flags & AOPOBJ_SETUP_COMPLETE)) { - /* - * This region has not been initialized yet, do it - */ + + /* This region has not been initialized yet, do it */ + region_setup = handler_desc->address_space.setup; if (!region_setup) { @@ -339,9 +343,9 @@ acpi_ev_address_space_dispatch(union acpi_operand_object *region_obj, } /* - * We must exit the interpreter because the region - * setup will potentially execute control methods - * (e.g., _REG method for this region) + * We must exit the interpreter because the region setup will + * potentially execute control methods (for example, the _REG method + * for this region) */ acpi_ex_exit_interpreter(); @@ -364,9 +368,8 @@ acpi_ev_address_space_dispatch(union acpi_operand_object *region_obj, return_ACPI_STATUS(status); } - /* - * Region initialization may have been completed by region_setup - */ + /* Region initialization may have been completed by region_setup */ + if (!(region_obj->region.flags & AOPOBJ_SETUP_COMPLETE)) { region_obj->region.flags |= AOPOBJ_SETUP_COMPLETE; @@ -521,8 +524,8 @@ acpi_ev_detach_region(union acpi_operand_object *region_obj, } /* - * If the region has been activated, call the setup handler - * with the deactivate notification + * If the region has been activated, call the setup handler with + * the deactivate notification */ if (region_obj->region.flags & AOPOBJ_SETUP_COMPLETE) { region_setup = handler_obj->address_space.setup; @@ -668,8 +671,8 @@ acpi_ev_install_handler(acpi_handle obj_handle, } /* - * We only care about regions.and objects - * that are allowed to have address space handlers + * We only care about regions and objects that are allowed to have + * address space handlers */ if ((node->type != ACPI_TYPE_DEVICE) && (node->type != ACPI_TYPE_REGION) && (node != acpi_gbl_root_node)) { @@ -710,9 +713,9 @@ acpi_ev_install_handler(acpi_handle obj_handle, /* * Since the object we found it on was a device, then it * means that someone has already installed a handler for - * the branch of the namespace from this device on. Just + * the branch of the namespace from this device on. Just * bail out telling the walk routine to not traverse this - * branch. This preserves the scoping rule for handlers. + * branch. This preserves the scoping rule for handlers. */ return (AE_CTRL_DEPTH); } @@ -723,9 +726,8 @@ acpi_ev_install_handler(acpi_handle obj_handle, } /* - * As long as the device didn't have a handler for this - * space we don't care about it. We just ignore it and - * proceed. + * As long as the device didn't have a handler for this space we + * don't care about it. We just ignore it and proceed. 
*/ return (AE_OK); } @@ -733,16 +735,14 @@ acpi_ev_install_handler(acpi_handle obj_handle, /* Object is a Region */ if (obj_desc->region.space_id != handler_obj->address_space.space_id) { - /* - * This region is for a different address space - * -- just ignore it - */ + + /* This region is for a different address space, just ignore it */ + return (AE_OK); } /* - * Now we have a region and it is for the handler's address - * space type. + * Now we have a region and it is for the handler's address space type. * * First disconnect region for any previous handler (if any) */ @@ -786,9 +786,8 @@ acpi_ev_install_space_handler(struct acpi_namespace_node * node, ACPI_FUNCTION_TRACE(ev_install_space_handler); /* - * This registration is valid for only the types below - * and the root. This is where the default handlers - * get placed. + * This registration is valid for only the types below and the root. This + * is where the default handlers get placed. */ if ((node->type != ACPI_TYPE_DEVICE) && (node->type != ACPI_TYPE_PROCESSOR) && @@ -848,8 +847,8 @@ acpi_ev_install_space_handler(struct acpi_namespace_node * node, obj_desc = acpi_ns_get_attached_object(node); if (obj_desc) { /* - * The attached device object already exists. - * Make sure the handler is not already installed. + * The attached device object already exists. Make sure the handler + * is not already installed. */ handler_obj = obj_desc->device.handler; @@ -864,8 +863,8 @@ acpi_ev_install_space_handler(struct acpi_namespace_node * node, handler) { /* * It is (relatively) OK to attempt to install the SAME - * handler twice. This can easily happen - * with PCI_Config space. + * handler twice. This can easily happen with the + * PCI_Config space. */ status = AE_SAME_HANDLER; goto unlock_and_exit; @@ -925,9 +924,8 @@ acpi_ev_install_space_handler(struct acpi_namespace_node * node, /* * Install the handler * - * At this point there is no existing handler. - * Just allocate the object for the handler and link it - * into the list. + * At this point there is no existing handler. Just allocate the object + * for the handler and link it into the list. */ handler_obj = acpi_ut_create_internal_object(ACPI_TYPE_LOCAL_ADDRESS_HANDLER); @@ -1000,11 +998,10 @@ acpi_ev_execute_reg_methods(struct acpi_namespace_node *node, ACPI_FUNCTION_TRACE(ev_execute_reg_methods); /* - * Run all _REG methods for all Operation Regions for this - * space ID. This is a separate walk in order to handle any - * interdependencies between regions and _REG methods. (i.e. handlers - * must be installed for all regions of this Space ID before we - * can run any _REG methods) + * Run all _REG methods for all Operation Regions for this space ID. This + * is a separate walk in order to handle any interdependencies between + * regions and _REG methods. (i.e. 
handlers must be installed for all + * regions of this Space ID before we can run any _REG methods) */ status = acpi_ns_walk_namespace(ACPI_TYPE_ANY, node, ACPI_UINT32_MAX, ACPI_NS_WALK_UNLOCK, acpi_ev_reg_run, @@ -1042,8 +1039,8 @@ acpi_ev_reg_run(acpi_handle obj_handle, } /* - * We only care about regions.and objects - * that are allowed to have address space handlers + * We only care about regions.and objects that are allowed to have address + * space handlers */ if ((node->type != ACPI_TYPE_REGION) && (node != acpi_gbl_root_node)) { return (AE_OK); @@ -1062,10 +1059,9 @@ acpi_ev_reg_run(acpi_handle obj_handle, /* Object is a Region */ if (obj_desc->region.space_id != space_id) { - /* - * This region is for a different address space - * -- just ignore it - */ + + /* This region is for a different address space, just ignore it */ + return (AE_OK); } diff --git a/drivers/acpi/events/evrgnini.c b/drivers/acpi/acpica/evrgnini.c index 6b94b38df07..f3f1fb45c3d 100644 --- a/drivers/acpi/events/evrgnini.c +++ b/drivers/acpi/acpica/evrgnini.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evrgnini") @@ -233,9 +234,9 @@ acpi_ev_pci_config_region_setup(acpi_handle handle, if (ACPI_FAILURE(status)) { if (status == AE_SAME_HANDLER) { /* - * It is OK if the handler is already installed on the root - * bridge. Still need to return a context object for the - * new PCI_Config operation region, however. + * It is OK if the handler is already installed on the + * root bridge. Still need to return a context object + * for the new PCI_Config operation region, however. */ status = AE_OK; } else { @@ -272,8 +273,8 @@ acpi_ev_pci_config_region_setup(acpi_handle handle, } /* - * For PCI_Config space access, we need the segment, bus, - * device and function numbers. Acquire them here. + * For PCI_Config space access, we need the segment, bus, device and + * function numbers. Acquire them here. * * Find the parent device object. (This allows the operation region to be * within a subscope under the device, such as a control method.) @@ -289,16 +290,16 @@ acpi_ev_pci_config_region_setup(acpi_handle handle, } /* - * Get the PCI device and function numbers from the _ADR object - * contained in the parent's scope. + * Get the PCI device and function numbers from the _ADR object contained + * in the parent's scope. */ status = acpi_ut_evaluate_numeric_object(METHOD_NAME__ADR, pci_device_node, &pci_value); /* - * The default is zero, and since the allocation above zeroed - * the data, just do nothing on failure. + * The default is zero, and since the allocation above zeroed the data, + * just do nothing on failure. */ if (ACPI_SUCCESS(status)) { pci_id->device = ACPI_HIWORD(ACPI_LODWORD(pci_value)); @@ -382,9 +383,8 @@ static u8 acpi_ev_is_pci_root_bridge(struct acpi_namespace_node *node) struct acpi_compatible_id_list *cid; u32 i; - /* - * Get the _HID and check for a PCI Root Bridge - */ + /* Get the _HID and check for a PCI Root Bridge */ + status = acpi_ut_execute_HID(node, &hid); if (ACPI_FAILURE(status)) { return (FALSE); @@ -394,10 +394,8 @@ static u8 acpi_ev_is_pci_root_bridge(struct acpi_namespace_node *node) return (TRUE); } - /* - * The _HID did not match. - * Get the _CID and check for a PCI Root Bridge - */ + /* The _HID did not match. 
Get the _CID and check for a PCI Root Bridge */ + status = acpi_ut_execute_CID(node, &cid); if (ACPI_FAILURE(status)) { return (FALSE); @@ -516,9 +514,9 @@ acpi_ev_default_region_setup(acpi_handle handle, * Get the appropriate address space handler for a newly * created region. * - * This also performs address space specific initialization. For + * This also performs address space specific initialization. For * example, PCI regions must have an _ADR object that contains - * a PCI address in the scope of the definition. This address is + * a PCI address in the scope of the definition. This address is * required to perform an access to PCI config space. * * MUTEX: Interpreter should be unlocked, because we may run the _REG @@ -572,7 +570,7 @@ acpi_ev_initialize_region(union acpi_operand_object *region_obj, if (ACPI_SUCCESS(status)) { /* * The _REG method is optional and there can be only one per region - * definition. This will be executed when the handler is attached + * definition. This will be executed when the handler is attached * or removed */ region_obj2->extra.method_REG = method_node; @@ -670,10 +668,8 @@ acpi_ev_initialize_region(union acpi_operand_object *region_obj, } } - /* - * This node does not have the handler we need; - * Pop up one level - */ + /* This node does not have the handler we need; Pop up one level */ + node = acpi_ns_get_parent_node(node); } diff --git a/drivers/acpi/events/evsci.c b/drivers/acpi/acpica/evsci.c index 2a8b7787761..567b356c85a 100644 --- a/drivers/acpi/events/evsci.c +++ b/drivers/acpi/acpica/evsci.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acevents.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evsci") @@ -115,10 +116,8 @@ u32 ACPI_SYSTEM_XFACE acpi_ev_gpe_xrupt_handler(void *context) * if this interrupt handler is installed, ACPI is enabled. */ - /* - * GPEs: - * Check for and dispatch any GPEs that have occurred - */ + /* GPEs: Check for and dispatch any GPEs that have occurred */ + interrupt_handled |= acpi_ev_gpe_detect(gpe_xrupt_list); return_UINT32(interrupt_handled); @@ -158,11 +157,11 @@ u32 acpi_ev_install_sci_handler(void) * RETURN: E_OK if handler uninstalled OK, E_ERROR if handler was not * installed to begin with * - * DESCRIPTION: Remove the SCI interrupt handler. No further SCIs will be + * DESCRIPTION: Remove the SCI interrupt handler. No further SCIs will be * taken. * * Note: It doesn't seem important to disable all events or set the event - * enable registers to their original values. The OS should disable + * enable registers to their original values. The OS should disable * the SCI interrupt level when the handler is removed, so no more * events will come in. * diff --git a/drivers/acpi/events/evxface.c b/drivers/acpi/acpica/evxface.c index 94a6efe020b..3aca9010a11 100644 --- a/drivers/acpi/events/evxface.c +++ b/drivers/acpi/acpica/evxface.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acevents.h" +#include "acinterp.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evxface") @@ -267,7 +268,7 @@ acpi_install_notify_handler(acpi_handle device, /* * Root Object: * Registering a notify handler on the root object indicates that the - * caller wishes to receive notifications for all objects. Note that + * caller wishes to receive notifications for all objects. 
Note that * only one <external> global handler can be regsitered (per notify type). */ if (device == ACPI_ROOT_OBJECT) { diff --git a/drivers/acpi/events/evxfevnt.c b/drivers/acpi/acpica/evxfevnt.c index 41554f736b6..35485e4b60a 100644 --- a/drivers/acpi/events/evxfevnt.c +++ b/drivers/acpi/acpica/evxfevnt.c @@ -42,13 +42,19 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" +#include "actables.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evxfevnt") +/* Local prototypes */ +acpi_status +acpi_ev_get_gpe_device(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context); + /******************************************************************************* * * FUNCTION: acpi_enable @@ -60,6 +66,7 @@ ACPI_MODULE_NAME("evxfevnt") * DESCRIPTION: Transfers the system into ACPI mode. * ******************************************************************************/ + acpi_status acpi_enable(void) { acpi_status status = AE_OK; @@ -161,8 +168,8 @@ acpi_status acpi_enable_event(u32 event, u32 flags) } /* - * Enable the requested fixed event (by writing a one to the - * enable register bit) + * Enable the requested fixed event (by writing a one to the enable + * register bit) */ status = acpi_set_register(acpi_gbl_fixed_event_info[event]. @@ -343,8 +350,8 @@ acpi_status acpi_disable_event(u32 event, u32 flags) } /* - * Disable the requested fixed event (by writing a zero to the - * enable register bit) + * Disable the requested fixed event (by writing a zero to the enable + * register bit) */ status = acpi_set_register(acpi_gbl_fixed_event_info[event]. @@ -396,8 +403,8 @@ acpi_status acpi_clear_event(u32 event) } /* - * Clear the requested fixed event (By writing a one to the - * status register bit) + * Clear the requested fixed event (By writing a one to the status + * register bit) */ status = acpi_set_register(acpi_gbl_fixed_event_info[event]. @@ -717,3 +724,148 @@ acpi_status acpi_remove_gpe_block(acpi_handle gpe_device) } ACPI_EXPORT_SYMBOL(acpi_remove_gpe_block) + +/******************************************************************************* + * + * FUNCTION: acpi_get_gpe_device + * + * PARAMETERS: Index - System GPE index (0-current_gpe_count) + * gpe_device - Where the parent GPE Device is returned + * + * RETURN: Status + * + * DESCRIPTION: Obtain the GPE device associated with the input index. A NULL + * gpe device indicates that the gpe number is contained in one of + * the FADT-defined gpe blocks. Otherwise, the GPE block device. 
+ * + ******************************************************************************/ +acpi_status +acpi_get_gpe_device(u32 index, acpi_handle *gpe_device) +{ + struct acpi_gpe_device_info info; + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_get_gpe_device); + + if (!gpe_device) { + return_ACPI_STATUS(AE_BAD_PARAMETER); + } + + if (index >= acpi_current_gpe_count) { + return_ACPI_STATUS(AE_NOT_EXIST); + } + + /* Setup and walk the GPE list */ + + info.index = index; + info.status = AE_NOT_EXIST; + info.gpe_device = NULL; + info.next_block_base_index = 0; + + status = acpi_ev_walk_gpe_list(acpi_ev_get_gpe_device, &info); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + *gpe_device = info.gpe_device; + return_ACPI_STATUS(info.status); +} + +ACPI_EXPORT_SYMBOL(acpi_get_gpe_device) + +/******************************************************************************* + * + * FUNCTION: acpi_ev_get_gpe_device + * + * PARAMETERS: GPE_WALK_CALLBACK + * + * RETURN: Status + * + * DESCRIPTION: Matches the input GPE index (0-current_gpe_count) with a GPE + * block device. NULL if the GPE is one of the FADT-defined GPEs. + * + ******************************************************************************/ +acpi_status +acpi_ev_get_gpe_device(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context) +{ + struct acpi_gpe_device_info *info = context; + + /* Increment Index by the number of GPEs in this block */ + + info->next_block_base_index += + (gpe_block->register_count * ACPI_GPE_REGISTER_WIDTH); + + if (info->index < info->next_block_base_index) { + /* + * The GPE index is within this block, get the node. Leave the node + * NULL for the FADT-defined GPEs + */ + if ((gpe_block->node)->type == ACPI_TYPE_DEVICE) { + info->gpe_device = gpe_block->node; + } + + info->status = AE_OK; + return (AE_CTRL_END); + } + + return (AE_OK); +} + +/****************************************************************************** + * + * FUNCTION: acpi_disable_all_gpes + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Disable and clear all GPEs in all GPE blocks + * + ******************************************************************************/ + +acpi_status acpi_disable_all_gpes(void) +{ + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_disable_all_gpes); + + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + status = acpi_hw_disable_all_gpes(); + (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); + + return_ACPI_STATUS(status); +} + +/****************************************************************************** + * + * FUNCTION: acpi_enable_all_runtime_gpes + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Enable all "runtime" GPEs, in all GPE blocks + * + ******************************************************************************/ + +acpi_status acpi_enable_all_runtime_gpes(void) +{ + acpi_status status; + + ACPI_FUNCTION_TRACE(acpi_enable_all_runtime_gpes); + + status = acpi_ut_acquire_mutex(ACPI_MTX_EVENTS); + if (ACPI_FAILURE(status)) { + return_ACPI_STATUS(status); + } + + status = acpi_hw_enable_all_runtime_gpes(); + (void)acpi_ut_release_mutex(ACPI_MTX_EVENTS); + + return_ACPI_STATUS(status); +} diff --git a/drivers/acpi/events/evxfregn.c b/drivers/acpi/acpica/evxfregn.c index e8750807e57..479e7a3721b 100644 --- a/drivers/acpi/events/evxfregn.c +++ b/drivers/acpi/acpica/evxfregn.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include 
<acpi/acnamesp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acevents.h" #define _COMPONENT ACPI_EVENTS ACPI_MODULE_NAME("evxfregn") diff --git a/drivers/acpi/executer/exconfig.c b/drivers/acpi/acpica/exconfig.c index 74da6fa52ef..932bbc26aa0 100644 --- a/drivers/acpi/executer/exconfig.c +++ b/drivers/acpi/acpica/exconfig.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "actables.h" +#include "acdispat.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exconfig") diff --git a/drivers/acpi/executer/exconvrt.c b/drivers/acpi/acpica/exconvrt.c index 1d1f35adddd..0be10188316 100644 --- a/drivers/acpi/executer/exconvrt.c +++ b/drivers/acpi/acpica/exconvrt.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exconvrt") diff --git a/drivers/acpi/executer/excreate.c b/drivers/acpi/acpica/excreate.c index ad09696d506..a57ad2564ab 100644 --- a/drivers/acpi/executer/excreate.c +++ b/drivers/acpi/acpica/excreate.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("excreate") diff --git a/drivers/acpi/executer/exdump.c b/drivers/acpi/acpica/exdump.c index d087a7d28aa..aa313574b0d 100644 --- a/drivers/acpi/executer/exdump.c +++ b/drivers/acpi/acpica/exdump.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exdump") diff --git a/drivers/acpi/executer/exfield.c b/drivers/acpi/acpica/exfield.c index 3e440d84226..a352d023385 100644 --- a/drivers/acpi/executer/exfield.c +++ b/drivers/acpi/acpica/exfield.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acdispat.h" +#include "acinterp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exfield") diff --git a/drivers/acpi/executer/exfldio.c b/drivers/acpi/acpica/exfldio.c index 9ff9d1f4615..ef58ac4e687 100644 --- a/drivers/acpi/executer/exfldio.c +++ b/drivers/acpi/acpica/exfldio.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acevents.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acevents.h" +#include "acdispat.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exfldio") @@ -498,14 +499,13 @@ acpi_ex_field_datum_io(union acpi_operand_object *obj_desc, return_ACPI_STATUS(status); } - ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, - "I/O to Data Register: ValuePtr %p\n", - value)); - if (read_write == ACPI_READ) { /* Read the datum from the data_register */ + ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, + "Read from Data Register\n")); + status = acpi_ex_extract_from_field(obj_desc->index_field. 
data_obj, value, @@ -513,6 +513,10 @@ acpi_ex_field_datum_io(union acpi_operand_object *obj_desc, } else { /* Write the datum to the data_register */ + ACPI_DEBUG_PRINT((ACPI_DB_BFIELD, + "Write to Data Register: Value %8.8X%8.8X\n", + ACPI_FORMAT_UINT64(*value))); + status = acpi_ex_insert_into_field(obj_desc->index_field. data_obj, value, diff --git a/drivers/acpi/executer/exmisc.c b/drivers/acpi/acpica/exmisc.c index efb19134005..6b0747ac683 100644 --- a/drivers/acpi/executer/exmisc.c +++ b/drivers/acpi/acpica/exmisc.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/amlresrc.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "amlresrc.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exmisc") diff --git a/drivers/acpi/executer/exmutex.c b/drivers/acpi/acpica/exmutex.c index a8bf3d713e2..d301c1f363e 100644 --- a/drivers/acpi/executer/exmutex.c +++ b/drivers/acpi/acpica/exmutex.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acinterp.h" +#include "acevents.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exmutex") diff --git a/drivers/acpi/executer/exnames.c b/drivers/acpi/acpica/exnames.c index 817e67be369..ffdae122d94 100644 --- a/drivers/acpi/executer/exnames.c +++ b/drivers/acpi/acpica/exnames.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exnames") diff --git a/drivers/acpi/executer/exoparg1.c b/drivers/acpi/acpica/exoparg1.c index f622f9eac8a..b530480cc7d 100644 --- a/drivers/acpi/executer/exoparg1.c +++ b/drivers/acpi/acpica/exoparg1.c @@ -43,11 +43,12 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exoparg1") diff --git a/drivers/acpi/executer/exoparg2.c b/drivers/acpi/acpica/exoparg2.c index 368def5dffc..0b4f513ca88 100644 --- a/drivers/acpi/executer/exoparg2.c +++ b/drivers/acpi/acpica/exoparg2.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acinterp.h> -#include <acpi/acevents.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "acinterp.h" +#include "acevents.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exoparg2") diff --git a/drivers/acpi/executer/exoparg3.c b/drivers/acpi/acpica/exoparg3.c index 9cb4197681a..c6520bbf882 100644 --- a/drivers/acpi/executer/exoparg3.c +++ b/drivers/acpi/acpica/exoparg3.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "acparser.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exoparg3") diff --git a/drivers/acpi/executer/exoparg6.c b/drivers/acpi/acpica/exoparg6.c index 67d48737af5..ae43f7670a6 100644 --- a/drivers/acpi/executer/exoparg6.c +++ b/drivers/acpi/acpica/exoparg6.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> +#include 
"accommon.h" +#include "acinterp.h" +#include "acparser.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exoparg6") diff --git a/drivers/acpi/executer/exprep.c b/drivers/acpi/acpica/exprep.c index a7dc87ecee3..a226f74d4a5 100644 --- a/drivers/acpi/executer/exprep.c +++ b/drivers/acpi/acpica/exprep.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exprep") diff --git a/drivers/acpi/executer/exregion.c b/drivers/acpi/acpica/exregion.c index 7a41c409ae4..76ec8ff903b 100644 --- a/drivers/acpi/executer/exregion.c +++ b/drivers/acpi/acpica/exregion.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acinterp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exregion") diff --git a/drivers/acpi/executer/exresnte.c b/drivers/acpi/acpica/exresnte.c index 423ad3635f3..a063a74006f 100644 --- a/drivers/acpi/executer/exresnte.c +++ b/drivers/acpi/acpica/exresnte.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exresnte") diff --git a/drivers/acpi/executer/exresolv.c b/drivers/acpi/acpica/exresolv.c index 60e8c47128e..f6105a6d612 100644 --- a/drivers/acpi/executer/exresolv.c +++ b/drivers/acpi/acpica/exresolv.c @@ -43,10 +43,11 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "amlcode.h" +#include "acdispat.h" +#include "acinterp.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exresolv") diff --git a/drivers/acpi/executer/exresop.c b/drivers/acpi/acpica/exresop.c index 0bb82593da7..3c3802764bf 100644 --- a/drivers/acpi/executer/exresop.c +++ b/drivers/acpi/acpica/exresop.c @@ -43,10 +43,11 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acparser.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "amlcode.h" +#include "acparser.h" +#include "acinterp.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exresop") diff --git a/drivers/acpi/executer/exstore.c b/drivers/acpi/acpica/exstore.c index 1c118ba78ad..e35e9b4f6a4 100644 --- a/drivers/acpi/executer/exstore.c +++ b/drivers/acpi/acpica/exstore.c @@ -43,10 +43,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acdispat.h" +#include "acinterp.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exstore") diff --git a/drivers/acpi/executer/exstoren.c b/drivers/acpi/acpica/exstoren.c index eef61a00803..145d15305f7 100644 --- a/drivers/acpi/executer/exstoren.c +++ b/drivers/acpi/acpica/exstoren.c @@ -44,8 +44,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exstoren") diff --git a/drivers/acpi/executer/exstorob.c b/drivers/acpi/acpica/exstorob.c index 
9a75ff09fb0..67340cc7014 100644 --- a/drivers/acpi/executer/exstorob.c +++ b/drivers/acpi/acpica/exstorob.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acinterp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exstorob") diff --git a/drivers/acpi/executer/exsystem.c b/drivers/acpi/acpica/exsystem.c index 68990f1df37..3d00b935723 100644 --- a/drivers/acpi/executer/exsystem.c +++ b/drivers/acpi/acpica/exsystem.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acinterp.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exsystem") diff --git a/drivers/acpi/executer/exutils.c b/drivers/acpi/acpica/exutils.c index 86c03880b52..32b85d68e75 100644 --- a/drivers/acpi/executer/exutils.c +++ b/drivers/acpi/acpica/exutils.c @@ -59,8 +59,9 @@ #define DEFINE_AML_GLOBALS #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_EXECUTER ACPI_MODULE_NAME("exutils") diff --git a/drivers/acpi/hardware/hwacpi.c b/drivers/acpi/acpica/hwacpi.c index 816894ea839..a9d4fea4167 100644 --- a/drivers/acpi/hardware/hwacpi.c +++ b/drivers/acpi/acpica/hwacpi.c @@ -43,6 +43,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwacpi") diff --git a/drivers/acpi/hardware/hwgpe.c b/drivers/acpi/acpica/hwgpe.c index 0b80db9d919..2013b66745d 100644 --- a/drivers/acpi/hardware/hwgpe.c +++ b/drivers/acpi/acpica/hwgpe.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acevents.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwgpe") @@ -51,7 +52,8 @@ ACPI_MODULE_NAME("hwgpe") /* Local prototypes */ static acpi_status acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block); + struct acpi_gpe_block_info *gpe_block, + void *context); /****************************************************************************** * @@ -80,8 +82,7 @@ acpi_status acpi_hw_low_disable_gpe(struct acpi_gpe_event_info *gpe_event_info) /* Get current value of the enable register that contains this GPE */ - status = acpi_hw_low_level_read(ACPI_GPE_REGISTER_WIDTH, &enable_mask, - &gpe_register_info->enable_address); + status = acpi_read(&enable_mask, &gpe_register_info->enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -95,9 +96,7 @@ acpi_status acpi_hw_low_disable_gpe(struct acpi_gpe_event_info *gpe_event_info) /* Write the updated enable mask */ - status = acpi_hw_low_level_write(ACPI_GPE_REGISTER_WIDTH, enable_mask, - &gpe_register_info->enable_address); - + status = acpi_write(enable_mask, &gpe_register_info->enable_address); return (status); } @@ -132,8 +131,8 @@ acpi_hw_write_gpe_enable_reg(struct acpi_gpe_event_info * gpe_event_info) /* Write the entire GPE (runtime) enable register */ - status = acpi_hw_low_level_write(8, gpe_register_info->enable_for_run, - &gpe_register_info->enable_address); + status = acpi_write(gpe_register_info->enable_for_run, + &gpe_register_info->enable_address); return (status); } @@ -166,9 +165,8 @@ acpi_status acpi_hw_clear_gpe(struct acpi_gpe_event_info * gpe_event_info) * Write a one to the appropriate bit in the status register to * clear this GPE. 
*/ - status = acpi_hw_low_level_write(8, register_bit, - &gpe_event_info->register_info-> - status_address); + status = acpi_write(register_bit, + &gpe_event_info->register_info->status_address); return (status); } @@ -227,9 +225,7 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, /* GPE currently active (status bit == 1)? */ - status = - acpi_hw_low_level_read(8, &in_byte, - &gpe_register_info->status_address); + status = acpi_read(&in_byte, &gpe_register_info->status_address); if (ACPI_FAILURE(status)) { goto unlock_and_exit; } @@ -260,8 +256,8 @@ acpi_hw_get_gpe_status(struct acpi_gpe_event_info * gpe_event_info, ******************************************************************************/ acpi_status -acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, - struct acpi_gpe_block_info * gpe_block) +acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; @@ -272,9 +268,9 @@ acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, /* Disable all GPEs in this register */ - status = acpi_hw_low_level_write(8, 0x00, - &gpe_block->register_info[i]. - enable_address); + status = + acpi_write(0x00, + &gpe_block->register_info[i].enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -297,8 +293,8 @@ acpi_hw_disable_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, ******************************************************************************/ acpi_status -acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, - struct acpi_gpe_block_info * gpe_block) +acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; @@ -309,9 +305,9 @@ acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, /* Clear status on all GPEs in this register */ - status = acpi_hw_low_level_write(8, 0xFF, - &gpe_block->register_info[i]. - status_address); + status = + acpi_write(0xFF, + &gpe_block->register_info[i].status_address); if (ACPI_FAILURE(status)) { return (status); } @@ -335,8 +331,8 @@ acpi_hw_clear_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, ******************************************************************************/ acpi_status -acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, - struct acpi_gpe_block_info * gpe_block) +acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, + struct acpi_gpe_block_info *gpe_block, void *context) { u32 i; acpi_status status; @@ -352,12 +348,9 @@ acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, /* Enable all "runtime" GPEs in this register */ - status = - acpi_hw_low_level_write(8, - gpe_block->register_info[i]. - enable_for_run, - &gpe_block->register_info[i]. - enable_address); + status = acpi_write(gpe_block->register_info[i].enable_for_run, + &gpe_block->register_info[i]. 
+ enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -382,7 +375,8 @@ acpi_hw_enable_runtime_gpe_block(struct acpi_gpe_xrupt_info * gpe_xrupt_info, static acpi_status acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, - struct acpi_gpe_block_info *gpe_block) + struct acpi_gpe_block_info *gpe_block, + void *context) { u32 i; acpi_status status; @@ -396,11 +390,9 @@ acpi_hw_enable_wakeup_gpe_block(struct acpi_gpe_xrupt_info *gpe_xrupt_info, /* Enable all "wake" GPEs in this register */ - status = acpi_hw_low_level_write(8, - gpe_block->register_info[i]. - enable_for_wake, - &gpe_block->register_info[i]. - enable_address); + status = acpi_write(gpe_block->register_info[i].enable_for_wake, + &gpe_block->register_info[i]. + enable_address); if (ACPI_FAILURE(status)) { return (status); } @@ -427,8 +419,8 @@ acpi_status acpi_hw_disable_all_gpes(void) ACPI_FUNCTION_TRACE(hw_disable_all_gpes); - status = acpi_ev_walk_gpe_list(acpi_hw_disable_gpe_block); - status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block); + status = acpi_ev_walk_gpe_list(acpi_hw_disable_gpe_block, NULL); + status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block, NULL); return_ACPI_STATUS(status); } @@ -450,7 +442,7 @@ acpi_status acpi_hw_enable_all_runtime_gpes(void) ACPI_FUNCTION_TRACE(hw_enable_all_runtime_gpes); - status = acpi_ev_walk_gpe_list(acpi_hw_enable_runtime_gpe_block); + status = acpi_ev_walk_gpe_list(acpi_hw_enable_runtime_gpe_block, NULL); return_ACPI_STATUS(status); } @@ -472,6 +464,6 @@ acpi_status acpi_hw_enable_all_wakeup_gpes(void) ACPI_FUNCTION_TRACE(hw_enable_all_wakeup_gpes); - status = acpi_ev_walk_gpe_list(acpi_hw_enable_wakeup_gpe_block); + status = acpi_ev_walk_gpe_list(acpi_hw_enable_wakeup_gpe_block, NULL); return_ACPI_STATUS(status); } diff --git a/drivers/acpi/acpica/hwregs.c b/drivers/acpi/acpica/hwregs.c new file mode 100644 index 00000000000..4dc43b01851 --- /dev/null +++ b/drivers/acpi/acpica/hwregs.c @@ -0,0 +1,353 @@ + +/******************************************************************************* + * + * Module Name: hwregs - Read/write access functions for the various ACPI + * control and status registers. + * + ******************************************************************************/ + +/* + * Copyright (C) 2000 - 2008, Intel Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. 
+ * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. + */ + +#include <acpi/acpi.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acevents.h" + +#define _COMPONENT ACPI_HARDWARE +ACPI_MODULE_NAME("hwregs") + +/******************************************************************************* + * + * FUNCTION: acpi_hw_clear_acpi_status + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Clears all fixed and general purpose status bits + * THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED + * + ******************************************************************************/ +acpi_status acpi_hw_clear_acpi_status(void) +{ + acpi_status status; + acpi_cpu_flags lock_flags = 0; + + ACPI_FUNCTION_TRACE(hw_clear_acpi_status); + + ACPI_DEBUG_PRINT((ACPI_DB_IO, "About to write %04X to %04X\n", + ACPI_BITMASK_ALL_FIXED_STATUS, + (u16) acpi_gbl_FADT.xpm1a_event_block.address)); + + lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + + status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, + ACPI_BITMASK_ALL_FIXED_STATUS); + if (ACPI_FAILURE(status)) { + goto unlock_and_exit; + } + + /* Clear the fixed events */ + + if (acpi_gbl_FADT.xpm1b_event_block.address) { + status = acpi_write(ACPI_BITMASK_ALL_FIXED_STATUS, + &acpi_gbl_FADT.xpm1b_event_block); + if (ACPI_FAILURE(status)) { + goto unlock_and_exit; + } + } + + /* Clear the GPE Bits in all GPE registers in all GPE blocks */ + + status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block, NULL); + + unlock_and_exit: + acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + return_ACPI_STATUS(status); +} + +/******************************************************************************* + * + * FUNCTION: acpi_hw_get_register_bit_mask + * + * PARAMETERS: register_id - Index of ACPI Register to access + * + * RETURN: The bitmask to be used when accessing the register + * + * DESCRIPTION: Map register_id into a register bitmask. + * + ******************************************************************************/ + +struct acpi_bit_register_info *acpi_hw_get_bit_register_info(u32 register_id) +{ + ACPI_FUNCTION_ENTRY(); + + if (register_id > ACPI_BITREG_MAX) { + ACPI_ERROR((AE_INFO, "Invalid BitRegister ID: %X", + register_id)); + return (NULL); + } + + return (&acpi_gbl_bit_register_info[register_id]); +} + +/****************************************************************************** + * + * FUNCTION: acpi_hw_register_read + * + * PARAMETERS: register_id - ACPI Register ID + * return_value - Where the register value is returned + * + * RETURN: Status and the value read. 
+ * + * DESCRIPTION: Read from the specified ACPI register + * + ******************************************************************************/ +acpi_status +acpi_hw_register_read(u32 register_id, u32 * return_value) +{ + u32 value1 = 0; + u32 value2 = 0; + acpi_status status; + + ACPI_FUNCTION_TRACE(hw_register_read); + + switch (register_id) { + case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ + + status = acpi_read(&value1, &acpi_gbl_FADT.xpm1a_event_block); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* PM1B is optional */ + + status = acpi_read(&value2, &acpi_gbl_FADT.xpm1b_event_block); + value1 |= value2; + break; + + case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ + + status = acpi_read(&value1, &acpi_gbl_xpm1a_enable); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* PM1B is optional */ + + status = acpi_read(&value2, &acpi_gbl_xpm1b_enable); + value1 |= value2; + break; + + case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ + + status = acpi_read(&value1, &acpi_gbl_FADT.xpm1a_control_block); + if (ACPI_FAILURE(status)) { + goto exit; + } + + status = acpi_read(&value2, &acpi_gbl_FADT.xpm1b_control_block); + value1 |= value2; + break; + + case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ + + status = acpi_read(&value1, &acpi_gbl_FADT.xpm2_control_block); + break; + + case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ + + status = acpi_read(&value1, &acpi_gbl_FADT.xpm_timer_block); + break; + + case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ + + status = + acpi_os_read_port(acpi_gbl_FADT.smi_command, &value1, 8); + break; + + default: + ACPI_ERROR((AE_INFO, "Unknown Register ID: %X", register_id)); + status = AE_BAD_PARAMETER; + break; + } + + exit: + + if (ACPI_SUCCESS(status)) { + *return_value = value1; + } + + return_ACPI_STATUS(status); +} + +/****************************************************************************** + * + * FUNCTION: acpi_hw_register_write + * + * PARAMETERS: register_id - ACPI Register ID + * Value - The value to write + * + * RETURN: Status + * + * DESCRIPTION: Write to the specified ACPI register + * + * NOTE: In accordance with the ACPI specification, this function automatically + * preserves the value of the following bits, meaning that these bits cannot be + * changed via this interface: + * + * PM1_CONTROL[0] = SCI_EN + * PM1_CONTROL[9] + * PM1_STATUS[11] + * + * ACPI References: + * 1) Hardware Ignored Bits: When software writes to a register with ignored + * bit fields, it preserves the ignored bit fields + * 2) SCI_EN: OSPM always preserves this bit position + * + ******************************************************************************/ + +acpi_status acpi_hw_register_write(u32 register_id, u32 value) +{ + acpi_status status; + u32 read_value; + + ACPI_FUNCTION_TRACE(hw_register_write); + + switch (register_id) { + case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ + + /* Perform a read first to preserve certain bits (per ACPI spec) */ + + status = acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, + &read_value); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* Insert the bits to be preserved */ + + ACPI_INSERT_BITS(value, ACPI_PM1_STATUS_PRESERVED_BITS, + read_value); + + /* Now we can write the data */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1a_event_block); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* PM1B is optional */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1b_event_block); + break; + + case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ + + status = acpi_write(value, 
&acpi_gbl_xpm1a_enable); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* PM1B is optional */ + + status = acpi_write(value, &acpi_gbl_xpm1b_enable); + break; + + case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ + + /* + * Perform a read first to preserve certain bits (per ACPI spec) + */ + status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, + &read_value); + if (ACPI_FAILURE(status)) { + goto exit; + } + + /* Insert the bits to be preserved */ + + ACPI_INSERT_BITS(value, ACPI_PM1_CONTROL_PRESERVED_BITS, + read_value); + + /* Now we can write the data */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1a_control_block); + if (ACPI_FAILURE(status)) { + goto exit; + } + + status = acpi_write(value, &acpi_gbl_FADT.xpm1b_control_block); + break; + + case ACPI_REGISTER_PM1A_CONTROL: /* 16-bit access */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1a_control_block); + break; + + case ACPI_REGISTER_PM1B_CONTROL: /* 16-bit access */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm1b_control_block); + break; + + case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm2_control_block); + break; + + case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ + + status = acpi_write(value, &acpi_gbl_FADT.xpm_timer_block); + break; + + case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ + + /* SMI_CMD is currently always in IO space */ + + status = + acpi_os_write_port(acpi_gbl_FADT.smi_command, value, 8); + break; + + default: + status = AE_BAD_PARAMETER; + break; + } + + exit: + return_ACPI_STATUS(status); +} diff --git a/drivers/acpi/hardware/hwsleep.c b/drivers/acpi/acpica/hwsleep.c index 25dccdf179b..a2af2a4f2f2 100644 --- a/drivers/acpi/hardware/hwsleep.c +++ b/drivers/acpi/acpica/hwsleep.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwsleep") @@ -52,31 +53,19 @@ ACPI_MODULE_NAME("hwsleep") * * FUNCTION: acpi_set_firmware_waking_vector * - * PARAMETERS: physical_address - Physical address of ACPI real mode + * PARAMETERS: physical_address - 32-bit physical address of ACPI real mode * entry point. * * RETURN: Status * - * DESCRIPTION: Access function for the firmware_waking_vector field in FACS + * DESCRIPTION: Sets the 32-bit firmware_waking_vector field of the FACS * ******************************************************************************/ acpi_status -acpi_set_firmware_waking_vector(acpi_physical_address physical_address) +acpi_set_firmware_waking_vector(u32 physical_address) { - struct acpi_table_facs *facs; - acpi_status status; - ACPI_FUNCTION_TRACE(acpi_set_firmware_waking_vector); - /* Get the FACS */ - - status = acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, - ACPI_CAST_INDIRECT_PTR(struct - acpi_table_header, - &facs)); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); - } /* * According to the ACPI specification 2.0c and later, the 64-bit @@ -85,10 +74,16 @@ acpi_set_firmware_waking_vector(acpi_physical_address physical_address) * Protected Mode. Some systems (for example HP dv5-1004nr) are known * to fail to resume if the 64-bit vector is used. 
*/ - if (facs->version >= 1) - facs->xfirmware_waking_vector = 0; - facs->firmware_waking_vector = (u32)physical_address; + /* Set the 32-bit vector */ + + acpi_gbl_FACS->firmware_waking_vector = physical_address; + + /* Clear the 64-bit vector if it exists */ + + if ((acpi_gbl_FACS->length > 32) && (acpi_gbl_FACS->version >= 1)) { + acpi_gbl_FACS->xfirmware_waking_vector = 0; + } return_ACPI_STATUS(AE_OK); } @@ -97,48 +92,39 @@ ACPI_EXPORT_SYMBOL(acpi_set_firmware_waking_vector) /******************************************************************************* * - * FUNCTION: acpi_get_firmware_waking_vector + * FUNCTION: acpi_set_firmware_waking_vector64 * - * PARAMETERS: *physical_address - Where the contents of - * the firmware_waking_vector field of - * the FACS will be returned. + * PARAMETERS: physical_address - 64-bit physical address of ACPI protected + * mode entry point. * - * RETURN: Status, vector + * RETURN: Status * - * DESCRIPTION: Access function for the firmware_waking_vector field in FACS + * DESCRIPTION: Sets the 64-bit X_firmware_waking_vector field of the FACS, if + * it exists in the table. * ******************************************************************************/ -#ifdef ACPI_FUTURE_USAGE acpi_status -acpi_get_firmware_waking_vector(acpi_physical_address * physical_address) +acpi_set_firmware_waking_vector64(u64 physical_address) { - struct acpi_table_facs *facs; - acpi_status status; + ACPI_FUNCTION_TRACE(acpi_set_firmware_waking_vector64); - ACPI_FUNCTION_TRACE(acpi_get_firmware_waking_vector); - - if (!physical_address) { - return_ACPI_STATUS(AE_BAD_PARAMETER); - } - /* Get the FACS */ + /* Determine if the 64-bit vector actually exists */ - status = acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, - ACPI_CAST_INDIRECT_PTR(struct - acpi_table_header, - &facs)); - if (ACPI_FAILURE(status)) { - return_ACPI_STATUS(status); + if ((acpi_gbl_FACS->length <= 32) || (acpi_gbl_FACS->version < 1)) { + return_ACPI_STATUS(AE_NOT_EXIST); } - /* Get the vector */ - *physical_address = (acpi_physical_address)facs->firmware_waking_vector; + /* Clear 32-bit vector, set the 64-bit X_ vector */ + + acpi_gbl_FACS->firmware_waking_vector = 0; + acpi_gbl_FACS->xfirmware_waking_vector = physical_address; return_ACPI_STATUS(AE_OK); } -ACPI_EXPORT_SYMBOL(acpi_get_firmware_waking_vector) -#endif +ACPI_EXPORT_SYMBOL(acpi_set_firmware_waking_vector64) + /******************************************************************************* * * FUNCTION: acpi_enter_sleep_state_prep diff --git a/drivers/acpi/hardware/hwtimer.c b/drivers/acpi/acpica/hwtimer.c index b53d575491b..b7f522c8f02 100644 --- a/drivers/acpi/hardware/hwtimer.c +++ b/drivers/acpi/acpica/hwtimer.c @@ -43,6 +43,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_HARDWARE ACPI_MODULE_NAME("hwtimer") diff --git a/drivers/acpi/hardware/hwregs.c b/drivers/acpi/acpica/hwxface.c index ddf792adcf9..ae597c0ab53 100644 --- a/drivers/acpi/hardware/hwregs.c +++ b/drivers/acpi/acpica/hwxface.c @@ -1,10 +1,9 @@ -/******************************************************************************* +/****************************************************************************** * - * Module Name: hwregs - Read/write access functions for the various ACPI - * control and status registers. 
+ * Module Name: hwxface - Public ACPICA hardware interfaces * - ******************************************************************************/ + *****************************************************************************/ /* * Copyright (C) 2000 - 2008, Intel Corp. @@ -44,209 +43,208 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_HARDWARE -ACPI_MODULE_NAME("hwregs") +ACPI_MODULE_NAME("hwxface") -/******************************************************************************* +/****************************************************************************** * - * FUNCTION: acpi_hw_clear_acpi_status + * FUNCTION: acpi_reset * * PARAMETERS: None * - * RETURN: None + * RETURN: Status * - * DESCRIPTION: Clears all fixed and general purpose status bits - * THIS FUNCTION MUST BE CALLED WITH INTERRUPTS DISABLED + * DESCRIPTION: Set reset register in memory or IO space. Note: Does not + * support reset register in PCI config space, this must be + * handled separately. * ******************************************************************************/ -acpi_status acpi_hw_clear_acpi_status(void) +acpi_status acpi_reset(void) { + struct acpi_generic_address *reset_reg; acpi_status status; - acpi_cpu_flags lock_flags = 0; - ACPI_FUNCTION_TRACE(hw_clear_acpi_status); + ACPI_FUNCTION_TRACE(acpi_reset); - ACPI_DEBUG_PRINT((ACPI_DB_IO, "About to write %04X to %04X\n", - ACPI_BITMASK_ALL_FIXED_STATUS, - (u16) acpi_gbl_FADT.xpm1a_event_block.address)); + reset_reg = &acpi_gbl_FADT.reset_register; - lock_flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); + /* Check if the reset register is supported */ - status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, - ACPI_BITMASK_ALL_FIXED_STATUS); - if (ACPI_FAILURE(status)) { - goto unlock_and_exit; + if (!(acpi_gbl_FADT.flags & ACPI_FADT_RESET_REGISTER) || + !reset_reg->address) { + return_ACPI_STATUS(AE_NOT_EXIST); } - /* Clear the fixed events */ - - if (acpi_gbl_FADT.xpm1b_event_block.address) { - status = - acpi_hw_low_level_write(16, ACPI_BITMASK_ALL_FIXED_STATUS, - &acpi_gbl_FADT.xpm1b_event_block); - if (ACPI_FAILURE(status)) { - goto unlock_and_exit; - } - } - - /* Clear the GPE Bits in all GPE registers in all GPE blocks */ - - status = acpi_ev_walk_gpe_list(acpi_hw_clear_gpe_block); + /* Write the reset value to the reset register */ - unlock_and_exit: - acpi_os_release_lock(acpi_gbl_hardware_lock, lock_flags); + status = acpi_write(acpi_gbl_FADT.reset_value, reset_reg); return_ACPI_STATUS(status); } -/******************************************************************************* +ACPI_EXPORT_SYMBOL(acpi_reset) + +/****************************************************************************** * - * FUNCTION: acpi_get_sleep_type_data + * FUNCTION: acpi_read * - * PARAMETERS: sleep_state - Numeric sleep state - * *sleep_type_a - Where SLP_TYPa is returned - * *sleep_type_b - Where SLP_TYPb is returned + * PARAMETERS: Value - Where the value is returned + * Reg - GAS register structure * - * RETURN: Status - ACPI status + * RETURN: Status * - * DESCRIPTION: Obtain the SLP_TYPa and SLP_TYPb values for the requested sleep - * state. + * DESCRIPTION: Read from either memory or IO space. 
* ******************************************************************************/ - -acpi_status -acpi_get_sleep_type_data(u8 sleep_state, u8 * sleep_type_a, u8 * sleep_type_b) +acpi_status acpi_read(u32 *value, struct acpi_generic_address *reg) { - acpi_status status = AE_OK; - struct acpi_evaluate_info *info; - - ACPI_FUNCTION_TRACE(acpi_get_sleep_type_data); - - /* Validate parameters */ - - if ((sleep_state > ACPI_S_STATES_MAX) || !sleep_type_a || !sleep_type_b) { - return_ACPI_STATUS(AE_BAD_PARAMETER); - } + u32 width; + u64 address; + acpi_status status; - /* Allocate the evaluation information block */ + ACPI_FUNCTION_NAME(acpi_read); - info = ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_evaluate_info)); - if (!info) { - return_ACPI_STATUS(AE_NO_MEMORY); + /* + * Must have a valid pointer to a GAS structure, and + * a non-zero address within. However, don't return an error + * because the PM1A/B code must not fail if B isn't present. + */ + if (!reg) { + return (AE_OK); } - info->pathname = - ACPI_CAST_PTR(char, acpi_gbl_sleep_state_names[sleep_state]); - - /* Evaluate the namespace object containing the values for this state */ - - status = acpi_ns_evaluate(info); - if (ACPI_FAILURE(status)) { - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "%s while evaluating SleepState [%s]\n", - acpi_format_exception(status), - info->pathname)); + /* Get a local copy of the address. Handles possible alignment issues */ - goto cleanup; + ACPI_MOVE_64_TO_64(&address, ®->address); + if (!address) { + return (AE_OK); } - /* Must have a return object */ + /* Supported widths are 8/16/32 */ - if (!info->return_object) { - ACPI_ERROR((AE_INFO, "No Sleep State object returned from [%s]", - info->pathname)); - status = AE_NOT_EXIST; + width = reg->bit_width; + if ((width != 8) && (width != 16) && (width != 32)) { + return (AE_SUPPORT); } - /* It must be of type Package */ + /* Initialize entire 32-bit return value to zero */ - else if (ACPI_GET_OBJECT_TYPE(info->return_object) != ACPI_TYPE_PACKAGE) { - ACPI_ERROR((AE_INFO, - "Sleep State return object is not a Package")); - status = AE_AML_OPERAND_TYPE; - } + *value = 0; /* - * The package must have at least two elements. NOTE (March 2005): This - * goes against the current ACPI spec which defines this object as a - * package with one encoded DWORD element. However, existing practice - * by BIOS vendors seems to be to have 2 or more elements, at least - * one per sleep type (A/B). + * Two address spaces supported: Memory or IO. 
+ * PCI_Config is not supported here because the GAS struct is insufficient */ - else if (info->return_object->package.count < 2) { - ACPI_ERROR((AE_INFO, - "Sleep State return package does not have at least two elements")); - status = AE_AML_NO_OPERAND; - } + switch (reg->space_id) { + case ACPI_ADR_SPACE_SYSTEM_MEMORY: - /* The first two elements must both be of type Integer */ + status = acpi_os_read_memory((acpi_physical_address) address, + value, width); + break; - else if ((ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[0]) - != ACPI_TYPE_INTEGER) || - (ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[1]) - != ACPI_TYPE_INTEGER)) { - ACPI_ERROR((AE_INFO, - "Sleep State return package elements are not both Integers (%s, %s)", - acpi_ut_get_object_type_name(info->return_object-> - package.elements[0]), - acpi_ut_get_object_type_name(info->return_object-> - package.elements[1]))); - status = AE_AML_OPERAND_TYPE; - } else { - /* Valid _Sx_ package size, type, and value */ + case ACPI_ADR_SPACE_SYSTEM_IO: - *sleep_type_a = (u8) - (info->return_object->package.elements[0])->integer.value; - *sleep_type_b = (u8) - (info->return_object->package.elements[1])->integer.value; - } + status = + acpi_os_read_port((acpi_io_address) address, value, width); + break; - if (ACPI_FAILURE(status)) { - ACPI_EXCEPTION((AE_INFO, status, - "While evaluating SleepState [%s], bad Sleep object %p type %s", - info->pathname, info->return_object, - acpi_ut_get_object_type_name(info-> - return_object))); + default: + ACPI_ERROR((AE_INFO, + "Unsupported address space: %X", reg->space_id)); + return (AE_BAD_PARAMETER); } - acpi_ut_remove_reference(info->return_object); + ACPI_DEBUG_PRINT((ACPI_DB_IO, + "Read: %8.8X width %2d from %8.8X%8.8X (%s)\n", + *value, width, ACPI_FORMAT_UINT64(address), + acpi_ut_get_region_name(reg->space_id))); - cleanup: - ACPI_FREE(info); - return_ACPI_STATUS(status); + return (status); } -ACPI_EXPORT_SYMBOL(acpi_get_sleep_type_data) +ACPI_EXPORT_SYMBOL(acpi_read) -/******************************************************************************* +/****************************************************************************** * - * FUNCTION: acpi_hw_get_register_bit_mask + * FUNCTION: acpi_write * - * PARAMETERS: register_id - Index of ACPI Register to access + * PARAMETERS: Value - To be written + * Reg - GAS register structure * - * RETURN: The bitmask to be used when accessing the register + * RETURN: Status * - * DESCRIPTION: Map register_id into a register bitmask. + * DESCRIPTION: Write to either memory or IO space. * ******************************************************************************/ -struct acpi_bit_register_info *acpi_hw_get_bit_register_info(u32 register_id) +acpi_status acpi_write(u32 value, struct acpi_generic_address *reg) { - ACPI_FUNCTION_ENTRY(); + u32 width; + u64 address; + acpi_status status; - if (register_id > ACPI_BITREG_MAX) { - ACPI_ERROR((AE_INFO, "Invalid BitRegister ID: %X", - register_id)); - return (NULL); + ACPI_FUNCTION_NAME(acpi_write); + + /* + * Must have a valid pointer to a GAS structure, and + * a non-zero address within. However, don't return an error + * because the PM1A/B code must not fail if B isn't present. + */ + if (!reg) { + return (AE_OK); } - return (&acpi_gbl_bit_register_info[register_id]); + /* Get a local copy of the address. 
Handles possible alignment issues */ + + ACPI_MOVE_64_TO_64(&address, ®->address); + if (!address) { + return (AE_OK); + } + + /* Supported widths are 8/16/32 */ + + width = reg->bit_width; + if ((width != 8) && (width != 16) && (width != 32)) { + return (AE_SUPPORT); + } + + /* + * Two address spaces supported: Memory or IO. + * PCI_Config is not supported here because the GAS struct is insufficient + */ + switch (reg->space_id) { + case ACPI_ADR_SPACE_SYSTEM_MEMORY: + + status = acpi_os_write_memory((acpi_physical_address) address, + value, width); + break; + + case ACPI_ADR_SPACE_SYSTEM_IO: + + status = acpi_os_write_port((acpi_io_address) address, value, + width); + break; + + default: + ACPI_ERROR((AE_INFO, + "Unsupported address space: %X", reg->space_id)); + return (AE_BAD_PARAMETER); + } + + ACPI_DEBUG_PRINT((ACPI_DB_IO, + "Wrote: %8.8X width %2d to %8.8X%8.8X (%s)\n", + value, width, ACPI_FORMAT_UINT64(address), + acpi_ut_get_region_name(reg->space_id))); + + return (status); } +ACPI_EXPORT_SYMBOL(acpi_write) + /******************************************************************************* * - * FUNCTION: acpi_get_register + * FUNCTION: acpi_get_register_unlocked * * PARAMETERS: register_id - ID of ACPI bit_register to access * return_value - Value that was read from the register @@ -254,17 +252,16 @@ struct acpi_bit_register_info *acpi_hw_get_bit_register_info(u32 register_id) * RETURN: Status and the value read from specified Register. Value * returned is normalized to bit0 (is shifted all the way right) * - * DESCRIPTION: ACPI bit_register read function. + * DESCRIPTION: ACPI bit_register read function. Does not acquire the HW lock. * ******************************************************************************/ - -acpi_status acpi_get_register_unlocked(u32 register_id, u32 * return_value) +acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value) { u32 register_value = 0; struct acpi_bit_register_info *bit_reg_info; acpi_status status; - ACPI_FUNCTION_TRACE(acpi_get_register); + ACPI_FUNCTION_TRACE(acpi_get_register_unlocked); /* Get the info structure corresponding to the requested ACPI Register */ @@ -296,14 +293,31 @@ acpi_status acpi_get_register_unlocked(u32 register_id, u32 * return_value) return_ACPI_STATUS(status); } -acpi_status acpi_get_register(u32 register_id, u32 * return_value) +ACPI_EXPORT_SYMBOL(acpi_get_register_unlocked) + +/******************************************************************************* + * + * FUNCTION: acpi_get_register + * + * PARAMETERS: register_id - ID of ACPI bit_register to access + * return_value - Value that was read from the register + * + * RETURN: Status and the value read from specified Register. Value + * returned is normalized to bit0 (is shifted all the way right) + * + * DESCRIPTION: ACPI bit_register read function. 
+ * + ******************************************************************************/ +acpi_status acpi_get_register(u32 register_id, u32 *return_value) { acpi_status status; acpi_cpu_flags flags; + flags = acpi_os_acquire_lock(acpi_gbl_hardware_lock); status = acpi_get_register_unlocked(register_id, return_value); acpi_os_release_lock(acpi_gbl_hardware_lock, flags); - return status; + + return (status); } ACPI_EXPORT_SYMBOL(acpi_get_register) @@ -370,8 +384,9 @@ acpi_status acpi_set_register(u32 register_id, u32 value) bit_reg_info-> access_bit_mask); if (value) { - status = acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, - (u16) value); + status = + acpi_hw_register_write(ACPI_REGISTER_PM1_STATUS, + (u16) value); register_value = 0; } break; @@ -459,399 +474,120 @@ acpi_status acpi_set_register(u32 register_id, u32 value) ACPI_EXPORT_SYMBOL(acpi_set_register) -/****************************************************************************** +/******************************************************************************* * - * FUNCTION: acpi_hw_register_read + * FUNCTION: acpi_get_sleep_type_data * - * PARAMETERS: register_id - ACPI Register ID - * return_value - Where the register value is returned + * PARAMETERS: sleep_state - Numeric sleep state + * *sleep_type_a - Where SLP_TYPa is returned + * *sleep_type_b - Where SLP_TYPb is returned * - * RETURN: Status and the value read. + * RETURN: Status - ACPI status * - * DESCRIPTION: Read from the specified ACPI register + * DESCRIPTION: Obtain the SLP_TYPa and SLP_TYPb values for the requested sleep + * state. * ******************************************************************************/ acpi_status -acpi_hw_register_read(u32 register_id, u32 * return_value) +acpi_get_sleep_type_data(u8 sleep_state, u8 *sleep_type_a, u8 *sleep_type_b) { - u32 value1 = 0; - u32 value2 = 0; - acpi_status status; - - ACPI_FUNCTION_TRACE(hw_register_read); - - switch (register_id) { - case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ - - status = - acpi_hw_low_level_read(16, &value1, - &acpi_gbl_FADT.xpm1a_event_block); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = - acpi_hw_low_level_read(16, &value2, - &acpi_gbl_FADT.xpm1b_event_block); - value1 |= value2; - break; - - case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ - - status = - acpi_hw_low_level_read(16, &value1, &acpi_gbl_xpm1a_enable); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = - acpi_hw_low_level_read(16, &value2, &acpi_gbl_xpm1b_enable); - value1 |= value2; - break; - - case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ - - status = - acpi_hw_low_level_read(16, &value1, - &acpi_gbl_FADT.xpm1a_control_block); - if (ACPI_FAILURE(status)) { - goto exit; - } - - status = - acpi_hw_low_level_read(16, &value2, - &acpi_gbl_FADT.xpm1b_control_block); - value1 |= value2; - break; - - case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ - - status = - acpi_hw_low_level_read(8, &value1, - &acpi_gbl_FADT.xpm2_control_block); - break; - - case ACPI_REGISTER_PM_TIMER: /* 32-bit access */ - - status = - acpi_hw_low_level_read(32, &value1, - &acpi_gbl_FADT.xpm_timer_block); - break; + acpi_status status = AE_OK; + struct acpi_evaluate_info *info; - case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ + ACPI_FUNCTION_TRACE(acpi_get_sleep_type_data); - status = - acpi_os_read_port(acpi_gbl_FADT.smi_command, &value1, 8); - break; + /* Validate parameters */ - default: - ACPI_ERROR((AE_INFO, "Unknown Register ID: %X", 
register_id)); - status = AE_BAD_PARAMETER; - break; + if ((sleep_state > ACPI_S_STATES_MAX) || !sleep_type_a || !sleep_type_b) { + return_ACPI_STATUS(AE_BAD_PARAMETER); } - exit: + /* Allocate the evaluation information block */ - if (ACPI_SUCCESS(status)) { - *return_value = value1; + info = ACPI_ALLOCATE_ZEROED(sizeof(struct acpi_evaluate_info)); + if (!info) { + return_ACPI_STATUS(AE_NO_MEMORY); } - return_ACPI_STATUS(status); -} - -/****************************************************************************** - * - * FUNCTION: acpi_hw_register_write - * - * PARAMETERS: register_id - ACPI Register ID - * Value - The value to write - * - * RETURN: Status - * - * DESCRIPTION: Write to the specified ACPI register - * - * NOTE: In accordance with the ACPI specification, this function automatically - * preserves the value of the following bits, meaning that these bits cannot be - * changed via this interface: - * - * PM1_CONTROL[0] = SCI_EN - * PM1_CONTROL[9] - * PM1_STATUS[11] - * - * ACPI References: - * 1) Hardware Ignored Bits: When software writes to a register with ignored - * bit fields, it preserves the ignored bit fields - * 2) SCI_EN: OSPM always preserves this bit position - * - ******************************************************************************/ - -acpi_status acpi_hw_register_write(u32 register_id, u32 value) -{ - acpi_status status; - u32 read_value; - - ACPI_FUNCTION_TRACE(hw_register_write); - - switch (register_id) { - case ACPI_REGISTER_PM1_STATUS: /* 16-bit access */ - - /* Perform a read first to preserve certain bits (per ACPI spec) */ - - status = acpi_hw_register_read(ACPI_REGISTER_PM1_STATUS, - &read_value); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* Insert the bits to be preserved */ - - ACPI_INSERT_BITS(value, ACPI_PM1_STATUS_PRESERVED_BITS, - read_value); - - /* Now we can write the data */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1a_event_block); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1b_event_block); - break; - - case ACPI_REGISTER_PM1_ENABLE: /* 16-bit access */ - - status = - acpi_hw_low_level_write(16, value, &acpi_gbl_xpm1a_enable); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* PM1B is optional */ - - status = - acpi_hw_low_level_write(16, value, &acpi_gbl_xpm1b_enable); - break; - - case ACPI_REGISTER_PM1_CONTROL: /* 16-bit access */ - - /* - * Perform a read first to preserve certain bits (per ACPI spec) - */ - status = acpi_hw_register_read(ACPI_REGISTER_PM1_CONTROL, - &read_value); - if (ACPI_FAILURE(status)) { - goto exit; - } - - /* Insert the bits to be preserved */ - - ACPI_INSERT_BITS(value, ACPI_PM1_CONTROL_PRESERVED_BITS, - read_value); - - /* Now we can write the data */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1a_control_block); - if (ACPI_FAILURE(status)) { - goto exit; - } - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1b_control_block); - break; - - case ACPI_REGISTER_PM1A_CONTROL: /* 16-bit access */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1a_control_block); - break; - - case ACPI_REGISTER_PM1B_CONTROL: /* 16-bit access */ - - status = - acpi_hw_low_level_write(16, value, - &acpi_gbl_FADT.xpm1b_control_block); - break; - - case ACPI_REGISTER_PM2_CONTROL: /* 8-bit access */ - - status = - acpi_hw_low_level_write(8, value, - &acpi_gbl_FADT.xpm2_control_block); - break; - - case ACPI_REGISTER_PM_TIMER: 
/* 32-bit access */ - - status = - acpi_hw_low_level_write(32, value, - &acpi_gbl_FADT.xpm_timer_block); - break; - - case ACPI_REGISTER_SMI_COMMAND_BLOCK: /* 8-bit access */ + info->pathname = + ACPI_CAST_PTR(char, acpi_gbl_sleep_state_names[sleep_state]); - /* SMI_CMD is currently always in IO space */ + /* Evaluate the namespace object containing the values for this state */ - status = - acpi_os_write_port(acpi_gbl_FADT.smi_command, value, 8); - break; + status = acpi_ns_evaluate(info); + if (ACPI_FAILURE(status)) { + ACPI_DEBUG_PRINT((ACPI_DB_EXEC, + "%s while evaluating SleepState [%s]\n", + acpi_format_exception(status), + info->pathname)); - default: - status = AE_BAD_PARAMETER; - break; + goto cleanup; } - exit: - return_ACPI_STATUS(status); -} - -/****************************************************************************** - * - * FUNCTION: acpi_hw_low_level_read - * - * PARAMETERS: Width - 8, 16, or 32 - * Value - Where the value is returned - * Reg - GAS register structure - * - * RETURN: Status - * - * DESCRIPTION: Read from either memory or IO space. - * - ******************************************************************************/ - -acpi_status -acpi_hw_low_level_read(u32 width, u32 * value, struct acpi_generic_address *reg) -{ - u64 address; - acpi_status status; - - ACPI_FUNCTION_NAME(hw_low_level_read); + /* Must have a return object */ - /* - * Must have a valid pointer to a GAS structure, and - * a non-zero address within. However, don't return an error - * because the PM1A/B code must not fail if B isn't present. - */ - if (!reg) { - return (AE_OK); + if (!info->return_object) { + ACPI_ERROR((AE_INFO, "No Sleep State object returned from [%s]", + info->pathname)); + status = AE_NOT_EXIST; } - /* Get a local copy of the address. Handles possible alignment issues */ + /* It must be of type Package */ - ACPI_MOVE_64_TO_64(&address, ®->address); - if (!address) { - return (AE_OK); + else if (ACPI_GET_OBJECT_TYPE(info->return_object) != ACPI_TYPE_PACKAGE) { + ACPI_ERROR((AE_INFO, + "Sleep State return object is not a Package")); + status = AE_AML_OPERAND_TYPE; } - *value = 0; /* - * Two address spaces supported: Memory or IO. - * PCI_Config is not supported here because the GAS struct is insufficient + * The package must have at least two elements. NOTE (March 2005): This + * goes against the current ACPI spec which defines this object as a + * package with one encoded DWORD element. However, existing practice + * by BIOS vendors seems to be to have 2 or more elements, at least + * one per sleep type (A/B). 
*/ - switch (reg->space_id) { - case ACPI_ADR_SPACE_SYSTEM_MEMORY: - - status = acpi_os_read_memory((acpi_physical_address) address, - value, width); - break; - - case ACPI_ADR_SPACE_SYSTEM_IO: - - status = - acpi_os_read_port((acpi_io_address) address, value, width); - break; - - default: + else if (info->return_object->package.count < 2) { ACPI_ERROR((AE_INFO, - "Unsupported address space: %X", reg->space_id)); - return (AE_BAD_PARAMETER); + "Sleep State return package does not have at least two elements")); + status = AE_AML_NO_OPERAND; } - ACPI_DEBUG_PRINT((ACPI_DB_IO, - "Read: %8.8X width %2d from %8.8X%8.8X (%s)\n", - *value, width, ACPI_FORMAT_UINT64(address), - acpi_ut_get_region_name(reg->space_id))); - - return (status); -} - -/****************************************************************************** - * - * FUNCTION: acpi_hw_low_level_write - * - * PARAMETERS: Width - 8, 16, or 32 - * Value - To be written - * Reg - GAS register structure - * - * RETURN: Status - * - * DESCRIPTION: Write to either memory or IO space. - * - ******************************************************************************/ - -acpi_status -acpi_hw_low_level_write(u32 width, u32 value, struct acpi_generic_address * reg) -{ - u64 address; - acpi_status status; - - ACPI_FUNCTION_NAME(hw_low_level_write); - - /* - * Must have a valid pointer to a GAS structure, and - * a non-zero address within. However, don't return an error - * because the PM1A/B code must not fail if B isn't present. - */ - if (!reg) { - return (AE_OK); - } + /* The first two elements must both be of type Integer */ - /* Get a local copy of the address. Handles possible alignment issues */ + else if ((ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[0]) + != ACPI_TYPE_INTEGER) || + (ACPI_GET_OBJECT_TYPE(info->return_object->package.elements[1]) + != ACPI_TYPE_INTEGER)) { + ACPI_ERROR((AE_INFO, + "Sleep State return package elements are not both Integers (%s, %s)", + acpi_ut_get_object_type_name(info->return_object-> + package.elements[0]), + acpi_ut_get_object_type_name(info->return_object-> + package.elements[1]))); + status = AE_AML_OPERAND_TYPE; + } else { + /* Valid _Sx_ package size, type, and value */ - ACPI_MOVE_64_TO_64(&address, ®->address); - if (!address) { - return (AE_OK); + *sleep_type_a = (u8) + (info->return_object->package.elements[0])->integer.value; + *sleep_type_b = (u8) + (info->return_object->package.elements[1])->integer.value; } - /* - * Two address spaces supported: Memory or IO. 
- * PCI_Config is not supported here because the GAS struct is insufficient - */ - switch (reg->space_id) { - case ACPI_ADR_SPACE_SYSTEM_MEMORY: - - status = acpi_os_write_memory((acpi_physical_address) address, - value, width); - break; - - case ACPI_ADR_SPACE_SYSTEM_IO: - - status = acpi_os_write_port((acpi_io_address) address, value, - width); - break; - - default: - ACPI_ERROR((AE_INFO, - "Unsupported address space: %X", reg->space_id)); - return (AE_BAD_PARAMETER); + if (ACPI_FAILURE(status)) { + ACPI_EXCEPTION((AE_INFO, status, + "While evaluating SleepState [%s], bad Sleep object %p type %s", + info->pathname, info->return_object, + acpi_ut_get_object_type_name(info-> + return_object))); } - ACPI_DEBUG_PRINT((ACPI_DB_IO, - "Wrote: %8.8X width %2d to %8.8X%8.8X (%s)\n", - value, width, ACPI_FORMAT_UINT64(address), - acpi_ut_get_region_name(reg->space_id))); + acpi_ut_remove_reference(info->return_object); - return (status); + cleanup: + ACPI_FREE(info); + return_ACPI_STATUS(status); } + +ACPI_EXPORT_SYMBOL(acpi_get_sleep_type_data) diff --git a/drivers/acpi/namespace/nsaccess.c b/drivers/acpi/acpica/nsaccess.c index c39a7f68b88..88303ebe924 100644 --- a/drivers/acpi/namespace/nsaccess.c +++ b/drivers/acpi/acpica/nsaccess.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "amlcode.h" +#include "acnamesp.h" +#include "acdispat.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsaccess") @@ -165,12 +166,9 @@ acpi_status acpi_ns_root_initialize(void) obj_desc->method.method_flags = AML_METHOD_INTERNAL_ONLY; - -#ifndef ACPI_DUMP_APP obj_desc->method.implementation = acpi_ut_osi_implementation; #endif -#endif break; case ACPI_TYPE_INTEGER: @@ -521,11 +519,11 @@ acpi_ns_lookup(union acpi_generic_state *scope_info, } /* - * Search namespace for each segment of the name. Loop through and + * Search namespace for each segment of the name. Loop through and * verify (or add to the namespace) each name segment. * * The object type is significant only at the last name - * segment. (We don't care about the types along the path, only + * segment. (We don't care about the types along the path, only * the type of the final target object.) */ this_search_type = ACPI_TYPE_ANY; @@ -591,6 +589,10 @@ acpi_ns_lookup(union acpi_generic_state *scope_info, * segments). 
*/ if (this_node->type == ACPI_TYPE_LOCAL_ALIAS) { + if (!this_node->object) { + return_ACPI_STATUS(AE_NOT_EXIST); + } + if (acpi_ns_opens_scope (((struct acpi_namespace_node *)this_node-> object)->type)) { diff --git a/drivers/acpi/namespace/nsalloc.c b/drivers/acpi/acpica/nsalloc.c index 3a1740ac2ed..f976d848fe8 100644 --- a/drivers/acpi/namespace/nsalloc.c +++ b/drivers/acpi/acpica/nsalloc.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsalloc") diff --git a/drivers/acpi/namespace/nsdump.c b/drivers/acpi/acpica/nsdump.c index cc0ae39440e..0da33c8e9ba 100644 --- a/drivers/acpi/namespace/nsdump.c +++ b/drivers/acpi/acpica/nsdump.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsdump") diff --git a/drivers/acpi/namespace/nsdumpdv.c b/drivers/acpi/acpica/nsdumpdv.c index 428f50fde11..41994fe7fbb 100644 --- a/drivers/acpi/namespace/nsdumpdv.c +++ b/drivers/acpi/acpica/nsdumpdv.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" /* TBD: This entire module is apparently obsolete and should be removed */ @@ -49,7 +50,7 @@ ACPI_MODULE_NAME("nsdumpdv") #ifdef ACPI_OBSOLETE_FUNCTIONS #if defined(ACPI_DEBUG_OUTPUT) || defined(ACPI_DEBUGGER) -#include <acpi/acnamesp.h> +#include "acnamesp.h" /******************************************************************************* * * FUNCTION: acpi_ns_dump_one_device diff --git a/drivers/acpi/namespace/nseval.c b/drivers/acpi/acpica/nseval.c index 4cdf03ac2b4..0f3d5f9b596 100644 --- a/drivers/acpi/namespace/nseval.c +++ b/drivers/acpi/acpica/nseval.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acparser.h" +#include "acinterp.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nseval") @@ -89,6 +90,7 @@ acpi_status acpi_ns_evaluate(struct acpi_evaluate_info * info) /* Initialize the return value to an invalid object */ info->return_object = NULL; + info->param_count = 0; /* * Get the actual namespace node for the target object. Handles these cases: @@ -141,41 +143,17 @@ acpi_status acpi_ns_evaluate(struct acpi_evaluate_info * info) return_ACPI_STATUS(AE_NULL_OBJECT); } - /* - * Calculate the number of arguments being passed to the method - */ + /* Count the number of arguments being passed to the method */ - info->param_count = 0; if (info->parameters) { - while (info->parameters[info->param_count]) + while (info->parameters[info->param_count]) { + if (info->param_count > ACPI_METHOD_MAX_ARG) { + return_ACPI_STATUS(AE_LIMIT); + } info->param_count++; + } } - /* - * Warning if too few or too many arguments have been passed by the - * caller. We don't want to abort here with an error because an - * incorrect number of arguments may not cause the method to fail. - * However, the method will fail if there are too few arguments passed - * and the method attempts to use one of the missing ones. 
- */ - - if (info->param_count < info->obj_desc->method.param_count) { - ACPI_WARNING((AE_INFO, - "Insufficient arguments - " - "method [%4.4s] needs %d, found %d", - acpi_ut_get_node_name(info->resolved_node), - info->obj_desc->method.param_count, - info->param_count)); - } else if (info->param_count > - info->obj_desc->method.param_count) { - ACPI_WARNING((AE_INFO, - "Excess arguments - " - "method [%4.4s] needs %d, found %d", - acpi_ut_get_node_name(info-> - resolved_node), - info->obj_desc->method.param_count, - info->param_count)); - } ACPI_DUMP_PATHNAME(info->resolved_node, "Execute Method:", ACPI_LV_INFO, _COMPONENT); @@ -264,32 +242,13 @@ acpi_status acpi_ns_evaluate(struct acpi_evaluate_info * info) } } - /* Validation of return values for ACPI-predefined methods and objects */ - - if ((status == AE_OK) || (status == AE_CTRL_RETURN_VALUE)) { - /* - * If this is the first evaluation, check the return value. This - * ensures that any warnings will only be emitted during the very - * first evaluation of the object. - */ - if (!(node->flags & ANOBJ_EVALUATED)) { - /* - * Check for a predefined ACPI name. If found, validate the - * returned object. - * - * Note: Ignore return status for now, emit warnings if there are - * problems with the returned object. May change later to abort - * the method on invalid return object. - */ - (void)acpi_ns_check_predefined_names(node, - info-> - return_object); - } - - /* Mark the node as having been evaluated */ - - node->flags |= ANOBJ_EVALUATED; - } + /* + * Check input argument count against the ASL-defined count for a method. + * Also check predefined names: argument count and return value against + * the ACPI specification. Some incorrect return value types are repaired. + */ + (void)acpi_ns_check_predefined_names(node, info->param_count, + status, &info->return_object); /* Check if there is a return value that must be dealt with */ diff --git a/drivers/acpi/namespace/nsinit.c b/drivers/acpi/acpica/nsinit.c index e4c57510d79..13501cb8186 100644 --- a/drivers/acpi/namespace/nsinit.c +++ b/drivers/acpi/acpica/nsinit.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acdispat.h" +#include "acinterp.h" #include <linux/nmi.h> #define _COMPONENT ACPI_NAMESPACE diff --git a/drivers/acpi/namespace/nsload.c b/drivers/acpi/acpica/nsload.c index a4a412b7c02..a0ba9e12379 100644 --- a/drivers/acpi/namespace/nsload.c +++ b/drivers/acpi/acpica/nsload.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acdispat.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acdispat.h" +#include "actables.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsload") diff --git a/drivers/acpi/namespace/nsnames.c b/drivers/acpi/acpica/nsnames.c index 42a39a7c96e..ae3dc10a7e8 100644 --- a/drivers/acpi/namespace/nsnames.c +++ b/drivers/acpi/acpica/nsnames.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "amlcode.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsnames") diff --git a/drivers/acpi/namespace/nsobject.c b/drivers/acpi/acpica/nsobject.c index 15fe09e24f7..08a97a57f8f 100644 --- a/drivers/acpi/namespace/nsobject.c +++ b/drivers/acpi/acpica/nsobject.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include 
"accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsobject") diff --git a/drivers/acpi/namespace/nsparse.c b/drivers/acpi/acpica/nsparse.c index a82271a9dbb..b9e8d0070b6 100644 --- a/drivers/acpi/namespace/nsparse.c +++ b/drivers/acpi/acpica/nsparse.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acparser.h" +#include "acdispat.h" +#include "actables.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsparse") diff --git a/drivers/acpi/namespace/nspredef.c b/drivers/acpi/acpica/nspredef.c index 0f17cf0898c..452703290d3 100644 --- a/drivers/acpi/namespace/nspredef.c +++ b/drivers/acpi/acpica/nspredef.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acpredef.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acpredef.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nspredef") @@ -72,7 +73,7 @@ ACPI_MODULE_NAME("nspredef") /* Local prototypes */ static acpi_status acpi_ns_check_package(char *pathname, - union acpi_operand_object *return_object, + union acpi_operand_object **return_object_ptr, const union acpi_predefined_info *predefined); static acpi_status @@ -82,13 +83,18 @@ acpi_ns_check_package_elements(char *pathname, static acpi_status acpi_ns_check_object_type(char *pathname, - union acpi_operand_object *return_object, + union acpi_operand_object **return_object_ptr, u32 expected_btypes, u32 package_index); static acpi_status acpi_ns_check_reference(char *pathname, union acpi_operand_object *return_object); +static acpi_status +acpi_ns_repair_object(u32 expected_btypes, + u32 package_index, + union acpi_operand_object **return_object_ptr); + /* * Names for the types that can be returned by the predefined objects. * Used for warning messages. Must be in the same order as the ACPI_RTYPEs @@ -108,8 +114,8 @@ static const char *acpi_rtype_names[] = { * FUNCTION: acpi_ns_check_predefined_names * * PARAMETERS: Node - Namespace node for the method/object - * return_object - Object returned from the evaluation of this - * method/object + * return_object_ptr - Pointer to the object returned from the + * evaluation of a method or object * * RETURN: Status * @@ -119,8 +125,11 @@ static const char *acpi_rtype_names[] = { acpi_status acpi_ns_check_predefined_names(struct acpi_namespace_node *node, - union acpi_operand_object *return_object) + u32 user_param_count, + acpi_status return_status, + union acpi_operand_object **return_object_ptr) { + union acpi_operand_object *return_object = *return_object_ptr; acpi_status status = AE_OK; const union acpi_predefined_info *predefined; char *pathname; @@ -128,12 +137,6 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, /* Match the name for this method/object against the predefined list */ predefined = acpi_ns_check_for_predefined_name(node); - if (!predefined) { - - /* Name was not one of the predefined names */ - - return (AE_OK); - } /* Get the full pathname to the object, for use in error messages */ @@ -143,10 +146,37 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, } /* - * Check that the parameter count for this method is in accordance - * with the ACPI specification. + * Check that the parameter count for this method matches the ASL + * definition. 
For predefined names, ensure that both the caller and + * the method itself are in accordance with the ACPI specification. */ - acpi_ns_check_parameter_count(pathname, node, predefined); + acpi_ns_check_parameter_count(pathname, node, user_param_count, + predefined); + + /* If not a predefined name, we cannot validate the return object */ + + if (!predefined) { + goto exit; + } + + /* If the method failed, we cannot validate the return object */ + + if ((return_status != AE_OK) && (return_status != AE_CTRL_RETURN_VALUE)) { + goto exit; + } + + /* + * Only validate the return value on the first successful evaluation of + * the method. This ensures that any warnings will only be emitted during + * the very first evaluation of the method/object. + */ + if (node->flags & ANOBJ_EVALUATED) { + goto exit; + } + + /* Mark the node as having been successfully evaluated */ + + node->flags |= ANOBJ_EVALUATED; /* * If there is no return value, check if we require a return value for @@ -171,7 +201,7 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, * We have a return value, but if one wasn't expected, just exit, this is * not a problem * - * For example, if "Implicit return value" is enabled, methods will + * For example, if the "Implicit Return" feature is enabled, methods will * always return a value */ if (!predefined->info.expected_btypes) { @@ -182,7 +212,7 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, * Check that the type of the return object is what is expected for * this predefined name */ - status = acpi_ns_check_object_type(pathname, return_object, + status = acpi_ns_check_object_type(pathname, return_object_ptr, predefined->info.expected_btypes, ACPI_NOT_PACKAGE); if (ACPI_FAILURE(status)) { @@ -193,11 +223,12 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, if (ACPI_GET_OBJECT_TYPE(return_object) == ACPI_TYPE_PACKAGE) { status = - acpi_ns_check_package(pathname, return_object, predefined); + acpi_ns_check_package(pathname, return_object_ptr, + predefined); } exit: - if (pathname) { + if (pathname != predefined->info.name) { ACPI_FREE(pathname); } @@ -210,6 +241,7 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, * * PARAMETERS: Pathname - Full pathname to the node (for error msgs) * Node - Namespace node for the method/object + * user_param_count - Number of args passed in by the caller * Predefined - Pointer to entry in predefined name table * * RETURN: None @@ -223,32 +255,76 @@ acpi_ns_check_predefined_names(struct acpi_namespace_node *node, void acpi_ns_check_parameter_count(char *pathname, struct acpi_namespace_node *node, + u32 user_param_count, const union acpi_predefined_info *predefined) { u32 param_count; u32 required_params_current; u32 required_params_old; - /* - * Check that the ASL-defined parameter count is what is expected for - * this predefined name. - * - * Methods have 0-7 parameters. All other types have zero. - */ + /* Methods have 0-7 parameters. All other types have zero. */ + param_count = 0; if (node->type == ACPI_TYPE_METHOD) { param_count = node->object->method.param_count; } - /* Validate parameter count - allow two different legal counts (_SCP) */ + /* Argument count check for non-predefined methods/objects */ + + if (!predefined) { + /* + * Warning if too few or too many arguments have been passed by the + * caller. An incorrect number of arguments may not cause the method + * to fail. 
However, the method will fail if there are too few + * arguments and the method attempts to use one of the missing ones. + */ + if (user_param_count < param_count) { + ACPI_WARNING((AE_INFO, + "%s: Insufficient arguments - needs %d, found %d", + pathname, param_count, user_param_count)); + } else if (user_param_count > param_count) { + ACPI_WARNING((AE_INFO, + "%s: Excess arguments - needs %d, found %d", + pathname, param_count, user_param_count)); + } + return; + } + + /* Allow two different legal argument counts (_SCP, etc.) */ required_params_current = predefined->info.param_count & 0x0F; required_params_old = predefined->info.param_count >> 4; + if (user_param_count != ACPI_UINT32_MAX) { + + /* Validate the user-supplied parameter count */ + + if ((user_param_count != required_params_current) && + (user_param_count != required_params_old)) { + ACPI_WARNING((AE_INFO, + "%s: Parameter count mismatch - caller passed %d, ACPI requires %d", + pathname, user_param_count, + required_params_current)); + } + } + + /* + * Only validate the argument count on the first successful evaluation of + * the method. This ensures that any warnings will only be emitted during + * the very first evaluation of the method/object. + */ + if (node->flags & ANOBJ_EVALUATED) { + return; + } + + /* + * Check that the ASL-defined parameter count is what is expected for + * this predefined name. + */ if ((param_count != required_params_current) && (param_count != required_params_old)) { ACPI_WARNING((AE_INFO, - "%s: Parameter count mismatch - ASL declared %d, expected %d", + "%s: Parameter count mismatch - ASL declared %d, ACPI requires %d", pathname, param_count, required_params_current)); } } @@ -307,8 +383,8 @@ const union acpi_predefined_info *acpi_ns_check_for_predefined_name(struct * FUNCTION: acpi_ns_check_package * * PARAMETERS: Pathname - Full pathname to the node (for error msgs) - * return_object - Object returned from the evaluation of a - * method or object + * return_object_ptr - Pointer to the object returned from the + * evaluation of a method or object * Predefined - Pointer to entry in predefined name table * * RETURN: Status @@ -320,9 +396,10 @@ const union acpi_predefined_info *acpi_ns_check_for_predefined_name(struct static acpi_status acpi_ns_check_package(char *pathname, - union acpi_operand_object *return_object, + union acpi_operand_object **return_object_ptr, const union acpi_predefined_info *predefined) { + union acpi_operand_object *return_object = *return_object_ptr; const union acpi_predefined_info *package; union acpi_operand_object *sub_package; union acpi_operand_object **elements; @@ -408,7 +485,7 @@ acpi_ns_check_package(char *pathname, * elements must be of the same type */ for (i = 0; i < count; i++) { - status = acpi_ns_check_object_type(pathname, *elements, + status = acpi_ns_check_object_type(pathname, elements, package->ret_info. object_type1, i); if (ACPI_FAILURE(status)) { @@ -441,7 +518,7 @@ acpi_ns_check_package(char *pathname, status = acpi_ns_check_object_type(pathname, - *elements, + elements, package-> ret_info3. object_type[i], @@ -454,7 +531,7 @@ acpi_ns_check_package(char *pathname, status = acpi_ns_check_object_type(pathname, - *elements, + elements, package-> ret_info3. 
tail_object_type, @@ -471,7 +548,7 @@ acpi_ns_check_package(char *pathname, /* First element is the (Integer) count of sub-packages to follow */ - status = acpi_ns_check_object_type(pathname, *elements, + status = acpi_ns_check_object_type(pathname, elements, ACPI_RTYPE_INTEGER, 0); if (ACPI_FAILURE(status)) { return (status); @@ -509,7 +586,7 @@ acpi_ns_check_package(char *pathname, /* Each sub-object must be of type Package */ status = - acpi_ns_check_object_type(pathname, sub_package, + acpi_ns_check_object_type(pathname, &sub_package, ACPI_RTYPE_PACKAGE, i); if (ACPI_FAILURE(status)) { return (status); @@ -567,12 +644,8 @@ acpi_ns_check_package(char *pathname, for (j = 0; j < expected_count; j++) { status = acpi_ns_check_object_type(pathname, - sub_elements - [j], - package-> - ret_info2. - object_type - [j], j); + &sub_elements[j], + package->ret_info2.object_type[j], j); if (ACPI_FAILURE(status)) { return (status); } @@ -611,7 +684,7 @@ acpi_ns_check_package(char *pathname, status = acpi_ns_check_object_type(pathname, - *sub_elements, + sub_elements, ACPI_RTYPE_INTEGER, 0); if (ACPI_FAILURE(status)) { @@ -708,7 +781,7 @@ acpi_ns_check_package_elements(char *pathname, * The second group can have a count of zero. */ for (i = 0; i < count1; i++) { - status = acpi_ns_check_object_type(pathname, *this_element, + status = acpi_ns_check_object_type(pathname, this_element, type1, i); if (ACPI_FAILURE(status)) { return (status); @@ -717,7 +790,7 @@ acpi_ns_check_package_elements(char *pathname, } for (i = 0; i < count2; i++) { - status = acpi_ns_check_object_type(pathname, *this_element, + status = acpi_ns_check_object_type(pathname, this_element, type2, (i + count1)); if (ACPI_FAILURE(status)) { return (status); @@ -733,8 +806,8 @@ acpi_ns_check_package_elements(char *pathname, * FUNCTION: acpi_ns_check_object_type * * PARAMETERS: Pathname - Full pathname to the node (for error msgs) - * return_object - Object return from the execution of this - * method/object + * return_object_ptr - Pointer to the object returned from the + * evaluation of a method or object * expected_btypes - Bitmap of expected return type(s) * package_index - Index of object within parent package (if * applicable - ACPI_NOT_PACKAGE otherwise) @@ -748,9 +821,10 @@ acpi_ns_check_package_elements(char *pathname, static acpi_status acpi_ns_check_object_type(char *pathname, - union acpi_operand_object *return_object, + union acpi_operand_object **return_object_ptr, u32 expected_btypes, u32 package_index) { + union acpi_operand_object *return_object = *return_object_ptr; acpi_status status = AE_OK; u32 return_btype; char type_buffer[48]; /* Room for 5 types */ @@ -814,6 +888,14 @@ acpi_ns_check_object_type(char *pathname, /* Is the object one of the expected types? */ if (!(return_btype & expected_btypes)) { + + /* Type mismatch -- attempt repair of the returned object */ + + status = acpi_ns_repair_object(expected_btypes, package_index, + return_object_ptr); + if (ACPI_SUCCESS(status)) { + return (status); + } goto type_error_exit; } @@ -898,3 +980,86 @@ acpi_ns_check_reference(char *pathname, return (AE_AML_OPERAND_TYPE); } + +/******************************************************************************* + * + * FUNCTION: acpi_ns_repair_object + * + * PARAMETERS: Pathname - Full pathname to the node (for error msgs) + * package_index - Used to determine if target is in a package + * return_object_ptr - Pointer to the object returned from the + * evaluation of a method or object + * + * RETURN: Status. 
AE_OK if repair was successful. + * + * DESCRIPTION: Attempt to repair/convert a return object of a type that was + * not expected. + * + ******************************************************************************/ + +static acpi_status +acpi_ns_repair_object(u32 expected_btypes, + u32 package_index, + union acpi_operand_object **return_object_ptr) +{ + union acpi_operand_object *return_object = *return_object_ptr; + union acpi_operand_object *new_object; + acpi_size length; + + switch (ACPI_GET_OBJECT_TYPE(return_object)) { + case ACPI_TYPE_BUFFER: + + if (!(expected_btypes & ACPI_RTYPE_STRING)) { + return (AE_AML_OPERAND_TYPE); + } + + /* + * Have a Buffer, expected a String, convert. Use a to_string + * conversion, no transform performed on the buffer data. The best + * example of this is the _BIF method, where the string data from + * the battery is often (incorrectly) returned as buffer object(s). + */ + length = 0; + while ((length < return_object->buffer.length) && + (return_object->buffer.pointer[length])) { + length++; + } + + /* Allocate a new string object */ + + new_object = acpi_ut_create_string_object(length); + if (!new_object) { + return (AE_NO_MEMORY); + } + + /* + * Copy the raw buffer data with no transform. String is already NULL + * terminated at Length+1. + */ + ACPI_MEMCPY(new_object->string.pointer, + return_object->buffer.pointer, length); + + /* Install the new return object */ + + acpi_ut_remove_reference(return_object); + *return_object_ptr = new_object; + + /* + * If the object is a package element, we need to: + * 1. Decrement the reference count of the orignal object, it was + * incremented when building the package + * 2. Increment the reference count of the new object, it will be + * decremented when releasing the package + */ + if (package_index != ACPI_NOT_PACKAGE) { + acpi_ut_remove_reference(return_object); + acpi_ut_add_reference(new_object); + } + return (AE_OK); + + default: + break; + } + + return (AE_AML_OPERAND_TYPE); +} diff --git a/drivers/acpi/namespace/nssearch.c b/drivers/acpi/acpica/nssearch.c index a9a80bf811b..6fea13f3f52 100644 --- a/drivers/acpi/namespace/nssearch.c +++ b/drivers/acpi/acpica/nssearch.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nssearch") diff --git a/drivers/acpi/namespace/nsutils.c b/drivers/acpi/acpica/nsutils.c index b0817e1127b..3e1149bf4aa 100644 --- a/drivers/acpi/namespace/nsutils.c +++ b/drivers/acpi/acpica/nsutils.c @@ -43,9 +43,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/amlcode.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "amlcode.h" +#include "actables.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsutils") @@ -314,9 +315,15 @@ void acpi_ns_get_internal_name_length(struct acpi_namestring_info *info) * * strlen() + 1 covers the first name_seg, which has no path separator */ - if (acpi_ns_valid_root_prefix(next_external_char[0])) { + if (acpi_ns_valid_root_prefix(*next_external_char)) { info->fully_qualified = TRUE; next_external_char++; + + /* Skip redundant root_prefix, like \\_SB.PCI0.SBRG.EC0 */ + + while (acpi_ns_valid_root_prefix(*next_external_char)) { + next_external_char++; + } } else { /* * Handle Carat prefixes diff --git a/drivers/acpi/namespace/nswalk.c b/drivers/acpi/acpica/nswalk.c index 3c905ce26d7..200895fa272 100644 --- a/drivers/acpi/namespace/nswalk.c +++ 
b/drivers/acpi/acpica/nswalk.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nswalk") diff --git a/drivers/acpi/namespace/nsxfeval.c b/drivers/acpi/acpica/nsxfeval.c index a085cc39c05..22a7171ac1e 100644 --- a/drivers/acpi/namespace/nsxfeval.c +++ b/drivers/acpi/acpica/nsxfeval.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsxfeval") diff --git a/drivers/acpi/namespace/nsxfname.c b/drivers/acpi/acpica/nsxfname.c index 5efa4e7ddb0..9589fea2499 100644 --- a/drivers/acpi/namespace/nsxfname.c +++ b/drivers/acpi/acpica/nsxfname.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsxfname") diff --git a/drivers/acpi/namespace/nsxfobj.c b/drivers/acpi/acpica/nsxfobj.c index 2b375ee80ce..1c7efc15225 100644 --- a/drivers/acpi/namespace/nsxfobj.c +++ b/drivers/acpi/acpica/nsxfobj.c @@ -43,7 +43,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_NAMESPACE ACPI_MODULE_NAME("nsxfobj") diff --git a/drivers/acpi/parser/psargs.c b/drivers/acpi/acpica/psargs.c index d830b29b85b..b161f3544b5 100644 --- a/drivers/acpi/parser/psargs.c +++ b/drivers/acpi/acpica/psargs.c @@ -42,10 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> -#include <acpi/acdispat.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" +#include "acnamesp.h" +#include "acdispat.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psargs") diff --git a/drivers/acpi/parser/psloop.c b/drivers/acpi/acpica/psloop.c index 4647039a0d8..c5f6ce19a40 100644 --- a/drivers/acpi/parser/psloop.c +++ b/drivers/acpi/acpica/psloop.c @@ -50,9 +50,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psloop") diff --git a/drivers/acpi/parser/psopcode.c b/drivers/acpi/acpica/psopcode.c index f425ab30eae..3bc3a60194d 100644 --- a/drivers/acpi/parser/psopcode.c +++ b/drivers/acpi/acpica/psopcode.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acopcode.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "acopcode.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psopcode") diff --git a/drivers/acpi/parser/psparse.c b/drivers/acpi/acpica/psparse.c index 68e932f215e..70838e9b608 100644 --- a/drivers/acpi/parser/psparse.c +++ b/drivers/acpi/acpica/psparse.c @@ -51,11 +51,12 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/amlcode.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "amlcode.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psparse") @@ -447,10 +448,22 @@ acpi_status acpi_ps_parse_aml(struct acpi_walk_state *walk_state) walk_state, walk_state->parser_state.aml, walk_state->parser_state.aml_size)); + 
if (!walk_state->parser_state.aml) { + return_ACPI_STATUS(AE_NULL_OBJECT); + } + /* Create and initialize a new thread state */ thread = acpi_ut_create_thread_state(); if (!thread) { + if (walk_state->method_desc) { + + /* Executing a control method - additional cleanup */ + + acpi_ds_terminate_control_method( + walk_state->method_desc, walk_state); + } + acpi_ds_delete_walk_state(walk_state); return_ACPI_STATUS(AE_NO_MEMORY); } diff --git a/drivers/acpi/parser/psscope.c b/drivers/acpi/acpica/psscope.c index ee50e67c944..2feca5ca958 100644 --- a/drivers/acpi/parser/psscope.c +++ b/drivers/acpi/acpica/psscope.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> +#include "accommon.h" +#include "acparser.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psscope") diff --git a/drivers/acpi/parser/pstree.c b/drivers/acpi/acpica/pstree.c index 1dd355ddd18..4d3389118ec 100644 --- a/drivers/acpi/parser/pstree.c +++ b/drivers/acpi/acpica/pstree.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("pstree") diff --git a/drivers/acpi/parser/psutils.c b/drivers/acpi/acpica/psutils.c index 7cf1f65cd5b..e636e078ad3 100644 --- a/drivers/acpi/parser/psutils.c +++ b/drivers/acpi/acpica/psutils.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/amlcode.h> +#include "accommon.h" +#include "acparser.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psutils") diff --git a/drivers/acpi/parser/pswalk.c b/drivers/acpi/acpica/pswalk.c index 8b86ad5a320..78b8b791f2a 100644 --- a/drivers/acpi/parser/pswalk.c +++ b/drivers/acpi/acpica/pswalk.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> +#include "accommon.h" +#include "acparser.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("pswalk") diff --git a/drivers/acpi/parser/psxface.c b/drivers/acpi/acpica/psxface.c index 270469aae84..ff06032c0f0 100644 --- a/drivers/acpi/parser/psxface.c +++ b/drivers/acpi/acpica/psxface.c @@ -42,9 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acparser.h> -#include <acpi/acdispat.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acparser.h" +#include "acdispat.h" +#include "acinterp.h" +#include "amlcode.h" #define _COMPONENT ACPI_PARSER ACPI_MODULE_NAME("psxface") @@ -278,6 +280,38 @@ acpi_status acpi_ps_execute_method(struct acpi_evaluate_info *info) goto cleanup; } + /* Invoke an internal method if necessary */ + + if (info->obj_desc->method.method_flags & AML_METHOD_INTERNAL_ONLY) { + status = info->obj_desc->method.implementation(walk_state); + info->return_object = walk_state->return_desc; + + /* Cleanup states */ + + acpi_ds_scope_stack_clear(walk_state); + acpi_ps_cleanup_scope(&walk_state->parser_state); + acpi_ds_terminate_control_method(walk_state->method_desc, + walk_state); + acpi_ds_delete_walk_state(walk_state); + goto cleanup; + } + + /* + * Start method evaluation with an implicit return of zero. + * This is done for Windows compatibility. 
+ */ + if (acpi_gbl_enable_interpreter_slack) { + walk_state->implicit_return_obj = + acpi_ut_create_internal_object(ACPI_TYPE_INTEGER); + if (!walk_state->implicit_return_obj) { + status = AE_NO_MEMORY; + acpi_ds_delete_walk_state(walk_state); + goto cleanup; + } + + walk_state->implicit_return_obj->integer.value = 0; + } + /* Parse the AML */ status = acpi_ps_parse_aml(walk_state); diff --git a/drivers/acpi/resources/rsaddr.c b/drivers/acpi/acpica/rsaddr.c index 7f96332822b..1e437bfd8db 100644 --- a/drivers/acpi/resources/rsaddr.c +++ b/drivers/acpi/acpica/rsaddr.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsaddr") diff --git a/drivers/acpi/resources/rscalc.c b/drivers/acpi/acpica/rscalc.c index 8eaaecf9200..52865ee6bc7 100644 --- a/drivers/acpi/resources/rscalc.c +++ b/drivers/acpi/acpica/rscalc.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acresrc.h" +#include "acnamesp.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rscalc") diff --git a/drivers/acpi/resources/rscreate.c b/drivers/acpi/acpica/rscreate.c index 08b8d73e6ee..61566b1a061 100644 --- a/drivers/acpi/resources/rscreate.c +++ b/drivers/acpi/acpica/rscreate.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acresrc.h" +#include "acnamesp.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rscreate") diff --git a/drivers/acpi/resources/rsdump.c b/drivers/acpi/acpica/rsdump.c index 6bbbb7b8941..3f0ca5a12d3 100644 --- a/drivers/acpi/resources/rsdump.c +++ b/drivers/acpi/acpica/rsdump.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsdump") diff --git a/drivers/acpi/resources/rsinfo.c b/drivers/acpi/acpica/rsinfo.c index 3f0a1fedbe0..77b25fdb459 100644 --- a/drivers/acpi/resources/rsinfo.c +++ b/drivers/acpi/acpica/rsinfo.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsinfo") diff --git a/drivers/acpi/resources/rsio.c b/drivers/acpi/acpica/rsio.c index b66d42e7402..35a49aa9560 100644 --- a/drivers/acpi/resources/rsio.c +++ b/drivers/acpi/acpica/rsio.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsio") diff --git a/drivers/acpi/resources/rsirq.c b/drivers/acpi/acpica/rsirq.c index a8805efc036..2e0256983aa 100644 --- a/drivers/acpi/resources/rsirq.c +++ b/drivers/acpi/acpica/rsirq.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsirq") diff --git a/drivers/acpi/resources/rslist.c b/drivers/acpi/acpica/rslist.c index b78c7e797a1..1b1dbc69f08 100644 --- a/drivers/acpi/resources/rslist.c +++ b/drivers/acpi/acpica/rslist.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rslist") diff --git a/drivers/acpi/resources/rsmemory.c b/drivers/acpi/acpica/rsmemory.c index 63b21abd90b..ddc76cebdc9 100644 --- a/drivers/acpi/resources/rsmemory.c 
+++ b/drivers/acpi/acpica/rsmemory.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsmemory") diff --git a/drivers/acpi/resources/rsmisc.c b/drivers/acpi/acpica/rsmisc.c index 96a6c035325..5bc49a55328 100644 --- a/drivers/acpi/resources/rsmisc.c +++ b/drivers/acpi/acpica/rsmisc.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsmisc") diff --git a/drivers/acpi/resources/rsutils.c b/drivers/acpi/acpica/rsutils.c index f7b3bcd59ba..bc03d596682 100644 --- a/drivers/acpi/resources/rsutils.c +++ b/drivers/acpi/acpica/rsutils.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acresrc.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acresrc.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsutils") diff --git a/drivers/acpi/resources/rsxface.c b/drivers/acpi/acpica/rsxface.c index f59f4c4e034..69a2aa5b5d8 100644 --- a/drivers/acpi/resources/rsxface.c +++ b/drivers/acpi/acpica/rsxface.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acresrc.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acresrc.h" +#include "acnamesp.h" #define _COMPONENT ACPI_RESOURCES ACPI_MODULE_NAME("rsxface") diff --git a/drivers/acpi/tables/tbfadt.c b/drivers/acpi/acpica/tbfadt.c index 2817158fb6a..3636e4f8fb7 100644 --- a/drivers/acpi/tables/tbfadt.c +++ b/drivers/acpi/acpica/tbfadt.c @@ -42,15 +42,16 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbfadt") /* Local prototypes */ -static void inline +static inline void acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, - u8 byte_width, u64 address); + u8 space_id, u8 byte_width, u64 address); static void acpi_tb_convert_fadt(void); @@ -60,9 +61,10 @@ static void acpi_tb_validate_fadt(void); typedef struct acpi_fadt_info { char *name; - u8 target; - u8 source; + u8 address64; + u8 address32; u8 length; + u8 default_length; u8 type; } acpi_fadt_info; @@ -71,37 +73,61 @@ typedef struct acpi_fadt_info { #define ACPI_FADT_SEPARATE_LENGTH 2 static struct acpi_fadt_info fadt_info_table[] = { - {"Pm1aEventBlock", ACPI_FADT_OFFSET(xpm1a_event_block), + {"Pm1aEventBlock", + ACPI_FADT_OFFSET(xpm1a_event_block), ACPI_FADT_OFFSET(pm1a_event_block), - ACPI_FADT_OFFSET(pm1_event_length), ACPI_FADT_REQUIRED}, + ACPI_FADT_OFFSET(pm1_event_length), + ACPI_PM1_REGISTER_WIDTH * 2, /* Enable + Status register */ + ACPI_FADT_REQUIRED}, - {"Pm1bEventBlock", ACPI_FADT_OFFSET(xpm1b_event_block), + {"Pm1bEventBlock", + ACPI_FADT_OFFSET(xpm1b_event_block), ACPI_FADT_OFFSET(pm1b_event_block), - ACPI_FADT_OFFSET(pm1_event_length), 0}, + ACPI_FADT_OFFSET(pm1_event_length), + ACPI_PM1_REGISTER_WIDTH * 2, /* Enable + Status register */ + 0}, - {"Pm1aControlBlock", ACPI_FADT_OFFSET(xpm1a_control_block), + {"Pm1aControlBlock", + ACPI_FADT_OFFSET(xpm1a_control_block), ACPI_FADT_OFFSET(pm1a_control_block), - ACPI_FADT_OFFSET(pm1_control_length), ACPI_FADT_REQUIRED}, + ACPI_FADT_OFFSET(pm1_control_length), + ACPI_PM1_REGISTER_WIDTH, + ACPI_FADT_REQUIRED}, - {"Pm1bControlBlock", ACPI_FADT_OFFSET(xpm1b_control_block), + {"Pm1bControlBlock", + ACPI_FADT_OFFSET(xpm1b_control_block), ACPI_FADT_OFFSET(pm1b_control_block), - ACPI_FADT_OFFSET(pm1_control_length), 0}, + 
ACPI_FADT_OFFSET(pm1_control_length), + ACPI_PM1_REGISTER_WIDTH, + 0}, - {"Pm2ControlBlock", ACPI_FADT_OFFSET(xpm2_control_block), + {"Pm2ControlBlock", + ACPI_FADT_OFFSET(xpm2_control_block), ACPI_FADT_OFFSET(pm2_control_block), - ACPI_FADT_OFFSET(pm2_control_length), ACPI_FADT_SEPARATE_LENGTH}, + ACPI_FADT_OFFSET(pm2_control_length), + ACPI_PM2_REGISTER_WIDTH, + ACPI_FADT_SEPARATE_LENGTH}, - {"PmTimerBlock", ACPI_FADT_OFFSET(xpm_timer_block), + {"PmTimerBlock", + ACPI_FADT_OFFSET(xpm_timer_block), ACPI_FADT_OFFSET(pm_timer_block), - ACPI_FADT_OFFSET(pm_timer_length), ACPI_FADT_REQUIRED}, + ACPI_FADT_OFFSET(pm_timer_length), + ACPI_PM_TIMER_WIDTH, + ACPI_FADT_REQUIRED}, - {"Gpe0Block", ACPI_FADT_OFFSET(xgpe0_block), + {"Gpe0Block", + ACPI_FADT_OFFSET(xgpe0_block), ACPI_FADT_OFFSET(gpe0_block), - ACPI_FADT_OFFSET(gpe0_block_length), ACPI_FADT_SEPARATE_LENGTH}, + ACPI_FADT_OFFSET(gpe0_block_length), + 0, + ACPI_FADT_SEPARATE_LENGTH}, - {"Gpe1Block", ACPI_FADT_OFFSET(xgpe1_block), + {"Gpe1Block", + ACPI_FADT_OFFSET(xgpe1_block), ACPI_FADT_OFFSET(gpe1_block), - ACPI_FADT_OFFSET(gpe1_block_length), ACPI_FADT_SEPARATE_LENGTH} + ACPI_FADT_OFFSET(gpe1_block_length), + 0, + ACPI_FADT_SEPARATE_LENGTH} }; #define ACPI_FADT_INFO_ENTRIES (sizeof (fadt_info_table) / sizeof (struct acpi_fadt_info)) @@ -122,9 +148,9 @@ static struct acpi_fadt_info fadt_info_table[] = { * ******************************************************************************/ -static void inline +static inline void acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, - u8 byte_width, u64 address) + u8 space_id, u8 byte_width, u64 address) { /* @@ -135,10 +161,10 @@ acpi_tb_init_generic_address(struct acpi_generic_address *generic_address, /* All other fields are byte-wide */ - generic_address->space_id = ACPI_ADR_SPACE_SYSTEM_IO; - generic_address->bit_width = byte_width << 3; + generic_address->space_id = space_id; + generic_address->bit_width = (u8)ACPI_MUL_8(byte_width); generic_address->bit_offset = 0; - generic_address->access_width = 0; + generic_address->access_width = 0; /* Access width ANY */ } /******************************************************************************* @@ -225,7 +251,8 @@ void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) */ if (length > sizeof(struct acpi_table_fadt)) { ACPI_WARNING((AE_INFO, - "FADT (revision %u) is longer than ACPI 2.0 version, truncating length 0x%X to 0x%zX", + "FADT (revision %u) is longer than ACPI 2.0 version, " + "truncating length 0x%X to 0x%zX", table->revision, (unsigned)length, sizeof(struct acpi_table_fadt))); } @@ -244,7 +271,6 @@ void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) * 2) Validate some of the important values within the FADT */ acpi_tb_convert_fadt(); - acpi_tb_validate_fadt(); } /******************************************************************************* @@ -278,22 +304,36 @@ void acpi_tb_create_local_fadt(struct acpi_table_header *table, u32 length) static void acpi_tb_convert_fadt(void) { - u8 pm1_register_length; - struct acpi_generic_address *target; + u8 pm1_register_bit_width; + u8 pm1_register_byte_width; + struct acpi_generic_address *target64; u32 i; /* Update the local FADT table header length */ acpi_gbl_FADT.header.length = sizeof(struct acpi_table_fadt); - /* Expand the 32-bit FACS and DSDT addresses to 64-bit as necessary */ - + /* + * Expand the 32-bit FACS and DSDT addresses to 64-bit as necessary. + * Later code will always use the X 64-bit field. 
Also, check for an + * address mismatch between the 32-bit and 64-bit address fields + * (FIRMWARE_CTRL/X_FIRMWARE_CTRL, DSDT/X_DSDT) which would indicate + * the presence of two FACS or two DSDT tables. + */ if (!acpi_gbl_FADT.Xfacs) { acpi_gbl_FADT.Xfacs = (u64) acpi_gbl_FADT.facs; + } else if (acpi_gbl_FADT.facs && + (acpi_gbl_FADT.Xfacs != (u64) acpi_gbl_FADT.facs)) { + ACPI_WARNING((AE_INFO, + "32/64 FACS address mismatch in FADT - two FACS tables!")); } if (!acpi_gbl_FADT.Xdsdt) { acpi_gbl_FADT.Xdsdt = (u64) acpi_gbl_FADT.dsdt; + } else if (acpi_gbl_FADT.dsdt && + (acpi_gbl_FADT.Xdsdt != (u64) acpi_gbl_FADT.dsdt)) { + ACPI_WARNING((AE_INFO, + "32/64 DSDT address mismatch in FADT - two DSDT tables!")); } /* @@ -312,18 +352,23 @@ static void acpi_tb_convert_fadt(void) } /* - * Expand the ACPI 1.0 32-bit V1.0 addresses to the ACPI 2.0 64-bit "X" - * generic address structures as necessary. + * Expand the ACPI 1.0 32-bit addresses to the ACPI 2.0 64-bit "X" + * generic address structures as necessary. Later code will always use + * the 64-bit address structures. */ for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { - target = + target64 = ACPI_ADD_PTR(struct acpi_generic_address, &acpi_gbl_FADT, - fadt_info_table[i].target); + fadt_info_table[i].address64); - /* Expand only if the X target is null */ + /* Expand only if the 64-bit X target is null */ - if (!target->address) { - acpi_tb_init_generic_address(target, + if (!target64->address) { + + /* The space_id is always I/O for the 32-bit legacy address fields */ + + acpi_tb_init_generic_address(target64, + ACPI_ADR_SPACE_SYSTEM_IO, *ACPI_ADD_PTR(u8, &acpi_gbl_FADT, fadt_info_table @@ -332,11 +377,64 @@ static void acpi_tb_convert_fadt(void) &acpi_gbl_FADT, fadt_info_table [i]. - source)); + address32)); + } + } + + /* Validate FADT values now, before we make any changes */ + + acpi_tb_validate_fadt(); + + /* + * Optionally check all register lengths against the default values and + * update them if they are incorrect. + */ + if (acpi_gbl_use_default_register_widths) { + for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { + target64 = + ACPI_ADD_PTR(struct acpi_generic_address, + &acpi_gbl_FADT, + fadt_info_table[i].address64); + + /* + * If a valid register (Address != 0) and the (default_length > 0) + * (Not a GPE register), then check the width against the default. + */ + if ((target64->address) && + (fadt_info_table[i].default_length > 0) && + (fadt_info_table[i].default_length != + target64->bit_width)) { + ACPI_WARNING((AE_INFO, + "Invalid length for %s: %d, using default %d", + fadt_info_table[i].name, + target64->bit_width, + fadt_info_table[i]. + default_length)); + + /* Incorrect size, set width to the default */ + + target64->bit_width = + fadt_info_table[i].default_length; + } } } /* + * Get the length of the individual PM1 registers (enable and status). + * Each register is defined to be (event block length / 2). + */ + pm1_register_bit_width = + (u8)ACPI_DIV_2(acpi_gbl_FADT.xpm1a_event_block.bit_width); + pm1_register_byte_width = (u8)ACPI_DIV_8(pm1_register_bit_width); + + /* + * Adjust the lengths of the PM1 Event Blocks so that they can be used to + * access the PM1 status register(s). Use (width / 2) + */ + acpi_gbl_FADT.xpm1a_event_block.bit_width = pm1_register_bit_width; + acpi_gbl_FADT.xpm1b_event_block.bit_width = pm1_register_bit_width; + + /* * Calculate separate GAS structs for the PM1 Enable registers. * These addresses do not appear (directly) in the FADT, so it is * useful to calculate them once, here. 
@@ -356,14 +454,14 @@ static void acpi_tb_convert_fadt(void) " PM1_EVT_LEN (%u)\n", acpi_gbl_FADT.xpm1a_event_block.bit_width, acpi_gbl_FADT.pm1_event_length); - pm1_register_length = (u8) ACPI_DIV_2(acpi_gbl_FADT.pm1_event_length); /* The PM1A register block is required */ acpi_tb_init_generic_address(&acpi_gbl_xpm1a_enable, - pm1_register_length, + acpi_gbl_FADT.xpm1a_event_block.space_id, + pm1_register_byte_width, (acpi_gbl_FADT.xpm1a_event_block.address + - pm1_register_length)); + pm1_register_byte_width)); /* Don't forget to copy space_id of the GAS */ acpi_gbl_xpm1a_enable.space_id = acpi_gbl_FADT.xpm1a_event_block.space_id; @@ -379,9 +477,10 @@ static void acpi_tb_convert_fadt(void) acpi_gbl_FADT.xpm1b_event_block.bit_width, acpi_gbl_FADT.pm1_event_length); acpi_tb_init_generic_address(&acpi_gbl_xpm1b_enable, - pm1_register_length, + acpi_gbl_FADT.xpm1b_event_block.space_id, + pm1_register_byte_width, (acpi_gbl_FADT.xpm1b_event_block. - address + pm1_register_length)); + address + pm1_register_byte_width)); /* Don't forget to copy space_id of the GAS */ acpi_gbl_xpm1b_enable.space_id = acpi_gbl_FADT.xpm1b_event_block.space_id; @@ -411,26 +510,63 @@ static void acpi_tb_convert_fadt(void) static void acpi_tb_validate_fadt(void) { + char *name; u32 *address32; struct acpi_generic_address *address64; u8 length; u32 i; - /* Examine all of the 64-bit extended address fields (X fields) */ + /* + * Check for FACS and DSDT address mismatches. An address mismatch between + * the 32-bit and 64-bit address fields (FIRMWARE_CTRL/X_FIRMWARE_CTRL and + * DSDT/X_DSDT) would indicate the presence of two FACS or two DSDT tables. + */ + if (acpi_gbl_FADT.facs && + (acpi_gbl_FADT.Xfacs != (u64) acpi_gbl_FADT.facs)) { + ACPI_WARNING((AE_INFO, + "32/64X FACS address mismatch in FADT - " + "two FACS tables! %8.8X/%8.8X%8.8X", + acpi_gbl_FADT.facs, + ACPI_FORMAT_UINT64(acpi_gbl_FADT.Xfacs))); + } - for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { + if (acpi_gbl_FADT.dsdt && + (acpi_gbl_FADT.Xdsdt != (u64) acpi_gbl_FADT.dsdt)) { + ACPI_WARNING((AE_INFO, + "32/64X DSDT address mismatch in FADT - " + "two DSDT tables! %8.8X/%8.8X%8.8X", + acpi_gbl_FADT.dsdt, + ACPI_FORMAT_UINT64(acpi_gbl_FADT.Xdsdt))); + } - /* Generate pointers to the 32-bit and 64-bit addresses and get the length */ + /* Examine all of the 64-bit extended address fields (X fields) */ - address64 = - ACPI_ADD_PTR(struct acpi_generic_address, &acpi_gbl_FADT, - fadt_info_table[i].target); + for (i = 0; i < ACPI_FADT_INFO_ENTRIES; i++) { + /* + * Generate pointers to the 32-bit and 64-bit addresses, get the + * register length (width), and the register name + */ + address64 = ACPI_ADD_PTR(struct acpi_generic_address, + &acpi_gbl_FADT, + fadt_info_table[i].address64); address32 = ACPI_ADD_PTR(u32, &acpi_gbl_FADT, - fadt_info_table[i].source); + fadt_info_table[i].address32); length = *ACPI_ADD_PTR(u8, &acpi_gbl_FADT, fadt_info_table[i].length); + name = fadt_info_table[i].name; + + /* + * For each extended field, check for length mismatch between the + * legacy length field and the corresponding 64-bit X length field. 
+ */ + if (address64 && (address64->bit_width != ACPI_MUL_8(length))) { + ACPI_WARNING((AE_INFO, + "32/64X length mismatch in %s: %d/%d", + name, ACPI_MUL_8(length), + address64->bit_width)); + } if (fadt_info_table[i].type & ACPI_FADT_REQUIRED) { /* @@ -439,8 +575,8 @@ static void acpi_tb_validate_fadt(void) */ if (!address64->address || !length) { ACPI_ERROR((AE_INFO, - "Required field \"%s\" has zero address and/or length: %8.8X%8.8X/%X", - fadt_info_table[i].name, + "Required field %s has zero address and/or length: %8.8X%8.8X/%X", + name, ACPI_FORMAT_UINT64(address64-> address), length)); @@ -453,8 +589,8 @@ static void acpi_tb_validate_fadt(void) if ((address64->address && !length) || (!address64->address && length)) { ACPI_WARNING((AE_INFO, - "Optional field \"%s\" has zero address or length: %8.8X%8.8X/%X", - fadt_info_table[i].name, + "Optional field %s has zero address or length: %8.8X%8.8X/%X", + name, ACPI_FORMAT_UINT64(address64-> address), length)); @@ -466,8 +602,8 @@ static void acpi_tb_validate_fadt(void) if (address64->address && *address32 && (address64->address != (u64) * address32)) { ACPI_ERROR((AE_INFO, - "32/64X address mismatch in \"%s\": [%8.8X] [%8.8X%8.8X], using 64X", - fadt_info_table[i].name, *address32, + "32/64X address mismatch in %s: %8.8X/%8.8X%8.8X, using 64X", + name, *address32, ACPI_FORMAT_UINT64(address64->address))); } } diff --git a/drivers/acpi/tables/tbfind.c b/drivers/acpi/acpica/tbfind.c index 531584defbb..1054dfd4920 100644 --- a/drivers/acpi/tables/tbfind.c +++ b/drivers/acpi/acpica/tbfind.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbfind") diff --git a/drivers/acpi/tables/tbinstal.c b/drivers/acpi/acpica/tbinstal.c index 18747ce8dd2..37374b21969 100644 --- a/drivers/acpi/tables/tbinstal.c +++ b/drivers/acpi/acpica/tbinstal.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbinstal") diff --git a/drivers/acpi/tables/tbutils.c b/drivers/acpi/acpica/tbutils.c index 0cc92ef5236..9684cc82793 100644 --- a/drivers/acpi/tables/tbutils.c +++ b/drivers/acpi/acpica/tbutils.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbutils") @@ -113,6 +114,30 @@ acpi_tb_check_xsdt(acpi_physical_address address) /******************************************************************************* * + * FUNCTION: acpi_tb_initialize_facs + * + * PARAMETERS: None + * + * RETURN: Status + * + * DESCRIPTION: Create a permanent mapping for the FADT and save it in a global + * for accessing the Global Lock and Firmware Waking Vector + * + ******************************************************************************/ + +acpi_status acpi_tb_initialize_facs(void) +{ + acpi_status status; + + status = acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, + ACPI_CAST_INDIRECT_PTR(struct + acpi_table_header, + &acpi_gbl_FACS)); + return status; +} + +/******************************************************************************* + * * FUNCTION: acpi_tb_tables_loaded * * PARAMETERS: None @@ -420,7 +445,8 @@ acpi_tb_parse_root_table(acpi_physical_address rsdp_address, u8 flags) /* Differentiate between RSDT and XSDT root tables */ - if (rsdp->revision > 1 && 
rsdp->xsdt_physical_address) { + if (rsdp->revision > 1 && rsdp->xsdt_physical_address + && !acpi_rsdt_forced) { /* * Root table is an XSDT (64-bit physical addresses). We must use the * XSDT if the revision is > 1 and the XSDT pointer is present, as per diff --git a/drivers/acpi/tables/tbxface.c b/drivers/acpi/acpica/tbxface.c index fd7770aa106..c3e841f3cde 100644 --- a/drivers/acpi/tables/tbxface.c +++ b/drivers/acpi/acpica/tbxface.c @@ -43,8 +43,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbxface") diff --git a/drivers/acpi/tables/tbxfroot.c b/drivers/acpi/acpica/tbxfroot.c index 2d157e0f98d..b7fc8dd4334 100644 --- a/drivers/acpi/tables/tbxfroot.c +++ b/drivers/acpi/acpica/tbxfroot.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "actables.h" #define _COMPONENT ACPI_TABLES ACPI_MODULE_NAME("tbxfroot") diff --git a/drivers/acpi/utilities/utalloc.c b/drivers/acpi/acpica/utalloc.c index 241c535c175..7580f6b3069 100644 --- a/drivers/acpi/utilities/utalloc.c +++ b/drivers/acpi/acpica/utalloc.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acdebug.h> +#include "accommon.h" +#include "acdebug.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utalloc") diff --git a/drivers/acpi/utilities/utcopy.c b/drivers/acpi/acpica/utcopy.c index 5b2f7c27b70..b0dcfd3c872 100644 --- a/drivers/acpi/utilities/utcopy.c +++ b/drivers/acpi/acpica/utcopy.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_UTILITIES diff --git a/drivers/acpi/utilities/utdebug.c b/drivers/acpi/acpica/utdebug.c index fd66ecb6741..38821f53042 100644 --- a/drivers/acpi/utilities/utdebug.c +++ b/drivers/acpi/acpica/utdebug.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utdebug") @@ -136,7 +137,7 @@ static const char *acpi_ut_trim_function_name(const char *function_name) /******************************************************************************* * - * FUNCTION: acpi_ut_debug_print + * FUNCTION: acpi_debug_print * * PARAMETERS: requested_debug_level - Requested debug print level * line_number - Caller's line number (for error output) @@ -154,11 +155,11 @@ static const char *acpi_ut_trim_function_name(const char *function_name) ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE -acpi_ut_debug_print(u32 requested_debug_level, - u32 line_number, - const char *function_name, - const char *module_name, - u32 component_id, const char *format, ...) +acpi_debug_print(u32 requested_debug_level, + u32 line_number, + const char *function_name, + const char *module_name, + u32 component_id, const char *format, ...) 
{ acpi_thread_id thread_id; va_list args; @@ -205,11 +206,11 @@ acpi_ut_debug_print(u32 requested_debug_level, va_end(args); } -ACPI_EXPORT_SYMBOL(acpi_ut_debug_print) +ACPI_EXPORT_SYMBOL(acpi_debug_print) /******************************************************************************* * - * FUNCTION: acpi_ut_debug_print_raw + * FUNCTION: acpi_debug_print_raw * * PARAMETERS: requested_debug_level - Requested debug print level * line_number - Caller's line number @@ -226,11 +227,11 @@ ACPI_EXPORT_SYMBOL(acpi_ut_debug_print) * ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE -acpi_ut_debug_print_raw(u32 requested_debug_level, - u32 line_number, - const char *function_name, - const char *module_name, - u32 component_id, const char *format, ...) +acpi_debug_print_raw(u32 requested_debug_level, + u32 line_number, + const char *function_name, + const char *module_name, + u32 component_id, const char *format, ...) { va_list args; @@ -244,7 +245,7 @@ acpi_ut_debug_print_raw(u32 requested_debug_level, va_end(args); } -ACPI_EXPORT_SYMBOL(acpi_ut_debug_print_raw) +ACPI_EXPORT_SYMBOL(acpi_debug_print_raw) /******************************************************************************* * @@ -270,9 +271,9 @@ acpi_ut_trace(u32 line_number, acpi_gbl_nesting_level++; acpi_ut_track_stack_ptr(); - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s\n", acpi_gbl_fn_entry_str); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s\n", acpi_gbl_fn_entry_str); } ACPI_EXPORT_SYMBOL(acpi_ut_trace) @@ -301,10 +302,9 @@ acpi_ut_trace_ptr(u32 line_number, acpi_gbl_nesting_level++; acpi_ut_track_stack_ptr(); - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %p\n", acpi_gbl_fn_entry_str, - pointer); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %p\n", acpi_gbl_fn_entry_str, pointer); } /******************************************************************************* @@ -333,10 +333,9 @@ acpi_ut_trace_str(u32 line_number, acpi_gbl_nesting_level++; acpi_ut_track_stack_ptr(); - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %s\n", acpi_gbl_fn_entry_str, - string); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %s\n", acpi_gbl_fn_entry_str, string); } /******************************************************************************* @@ -365,10 +364,9 @@ acpi_ut_trace_u32(u32 line_number, acpi_gbl_nesting_level++; acpi_ut_track_stack_ptr(); - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %08X\n", acpi_gbl_fn_entry_str, - integer); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %08X\n", acpi_gbl_fn_entry_str, integer); } /******************************************************************************* @@ -393,9 +391,9 @@ acpi_ut_exit(u32 line_number, const char *module_name, u32 component_id) { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s\n", acpi_gbl_fn_exit_str); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s\n", acpi_gbl_fn_exit_str); acpi_gbl_nesting_level--; } @@ -426,17 +424,16 @@ acpi_ut_status_exit(u32 line_number, { if 
(ACPI_SUCCESS(status)) { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %s\n", - acpi_gbl_fn_exit_str, - acpi_format_exception(status)); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, + component_id, "%s %s\n", acpi_gbl_fn_exit_str, + acpi_format_exception(status)); } else { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s ****Exception****: %s\n", - acpi_gbl_fn_exit_str, - acpi_format_exception(status)); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, + component_id, "%s ****Exception****: %s\n", + acpi_gbl_fn_exit_str, + acpi_format_exception(status)); } acpi_gbl_nesting_level--; @@ -467,10 +464,10 @@ acpi_ut_value_exit(u32 line_number, u32 component_id, acpi_integer value) { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %8.8X%8.8X\n", - acpi_gbl_fn_exit_str, ACPI_FORMAT_UINT64(value)); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %8.8X%8.8X\n", acpi_gbl_fn_exit_str, + ACPI_FORMAT_UINT64(value)); acpi_gbl_nesting_level--; } @@ -499,9 +496,9 @@ acpi_ut_ptr_exit(u32 line_number, const char *module_name, u32 component_id, u8 *ptr) { - acpi_ut_debug_print(ACPI_LV_FUNCTIONS, - line_number, function_name, module_name, - component_id, "%s %p\n", acpi_gbl_fn_exit_str, ptr); + acpi_debug_print(ACPI_LV_FUNCTIONS, + line_number, function_name, module_name, component_id, + "%s %p\n", acpi_gbl_fn_exit_str, ptr); acpi_gbl_nesting_level--; } diff --git a/drivers/acpi/utilities/utdelete.c b/drivers/acpi/acpica/utdelete.c index d197c6b29e1..a0be9e39531 100644 --- a/drivers/acpi/utilities/utdelete.c +++ b/drivers/acpi/acpica/utdelete.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acinterp.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> +#include "accommon.h" +#include "acinterp.h" +#include "acnamesp.h" +#include "acevents.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utdelete") diff --git a/drivers/acpi/utilities/uteval.c b/drivers/acpi/acpica/uteval.c index 352747e49c7..da9450bc60f 100644 --- a/drivers/acpi/utilities/uteval.c +++ b/drivers/acpi/acpica/uteval.c @@ -42,8 +42,9 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acinterp.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acinterp.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("uteval") @@ -129,7 +130,7 @@ acpi_status acpi_ut_osi_implementation(struct acpi_walk_state *walk_state) /* The interface is supported */ - return_ACPI_STATUS(AE_CTRL_TERMINATE); + return_ACPI_STATUS(AE_OK); } } @@ -143,13 +144,13 @@ acpi_status acpi_ut_osi_implementation(struct acpi_walk_state *walk_state) /* The interface is supported */ - return_ACPI_STATUS(AE_CTRL_TERMINATE); + return_ACPI_STATUS(AE_OK); } /* The interface is not supported */ return_desc->integer.value = 0; - return_ACPI_STATUS(AE_CTRL_TERMINATE); + return_ACPI_STATUS(AE_OK); } /******************************************************************************* diff --git a/drivers/acpi/utilities/utglobal.c b/drivers/acpi/acpica/utglobal.c index 17ed5ac840f..a3ab9d9da29 100644 --- a/drivers/acpi/utilities/utglobal.c +++ b/drivers/acpi/acpica/utglobal.c @@ -44,11 +44,11 @@ #define DEFINE_ACPI_GLOBALS #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" -ACPI_EXPORT_SYMBOL(acpi_gbl_FADT) #define 
_COMPONENT ACPI_UTILITIES - ACPI_MODULE_NAME("utglobal") +ACPI_MODULE_NAME("utglobal") /******************************************************************************* * @@ -352,7 +352,7 @@ const char *acpi_gbl_region_types[ACPI_NUM_PREDEFINED_REGIONS] = { "PCI_Config", "EmbeddedControl", "SMBus", - "CMOS", + "SystemCMOS", "PCIBARTarget", "DataTable" }; @@ -756,6 +756,7 @@ acpi_status acpi_ut_init_globals(void) acpi_gbl_gpe_xrupt_list_head = NULL; acpi_gbl_gpe_fadt_blocks[0] = NULL; acpi_gbl_gpe_fadt_blocks[1] = NULL; + acpi_current_gpe_count = 0; /* Global handlers */ @@ -771,6 +772,7 @@ acpi_status acpi_ut_init_globals(void) acpi_gbl_global_lock_mutex = NULL; acpi_gbl_global_lock_acquired = FALSE; acpi_gbl_global_lock_handle = 0; + acpi_gbl_global_lock_present = FALSE; /* Miscellaneous variables */ @@ -815,5 +817,7 @@ acpi_status acpi_ut_init_globals(void) return_ACPI_STATUS(AE_OK); } +ACPI_EXPORT_SYMBOL(acpi_gbl_FADT) ACPI_EXPORT_SYMBOL(acpi_dbg_level) ACPI_EXPORT_SYMBOL(acpi_dbg_layer) +ACPI_EXPORT_SYMBOL(acpi_current_gpe_count) diff --git a/drivers/acpi/utilities/utinit.c b/drivers/acpi/acpica/utinit.c index cae515fc02d..a54ca84eb36 100644 --- a/drivers/acpi/utilities/utinit.c +++ b/drivers/acpi/acpica/utinit.c @@ -42,9 +42,10 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acevents.h> -#include <acpi/actables.h> +#include "accommon.h" +#include "acnamesp.h" +#include "acevents.h" +#include "actables.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utinit") diff --git a/drivers/acpi/utilities/utmath.c b/drivers/acpi/acpica/utmath.c index c927324fdd2..c9f682d640e 100644 --- a/drivers/acpi/utilities/utmath.c +++ b/drivers/acpi/acpica/utmath.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utmath") diff --git a/drivers/acpi/utilities/utmisc.c b/drivers/acpi/acpica/utmisc.c index 9089a158a87..c1f7f4e1a72 100644 --- a/drivers/acpi/utilities/utmisc.c +++ b/drivers/acpi/acpica/utmisc.c @@ -44,7 +44,8 @@ #include <linux/module.h> #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utmisc") @@ -1016,7 +1017,7 @@ acpi_ut_walk_package_tree(union acpi_operand_object * source_object, /******************************************************************************* * - * FUNCTION: acpi_ut_error, acpi_ut_warning, acpi_ut_info + * FUNCTION: acpi_error, acpi_exception, acpi_warning, acpi_info * * PARAMETERS: module_name - Caller's module name (for error output) * line_number - Caller's line number (for error output) @@ -1029,7 +1030,7 @@ acpi_ut_walk_package_tree(union acpi_operand_object * source_object, ******************************************************************************/ void ACPI_INTERNAL_VAR_XFACE -acpi_ut_error(const char *module_name, u32 line_number, const char *format, ...) +acpi_error(const char *module_name, u32 line_number, const char *format, ...) { va_list args; @@ -1042,8 +1043,8 @@ acpi_ut_error(const char *module_name, u32 line_number, const char *format, ...) } void ACPI_INTERNAL_VAR_XFACE -acpi_ut_exception(const char *module_name, - u32 line_number, acpi_status status, const char *format, ...) +acpi_exception(const char *module_name, + u32 line_number, acpi_status status, const char *format, ...) 
{ va_list args; @@ -1056,11 +1057,8 @@ acpi_ut_exception(const char *module_name, va_end(args); } -EXPORT_SYMBOL(acpi_ut_exception); - void ACPI_INTERNAL_VAR_XFACE -acpi_ut_warning(const char *module_name, - u32 line_number, const char *format, ...) +acpi_warning(const char *module_name, u32 line_number, const char *format, ...) { va_list args; @@ -1073,7 +1071,7 @@ acpi_ut_warning(const char *module_name, } void ACPI_INTERNAL_VAR_XFACE -acpi_ut_info(const char *module_name, u32 line_number, const char *format, ...) +acpi_info(const char *module_name, u32 line_number, const char *format, ...) { va_list args; @@ -1088,3 +1086,8 @@ acpi_ut_info(const char *module_name, u32 line_number, const char *format, ...) acpi_os_printf("\n"); va_end(args); } + +ACPI_EXPORT_SYMBOL(acpi_error) +ACPI_EXPORT_SYMBOL(acpi_exception) +ACPI_EXPORT_SYMBOL(acpi_warning) +ACPI_EXPORT_SYMBOL(acpi_info) diff --git a/drivers/acpi/utilities/utmutex.c b/drivers/acpi/acpica/utmutex.c index 7331dde9e1b..14eb52c4d64 100644 --- a/drivers/acpi/utilities/utmutex.c +++ b/drivers/acpi/acpica/utmutex.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utmutex") diff --git a/drivers/acpi/utilities/utobject.c b/drivers/acpi/acpica/utobject.c index 4bef3cfbacc..fd5ea7543e5 100644 --- a/drivers/acpi/utilities/utobject.c +++ b/drivers/acpi/acpica/utobject.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/acnamesp.h> +#include "accommon.h" +#include "acnamesp.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utobject") diff --git a/drivers/acpi/utilities/utresrc.c b/drivers/acpi/acpica/utresrc.c index c3e3e1308ed..91b7c00236f 100644 --- a/drivers/acpi/utilities/utresrc.c +++ b/drivers/acpi/acpica/utresrc.c @@ -42,7 +42,8 @@ */ #include <acpi/acpi.h> -#include <acpi/amlresrc.h> +#include "accommon.h" +#include "amlresrc.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utresrc") diff --git a/drivers/acpi/utilities/utstate.c b/drivers/acpi/acpica/utstate.c index 63a6d3d77d8..0440c958f5a 100644 --- a/drivers/acpi/utilities/utstate.c +++ b/drivers/acpi/acpica/utstate.c @@ -42,6 +42,7 @@ */ #include <acpi/acpi.h> +#include "accommon.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utstate") diff --git a/drivers/acpi/utilities/utxface.c b/drivers/acpi/acpica/utxface.c index c198a4d4058..078a22728c6 100644 --- a/drivers/acpi/utilities/utxface.c +++ b/drivers/acpi/acpica/utxface.c @@ -42,9 +42,11 @@ */ #include <acpi/acpi.h> -#include <acpi/acevents.h> -#include <acpi/acnamesp.h> -#include <acpi/acdebug.h> +#include "accommon.h" +#include "acevents.h" +#include "acnamesp.h" +#include "acdebug.h" +#include "actables.h" #define _COMPONENT ACPI_UTILITIES ACPI_MODULE_NAME("utxface") @@ -148,6 +150,16 @@ acpi_status acpi_enable_subsystem(u32 flags) } /* + * Obtain a permanent mapping for the FACS. This is required for the + * Global Lock and the Firmware Waking Vector + */ + status = acpi_tb_initialize_facs(); + if (ACPI_FAILURE(status)) { + ACPI_WARNING((AE_INFO, "Could not map the FACS table")); + return_ACPI_STATUS(status); + } + + /* * Install the default op_region handlers. These are installed unless * other handlers have already been installed via the * install_address_space_handler interface. 
diff --git a/drivers/acpi/battery.c b/drivers/acpi/battery.c index 1423b0c0cd2..65132f92045 100644 --- a/drivers/acpi/battery.c +++ b/drivers/acpi/battery.c @@ -471,7 +471,7 @@ static void sysfs_remove_battery(struct acpi_battery *battery) static int acpi_battery_update(struct acpi_battery *battery) { - int result; + int result, old_present = acpi_battery_present(battery); result = acpi_battery_get_status(battery); if (result) return result; @@ -482,7 +482,8 @@ static int acpi_battery_update(struct acpi_battery *battery) return 0; } #endif - if (!battery->update_time) { + if (!battery->update_time || + old_present != acpi_battery_present(battery)) { result = acpi_battery_get_info(battery); if (result) return result; diff --git a/drivers/acpi/cm_sbs.c b/drivers/acpi/cm_sbs.c index 307963bd104..332fe4b2170 100644 --- a/drivers/acpi/cm_sbs.c +++ b/drivers/acpi/cm_sbs.c @@ -27,9 +27,6 @@ #include <linux/seq_file.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> -#include <acpi/acmacros.h> -#include <acpi/actypes.h> -#include <acpi/acutils.h> ACPI_MODULE_NAME("cm_sbs"); #define ACPI_AC_CLASS "ac_adapter" diff --git a/drivers/acpi/debug.c b/drivers/acpi/debug.c index c4839689200..20223cbd0d1 100644 --- a/drivers/acpi/debug.c +++ b/drivers/acpi/debug.c @@ -9,7 +9,6 @@ #include <linux/moduleparam.h> #include <asm/uaccess.h> #include <acpi/acpi_drivers.h> -#include <acpi/acglobal.h> #define _COMPONENT ACPI_SYSTEM_COMPONENT ACPI_MODULE_NAME("debug"); diff --git a/drivers/acpi/dispatcher/Makefile b/drivers/acpi/dispatcher/Makefile deleted file mode 100644 index eb7e602a83c..00000000000 --- a/drivers/acpi/dispatcher/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := dsfield.o dsmthdat.o dsopcode.o dswexec.o dswscope.o \ - dsmethod.o dsobject.o dsutils.o dswload.o dswstate.o \ - dsinit.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/ec.c b/drivers/acpi/ec.c index 30f3ef236ec..8dfcbb8aff7 100644 --- a/drivers/acpi/ec.c +++ b/drivers/acpi/ec.c @@ -42,7 +42,6 @@ #include <asm/io.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> -#include <acpi/actypes.h> #define ACPI_EC_CLASS "embedded_controller" #define ACPI_EC_DEVICE_NAME "Embedded Controller" @@ -370,7 +369,7 @@ unlock: * Note: samsung nv5000 doesn't work with ec burst mode. 
* http://bugzilla.kernel.org/show_bug.cgi?id=4980 */ -int acpi_ec_burst_enable(struct acpi_ec *ec) +static int acpi_ec_burst_enable(struct acpi_ec *ec) { u8 d; struct transaction t = {.command = ACPI_EC_BURST_ENABLE, @@ -380,7 +379,7 @@ int acpi_ec_burst_enable(struct acpi_ec *ec) return acpi_ec_transaction(ec, &t, 0); } -int acpi_ec_burst_disable(struct acpi_ec *ec) +static int acpi_ec_burst_disable(struct acpi_ec *ec) { struct transaction t = {.command = ACPI_EC_BURST_DISABLE, .wdata = NULL, .rdata = NULL, @@ -756,10 +755,15 @@ static acpi_status acpi_ec_register_query_methods(acpi_handle handle, u32 level, void *context, void **return_value) { - struct acpi_namespace_node *node = handle; + char node_name[5]; + struct acpi_buffer buffer = { sizeof(node_name), node_name }; struct acpi_ec *ec = context; int value = 0; - if (sscanf(node->name.ascii, "_Q%x", &value) == 1) { + acpi_status status; + + status = acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer); + + if (ACPI_SUCCESS(status) && sscanf(node_name, "_Q%x", &value) == 1) { acpi_ec_add_query_handler(ec, value, handle, NULL, NULL); } return AE_OK; @@ -978,9 +982,9 @@ static const struct acpi_device_id ec_device_ids[] = { int __init acpi_ec_ecdt_probe(void) { - int ret; acpi_status status; struct acpi_table_ecdt *ecdt_ptr; + acpi_handle dummy; boot_ec = make_acpi_ec(); if (!boot_ec) @@ -1006,30 +1010,31 @@ int __init acpi_ec_ecdt_probe(void) boot_ec->gpe = ecdt_ptr->gpe; boot_ec->handle = ACPI_ROOT_OBJECT; acpi_get_handle(ACPI_ROOT_OBJECT, ecdt_ptr->id, &boot_ec->handle); - } else { - /* This workaround is needed only on some broken machines, - * which require early EC, but fail to provide ECDT */ - acpi_handle x; - printk(KERN_DEBUG PREFIX "Look up EC in DSDT\n"); - status = acpi_get_devices(ec_device_ids[0].id, ec_parse_device, - boot_ec, NULL); - /* Check that acpi_get_devices actually find something */ - if (ACPI_FAILURE(status) || !boot_ec->handle) - goto error; - /* We really need to limit this workaround, the only ASUS, - * which needs it, has fake EC._INI method, so use it as flag. - * Keep boot_ec struct as it will be needed soon. - */ - if (ACPI_FAILURE(acpi_get_handle(boot_ec->handle, "_INI", &x))) - return -ENODEV; + /* Add some basic check against completely broken table */ + if (boot_ec->data_addr != boot_ec->command_addr) + goto install; + /* fall through */ } - - ret = ec_install_handlers(boot_ec); - if (!ret) { + /* This workaround is needed only on some broken machines, + * which require early EC, but fail to provide ECDT */ + printk(KERN_DEBUG PREFIX "Look up EC in DSDT\n"); + status = acpi_get_devices(ec_device_ids[0].id, ec_parse_device, + boot_ec, NULL); + /* Check that acpi_get_devices actually find something */ + if (ACPI_FAILURE(status) || !boot_ec->handle) + goto error; + /* We really need to limit this workaround, the only ASUS, + * which needs it, has fake EC._INI method, so use it as flag. + * Keep boot_ec struct as it will be needed soon. 
+ */ + if (ACPI_FAILURE(acpi_get_handle(boot_ec->handle, "_INI", &dummy))) + return -ENODEV; +install: + if (!ec_install_handlers(boot_ec)) { first_ec = boot_ec; return 0; } - error: +error: kfree(boot_ec); boot_ec = NULL; return -ENODEV; diff --git a/drivers/acpi/events/Makefile b/drivers/acpi/events/Makefile deleted file mode 100644 index d29f2ee449c..00000000000 --- a/drivers/acpi/events/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := evevent.o evregion.o evsci.o evxfevnt.o \ - evmisc.o evrgnini.o evxface.o evxfregn.o \ - evgpe.o evgpeblk.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/executer/Makefile b/drivers/acpi/executer/Makefile deleted file mode 100644 index e09998aa012..00000000000 --- a/drivers/acpi/executer/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := exconfig.o exfield.o exnames.o exoparg6.o exresolv.o exstorob.o\ - exconvrt.o exfldio.o exoparg1.o exprep.o exresop.o exsystem.o\ - excreate.o exmisc.o exoparg2.o exregion.o exstore.o exutils.o \ - exdump.o exmutex.o exoparg3.o exresnte.o exstoren.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/hardware/Makefile b/drivers/acpi/hardware/Makefile deleted file mode 100644 index 438ad373b9a..00000000000 --- a/drivers/acpi/hardware/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := hwacpi.o hwgpe.o hwregs.o hwsleep.o - -obj-$(ACPI_FUTURE_USAGE) += hwtimer.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/sleep/main.c b/drivers/acpi/main.c index 28a691cc625..7e3c609cbef 100644 --- a/drivers/acpi/sleep/main.c +++ b/drivers/acpi/main.c @@ -101,13 +101,26 @@ void __init acpi_old_suspend_ordering(void) * cases. */ static bool set_sci_en_on_resume; +/* + * The ACPI specification wants us to save NVS memory regions during hibernation + * and to restore them during the subsequent resume. However, it is not certain + * if this mechanism is going to work on all machines, so we allow the user to + * disable this mechanism using the 'acpi_sleep=s4_nonvs' kernel command line + * option. + */ +static bool s4_no_nvs; + +void __init acpi_s4_no_nvs(void) +{ + s4_no_nvs = true; +} /** * acpi_pm_disable_gpes - Disable the GPEs. */ static int acpi_pm_disable_gpes(void) { - acpi_hw_disable_all_gpes(); + acpi_disable_all_gpes(); return 0; } @@ -135,7 +148,7 @@ static int acpi_pm_prepare(void) int error = __acpi_pm_prepare(); if (!error) - acpi_hw_disable_all_gpes(); + acpi_disable_all_gpes(); return error; } @@ -267,7 +280,7 @@ static int acpi_suspend_enter(suspend_state_t pm_state) * (like wakeup GPE) haven't handler, this can avoid such GPE misfire. * acpi_leave_sleep_state will reenable specific GPEs later */ - acpi_hw_disable_all_gpes(); + acpi_disable_all_gpes(); local_irq_restore(flags); printk(KERN_DEBUG "Back to C!\n"); @@ -394,9 +407,25 @@ void __init acpi_no_s4_hw_signature(void) static int acpi_hibernation_begin(void) { - acpi_target_sleep_state = ACPI_STATE_S4; - acpi_sleep_tts_switch(acpi_target_sleep_state); - return 0; + int error; + + error = s4_no_nvs ? 
0 : hibernate_nvs_alloc(); + if (!error) { + acpi_target_sleep_state = ACPI_STATE_S4; + acpi_sleep_tts_switch(acpi_target_sleep_state); + } + + return error; +} + +static int acpi_hibernation_pre_snapshot(void) +{ + int error = acpi_pm_prepare(); + + if (!error) + hibernate_nvs_save(); + + return error; } static int acpi_hibernation_enter(void) @@ -417,6 +446,12 @@ static int acpi_hibernation_enter(void) return ACPI_SUCCESS(status) ? 0 : -EFAULT; } +static void acpi_hibernation_finish(void) +{ + hibernate_nvs_free(); + acpi_pm_finish(); +} + static void acpi_hibernation_leave(void) { /* @@ -432,18 +467,20 @@ static void acpi_hibernation_leave(void) "cannot resume!\n"); panic("ACPI S4 hardware signature mismatch"); } + /* Restore the NVS memory area */ + hibernate_nvs_restore(); } static void acpi_pm_enable_gpes(void) { - acpi_hw_enable_all_runtime_gpes(); + acpi_enable_all_runtime_gpes(); } static struct platform_hibernation_ops acpi_hibernation_ops = { .begin = acpi_hibernation_begin, .end = acpi_pm_end, - .pre_snapshot = acpi_pm_prepare, - .finish = acpi_pm_finish, + .pre_snapshot = acpi_hibernation_pre_snapshot, + .finish = acpi_hibernation_finish, .prepare = acpi_pm_prepare, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, @@ -469,8 +506,22 @@ static int acpi_hibernation_begin_old(void) error = acpi_sleep_prepare(ACPI_STATE_S4); + if (!error) { + if (!s4_no_nvs) + error = hibernate_nvs_alloc(); + if (!error) + acpi_target_sleep_state = ACPI_STATE_S4; + } + return error; +} + +static int acpi_hibernation_pre_snapshot_old(void) +{ + int error = acpi_pm_disable_gpes(); + if (!error) - acpi_target_sleep_state = ACPI_STATE_S4; + hibernate_nvs_save(); + return error; } @@ -481,8 +532,8 @@ static int acpi_hibernation_begin_old(void) static struct platform_hibernation_ops acpi_hibernation_ops_old = { .begin = acpi_hibernation_begin_old, .end = acpi_pm_end, - .pre_snapshot = acpi_pm_disable_gpes, - .finish = acpi_pm_finish, + .pre_snapshot = acpi_hibernation_pre_snapshot_old, + .finish = acpi_hibernation_finish, .prepare = acpi_pm_disable_gpes, .enter = acpi_hibernation_enter, .leave = acpi_hibernation_leave, @@ -622,7 +673,7 @@ static void acpi_power_off_prepare(void) { /* Prepare to power off the system */ acpi_sleep_prepare(ACPI_STATE_S5); - acpi_hw_disable_all_gpes(); + acpi_disable_all_gpes(); } static void acpi_power_off(void) @@ -671,7 +722,7 @@ int __init acpi_sleep_init(void) sleep_states[ACPI_STATE_S4] = 1; printk(" S4"); if (!nosigcheck) { - acpi_get_table_by_index(ACPI_TABLE_INDEX_FACS, + acpi_get_table(ACPI_SIG_FACS, 1, (struct acpi_table_header **)&facs); if (facs) s4_hardware_signature = diff --git a/drivers/acpi/namespace/Makefile b/drivers/acpi/namespace/Makefile deleted file mode 100644 index 371a2daf837..00000000000 --- a/drivers/acpi/namespace/Makefile +++ /dev/null @@ -1,12 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := nsaccess.o nsload.o nssearch.o nsxfeval.o \ - nsalloc.o nseval.o nsnames.o nsutils.o nsxfname.o \ - nsdump.o nsinit.o nsobject.o nswalk.o nsxfobj.o \ - nsparse.o nspredef.o - -obj-$(ACPI_FUTURE_USAGE) += nsdumpdv.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 25ceae9191e..c5e292aab0e 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -29,7 +29,6 @@ #include <linux/errno.h> #include <linux/acpi.h> #include <acpi/acpi_bus.h> -#include <acpi/acmacros.h> #define ACPI_NUMA 0x80000000 #define _COMPONENT ACPI_NUMA diff --git a/drivers/acpi/osl.c 
b/drivers/acpi/osl.c index c8111424dcb..6729a4992f2 100644 --- a/drivers/acpi/osl.c +++ b/drivers/acpi/osl.c @@ -726,7 +726,7 @@ static acpi_status __acpi_os_execute(acpi_execute_type type, dpc = kmalloc(sizeof(struct acpi_os_dpc), GFP_ATOMIC); if (!dpc) - return_ACPI_STATUS(AE_NO_MEMORY); + return AE_NO_MEMORY; dpc->function = function; dpc->context = context; @@ -747,7 +747,7 @@ static acpi_status __acpi_os_execute(acpi_execute_type type, status = AE_ERROR; kfree(dpc); } - return_ACPI_STATUS(status); + return status; } acpi_status acpi_os_execute(acpi_execute_type type, diff --git a/drivers/acpi/parser/Makefile b/drivers/acpi/parser/Makefile deleted file mode 100644 index db24ee09cf1..00000000000 --- a/drivers/acpi/parser/Makefile +++ /dev/null @@ -1,8 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := psargs.o psparse.o psloop.o pstree.o pswalk.o \ - psopcode.o psscope.o psutils.o psxface.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/pci_bind.c b/drivers/acpi/pci_bind.c index 4b252ea0e95..95650f83ce2 100644 --- a/drivers/acpi/pci_bind.c +++ b/drivers/acpi/pci_bind.c @@ -99,7 +99,7 @@ acpi_status acpi_get_pci_id(acpi_handle handle, struct acpi_pci_id *id) */ ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Device %s has PCI address %02x:%02x:%02x.%02x\n", + "Device %s has PCI address %04x:%02x:%02x.%d\n", acpi_device_bid(device), id->segment, id->bus, id->device, id->function)); @@ -111,12 +111,11 @@ EXPORT_SYMBOL(acpi_get_pci_id); int acpi_pci_bind(struct acpi_device *device) { int result = 0; - acpi_status status = AE_OK; - struct acpi_pci_data *data = NULL; - struct acpi_pci_data *pdata = NULL; - char *pathname = NULL; - struct acpi_buffer buffer = { 0, NULL }; - acpi_handle handle = NULL; + acpi_status status; + struct acpi_pci_data *data; + struct acpi_pci_data *pdata; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + acpi_handle handle; struct pci_dev *dev; struct pci_bus *bus; @@ -124,21 +123,18 @@ int acpi_pci_bind(struct acpi_device *device) if (!device || !device->parent) return -EINVAL; - pathname = kzalloc(ACPI_PATHNAME_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; - buffer.length = ACPI_PATHNAME_MAX; - buffer.pointer = pathname; - data = kzalloc(sizeof(struct acpi_pci_data), GFP_KERNEL); - if (!data) { - kfree(pathname); + if (!data) return -ENOMEM; + + status = acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); + if (ACPI_FAILURE(status)) { + kfree(data); + return -ENODEV; } - acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Binding PCI device [%s]...\n", - pathname)); + (char *)buffer.pointer)); /* * Segment & Bus @@ -166,7 +162,7 @@ int acpi_pci_bind(struct acpi_device *device) data->id.device = device->pnp.bus_address >> 16; data->id.function = device->pnp.bus_address & 0xFFFF; - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "...to %02x:%02x:%02x.%02x\n", + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "...to %04x:%02x:%02x.%d\n", data->id.segment, data->id.bus, data->id.device, data->id.function)); @@ -196,7 +192,7 @@ int acpi_pci_bind(struct acpi_device *device) } if (!data->dev) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Device %02x:%02x:%02x.%02x not present in PCI namespace\n", + "Device %04x:%02x:%02x.%d not present in PCI namespace\n", data->id.segment, data->id.bus, data->id.device, data->id.function)); result = -ENODEV; @@ -204,7 +200,7 @@ int acpi_pci_bind(struct acpi_device *device) } if (!data->dev->bus) { printk(KERN_ERR PREFIX - "Device %02x:%02x:%02x.%02x has invalid 'bus' field\n", 
+ "Device %04x:%02x:%02x.%d has invalid 'bus' field\n", data->id.segment, data->id.bus, data->id.device, data->id.function); result = -ENODEV; @@ -219,7 +215,7 @@ int acpi_pci_bind(struct acpi_device *device) */ if (data->dev->subordinate) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Device %02x:%02x:%02x.%02x is a PCI bridge\n", + "Device %04x:%02x:%02x.%d is a PCI bridge\n", data->id.segment, data->id.bus, data->id.device, data->id.function)); data->bus = data->dev->subordinate; @@ -262,7 +258,7 @@ int acpi_pci_bind(struct acpi_device *device) } end: - kfree(pathname); + kfree(buffer.pointer); if (result) kfree(data); @@ -272,25 +268,21 @@ int acpi_pci_bind(struct acpi_device *device) static int acpi_pci_unbind(struct acpi_device *device) { int result = 0; - acpi_status status = AE_OK; - struct acpi_pci_data *data = NULL; - char *pathname = NULL; - struct acpi_buffer buffer = { 0, NULL }; + acpi_status status; + struct acpi_pci_data *data; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; if (!device || !device->parent) return -EINVAL; - pathname = kzalloc(ACPI_PATHNAME_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; + status = acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; - buffer.length = ACPI_PATHNAME_MAX; - buffer.pointer = pathname; - acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Unbinding PCI device [%s]...\n", - pathname)); - kfree(pathname); + (char *) buffer.pointer)); + kfree(buffer.pointer); status = acpi_get_data(device->handle, acpi_pci_data_handler, @@ -322,50 +314,44 @@ acpi_pci_bind_root(struct acpi_device *device, struct acpi_pci_id *id, struct pci_bus *bus) { int result = 0; - acpi_status status = AE_OK; + acpi_status status; struct acpi_pci_data *data = NULL; - char *pathname = NULL; - struct acpi_buffer buffer = { 0, NULL }; - - pathname = kzalloc(ACPI_PATHNAME_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; - - buffer.length = ACPI_PATHNAME_MAX; - buffer.pointer = pathname; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; if (!device || !id || !bus) { - kfree(pathname); return -EINVAL; } data = kzalloc(sizeof(struct acpi_pci_data), GFP_KERNEL); - if (!data) { - kfree(pathname); + if (!data) return -ENOMEM; - } data->id = *id; data->bus = bus; device->ops.bind = acpi_pci_bind; device->ops.unbind = acpi_pci_unbind; - acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); + status = acpi_get_name(device->handle, ACPI_FULL_PATHNAME, &buffer); + if (ACPI_FAILURE(status)) { + kfree (data); + return -ENODEV; + } ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Binding PCI root bridge [%s] to " - "%02x:%02x\n", pathname, id->segment, id->bus)); + "%04x:%02x\n", (char *)buffer.pointer, + id->segment, id->bus)); status = acpi_attach_data(device->handle, acpi_pci_data_handler, data); if (ACPI_FAILURE(status)) { ACPI_EXCEPTION((AE_INFO, status, "Unable to attach ACPI-PCI context to device %s", - pathname)); + (char *)buffer.pointer)); result = -ENODEV; goto end; } end: - kfree(pathname); + kfree(buffer.pointer); if (result != 0) kfree(data); diff --git a/drivers/acpi/pci_irq.c b/drivers/acpi/pci_irq.c index bf79d83bdfb..891bdf6679f 100644 --- a/drivers/acpi/pci_irq.c +++ b/drivers/acpi/pci_irq.c @@ -4,6 +4,8 @@ * Copyright (C) 2001, 2002 Andy Grover <andrew.grover@intel.com> * Copyright (C) 2001, 2002 Paul Diefenbaugh <paul.s.diefenbaugh@intel.com> * Copyright (C) 2002 Dominik Brodowski <devel@brodo.de> + * (c) Copyright 2008 Hewlett-Packard Development 
Company, L.P. + * Bjorn Helgaas <bjorn.helgaas@hp.com> * * ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ * @@ -41,29 +43,36 @@ #define _COMPONENT ACPI_PCI_COMPONENT ACPI_MODULE_NAME("pci_irq"); -static struct acpi_prt_list acpi_prt; +struct acpi_prt_entry { + struct list_head list; + struct acpi_pci_id id; + u8 pin; + acpi_handle link; + u32 index; /* GSI, or link _CRS index */ +}; + +static LIST_HEAD(acpi_prt_list); static DEFINE_SPINLOCK(acpi_prt_lock); +static inline char pin_name(int pin) +{ + return 'A' + pin - 1; +} + /* -------------------------------------------------------------------------- PCI IRQ Routing Table (PRT) Support -------------------------------------------------------------------------- */ -static struct acpi_prt_entry *acpi_pci_irq_find_prt_entry(int segment, - int bus, - int device, int pin) +static struct acpi_prt_entry *acpi_pci_irq_find_prt_entry(struct pci_dev *dev, + int pin) { - struct acpi_prt_entry *entry = NULL; - - if (!acpi_prt.count) - return NULL; + struct acpi_prt_entry *entry; + int segment = pci_domain_nr(dev->bus); + int bus = dev->bus->number; + int device = PCI_SLOT(dev->devfn); - /* - * Parse through all PRT entries looking for a match on the specified - * PCI device's segment, bus, device, and pin (don't care about func). - * - */ spin_lock(&acpi_prt_lock); - list_for_each_entry(entry, &acpi_prt.entries, node) { + list_for_each_entry(entry, &acpi_prt_list, list) { if ((segment == entry->id.segment) && (bus == entry->id.bus) && (device == entry->id.device) @@ -72,7 +81,6 @@ static struct acpi_prt_entry *acpi_pci_irq_find_prt_entry(int segment, return entry; } } - spin_unlock(&acpi_prt_lock); return NULL; } @@ -124,25 +132,27 @@ struct prt_quirk { char *actual_source; }; +#define PCI_INTX_PIN(c) (c - 'A' + 1) + /* * These systems have incorrect _PRT entries. The BIOS claims the PCI * interrupt at the listed segment/bus/device/pin is connected to the first * link device, but it is actually connected to the second. 
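
A minimal sketch of the acpi_get_name() pattern the pci_bind.c hunks above switch to: instead of a caller-supplied ACPI_PATHNAME_MAX buffer, ACPICA sizes and allocates the pathname itself, so the caller only checks the status and frees buffer.pointer. The format-string changes alongside it print PCI addresses in the usual domain:bus:device.function form (%04x:%02x:%02x.%d). The helper name below is illustrative, not part of the patch:

static int example_print_path(acpi_handle handle)
{
	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
	acpi_status status;

	status = acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer);
	if (ACPI_FAILURE(status))
		return -ENODEV;

	printk(KERN_DEBUG "ACPI path: %s\n", (char *)buffer.pointer);
	kfree(buffer.pointer);	/* caller owns the ACPICA-allocated string */
	return 0;
}
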
*/ static struct prt_quirk prt_quirks[] = { - { medion_md9580, 0, 0, 9, 'A', + { medion_md9580, 0, 0, 9, PCI_INTX_PIN('A'), "\\_SB_.PCI0.ISA_.LNKA", "\\_SB_.PCI0.ISA_.LNKB"}, - { dell_optiplex, 0, 0, 0xd, 'A', + { dell_optiplex, 0, 0, 0xd, PCI_INTX_PIN('A'), "\\_SB_.LNKB", "\\_SB_.LNKA"}, - { hp_t5710, 0, 0, 1, 'A', + { hp_t5710, 0, 0, 1, PCI_INTX_PIN('A'), "\\_SB_.PCI0.LNK1", "\\_SB_.PCI0.LNK3"}, }; -static void -do_prt_fixups(struct acpi_prt_entry *entry, struct acpi_pci_routing_table *prt) +static void do_prt_fixups(struct acpi_prt_entry *entry, + struct acpi_pci_routing_table *prt) { int i; struct prt_quirk *quirk; @@ -158,42 +168,43 @@ do_prt_fixups(struct acpi_prt_entry *entry, struct acpi_pci_routing_table *prt) entry->id.segment == quirk->segment && entry->id.bus == quirk->bus && entry->id.device == quirk->device && - entry->pin + 'A' == quirk->pin && + entry->pin == quirk->pin && !strcmp(prt->source, quirk->source) && strlen(prt->source) >= strlen(quirk->actual_source)) { printk(KERN_WARNING PREFIX "firmware reports " "%04x:%02x:%02x PCI INT %c connected to %s; " "changing to %s\n", entry->id.segment, entry->id.bus, - entry->id.device, 'A' + entry->pin, + entry->id.device, pin_name(entry->pin), prt->source, quirk->actual_source); strcpy(prt->source, quirk->actual_source); } } } -static int -acpi_pci_irq_add_entry(acpi_handle handle, - int segment, int bus, struct acpi_pci_routing_table *prt) +static int acpi_pci_irq_add_entry(acpi_handle handle, int segment, int bus, + struct acpi_pci_routing_table *prt) { - struct acpi_prt_entry *entry = NULL; - - - if (!prt) - return -EINVAL; + struct acpi_prt_entry *entry; entry = kzalloc(sizeof(struct acpi_prt_entry), GFP_KERNEL); if (!entry) return -ENOMEM; + /* + * Note that the _PRT uses 0=INTA, 1=INTB, etc, while PCI uses + * 1=INTA, 2=INTB. We use the PCI encoding throughout, so convert + * it here. + */ entry->id.segment = segment; entry->id.bus = bus; entry->id.device = (prt->address >> 16) & 0xFFFF; - entry->id.function = prt->address & 0xFFFF; - entry->pin = prt->pin; + entry->pin = prt->pin + 1; do_prt_fixups(entry, prt); + entry->index = prt->source_index; + /* * Type 1: Dynamic * --------------- @@ -207,10 +218,9 @@ acpi_pci_irq_add_entry(acpi_handle handle, * (e.g. exists somewhere 'below' this _PRT entry in the ACPI * namespace). */ - if (prt->source[0]) { - acpi_get_handle(handle, prt->source, &entry->link.handle); - entry->link.index = prt->source_index; - } + if (prt->source[0]) + acpi_get_handle(handle, prt->source, &entry->link); + /* * Type 2: Static * -------------- @@ -218,84 +228,38 @@ acpi_pci_irq_add_entry(acpi_handle handle, * the IRQ value, which is hardwired to specific interrupt inputs on * the interrupt controller. 
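
The conversion note above is the key to the rest of the pci_irq.c rework: the _PRT encodes pins as 0=INTA..3=INTD, while PCI config space (and now this file) uses 1=INTA..4=INTD, so entries are converted once when they are added, and pin_name()/PCI_INTX_PIN() translate between numbers and letters. A small sketch, not taken from the patch:

static void pin_encoding_example(void)
{
	u8 prt_pin = 0;			/* INTA as it appears in a _PRT */
	u8 pci_pin = prt_pin + 1;	/* PCI encoding kept in ->pin   */

	/* pin_name(1) == 'A' and PCI_INTX_PIN('A') == 1 */
	printk(KERN_DEBUG "PCI INT %c\n", 'A' + pci_pin - 1);
}
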
*/ - else - entry->link.index = prt->source_index; ACPI_DEBUG_PRINT_RAW((ACPI_DB_INFO, - " %02X:%02X:%02X[%c] -> %s[%d]\n", + " %04x:%02x:%02x[%c] -> %s[%d]\n", entry->id.segment, entry->id.bus, - entry->id.device, ('A' + entry->pin), prt->source, - entry->link.index)); + entry->id.device, pin_name(entry->pin), + prt->source, entry->index)); spin_lock(&acpi_prt_lock); - list_add_tail(&entry->node, &acpi_prt.entries); - acpi_prt.count++; + list_add_tail(&entry->list, &acpi_prt_list); spin_unlock(&acpi_prt_lock); return 0; } -static void -acpi_pci_irq_del_entry(int segment, int bus, struct acpi_prt_entry *entry) -{ - if (segment == entry->id.segment && bus == entry->id.bus) { - acpi_prt.count--; - list_del(&entry->node); - kfree(entry); - } -} - int acpi_pci_irq_add_prt(acpi_handle handle, int segment, int bus) { - acpi_status status = AE_OK; - char *pathname = NULL; - struct acpi_buffer buffer = { 0, NULL }; - struct acpi_pci_routing_table *prt = NULL; - struct acpi_pci_routing_table *entry = NULL; - static int first_time = 1; - - - pathname = kzalloc(ACPI_PATHNAME_MAX, GFP_KERNEL); - if (!pathname) - return -ENOMEM; - - if (first_time) { - acpi_prt.count = 0; - INIT_LIST_HEAD(&acpi_prt.entries); - first_time = 0; - } - - /* - * NOTE: We're given a 'handle' to the _PRT object's parent device - * (either a PCI root bridge or PCI-PCI bridge). - */ + acpi_status status; + struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL }; + struct acpi_pci_routing_table *entry; - buffer.length = ACPI_PATHNAME_MAX; - buffer.pointer = pathname; - acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); + /* 'handle' is the _PRT's parent (root bridge or PCI-PCI bridge) */ + status = acpi_get_name(handle, ACPI_FULL_PATHNAME, &buffer); + if (ACPI_FAILURE(status)) + return -ENODEV; printk(KERN_DEBUG "ACPI: PCI Interrupt Routing Table [%s._PRT]\n", - pathname); + (char *) buffer.pointer); - /* - * Evaluate this _PRT and add its entries to our global list (acpi_prt). 
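
The acpi_pci_irq_add_prt() rework around this point replaces the old two-pass table fetch (probe for AE_BUFFER_OVERFLOW, kzalloc, fetch again) with a single ACPI_ALLOCATE_BUFFER request. A condensed sketch of that fetch-and-walk, with the per-entry work elided and the helper name made up:

static int prt_walk_example(acpi_handle handle)
{
	struct acpi_buffer buffer = { ACPI_ALLOCATE_BUFFER, NULL };
	struct acpi_pci_routing_table *entry;
	acpi_status status;

	status = acpi_get_irq_routing_table(handle, &buffer);
	if (ACPI_FAILURE(status))
		return -ENODEV;

	/* entries are variable length; a zero length terminates the table */
	for (entry = buffer.pointer;
	     entry && entry->length > 0;
	     entry = (void *)((unsigned long)entry + entry->length))
		;	/* acpi_pci_irq_add_entry(handle, segment, bus, entry) */

	kfree(buffer.pointer);
	return 0;
}
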
- */ + kfree(buffer.pointer); - buffer.length = 0; + buffer.length = ACPI_ALLOCATE_BUFFER; buffer.pointer = NULL; - kfree(pathname); - status = acpi_get_irq_routing_table(handle, &buffer); - if (status != AE_BUFFER_OVERFLOW) { - ACPI_EXCEPTION((AE_INFO, status, "Evaluating _PRT [%s]", - acpi_format_exception(status))); - return -ENODEV; - } - - prt = kzalloc(buffer.length, GFP_KERNEL); - if (!prt) { - return -ENOMEM; - } - buffer.pointer = prt; status = acpi_get_irq_routing_table(handle, &buffer); if (ACPI_FAILURE(status)) { @@ -305,36 +269,30 @@ int acpi_pci_irq_add_prt(acpi_handle handle, int segment, int bus) return -ENODEV; } - entry = prt; - + entry = buffer.pointer; while (entry && (entry->length > 0)) { acpi_pci_irq_add_entry(handle, segment, bus, entry); entry = (struct acpi_pci_routing_table *) ((unsigned long)entry + entry->length); } - kfree(prt); - + kfree(buffer.pointer); return 0; } void acpi_pci_irq_del_prt(int segment, int bus) { - struct list_head *node = NULL, *n = NULL; - struct acpi_prt_entry *entry = NULL; - - if (!acpi_prt.count) { - return; - } + struct acpi_prt_entry *entry, *tmp; printk(KERN_DEBUG - "ACPI: Delete PCI Interrupt Routing Table for %x:%x\n", segment, - bus); + "ACPI: Delete PCI Interrupt Routing Table for %04x:%02x\n", + segment, bus); spin_lock(&acpi_prt_lock); - list_for_each_safe(node, n, &acpi_prt.entries) { - entry = list_entry(node, struct acpi_prt_entry, node); - - acpi_pci_irq_del_entry(segment, bus, entry); + list_for_each_entry_safe(entry, tmp, &acpi_prt_list, list) { + if (segment == entry->id.segment && bus == entry->id.bus) { + list_del(&entry->list); + kfree(entry); + } } spin_unlock(&acpi_prt_lock); } @@ -342,162 +300,26 @@ void acpi_pci_irq_del_prt(int segment, int bus) /* -------------------------------------------------------------------------- PCI Interrupt Routing Support -------------------------------------------------------------------------- */ -typedef int (*irq_lookup_func) (struct acpi_prt_entry *, int *, int *, char **); - -static int -acpi_pci_allocate_irq(struct acpi_prt_entry *entry, - int *triggering, int *polarity, char **link) -{ - int irq; - - - if (entry->link.handle) { - irq = acpi_pci_link_allocate_irq(entry->link.handle, - entry->link.index, triggering, - polarity, link); - if (irq < 0) { - printk(KERN_WARNING PREFIX - "Invalid IRQ link routing entry\n"); - return -1; - } - } else { - irq = entry->link.index; - *triggering = ACPI_LEVEL_SENSITIVE; - *polarity = ACPI_ACTIVE_LOW; - } - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found IRQ %d\n", irq)); - return irq; -} - -static int -acpi_pci_free_irq(struct acpi_prt_entry *entry, - int *triggering, int *polarity, char **link) -{ - int irq; - - if (entry->link.handle) { - irq = acpi_pci_link_free_irq(entry->link.handle); - } else { - irq = entry->link.index; - } - return irq; -} - -#ifdef CONFIG_X86_IO_APIC -extern int noioapicquirk; - -static int bridge_has_boot_interrupt_variant(struct pci_bus *bus) +static struct acpi_prt_entry *acpi_pci_irq_lookup(struct pci_dev *dev, int pin) { - struct pci_bus *bus_it; - - for (bus_it = bus ; bus_it ; bus_it = bus_it->parent) { - if (!bus_it->self) - return 0; - - printk(KERN_INFO "vendor=%04x device=%04x\n", bus_it->self->vendor, - bus_it->self->device); - - if (bus_it->self->irq_reroute_variant) - return bus_it->self->irq_reroute_variant; - } - return 0; -} -#endif /* CONFIG_X86_IO_APIC */ - -/* - * acpi_pci_irq_lookup - * success: return IRQ >= 0 - * failure: return -1 - */ -static int -acpi_pci_irq_lookup(struct pci_bus *bus, - int 
device, - int pin, - int *triggering, - int *polarity, char **link, irq_lookup_func func) -{ - struct acpi_prt_entry *entry = NULL; - int segment = pci_domain_nr(bus); - int bus_nr = bus->number; - int ret; - - - ACPI_DEBUG_PRINT((ACPI_DB_INFO, - "Searching for PRT entry for %02x:%02x:%02x[%c]\n", - segment, bus_nr, device, ('A' + pin))); - - entry = acpi_pci_irq_find_prt_entry(segment, bus_nr, device, pin); - if (!entry) { - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "PRT entry not found\n")); - return -1; - } - - ret = func(entry, triggering, polarity, link); - -#ifdef CONFIG_X86_IO_APIC - /* - * Some chipsets (e.g. intel 6700PXH) generate a legacy INTx when the - * IRQ entry in the chipset's IO-APIC is masked (as, e.g. the RT kernel - * does during interrupt handling). When this INTx generation cannot be - * disabled, we reroute these interrupts to their legacy equivalent to - * get rid of spurious interrupts. - */ - if (!noioapicquirk) { - switch (bridge_has_boot_interrupt_variant(bus)) { - case 0: - /* no rerouting necessary */ - break; - - case INTEL_IRQ_REROUTE_VARIANT: - /* - * Remap according to INTx routing table in 6700PXH - * specs, intel order number 302628-002, section - * 2.15.2. Other chipsets (80332, ...) have the same - * mapping and are handled here as well. - */ - printk(KERN_INFO "pci irq %d -> rerouted to legacy " - "irq %d\n", ret, (ret % 4) + 16); - ret = (ret % 4) + 16; - break; - - default: - printk(KERN_INFO "not rerouting irq %d to legacy irq: " - "unknown mapping\n", ret); - break; - } + struct acpi_prt_entry *entry; + struct pci_dev *bridge; + u8 bridge_pin, orig_pin = pin; + + entry = acpi_pci_irq_find_prt_entry(dev, pin); + if (entry) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Found %s[%c] _PRT entry\n", + pci_name(dev), pin_name(pin))); + return entry; } -#endif /* CONFIG_X86_IO_APIC */ - - return ret; -} - -/* - * acpi_pci_irq_derive - * success: return IRQ >= 0 - * failure: return < 0 - */ -static int -acpi_pci_irq_derive(struct pci_dev *dev, - int pin, - int *triggering, - int *polarity, char **link, irq_lookup_func func) -{ - struct pci_dev *bridge = dev; - int irq = -1; - u8 bridge_pin = 0, orig_pin = pin; - - - if (!dev) - return -EINVAL; /* * Attempt to derive an IRQ for this device from a parent bridge's * PCI interrupt routing entry (eg. yenta bridge and add-in card bridge). 
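
With the 1-based pin encoding, the bridge walk that follows applies the standard INTx swizzle at each hop before searching the parent's _PRT:

	pin = (((pin - 1) + PCI_SLOT(dev->devfn)) % 4) + 1

For example, a device in slot 3 asserting INTB (pin 2) appears to its parent bridge as (((2 - 1) + 3) % 4) + 1 = 1, i.e. INTA, so the parent's _PRT is searched for INTA at the bridge's slot.
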
*/ - while (irq < 0 && bridge->bus->self) { - pin = (pin + PCI_SLOT(bridge->devfn)) % 4; - bridge = bridge->bus->self; + bridge = dev->bus->self; + while (bridge) { + pin = (((pin - 1) + PCI_SLOT(dev->devfn)) % 4) + 1; if ((bridge->class >> 8) == PCI_CLASS_BRIDGE_CARDBUS) { /* PC card has the same IRQ as its cardbridge */ @@ -506,50 +328,40 @@ acpi_pci_irq_derive(struct pci_dev *dev, ACPI_DEBUG_PRINT((ACPI_DB_INFO, "No interrupt pin configured for device %s\n", pci_name(bridge))); - return -1; + return NULL; } - /* Pin is from 0 to 3 */ - bridge_pin--; pin = bridge_pin; } - irq = acpi_pci_irq_lookup(bridge->bus, PCI_SLOT(bridge->devfn), - pin, triggering, polarity, - link, func); - } + entry = acpi_pci_irq_find_prt_entry(bridge, pin); + if (entry) { + ACPI_DEBUG_PRINT((ACPI_DB_INFO, + "Derived GSI for %s INT %c from %s\n", + pci_name(dev), pin_name(orig_pin), + pci_name(bridge))); + return entry; + } - if (irq < 0) { - dev_warn(&dev->dev, "can't derive routing for PCI INT %c\n", - 'A' + orig_pin); - return -1; + dev = bridge; + bridge = dev->bus->self; } - ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Derive IRQ %d for device %s from %s\n", - irq, pci_name(dev), pci_name(bridge))); - - return irq; + dev_warn(&dev->dev, "can't derive routing for PCI INT %c\n", + pin_name(orig_pin)); + return NULL; } -/* - * acpi_pci_irq_enable - * success: return 0 - * failure: return < 0 - */ - int acpi_pci_irq_enable(struct pci_dev *dev) { - int irq = 0; - u8 pin = 0; + struct acpi_prt_entry *entry; + int gsi; + u8 pin; int triggering = ACPI_LEVEL_SENSITIVE; int polarity = ACPI_ACTIVE_LOW; char *link = NULL; char link_desc[16]; int rc; - - if (!dev) - return -EINVAL; - pin = dev->pin; if (!pin) { ACPI_DEBUG_PRINT((ACPI_DB_INFO, @@ -557,31 +369,9 @@ int acpi_pci_irq_enable(struct pci_dev *dev) pci_name(dev))); return 0; } - pin--; - - if (!dev->bus) { - dev_err(&dev->dev, "invalid (NULL) 'bus' field\n"); - return -ENODEV; - } - - /* - * First we check the PCI IRQ routing table (PRT) for an IRQ. PRT - * values override any BIOS-assigned IRQs set during boot. - */ - irq = acpi_pci_irq_lookup(dev->bus, PCI_SLOT(dev->devfn), pin, - &triggering, &polarity, &link, - acpi_pci_allocate_irq); - - /* - * If no PRT entry was found, we'll try to derive an IRQ from the - * device's parent bridge. - */ - if (irq < 0) - irq = acpi_pci_irq_derive(dev, pin, &triggering, - &polarity, &link, - acpi_pci_allocate_irq); - if (irq < 0) { + entry = acpi_pci_irq_lookup(dev, pin); + if (!entry) { /* * IDE legacy mode controller IRQs are magic. Why do compat * extensions always make such a nasty mess. @@ -590,12 +380,24 @@ int acpi_pci_irq_enable(struct pci_dev *dev) (dev->class & 0x05) == 0) return 0; } + + if (entry) { + if (entry->link) + gsi = acpi_pci_link_allocate_irq(entry->link, + entry->index, + &triggering, &polarity, + &link); + else + gsi = entry->index; + } else + gsi = -1; + /* * No IRQ known to the ACPI subsystem - maybe the BIOS / * driver reported one, then use it. Exit in any case. 
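
Once a _PRT entry is found, acpi_pci_irq_enable() either asks the interrupt link device for a GSI or uses the hardwired index straight from the entry, then registers the GSI. A condensed, hedged sketch of that path (the helper name is made up; the IDE legacy special case and the dev->irq fallback are omitted):

static int gsi_selection_example(struct pci_dev *dev,
				 struct acpi_prt_entry *entry)
{
	int triggering = ACPI_LEVEL_SENSITIVE;
	int polarity = ACPI_ACTIVE_LOW;
	char *link = NULL;
	int gsi, rc;

	if (entry->link)
		gsi = acpi_pci_link_allocate_irq(entry->link, entry->index,
						 &triggering, &polarity, &link);
	else
		gsi = entry->index;	/* hardwired GSI from the _PRT */

	rc = acpi_register_gsi(gsi, triggering, polarity);
	if (rc < 0)
		return rc;

	dev->irq = rc;
	return 0;
}
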
*/ - if (irq < 0) { - dev_warn(&dev->dev, "PCI INT %c: no GSI", 'A' + pin); + if (gsi < 0) { + dev_warn(&dev->dev, "PCI INT %c: no GSI", pin_name(pin)); /* Interrupt Line values above 0xF are forbidden */ if (dev->irq > 0 && (dev->irq <= 0xF)) { printk(" - using IRQ %d\n", dev->irq); @@ -608,10 +410,10 @@ int acpi_pci_irq_enable(struct pci_dev *dev) } } - rc = acpi_register_gsi(irq, triggering, polarity); + rc = acpi_register_gsi(gsi, triggering, polarity); if (rc < 0) { dev_warn(&dev->dev, "PCI INT %c: failed to register GSI\n", - 'A' + pin); + pin_name(pin)); return rc; } dev->irq = rc; @@ -622,7 +424,7 @@ int acpi_pci_irq_enable(struct pci_dev *dev) link_desc[0] = '\0'; dev_info(&dev->dev, "PCI INT %c%s -> GSI %u (%s, %s) -> IRQ %d\n", - 'A' + pin, link_desc, irq, + pin_name(pin), link_desc, gsi, (triggering == ACPI_LEVEL_SENSITIVE) ? "level" : "edge", (polarity == ACPI_ACTIVE_LOW) ? "low" : "high", dev->irq); @@ -636,42 +438,28 @@ void __attribute__ ((weak)) acpi_unregister_gsi(u32 i) void acpi_pci_irq_disable(struct pci_dev *dev) { - int gsi = 0; - u8 pin = 0; - int triggering = ACPI_LEVEL_SENSITIVE; - int polarity = ACPI_ACTIVE_LOW; - - - if (!dev || !dev->bus) - return; + struct acpi_prt_entry *entry; + int gsi; + u8 pin; pin = dev->pin; if (!pin) return; - pin--; - /* - * First we check the PCI IRQ routing table (PRT) for an IRQ. - */ - gsi = acpi_pci_irq_lookup(dev->bus, PCI_SLOT(dev->devfn), pin, - &triggering, &polarity, NULL, - acpi_pci_free_irq); - /* - * If no PRT entry was found, we'll try to derive an IRQ from the - * device's parent bridge. - */ - if (gsi < 0) - gsi = acpi_pci_irq_derive(dev, pin, - &triggering, &polarity, NULL, - acpi_pci_free_irq); - if (gsi < 0) + entry = acpi_pci_irq_lookup(dev, pin); + if (!entry) return; + if (entry->link) + gsi = acpi_pci_link_free_irq(entry->link); + else + gsi = entry->index; + /* * TBD: It might be worth clearing dev->irq by magic constant * (e.g. PCI_UNDEFINED_IRQ). */ - dev_info(&dev->dev, "PCI INT %c disabled\n", 'A' + pin); + dev_info(&dev->dev, "PCI INT %c disabled\n", pin_name(pin)); acpi_unregister_gsi(gsi); } diff --git a/drivers/acpi/pci_link.c b/drivers/acpi/pci_link.c index e52ad91ce2d..1c6e73c7865 100644 --- a/drivers/acpi/pci_link.c +++ b/drivers/acpi/pci_link.c @@ -796,10 +796,6 @@ static int irqrouter_resume(struct sys_device *dev) struct list_head *node = NULL; struct acpi_pci_link *link = NULL; - - /* Make sure SCI is enabled again (Apple firmware bug?) 
*/ - acpi_set_register(ACPI_BITREG_SCI_ENABLE, 1); - list_for_each(node, &acpi_link.entries) { link = list_entry(node, struct acpi_pci_link, node); if (!link) { @@ -912,7 +908,7 @@ static int __init acpi_irq_nobalance_set(char *str) __setup("acpi_irq_nobalance", acpi_irq_nobalance_set); -int __init acpi_irq_balance_set(char *str) +static int __init acpi_irq_balance_set(char *str) { acpi_irq_balance = 1; return 1; diff --git a/drivers/acpi/power.c b/drivers/acpi/power.c index bb7d50dd281..c926e7d4a0d 100644 --- a/drivers/acpi/power.c +++ b/drivers/acpi/power.c @@ -139,6 +139,8 @@ static int acpi_power_get_state(acpi_handle handle, int *state) { acpi_status status = AE_OK; unsigned long long sta = 0; + char node_name[5]; + struct acpi_buffer buffer = { sizeof(node_name), node_name }; if (!handle || !state) @@ -151,8 +153,10 @@ static int acpi_power_get_state(acpi_handle handle, int *state) *state = (sta & 0x01)?ACPI_POWER_RESOURCE_STATE_ON: ACPI_POWER_RESOURCE_STATE_OFF; + acpi_get_name(handle, ACPI_SINGLE_NAME, &buffer); + ACPI_DEBUG_PRINT((ACPI_DB_INFO, "Resource [%s] is %s\n", - acpi_ut_get_node_name(handle), + node_name, *state ? "on" : "off")); return 0; diff --git a/drivers/acpi/sleep/proc.c b/drivers/acpi/proc.c index 4dbc2271acf..428c911dba0 100644 --- a/drivers/acpi/sleep/proc.c +++ b/drivers/acpi/proc.c @@ -28,8 +28,6 @@ static int acpi_system_sleep_seq_show(struct seq_file *seq, void *offset) { int i; - ACPI_FUNCTION_TRACE("acpi_system_sleep_seq_show"); - for (i = 0; i <= ACPI_STATE_S5; i++) { if (sleep_states[i]) { seq_printf(seq, "S%d ", i); @@ -86,49 +84,44 @@ acpi_system_write_sleep(struct file *file, #ifdef HAVE_ACPI_LEGACY_ALARM +static u32 cmos_bcd_read(int offset, int rtc_control); + static int acpi_system_alarm_seq_show(struct seq_file *seq, void *offset) { u32 sec, min, hr; u32 day, mo, yr, cent = 0; + u32 today = 0; unsigned char rtc_control = 0; unsigned long flags; - ACPI_FUNCTION_TRACE("acpi_system_alarm_seq_show"); - spin_lock_irqsave(&rtc_lock, flags); - sec = CMOS_READ(RTC_SECONDS_ALARM); - min = CMOS_READ(RTC_MINUTES_ALARM); - hr = CMOS_READ(RTC_HOURS_ALARM); rtc_control = CMOS_READ(RTC_CONTROL); + sec = cmos_bcd_read(RTC_SECONDS_ALARM, rtc_control); + min = cmos_bcd_read(RTC_MINUTES_ALARM, rtc_control); + hr = cmos_bcd_read(RTC_HOURS_ALARM, rtc_control); /* If we ever get an FACP with proper values... 
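
The acpi/proc.c alarm rework above funnels every RTC read through a cmos_bcd_read() helper instead of reading raw values and converting them all afterwards. The helper is only forward-declared in this hunk; based on the conversion the removed code used to do inline, it presumably looks like this (an assumption, not copied from the patch):

static u32 cmos_bcd_read(int offset, int rtc_control)
{
	u32 val = CMOS_READ(offset);

	/* convert unless the RTC is already configured for binary mode */
	if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD)
		val = bcd2bin(val);
	return val;
}
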
*/ - if (acpi_gbl_FADT.day_alarm) + if (acpi_gbl_FADT.day_alarm) { /* ACPI spec: only low 6 its should be cared */ day = CMOS_READ(acpi_gbl_FADT.day_alarm) & 0x3F; - else - day = CMOS_READ(RTC_DAY_OF_MONTH); + if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) + day = bcd2bin(day); + } else + day = cmos_bcd_read(RTC_DAY_OF_MONTH, rtc_control); if (acpi_gbl_FADT.month_alarm) - mo = CMOS_READ(acpi_gbl_FADT.month_alarm); - else - mo = CMOS_READ(RTC_MONTH); + mo = cmos_bcd_read(acpi_gbl_FADT.month_alarm, rtc_control); + else { + mo = cmos_bcd_read(RTC_MONTH, rtc_control); + today = cmos_bcd_read(RTC_DAY_OF_MONTH, rtc_control); + } if (acpi_gbl_FADT.century) - cent = CMOS_READ(acpi_gbl_FADT.century); + cent = cmos_bcd_read(acpi_gbl_FADT.century, rtc_control); - yr = CMOS_READ(RTC_YEAR); + yr = cmos_bcd_read(RTC_YEAR, rtc_control); spin_unlock_irqrestore(&rtc_lock, flags); - if (!(rtc_control & RTC_DM_BINARY) || RTC_ALWAYS_BCD) { - sec = bcd2bin(sec); - min = bcd2bin(min); - hr = bcd2bin(hr); - day = bcd2bin(day); - mo = bcd2bin(mo); - yr = bcd2bin(yr); - cent = bcd2bin(cent); - } - /* we're trusting the FADT (see above) */ if (!acpi_gbl_FADT.century) /* If we're not trusting the FADT, we should at least make it @@ -153,6 +146,20 @@ static int acpi_system_alarm_seq_show(struct seq_file *seq, void *offset) else yr += cent * 100; + /* + * Show correct dates for alarms up to a month into the future. + * This solves issues for nearly all situations with the common + * 30-day alarm clocks in PC hardware. + */ + if (day < today) { + if (mo < 12) { + mo += 1; + } else { + mo = 1; + yr += 1; + } + } + seq_printf(seq, "%4.4u-", yr); (mo > 12) ? seq_puts(seq, "**-") : seq_printf(seq, "%2.2u-", mo); (day > 31) ? seq_puts(seq, "** ") : seq_printf(seq, "%2.2u ", day); @@ -227,13 +234,11 @@ acpi_system_write_alarm(struct file *file, int adjust = 0; unsigned char rtc_control = 0; - ACPI_FUNCTION_TRACE("acpi_system_write_alarm"); - if (count > sizeof(alarm_string) - 1) - return_VALUE(-EINVAL); + return -EINVAL; if (copy_from_user(alarm_string, buffer, count)) - return_VALUE(-EFAULT); + return -EFAULT; alarm_string[count] = '\0'; @@ -334,7 +339,7 @@ acpi_system_write_alarm(struct file *file, result = 0; end: - return_VALUE(result ? result : count); + return result ? 
result : count; } #endif /* HAVE_ACPI_LEGACY_ALARM */ diff --git a/drivers/acpi/reboot.c b/drivers/acpi/reboot.c index a6b662c00b6..93f91142d7a 100644 --- a/drivers/acpi/reboot.c +++ b/drivers/acpi/reboot.c @@ -42,7 +42,7 @@ void acpi_reboot(void) case ACPI_ADR_SPACE_SYSTEM_MEMORY: case ACPI_ADR_SPACE_SYSTEM_IO: printk(KERN_DEBUG "ACPI MEMORY or I/O RESET_REG.\n"); - acpi_hw_low_level_write(8, reset_value, rr); + acpi_reset(); break; } /* Wait ten seconds */ diff --git a/drivers/acpi/resources/Makefile b/drivers/acpi/resources/Makefile deleted file mode 100644 index 8de4f69dfa0..00000000000 --- a/drivers/acpi/resources/Makefile +++ /dev/null @@ -1,10 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := rsaddr.o rscreate.o rsinfo.o rsio.o rslist.o rsmisc.o rsxface.o \ - rscalc.o rsirq.o rsmemory.o rsutils.o - -obj-$(ACPI_FUTURE_USAGE) += rsdump.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/sbshc.c b/drivers/acpi/sbshc.c index e53e590252c..0619734895b 100644 --- a/drivers/acpi/sbshc.c +++ b/drivers/acpi/sbshc.c @@ -10,7 +10,6 @@ #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> -#include <acpi/actypes.h> #include <linux/wait.h> #include <linux/delay.h> #include <linux/interrupt.h> diff --git a/drivers/acpi/scan.c b/drivers/acpi/scan.c index 39b7233c348..c54d7b6c406 100644 --- a/drivers/acpi/scan.c +++ b/drivers/acpi/scan.c @@ -10,7 +10,6 @@ #include <linux/kthread.h> #include <acpi/acpi_drivers.h> -#include <acpi/acinterp.h> /* for acpi_ex_eisa_id_to_string() */ #define _COMPONENT ACPI_BUS_COMPONENT ACPI_MODULE_NAME("scan"); diff --git a/drivers/acpi/sleep/sleep.h b/drivers/acpi/sleep.h index cfaf8f5b0a1..cfaf8f5b0a1 100644 --- a/drivers/acpi/sleep/sleep.h +++ b/drivers/acpi/sleep.h diff --git a/drivers/acpi/sleep/Makefile b/drivers/acpi/sleep/Makefile deleted file mode 100644 index f1fb888c2d2..00000000000 --- a/drivers/acpi/sleep/Makefile +++ /dev/null @@ -1,5 +0,0 @@ -obj-y := wakeup.o -obj-y += main.o -obj-$(CONFIG_ACPI_SLEEP) += proc.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/system.c b/drivers/acpi/system.c index 6e4107f8240..391d0358a59 100644 --- a/drivers/acpi/system.c +++ b/drivers/acpi/system.c @@ -192,65 +192,6 @@ static struct attribute_group interrupt_stats_attr_group = { }; static struct kobj_attribute *counter_attrs; -static int count_num_gpes(void) -{ - int count = 0; - struct acpi_gpe_xrupt_info *gpe_xrupt_info; - struct acpi_gpe_block_info *gpe_block; - acpi_cpu_flags flags; - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - gpe_xrupt_info = acpi_gbl_gpe_xrupt_list_head; - while (gpe_xrupt_info) { - gpe_block = gpe_xrupt_info->gpe_block_list_head; - while (gpe_block) { - count += gpe_block->register_count * - ACPI_GPE_REGISTER_WIDTH; - gpe_block = gpe_block->next; - } - gpe_xrupt_info = gpe_xrupt_info->next; - } - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - - return count; -} - -static int get_gpe_device(int index, acpi_handle *handle) -{ - struct acpi_gpe_xrupt_info *gpe_xrupt_info; - struct acpi_gpe_block_info *gpe_block; - acpi_cpu_flags flags; - struct acpi_namespace_node *node; - - flags = acpi_os_acquire_lock(acpi_gbl_gpe_lock); - - gpe_xrupt_info = acpi_gbl_gpe_xrupt_list_head; - while (gpe_xrupt_info) { - gpe_block = gpe_xrupt_info->gpe_block_list_head; - node = gpe_block->node; - while (gpe_block) { - index -= gpe_block->register_count * - ACPI_GPE_REGISTER_WIDTH; - if (index < 0) { - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - /* return NULL if it's FADT GPE */ - if 
(node->type != ACPI_TYPE_DEVICE) - *handle = NULL; - else - *handle = node; - return 0; - } - node = gpe_block->node; - gpe_block = gpe_block->next; - } - gpe_xrupt_info = gpe_xrupt_info->next; - } - acpi_os_release_lock(acpi_gbl_gpe_lock, flags); - - return -ENODEV; -} - static void delete_gpe_attr_array(void) { struct event_counter *tmp = all_counters; @@ -309,7 +250,7 @@ static int get_status(u32 index, acpi_event_status *status, acpi_handle *handle) goto end; if (index < num_gpes) { - result = get_gpe_device(index, handle); + result = acpi_get_gpe_device(index, handle); if (result) { ACPI_EXCEPTION((AE_INFO, AE_NOT_FOUND, "Invalid GPE 0x%x\n", index)); @@ -436,7 +377,7 @@ void acpi_irq_stats_init(void) if (all_counters) return; - num_gpes = count_num_gpes(); + num_gpes = acpi_current_gpe_count; num_counters = num_gpes + ACPI_NUM_FIXED_EVENTS + NUM_COUNTERS_EXTRA; all_attrs = kzalloc(sizeof(struct attribute *) * (num_counters + 1), diff --git a/drivers/acpi/tables/Makefile b/drivers/acpi/tables/Makefile deleted file mode 100644 index 7385efa6162..00000000000 --- a/drivers/acpi/tables/Makefile +++ /dev/null @@ -1,7 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := tbxface.o tbinstal.o tbutils.o tbfind.o tbfadt.o tbxfroot.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/utilities/Makefile b/drivers/acpi/utilities/Makefile deleted file mode 100644 index 88eff14c489..00000000000 --- a/drivers/acpi/utilities/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# -# Makefile for all Linux ACPI interpreter subdirectories -# - -obj-y := utalloc.o utdebug.o uteval.o utinit.o utmisc.o utxface.o \ - utcopy.o utdelete.o utglobal.o utmath.o utobject.o \ - utstate.o utmutex.o utobject.o utcache.o utresrc.o - -EXTRA_CFLAGS += $(ACPI_CFLAGS) diff --git a/drivers/acpi/utilities/utcache.c b/drivers/acpi/utilities/utcache.c deleted file mode 100644 index 245fa80cf60..00000000000 --- a/drivers/acpi/utilities/utcache.c +++ /dev/null @@ -1,314 +0,0 @@ -/****************************************************************************** - * - * Module Name: utcache - local cache allocation routines - * - *****************************************************************************/ - -/* - * Copyright (C) 2000 - 2008, Intel Corp. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * substantially similar to the "NO WARRANTY" disclaimer below - * ("Disclaimer") and any redistribution must be conditioned upon - * including a substantially similar Disclaimer requirement for further - * binary redistribution. - * 3. Neither the names of the above-listed copyright holders nor the names - * of any contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. 
- * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGES. - */ - -#include <acpi/acpi.h> - -#define _COMPONENT ACPI_UTILITIES -ACPI_MODULE_NAME("utcache") -#ifdef ACPI_USE_LOCAL_CACHE -/******************************************************************************* - * - * FUNCTION: acpi_os_create_cache - * - * PARAMETERS: cache_name - Ascii name for the cache - * object_size - Size of each cached object - * max_depth - Maximum depth of the cache (in objects) - * return_cache - Where the new cache object is returned - * - * RETURN: Status - * - * DESCRIPTION: Create a cache object - * - ******************************************************************************/ -acpi_status -acpi_os_create_cache(char *cache_name, - u16 object_size, - u16 max_depth, struct acpi_memory_list ** return_cache) -{ - struct acpi_memory_list *cache; - - ACPI_FUNCTION_ENTRY(); - - if (!cache_name || !return_cache || (object_size < 16)) { - return (AE_BAD_PARAMETER); - } - - /* Create the cache object */ - - cache = acpi_os_allocate(sizeof(struct acpi_memory_list)); - if (!cache) { - return (AE_NO_MEMORY); - } - - /* Populate the cache object and return it */ - - ACPI_MEMSET(cache, 0, sizeof(struct acpi_memory_list)); - cache->link_offset = 8; - cache->list_name = cache_name; - cache->object_size = object_size; - cache->max_depth = max_depth; - - *return_cache = cache; - return (AE_OK); -} - -/******************************************************************************* - * - * FUNCTION: acpi_os_purge_cache - * - * PARAMETERS: Cache - Handle to cache object - * - * RETURN: Status - * - * DESCRIPTION: Free all objects within the requested cache. - * - ******************************************************************************/ - -acpi_status acpi_os_purge_cache(struct acpi_memory_list * cache) -{ - char *next; - - ACPI_FUNCTION_ENTRY(); - - if (!cache) { - return (AE_BAD_PARAMETER); - } - - /* Walk the list of objects in this cache */ - - while (cache->list_head) { - - /* Delete and unlink one cached state object */ - - next = *(ACPI_CAST_INDIRECT_PTR(char, - &(((char *)cache-> - list_head)[cache-> - link_offset]))); - ACPI_FREE(cache->list_head); - - cache->list_head = next; - cache->current_depth--; - } - - return (AE_OK); -} - -/******************************************************************************* - * - * FUNCTION: acpi_os_delete_cache - * - * PARAMETERS: Cache - Handle to cache object - * - * RETURN: Status - * - * DESCRIPTION: Free all objects within the requested cache and delete the - * cache object. 
- * - ******************************************************************************/ - -acpi_status acpi_os_delete_cache(struct acpi_memory_list * cache) -{ - acpi_status status; - - ACPI_FUNCTION_ENTRY(); - - /* Purge all objects in the cache */ - - status = acpi_os_purge_cache(cache); - if (ACPI_FAILURE(status)) { - return (status); - } - - /* Now we can delete the cache object */ - - ACPI_FREE(cache); - return (AE_OK); -} - -/******************************************************************************* - * - * FUNCTION: acpi_os_release_object - * - * PARAMETERS: Cache - Handle to cache object - * Object - The object to be released - * - * RETURN: None - * - * DESCRIPTION: Release an object to the specified cache. If cache is full, - * the object is deleted. - * - ******************************************************************************/ - -acpi_status -acpi_os_release_object(struct acpi_memory_list * cache, void *object) -{ - acpi_status status; - - ACPI_FUNCTION_ENTRY(); - - if (!cache || !object) { - return (AE_BAD_PARAMETER); - } - - /* If cache is full, just free this object */ - - if (cache->current_depth >= cache->max_depth) { - ACPI_FREE(object); - ACPI_MEM_TRACKING(cache->total_freed++); - } - - /* Otherwise put this object back into the cache */ - - else { - status = acpi_ut_acquire_mutex(ACPI_MTX_CACHES); - if (ACPI_FAILURE(status)) { - return (status); - } - - /* Mark the object as cached */ - - ACPI_MEMSET(object, 0xCA, cache->object_size); - ACPI_SET_DESCRIPTOR_TYPE(object, ACPI_DESC_TYPE_CACHED); - - /* Put the object at the head of the cache list */ - - *(ACPI_CAST_INDIRECT_PTR(char, - &(((char *)object)[cache-> - link_offset]))) = - cache->list_head; - cache->list_head = object; - cache->current_depth++; - - (void)acpi_ut_release_mutex(ACPI_MTX_CACHES); - } - - return (AE_OK); -} - -/******************************************************************************* - * - * FUNCTION: acpi_os_acquire_object - * - * PARAMETERS: Cache - Handle to cache object - * - * RETURN: the acquired object. NULL on error - * - * DESCRIPTION: Get an object from the specified cache. If cache is empty, - * the object is allocated. 
- * - ******************************************************************************/ - -void *acpi_os_acquire_object(struct acpi_memory_list *cache) -{ - acpi_status status; - void *object; - - ACPI_FUNCTION_NAME(os_acquire_object); - - if (!cache) { - return (NULL); - } - - status = acpi_ut_acquire_mutex(ACPI_MTX_CACHES); - if (ACPI_FAILURE(status)) { - return (NULL); - } - - ACPI_MEM_TRACKING(cache->requests++); - - /* Check the cache first */ - - if (cache->list_head) { - - /* There is an object available, use it */ - - object = cache->list_head; - cache->list_head = *(ACPI_CAST_INDIRECT_PTR(char, - &(((char *) - object)[cache-> - link_offset]))); - - cache->current_depth--; - - ACPI_MEM_TRACKING(cache->hits++); - ACPI_DEBUG_PRINT((ACPI_DB_EXEC, - "Object %p from %s cache\n", object, - cache->list_name)); - - status = acpi_ut_release_mutex(ACPI_MTX_CACHES); - if (ACPI_FAILURE(status)) { - return (NULL); - } - - /* Clear (zero) the previously used Object */ - - ACPI_MEMSET(object, 0, cache->object_size); - } else { - /* The cache is empty, create a new object */ - - ACPI_MEM_TRACKING(cache->total_allocated++); - -#ifdef ACPI_DBG_TRACK_ALLOCATIONS - if ((cache->total_allocated - cache->total_freed) > - cache->max_occupied) { - cache->max_occupied = - cache->total_allocated - cache->total_freed; - } -#endif - - /* Avoid deadlock with ACPI_ALLOCATE_ZEROED */ - - status = acpi_ut_release_mutex(ACPI_MTX_CACHES); - if (ACPI_FAILURE(status)) { - return (NULL); - } - - object = ACPI_ALLOCATE_ZEROED(cache->object_size); - if (!object) { - return (NULL); - } - } - - return (object); -} -#endif /* ACPI_USE_LOCAL_CACHE */ diff --git a/drivers/acpi/video.c b/drivers/acpi/video.c index baa44192972..f261737636d 100644 --- a/drivers/acpi/video.c +++ b/drivers/acpi/video.c @@ -36,6 +36,7 @@ #include <linux/backlight.h> #include <linux/thermal.h> #include <linux/video_output.h> +#include <linux/sort.h> #include <asm/uaccess.h> #include <acpi/acpi_bus.h> @@ -481,6 +482,7 @@ acpi_video_device_lcd_set_level(struct acpi_video_device *device, int level) int status = AE_OK; union acpi_object arg0 = { ACPI_TYPE_INTEGER }; struct acpi_object_list args = { 1, &arg0 }; + int state; arg0.integer.value = level; @@ -489,6 +491,10 @@ acpi_video_device_lcd_set_level(struct acpi_video_device *device, int level) status = acpi_evaluate_object(device->dev->handle, "_BCM", &args, NULL); device->brightness->curr = level; + for (state = 2; state < device->brightness->count; state++) + if (level == device->brightness->levels[state]) + device->backlight->props.brightness = state - 2; + return status; } @@ -626,6 +632,16 @@ acpi_video_bus_DOS(struct acpi_video_bus *video, int bios_flag, int lcd_flag) } /* + * Simple comparison function used to sort backlight levels. + */ + +static int +acpi_video_cmp_level(const void *a, const void *b) +{ + return *(int *)a - *(int *)b; +} + +/* * Arg: * device : video output device (LCD, CRT, ..) 
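
acpi_video_cmp_level() above is a plain ascending comparator for lib/sort(). The hunk that follows sorts the _BCL-derived level array with it but deliberately starts at index 2: per the ACPI spec the first two _BCL entries are the levels to use on AC power and on battery, not ordinary brightness steps, which is also why the _BCM hunk earlier records the backlight index as state - 2. A usage sketch with made-up values:

static void sort_levels_example(void)
{
	/* [0] = full-power level, [1] = battery level, rest = steps */
	int levels[] = { 100, 40, 20, 80, 60, 40 };

	sort(&levels[2], ARRAY_SIZE(levels) - 2, sizeof(levels[2]),
	     acpi_video_cmp_level, NULL);
	/* levels[] is now { 100, 40, 20, 40, 60, 80 } */
}
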
* @@ -676,6 +692,10 @@ acpi_video_init_brightness(struct acpi_video_device *device) count++; } + /* don't sort the first two brightness levels */ + sort(&br->levels[2], count - 2, sizeof(br->levels[2]), + acpi_video_cmp_level, NULL); + if (count < 2) goto out_free_levels; diff --git a/drivers/acpi/video_detect.c b/drivers/acpi/video_detect.c index f022eb6f563..50e3d2dbf3a 100644 --- a/drivers/acpi/video_detect.c +++ b/drivers/acpi/video_detect.c @@ -234,7 +234,7 @@ EXPORT_SYMBOL(acpi_video_display_switch_support); * To force that backlight or display output switching is processed by vendor * specific acpi drivers or video.ko driver. */ -int __init acpi_backlight(char *str) +static int __init acpi_backlight(char *str) { if (str == NULL || *str == '\0') return 1; @@ -250,7 +250,7 @@ int __init acpi_backlight(char *str) } __setup("acpi_backlight=", acpi_backlight); -int __init acpi_display_output(char *str) +static int __init acpi_display_output(char *str) { if (str == NULL || *str == '\0') return 1; diff --git a/drivers/acpi/sleep/wakeup.c b/drivers/acpi/wakeup.c index dea4c23df76..2d34806d45d 100644 --- a/drivers/acpi/sleep/wakeup.c +++ b/drivers/acpi/wakeup.c @@ -8,7 +8,6 @@ #include <acpi/acpi_drivers.h> #include <linux/kernel.h> #include <linux/types.h> -#include <acpi/acevents.h> #include "sleep.h" #define _COMPONENT ACPI_SYSTEM_COMPONENT @@ -28,8 +27,6 @@ void acpi_enable_wakeup_device_prep(u8 sleep_state) { struct list_head *node, *next; - ACPI_FUNCTION_TRACE("acpi_enable_wakeup_device_prep"); - spin_lock(&acpi_device_lock); list_for_each_safe(node, next, &acpi_wakeup_device_list) { struct acpi_device *dev = container_of(node, @@ -61,7 +58,6 @@ void acpi_enable_wakeup_device(u8 sleep_state) * Caution: this routine must be invoked when interrupt is disabled * Refer ACPI2.0: P212 */ - ACPI_FUNCTION_TRACE("acpi_enable_wakeup_device"); spin_lock(&acpi_device_lock); list_for_each_safe(node, next, &acpi_wakeup_device_list) { struct acpi_device *dev = @@ -103,8 +99,6 @@ void acpi_disable_wakeup_device(u8 sleep_state) { struct list_head *node, *next; - ACPI_FUNCTION_TRACE("acpi_disable_wakeup_device"); - spin_lock(&acpi_device_lock); list_for_each_safe(node, next, &acpi_wakeup_device_list) { struct acpi_device *dev = diff --git a/drivers/ata/libata-acpi.c b/drivers/ata/libata-acpi.c index ef02e488d46..6273d98d00e 100644 --- a/drivers/ata/libata-acpi.c +++ b/drivers/ata/libata-acpi.c @@ -19,12 +19,6 @@ #include "libata.h" #include <acpi/acpi_bus.h> -#include <acpi/acnames.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> -#include <acpi/acexcep.h> -#include <acpi/acmacros.h> -#include <acpi/actypes.h> enum { ATA_ACPI_FILTER_SETXFER = 1 << 0, diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c index 175df54eb66..c507a9ac78f 100644 --- a/drivers/ata/libata-core.c +++ b/drivers/ata/libata-core.c @@ -4556,7 +4556,7 @@ void ata_sg_clean(struct ata_queued_cmd *qc) struct scatterlist *sg = qc->sg; int dir = qc->dma_dir; - WARN_ON(sg == NULL); + WARN_ON_ONCE(sg == NULL); VPRINTK("unmapping %u sg elements\n", qc->n_elem); @@ -4776,7 +4776,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) struct ata_port *ap = qc->ap; unsigned int tag; - WARN_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ qc->flags = 0; tag = qc->tag; @@ -4791,8 +4791,8 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) struct ata_port *ap = qc->ap; struct ata_link *link = qc->dev->link; - WARN_ON(qc == NULL); /* ata_qc_from_tag 
_might_ return NULL */ - WARN_ON(!(qc->flags & ATA_QCFLAG_ACTIVE)); + WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); if (likely(qc->flags & ATA_QCFLAG_DMAMAP)) ata_sg_clean(qc); @@ -4878,7 +4878,7 @@ void ata_qc_complete(struct ata_queued_cmd *qc) struct ata_device *dev = qc->dev; struct ata_eh_info *ehi = &dev->link->eh_info; - WARN_ON(ap->pflags & ATA_PFLAG_FROZEN); + WARN_ON_ONCE(ap->pflags & ATA_PFLAG_FROZEN); if (unlikely(qc->err_mask)) qc->flags |= ATA_QCFLAG_FAILED; @@ -5000,16 +5000,16 @@ void ata_qc_issue(struct ata_queued_cmd *qc) * check is skipped for old EH because it reuses active qc to * request ATAPI sense. */ - WARN_ON(ap->ops->error_handler && ata_tag_valid(link->active_tag)); + WARN_ON_ONCE(ap->ops->error_handler && ata_tag_valid(link->active_tag)); if (ata_is_ncq(prot)) { - WARN_ON(link->sactive & (1 << qc->tag)); + WARN_ON_ONCE(link->sactive & (1 << qc->tag)); if (!link->sactive) ap->nr_active_links++; link->sactive |= 1 << qc->tag; } else { - WARN_ON(link->sactive); + WARN_ON_ONCE(link->sactive); ap->nr_active_links++; link->active_tag = qc->tag; diff --git a/drivers/ata/libata-sff.c b/drivers/ata/libata-sff.c index c59ad76c84b..0eae9b45355 100644 --- a/drivers/ata/libata-sff.c +++ b/drivers/ata/libata-sff.c @@ -578,7 +578,7 @@ void ata_sff_tf_load(struct ata_port *ap, const struct ata_taskfile *tf) } if (is_addr && (tf->flags & ATA_TFLAG_LBA48)) { - WARN_ON(!ioaddr->ctl_addr); + WARN_ON_ONCE(!ioaddr->ctl_addr); iowrite8(tf->hob_feature, ioaddr->feature_addr); iowrite8(tf->hob_nsect, ioaddr->nsect_addr); iowrite8(tf->hob_lbal, ioaddr->lbal_addr); @@ -651,7 +651,7 @@ void ata_sff_tf_read(struct ata_port *ap, struct ata_taskfile *tf) iowrite8(tf->ctl, ioaddr->ctl_addr); ap->last_ctl = tf->ctl; } else - WARN_ON(1); + WARN_ON_ONCE(1); } } EXPORT_SYMBOL_GPL(ata_sff_tf_read); @@ -891,7 +891,7 @@ static void ata_pio_sectors(struct ata_queued_cmd *qc) /* READ/WRITE MULTIPLE */ unsigned int nsect; - WARN_ON(qc->dev->multi_count == 0); + WARN_ON_ONCE(qc->dev->multi_count == 0); nsect = min((qc->nbytes - qc->curbytes) / qc->sect_size, qc->dev->multi_count); @@ -918,7 +918,7 @@ static void atapi_send_cdb(struct ata_port *ap, struct ata_queued_cmd *qc) { /* send SCSI cdb */ DPRINTK("send cdb\n"); - WARN_ON(qc->dev->cdb_len < 12); + WARN_ON_ONCE(qc->dev->cdb_len < 12); ap->ops->sff_data_xfer(qc->dev, qc->cdb, qc->dev->cdb_len, 1); ata_sff_sync(ap); @@ -1014,7 +1014,7 @@ next_sg: } /* consumed can be larger than count only for the last transfer */ - WARN_ON(qc->cursg && count != consumed); + WARN_ON_ONCE(qc->cursg && count != consumed); if (bytes) goto next_sg; @@ -1172,13 +1172,13 @@ int ata_sff_hsm_move(struct ata_port *ap, struct ata_queued_cmd *qc, unsigned long flags = 0; int poll_next; - WARN_ON((qc->flags & ATA_QCFLAG_ACTIVE) == 0); + WARN_ON_ONCE((qc->flags & ATA_QCFLAG_ACTIVE) == 0); /* Make sure ata_sff_qc_issue() does not throw things * like DMA polling into the workqueue. Notice that * in_wq is not equivalent to (qc->tf.flags & ATA_TFLAG_POLLING). 
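
The libata conversions in these hunks swap WARN_ON() for WARN_ON_ONCE() in per-command fast paths such as the host state machine. Both evaluate the condition and emit a backtrace when it is true, but the _ONCE variant latches after the first report, so a misbehaving device cannot flood the log on every queued command. Illustration only, not from the patch:

static void warn_example(bool broken)
{
	WARN_ON(broken);	/* backtrace every time the condition holds */
	WARN_ON_ONCE(broken);	/* backtrace only on the first occurrence  */
}
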
*/ - WARN_ON(in_wq != ata_hsm_ok_in_wq(ap, qc)); + WARN_ON_ONCE(in_wq != ata_hsm_ok_in_wq(ap, qc)); fsm_start: DPRINTK("ata%u: protocol %d task_state %d (dev_stat 0x%X)\n", @@ -1387,7 +1387,7 @@ fsm_start: DPRINTK("ata%u: dev %u command complete, drv_stat 0x%x\n", ap->print_id, qc->dev->devno, status); - WARN_ON(qc->err_mask & (AC_ERR_DEV | AC_ERR_HSM)); + WARN_ON_ONCE(qc->err_mask & (AC_ERR_DEV | AC_ERR_HSM)); ap->hsm_task_state = HSM_ST_IDLE; @@ -1423,7 +1423,7 @@ void ata_pio_task(struct work_struct *work) int poll_next; fsm_start: - WARN_ON(ap->hsm_task_state == HSM_ST_IDLE); + WARN_ON_ONCE(ap->hsm_task_state == HSM_ST_IDLE); /* * This is purely heuristic. This is a fast path. @@ -1512,7 +1512,7 @@ unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc) break; case ATA_PROT_DMA: - WARN_ON(qc->tf.flags & ATA_TFLAG_POLLING); + WARN_ON_ONCE(qc->tf.flags & ATA_TFLAG_POLLING); ap->ops->sff_tf_load(ap, &qc->tf); /* load tf registers */ ap->ops->bmdma_setup(qc); /* set up bmdma */ @@ -1564,7 +1564,7 @@ unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc) break; case ATAPI_PROT_DMA: - WARN_ON(qc->tf.flags & ATA_TFLAG_POLLING); + WARN_ON_ONCE(qc->tf.flags & ATA_TFLAG_POLLING); ap->ops->sff_tf_load(ap, &qc->tf); /* load tf registers */ ap->ops->bmdma_setup(qc); /* set up bmdma */ @@ -1576,7 +1576,7 @@ unsigned int ata_sff_qc_issue(struct ata_queued_cmd *qc) break; default: - WARN_ON(1); + WARN_ON_ONCE(1); return AC_ERR_SYSTEM; } diff --git a/drivers/ata/pata_acpi.c b/drivers/ata/pata_acpi.c index e2e332d8ff9..8b77a9802df 100644 --- a/drivers/ata/pata_acpi.c +++ b/drivers/ata/pata_acpi.c @@ -13,12 +13,6 @@ #include <linux/device.h> #include <scsi/scsi_host.h> #include <acpi/acpi_bus.h> -#include <acpi/acnames.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> -#include <acpi/acexcep.h> -#include <acpi/acmacros.h> -#include <acpi/actypes.h> #include <linux/libata.h> #include <linux/ata.h> diff --git a/drivers/char/tpm/tpm_bios.c b/drivers/char/tpm/tpm_bios.c index 68f052b42ed..ed306eb1057 100644 --- a/drivers/char/tpm/tpm_bios.c +++ b/drivers/char/tpm/tpm_bios.c @@ -23,8 +23,6 @@ #include <linux/security.h> #include <linux/module.h> #include <acpi/acpi.h> -#include <acpi/actypes.h> -#include <acpi/actbl.h> #include "tpm.h" #define TCG_EVENT_NAME_LEN_MAX 255 diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 8d7cf3f3145..f1df59f59a3 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -15,12 +15,14 @@ #include <linux/tick.h> #define BREAK_FUZZ 4 /* 4 us */ +#define PRED_HISTORY_PCT 50 struct menu_device { int last_state_idx; unsigned int expected_us; unsigned int predicted_us; + unsigned int current_predicted_us; unsigned int last_measured_us; unsigned int elapsed_us; }; @@ -47,6 +49,12 @@ static int menu_select(struct cpuidle_device *dev) data->expected_us = (u32) ktime_to_ns(tick_nohz_get_sleep_length()) / 1000; + /* Recalculate predicted_us based on prediction_history_pct */ + data->predicted_us *= PRED_HISTORY_PCT; + data->predicted_us += (100 - PRED_HISTORY_PCT) * + data->current_predicted_us; + data->predicted_us /= 100; + /* find the deepest idle state that satisfies our constraints */ for (i = CPUIDLE_DRIVER_STATE_START + 1; i < dev->state_count; i++) { struct cpuidle_state *s = &dev->states[i]; @@ -97,7 +105,7 @@ static void menu_reflect(struct cpuidle_device *dev) measured_us = -1; /* Predict time until next break event */ - data->predicted_us = max(measured_us, data->last_measured_us); + 
data->current_predicted_us = max(measured_us, data->last_measured_us); if (last_idle_us + BREAK_FUZZ < data->expected_us - target->exit_latency) { diff --git a/drivers/dca/dca-core.c b/drivers/dca/dca-core.c index d883e1b8bb8..55433849bfa 100644 --- a/drivers/dca/dca-core.c +++ b/drivers/dca/dca-core.c @@ -270,6 +270,6 @@ static void __exit dca_exit(void) dca_sysfs_exit(); } -subsys_initcall(dca_init); +arch_initcall(dca_init); module_exit(dca_exit); diff --git a/drivers/dma/Kconfig b/drivers/dma/Kconfig index 904e57558bb..e34b0642081 100644 --- a/drivers/dma/Kconfig +++ b/drivers/dma/Kconfig @@ -33,7 +33,6 @@ config INTEL_IOATDMA config INTEL_IOP_ADMA tristate "Intel IOP ADMA support" depends on ARCH_IOP32X || ARCH_IOP33X || ARCH_IOP13XX - select ASYNC_CORE select DMA_ENGINE help Enable support for the Intel(R) IOP Series RAID engines. @@ -59,7 +58,6 @@ config FSL_DMA config MV_XOR bool "Marvell XOR engine support" depends on PLAT_ORION - select ASYNC_CORE select DMA_ENGINE ---help--- Enable support for the Marvell XOR engine. diff --git a/drivers/dma/dmaengine.c b/drivers/dma/dmaengine.c index 65799651737..403dbe78112 100644 --- a/drivers/dma/dmaengine.c +++ b/drivers/dma/dmaengine.c @@ -31,32 +31,18 @@ * * LOCKING: * - * The subsystem keeps two global lists, dma_device_list and dma_client_list. - * Both of these are protected by a mutex, dma_list_mutex. + * The subsystem keeps a global list of dma_device structs it is protected by a + * mutex, dma_list_mutex. + * + * A subsystem can get access to a channel by calling dmaengine_get() followed + * by dma_find_channel(), or if it has need for an exclusive channel it can call + * dma_request_channel(). Once a channel is allocated a reference is taken + * against its corresponding driver to disable removal. * * Each device has a channels list, which runs unlocked but is never modified * once the device is registered, it's just setup by the driver. * - * Each client is responsible for keeping track of the channels it uses. See - * the definition of dma_event_callback in dmaengine.h. - * - * Each device has a kref, which is initialized to 1 when the device is - * registered. A kref_get is done for each device registered. When the - * device is released, the corresponding kref_put is done in the release - * method. Every time one of the device's channels is allocated to a client, - * a kref_get occurs. When the channel is freed, the corresponding kref_put - * happens. The device's release function does a completion, so - * unregister_device does a remove event, device_unregister, a kref_put - * for the first reference, then waits on the completion for all other - * references to finish. - * - * Each channel has an open-coded implementation of Rusty Russell's "bigref," - * with a kref and a per_cpu local_t. A dma_chan_get is called when a client - * signals that it wants to use a channel, and dma_chan_put is called when - * a channel is removed or a client using it is unregistered. A client can - * take extra references per outstanding transaction, as is the case with - * the NET DMA client. The release function does a kref_put on the device. 
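
In the cpuidle menu governor hunk above, the raw per-wakeup estimate now lands in current_predicted_us, and predicted_us becomes a running blend of history and the newest estimate:

	predicted_us = (PRED_HISTORY_PCT * predicted_us
			+ (100 - PRED_HISTORY_PCT) * current_predicted_us) / 100

With PRED_HISTORY_PCT at 50 this is an exponentially weighted average, so one short sleep only pulls the prediction halfway: a 1000 us prediction followed by a 200 us estimate gives 600 us, then 400 us after a second 200 us estimate, rather than jumping straight to 200 us.
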
- * -ChrisL, DanW + * See Documentation/dmaengine.txt for more details */ #include <linux/init.h> @@ -70,54 +56,85 @@ #include <linux/rcupdate.h> #include <linux/mutex.h> #include <linux/jiffies.h> +#include <linux/rculist.h> +#include <linux/idr.h> static DEFINE_MUTEX(dma_list_mutex); static LIST_HEAD(dma_device_list); -static LIST_HEAD(dma_client_list); +static long dmaengine_ref_count; +static struct idr dma_idr; /* --- sysfs implementation --- */ +/** + * dev_to_dma_chan - convert a device pointer to the its sysfs container object + * @dev - device node + * + * Must be called under dma_list_mutex + */ +static struct dma_chan *dev_to_dma_chan(struct device *dev) +{ + struct dma_chan_dev *chan_dev; + + chan_dev = container_of(dev, typeof(*chan_dev), device); + return chan_dev->chan; +} + static ssize_t show_memcpy_count(struct device *dev, struct device_attribute *attr, char *buf) { - struct dma_chan *chan = to_dma_chan(dev); + struct dma_chan *chan; unsigned long count = 0; int i; + int err; - for_each_possible_cpu(i) - count += per_cpu_ptr(chan->local, i)->memcpy_count; + mutex_lock(&dma_list_mutex); + chan = dev_to_dma_chan(dev); + if (chan) { + for_each_possible_cpu(i) + count += per_cpu_ptr(chan->local, i)->memcpy_count; + err = sprintf(buf, "%lu\n", count); + } else + err = -ENODEV; + mutex_unlock(&dma_list_mutex); - return sprintf(buf, "%lu\n", count); + return err; } static ssize_t show_bytes_transferred(struct device *dev, struct device_attribute *attr, char *buf) { - struct dma_chan *chan = to_dma_chan(dev); + struct dma_chan *chan; unsigned long count = 0; int i; + int err; - for_each_possible_cpu(i) - count += per_cpu_ptr(chan->local, i)->bytes_transferred; + mutex_lock(&dma_list_mutex); + chan = dev_to_dma_chan(dev); + if (chan) { + for_each_possible_cpu(i) + count += per_cpu_ptr(chan->local, i)->bytes_transferred; + err = sprintf(buf, "%lu\n", count); + } else + err = -ENODEV; + mutex_unlock(&dma_list_mutex); - return sprintf(buf, "%lu\n", count); + return err; } static ssize_t show_in_use(struct device *dev, struct device_attribute *attr, char *buf) { - struct dma_chan *chan = to_dma_chan(dev); - int in_use = 0; - - if (unlikely(chan->slow_ref) && - atomic_read(&chan->refcount.refcount) > 1) - in_use = 1; - else { - if (local_read(&(per_cpu_ptr(chan->local, - get_cpu())->refcount)) > 0) - in_use = 1; - put_cpu(); - } + struct dma_chan *chan; + int err; - return sprintf(buf, "%d\n", in_use); + mutex_lock(&dma_list_mutex); + chan = dev_to_dma_chan(dev); + if (chan) + err = sprintf(buf, "%d\n", chan->client_count); + else + err = -ENODEV; + mutex_unlock(&dma_list_mutex); + + return err; } static struct device_attribute dma_attrs[] = { @@ -127,76 +144,110 @@ static struct device_attribute dma_attrs[] = { __ATTR_NULL }; -static void dma_async_device_cleanup(struct kref *kref); - -static void dma_dev_release(struct device *dev) +static void chan_dev_release(struct device *dev) { - struct dma_chan *chan = to_dma_chan(dev); - kref_put(&chan->device->refcount, dma_async_device_cleanup); + struct dma_chan_dev *chan_dev; + + chan_dev = container_of(dev, typeof(*chan_dev), device); + if (atomic_dec_and_test(chan_dev->idr_ref)) { + mutex_lock(&dma_list_mutex); + idr_remove(&dma_idr, chan_dev->dev_id); + mutex_unlock(&dma_list_mutex); + kfree(chan_dev->idr_ref); + } + kfree(chan_dev); } static struct class dma_devclass = { .name = "dma", .dev_attrs = dma_attrs, - .dev_release = dma_dev_release, + .dev_release = chan_dev_release, }; /* --- client and device registration --- */ -#define 
dma_chan_satisfies_mask(chan, mask) \ - __dma_chan_satisfies_mask((chan), &(mask)) +#define dma_device_satisfies_mask(device, mask) \ + __dma_device_satisfies_mask((device), &(mask)) static int -__dma_chan_satisfies_mask(struct dma_chan *chan, dma_cap_mask_t *want) +__dma_device_satisfies_mask(struct dma_device *device, dma_cap_mask_t *want) { dma_cap_mask_t has; - bitmap_and(has.bits, want->bits, chan->device->cap_mask.bits, + bitmap_and(has.bits, want->bits, device->cap_mask.bits, DMA_TX_TYPE_END); return bitmap_equal(want->bits, has.bits, DMA_TX_TYPE_END); } +static struct module *dma_chan_to_owner(struct dma_chan *chan) +{ + return chan->device->dev->driver->owner; +} + /** - * dma_client_chan_alloc - try to allocate channels to a client - * @client: &dma_client + * balance_ref_count - catch up the channel reference count + * @chan - channel to balance ->client_count versus dmaengine_ref_count * - * Called with dma_list_mutex held. + * balance_ref_count must be called under dma_list_mutex */ -static void dma_client_chan_alloc(struct dma_client *client) +static void balance_ref_count(struct dma_chan *chan) { - struct dma_device *device; - struct dma_chan *chan; - int desc; /* allocated descriptor count */ - enum dma_state_client ack; + struct module *owner = dma_chan_to_owner(chan); - /* Find a channel */ - list_for_each_entry(device, &dma_device_list, global_node) { - /* Does the client require a specific DMA controller? */ - if (client->slave && client->slave->dma_dev - && client->slave->dma_dev != device->dev) - continue; + while (chan->client_count < dmaengine_ref_count) { + __module_get(owner); + chan->client_count++; + } +} - list_for_each_entry(chan, &device->channels, device_node) { - if (!dma_chan_satisfies_mask(chan, client->cap_mask)) - continue; +/** + * dma_chan_get - try to grab a dma channel's parent driver module + * @chan - channel to grab + * + * Must be called under dma_list_mutex + */ +static int dma_chan_get(struct dma_chan *chan) +{ + int err = -ENODEV; + struct module *owner = dma_chan_to_owner(chan); + + if (chan->client_count) { + __module_get(owner); + err = 0; + } else if (try_module_get(owner)) + err = 0; + + if (err == 0) + chan->client_count++; + + /* allocate upon first client reference */ + if (chan->client_count == 1 && err == 0) { + int desc_cnt = chan->device->device_alloc_chan_resources(chan); + + if (desc_cnt < 0) { + err = desc_cnt; + chan->client_count = 0; + module_put(owner); + } else if (!dma_has_cap(DMA_PRIVATE, chan->device->cap_mask)) + balance_ref_count(chan); + } - desc = chan->device->device_alloc_chan_resources( - chan, client); - if (desc >= 0) { - ack = client->event_callback(client, - chan, - DMA_RESOURCE_AVAILABLE); + return err; +} - /* we are done once this client rejects - * an available resource - */ - if (ack == DMA_ACK) { - dma_chan_get(chan); - chan->client_count++; - } else if (ack == DMA_NAK) - return; - } - } - } +/** + * dma_chan_put - drop a reference to a dma channel's parent driver module + * @chan - channel to release + * + * Must be called under dma_list_mutex + */ +static void dma_chan_put(struct dma_chan *chan) +{ + if (!chan->client_count) + return; /* this channel failed alloc_chan_resources */ + chan->client_count--; + module_put(dma_chan_to_owner(chan)); + if (chan->client_count == 0) + chan->device->device_free_chan_resources(chan); } enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) @@ -218,138 +269,342 @@ enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie) 
EXPORT_SYMBOL(dma_sync_wait); /** - * dma_chan_cleanup - release a DMA channel's resources - * @kref: kernel reference structure that contains the DMA channel device + * dma_cap_mask_all - enable iteration over all operation types + */ +static dma_cap_mask_t dma_cap_mask_all; + +/** + * dma_chan_tbl_ent - tracks channel allocations per core/operation + * @chan - associated channel for this entry + */ +struct dma_chan_tbl_ent { + struct dma_chan *chan; +}; + +/** + * channel_table - percpu lookup table for memory-to-memory offload providers */ -void dma_chan_cleanup(struct kref *kref) +static struct dma_chan_tbl_ent *channel_table[DMA_TX_TYPE_END]; + +static int __init dma_channel_table_init(void) { - struct dma_chan *chan = container_of(kref, struct dma_chan, refcount); - chan->device->device_free_chan_resources(chan); - kref_put(&chan->device->refcount, dma_async_device_cleanup); + enum dma_transaction_type cap; + int err = 0; + + bitmap_fill(dma_cap_mask_all.bits, DMA_TX_TYPE_END); + + /* 'interrupt', 'private', and 'slave' are channel capabilities, + * but are not associated with an operation so they do not need + * an entry in the channel_table + */ + clear_bit(DMA_INTERRUPT, dma_cap_mask_all.bits); + clear_bit(DMA_PRIVATE, dma_cap_mask_all.bits); + clear_bit(DMA_SLAVE, dma_cap_mask_all.bits); + + for_each_dma_cap_mask(cap, dma_cap_mask_all) { + channel_table[cap] = alloc_percpu(struct dma_chan_tbl_ent); + if (!channel_table[cap]) { + err = -ENOMEM; + break; + } + } + + if (err) { + pr_err("dmaengine: initialization failure\n"); + for_each_dma_cap_mask(cap, dma_cap_mask_all) + if (channel_table[cap]) + free_percpu(channel_table[cap]); + } + + return err; } -EXPORT_SYMBOL(dma_chan_cleanup); +arch_initcall(dma_channel_table_init); -static void dma_chan_free_rcu(struct rcu_head *rcu) +/** + * dma_find_channel - find a channel to carry out the operation + * @tx_type: transaction type + */ +struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type) { - struct dma_chan *chan = container_of(rcu, struct dma_chan, rcu); - int bias = 0x7FFFFFFF; - int i; - for_each_possible_cpu(i) - bias -= local_read(&per_cpu_ptr(chan->local, i)->refcount); - atomic_sub(bias, &chan->refcount.refcount); - kref_put(&chan->refcount, dma_chan_cleanup); + struct dma_chan *chan; + int cpu; + + WARN_ONCE(dmaengine_ref_count == 0, + "client called %s without a reference", __func__); + + cpu = get_cpu(); + chan = per_cpu_ptr(channel_table[tx_type], cpu)->chan; + put_cpu(); + + return chan; } +EXPORT_SYMBOL(dma_find_channel); -static void dma_chan_release(struct dma_chan *chan) +/** + * dma_issue_pending_all - flush all pending operations across all channels + */ +void dma_issue_pending_all(void) { - atomic_add(0x7FFFFFFF, &chan->refcount.refcount); - chan->slow_ref = 1; - call_rcu(&chan->rcu, dma_chan_free_rcu); + struct dma_device *device; + struct dma_chan *chan; + + WARN_ONCE(dmaengine_ref_count == 0, + "client called %s without a reference", __func__); + + rcu_read_lock(); + list_for_each_entry_rcu(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) + if (chan->client_count) + device->device_issue_pending(chan); + } + rcu_read_unlock(); } +EXPORT_SYMBOL(dma_issue_pending_all); /** - * dma_chans_notify_available - broadcast available channels to the clients + * nth_chan - returns the nth channel of the given capability + * @cap: capability to match + * @n: nth channel desired + * + * Defaults to 
returning the channel with the desired capability and the + * lowest reference count when 'n' cannot be satisfied. Must be called + * under dma_list_mutex. */ -static void dma_clients_notify_available(void) +static struct dma_chan *nth_chan(enum dma_transaction_type cap, int n) { - struct dma_client *client; + struct dma_device *device; + struct dma_chan *chan; + struct dma_chan *ret = NULL; + struct dma_chan *min = NULL; - mutex_lock(&dma_list_mutex); + list_for_each_entry(device, &dma_device_list, global_node) { + if (!dma_has_cap(cap, device->cap_mask) || + dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) { + if (!chan->client_count) + continue; + if (!min) + min = chan; + else if (chan->table_count < min->table_count) + min = chan; + + if (n-- == 0) { + ret = chan; + break; /* done */ + } + } + if (ret) + break; /* done */ + } - list_for_each_entry(client, &dma_client_list, global_node) - dma_client_chan_alloc(client); + if (!ret) + ret = min; - mutex_unlock(&dma_list_mutex); + if (ret) + ret->table_count++; + + return ret; } /** - * dma_chans_notify_available - tell the clients that a channel is going away - * @chan: channel on its way out + * dma_channel_rebalance - redistribute the available channels + * + * Optimize for cpu isolation (each cpu gets a dedicated channel for an + * operation type) in the SMP case, and operation isolation (avoid + * multi-tasking channels) in the non-SMP case. Must be called under + * dma_list_mutex. */ -static void dma_clients_notify_removed(struct dma_chan *chan) +static void dma_channel_rebalance(void) { - struct dma_client *client; - enum dma_state_client ack; + struct dma_chan *chan; + struct dma_device *device; + int cpu; + int cap; + int n; - mutex_lock(&dma_list_mutex); + /* undo the last distribution */ + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_possible_cpu(cpu) + per_cpu_ptr(channel_table[cap], cpu)->chan = NULL; + + list_for_each_entry(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) + chan->table_count = 0; + } - list_for_each_entry(client, &dma_client_list, global_node) { - ack = client->event_callback(client, chan, - DMA_RESOURCE_REMOVED); + /* don't populate the channel_table if no clients are available */ + if (!dmaengine_ref_count) + return; - /* client was holding resources for this channel so - * free it - */ - if (ack == DMA_ACK) { - dma_chan_put(chan); - chan->client_count--; + /* redistribute available channels */ + n = 0; + for_each_dma_cap_mask(cap, dma_cap_mask_all) + for_each_online_cpu(cpu) { + if (num_possible_cpus() > 1) + chan = nth_chan(cap, n++); + else + chan = nth_chan(cap, -1); + + per_cpu_ptr(channel_table[cap], cpu)->chan = chan; + } +} + +static struct dma_chan *private_candidate(dma_cap_mask_t *mask, struct dma_device *dev, + dma_filter_fn fn, void *fn_param) +{ + struct dma_chan *chan; + + if (!__dma_device_satisfies_mask(dev, mask)) { + pr_debug("%s: wrong capabilities\n", __func__); + return NULL; + } + /* devices with multiple channels need special handling as we need to + * ensure that all channels are either private or public. 
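+	 * A non-zero ->client_count on any channel means dmaengine_get()
+	 * already handed that channel to the general-purpose pool, so the
+	 * whole device is skipped for private allocation.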
+ */ + if (dev->chancnt > 1 && !dma_has_cap(DMA_PRIVATE, dev->cap_mask)) + list_for_each_entry(chan, &dev->channels, device_node) { + /* some channels are already publicly allocated */ + if (chan->client_count) + return NULL; } + + list_for_each_entry(chan, &dev->channels, device_node) { + if (chan->client_count) { + pr_debug("%s: %s busy\n", + __func__, dma_chan_name(chan)); + continue; + } + if (fn && !fn(chan, fn_param)) { + pr_debug("%s: %s filter said false\n", + __func__, dma_chan_name(chan)); + continue; + } + return chan; } - mutex_unlock(&dma_list_mutex); + return NULL; } /** - * dma_async_client_register - register a &dma_client - * @client: ptr to a client structure with valid 'event_callback' and 'cap_mask' + * dma_request_channel - try to allocate an exclusive channel + * @mask: capabilities that the channel must satisfy + * @fn: optional callback to disposition available channels + * @fn_param: opaque parameter to pass to dma_filter_fn */ -void dma_async_client_register(struct dma_client *client) +struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param) { - /* validate client data */ - BUG_ON(dma_has_cap(DMA_SLAVE, client->cap_mask) && - !client->slave); + struct dma_device *device, *_d; + struct dma_chan *chan = NULL; + int err; + /* Find a channel */ + mutex_lock(&dma_list_mutex); + list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { + chan = private_candidate(mask, device, fn, fn_param); + if (chan) { + /* Found a suitable channel, try to grab, prep, and + * return it. We first set DMA_PRIVATE to disable + * balance_ref_count as this channel will not be + * published in the general-purpose allocator + */ + dma_cap_set(DMA_PRIVATE, device->cap_mask); + err = dma_chan_get(chan); + + if (err == -ENODEV) { + pr_debug("%s: %s module removed\n", __func__, + dma_chan_name(chan)); + list_del_rcu(&device->global_node); + } else if (err) + pr_err("dmaengine: failed to get %s: (%d)\n", + dma_chan_name(chan), err); + else + break; + chan = NULL; + } + } + mutex_unlock(&dma_list_mutex); + + pr_debug("%s: %s (%s)\n", __func__, chan ? "success" : "fail", + chan ? 
dma_chan_name(chan) : NULL); + + return chan; +} +EXPORT_SYMBOL_GPL(__dma_request_channel); + +void dma_release_channel(struct dma_chan *chan) +{ mutex_lock(&dma_list_mutex); - list_add_tail(&client->global_node, &dma_client_list); + WARN_ONCE(chan->client_count != 1, + "chan reference count %d != 1\n", chan->client_count); + dma_chan_put(chan); mutex_unlock(&dma_list_mutex); } -EXPORT_SYMBOL(dma_async_client_register); +EXPORT_SYMBOL_GPL(dma_release_channel); /** - * dma_async_client_unregister - unregister a client and free the &dma_client - * @client: &dma_client to free - * - * Force frees any allocated DMA channels, frees the &dma_client memory + * dmaengine_get - register interest in dma_channels */ -void dma_async_client_unregister(struct dma_client *client) +void dmaengine_get(void) { - struct dma_device *device; + struct dma_device *device, *_d; struct dma_chan *chan; - enum dma_state_client ack; - - if (!client) - return; + int err; mutex_lock(&dma_list_mutex); - /* free all channels the client is holding */ - list_for_each_entry(device, &dma_device_list, global_node) - list_for_each_entry(chan, &device->channels, device_node) { - ack = client->event_callback(client, chan, - DMA_RESOURCE_REMOVED); + dmaengine_ref_count++; - if (ack == DMA_ACK) { - dma_chan_put(chan); - chan->client_count--; - } + /* try to grab channels */ + list_for_each_entry_safe(device, _d, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) { + err = dma_chan_get(chan); + if (err == -ENODEV) { + /* module removed before we could use it */ + list_del_rcu(&device->global_node); + break; + } else if (err) + pr_err("dmaengine: failed to get %s: (%d)\n", + dma_chan_name(chan), err); } + } - list_del(&client->global_node); + /* if this is the first reference and there were channels + * waiting we need to rebalance to get those channels + * incorporated into the channel table + */ + if (dmaengine_ref_count == 1) + dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); } -EXPORT_SYMBOL(dma_async_client_unregister); +EXPORT_SYMBOL(dmaengine_get); /** - * dma_async_client_chan_request - send all available channels to the - * client that satisfy the capability mask - * @client - requester + * dmaengine_put - let dma drivers be removed when ref_count == 0 */ -void dma_async_client_chan_request(struct dma_client *client) +void dmaengine_put(void) { + struct dma_device *device; + struct dma_chan *chan; + mutex_lock(&dma_list_mutex); - dma_client_chan_alloc(client); + dmaengine_ref_count--; + BUG_ON(dmaengine_ref_count < 0); + /* drop channel references */ + list_for_each_entry(device, &dma_device_list, global_node) { + if (dma_has_cap(DMA_PRIVATE, device->cap_mask)) + continue; + list_for_each_entry(chan, &device->channels, device_node) + dma_chan_put(chan); + } mutex_unlock(&dma_list_mutex); } -EXPORT_SYMBOL(dma_async_client_chan_request); +EXPORT_SYMBOL(dmaengine_put); /** * dma_async_device_register - registers DMA devices found @@ -357,9 +612,9 @@ EXPORT_SYMBOL(dma_async_client_chan_request); */ int dma_async_device_register(struct dma_device *device) { - static int id; int chancnt = 0, rc; struct dma_chan* chan; + atomic_t *idr_ref; if (!device) return -ENODEV; @@ -386,57 +641,83 @@ int dma_async_device_register(struct dma_device *device) BUG_ON(!device->device_issue_pending); BUG_ON(!device->dev); - init_completion(&device->done); - kref_init(&device->refcount); - + idr_ref = kmalloc(sizeof(*idr_ref), GFP_KERNEL); 
+ if (!idr_ref) + return -ENOMEM; + atomic_set(idr_ref, 0); + idr_retry: + if (!idr_pre_get(&dma_idr, GFP_KERNEL)) + return -ENOMEM; mutex_lock(&dma_list_mutex); - device->dev_id = id++; + rc = idr_get_new(&dma_idr, NULL, &device->dev_id); mutex_unlock(&dma_list_mutex); + if (rc == -EAGAIN) + goto idr_retry; + else if (rc != 0) + return rc; /* represent channels in sysfs. Probably want devs too */ list_for_each_entry(chan, &device->channels, device_node) { chan->local = alloc_percpu(typeof(*chan->local)); if (chan->local == NULL) continue; + chan->dev = kzalloc(sizeof(*chan->dev), GFP_KERNEL); + if (chan->dev == NULL) { + free_percpu(chan->local); + continue; + } chan->chan_id = chancnt++; - chan->dev.class = &dma_devclass; - chan->dev.parent = device->dev; - dev_set_name(&chan->dev, "dma%dchan%d", + chan->dev->device.class = &dma_devclass; + chan->dev->device.parent = device->dev; + chan->dev->chan = chan; + chan->dev->idr_ref = idr_ref; + chan->dev->dev_id = device->dev_id; + atomic_inc(idr_ref); + dev_set_name(&chan->dev->device, "dma%dchan%d", device->dev_id, chan->chan_id); - rc = device_register(&chan->dev); + rc = device_register(&chan->dev->device); if (rc) { - chancnt--; free_percpu(chan->local); chan->local = NULL; goto err_out; } - - /* One for the channel, one of the class device */ - kref_get(&device->refcount); - kref_get(&device->refcount); - kref_init(&chan->refcount); chan->client_count = 0; - chan->slow_ref = 0; - INIT_RCU_HEAD(&chan->rcu); } + device->chancnt = chancnt; mutex_lock(&dma_list_mutex); - list_add_tail(&device->global_node, &dma_device_list); + /* take references on public channels */ + if (dmaengine_ref_count && !dma_has_cap(DMA_PRIVATE, device->cap_mask)) + list_for_each_entry(chan, &device->channels, device_node) { + /* if clients are already waiting for channels we need + * to take references on their behalf + */ + if (dma_chan_get(chan) == -ENODEV) { + /* note we can only get here for the first + * channel as the remaining channels are + * guaranteed to get a reference + */ + rc = -ENODEV; + mutex_unlock(&dma_list_mutex); + goto err_out; + } + } + list_add_tail_rcu(&device->global_node, &dma_device_list); + dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); - dma_clients_notify_available(); - return 0; err_out: list_for_each_entry(chan, &device->channels, device_node) { if (chan->local == NULL) continue; - kref_put(&device->refcount, dma_async_device_cleanup); - device_unregister(&chan->dev); - chancnt--; + mutex_lock(&dma_list_mutex); + chan->dev->chan = NULL; + mutex_unlock(&dma_list_mutex); + device_unregister(&chan->dev->device); free_percpu(chan->local); } return rc; @@ -444,37 +725,30 @@ err_out: EXPORT_SYMBOL(dma_async_device_register); /** - * dma_async_device_cleanup - function called when all references are released - * @kref: kernel reference object - */ -static void dma_async_device_cleanup(struct kref *kref) -{ - struct dma_device *device; - - device = container_of(kref, struct dma_device, refcount); - complete(&device->done); -} - -/** - * dma_async_device_unregister - unregisters DMA devices + * dma_async_device_unregister - unregister a DMA device * @device: &dma_device + * + * This routine is called by dma driver exit routines, dmaengine holds module + * references to prevent it being called while channels are in use. 
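+ *
+ * A minimal sketch of a driver exit path, assuming a hypothetical
+ * struct my_adma that embeds the registered struct dma_device:
+ *
+ *	static int my_adma_remove(struct platform_device *pdev)
+ *	{
+ *		struct my_adma *adma = platform_get_drvdata(pdev);
+ *
+ *		dma_async_device_unregister(&adma->dma);
+ *		dma_free_coherent(&pdev->dev, adma->pool_size,
+ *				  adma->desc_pool_virt, adma->desc_pool);
+ *		return 0;
+ *	}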
*/ void dma_async_device_unregister(struct dma_device *device) { struct dma_chan *chan; mutex_lock(&dma_list_mutex); - list_del(&device->global_node); + list_del_rcu(&device->global_node); + dma_channel_rebalance(); mutex_unlock(&dma_list_mutex); list_for_each_entry(chan, &device->channels, device_node) { - dma_clients_notify_removed(chan); - device_unregister(&chan->dev); - dma_chan_release(chan); + WARN_ONCE(chan->client_count, + "%s called while %d clients hold a reference\n", + __func__, chan->client_count); + mutex_lock(&dma_list_mutex); + chan->dev->chan = NULL; + mutex_unlock(&dma_list_mutex); + device_unregister(&chan->dev->device); } - - kref_put(&device->refcount, dma_async_device_cleanup); - wait_for_completion(&device->done); } EXPORT_SYMBOL(dma_async_device_unregister); @@ -626,10 +900,96 @@ void dma_async_tx_descriptor_init(struct dma_async_tx_descriptor *tx, } EXPORT_SYMBOL(dma_async_tx_descriptor_init); +/* dma_wait_for_async_tx - spin wait for a transaction to complete + * @tx: in-flight transaction to wait on + * + * This routine assumes that tx was obtained from a call to async_memcpy, + * async_xor, async_memset, etc which ensures that tx is "in-flight" (prepped + * and submitted). Walking the parent chain is only meant to cover for DMA + * drivers that do not implement the DMA_INTERRUPT capability and may race with + * the driver's descriptor cleanup routine. + */ +enum dma_status +dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) +{ + enum dma_status status; + struct dma_async_tx_descriptor *iter; + struct dma_async_tx_descriptor *parent; + + if (!tx) + return DMA_SUCCESS; + + WARN_ONCE(tx->parent, "%s: speculatively walking dependency chain for" + " %s\n", __func__, dma_chan_name(tx->chan)); + + /* poll through the dependency chain, return when tx is complete */ + do { + iter = tx; + + /* find the root of the unsubmitted dependency chain */ + do { + parent = iter->parent; + if (!parent) + break; + else + iter = parent; + } while (parent); + + /* there is a small window for ->parent == NULL and + * ->cookie == -EBUSY + */ + while (iter->cookie == -EBUSY) + cpu_relax(); + + status = dma_sync_wait(iter->chan, iter->cookie); + } while (status == DMA_IN_PROGRESS || (iter != tx)); + + return status; +} +EXPORT_SYMBOL_GPL(dma_wait_for_async_tx); + +/* dma_run_dependencies - helper routine for dma drivers to process + * (start) dependent operations on their target channel + * @tx: transaction with dependencies + */ +void dma_run_dependencies(struct dma_async_tx_descriptor *tx) +{ + struct dma_async_tx_descriptor *dep = tx->next; + struct dma_async_tx_descriptor *dep_next; + struct dma_chan *chan; + + if (!dep) + return; + + chan = dep->chan; + + /* keep submitting up until a channel switch is detected + * in that case we will be called again as a result of + * processing the interrupt from async_tx_channel_switch + */ + for (; dep; dep = dep_next) { + spin_lock_bh(&dep->lock); + dep->parent = NULL; + dep_next = dep->next; + if (dep_next && dep_next->chan == chan) + dep->next = NULL; /* ->next will be submitted */ + else + dep_next = NULL; /* submit current dep and terminate */ + spin_unlock_bh(&dep->lock); + + dep->tx_submit(dep); + } + + chan->device->device_issue_pending(chan); +} +EXPORT_SYMBOL_GPL(dma_run_dependencies); + static int __init dma_bus_init(void) { + idr_init(&dma_idr); mutex_init(&dma_list_mutex); return class_register(&dma_devclass); } -subsys_initcall(dma_bus_init); +arch_initcall(dma_bus_init); + diff --git a/drivers/dma/dmatest.c 
b/drivers/dma/dmatest.c index ed9636bfb54..3603f1ea5b2 100644 --- a/drivers/dma/dmatest.c +++ b/drivers/dma/dmatest.c @@ -35,7 +35,7 @@ MODULE_PARM_DESC(threads_per_chan, static unsigned int max_channels; module_param(max_channels, uint, S_IRUGO); -MODULE_PARM_DESC(nr_channels, +MODULE_PARM_DESC(max_channels, "Maximum number of channels to use (default: all)"); /* @@ -71,7 +71,7 @@ struct dmatest_chan { /* * These are protected by dma_list_mutex since they're only used by - * the DMA client event callback + * the DMA filter function callback */ static LIST_HEAD(dmatest_channels); static unsigned int nr_channels; @@ -80,7 +80,7 @@ static bool dmatest_match_channel(struct dma_chan *chan) { if (test_channel[0] == '\0') return true; - return strcmp(dev_name(&chan->dev), test_channel) == 0; + return strcmp(dma_chan_name(chan), test_channel) == 0; } static bool dmatest_match_device(struct dma_device *device) @@ -215,7 +215,6 @@ static int dmatest_func(void *data) smp_rmb(); chan = thread->chan; - dma_chan_get(chan); while (!kthread_should_stop()) { total_tests++; @@ -293,7 +292,6 @@ static int dmatest_func(void *data) } ret = 0; - dma_chan_put(chan); kfree(thread->dstbuf); err_dstbuf: kfree(thread->srcbuf); @@ -319,21 +317,16 @@ static void dmatest_cleanup_channel(struct dmatest_chan *dtc) kfree(dtc); } -static enum dma_state_client dmatest_add_channel(struct dma_chan *chan) +static int dmatest_add_channel(struct dma_chan *chan) { struct dmatest_chan *dtc; struct dmatest_thread *thread; unsigned int i; - /* Have we already been told about this channel? */ - list_for_each_entry(dtc, &dmatest_channels, node) - if (dtc->chan == chan) - return DMA_DUP; - dtc = kmalloc(sizeof(struct dmatest_chan), GFP_KERNEL); if (!dtc) { - pr_warning("dmatest: No memory for %s\n", dev_name(&chan->dev)); - return DMA_NAK; + pr_warning("dmatest: No memory for %s\n", dma_chan_name(chan)); + return -ENOMEM; } dtc->chan = chan; @@ -343,16 +336,16 @@ static enum dma_state_client dmatest_add_channel(struct dma_chan *chan) thread = kzalloc(sizeof(struct dmatest_thread), GFP_KERNEL); if (!thread) { pr_warning("dmatest: No memory for %s-test%u\n", - dev_name(&chan->dev), i); + dma_chan_name(chan), i); break; } thread->chan = dtc->chan; smp_wmb(); thread->task = kthread_run(dmatest_func, thread, "%s-test%u", - dev_name(&chan->dev), i); + dma_chan_name(chan), i); if (IS_ERR(thread->task)) { pr_warning("dmatest: Failed to run thread %s-test%u\n", - dev_name(&chan->dev), i); + dma_chan_name(chan), i); kfree(thread); break; } @@ -362,86 +355,62 @@ static enum dma_state_client dmatest_add_channel(struct dma_chan *chan) list_add_tail(&thread->node, &dtc->threads); } - pr_info("dmatest: Started %u threads using %s\n", i, dev_name(&chan->dev)); + pr_info("dmatest: Started %u threads using %s\n", i, dma_chan_name(chan)); list_add_tail(&dtc->node, &dmatest_channels); nr_channels++; - return DMA_ACK; -} - -static enum dma_state_client dmatest_remove_channel(struct dma_chan *chan) -{ - struct dmatest_chan *dtc, *_dtc; - - list_for_each_entry_safe(dtc, _dtc, &dmatest_channels, node) { - if (dtc->chan == chan) { - list_del(&dtc->node); - dmatest_cleanup_channel(dtc); - pr_debug("dmatest: lost channel %s\n", - dev_name(&chan->dev)); - return DMA_ACK; - } - } - - return DMA_DUP; + return 0; } -/* - * Start testing threads as new channels are assigned to us, and kill - * them when the channels go away. 
- * - * When we unregister the client, all channels are removed so this - * will also take care of cleaning things up when the module is - * unloaded. - */ -static enum dma_state_client -dmatest_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state) +static bool filter(struct dma_chan *chan, void *param) { - enum dma_state_client ack = DMA_NAK; - - switch (state) { - case DMA_RESOURCE_AVAILABLE: - if (!dmatest_match_channel(chan) - || !dmatest_match_device(chan->device)) - ack = DMA_DUP; - else if (max_channels && nr_channels >= max_channels) - ack = DMA_NAK; - else - ack = dmatest_add_channel(chan); - break; - - case DMA_RESOURCE_REMOVED: - ack = dmatest_remove_channel(chan); - break; - - default: - pr_info("dmatest: Unhandled event %u (%s)\n", - state, dev_name(&chan->dev)); - break; - } - - return ack; + if (!dmatest_match_channel(chan) || !dmatest_match_device(chan->device)) + return false; + else + return true; } -static struct dma_client dmatest_client = { - .event_callback = dmatest_event, -}; - static int __init dmatest_init(void) { - dma_cap_set(DMA_MEMCPY, dmatest_client.cap_mask); - dma_async_client_register(&dmatest_client); - dma_async_client_chan_request(&dmatest_client); + dma_cap_mask_t mask; + struct dma_chan *chan; + int err = 0; + + dma_cap_zero(mask); + dma_cap_set(DMA_MEMCPY, mask); + for (;;) { + chan = dma_request_channel(mask, filter, NULL); + if (chan) { + err = dmatest_add_channel(chan); + if (err == 0) + continue; + else { + dma_release_channel(chan); + break; /* add_channel failed, punt */ + } + } else + break; /* no more channels available */ + if (max_channels && nr_channels >= max_channels) + break; /* we have all we need */ + } - return 0; + return err; } -module_init(dmatest_init); +/* when compiled-in wait for drivers to load first */ +late_initcall(dmatest_init); static void __exit dmatest_exit(void) { - dma_async_client_unregister(&dmatest_client); + struct dmatest_chan *dtc, *_dtc; + + list_for_each_entry_safe(dtc, _dtc, &dmatest_channels, node) { + list_del(&dtc->node); + dmatest_cleanup_channel(dtc); + pr_debug("dmatest: dropped channel %s\n", + dma_chan_name(dtc->chan)); + dma_release_channel(dtc->chan); + } } module_exit(dmatest_exit); diff --git a/drivers/dma/dw_dmac.c b/drivers/dma/dw_dmac.c index 0778d99aea7..6b702cc46b3 100644 --- a/drivers/dma/dw_dmac.c +++ b/drivers/dma/dw_dmac.c @@ -70,6 +70,15 @@ * the controller, though. 
*/ +static struct device *chan2dev(struct dma_chan *chan) +{ + return &chan->dev->device; +} +static struct device *chan2parent(struct dma_chan *chan) +{ + return chan->dev->device.parent; +} + static struct dw_desc *dwc_first_active(struct dw_dma_chan *dwc) { return list_entry(dwc->active_list.next, struct dw_desc, desc_node); @@ -93,12 +102,12 @@ static struct dw_desc *dwc_desc_get(struct dw_dma_chan *dwc) ret = desc; break; } - dev_dbg(&dwc->chan.dev, "desc %p not ACKed\n", desc); + dev_dbg(chan2dev(&dwc->chan), "desc %p not ACKed\n", desc); i++; } spin_unlock_bh(&dwc->lock); - dev_vdbg(&dwc->chan.dev, "scanned %u descriptors on freelist\n", i); + dev_vdbg(chan2dev(&dwc->chan), "scanned %u descriptors on freelist\n", i); return ret; } @@ -108,10 +117,10 @@ static void dwc_sync_desc_for_cpu(struct dw_dma_chan *dwc, struct dw_desc *desc) struct dw_desc *child; list_for_each_entry(child, &desc->txd.tx_list, desc_node) - dma_sync_single_for_cpu(dwc->chan.dev.parent, + dma_sync_single_for_cpu(chan2parent(&dwc->chan), child->txd.phys, sizeof(child->lli), DMA_TO_DEVICE); - dma_sync_single_for_cpu(dwc->chan.dev.parent, + dma_sync_single_for_cpu(chan2parent(&dwc->chan), desc->txd.phys, sizeof(desc->lli), DMA_TO_DEVICE); } @@ -129,11 +138,11 @@ static void dwc_desc_put(struct dw_dma_chan *dwc, struct dw_desc *desc) spin_lock_bh(&dwc->lock); list_for_each_entry(child, &desc->txd.tx_list, desc_node) - dev_vdbg(&dwc->chan.dev, + dev_vdbg(chan2dev(&dwc->chan), "moving child desc %p to freelist\n", child); list_splice_init(&desc->txd.tx_list, &dwc->free_list); - dev_vdbg(&dwc->chan.dev, "moving desc %p to freelist\n", desc); + dev_vdbg(chan2dev(&dwc->chan), "moving desc %p to freelist\n", desc); list_add(&desc->desc_node, &dwc->free_list); spin_unlock_bh(&dwc->lock); } @@ -163,9 +172,9 @@ static void dwc_dostart(struct dw_dma_chan *dwc, struct dw_desc *first) /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), "BUG: Attempted to start non-idle channel\n"); - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), " SAR: 0x%x DAR: 0x%x LLP: 0x%x CTL: 0x%x:%08x\n", channel_readl(dwc, SAR), channel_readl(dwc, DAR), @@ -193,7 +202,7 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc) void *param; struct dma_async_tx_descriptor *txd = &desc->txd; - dev_vdbg(&dwc->chan.dev, "descriptor %u complete\n", txd->cookie); + dev_vdbg(chan2dev(&dwc->chan), "descriptor %u complete\n", txd->cookie); dwc->completed = txd->cookie; callback = txd->callback; @@ -208,11 +217,11 @@ dwc_descriptor_complete(struct dw_dma_chan *dwc, struct dw_desc *desc) * mapped before they were submitted... */ if (!(txd->flags & DMA_COMPL_SKIP_DEST_UNMAP)) - dma_unmap_page(dwc->chan.dev.parent, desc->lli.dar, desc->len, - DMA_FROM_DEVICE); + dma_unmap_page(chan2parent(&dwc->chan), desc->lli.dar, + desc->len, DMA_FROM_DEVICE); if (!(txd->flags & DMA_COMPL_SKIP_SRC_UNMAP)) - dma_unmap_page(dwc->chan.dev.parent, desc->lli.sar, desc->len, - DMA_TO_DEVICE); + dma_unmap_page(chan2parent(&dwc->chan), desc->lli.sar, + desc->len, DMA_TO_DEVICE); /* * The API requires that no submissions are done from a @@ -228,7 +237,7 @@ static void dwc_complete_all(struct dw_dma *dw, struct dw_dma_chan *dwc) LIST_HEAD(list); if (dma_readl(dw, CH_EN) & dwc->mask) { - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), "BUG: XFER bit set, but channel not idle!\n"); /* Try to continue after resetting the channel... 
*/ @@ -273,7 +282,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) return; } - dev_vdbg(&dwc->chan.dev, "scan_descriptors: llp=0x%x\n", llp); + dev_vdbg(chan2dev(&dwc->chan), "scan_descriptors: llp=0x%x\n", llp); list_for_each_entry_safe(desc, _desc, &dwc->active_list, desc_node) { if (desc->lli.llp == llp) @@ -292,7 +301,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) dwc_descriptor_complete(dwc, desc); } - dev_err(&dwc->chan.dev, + dev_err(chan2dev(&dwc->chan), "BUG: All descriptors done, but channel not idle!\n"); /* Try to continue after resetting the channel... */ @@ -308,7 +317,7 @@ static void dwc_scan_descriptors(struct dw_dma *dw, struct dw_dma_chan *dwc) static void dwc_dump_lli(struct dw_dma_chan *dwc, struct dw_lli *lli) { - dev_printk(KERN_CRIT, &dwc->chan.dev, + dev_printk(KERN_CRIT, chan2dev(&dwc->chan), " desc: s0x%x d0x%x l0x%x c0x%x:%x\n", lli->sar, lli->dar, lli->llp, lli->ctlhi, lli->ctllo); @@ -342,9 +351,9 @@ static void dwc_handle_error(struct dw_dma *dw, struct dw_dma_chan *dwc) * controller flagged an error instead of scribbling over * random memory locations. */ - dev_printk(KERN_CRIT, &dwc->chan.dev, + dev_printk(KERN_CRIT, chan2dev(&dwc->chan), "Bad descriptor submitted for DMA!\n"); - dev_printk(KERN_CRIT, &dwc->chan.dev, + dev_printk(KERN_CRIT, chan2dev(&dwc->chan), " cookie: %d\n", bad_desc->txd.cookie); dwc_dump_lli(dwc, &bad_desc->lli); list_for_each_entry(child, &bad_desc->txd.tx_list, desc_node) @@ -442,12 +451,12 @@ static dma_cookie_t dwc_tx_submit(struct dma_async_tx_descriptor *tx) * for DMA. But this is hard to do in a race-free manner. */ if (list_empty(&dwc->active_list)) { - dev_vdbg(&tx->chan->dev, "tx_submit: started %u\n", + dev_vdbg(chan2dev(tx->chan), "tx_submit: started %u\n", desc->txd.cookie); dwc_dostart(dwc, desc); list_add_tail(&desc->desc_node, &dwc->active_list); } else { - dev_vdbg(&tx->chan->dev, "tx_submit: queued %u\n", + dev_vdbg(chan2dev(tx->chan), "tx_submit: queued %u\n", desc->txd.cookie); list_add_tail(&desc->desc_node, &dwc->queue); @@ -472,11 +481,11 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, unsigned int dst_width; u32 ctllo; - dev_vdbg(&chan->dev, "prep_dma_memcpy d0x%x s0x%x l0x%zx f0x%lx\n", + dev_vdbg(chan2dev(chan), "prep_dma_memcpy d0x%x s0x%x l0x%zx f0x%lx\n", dest, src, len, flags); if (unlikely(!len)) { - dev_dbg(&chan->dev, "prep_dma_memcpy: length is zero!\n"); + dev_dbg(chan2dev(chan), "prep_dma_memcpy: length is zero!\n"); return NULL; } @@ -516,7 +525,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, first = desc; } else { prev->lli.llp = desc->txd.phys; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); list_add_tail(&desc->desc_node, @@ -531,7 +540,7 @@ dwc_prep_dma_memcpy(struct dma_chan *chan, dma_addr_t dest, dma_addr_t src, prev->lli.ctllo |= DWC_CTLL_INT_EN; prev->lli.llp = 0; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -562,15 +571,15 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, struct scatterlist *sg; size_t total_len = 0; - dev_vdbg(&chan->dev, "prep_dma_slave\n"); + dev_vdbg(chan2dev(chan), "prep_dma_slave\n"); if (unlikely(!dws || !sg_len)) return NULL; - reg_width = dws->slave.reg_width; + reg_width = dws->reg_width; prev = first = NULL; - 
sg_len = dma_map_sg(chan->dev.parent, sgl, sg_len, direction); + sg_len = dma_map_sg(chan2parent(chan), sgl, sg_len, direction); switch (direction) { case DMA_TO_DEVICE: @@ -579,7 +588,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, | DWC_CTLL_DST_FIX | DWC_CTLL_SRC_INC | DWC_CTLL_FC_M2P); - reg = dws->slave.tx_reg; + reg = dws->tx_reg; for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; u32 len; @@ -587,7 +596,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, desc = dwc_desc_get(dwc); if (!desc) { - dev_err(&chan->dev, + dev_err(chan2dev(chan), "not enough descriptors available\n"); goto err_desc_get; } @@ -607,7 +616,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, first = desc; } else { prev->lli.llp = desc->txd.phys; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -625,7 +634,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, | DWC_CTLL_SRC_FIX | DWC_CTLL_FC_P2M); - reg = dws->slave.rx_reg; + reg = dws->rx_reg; for_each_sg(sgl, sg, sg_len, i) { struct dw_desc *desc; u32 len; @@ -633,7 +642,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, desc = dwc_desc_get(dwc); if (!desc) { - dev_err(&chan->dev, + dev_err(chan2dev(chan), "not enough descriptors available\n"); goto err_desc_get; } @@ -653,7 +662,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, first = desc; } else { prev->lli.llp = desc->txd.phys; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -673,7 +682,7 @@ dwc_prep_slave_sg(struct dma_chan *chan, struct scatterlist *sgl, prev->lli.ctllo |= DWC_CTLL_INT_EN; prev->lli.llp = 0; - dma_sync_single_for_device(chan->dev.parent, + dma_sync_single_for_device(chan2parent(chan), prev->txd.phys, sizeof(prev->lli), DMA_TO_DEVICE); @@ -758,29 +767,21 @@ static void dwc_issue_pending(struct dma_chan *chan) spin_unlock_bh(&dwc->lock); } -static int dwc_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int dwc_alloc_chan_resources(struct dma_chan *chan) { struct dw_dma_chan *dwc = to_dw_dma_chan(chan); struct dw_dma *dw = to_dw_dma(chan->device); struct dw_desc *desc; - struct dma_slave *slave; struct dw_dma_slave *dws; int i; u32 cfghi; u32 cfglo; - dev_vdbg(&chan->dev, "alloc_chan_resources\n"); - - /* Channels doing slave DMA can only handle one client. */ - if (dwc->dws || client->slave) { - if (chan->client_count) - return -EBUSY; - } + dev_vdbg(chan2dev(chan), "alloc_chan_resources\n"); /* ASSERT: channel is idle */ if (dma_readl(dw, CH_EN) & dwc->mask) { - dev_dbg(&chan->dev, "DMA channel not idle?\n"); + dev_dbg(chan2dev(chan), "DMA channel not idle?\n"); return -EIO; } @@ -789,23 +790,17 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, cfghi = DWC_CFGH_FIFO_MODE; cfglo = 0; - slave = client->slave; - if (slave) { + dws = dwc->dws; + if (dws) { /* * We need controller-specific data to set up slave * transfers. 
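		 * When no dw_dma_slave configuration is attached to the
		 * channel, the reset defaults assigned above
		 * (DWC_CFGH_FIFO_MODE, 0) are written to CFG_HI/CFG_LO
		 * unchanged.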
*/ - BUG_ON(!slave->dma_dev || slave->dma_dev != dw->dma.dev); - - dws = container_of(slave, struct dw_dma_slave, slave); + BUG_ON(!dws->dma_dev || dws->dma_dev != dw->dma.dev); - dwc->dws = dws; cfghi = dws->cfg_hi; cfglo = dws->cfg_lo; - } else { - dwc->dws = NULL; } - channel_writel(dwc, CFG_LO, cfglo); channel_writel(dwc, CFG_HI, cfghi); @@ -822,7 +817,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, desc = kzalloc(sizeof(struct dw_desc), GFP_KERNEL); if (!desc) { - dev_info(&chan->dev, + dev_info(chan2dev(chan), "only allocated %d descriptors\n", i); spin_lock_bh(&dwc->lock); break; @@ -832,7 +827,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, desc->txd.tx_submit = dwc_tx_submit; desc->txd.flags = DMA_CTRL_ACK; INIT_LIST_HEAD(&desc->txd.tx_list); - desc->txd.phys = dma_map_single(chan->dev.parent, &desc->lli, + desc->txd.phys = dma_map_single(chan2parent(chan), &desc->lli, sizeof(desc->lli), DMA_TO_DEVICE); dwc_desc_put(dwc, desc); @@ -847,7 +842,7 @@ static int dwc_alloc_chan_resources(struct dma_chan *chan, spin_unlock_bh(&dwc->lock); - dev_dbg(&chan->dev, + dev_dbg(chan2dev(chan), "alloc_chan_resources allocated %d descriptors\n", i); return i; @@ -860,7 +855,7 @@ static void dwc_free_chan_resources(struct dma_chan *chan) struct dw_desc *desc, *_desc; LIST_HEAD(list); - dev_dbg(&chan->dev, "free_chan_resources (descs allocated=%u)\n", + dev_dbg(chan2dev(chan), "free_chan_resources (descs allocated=%u)\n", dwc->descs_allocated); /* ASSERT: channel is idle */ @@ -881,13 +876,13 @@ static void dwc_free_chan_resources(struct dma_chan *chan) spin_unlock_bh(&dwc->lock); list_for_each_entry_safe(desc, _desc, &list, desc_node) { - dev_vdbg(&chan->dev, " freeing descriptor %p\n", desc); - dma_unmap_single(chan->dev.parent, desc->txd.phys, + dev_vdbg(chan2dev(chan), " freeing descriptor %p\n", desc); + dma_unmap_single(chan2parent(chan), desc->txd.phys, sizeof(desc->lli), DMA_TO_DEVICE); kfree(desc); } - dev_vdbg(&chan->dev, "free_chan_resources done\n"); + dev_vdbg(chan2dev(chan), "free_chan_resources done\n"); } /*----------------------------------------------------------------------*/ diff --git a/drivers/dma/fsldma.c b/drivers/dma/fsldma.c index 0b95dcce447..ca70a21afc6 100644 --- a/drivers/dma/fsldma.c +++ b/drivers/dma/fsldma.c @@ -366,8 +366,7 @@ static struct fsl_desc_sw *fsl_dma_alloc_descriptor( * * Return - The number of descriptors allocated. 
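 *
 * Allocation is triggered by dma_chan_get() taking the first client
 * reference on the channel rather than by an explicit dma_client.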
*/ -static int fsl_dma_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int fsl_dma_alloc_chan_resources(struct dma_chan *chan) { struct fsl_dma_chan *fsl_chan = to_fsl_chan(chan); @@ -823,7 +822,7 @@ static int __devinit fsl_dma_chan_probe(struct fsl_dma_device *fdev, */ WARN_ON(fdev->feature != new_fsl_chan->feature); - new_fsl_chan->dev = &new_fsl_chan->common.dev; + new_fsl_chan->dev = &new_fsl_chan->common.dev->device; new_fsl_chan->reg_base = ioremap(new_fsl_chan->reg.start, new_fsl_chan->reg.end - new_fsl_chan->reg.start + 1); diff --git a/drivers/dma/ioat.c b/drivers/dma/ioat.c index 9b16a3af9a0..4105d6575b6 100644 --- a/drivers/dma/ioat.c +++ b/drivers/dma/ioat.c @@ -75,60 +75,10 @@ static int ioat_dca_enabled = 1; module_param(ioat_dca_enabled, int, 0644); MODULE_PARM_DESC(ioat_dca_enabled, "control support of dca service (default: 1)"); -static int ioat_setup_functionality(struct pci_dev *pdev, void __iomem *iobase) -{ - struct ioat_device *device = pci_get_drvdata(pdev); - u8 version; - int err = 0; - - version = readb(iobase + IOAT_VER_OFFSET); - switch (version) { - case IOAT_VER_1_2: - device->dma = ioat_dma_probe(pdev, iobase); - if (device->dma && ioat_dca_enabled) - device->dca = ioat_dca_init(pdev, iobase); - break; - case IOAT_VER_2_0: - device->dma = ioat_dma_probe(pdev, iobase); - if (device->dma && ioat_dca_enabled) - device->dca = ioat2_dca_init(pdev, iobase); - break; - case IOAT_VER_3_0: - device->dma = ioat_dma_probe(pdev, iobase); - if (device->dma && ioat_dca_enabled) - device->dca = ioat3_dca_init(pdev, iobase); - break; - default: - err = -ENODEV; - break; - } - if (!device->dma) - err = -ENODEV; - return err; -} - -static void ioat_shutdown_functionality(struct pci_dev *pdev) -{ - struct ioat_device *device = pci_get_drvdata(pdev); - - dev_err(&pdev->dev, "Removing dma and dca services\n"); - if (device->dca) { - unregister_dca_provider(device->dca); - free_dca_provider(device->dca); - device->dca = NULL; - } - - if (device->dma) { - ioat_dma_remove(device->dma); - device->dma = NULL; - } -} - static struct pci_driver ioat_pci_driver = { .name = "ioatdma", .id_table = ioat_pci_tbl, .probe = ioat_probe, - .shutdown = ioat_shutdown_functionality, .remove = __devexit_p(ioat_remove), }; @@ -179,7 +129,29 @@ static int __devinit ioat_probe(struct pci_dev *pdev, pci_set_master(pdev); - err = ioat_setup_functionality(pdev, iobase); + switch (readb(iobase + IOAT_VER_OFFSET)) { + case IOAT_VER_1_2: + device->dma = ioat_dma_probe(pdev, iobase); + if (device->dma && ioat_dca_enabled) + device->dca = ioat_dca_init(pdev, iobase); + break; + case IOAT_VER_2_0: + device->dma = ioat_dma_probe(pdev, iobase); + if (device->dma && ioat_dca_enabled) + device->dca = ioat2_dca_init(pdev, iobase); + break; + case IOAT_VER_3_0: + device->dma = ioat_dma_probe(pdev, iobase); + if (device->dma && ioat_dca_enabled) + device->dca = ioat3_dca_init(pdev, iobase); + break; + default: + err = -ENODEV; + break; + } + if (!device->dma) + err = -ENODEV; + if (err) goto err_version; @@ -198,17 +170,21 @@ err_enable_device: return err; } -/* - * It is unsafe to remove this module: if removed while a requested - * dma is outstanding, esp. from tcp, it is possible to hang while - * waiting for something that will never finish. However, if you're - * feeling lucky, this usually works just fine. 
- */ static void __devexit ioat_remove(struct pci_dev *pdev) { struct ioat_device *device = pci_get_drvdata(pdev); - ioat_shutdown_functionality(pdev); + dev_err(&pdev->dev, "Removing dma and dca services\n"); + if (device->dca) { + unregister_dca_provider(device->dca); + free_dca_provider(device->dca); + device->dca = NULL; + } + + if (device->dma) { + ioat_dma_remove(device->dma); + device->dma = NULL; + } kfree(device); } diff --git a/drivers/dma/ioat_dma.c b/drivers/dma/ioat_dma.c index 6607fdd00b1..b3759c4b653 100644 --- a/drivers/dma/ioat_dma.c +++ b/drivers/dma/ioat_dma.c @@ -734,8 +734,7 @@ static void ioat2_dma_massage_chan_desc(struct ioat_dma_chan *ioat_chan) * ioat_dma_alloc_chan_resources - returns the number of allocated descriptors * @chan: the channel to be filled out */ -static int ioat_dma_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int ioat_dma_alloc_chan_resources(struct dma_chan *chan) { struct ioat_dma_chan *ioat_chan = to_ioat_chan(chan); struct ioat_desc_sw *desc; @@ -1341,12 +1340,11 @@ static void ioat_dma_start_null_desc(struct ioat_dma_chan *ioat_chan) */ #define IOAT_TEST_SIZE 2000 -DECLARE_COMPLETION(test_completion); static void ioat_dma_test_callback(void *dma_async_param) { - printk(KERN_ERR "ioatdma: ioat_dma_test_callback(%p)\n", - dma_async_param); - complete(&test_completion); + struct completion *cmp = dma_async_param; + + complete(cmp); } /** @@ -1363,6 +1361,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device) dma_addr_t dma_dest, dma_src; dma_cookie_t cookie; int err = 0; + struct completion cmp; src = kzalloc(sizeof(u8) * IOAT_TEST_SIZE, GFP_KERNEL); if (!src) @@ -1381,7 +1380,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (device->common.device_alloc_chan_resources(dma_chan, NULL) < 1) { + if (device->common.device_alloc_chan_resources(dma_chan) < 1) { dev_err(&device->pdev->dev, "selftest cannot allocate chan resource\n"); err = -ENODEV; @@ -1402,8 +1401,9 @@ static int ioat_dma_self_test(struct ioatdma_device *device) } async_tx_ack(tx); + init_completion(&cmp); tx->callback = ioat_dma_test_callback; - tx->callback_param = (void *)0x8086; + tx->callback_param = &cmp; cookie = tx->tx_submit(tx); if (cookie < 0) { dev_err(&device->pdev->dev, @@ -1413,7 +1413,7 @@ static int ioat_dma_self_test(struct ioatdma_device *device) } device->common.device_issue_pending(dma_chan); - wait_for_completion_timeout(&test_completion, msecs_to_jiffies(3000)); + wait_for_completion_timeout(&cmp, msecs_to_jiffies(3000)); if (device->common.device_is_tx_complete(dma_chan, cookie, NULL, NULL) != DMA_SUCCESS) { diff --git a/drivers/dma/iop-adma.c b/drivers/dma/iop-adma.c index 6be31726220..ea5440dd10d 100644 --- a/drivers/dma/iop-adma.c +++ b/drivers/dma/iop-adma.c @@ -24,7 +24,6 @@ #include <linux/init.h> #include <linux/module.h> -#include <linux/async_tx.h> #include <linux/delay.h> #include <linux/dma-mapping.h> #include <linux/spinlock.h> @@ -116,7 +115,7 @@ iop_adma_run_tx_complete_actions(struct iop_adma_desc_slot *desc, } /* run dependent operations */ - async_tx_run_dependencies(&desc->async_tx); + dma_run_dependencies(&desc->async_tx); return cookie; } @@ -270,8 +269,6 @@ static void __iop_adma_slot_cleanup(struct iop_adma_chan *iop_chan) break; } - BUG_ON(!seen_current); - if (cookie > 0) { iop_chan->completed_cookie = cookie; pr_debug("\tcompleted cookie %d\n", cookie); @@ -471,8 +468,7 @@ static void 
iop_chan_start_null_xor(struct iop_adma_chan *iop_chan); * greater than 2x the number slots needed to satisfy a device->max_xor * request. * */ -static int iop_adma_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int iop_adma_alloc_chan_resources(struct dma_chan *chan) { char *hw_desc; int idx; @@ -866,7 +862,7 @@ static int __devinit iop_adma_memcpy_self_test(struct iop_adma_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (iop_adma_alloc_chan_resources(dma_chan, NULL) < 1) { + if (iop_adma_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -964,7 +960,7 @@ iop_adma_xor_zero_sum_self_test(struct iop_adma_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (iop_adma_alloc_chan_resources(dma_chan, NULL) < 1) { + if (iop_adma_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -1115,26 +1111,13 @@ static int __devexit iop_adma_remove(struct platform_device *dev) struct iop_adma_device *device = platform_get_drvdata(dev); struct dma_chan *chan, *_chan; struct iop_adma_chan *iop_chan; - int i; struct iop_adma_platform_data *plat_data = dev->dev.platform_data; dma_async_device_unregister(&device->common); - for (i = 0; i < 3; i++) { - unsigned int irq; - irq = platform_get_irq(dev, i); - free_irq(irq, device); - } - dma_free_coherent(&dev->dev, plat_data->pool_size, device->dma_desc_pool_virt, device->dma_desc_pool); - do { - struct resource *res; - res = platform_get_resource(dev, IORESOURCE_MEM, 0); - release_mem_region(res->start, res->end - res->start); - } while (0); - list_for_each_entry_safe(chan, _chan, &device->common.channels, device_node) { iop_chan = to_iop_adma_chan(chan); @@ -1255,7 +1238,6 @@ static int __devinit iop_adma_probe(struct platform_device *pdev) spin_lock_init(&iop_chan->lock); INIT_LIST_HEAD(&iop_chan->chain); INIT_LIST_HEAD(&iop_chan->all_slots); - INIT_RCU_HEAD(&iop_chan->common.rcu); iop_chan->common.device = dma_dev; list_add_tail(&iop_chan->common.device_node, &dma_dev->channels); @@ -1431,16 +1413,12 @@ static int __init iop_adma_init (void) return platform_driver_register(&iop_adma_driver); } -/* it's currently unsafe to unload this module */ -#if 0 static void __exit iop_adma_exit (void) { platform_driver_unregister(&iop_adma_driver); return; } module_exit(iop_adma_exit); -#endif - module_init(iop_adma_init); MODULE_AUTHOR("Intel Corporation"); diff --git a/drivers/dma/mv_xor.c b/drivers/dma/mv_xor.c index bcda1742641..d35cbd1ff0b 100644 --- a/drivers/dma/mv_xor.c +++ b/drivers/dma/mv_xor.c @@ -18,7 +18,6 @@ #include <linux/init.h> #include <linux/module.h> -#include <linux/async_tx.h> #include <linux/delay.h> #include <linux/dma-mapping.h> #include <linux/spinlock.h> @@ -340,7 +339,7 @@ mv_xor_run_tx_complete_actions(struct mv_xor_desc_slot *desc, } /* run dependent operations */ - async_tx_run_dependencies(&desc->async_tx); + dma_run_dependencies(&desc->async_tx); return cookie; } @@ -607,8 +606,7 @@ submit_done: } /* returns the number of allocated descriptors */ -static int mv_xor_alloc_chan_resources(struct dma_chan *chan, - struct dma_client *client) +static int mv_xor_alloc_chan_resources(struct dma_chan *chan) { char *hw_desc; int idx; @@ -958,7 +956,7 @@ static int __devinit mv_xor_memcpy_self_test(struct mv_xor_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (mv_xor_alloc_chan_resources(dma_chan, NULL) < 1) { + if 
(mv_xor_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -1053,7 +1051,7 @@ mv_xor_xor_self_test(struct mv_xor_device *device) dma_chan = container_of(device->common.channels.next, struct dma_chan, device_node); - if (mv_xor_alloc_chan_resources(dma_chan, NULL) < 1) { + if (mv_xor_alloc_chan_resources(dma_chan) < 1) { err = -ENODEV; goto out; } @@ -1221,7 +1219,6 @@ static int __devinit mv_xor_probe(struct platform_device *pdev) INIT_LIST_HEAD(&mv_chan->chain); INIT_LIST_HEAD(&mv_chan->completed_slots); INIT_LIST_HEAD(&mv_chan->all_slots); - INIT_RCU_HEAD(&mv_chan->common.rcu); mv_chan->common.device = dma_dev; list_add_tail(&mv_chan->common.device_node, &dma_dev->channels); diff --git a/drivers/ide/ide-acpi.c b/drivers/ide/ide-acpi.c index 2f9e941968d..d8f295bdad7 100644 --- a/drivers/ide/ide-acpi.c +++ b/drivers/ide/ide-acpi.c @@ -18,12 +18,6 @@ #include <linux/dmi.h> #include <acpi/acpi_bus.h> -#include <acpi/acnames.h> -#include <acpi/acnamesp.h> -#include <acpi/acparser.h> -#include <acpi/acexcep.h> -#include <acpi/acmacros.h> -#include <acpi/actypes.h> #define REGS_PER_GTF 7 struct taskfile_array { diff --git a/drivers/leds/Kconfig b/drivers/leds/Kconfig index e7fb7d2fcbf..a4a1ae21463 100644 --- a/drivers/leds/Kconfig +++ b/drivers/leds/Kconfig @@ -63,6 +63,12 @@ config LEDS_WRAP help This option enables support for the PCEngines WRAP programmable LEDs. +config LEDS_ALIX2 + tristate "LED Support for ALIX.2 and ALIX.3 series" + depends on LEDS_CLASS && X86 && EXPERIMENTAL + help + This option enables support for the PCEngines ALIX.2 and ALIX.3 LEDs. + config LEDS_H1940 tristate "LED Support for iPAQ H1940 device" depends on LEDS_CLASS && ARCH_H1940 @@ -77,7 +83,7 @@ config LEDS_COBALT_QUBE config LEDS_COBALT_RAQ bool "LED Support for the Cobalt Raq series" - depends on LEDS_CLASS && MIPS_COBALT + depends on LEDS_CLASS=y && MIPS_COBALT select LEDS_TRIGGERS help This option enables support for the Cobalt Raq series LEDs. @@ -158,6 +164,13 @@ config LEDS_PCA955X LED driver chips accessed via the I2C bus. Supported devices include PCA9550, PCA9551, PCA9552, and PCA9553. +config LEDS_WM8350 + tristate "LED Support for WM8350 AudioPlus PMIC" + depends on LEDS_CLASS && MFD_WM8350 + help + This option enables support for LEDs driven by the Wolfson + Microelectronics WM8350 AudioPlus PMIC. 
+ config LEDS_DA903X tristate "LED Support for DA9030/DA9034 PMIC" depends on LEDS_CLASS && PMIC_DA903X diff --git a/drivers/leds/Makefile b/drivers/leds/Makefile index e1967a29850..bc247cb02e8 100644 --- a/drivers/leds/Makefile +++ b/drivers/leds/Makefile @@ -11,6 +11,7 @@ obj-$(CONFIG_LEDS_S3C24XX) += leds-s3c24xx.o obj-$(CONFIG_LEDS_AMS_DELTA) += leds-ams-delta.o obj-$(CONFIG_LEDS_NET48XX) += leds-net48xx.o obj-$(CONFIG_LEDS_WRAP) += leds-wrap.o +obj-$(CONFIG_LEDS_ALIX2) += leds-alix2.o obj-$(CONFIG_LEDS_H1940) += leds-h1940.o obj-$(CONFIG_LEDS_COBALT_QUBE) += leds-cobalt-qube.o obj-$(CONFIG_LEDS_COBALT_RAQ) += leds-cobalt-raq.o @@ -23,6 +24,7 @@ obj-$(CONFIG_LEDS_FSG) += leds-fsg.o obj-$(CONFIG_LEDS_PCA955X) += leds-pca955x.o obj-$(CONFIG_LEDS_DA903X) += leds-da903x.o obj-$(CONFIG_LEDS_HP_DISK) += leds-hp-disk.o +obj-$(CONFIG_LEDS_WM8350) += leds-wm8350.o # LED Triggers obj-$(CONFIG_LEDS_TRIGGER_TIMER) += ledtrig-timer.o diff --git a/drivers/leds/led-class.c b/drivers/leds/led-class.c index 6c4a326176d..52f82e3ea13 100644 --- a/drivers/leds/led-class.c +++ b/drivers/leds/led-class.c @@ -91,9 +91,29 @@ void led_classdev_resume(struct led_classdev *led_cdev) } EXPORT_SYMBOL_GPL(led_classdev_resume); +static int led_suspend(struct device *dev, pm_message_t state) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + + if (led_cdev->flags & LED_CORE_SUSPENDRESUME) + led_classdev_suspend(led_cdev); + + return 0; +} + +static int led_resume(struct device *dev) +{ + struct led_classdev *led_cdev = dev_get_drvdata(dev); + + if (led_cdev->flags & LED_CORE_SUSPENDRESUME) + led_classdev_resume(led_cdev); + + return 0; +} + /** * led_classdev_register - register a new object of led_classdev class. - * @dev: The device to register. + * @parent: The device to register. * @led_cdev: the led_classdev structure for this device. 
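 *
 * A minimal sketch of a caller that relies on the class-level
 * suspend/resume handling added above, assuming a hypothetical
 * my_led_set() brightness handler:
 *
 *	static struct led_classdev my_led = {
 *		.name		= "my:green:status",
 *		.brightness_set	= my_led_set,
 *		.flags		= LED_CORE_SUSPENDRESUME,
 *	};
 *
 *	retval = led_classdev_register(&pdev->dev, &my_led);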
*/ int led_classdev_register(struct device *parent, struct led_classdev *led_cdev) @@ -174,6 +194,8 @@ static int __init leds_init(void) leds_class = class_create(THIS_MODULE, "leds"); if (IS_ERR(leds_class)) return PTR_ERR(leds_class); + leds_class->suspend = led_suspend; + leds_class->resume = led_resume; return 0; } diff --git a/drivers/leds/leds-alix2.c b/drivers/leds/leds-alix2.c new file mode 100644 index 00000000000..ddbd7730dfc --- /dev/null +++ b/drivers/leds/leds-alix2.c @@ -0,0 +1,181 @@ +/* + * LEDs driver for PCEngines ALIX.2 and ALIX.3 + * + * Copyright (C) 2008 Constantin Baranov <const@mimas.ru> + */ + +#include <linux/err.h> +#include <linux/io.h> +#include <linux/kernel.h> +#include <linux/leds.h> +#include <linux/module.h> +#include <linux/platform_device.h> +#include <linux/string.h> + +static int force = 0; +module_param(force, bool, 0444); +MODULE_PARM_DESC(force, "Assume system has ALIX.2 style LEDs"); + +struct alix_led { + struct led_classdev cdev; + unsigned short port; + unsigned int on_value; + unsigned int off_value; +}; + +static void alix_led_set(struct led_classdev *led_cdev, + enum led_brightness brightness) +{ + struct alix_led *led_dev = + container_of(led_cdev, struct alix_led, cdev); + + if (brightness) + outl(led_dev->on_value, led_dev->port); + else + outl(led_dev->off_value, led_dev->port); +} + +static struct alix_led alix_leds[] = { + { + .cdev = { + .name = "alix:1", + .brightness_set = alix_led_set, + }, + .port = 0x6100, + .on_value = 1 << 22, + .off_value = 1 << 6, + }, + { + .cdev = { + .name = "alix:2", + .brightness_set = alix_led_set, + }, + .port = 0x6180, + .on_value = 1 << 25, + .off_value = 1 << 9, + }, + { + .cdev = { + .name = "alix:3", + .brightness_set = alix_led_set, + }, + .port = 0x6180, + .on_value = 1 << 27, + .off_value = 1 << 11, + }, +}; + +static int __init alix_led_probe(struct platform_device *pdev) +{ + int i; + int ret; + + for (i = 0; i < ARRAY_SIZE(alix_leds); i++) { + alix_leds[i].cdev.flags |= LED_CORE_SUSPENDRESUME; + ret = led_classdev_register(&pdev->dev, &alix_leds[i].cdev); + if (ret < 0) + goto fail; + } + return 0; + +fail: + while (--i >= 0) + led_classdev_unregister(&alix_leds[i].cdev); + return ret; +} + +static int alix_led_remove(struct platform_device *pdev) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(alix_leds); i++) + led_classdev_unregister(&alix_leds[i].cdev); + return 0; +} + +static struct platform_driver alix_led_driver = { + .remove = alix_led_remove, + .driver = { + .name = KBUILD_MODNAME, + .owner = THIS_MODULE, + }, +}; + +static int __init alix_present(void) +{ + const unsigned long bios_phys = 0x000f0000; + const size_t bios_len = 0x00010000; + const char alix_sig[] = "PC Engines ALIX."; + const size_t alix_sig_len = sizeof(alix_sig) - 1; + + const char *bios_virt; + const char *scan_end; + const char *p; + int ret = 0; + + if (force) { + printk(KERN_NOTICE "%s: forced to skip BIOS test, " + "assume system has ALIX.2 style LEDs\n", + KBUILD_MODNAME); + ret = 1; + goto out; + } + + bios_virt = phys_to_virt(bios_phys); + scan_end = bios_virt + bios_len - (alix_sig_len + 2); + for (p = bios_virt; p < scan_end; p++) { + const char *tail; + + if (memcmp(p, alix_sig, alix_sig_len) != 0) { + continue; + } + + tail = p + alix_sig_len; + if ((tail[0] == '2' || tail[0] == '3') && tail[1] == '\0') { + printk(KERN_INFO + "%s: system is recognized as \"%s\"\n", + KBUILD_MODNAME, p); + ret = 1; + break; + } + } + +out: + return ret; +} + +static struct platform_device *pdev; + +static int __init 
alix_led_init(void) +{ + int ret; + + if (!alix_present()) { + ret = -ENODEV; + goto out; + } + + pdev = platform_device_register_simple(KBUILD_MODNAME, -1, NULL, 0); + if (!IS_ERR(pdev)) { + ret = platform_driver_probe(&alix_led_driver, alix_led_probe); + if (ret) + platform_device_unregister(pdev); + } else + ret = PTR_ERR(pdev); + +out: + return ret; +} + +static void __exit alix_led_exit(void) +{ + platform_device_unregister(pdev); + platform_driver_unregister(&alix_led_driver); +} + +module_init(alix_led_init); +module_exit(alix_led_exit); + +MODULE_AUTHOR("Constantin Baranov <const@mimas.ru>"); +MODULE_DESCRIPTION("PCEngines ALIX.2 and ALIX.3 LED driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/leds/leds-ams-delta.c b/drivers/leds/leds-ams-delta.c index 1bd590bb3a6..446050759b4 100644 --- a/drivers/leds/leds-ams-delta.c +++ b/drivers/leds/leds-ams-delta.c @@ -79,37 +79,12 @@ static struct ams_delta_led ams_delta_leds[] = { }, }; -#ifdef CONFIG_PM -static int ams_delta_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) - led_classdev_suspend(&ams_delta_leds[i].cdev); - - return 0; -} - -static int ams_delta_led_resume(struct platform_device *dev) -{ - int i; - - for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) - led_classdev_resume(&ams_delta_leds[i].cdev); - - return 0; -} -#else -#define ams_delta_led_suspend NULL -#define ams_delta_led_resume NULL -#endif - static int ams_delta_led_probe(struct platform_device *pdev) { int i, ret; for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) { + ams_delta_leds[i].cdev.flags |= LED_CORE_SUSPENDRESUME; ret = led_classdev_register(&pdev->dev, &ams_delta_leds[i].cdev); if (ret < 0) @@ -127,7 +102,7 @@ static int ams_delta_led_remove(struct platform_device *pdev) { int i; - for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i--) + for (i = 0; i < ARRAY_SIZE(ams_delta_leds); i++) led_classdev_unregister(&ams_delta_leds[i].cdev); return 0; @@ -136,8 +111,6 @@ static int ams_delta_led_remove(struct platform_device *pdev) static struct platform_driver ams_delta_led_driver = { .probe = ams_delta_led_probe, .remove = ams_delta_led_remove, - .suspend = ams_delta_led_suspend, - .resume = ams_delta_led_resume, .driver = { .name = "ams-delta-led", .owner = THIS_MODULE, @@ -151,7 +124,7 @@ static int __init ams_delta_led_init(void) static void __exit ams_delta_led_exit(void) { - return platform_driver_unregister(&ams_delta_led_driver); + platform_driver_unregister(&ams_delta_led_driver); } module_init(ams_delta_led_init); diff --git a/drivers/leds/leds-clevo-mail.c b/drivers/leds/leds-clevo-mail.c index eb3415e88f4..1813c84ea5f 100644 --- a/drivers/leds/leds-clevo-mail.c +++ b/drivers/leds/leds-clevo-mail.c @@ -142,6 +142,7 @@ static struct led_classdev clevo_mail_led = { .name = "clevo::mail", .brightness_set = clevo_mail_led_set, .blink_set = clevo_mail_led_blink, + .flags = LED_CORE_SUSPENDRESUME, }; static int __init clevo_mail_led_probe(struct platform_device *pdev) @@ -155,29 +156,9 @@ static int clevo_mail_led_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM -static int clevo_mail_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - led_classdev_suspend(&clevo_mail_led); - return 0; -} - -static int clevo_mail_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&clevo_mail_led); - return 0; -} -#else -#define clevo_mail_led_suspend NULL -#define clevo_mail_led_resume NULL -#endif - static struct platform_driver clevo_mail_led_driver 
= { .probe = clevo_mail_led_probe, .remove = clevo_mail_led_remove, - .suspend = clevo_mail_led_suspend, - .resume = clevo_mail_led_resume, .driver = { .name = KBUILD_MODNAME, .owner = THIS_MODULE, diff --git a/drivers/leds/leds-fsg.c b/drivers/leds/leds-fsg.c index 34935155c1c..5f7c9c5c09b 100644 --- a/drivers/leds/leds-fsg.c +++ b/drivers/leds/leds-fsg.c @@ -99,64 +99,43 @@ static void fsg_led_ring_set(struct led_classdev *led_cdev, } - static struct led_classdev fsg_wlan_led = { .name = "fsg:blue:wlan", .brightness_set = fsg_led_wlan_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_wan_led = { .name = "fsg:blue:wan", .brightness_set = fsg_led_wan_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_sata_led = { .name = "fsg:blue:sata", .brightness_set = fsg_led_sata_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_usb_led = { .name = "fsg:blue:usb", .brightness_set = fsg_led_usb_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_sync_led = { .name = "fsg:blue:sync", .brightness_set = fsg_led_sync_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev fsg_ring_led = { .name = "fsg:blue:ring", .brightness_set = fsg_led_ring_set, + .flags = LED_CORE_SUSPENDRESUME, }; - -#ifdef CONFIG_PM -static int fsg_led_suspend(struct platform_device *dev, pm_message_t state) -{ - led_classdev_suspend(&fsg_wlan_led); - led_classdev_suspend(&fsg_wan_led); - led_classdev_suspend(&fsg_sata_led); - led_classdev_suspend(&fsg_usb_led); - led_classdev_suspend(&fsg_sync_led); - led_classdev_suspend(&fsg_ring_led); - return 0; -} - -static int fsg_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&fsg_wlan_led); - led_classdev_resume(&fsg_wan_led); - led_classdev_resume(&fsg_sata_led); - led_classdev_resume(&fsg_usb_led); - led_classdev_resume(&fsg_sync_led); - led_classdev_resume(&fsg_ring_led); - return 0; -} -#endif - - static int fsg_led_probe(struct platform_device *pdev) { int ret; @@ -232,10 +211,6 @@ static int fsg_led_remove(struct platform_device *pdev) static struct platform_driver fsg_led_driver = { .probe = fsg_led_probe, .remove = fsg_led_remove, -#ifdef CONFIG_PM - .suspend = fsg_led_suspend, - .resume = fsg_led_resume, -#endif .driver = { .name = "fsg-led", }, diff --git a/drivers/leds/leds-gpio.c b/drivers/leds/leds-gpio.c index b13bd2950e9..2e3df08b649 100644 --- a/drivers/leds/leds-gpio.c +++ b/drivers/leds/leds-gpio.c @@ -105,6 +105,7 @@ static int gpio_led_probe(struct platform_device *pdev) } led_dat->cdev.brightness_set = gpio_led_set; led_dat->cdev.brightness = LED_OFF; + led_dat->cdev.flags |= LED_CORE_SUSPENDRESUME; gpio_direction_output(led_dat->gpio, led_dat->active_low); @@ -154,44 +155,9 @@ static int __devexit gpio_led_remove(struct platform_device *pdev) return 0; } -#ifdef CONFIG_PM -static int gpio_led_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct gpio_led_platform_data *pdata = pdev->dev.platform_data; - struct gpio_led_data *leds_data; - int i; - - leds_data = platform_get_drvdata(pdev); - - for (i = 0; i < pdata->num_leds; i++) - led_classdev_suspend(&leds_data[i].cdev); - - return 0; -} - -static int gpio_led_resume(struct platform_device *pdev) -{ - struct gpio_led_platform_data *pdata = pdev->dev.platform_data; - struct gpio_led_data *leds_data; - int i; - - leds_data = platform_get_drvdata(pdev); - - for (i = 0; i < pdata->num_leds; i++) - led_classdev_resume(&leds_data[i].cdev); - - return 0; -} -#else -#define 
gpio_led_suspend NULL -#define gpio_led_resume NULL -#endif - static struct platform_driver gpio_led_driver = { .probe = gpio_led_probe, .remove = __devexit_p(gpio_led_remove), - .suspend = gpio_led_suspend, - .resume = gpio_led_resume, .driver = { .name = "leds-gpio", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-hp-disk.c b/drivers/leds/leds-hp-disk.c index 44fa757d825..d786adc8c5e 100644 --- a/drivers/leds/leds-hp-disk.c +++ b/drivers/leds/leds-hp-disk.c @@ -68,25 +68,9 @@ static struct led_classdev hpled_led = { .name = "hp:red:hddprotection", .default_trigger = "heartbeat", .brightness_set = hpled_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int hpled_suspend(struct acpi_device *dev, pm_message_t state) -{ - led_classdev_suspend(&hpled_led); - return 0; -} - -static int hpled_resume(struct acpi_device *dev) -{ - led_classdev_resume(&hpled_led); - return 0; -} -#else -#define hpled_suspend NULL -#define hpled_resume NULL -#endif - static int hpled_add(struct acpi_device *device) { int ret; @@ -121,8 +105,6 @@ static struct acpi_driver leds_hp_driver = { .ops = { .add = hpled_add, .remove = hpled_remove, - .suspend = hpled_suspend, - .resume = hpled_resume, } }; diff --git a/drivers/leds/leds-hp6xx.c b/drivers/leds/leds-hp6xx.c index e8fb1baf8a5..e4ce1fd4633 100644 --- a/drivers/leds/leds-hp6xx.c +++ b/drivers/leds/leds-hp6xx.c @@ -45,30 +45,16 @@ static struct led_classdev hp6xx_red_led = { .name = "hp6xx:red", .default_trigger = "hp6xx-charge", .brightness_set = hp6xxled_red_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev hp6xx_green_led = { .name = "hp6xx:green", .default_trigger = "ide-disk", .brightness_set = hp6xxled_green_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int hp6xxled_suspend(struct platform_device *dev, pm_message_t state) -{ - led_classdev_suspend(&hp6xx_red_led); - led_classdev_suspend(&hp6xx_green_led); - return 0; -} - -static int hp6xxled_resume(struct platform_device *dev) -{ - led_classdev_resume(&hp6xx_red_led); - led_classdev_resume(&hp6xx_green_led); - return 0; -} -#endif - static int hp6xxled_probe(struct platform_device *pdev) { int ret; @@ -98,10 +84,6 @@ MODULE_ALIAS("platform:hp6xx-led"); static struct platform_driver hp6xxled_driver = { .probe = hp6xxled_probe, .remove = hp6xxled_remove, -#ifdef CONFIG_PM - .suspend = hp6xxled_suspend, - .resume = hp6xxled_resume, -#endif .driver = { .name = "hp6xx-led", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-net48xx.c b/drivers/leds/leds-net48xx.c index 054360473c9..93987a12da4 100644 --- a/drivers/leds/leds-net48xx.c +++ b/drivers/leds/leds-net48xx.c @@ -33,26 +33,9 @@ static void net48xx_error_led_set(struct led_classdev *led_cdev, static struct led_classdev net48xx_error_led = { .name = "net48xx::error", .brightness_set = net48xx_error_led_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int net48xx_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - led_classdev_suspend(&net48xx_error_led); - return 0; -} - -static int net48xx_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&net48xx_error_led); - return 0; -} -#else -#define net48xx_led_suspend NULL -#define net48xx_led_resume NULL -#endif - static int net48xx_led_probe(struct platform_device *pdev) { return led_classdev_register(&pdev->dev, &net48xx_error_led); @@ -67,8 +50,6 @@ static int net48xx_led_remove(struct platform_device *pdev) static struct platform_driver net48xx_led_driver = { .probe = net48xx_led_probe, 
.remove = net48xx_led_remove, - .suspend = net48xx_led_suspend, - .resume = net48xx_led_resume, .driver = { .name = DRVNAME, .owner = THIS_MODULE, diff --git a/drivers/leds/leds-pca9532.c b/drivers/leds/leds-pca9532.c index 4064d4f6b33..76ec7498e2d 100644 --- a/drivers/leds/leds-pca9532.c +++ b/drivers/leds/leds-pca9532.c @@ -16,6 +16,7 @@ #include <linux/leds.h> #include <linux/input.h> #include <linux/mutex.h> +#include <linux/workqueue.h> #include <linux/leds-pca9532.h> static const unsigned short normal_i2c[] = { /*0x60,*/ I2C_CLIENT_END}; @@ -34,6 +35,7 @@ struct pca9532_data { struct pca9532_led leds[16]; struct mutex update_lock; struct input_dev *idev; + struct work_struct work; u8 pwm[2]; u8 psc[2]; }; @@ -63,7 +65,7 @@ static struct i2c_driver pca9532_driver = { * as a compromise we average one pwm to the values requested by all * leds that are not ON/OFF. * */ -static int pca9532_setpwm(struct i2c_client *client, int pwm, int blink, +static int pca9532_calcpwm(struct i2c_client *client, int pwm, int blink, enum led_brightness value) { int a = 0, b = 0, i = 0; @@ -84,11 +86,17 @@ static int pca9532_setpwm(struct i2c_client *client, int pwm, int blink, b = b/a; if (b > 0xFF) return -EINVAL; - mutex_lock(&data->update_lock); data->pwm[pwm] = b; + data->psc[pwm] = blink; + return 0; +} + +static int pca9532_setpwm(struct i2c_client *client, int pwm) +{ + struct pca9532_data *data = i2c_get_clientdata(client); + mutex_lock(&data->update_lock); i2c_smbus_write_byte_data(client, PCA9532_REG_PWM(pwm), data->pwm[pwm]); - data->psc[pwm] = blink; i2c_smbus_write_byte_data(client, PCA9532_REG_PSC(pwm), data->psc[pwm]); mutex_unlock(&data->update_lock); @@ -124,11 +132,11 @@ static void pca9532_set_brightness(struct led_classdev *led_cdev, led->state = PCA9532_ON; else { led->state = PCA9532_PWM0; /* Thecus: hardcode one pwm */ - err = pca9532_setpwm(led->client, 0, 0, value); + err = pca9532_calcpwm(led->client, 0, 0, value); if (err) return; /* XXX: led api doesn't allow error code? 
*/ } - pca9532_setled(led); + schedule_work(&led->work); } static int pca9532_set_blink(struct led_classdev *led_cdev, @@ -137,6 +145,7 @@ static int pca9532_set_blink(struct led_classdev *led_cdev, struct pca9532_led *led = ldev_to_led(led_cdev); struct i2c_client *client = led->client; int psc; + int err = 0; if (*delay_on == 0 && *delay_off == 0) { /* led subsystem ask us for a blink rate */ @@ -148,11 +157,15 @@ static int pca9532_set_blink(struct led_classdev *led_cdev, /* Thecus specific: only use PSC/PWM 0 */ psc = (*delay_on * 152-1)/1000; - return pca9532_setpwm(client, 0, psc, led_cdev->brightness); + err = pca9532_calcpwm(client, 0, psc, led_cdev->brightness); + if (err) + return err; + schedule_work(&led->work); + return 0; } -int pca9532_event(struct input_dev *dev, unsigned int type, unsigned int code, - int value) +static int pca9532_event(struct input_dev *dev, unsigned int type, + unsigned int code, int value) { struct pca9532_data *data = input_get_drvdata(dev); @@ -165,13 +178,28 @@ int pca9532_event(struct input_dev *dev, unsigned int type, unsigned int code, else data->pwm[1] = 0; - dev_info(&dev->dev, "setting beep to %d \n", data->pwm[1]); + schedule_work(&data->work); + + return 0; +} + +static void pca9532_input_work(struct work_struct *work) +{ + struct pca9532_data *data; + data = container_of(work, struct pca9532_data, work); mutex_lock(&data->update_lock); i2c_smbus_write_byte_data(data->client, PCA9532_REG_PWM(1), data->pwm[1]); mutex_unlock(&data->update_lock); +} - return 0; +static void pca9532_led_work(struct work_struct *work) +{ + struct pca9532_led *led; + led = container_of(work, struct pca9532_led, work); + if (led->state == PCA9532_PWM0) + pca9532_setpwm(led->client, 0); + pca9532_setled(led); } static int pca9532_configure(struct i2c_client *client, @@ -204,8 +232,9 @@ static int pca9532_configure(struct i2c_client *client, led->ldev.brightness = LED_OFF; led->ldev.brightness_set = pca9532_set_brightness; led->ldev.blink_set = pca9532_set_blink; - if (led_classdev_register(&client->dev, - &led->ldev) < 0) { + INIT_WORK(&led->work, pca9532_led_work); + err = led_classdev_register(&client->dev, &led->ldev); + if (err < 0) { dev_err(&client->dev, "couldn't register LED %s\n", led->name); @@ -233,9 +262,11 @@ static int pca9532_configure(struct i2c_client *client, BIT_MASK(SND_TONE); data->idev->event = pca9532_event; input_set_drvdata(data->idev, data); + INIT_WORK(&data->work, pca9532_input_work); err = input_register_device(data->idev); if (err) { input_free_device(data->idev); + cancel_work_sync(&data->work); data->idev = NULL; goto exit; } @@ -252,18 +283,19 @@ exit: break; case PCA9532_TYPE_LED: led_classdev_unregister(&data->leds[i].ldev); + cancel_work_sync(&data->leds[i].work); break; case PCA9532_TYPE_N2100_BEEP: if (data->idev != NULL) { input_unregister_device(data->idev); input_free_device(data->idev); + cancel_work_sync(&data->work); data->idev = NULL; } break; } return err; - } static int pca9532_probe(struct i2c_client *client, @@ -271,12 +303,16 @@ static int pca9532_probe(struct i2c_client *client, { struct pca9532_data *data = i2c_get_clientdata(client); struct pca9532_platform_data *pca9532_pdata = client->dev.platform_data; + int err; + + if (!pca9532_pdata) + return -EIO; if (!i2c_check_functionality(client->adapter, I2C_FUNC_SMBUS_BYTE_DATA)) return -EIO; - data = kzalloc(sizeof(struct pca9532_data), GFP_KERNEL); + data = kzalloc(sizeof(*data), GFP_KERNEL); if (!data) return -ENOMEM; @@ -285,12 +321,13 @@ static int 
pca9532_probe(struct i2c_client *client, data->client = client; mutex_init(&data->update_lock); - if (pca9532_pdata == NULL) - return -EIO; - - pca9532_configure(client, data, pca9532_pdata); - return 0; + err = pca9532_configure(client, data, pca9532_pdata); + if (err) { + kfree(data); + i2c_set_clientdata(client, NULL); + } + return err; } static int pca9532_remove(struct i2c_client *client) @@ -303,11 +340,13 @@ static int pca9532_remove(struct i2c_client *client) break; case PCA9532_TYPE_LED: led_classdev_unregister(&data->leds[i].ldev); + cancel_work_sync(&data->leds[i].work); break; case PCA9532_TYPE_N2100_BEEP: if (data->idev != NULL) { input_unregister_device(data->idev); input_free_device(data->idev); + cancel_work_sync(&data->work); data->idev = NULL; } break; diff --git a/drivers/leds/leds-s3c24xx.c b/drivers/leds/leds-s3c24xx.c index 25a07f2643a..4d81131542a 100644 --- a/drivers/leds/leds-s3c24xx.c +++ b/drivers/leds/leds-s3c24xx.c @@ -82,6 +82,7 @@ static int s3c24xx_led_probe(struct platform_device *dev) led->cdev.brightness_set = s3c24xx_led_set; led->cdev.default_trigger = pdata->def_trigger; led->cdev.name = pdata->name; + led->cdev.flags |= LED_CORE_SUSPENDRESUME; led->pdata = pdata; @@ -111,33 +112,9 @@ static int s3c24xx_led_probe(struct platform_device *dev) return ret; } - -#ifdef CONFIG_PM -static int s3c24xx_led_suspend(struct platform_device *dev, pm_message_t state) -{ - struct s3c24xx_gpio_led *led = pdev_to_gpio(dev); - - led_classdev_suspend(&led->cdev); - return 0; -} - -static int s3c24xx_led_resume(struct platform_device *dev) -{ - struct s3c24xx_gpio_led *led = pdev_to_gpio(dev); - - led_classdev_resume(&led->cdev); - return 0; -} -#else -#define s3c24xx_led_suspend NULL -#define s3c24xx_led_resume NULL -#endif - static struct platform_driver s3c24xx_led_driver = { .probe = s3c24xx_led_probe, .remove = s3c24xx_led_remove, - .suspend = s3c24xx_led_suspend, - .resume = s3c24xx_led_resume, .driver = { .name = "s3c24xx_led", .owner = THIS_MODULE, diff --git a/drivers/leds/leds-wm8350.c b/drivers/leds/leds-wm8350.c new file mode 100644 index 00000000000..38c6bcb07e6 --- /dev/null +++ b/drivers/leds/leds-wm8350.c @@ -0,0 +1,311 @@ +/* + * LED driver for WM8350 driven LEDS. + * + * Copyright(C) 2007, 2008 Wolfson Microelectronics PLC. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ * + */ + +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/leds.h> +#include <linux/err.h> +#include <linux/mfd/wm8350/pmic.h> +#include <linux/regulator/consumer.h> + +/* Microamps */ +static const int isink_cur[] = { + 4, + 5, + 6, + 7, + 8, + 10, + 11, + 14, + 16, + 19, + 23, + 27, + 32, + 39, + 46, + 54, + 65, + 77, + 92, + 109, + 130, + 154, + 183, + 218, + 259, + 308, + 367, + 436, + 518, + 616, + 733, + 872, + 1037, + 1233, + 1466, + 1744, + 2073, + 2466, + 2933, + 3487, + 4147, + 4932, + 5865, + 6975, + 8294, + 9864, + 11730, + 13949, + 16589, + 19728, + 23460, + 27899, + 33178, + 39455, + 46920, + 55798, + 66355, + 78910, + 93840, + 111596, + 132710, + 157820, + 187681, + 223191 +}; + +#define to_wm8350_led(led_cdev) \ + container_of(led_cdev, struct wm8350_led, cdev) + +static void wm8350_led_enable(struct wm8350_led *led) +{ + int ret; + + if (led->enabled) + return; + + ret = regulator_enable(led->isink); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to enable ISINK: %d\n", ret); + return; + } + + ret = regulator_enable(led->dcdc); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to enable DCDC: %d\n", ret); + regulator_disable(led->isink); + return; + } + + led->enabled = 1; +} + +static void wm8350_led_disable(struct wm8350_led *led) +{ + int ret; + + if (!led->enabled) + return; + + ret = regulator_disable(led->dcdc); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to disable DCDC: %d\n", ret); + return; + } + + ret = regulator_disable(led->isink); + if (ret != 0) { + dev_err(led->cdev.dev, "Failed to disable ISINK: %d\n", ret); + regulator_enable(led->dcdc); + return; + } + + led->enabled = 0; +} + +static void led_work(struct work_struct *work) +{ + struct wm8350_led *led = container_of(work, struct wm8350_led, work); + int ret; + int uA; + unsigned long flags; + + mutex_lock(&led->mutex); + + spin_lock_irqsave(&led->value_lock, flags); + + if (led->value == LED_OFF) { + spin_unlock_irqrestore(&led->value_lock, flags); + wm8350_led_disable(led); + goto out; + } + + /* This scales linearly into the index of valid current + * settings which results in a linear scaling of perceived + * brightness due to the non-linear current settings provided + * by the hardware. 
+ */ + uA = (led->max_uA_index * led->value) / LED_FULL; + spin_unlock_irqrestore(&led->value_lock, flags); + BUG_ON(uA >= ARRAY_SIZE(isink_cur)); + + ret = regulator_set_current_limit(led->isink, isink_cur[uA], + isink_cur[uA]); + if (ret != 0) + dev_err(led->cdev.dev, "Failed to set %duA: %d\n", + isink_cur[uA], ret); + + wm8350_led_enable(led); + +out: + mutex_unlock(&led->mutex); +} + +static void wm8350_led_set(struct led_classdev *led_cdev, + enum led_brightness value) +{ + struct wm8350_led *led = to_wm8350_led(led_cdev); + unsigned long flags; + + spin_lock_irqsave(&led->value_lock, flags); + led->value = value; + schedule_work(&led->work); + spin_unlock_irqrestore(&led->value_lock, flags); +} + +static void wm8350_led_shutdown(struct platform_device *pdev) +{ + struct wm8350_led *led = platform_get_drvdata(pdev); + + mutex_lock(&led->mutex); + led->value = LED_OFF; + wm8350_led_disable(led); + mutex_unlock(&led->mutex); +} + +static int wm8350_led_probe(struct platform_device *pdev) +{ + struct regulator *isink, *dcdc; + struct wm8350_led *led; + struct wm8350_led_platform_data *pdata = pdev->dev.platform_data; + int ret, i; + + if (pdata == NULL) { + dev_err(&pdev->dev, "no platform data\n"); + return -ENODEV; + } + + if (pdata->max_uA < isink_cur[0]) { + dev_err(&pdev->dev, "Invalid maximum current %duA\n", + pdata->max_uA); + return -EINVAL; + } + + isink = regulator_get(&pdev->dev, "led_isink"); + if (IS_ERR(isink)) { + printk(KERN_ERR "%s: cant get ISINK\n", __func__); + return PTR_ERR(isink); + } + + dcdc = regulator_get(&pdev->dev, "led_vcc"); + if (IS_ERR(dcdc)) { + printk(KERN_ERR "%s: cant get DCDC\n", __func__); + ret = PTR_ERR(dcdc); + goto err_isink; + } + + led = kzalloc(sizeof(*led), GFP_KERNEL); + if (led == NULL) { + ret = -ENOMEM; + goto err_dcdc; + } + + led->cdev.brightness_set = wm8350_led_set; + led->cdev.default_trigger = pdata->default_trigger; + led->cdev.name = pdata->name; + led->cdev.flags |= LED_CORE_SUSPENDRESUME; + led->enabled = regulator_is_enabled(isink); + led->isink = isink; + led->dcdc = dcdc; + + for (i = 0; i < ARRAY_SIZE(isink_cur) - 1; i++) + if (isink_cur[i] >= pdata->max_uA) + break; + led->max_uA_index = i; + if (pdata->max_uA != isink_cur[i]) + dev_warn(&pdev->dev, + "Maximum current %duA is not directly supported," + " check platform data\n", + pdata->max_uA); + + spin_lock_init(&led->value_lock); + mutex_init(&led->mutex); + INIT_WORK(&led->work, led_work); + led->value = LED_OFF; + platform_set_drvdata(pdev, led); + + ret = led_classdev_register(&pdev->dev, &led->cdev); + if (ret < 0) + goto err_led; + + return 0; + + err_led: + kfree(led); + err_dcdc: + regulator_put(dcdc); + err_isink: + regulator_put(isink); + return ret; +} + +static int wm8350_led_remove(struct platform_device *pdev) +{ + struct wm8350_led *led = platform_get_drvdata(pdev); + + led_classdev_unregister(&led->cdev); + flush_scheduled_work(); + wm8350_led_disable(led); + regulator_put(led->dcdc); + regulator_put(led->isink); + kfree(led); + return 0; +} + +static struct platform_driver wm8350_led_driver = { + .driver = { + .name = "wm8350-led", + .owner = THIS_MODULE, + }, + .probe = wm8350_led_probe, + .remove = wm8350_led_remove, + .shutdown = wm8350_led_shutdown, +}; + +static int __devinit wm8350_led_init(void) +{ + return platform_driver_register(&wm8350_led_driver); +} +module_init(wm8350_led_init); + +static void wm8350_led_exit(void) +{ + platform_driver_unregister(&wm8350_led_driver); +} +module_exit(wm8350_led_exit); + +MODULE_AUTHOR("Mark Brown"); 
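/*
 * A worked example, not part of the patch, of the value -> isink_cur[]
 * mapping that led_work() above performs.  It assumes the platform data
 * permits the whole current table, so max_uA_index == 63, and uses the
 * LED core's LED_FULL == 255 (the helper name below is hypothetical and
 * only mirrors the calculation already done in led_work()):
 *
 *   value = 255 -> index = (63 * 255) / 255 = 63 -> isink_cur[63] = 223191 uA
 *   value = 128 -> index = (63 * 128) / 255 = 31 -> isink_cur[31] =    872 uA
 *
 * Each table entry is roughly a 19% increase over the previous one, so a
 * linear step through the indices changes the sink current exponentially,
 * which is what yields an approximately linear change in perceived
 * brightness.
 */
#include <linux/leds.h>

static int wm8350_brightness_to_index(int max_uA_index,
				      enum led_brightness value)
{
	return (max_uA_index * value) / LED_FULL;
}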
+MODULE_DESCRIPTION("WM8350 LED driver"); +MODULE_LICENSE("GPL"); +MODULE_ALIAS("platform:wm8350-led"); diff --git a/drivers/leds/leds-wrap.c b/drivers/leds/leds-wrap.c index 2f3aa87f2a1..2982c86ac4c 100644 --- a/drivers/leds/leds-wrap.c +++ b/drivers/leds/leds-wrap.c @@ -56,40 +56,21 @@ static struct led_classdev wrap_power_led = { .name = "wrap::power", .brightness_set = wrap_power_led_set, .default_trigger = "default-on", + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev wrap_error_led = { .name = "wrap::error", .brightness_set = wrap_error_led_set, + .flags = LED_CORE_SUSPENDRESUME, }; static struct led_classdev wrap_extra_led = { .name = "wrap::extra", .brightness_set = wrap_extra_led_set, + .flags = LED_CORE_SUSPENDRESUME, }; -#ifdef CONFIG_PM -static int wrap_led_suspend(struct platform_device *dev, - pm_message_t state) -{ - led_classdev_suspend(&wrap_power_led); - led_classdev_suspend(&wrap_error_led); - led_classdev_suspend(&wrap_extra_led); - return 0; -} - -static int wrap_led_resume(struct platform_device *dev) -{ - led_classdev_resume(&wrap_power_led); - led_classdev_resume(&wrap_error_led); - led_classdev_resume(&wrap_extra_led); - return 0; -} -#else -#define wrap_led_suspend NULL -#define wrap_led_resume NULL -#endif - static int wrap_led_probe(struct platform_device *pdev) { int ret; @@ -127,8 +108,6 @@ static int wrap_led_remove(struct platform_device *pdev) static struct platform_driver wrap_led_driver = { .probe = wrap_led_probe, .remove = wrap_led_remove, - .suspend = wrap_led_suspend, - .resume = wrap_led_resume, .driver = { .name = DRVNAME, .owner = THIS_MODULE, diff --git a/drivers/leds/ledtrig-timer.c b/drivers/leds/ledtrig-timer.c index db681962d7b..3d6531396dd 100644 --- a/drivers/leds/ledtrig-timer.c +++ b/drivers/leds/ledtrig-timer.c @@ -199,6 +199,7 @@ err_out: static void timer_trig_deactivate(struct led_classdev *led_cdev) { struct timer_trig_data *timer_data = led_cdev->trigger_data; + unsigned long on = 0, off = 0; if (timer_data) { device_remove_file(led_cdev->dev, &dev_attr_delay_on); @@ -206,6 +207,10 @@ static void timer_trig_deactivate(struct led_classdev *led_cdev) del_timer_sync(&timer_data->timer); kfree(timer_data); } + + /* If there is hardware support for blinking, stop it */ + if (led_cdev->blink_set) + led_cdev->blink_set(led_cdev, &on, &off); } static struct led_trigger timer_led_trigger = { diff --git a/drivers/mfd/wm8350-core.c b/drivers/mfd/wm8350-core.c index 3a273ccef3f..f92595c8f16 100644 --- a/drivers/mfd/wm8350-core.c +++ b/drivers/mfd/wm8350-core.c @@ -1453,6 +1453,9 @@ void wm8350_device_exit(struct wm8350 *wm8350) { int i; + for (i = 0; i < ARRAY_SIZE(wm8350->pmic.led); i++) + platform_device_unregister(wm8350->pmic.led[i].pdev); + for (i = 0; i < ARRAY_SIZE(wm8350->pmic.pdev); i++) platform_device_unregister(wm8350->pmic.pdev[i]); diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 3949a1c7345..419c378bd24 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -120,7 +120,7 @@ config TIFM_CORE cards are supported via 'MMC/SD Card support: TI Flash Media MMC/SD Interface support (MMC_TIFM_SD)'. - To compile this driver as a module, choose M here: the module will + To compile this driver as a module, choose M here: the module will be called tifm_core. config TIFM_7XX1 @@ -133,100 +133,9 @@ config TIFM_7XX1 To make actual use of the device, you will have to select some flash card format drivers, as outlined in the TIFM_CORE Help. 
- To compile this driver as a module, choose M here: the module will + To compile this driver as a module, choose M here: the module will be called tifm_7xx1. -config ACER_WMI - tristate "Acer WMI Laptop Extras (EXPERIMENTAL)" - depends on X86 - depends on EXPERIMENTAL - depends on ACPI - depends on LEDS_CLASS - depends on NEW_LEDS - depends on BACKLIGHT_CLASS_DEVICE - depends on SERIO_I8042 - depends on RFKILL - select ACPI_WMI - ---help--- - This is a driver for newer Acer (and Wistron) laptops. It adds - wireless radio and bluetooth control, and on some laptops, - exposes the mail LED and LCD backlight. - - For more information about this driver see - <file:Documentation/laptops/acer-wmi.txt> - - If you have an ACPI-WMI compatible Acer/ Wistron laptop, say Y or M - here. - -config ASUS_LAPTOP - tristate "Asus Laptop Extras (EXPERIMENTAL)" - depends on X86 - depends on ACPI - depends on EXPERIMENTAL && !ACPI_ASUS - depends on LEDS_CLASS - depends on NEW_LEDS - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This is the new Linux driver for Asus laptops. It may also support some - MEDION, JVC or VICTOR laptops. It makes all the extra buttons generate - standard ACPI events that go through /proc/acpi/events. It also adds - support for video output switching, LCD backlight control, Bluetooth and - Wlan control, and most importantly, allows you to blink those fancy LEDs. - - For more information and a userspace daemon for handling the extra - buttons see <http://acpi4asus.sf.net/>. - - If you have an ACPI-compatible ASUS laptop, say Y or M here. - -config FUJITSU_LAPTOP - tristate "Fujitsu Laptop Extras" - depends on X86 - depends on ACPI - depends on INPUT - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This is a driver for laptops built by Fujitsu: - - * P2xxx/P5xxx/S6xxx/S7xxx series Lifebooks - * Possibly other Fujitsu laptop models - * Tested with S6410 and S7020 - - It adds support for LCD brightness control and some hotkeys. - - If you have a Fujitsu laptop, say Y or M here. - -config FUJITSU_LAPTOP_DEBUG - bool "Verbose debug mode for Fujitsu Laptop Extras" - depends on FUJITSU_LAPTOP - default n - ---help--- - Enables extra debug output from the fujitsu extras driver, at the - expense of a slight increase in driver size. - - If you are not sure, say N here. - -config TC1100_WMI - tristate "HP Compaq TC1100 Tablet WMI Extras (EXPERIMENTAL)" - depends on X86 && !X86_64 - depends on EXPERIMENTAL - depends on ACPI - select ACPI_WMI - ---help--- - This is a driver for the WMI extensions (wireless and bluetooth power - control) of the HP Compaq TC1100 tablet. - -config HP_WMI - tristate "HP WMI extras" - depends on ACPI_WMI - depends on INPUT - depends on RFKILL - help - Say Y here if you want to support WMI-based hotkeys on HP laptops and - to read data from WMI such as docking or ambient light sensor state. - - To compile this driver as a module, choose M here: the module will - be called hp-wmi. - config ICS932S401 tristate "Integrated Circuits ICS932S401" depends on I2C && EXPERIMENTAL @@ -237,170 +146,6 @@ config ICS932S401 This driver can also be built as a module. If so, the module will be called ics932s401. -config MSI_LAPTOP - tristate "MSI Laptop Extras" - depends on X86 - depends on ACPI - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This is a driver for laptops built by MSI (MICRO-STAR - INTERNATIONAL): - - MSI MegaBook S270 (MS-1013) - Cytron/TCM/Medion/Tchibo MD96100/SAM2000 - - It adds support for Bluetooth, WLAN and LCD brightness control. 
- - More information about this driver is available at - <http://0pointer.de/lennart/tchibo.html>. - - If you have an MSI S270 laptop, say Y or M here. - -config PANASONIC_LAPTOP - tristate "Panasonic Laptop Extras" - depends on X86 && INPUT && ACPI - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This driver adds support for access to backlight control and hotkeys - on Panasonic Let's Note laptops. - - If you have a Panasonic Let's note laptop (such as the R1(N variant), - R2, R3, R5, T2, W2 and Y2 series), say Y. - -config COMPAL_LAPTOP - tristate "Compal Laptop Extras" - depends on X86 - depends on ACPI - depends on BACKLIGHT_CLASS_DEVICE - ---help--- - This is a driver for laptops built by Compal: - - Compal FL90/IFL90 - Compal FL91/IFL91 - Compal FL92/JFL92 - Compal FT00/IFT00 - - It adds support for Bluetooth, WLAN and LCD brightness control. - - If you have an Compal FL9x/IFL9x/FT00 laptop, say Y or M here. - -config SONY_LAPTOP - tristate "Sony Laptop Extras" - depends on X86 && ACPI - select BACKLIGHT_CLASS_DEVICE - depends on INPUT - ---help--- - This mini-driver drives the SNC and SPIC devices present in the ACPI - BIOS of the Sony Vaio laptops. - - It gives access to some extra laptop functionalities like Bluetooth, - screen brightness control, Fn keys and allows powering on/off some - devices. - - Read <file:Documentation/laptops/sony-laptop.txt> for more information. - -config SONYPI_COMPAT - bool "Sonypi compatibility" - depends on SONY_LAPTOP - ---help--- - Build the sonypi driver compatibility code into the sony-laptop driver. - -config THINKPAD_ACPI - tristate "ThinkPad ACPI Laptop Extras" - depends on X86 && ACPI - select BACKLIGHT_LCD_SUPPORT - select BACKLIGHT_CLASS_DEVICE - select HWMON - select NVRAM - select INPUT - select NEW_LEDS - select LEDS_CLASS - select NET - select RFKILL - ---help--- - This is a driver for the IBM and Lenovo ThinkPad laptops. It adds - support for Fn-Fx key combinations, Bluetooth control, video - output switching, ThinkLight control, UltraBay eject and more. - For more information about this driver see - <file:Documentation/laptops/thinkpad-acpi.txt> and - <http://ibm-acpi.sf.net/> . - - This driver was formerly known as ibm-acpi. - - If you have an IBM or Lenovo ThinkPad laptop, say Y or M here. - -config THINKPAD_ACPI_DEBUG - bool "Verbose debug mode" - depends on THINKPAD_ACPI - default n - ---help--- - Enables extra debugging information, at the expense of a slightly - increase in driver size. - - If you are not sure, say N here. - -config THINKPAD_ACPI_DOCK - bool "Legacy Docking Station Support" - depends on THINKPAD_ACPI - depends on ACPI_DOCK=n - default n - ---help--- - Allows the thinkpad_acpi driver to handle docking station events. - This support was made obsolete by the generic ACPI docking station - support (CONFIG_ACPI_DOCK). It will allow locking and removing the - laptop from the docking station, but will not properly connect PCI - devices. - - If you are not sure, say N here. - -config THINKPAD_ACPI_BAY - bool "Legacy Removable Bay Support" - depends on THINKPAD_ACPI - default y - ---help--- - Allows the thinkpad_acpi driver to handle removable bays. It will - electrically disable the device in the bay, and also generate - notifications when the bay lever is ejected or inserted. - - If you are not sure, say Y here. 
- -config THINKPAD_ACPI_VIDEO - bool "Video output control support" - depends on THINKPAD_ACPI - default y - ---help--- - Allows the thinkpad_acpi driver to provide an interface to control - the various video output ports. - - This feature often won't work well, depending on ThinkPad model, - display state, video output devices in use, whether there is a X - server running, phase of the moon, and the current mood of - Schroedinger's cat. If you can use X.org's RandR to control - your ThinkPad's video output ports instead of this feature, - don't think twice: do it and say N here to save some memory. - - If you are not sure, say Y here. - -config THINKPAD_ACPI_HOTKEY_POLL - bool "Support NVRAM polling for hot keys" - depends on THINKPAD_ACPI - default y - ---help--- - Some thinkpad models benefit from NVRAM polling to detect a few of - the hot key press events. If you know your ThinkPad model does not - need to do NVRAM polling to support any of the hot keys you use, - unselecting this option will save about 1kB of memory. - - ThinkPads T40 and newer, R52 and newer, and X31 and newer are - unlikely to need NVRAM polling in their latest BIOS versions. - - NVRAM polling can detect at most the following keys: ThinkPad/Access - IBM, Zoom, Switch Display (fn+F7), ThinkLight, Volume up/down/mute, - Brightness up/down, Display Expand (fn+F8), Hibernate (fn+F12). - - If you are not sure, say Y here. The driver enables polling only if - it is strictly necessary to do so. - config ATMEL_SSC tristate "Device driver for Atmel SSC peripheral" depends on AVR32 || ARCH_AT91 @@ -413,31 +158,6 @@ config ATMEL_SSC If unsure, say N. -config INTEL_MENLOW - tristate "Thermal Management driver for Intel menlow platform" - depends on ACPI_THERMAL - select THERMAL - depends on X86 - ---help--- - ACPI thermal management enhancement driver on - Intel Menlow platform. - - If unsure, say N. - -config EEEPC_LAPTOP - tristate "Eee PC Hotkey Driver (EXPERIMENTAL)" - depends on X86 - depends on ACPI - depends on BACKLIGHT_CLASS_DEVICE - depends on HWMON - depends on EXPERIMENTAL - depends on RFKILL - ---help--- - This driver supports the Fn-Fx keys on Eee PC laptops. - It also adds the ability to switch camera/wlan on/off. - - If you have an Eee PC laptop, say Y or M here. - config ENCLOSURE_SERVICES tristate "Enclosure Services" default n diff --git a/drivers/misc/Makefile b/drivers/misc/Makefile index 5de863a0e39..9cf8ae6e4b3 100644 --- a/drivers/misc/Makefile +++ b/drivers/misc/Makefile @@ -1,33 +1,20 @@ # # Makefile for misc devices that really don't fit anywhere else. 
# -obj- := misc.o # Dummy rule to force built-in.o to be made obj-$(CONFIG_IBM_ASM) += ibmasm/ obj-$(CONFIG_HDPU_FEATURES) += hdpuftrs/ -obj-$(CONFIG_ASUS_LAPTOP) += asus-laptop.o -obj-$(CONFIG_EEEPC_LAPTOP) += eeepc-laptop.o -obj-$(CONFIG_MSI_LAPTOP) += msi-laptop.o -obj-$(CONFIG_COMPAL_LAPTOP) += compal-laptop.o -obj-$(CONFIG_ACER_WMI) += acer-wmi.o obj-$(CONFIG_ATMEL_PWM) += atmel_pwm.o obj-$(CONFIG_ATMEL_SSC) += atmel-ssc.o obj-$(CONFIG_ATMEL_TCLIB) += atmel_tclib.o -obj-$(CONFIG_HP_WMI) += hp-wmi.o obj-$(CONFIG_ICS932S401) += ics932s401.o -obj-$(CONFIG_TC1100_WMI) += tc1100-wmi.o obj-$(CONFIG_LKDTM) += lkdtm.o obj-$(CONFIG_TIFM_CORE) += tifm_core.o obj-$(CONFIG_DELL_LAPTOP) += dell-laptop.o obj-$(CONFIG_TIFM_7XX1) += tifm_7xx1.o obj-$(CONFIG_PHANTOM) += phantom.o obj-$(CONFIG_SGI_IOC4) += ioc4.o -obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o -obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o -obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o -obj-$(CONFIG_PANASONIC_LAPTOP) += panasonic-laptop.o obj-$(CONFIG_EEPROM_93CX6) += eeprom_93cx6.o -obj-$(CONFIG_INTEL_MENLOW) += intel_menlow.o obj-$(CONFIG_ENCLOSURE_SERVICES) += enclosure.o obj-$(CONFIG_KGDB_TESTS) += kgdbts.o obj-$(CONFIG_SGI_XP) += sgi-xp/ diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 1e97916914a..76bfe16c09b 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -55,7 +55,6 @@ enum atmel_mci_state { struct atmel_mci_dma { #ifdef CONFIG_MMC_ATMELMCI_DMA - struct dma_client client; struct dma_chan *chan; struct dma_async_tx_descriptor *data_desc; #endif @@ -593,10 +592,8 @@ atmci_submit_data_dma(struct atmel_mci *host, struct mmc_data *data) /* If we don't have a channel, we can't do DMA */ chan = host->dma.chan; - if (chan) { - dma_chan_get(chan); + if (chan) host->data_chan = chan; - } if (!chan) return -ENODEV; @@ -1443,60 +1440,6 @@ static irqreturn_t atmci_detect_interrupt(int irq, void *dev_id) return IRQ_HANDLED; } -#ifdef CONFIG_MMC_ATMELMCI_DMA - -static inline struct atmel_mci * -dma_client_to_atmel_mci(struct dma_client *client) -{ - return container_of(client, struct atmel_mci, dma.client); -} - -static enum dma_state_client atmci_dma_event(struct dma_client *client, - struct dma_chan *chan, enum dma_state state) -{ - struct atmel_mci *host; - enum dma_state_client ret = DMA_NAK; - - host = dma_client_to_atmel_mci(client); - - switch (state) { - case DMA_RESOURCE_AVAILABLE: - spin_lock_bh(&host->lock); - if (!host->dma.chan) { - host->dma.chan = chan; - ret = DMA_ACK; - } - spin_unlock_bh(&host->lock); - - if (ret == DMA_ACK) - dev_info(&host->pdev->dev, - "Using %s for DMA transfers\n", - chan->dev.bus_id); - break; - - case DMA_RESOURCE_REMOVED: - spin_lock_bh(&host->lock); - if (host->dma.chan == chan) { - host->dma.chan = NULL; - ret = DMA_ACK; - } - spin_unlock_bh(&host->lock); - - if (ret == DMA_ACK) - dev_info(&host->pdev->dev, - "Lost %s, falling back to PIO\n", - chan->dev.bus_id); - break; - - default: - break; - } - - - return ret; -} -#endif /* CONFIG_MMC_ATMELMCI_DMA */ - static int __init atmci_init_slot(struct atmel_mci *host, struct mci_slot_pdata *slot_data, unsigned int id, u32 sdc_reg) @@ -1600,6 +1543,18 @@ static void __exit atmci_cleanup_slot(struct atmel_mci_slot *slot, mmc_free_host(slot->mmc); } +#ifdef CONFIG_MMC_ATMELMCI_DMA +static bool filter(struct dma_chan *chan, void *slave) +{ + struct dw_dma_slave *dws = slave; + + if (dws->dma_dev == chan->device->dev) + return true; + else + return false; +} +#endif + static int __init 
atmci_probe(struct platform_device *pdev) { struct mci_platform_data *pdata; @@ -1652,22 +1607,20 @@ static int __init atmci_probe(struct platform_device *pdev) goto err_request_irq; #ifdef CONFIG_MMC_ATMELMCI_DMA - if (pdata->dma_slave) { - struct dma_slave *slave = pdata->dma_slave; + if (pdata->dma_slave.dma_dev) { + struct dw_dma_slave *dws = &pdata->dma_slave; + dma_cap_mask_t mask; - slave->tx_reg = regs->start + MCI_TDR; - slave->rx_reg = regs->start + MCI_RDR; + dws->tx_reg = regs->start + MCI_TDR; + dws->rx_reg = regs->start + MCI_RDR; /* Try to grab a DMA channel */ - host->dma.client.event_callback = atmci_dma_event; - dma_cap_set(DMA_SLAVE, host->dma.client.cap_mask); - host->dma.client.slave = slave; - - dma_async_client_register(&host->dma.client); - dma_async_client_chan_request(&host->dma.client); - } else { - dev_notice(&pdev->dev, "DMA not available, using PIO\n"); + dma_cap_zero(mask); + dma_cap_set(DMA_SLAVE, mask); + host->dma.chan = dma_request_channel(mask, filter, dws); } + if (!host->dma.chan) + dev_notice(&pdev->dev, "DMA not available, using PIO\n"); #endif /* CONFIG_MMC_ATMELMCI_DMA */ platform_set_drvdata(pdev, host); @@ -1699,8 +1652,8 @@ static int __init atmci_probe(struct platform_device *pdev) err_init_slot: #ifdef CONFIG_MMC_ATMELMCI_DMA - if (pdata->dma_slave) - dma_async_client_unregister(&host->dma.client); + if (host->dma.chan) + dma_release_channel(host->dma.chan); #endif free_irq(irq, host); err_request_irq: @@ -1731,8 +1684,8 @@ static int __exit atmci_remove(struct platform_device *pdev) clk_disable(host->mck); #ifdef CONFIG_MMC_ATMELMCI_DMA - if (host->dma.client.slave) - dma_async_client_unregister(&host->dma.client); + if (host->dma.chan) + dma_release_channel(host->dma.chan); #endif free_irq(platform_get_irq(pdev, 0), host); @@ -1761,7 +1714,7 @@ static void __exit atmci_exit(void) platform_driver_unregister(&atmci_driver); } -module_init(atmci_init); +late_initcall(atmci_init); /* try to load after dma driver when built-in */ module_exit(atmci_exit); MODULE_DESCRIPTION("Atmel Multimedia Card Interface driver"); diff --git a/drivers/mtd/Kconfig b/drivers/mtd/Kconfig index a90d50c2c3e..7d04fb9ddca 100644 --- a/drivers/mtd/Kconfig +++ b/drivers/mtd/Kconfig @@ -45,6 +45,14 @@ config MTD_PARTITIONS devices. Partitioning on NFTL 'devices' is a different - that's the 'normal' form of partitioning used on a block device. +config MTD_TESTS + tristate "MTD tests support" + depends on m + help + This option includes various MTD tests into compilation. The tests + should normally be compiled as kernel modules. The modules perform + various checks and verifications when loaded. 
+ config MTD_REDBOOT_PARTS tristate "RedBoot partition table parsing" depends on MTD_PARTITIONS @@ -316,6 +324,8 @@ source "drivers/mtd/nand/Kconfig" source "drivers/mtd/onenand/Kconfig" +source "drivers/mtd/lpddr/Kconfig" + source "drivers/mtd/ubi/Kconfig" endif # MTD diff --git a/drivers/mtd/Makefile b/drivers/mtd/Makefile index 4b77335715f..4521b1ecce4 100644 --- a/drivers/mtd/Makefile +++ b/drivers/mtd/Makefile @@ -29,6 +29,6 @@ obj-$(CONFIG_MTD_OOPS) += mtdoops.o nftl-objs := nftlcore.o nftlmount.o inftl-objs := inftlcore.o inftlmount.o -obj-y += chips/ maps/ devices/ nand/ onenand/ +obj-y += chips/ lpddr/ maps/ devices/ nand/ onenand/ tests/ obj-$(CONFIG_MTD_UBI) += ubi/ diff --git a/drivers/mtd/chips/cfi_cmdset_0001.c b/drivers/mtd/chips/cfi_cmdset_0001.c index c93a8be5d5f..f5ab6fa1057 100644 --- a/drivers/mtd/chips/cfi_cmdset_0001.c +++ b/drivers/mtd/chips/cfi_cmdset_0001.c @@ -58,8 +58,8 @@ static int cfi_intelext_write_buffers(struct mtd_info *, loff_t, size_t, size_t static int cfi_intelext_writev(struct mtd_info *, const struct kvec *, unsigned long, loff_t, size_t *); static int cfi_intelext_erase_varsize(struct mtd_info *, struct erase_info *); static void cfi_intelext_sync (struct mtd_info *); -static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, size_t len); -static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, size_t len); +static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); #ifdef CONFIG_MTD_OTP static int cfi_intelext_read_fact_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *); static int cfi_intelext_read_user_prot_reg (struct mtd_info *, loff_t, size_t, size_t *, u_char *); @@ -558,8 +558,8 @@ static struct mtd_info *cfi_intelext_setup(struct mtd_info *mtd) } for (i=0; i<mtd->numeraseregions;i++){ - printk(KERN_DEBUG "erase region %d: offset=0x%x,size=0x%x,blocks=%d\n", - i,mtd->eraseregions[i].offset, + printk(KERN_DEBUG "erase region %d: offset=0x%llx,size=0x%x,blocks=%d\n", + i,(unsigned long long)mtd->eraseregions[i].offset, mtd->eraseregions[i].erasesize, mtd->eraseregions[i].numblocks); } @@ -2058,7 +2058,7 @@ out: put_chip(map, chip, adr); return ret; } -static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; @@ -2082,7 +2082,7 @@ static int cfi_intelext_lock(struct mtd_info *mtd, loff_t ofs, size_t len) return ret; } -static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_intelext_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; diff --git a/drivers/mtd/chips/cfi_cmdset_0002.c b/drivers/mtd/chips/cfi_cmdset_0002.c index d74ec46aa03..94bb61e1904 100644 --- a/drivers/mtd/chips/cfi_cmdset_0002.c +++ b/drivers/mtd/chips/cfi_cmdset_0002.c @@ -71,8 +71,8 @@ static int get_chip(struct map_info *map, struct flchip *chip, unsigned long adr static void put_chip(struct map_info *map, struct flchip *chip, unsigned long adr); #include "fwh_lock.h" -static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, size_t len); -static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, size_t len); +static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); static struct mtd_chip_driver cfi_amdstd_chipdrv = { .probe = NULL, /* Not usable directly */ @@ -322,6 +322,14 @@ static struct 
cfi_fixup fixup_table[] = { }; +static void cfi_fixup_major_minor(struct cfi_private *cfi, + struct cfi_pri_amdstd *extp) +{ + if (cfi->mfr == CFI_MFR_SAMSUNG && cfi->id == 0x257e && + extp->MajorVersion == '0') + extp->MajorVersion = '1'; +} + struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary) { struct cfi_private *cfi = map->fldrv_priv; @@ -363,6 +371,8 @@ struct mtd_info *cfi_cmdset_0002(struct map_info *map, int primary) return NULL; } + cfi_fixup_major_minor(cfi, extp); + if (extp->MajorVersion != '1' || (extp->MinorVersion < '0' || extp->MinorVersion > '4')) { printk(KERN_ERR " Unknown Amd/Fujitsu Extended Query " @@ -1774,12 +1784,12 @@ out_unlock: return ret; } -static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_atmel_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { return cfi_varsize_frob(mtd, do_atmel_lock, ofs, len, NULL); } -static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_atmel_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { return cfi_varsize_frob(mtd, do_atmel_unlock, ofs, len, NULL); } diff --git a/drivers/mtd/chips/cfi_cmdset_0020.c b/drivers/mtd/chips/cfi_cmdset_0020.c index d4714dd9f7a..6c740f346f9 100644 --- a/drivers/mtd/chips/cfi_cmdset_0020.c +++ b/drivers/mtd/chips/cfi_cmdset_0020.c @@ -42,8 +42,8 @@ static int cfi_staa_writev(struct mtd_info *mtd, const struct kvec *vecs, unsigned long count, loff_t to, size_t *retlen); static int cfi_staa_erase_varsize(struct mtd_info *, struct erase_info *); static void cfi_staa_sync (struct mtd_info *); -static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, size_t len); -static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, size_t len); +static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); static int cfi_staa_suspend (struct mtd_info *); static void cfi_staa_resume (struct mtd_info *); @@ -221,8 +221,8 @@ static struct mtd_info *cfi_staa_setup(struct map_info *map) } for (i=0; i<mtd->numeraseregions;i++){ - printk(KERN_DEBUG "%d: offset=0x%x,size=0x%x,blocks=%d\n", - i,mtd->eraseregions[i].offset, + printk(KERN_DEBUG "%d: offset=0x%llx,size=0x%x,blocks=%d\n", + i, (unsigned long long)mtd->eraseregions[i].offset, mtd->eraseregions[i].erasesize, mtd->eraseregions[i].numblocks); } @@ -964,7 +964,7 @@ static int cfi_staa_erase_varsize(struct mtd_info *mtd, adr += regions[i].erasesize; len -= regions[i].erasesize; - if (adr % (1<< cfi->chipshift) == ((regions[i].offset + (regions[i].erasesize * regions[i].numblocks)) %( 1<< cfi->chipshift))) + if (adr % (1<< cfi->chipshift) == (((unsigned long)regions[i].offset + (regions[i].erasesize * regions[i].numblocks)) %( 1<< cfi->chipshift))) i++; if (adr >> cfi->chipshift) { @@ -1135,7 +1135,7 @@ retry: spin_unlock_bh(chip->mutex); return 0; } -static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_staa_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; @@ -1284,7 +1284,7 @@ retry: spin_unlock_bh(chip->mutex); return 0; } -static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int cfi_staa_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct map_info *map = mtd->priv; struct cfi_private *cfi = map->fldrv_priv; diff --git a/drivers/mtd/chips/fwh_lock.h b/drivers/mtd/chips/fwh_lock.h index ab44f2b996f..57e0e4e921f 100644 --- 
a/drivers/mtd/chips/fwh_lock.h +++ b/drivers/mtd/chips/fwh_lock.h @@ -77,7 +77,7 @@ static int fwh_xxlock_oneblock(struct map_info *map, struct flchip *chip, } -static int fwh_lock_varsize(struct mtd_info *mtd, loff_t ofs, size_t len) +static int fwh_lock_varsize(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; @@ -88,7 +88,7 @@ static int fwh_lock_varsize(struct mtd_info *mtd, loff_t ofs, size_t len) } -static int fwh_unlock_varsize(struct mtd_info *mtd, loff_t ofs, size_t len) +static int fwh_unlock_varsize(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; diff --git a/drivers/mtd/devices/lart.c b/drivers/mtd/devices/lart.c index f4bda4cee49..578de1c67bf 100644 --- a/drivers/mtd/devices/lart.c +++ b/drivers/mtd/devices/lart.c @@ -619,7 +619,7 @@ static struct mtd_partition lart_partitions[] = { }; #endif -int __init lart_flash_init (void) +static int __init lart_flash_init (void) { int result; memset (&mtd,0,sizeof (mtd)); @@ -690,7 +690,7 @@ int __init lart_flash_init (void) return (result); } -void __exit lart_flash_exit (void) +static void __exit lart_flash_exit (void) { #ifndef HAVE_PARTITIONS del_mtd_device (&mtd); @@ -705,5 +705,3 @@ module_exit (lart_flash_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Abraham vd Merwe <abraham@2d3d.co.za>"); MODULE_DESCRIPTION("MTD driver for Intel 28F160F3 on LART board"); - - diff --git a/drivers/mtd/devices/m25p80.c b/drivers/mtd/devices/m25p80.c index 5733f064384..7c3fc766dcf 100644 --- a/drivers/mtd/devices/m25p80.c +++ b/drivers/mtd/devices/m25p80.c @@ -20,6 +20,7 @@ #include <linux/device.h> #include <linux/interrupt.h> #include <linux/mutex.h> +#include <linux/math64.h> #include <linux/mtd/mtd.h> #include <linux/mtd/partitions.h> @@ -169,9 +170,9 @@ static int wait_till_ready(struct m25p *flash) */ static int erase_chip(struct m25p *flash) { - DEBUG(MTD_DEBUG_LEVEL3, "%s: %s %dKiB\n", - dev_name(&flash->spi->dev), __func__, - flash->mtd.size / 1024); + DEBUG(MTD_DEBUG_LEVEL3, "%s: %s %lldKiB\n", + dev_name(&flash->spi->dev), __func__, + (long long)(flash->mtd.size >> 10)); /* Wait until finished previous write command. 
*/ if (wait_till_ready(flash)) @@ -232,18 +233,18 @@ static int m25p80_erase(struct mtd_info *mtd, struct erase_info *instr) { struct m25p *flash = mtd_to_m25p(mtd); u32 addr,len; + uint32_t rem; - DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%08x, len %d\n", - dev_name(&flash->spi->dev), __func__, "at", - (u32)instr->addr, instr->len); + DEBUG(MTD_DEBUG_LEVEL2, "%s: %s %s 0x%llx, len %lld\n", + dev_name(&flash->spi->dev), __func__, "at", + (long long)instr->addr, (long long)instr->len); /* sanity checks */ if (instr->addr + instr->len > flash->mtd.size) return -EINVAL; - if ((instr->addr % mtd->erasesize) != 0 - || (instr->len % mtd->erasesize) != 0) { + div_u64_rem(instr->len, mtd->erasesize, &rem); + if (rem) return -EINVAL; - } addr = instr->addr; len = instr->len; @@ -677,24 +678,24 @@ static int __devinit m25p_probe(struct spi_device *spi) flash->mtd.erasesize = info->sector_size; } - dev_info(&spi->dev, "%s (%d Kbytes)\n", info->name, - flash->mtd.size / 1024); + dev_info(&spi->dev, "%s (%lld Kbytes)\n", info->name, + (long long)flash->mtd.size >> 10); DEBUG(MTD_DEBUG_LEVEL2, - "mtd .name = %s, .size = 0x%.8x (%uMiB) " + "mtd .name = %s, .size = 0x%llx (%lldMiB) " ".erasesize = 0x%.8x (%uKiB) .numeraseregions = %d\n", flash->mtd.name, - flash->mtd.size, flash->mtd.size / (1024*1024), + (long long)flash->mtd.size, (long long)(flash->mtd.size >> 20), flash->mtd.erasesize, flash->mtd.erasesize / 1024, flash->mtd.numeraseregions); if (flash->mtd.numeraseregions) for (i = 0; i < flash->mtd.numeraseregions; i++) DEBUG(MTD_DEBUG_LEVEL2, - "mtd.eraseregions[%d] = { .offset = 0x%.8x, " + "mtd.eraseregions[%d] = { .offset = 0x%llx, " ".erasesize = 0x%.8x (%uKiB), " ".numblocks = %d }\n", - i, flash->mtd.eraseregions[i].offset, + i, (long long)flash->mtd.eraseregions[i].offset, flash->mtd.eraseregions[i].erasesize, flash->mtd.eraseregions[i].erasesize / 1024, flash->mtd.eraseregions[i].numblocks); @@ -722,12 +723,12 @@ static int __devinit m25p_probe(struct spi_device *spi) if (nr_parts > 0) { for (i = 0; i < nr_parts; i++) { DEBUG(MTD_DEBUG_LEVEL2, "partitions[%d] = " - "{.name = %s, .offset = 0x%.8x, " - ".size = 0x%.8x (%uKiB) }\n", + "{.name = %s, .offset = 0x%llx, " + ".size = 0x%llx (%lldKiB) }\n", i, parts[i].name, - parts[i].offset, - parts[i].size, - parts[i].size / 1024); + (long long)parts[i].offset, + (long long)parts[i].size, + (long long)(parts[i].size >> 10)); } flash->partitioned = 1; return add_mtd_partitions(&flash->mtd, parts, nr_parts); diff --git a/drivers/mtd/devices/mtd_dataflash.c b/drivers/mtd/devices/mtd_dataflash.c index 65126cd668f..d44f741ae22 100644 --- a/drivers/mtd/devices/mtd_dataflash.c +++ b/drivers/mtd/devices/mtd_dataflash.c @@ -16,6 +16,7 @@ #include <linux/device.h> #include <linux/mutex.h> #include <linux/err.h> +#include <linux/math64.h> #include <linux/spi/spi.h> #include <linux/spi/flash.h> @@ -152,15 +153,20 @@ static int dataflash_erase(struct mtd_info *mtd, struct erase_info *instr) struct spi_message msg; unsigned blocksize = priv->page_size << 3; uint8_t *command; + uint32_t rem; - DEBUG(MTD_DEBUG_LEVEL2, "%s: erase addr=0x%x len 0x%x\n", - dev_name(&spi->dev), - instr->addr, instr->len); + DEBUG(MTD_DEBUG_LEVEL2, "%s: erase addr=0x%llx len 0x%llx\n", + dev_name(&spi->dev), (long long)instr->addr, + (long long)instr->len); /* Sanity checks */ - if ((instr->addr + instr->len) > mtd->size - || (instr->len % priv->page_size) != 0 - || (instr->addr % priv->page_size) != 0) + if (instr->addr + instr->len > mtd->size) + return -EINVAL; + 
div_u64_rem(instr->len, priv->page_size, &rem); + if (rem) + return -EINVAL; + div_u64_rem(instr->addr, priv->page_size, &rem); + if (rem) return -EINVAL; spi_message_init(&msg); @@ -178,7 +184,7 @@ static int dataflash_erase(struct mtd_info *mtd, struct erase_info *instr) /* Calculate flash page address; use block erase (for speed) if * we're at a block boundary and need to erase the whole block. */ - pageaddr = instr->addr / priv->page_size; + pageaddr = div_u64(instr->len, priv->page_size); do_block = (pageaddr & 0x7) == 0 && instr->len >= blocksize; pageaddr = pageaddr << priv->page_offset; @@ -667,8 +673,8 @@ add_dataflash_otp(struct spi_device *spi, char *name, if (revision >= 'c') otp_tag = otp_setup(device, revision); - dev_info(&spi->dev, "%s (%d KBytes) pagesize %d bytes%s\n", - name, DIV_ROUND_UP(device->size, 1024), + dev_info(&spi->dev, "%s (%lld KBytes) pagesize %d bytes%s\n", + name, (long long)((device->size + 1023) >> 10), pagesize, otp_tag); dev_set_drvdata(&spi->dev, priv); diff --git a/drivers/mtd/ftl.c b/drivers/mtd/ftl.c index 9bf581c4f74..a790c062af1 100644 --- a/drivers/mtd/ftl.c +++ b/drivers/mtd/ftl.c @@ -109,25 +109,25 @@ module_param(shuffle_freq, int, 0); /* Each memory region corresponds to a minor device */ typedef struct partition_t { struct mtd_blktrans_dev mbd; - u_int32_t state; - u_int32_t *VirtualBlockMap; - u_int32_t *VirtualPageMap; - u_int32_t FreeTotal; + uint32_t state; + uint32_t *VirtualBlockMap; + uint32_t *VirtualPageMap; + uint32_t FreeTotal; struct eun_info_t { - u_int32_t Offset; - u_int32_t EraseCount; - u_int32_t Free; - u_int32_t Deleted; + uint32_t Offset; + uint32_t EraseCount; + uint32_t Free; + uint32_t Deleted; } *EUNInfo; struct xfer_info_t { - u_int32_t Offset; - u_int32_t EraseCount; - u_int16_t state; + uint32_t Offset; + uint32_t EraseCount; + uint16_t state; } *XferInfo; - u_int16_t bam_index; - u_int32_t *bam_cache; - u_int16_t DataUnits; - u_int32_t BlocksPerUnit; + uint16_t bam_index; + uint32_t *bam_cache; + uint16_t DataUnits; + uint32_t BlocksPerUnit; erase_unit_header_t header; } partition_t; @@ -199,8 +199,8 @@ static int scan_header(partition_t *part) static int build_maps(partition_t *part) { erase_unit_header_t header; - u_int16_t xvalid, xtrans, i; - u_int blocks, j; + uint16_t xvalid, xtrans, i; + unsigned blocks, j; int hdr_ok, ret = -1; ssize_t retval; loff_t offset; @@ -269,14 +269,14 @@ static int build_maps(partition_t *part) /* Set up virtual page map */ blocks = le32_to_cpu(header.FormattedSize) >> header.BlockSize; - part->VirtualBlockMap = vmalloc(blocks * sizeof(u_int32_t)); + part->VirtualBlockMap = vmalloc(blocks * sizeof(uint32_t)); if (!part->VirtualBlockMap) goto out_XferInfo; - memset(part->VirtualBlockMap, 0xff, blocks * sizeof(u_int32_t)); + memset(part->VirtualBlockMap, 0xff, blocks * sizeof(uint32_t)); part->BlocksPerUnit = (1 << header.EraseUnitSize) >> header.BlockSize; - part->bam_cache = kmalloc(part->BlocksPerUnit * sizeof(u_int32_t), + part->bam_cache = kmalloc(part->BlocksPerUnit * sizeof(uint32_t), GFP_KERNEL); if (!part->bam_cache) goto out_VirtualBlockMap; @@ -290,7 +290,7 @@ static int build_maps(partition_t *part) offset = part->EUNInfo[i].Offset + le32_to_cpu(header.BAMOffset); ret = part->mbd.mtd->read(part->mbd.mtd, offset, - part->BlocksPerUnit * sizeof(u_int32_t), &retval, + part->BlocksPerUnit * sizeof(uint32_t), &retval, (unsigned char *)part->bam_cache); if (ret) @@ -332,7 +332,7 @@ out: ======================================================================*/ static int 
erase_xfer(partition_t *part, - u_int16_t xfernum) + uint16_t xfernum) { int ret; struct xfer_info_t *xfer; @@ -408,7 +408,7 @@ static int prepare_xfer(partition_t *part, int i) erase_unit_header_t header; struct xfer_info_t *xfer; int nbam, ret; - u_int32_t ctl; + uint32_t ctl; ssize_t retlen; loff_t offset; @@ -430,15 +430,15 @@ static int prepare_xfer(partition_t *part, int i) } /* Write the BAM stub */ - nbam = (part->BlocksPerUnit * sizeof(u_int32_t) + + nbam = (part->BlocksPerUnit * sizeof(uint32_t) + le32_to_cpu(part->header.BAMOffset) + SECTOR_SIZE - 1) / SECTOR_SIZE; offset = xfer->Offset + le32_to_cpu(part->header.BAMOffset); ctl = cpu_to_le32(BLOCK_CONTROL); - for (i = 0; i < nbam; i++, offset += sizeof(u_int32_t)) { + for (i = 0; i < nbam; i++, offset += sizeof(uint32_t)) { - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(u_int32_t), + ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, (u_char *)&ctl); if (ret) @@ -461,18 +461,18 @@ static int prepare_xfer(partition_t *part, int i) ======================================================================*/ -static int copy_erase_unit(partition_t *part, u_int16_t srcunit, - u_int16_t xferunit) +static int copy_erase_unit(partition_t *part, uint16_t srcunit, + uint16_t xferunit) { u_char buf[SECTOR_SIZE]; struct eun_info_t *eun; struct xfer_info_t *xfer; - u_int32_t src, dest, free, i; - u_int16_t unit; + uint32_t src, dest, free, i; + uint16_t unit; int ret; ssize_t retlen; loff_t offset; - u_int16_t srcunitswap = cpu_to_le16(srcunit); + uint16_t srcunitswap = cpu_to_le16(srcunit); eun = &part->EUNInfo[srcunit]; xfer = &part->XferInfo[xferunit]; @@ -486,7 +486,7 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, offset = eun->Offset + le32_to_cpu(part->header.BAMOffset); ret = part->mbd.mtd->read(part->mbd.mtd, offset, - part->BlocksPerUnit * sizeof(u_int32_t), + part->BlocksPerUnit * sizeof(uint32_t), &retlen, (u_char *) (part->bam_cache)); /* mark the cache bad, in case we get an error later */ @@ -503,7 +503,7 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, offset = xfer->Offset + 20; /* Bad! */ unit = cpu_to_le16(0x7fff); - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(u_int16_t), + ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint16_t), &retlen, (u_char *) &unit); if (ret) { @@ -560,7 +560,7 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, /* All clear? 
Then update the LogicalEUN again */ - ret = part->mbd.mtd->write(part->mbd.mtd, xfer->Offset + 20, sizeof(u_int16_t), + ret = part->mbd.mtd->write(part->mbd.mtd, xfer->Offset + 20, sizeof(uint16_t), &retlen, (u_char *)&srcunitswap); if (ret) { @@ -605,8 +605,8 @@ static int copy_erase_unit(partition_t *part, u_int16_t srcunit, static int reclaim_block(partition_t *part) { - u_int16_t i, eun, xfer; - u_int32_t best; + uint16_t i, eun, xfer; + uint32_t best; int queued, ret; DEBUG(0, "ftl_cs: reclaiming space...\n"); @@ -723,10 +723,10 @@ static void dump_lists(partition_t *part) } #endif -static u_int32_t find_free(partition_t *part) +static uint32_t find_free(partition_t *part) { - u_int16_t stop, eun; - u_int32_t blk; + uint16_t stop, eun; + uint32_t blk; size_t retlen; int ret; @@ -749,7 +749,7 @@ static u_int32_t find_free(partition_t *part) ret = part->mbd.mtd->read(part->mbd.mtd, part->EUNInfo[eun].Offset + le32_to_cpu(part->header.BAMOffset), - part->BlocksPerUnit * sizeof(u_int32_t), + part->BlocksPerUnit * sizeof(uint32_t), &retlen, (u_char *) (part->bam_cache)); if (ret) { @@ -786,7 +786,7 @@ static u_int32_t find_free(partition_t *part) static int ftl_read(partition_t *part, caddr_t buffer, u_long sector, u_long nblocks) { - u_int32_t log_addr, bsize; + uint32_t log_addr, bsize; u_long i; int ret; size_t offset, retlen; @@ -829,14 +829,14 @@ static int ftl_read(partition_t *part, caddr_t buffer, ======================================================================*/ -static int set_bam_entry(partition_t *part, u_int32_t log_addr, - u_int32_t virt_addr) +static int set_bam_entry(partition_t *part, uint32_t log_addr, + uint32_t virt_addr) { - u_int32_t bsize, blk, le_virt_addr; + uint32_t bsize, blk, le_virt_addr; #ifdef PSYCHO_DEBUG - u_int32_t old_addr; + uint32_t old_addr; #endif - u_int16_t eun; + uint16_t eun; int ret; size_t retlen, offset; @@ -845,11 +845,11 @@ static int set_bam_entry(partition_t *part, u_int32_t log_addr, bsize = 1 << part->header.EraseUnitSize; eun = log_addr / bsize; blk = (log_addr % bsize) / SECTOR_SIZE; - offset = (part->EUNInfo[eun].Offset + blk * sizeof(u_int32_t) + + offset = (part->EUNInfo[eun].Offset + blk * sizeof(uint32_t) + le32_to_cpu(part->header.BAMOffset)); #ifdef PSYCHO_DEBUG - ret = part->mbd.mtd->read(part->mbd.mtd, offset, sizeof(u_int32_t), + ret = part->mbd.mtd->read(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, (u_char *)&old_addr); if (ret) { printk(KERN_WARNING"ftl: Error reading old_addr in set_bam_entry: %d\n",ret); @@ -886,7 +886,7 @@ static int set_bam_entry(partition_t *part, u_int32_t log_addr, #endif part->bam_cache[blk] = le_virt_addr; } - ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(u_int32_t), + ret = part->mbd.mtd->write(part->mbd.mtd, offset, sizeof(uint32_t), &retlen, (u_char *)&le_virt_addr); if (ret) { @@ -900,7 +900,7 @@ static int set_bam_entry(partition_t *part, u_int32_t log_addr, static int ftl_write(partition_t *part, caddr_t buffer, u_long sector, u_long nblocks) { - u_int32_t bsize, log_addr, virt_addr, old_addr, blk; + uint32_t bsize, log_addr, virt_addr, old_addr, blk; u_long i; int ret; size_t retlen, offset; diff --git a/drivers/mtd/inftlcore.c b/drivers/mtd/inftlcore.c index 50ce13887f6..73f05227dc8 100644 --- a/drivers/mtd/inftlcore.c +++ b/drivers/mtd/inftlcore.c @@ -50,7 +50,7 @@ static void inftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) struct INFTLrecord *inftl; unsigned long temp; - if (mtd->type != MTD_NANDFLASH) + if (mtd->type != MTD_NANDFLASH || mtd->size > 
UINT_MAX) return; /* OK, this is moderately ugly. But probably safe. Alternatives? */ if (memcmp(mtd->name, "DiskOnChip", 10)) diff --git a/drivers/mtd/inftlmount.c b/drivers/mtd/inftlmount.c index 9113628ed1e..f751dd97c54 100644 --- a/drivers/mtd/inftlmount.c +++ b/drivers/mtd/inftlmount.c @@ -63,7 +63,7 @@ static int find_boot_record(struct INFTLrecord *inftl) * otherwise. */ inftl->EraseSize = inftl->mbd.mtd->erasesize; - inftl->nb_blocks = inftl->mbd.mtd->size / inftl->EraseSize; + inftl->nb_blocks = (u32)inftl->mbd.mtd->size / inftl->EraseSize; inftl->MediaUnit = BLOCK_NIL; @@ -187,7 +187,7 @@ static int find_boot_record(struct INFTLrecord *inftl) mh->BlockMultiplierBits); inftl->EraseSize = inftl->mbd.mtd->erasesize << mh->BlockMultiplierBits; - inftl->nb_blocks = inftl->mbd.mtd->size / inftl->EraseSize; + inftl->nb_blocks = (u32)inftl->mbd.mtd->size / inftl->EraseSize; block >>= mh->BlockMultiplierBits; } diff --git a/drivers/mtd/lpddr/Kconfig b/drivers/mtd/lpddr/Kconfig new file mode 100644 index 00000000000..acd4ea9b227 --- /dev/null +++ b/drivers/mtd/lpddr/Kconfig @@ -0,0 +1,22 @@ +# drivers/mtd/chips/Kconfig + +menu "LPDDR flash memory drivers" + depends on MTD!=n + +config MTD_LPDDR + tristate "Support for LPDDR flash chips" + select MTD_QINFO_PROBE + help + This option enables support of LPDDR (Low power double data rate) + flash chips. Synonymous with Mobile-DDR. It is a new standard for + DDR memories, intended for battery-operated systems. + +config MTD_QINFO_PROBE + tristate "Detect flash chips by QINFO probe" + help + Device Information for LPDDR chips is offered through the Overlay + Window QINFO interface, permits software to be used for entire + families of devices. This serves similar purpose of CFI on legacy + Flash products +endmenu + diff --git a/drivers/mtd/lpddr/Makefile b/drivers/mtd/lpddr/Makefile new file mode 100644 index 00000000000..da48e46b581 --- /dev/null +++ b/drivers/mtd/lpddr/Makefile @@ -0,0 +1,6 @@ +# +# linux/drivers/mtd/lpddr/Makefile +# + +obj-$(CONFIG_MTD_QINFO_PROBE) += qinfo_probe.o +obj-$(CONFIG_MTD_LPDDR) += lpddr_cmds.o diff --git a/drivers/mtd/lpddr/lpddr_cmds.c b/drivers/mtd/lpddr/lpddr_cmds.c new file mode 100644 index 00000000000..e22ca49583e --- /dev/null +++ b/drivers/mtd/lpddr/lpddr_cmds.c @@ -0,0 +1,796 @@ +/* + * LPDDR flash memory device operations. This module provides read, write, + * erase, lock/unlock support for LPDDR flash memories + * (C) 2008 Korolev Alexey <akorolev@infradead.org> + * (C) 2008 Vasiliy Leonenko <vasiliy.leonenko@gmail.com> + * Many thanks to Roman Borisov for intial enabling + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. 
+ * TODO: + * Implement VPP management + * Implement XIP support + * Implement OTP support + */ +#include <linux/mtd/pfow.h> +#include <linux/mtd/qinfo.h> + +static int lpddr_read(struct mtd_info *mtd, loff_t adr, size_t len, + size_t *retlen, u_char *buf); +static int lpddr_write_buffers(struct mtd_info *mtd, loff_t to, + size_t len, size_t *retlen, const u_char *buf); +static int lpddr_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen); +static int lpddr_erase(struct mtd_info *mtd, struct erase_info *instr); +static int lpddr_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int lpddr_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len); +static int lpddr_point(struct mtd_info *mtd, loff_t adr, size_t len, + size_t *retlen, void **mtdbuf, resource_size_t *phys); +static void lpddr_unpoint(struct mtd_info *mtd, loff_t adr, size_t len); +static int get_chip(struct map_info *map, struct flchip *chip, int mode); +static int chip_ready(struct map_info *map, struct flchip *chip, int mode); +static void put_chip(struct map_info *map, struct flchip *chip); + +struct mtd_info *lpddr_cmdset(struct map_info *map) +{ + struct lpddr_private *lpddr = map->fldrv_priv; + struct flchip_shared *shared; + struct flchip *chip; + struct mtd_info *mtd; + int numchips; + int i, j; + + mtd = kzalloc(sizeof(*mtd), GFP_KERNEL); + if (!mtd) { + printk(KERN_ERR "Failed to allocate memory for MTD device\n"); + return NULL; + } + mtd->priv = map; + mtd->type = MTD_NORFLASH; + + /* Fill in the default mtd operations */ + mtd->read = lpddr_read; + mtd->type = MTD_NORFLASH; + mtd->flags = MTD_CAP_NORFLASH; + mtd->flags &= ~MTD_BIT_WRITEABLE; + mtd->erase = lpddr_erase; + mtd->write = lpddr_write_buffers; + mtd->writev = lpddr_writev; + mtd->read_oob = NULL; + mtd->write_oob = NULL; + mtd->sync = NULL; + mtd->lock = lpddr_lock; + mtd->unlock = lpddr_unlock; + mtd->suspend = NULL; + mtd->resume = NULL; + if (map_is_linear(map)) { + mtd->point = lpddr_point; + mtd->unpoint = lpddr_unpoint; + } + mtd->block_isbad = NULL; + mtd->block_markbad = NULL; + mtd->size = 1 << lpddr->qinfo->DevSizeShift; + mtd->erasesize = 1 << lpddr->qinfo->UniformBlockSizeShift; + mtd->writesize = 1 << lpddr->qinfo->BufSizeShift; + + shared = kmalloc(sizeof(struct flchip_shared) * lpddr->numchips, + GFP_KERNEL); + if (!shared) { + kfree(lpddr); + kfree(mtd); + return NULL; + } + + chip = &lpddr->chips[0]; + numchips = lpddr->numchips / lpddr->qinfo->HWPartsNum; + for (i = 0; i < numchips; i++) { + shared[i].writing = shared[i].erasing = NULL; + spin_lock_init(&shared[i].lock); + for (j = 0; j < lpddr->qinfo->HWPartsNum; j++) { + *chip = lpddr->chips[i]; + chip->start += j << lpddr->chipshift; + chip->oldstate = chip->state = FL_READY; + chip->priv = &shared[i]; + /* those should be reset too since + they create memory references. 
*/ + init_waitqueue_head(&chip->wq); + spin_lock_init(&chip->_spinlock); + chip->mutex = &chip->_spinlock; + chip++; + } + } + + return mtd; +} +EXPORT_SYMBOL(lpddr_cmdset); + +static int wait_for_ready(struct map_info *map, struct flchip *chip, + unsigned int chip_op_time) +{ + unsigned int timeo, reset_timeo, sleep_time; + unsigned int dsr; + flstate_t chip_state = chip->state; + int ret = 0; + + /* set our timeout to 8 times the expected delay */ + timeo = chip_op_time * 8; + if (!timeo) + timeo = 500000; + reset_timeo = timeo; + sleep_time = chip_op_time / 2; + + for (;;) { + dsr = CMDVAL(map_read(map, map->pfow_base + PFOW_DSR)); + if (dsr & DSR_READY_STATUS) + break; + if (!timeo) { + printk(KERN_ERR "%s: Flash timeout error state %d \n", + map->name, chip_state); + ret = -ETIME; + break; + } + + /* OK Still waiting. Drop the lock, wait a while and retry. */ + spin_unlock(chip->mutex); + if (sleep_time >= 1000000/HZ) { + /* + * Half of the normal delay still remaining + * can be performed with a sleeping delay instead + * of busy waiting. + */ + msleep(sleep_time/1000); + timeo -= sleep_time; + sleep_time = 1000000/HZ; + } else { + udelay(1); + cond_resched(); + timeo--; + } + spin_lock(chip->mutex); + + while (chip->state != chip_state) { + /* Someone's suspended the operation: sleep */ + DECLARE_WAITQUEUE(wait, current); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&chip->wq, &wait); + spin_unlock(chip->mutex); + schedule(); + remove_wait_queue(&chip->wq, &wait); + spin_lock(chip->mutex); + } + if (chip->erase_suspended || chip->write_suspended) { + /* Suspend has occured while sleep: reset timeout */ + timeo = reset_timeo; + chip->erase_suspended = chip->write_suspended = 0; + } + } + /* check status for errors */ + if (dsr & DSR_ERR) { + /* Clear DSR*/ + map_write(map, CMD(~(DSR_ERR)), map->pfow_base + PFOW_DSR); + printk(KERN_WARNING"%s: Bad status on wait: 0x%x \n", + map->name, dsr); + print_drs_error(dsr); + ret = -EIO; + } + chip->state = FL_READY; + return ret; +} + +static int get_chip(struct map_info *map, struct flchip *chip, int mode) +{ + int ret; + DECLARE_WAITQUEUE(wait, current); + + retry: + if (chip->priv && (mode == FL_WRITING || mode == FL_ERASING) + && chip->state != FL_SYNCING) { + /* + * OK. We have possibility for contension on the write/erase + * operations which are global to the real chip and not per + * partition. So let's fight it over in the partition which + * currently has authority on the operation. + * + * The rules are as follows: + * + * - any write operation must own shared->writing. + * + * - any erase operation must own _both_ shared->writing and + * shared->erasing. + * + * - contension arbitration is handled in the owner's context. + * + * The 'shared' struct can be read and/or written only when + * its lock is taken. + */ + struct flchip_shared *shared = chip->priv; + struct flchip *contender; + spin_lock(&shared->lock); + contender = shared->writing; + if (contender && contender != chip) { + /* + * The engine to perform desired operation on this + * partition is already in use by someone else. + * Let's fight over it in the context of the chip + * currently using it. If it is possible to suspend, + * that other partition will do just that, otherwise + * it'll happily send us to sleep. In any case, when + * get_chip returns success we're clear to go ahead. 
+ */ + ret = spin_trylock(contender->mutex); + spin_unlock(&shared->lock); + if (!ret) + goto retry; + spin_unlock(chip->mutex); + ret = chip_ready(map, contender, mode); + spin_lock(chip->mutex); + + if (ret == -EAGAIN) { + spin_unlock(contender->mutex); + goto retry; + } + if (ret) { + spin_unlock(contender->mutex); + return ret; + } + spin_lock(&shared->lock); + + /* We should not own chip if it is already in FL_SYNCING + * state. Put contender and retry. */ + if (chip->state == FL_SYNCING) { + put_chip(map, contender); + spin_unlock(contender->mutex); + goto retry; + } + spin_unlock(contender->mutex); + } + + /* Check if we have suspended erase on this chip. + Must sleep in such a case. */ + if (mode == FL_ERASING && shared->erasing + && shared->erasing->oldstate == FL_ERASING) { + spin_unlock(&shared->lock); + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&chip->wq, &wait); + spin_unlock(chip->mutex); + schedule(); + remove_wait_queue(&chip->wq, &wait); + spin_lock(chip->mutex); + goto retry; + } + + /* We now own it */ + shared->writing = chip; + if (mode == FL_ERASING) + shared->erasing = chip; + spin_unlock(&shared->lock); + } + + ret = chip_ready(map, chip, mode); + if (ret == -EAGAIN) + goto retry; + + return ret; +} + +static int chip_ready(struct map_info *map, struct flchip *chip, int mode) +{ + struct lpddr_private *lpddr = map->fldrv_priv; + int ret = 0; + DECLARE_WAITQUEUE(wait, current); + + /* Prevent setting state FL_SYNCING for chip in suspended state. */ + if (FL_SYNCING == mode && FL_READY != chip->oldstate) + goto sleep; + + switch (chip->state) { + case FL_READY: + case FL_JEDEC_QUERY: + return 0; + + case FL_ERASING: + if (!lpddr->qinfo->SuspEraseSupp || + !(mode == FL_READY || mode == FL_POINT)) + goto sleep; + + map_write(map, CMD(LPDDR_SUSPEND), + map->pfow_base + PFOW_PROGRAM_ERASE_SUSPEND); + chip->oldstate = FL_ERASING; + chip->state = FL_ERASE_SUSPENDING; + ret = wait_for_ready(map, chip, 0); + if (ret) { + /* Oops. something got wrong. */ + /* Resume and pretend we weren't here. */ + map_write(map, CMD(LPDDR_RESUME), + map->pfow_base + PFOW_COMMAND_CODE); + map_write(map, CMD(LPDDR_START_EXECUTION), + map->pfow_base + PFOW_COMMAND_EXECUTE); + chip->state = FL_ERASING; + chip->oldstate = FL_READY; + printk(KERN_ERR "%s: suspend operation failed." + "State may be wrong \n", map->name); + return -EIO; + } + chip->erase_suspended = 1; + chip->state = FL_READY; + return 0; + /* Erase suspend */ + case FL_POINT: + /* Only if there's no operation suspended... 
*/ + if (mode == FL_READY && chip->oldstate == FL_READY) + return 0; + + default: +sleep: + set_current_state(TASK_UNINTERRUPTIBLE); + add_wait_queue(&chip->wq, &wait); + spin_unlock(chip->mutex); + schedule(); + remove_wait_queue(&chip->wq, &wait); + spin_lock(chip->mutex); + return -EAGAIN; + } +} + +static void put_chip(struct map_info *map, struct flchip *chip) +{ + if (chip->priv) { + struct flchip_shared *shared = chip->priv; + spin_lock(&shared->lock); + if (shared->writing == chip && chip->oldstate == FL_READY) { + /* We own the ability to write, but we're done */ + shared->writing = shared->erasing; + if (shared->writing && shared->writing != chip) { + /* give back the ownership */ + struct flchip *loaner = shared->writing; + spin_lock(loaner->mutex); + spin_unlock(&shared->lock); + spin_unlock(chip->mutex); + put_chip(map, loaner); + spin_lock(chip->mutex); + spin_unlock(loaner->mutex); + wake_up(&chip->wq); + return; + } + shared->erasing = NULL; + shared->writing = NULL; + } else if (shared->erasing == chip && shared->writing != chip) { + /* + * We own the ability to erase without the ability + * to write, which means the erase was suspended + * and some other partition is currently writing. + * Don't let the switch below mess things up since + * we don't have ownership to resume anything. + */ + spin_unlock(&shared->lock); + wake_up(&chip->wq); + return; + } + spin_unlock(&shared->lock); + } + + switch (chip->oldstate) { + case FL_ERASING: + chip->state = chip->oldstate; + map_write(map, CMD(LPDDR_RESUME), + map->pfow_base + PFOW_COMMAND_CODE); + map_write(map, CMD(LPDDR_START_EXECUTION), + map->pfow_base + PFOW_COMMAND_EXECUTE); + chip->oldstate = FL_READY; + chip->state = FL_ERASING; + break; + case FL_READY: + break; + default: + printk(KERN_ERR "%s: put_chip() called with oldstate %d!\n", + map->name, chip->oldstate); + } + wake_up(&chip->wq); +} + +int do_write_buffer(struct map_info *map, struct flchip *chip, + unsigned long adr, const struct kvec **pvec, + unsigned long *pvec_seek, int len) +{ + struct lpddr_private *lpddr = map->fldrv_priv; + map_word datum; + int ret, wbufsize, word_gap, words; + const struct kvec *vec; + unsigned long vec_seek; + unsigned long prog_buf_ofs; + + wbufsize = 1 << lpddr->qinfo->BufSizeShift; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_WRITING); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + /* Figure out the number of words to write */ + word_gap = (-adr & (map_bankwidth(map)-1)); + words = (len - word_gap + map_bankwidth(map) - 1) / map_bankwidth(map); + if (!word_gap) { + words--; + } else { + word_gap = map_bankwidth(map) - word_gap; + adr -= word_gap; + datum = map_word_ff(map); + } + /* Write data */ + /* Get the program buffer offset from PFOW register data first*/ + prog_buf_ofs = map->pfow_base + CMDVAL(map_read(map, + map->pfow_base + PFOW_PROGRAM_BUFFER_OFFSET)); + vec = *pvec; + vec_seek = *pvec_seek; + do { + int n = map_bankwidth(map) - word_gap; + + if (n > vec->iov_len - vec_seek) + n = vec->iov_len - vec_seek; + if (n > len) + n = len; + + if (!word_gap && (len < map_bankwidth(map))) + datum = map_word_ff(map); + + datum = map_word_load_partial(map, datum, + vec->iov_base + vec_seek, word_gap, n); + + len -= n; + word_gap += n; + if (!len || word_gap == map_bankwidth(map)) { + map_write(map, datum, prog_buf_ofs); + prog_buf_ofs += map_bankwidth(map); + word_gap = 0; + } + + vec_seek += n; + if (vec_seek == vec->iov_len) { + vec++; + vec_seek = 0; + } + } while (len); + *pvec = vec; + 
*pvec_seek = vec_seek; + + /* GO GO GO */ + send_pfow_command(map, LPDDR_BUFF_PROGRAM, adr, wbufsize, NULL); + chip->state = FL_WRITING; + ret = wait_for_ready(map, chip, (1<<lpddr->qinfo->ProgBufferTime)); + if (ret) { + printk(KERN_WARNING"%s Buffer program error: %d at %lx; \n", + map->name, ret, adr); + goto out; + } + + out: put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +int do_erase_oneblock(struct mtd_info *mtd, loff_t adr) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + struct flchip *chip = &lpddr->chips[chipnum]; + int ret; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_ERASING); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + send_pfow_command(map, LPDDR_BLOCK_ERASE, adr, 0, NULL); + chip->state = FL_ERASING; + ret = wait_for_ready(map, chip, (1<<lpddr->qinfo->BlockEraseTime)*1000); + if (ret) { + printk(KERN_WARNING"%s Erase block error %d at : %llx\n", + map->name, ret, adr); + goto out; + } + out: put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +static int lpddr_read(struct mtd_info *mtd, loff_t adr, size_t len, + size_t *retlen, u_char *buf) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + struct flchip *chip = &lpddr->chips[chipnum]; + int ret = 0; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_READY); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + + map_copy_from(map, buf, adr, len); + *retlen = len; + + put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +static int lpddr_point(struct mtd_info *mtd, loff_t adr, size_t len, + size_t *retlen, void **mtdbuf, resource_size_t *phys) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + unsigned long ofs, last_end = 0; + struct flchip *chip = &lpddr->chips[chipnum]; + int ret = 0; + + if (!map->virt || (adr + len > mtd->size)) + return -EINVAL; + + /* ofs: offset within the first chip that the first read should start */ + ofs = adr - (chipnum << lpddr->chipshift); + + *mtdbuf = (void *)map->virt + chip->start + ofs; + *retlen = 0; + + while (len) { + unsigned long thislen; + + if (chipnum >= lpddr->numchips) + break; + + /* We cannot point across chips that are virtually disjoint */ + if (!last_end) + last_end = chip->start; + else if (chip->start != last_end) + break; + + if ((len + ofs - 1) >> lpddr->chipshift) + thislen = (1<<lpddr->chipshift) - ofs; + else + thislen = len; + /* get the chip */ + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_POINT); + spin_unlock(chip->mutex); + if (ret) + break; + + chip->state = FL_POINT; + chip->ref_point_counter++; + *retlen += thislen; + len -= thislen; + + ofs = 0; + last_end += 1 << lpddr->chipshift; + chipnum++; + chip = &lpddr->chips[chipnum]; + } + return 0; +} + +static void lpddr_unpoint (struct mtd_info *mtd, loff_t adr, size_t len) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + unsigned long ofs; + + /* ofs: offset within the first chip that the first read should start */ + ofs = adr - (chipnum << lpddr->chipshift); + + while (len) { + unsigned long thislen; + struct flchip *chip; + + chip = &lpddr->chips[chipnum]; + if (chipnum >= lpddr->numchips) + break; + + if ((len + ofs - 1) >> lpddr->chipshift) + thislen = (1<<lpddr->chipshift) - ofs; 
+ else + thislen = len; + + spin_lock(chip->mutex); + if (chip->state == FL_POINT) { + chip->ref_point_counter--; + if (chip->ref_point_counter == 0) + chip->state = FL_READY; + } else + printk(KERN_WARNING "%s: Warning: unpoint called on non" + "pointed region\n", map->name); + + put_chip(map, chip); + spin_unlock(chip->mutex); + + len -= thislen; + ofs = 0; + chipnum++; + } +} + +static int lpddr_write_buffers(struct mtd_info *mtd, loff_t to, size_t len, + size_t *retlen, const u_char *buf) +{ + struct kvec vec; + + vec.iov_base = (void *) buf; + vec.iov_len = len; + + return lpddr_writev(mtd, &vec, 1, to, retlen); +} + + +static int lpddr_writev(struct mtd_info *mtd, const struct kvec *vecs, + unsigned long count, loff_t to, size_t *retlen) +{ + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int ret = 0; + int chipnum; + unsigned long ofs, vec_seek, i; + int wbufsize = 1 << lpddr->qinfo->BufSizeShift; + + size_t len = 0; + + for (i = 0; i < count; i++) + len += vecs[i].iov_len; + + *retlen = 0; + if (!len) + return 0; + + chipnum = to >> lpddr->chipshift; + + ofs = to; + vec_seek = 0; + + do { + /* We must not cross write block boundaries */ + int size = wbufsize - (ofs & (wbufsize-1)); + + if (size > len) + size = len; + + ret = do_write_buffer(map, &lpddr->chips[chipnum], + ofs, &vecs, &vec_seek, size); + if (ret) + return ret; + + ofs += size; + (*retlen) += size; + len -= size; + + /* Be nice and reschedule with the chip in a usable + * state for other processes */ + cond_resched(); + + } while (len); + + return 0; +} + +static int lpddr_erase(struct mtd_info *mtd, struct erase_info *instr) +{ + unsigned long ofs, len; + int ret; + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int size = 1 << lpddr->qinfo->UniformBlockSizeShift; + + ofs = instr->addr; + len = instr->len; + + if (ofs > mtd->size || (len + ofs) > mtd->size) + return -EINVAL; + + while (len > 0) { + ret = do_erase_oneblock(mtd, ofs); + if (ret) + return ret; + ofs += size; + len -= size; + } + instr->state = MTD_ERASE_DONE; + mtd_erase_callback(instr); + + return 0; +} + +#define DO_XXLOCK_LOCK 1 +#define DO_XXLOCK_UNLOCK 2 +int do_xxlock(struct mtd_info *mtd, loff_t adr, uint32_t len, int thunk) +{ + int ret = 0; + struct map_info *map = mtd->priv; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + struct flchip *chip = &lpddr->chips[chipnum]; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_LOCKING); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + + if (thunk == DO_XXLOCK_LOCK) { + send_pfow_command(map, LPDDR_LOCK_BLOCK, adr, adr + len, NULL); + chip->state = FL_LOCKING; + } else if (thunk == DO_XXLOCK_UNLOCK) { + send_pfow_command(map, LPDDR_UNLOCK_BLOCK, adr, adr + len, NULL); + chip->state = FL_UNLOCKING; + } else + BUG(); + + ret = wait_for_ready(map, chip, 1); + if (ret) { + printk(KERN_ERR "%s: block unlock error status %d \n", + map->name, ret); + goto out; + } +out: put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +static int lpddr_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +{ + return do_xxlock(mtd, ofs, len, DO_XXLOCK_LOCK); +} + +static int lpddr_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) +{ + return do_xxlock(mtd, ofs, len, DO_XXLOCK_UNLOCK); +} + +int word_program(struct map_info *map, loff_t adr, uint32_t curval) +{ + int ret; + struct lpddr_private *lpddr = map->fldrv_priv; + int chipnum = adr >> lpddr->chipshift; + struct 
flchip *chip = &lpddr->chips[chipnum]; + + spin_lock(chip->mutex); + ret = get_chip(map, chip, FL_WRITING); + if (ret) { + spin_unlock(chip->mutex); + return ret; + } + + send_pfow_command(map, LPDDR_WORD_PROGRAM, adr, 0x00, (map_word *)&curval); + + ret = wait_for_ready(map, chip, (1<<lpddr->qinfo->SingleWordProgTime)); + if (ret) { + printk(KERN_WARNING"%s word_program error at: %llx; val: %x\n", + map->name, adr, curval); + goto out; + } + +out: put_chip(map, chip); + spin_unlock(chip->mutex); + return ret; +} + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Alexey Korolev <akorolev@infradead.org>"); +MODULE_DESCRIPTION("MTD driver for LPDDR flash chips"); diff --git a/drivers/mtd/lpddr/qinfo_probe.c b/drivers/mtd/lpddr/qinfo_probe.c new file mode 100644 index 00000000000..79bf40f48b7 --- /dev/null +++ b/drivers/mtd/lpddr/qinfo_probe.c @@ -0,0 +1,255 @@ +/* + * Probing flash chips with QINFO records. + * (C) 2008 Korolev Alexey <akorolev@infradead.org> + * (C) 2008 Vasiliy Leonenko <vasiliy.leonenko@gmail.com> + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA + * 02110-1301, USA. + */ +#include <linux/module.h> +#include <linux/types.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/errno.h> +#include <linux/slab.h> +#include <linux/interrupt.h> + +#include <linux/mtd/xip.h> +#include <linux/mtd/map.h> +#include <linux/mtd/pfow.h> +#include <linux/mtd/qinfo.h> + +static int lpddr_chip_setup(struct map_info *map, struct lpddr_private *lpddr); +struct mtd_info *lpddr_probe(struct map_info *map); +static struct lpddr_private *lpddr_probe_chip(struct map_info *map); +static int lpddr_pfow_present(struct map_info *map, + struct lpddr_private *lpddr); + +static struct qinfo_query_info qinfo_array[] = { + /* General device info */ + {0, 0, "DevSizeShift", "Device size 2^n bytes"}, + {0, 3, "BufSizeShift", "Program buffer size 2^n bytes"}, + /* Erase block information */ + {1, 1, "TotalBlocksNum", "Total number of blocks"}, + {1, 2, "UniformBlockSizeShift", "Uniform block size 2^n bytes"}, + /* Partition information */ + {2, 1, "HWPartsNum", "Number of hardware partitions"}, + /* Optional features */ + {5, 1, "SuspEraseSupp", "Suspend erase supported"}, + /* Operation typical time */ + {10, 0, "SingleWordProgTime", "Single word program 2^n u-sec"}, + {10, 1, "ProgBufferTime", "Program buffer write 2^n u-sec"}, + {10, 2, "BlockEraseTime", "Block erase 2^n m-sec"}, + {10, 3, "FullChipEraseTime", "Full chip erase 2^n m-sec"}, +}; + +static long lpddr_get_qinforec_pos(struct map_info *map, char *id_str) +{ + int qinfo_lines = sizeof(qinfo_array)/sizeof(struct qinfo_query_info); + int i; + int bankwidth = map_bankwidth(map) * 8; + int major, minor; + + for (i = 0; i < qinfo_lines; i++) { + if (strcmp(id_str, qinfo_array[i].id_str) == 0) { + major = qinfo_array[i].major & ((1 << bankwidth) - 1); + minor = qinfo_array[i].minor & ((1 
<< bankwidth) - 1); + return minor | (major << bankwidth); + } + } + printk(KERN_ERR"%s qinfo id string is wrong! \n", map->name); + BUG(); + return -1; +} + +static uint16_t lpddr_info_query(struct map_info *map, char *id_str) +{ + unsigned int dsr, val; + int bits_per_chip = map_bankwidth(map) * 8; + unsigned long adr = lpddr_get_qinforec_pos(map, id_str); + int attempts = 20; + + /* Write a request for the PFOW record */ + map_write(map, CMD(LPDDR_INFO_QUERY), + map->pfow_base + PFOW_COMMAND_CODE); + map_write(map, CMD(adr & ((1 << bits_per_chip) - 1)), + map->pfow_base + PFOW_COMMAND_ADDRESS_L); + map_write(map, CMD(adr >> bits_per_chip), + map->pfow_base + PFOW_COMMAND_ADDRESS_H); + map_write(map, CMD(LPDDR_START_EXECUTION), + map->pfow_base + PFOW_COMMAND_EXECUTE); + + while ((attempts--) > 0) { + dsr = CMDVAL(map_read(map, map->pfow_base + PFOW_DSR)); + if (dsr & DSR_READY_STATUS) + break; + udelay(10); + } + + val = CMDVAL(map_read(map, map->pfow_base + PFOW_COMMAND_DATA)); + return val; +} + +static int lpddr_pfow_present(struct map_info *map, struct lpddr_private *lpddr) +{ + map_word pfow_val[4]; + + /* Check identification string */ + pfow_val[0] = map_read(map, map->pfow_base + PFOW_QUERY_STRING_P); + pfow_val[1] = map_read(map, map->pfow_base + PFOW_QUERY_STRING_F); + pfow_val[2] = map_read(map, map->pfow_base + PFOW_QUERY_STRING_O); + pfow_val[3] = map_read(map, map->pfow_base + PFOW_QUERY_STRING_W); + + if (!map_word_equal(map, CMD('P'), pfow_val[0])) + goto out; + + if (!map_word_equal(map, CMD('F'), pfow_val[1])) + goto out; + + if (!map_word_equal(map, CMD('O'), pfow_val[2])) + goto out; + + if (!map_word_equal(map, CMD('W'), pfow_val[3])) + goto out; + + return 1; /* "PFOW" is found */ +out: + printk(KERN_WARNING"%s: PFOW string at 0x%lx is not found \n", + map->name, map->pfow_base); + return 0; +} + +static int lpddr_chip_setup(struct map_info *map, struct lpddr_private *lpddr) +{ + + lpddr->qinfo = kmalloc(sizeof(struct qinfo_chip), GFP_KERNEL); + if (!lpddr->qinfo) { + printk(KERN_WARNING "%s: no memory for LPDDR qinfo structure\n", + map->name); + return 0; + } + memset(lpddr->qinfo, 0, sizeof(struct qinfo_chip)); + + /* Get the ManuID */ + lpddr->ManufactId = CMDVAL(map_read(map, map->pfow_base + PFOW_MANUFACTURER_ID)); + /* Get the DeviceID */ + lpddr->DevId = CMDVAL(map_read(map, map->pfow_base + PFOW_DEVICE_ID)); + /* read parameters from chip qinfo table */ + lpddr->qinfo->DevSizeShift = lpddr_info_query(map, "DevSizeShift"); + lpddr->qinfo->TotalBlocksNum = lpddr_info_query(map, "TotalBlocksNum"); + lpddr->qinfo->BufSizeShift = lpddr_info_query(map, "BufSizeShift"); + lpddr->qinfo->HWPartsNum = lpddr_info_query(map, "HWPartsNum"); + lpddr->qinfo->UniformBlockSizeShift = + lpddr_info_query(map, "UniformBlockSizeShift"); + lpddr->qinfo->SuspEraseSupp = lpddr_info_query(map, "SuspEraseSupp"); + lpddr->qinfo->SingleWordProgTime = + lpddr_info_query(map, "SingleWordProgTime"); + lpddr->qinfo->ProgBufferTime = lpddr_info_query(map, "ProgBufferTime"); + lpddr->qinfo->BlockEraseTime = lpddr_info_query(map, "BlockEraseTime"); + return 1; +} +static struct lpddr_private *lpddr_probe_chip(struct map_info *map) +{ + struct lpddr_private lpddr; + struct lpddr_private *retlpddr; + int numvirtchips; + + + if ((map->pfow_base + 0x1000) >= map->size) { + printk(KERN_NOTICE"%s Probe at base (0x%08lx) past the end of" + "the map(0x%08lx)\n", map->name, + (unsigned long)map->pfow_base, map->size - 1); + return NULL; + } + memset(&lpddr, 0, sizeof(struct lpddr_private)); + if 
(!lpddr_pfow_present(map, &lpddr)) + return NULL; + + if (!lpddr_chip_setup(map, &lpddr)) + return NULL; + + /* Ok so we found a chip */ + lpddr.chipshift = lpddr.qinfo->DevSizeShift; + lpddr.numchips = 1; + + numvirtchips = lpddr.numchips * lpddr.qinfo->HWPartsNum; + retlpddr = kmalloc(sizeof(struct lpddr_private) + + numvirtchips * sizeof(struct flchip), GFP_KERNEL); + if (!retlpddr) + return NULL; + + memset(retlpddr, 0, sizeof(struct lpddr_private) + + numvirtchips * sizeof(struct flchip)); + memcpy(retlpddr, &lpddr, sizeof(struct lpddr_private)); + + retlpddr->numchips = numvirtchips; + retlpddr->chipshift = retlpddr->qinfo->DevSizeShift - + __ffs(retlpddr->qinfo->HWPartsNum); + + return retlpddr; +} + +struct mtd_info *lpddr_probe(struct map_info *map) +{ + struct mtd_info *mtd = NULL; + struct lpddr_private *lpddr; + + /* First probe the map to see if we havecan open PFOW here */ + lpddr = lpddr_probe_chip(map); + if (!lpddr) + return NULL; + + map->fldrv_priv = lpddr; + mtd = lpddr_cmdset(map); + if (mtd) { + if (mtd->size > map->size) { + printk(KERN_WARNING "Reducing visibility of %ldKiB chip" + "to %ldKiB\n", (unsigned long)mtd->size >> 10, + (unsigned long)map->size >> 10); + mtd->size = map->size; + } + return mtd; + } + + kfree(lpddr->qinfo); + kfree(lpddr); + map->fldrv_priv = NULL; + return NULL; +} + +static struct mtd_chip_driver lpddr_chipdrv = { + .probe = lpddr_probe, + .name = "qinfo_probe", + .module = THIS_MODULE +}; + +static int __init lpddr_probe_init(void) +{ + register_mtd_chip_driver(&lpddr_chipdrv); + return 0; +} + +static void __exit lpddr_probe_exit(void) +{ + unregister_mtd_chip_driver(&lpddr_chipdrv); +} + +module_init(lpddr_probe_init); +module_exit(lpddr_probe_exit); + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Vasiliy Leonenko <vasiliy.leonenko@gmail.com>"); +MODULE_DESCRIPTION("Driver to probe qinfo flash chips"); + diff --git a/drivers/mtd/maps/Kconfig b/drivers/mtd/maps/Kconfig index 5ea16936216..0225cbbf22d 100644 --- a/drivers/mtd/maps/Kconfig +++ b/drivers/mtd/maps/Kconfig @@ -10,8 +10,8 @@ config MTD_COMPLEX_MAPPINGS paged mappings of flash chips. config MTD_PHYSMAP - tristate "CFI Flash device in physical memory map" - depends on MTD_CFI || MTD_JEDECPROBE || MTD_ROM + tristate "Flash device in physical memory map" + depends on MTD_CFI || MTD_JEDECPROBE || MTD_ROM || MTD_LPDDR help This provides a 'mapping' driver which allows the NOR Flash and ROM driver code to communicate with chips which are mapped @@ -23,9 +23,20 @@ config MTD_PHYSMAP To compile this driver as a module, choose M here: the module will be called physmap. +config MTD_PHYSMAP_COMPAT + bool "Physmap compat support" + depends on MTD_PHYSMAP + default n + help + Setup a simple mapping via the Kconfig options. Normally the + physmap configuration options are done via your board's + resource file. + + If unsure, say N here. 
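The MTD_PHYSMAP_COMPAT help text above assumes the flash window is normally described in the board's resource file rather than through Kconfig. As a rough illustration of what such a board file looks like (a sketch only: the base address, window length, bank width and partition layout below are invented, and it relies on the physmap_flash_data/platform_device fields as used elsewhere in this patch, including the new pfow_base member for LPDDR/qinfo windows):

/*
 * Hypothetical board code registering a "physmap-flash" window from
 * platform resources instead of the MTD_PHYSMAP_* Kconfig options.
 * All addresses, sizes and partition names here are made up.
 */
#include <linux/kernel.h>
#include <linux/platform_device.h>
#include <linux/mtd/partitions.h>
#include <linux/mtd/physmap.h>

static struct mtd_partition board_flash_parts[] = {
	{ .name = "bootloader", .offset = 0,          .size = 0x00040000 },
	{ .name = "rootfs",     .offset = 0x00040000, .size = MTDPART_SIZ_FULL },
};

static struct physmap_flash_data board_flash_data = {
	.width    = 2,				/* bank width in octets */
	.parts    = board_flash_parts,
	.nr_parts = ARRAY_SIZE(board_flash_parts),
	/* .pfow_base = ...;  only needed for LPDDR chips probed via qinfo_probe */
};

static struct resource board_flash_resource = {
	.start = 0x08000000,			/* physical window start */
	.end   = 0x08000000 + 0x00800000 - 1,	/* 8 MiB window */
	.flags = IORESOURCE_MEM,
};

static struct platform_device board_flash_device = {
	.name          = "physmap-flash",
	.id            = 0,
	.dev           = { .platform_data = &board_flash_data },
	.resource      = &board_flash_resource,
	.num_resources = 1,
};

/* registered from the board init hook, e.g.
 * platform_device_register(&board_flash_device);
 */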
+ config MTD_PHYSMAP_START hex "Physical start address of flash mapping" - depends on MTD_PHYSMAP + depends on MTD_PHYSMAP_COMPAT default "0x8000000" help This is the physical memory location at which the flash chips @@ -37,7 +48,7 @@ config MTD_PHYSMAP_START config MTD_PHYSMAP_LEN hex "Physical length of flash mapping" - depends on MTD_PHYSMAP + depends on MTD_PHYSMAP_COMPAT default "0" help This is the total length of the mapping of the flash chips on @@ -51,7 +62,7 @@ config MTD_PHYSMAP_LEN config MTD_PHYSMAP_BANKWIDTH int "Bank width in octets" - depends on MTD_PHYSMAP + depends on MTD_PHYSMAP_COMPAT default "2" help This is the total width of the data bus of the flash devices diff --git a/drivers/mtd/maps/alchemy-flash.c b/drivers/mtd/maps/alchemy-flash.c index 82811bcb043..845ad4f2a54 100644 --- a/drivers/mtd/maps/alchemy-flash.c +++ b/drivers/mtd/maps/alchemy-flash.c @@ -111,7 +111,7 @@ static struct mtd_partition alchemy_partitions[] = { static struct mtd_info *mymtd; -int __init alchemy_mtd_init(void) +static int __init alchemy_mtd_init(void) { struct mtd_partition *parts; int nb_parts = 0; diff --git a/drivers/mtd/maps/amd76xrom.c b/drivers/mtd/maps/amd76xrom.c index d1eec7d3243..237733d094c 100644 --- a/drivers/mtd/maps/amd76xrom.c +++ b/drivers/mtd/maps/amd76xrom.c @@ -232,8 +232,8 @@ static int __devinit amd76xrom_init_one (struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/cfi_flagadm.c b/drivers/mtd/maps/cfi_flagadm.c index 0ecc3f6d735..b4ed8161191 100644 --- a/drivers/mtd/maps/cfi_flagadm.c +++ b/drivers/mtd/maps/cfi_flagadm.c @@ -88,7 +88,7 @@ struct mtd_partition flagadm_parts[] = { static struct mtd_info *mymtd; -int __init init_flagadm(void) +static int __init init_flagadm(void) { printk(KERN_NOTICE "FlagaDM flash device: %x at %x\n", FLASH_SIZE, FLASH_PHYS_ADDR); diff --git a/drivers/mtd/maps/ck804xrom.c b/drivers/mtd/maps/ck804xrom.c index 1a6feb4474d..5f7a245ed13 100644 --- a/drivers/mtd/maps/ck804xrom.c +++ b/drivers/mtd/maps/ck804xrom.c @@ -263,8 +263,8 @@ static int __devinit ck804xrom_init_one (struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). 
fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/dbox2-flash.c b/drivers/mtd/maps/dbox2-flash.c index e115667bf1d..cfacfa6f45d 100644 --- a/drivers/mtd/maps/dbox2-flash.c +++ b/drivers/mtd/maps/dbox2-flash.c @@ -69,7 +69,7 @@ struct map_info dbox2_flash_map = { .phys = WINDOW_ADDR, }; -int __init init_dbox2_flash(void) +static int __init init_dbox2_flash(void) { printk(KERN_NOTICE "D-Box 2 flash driver (size->0x%X mem->0x%X)\n", WINDOW_SIZE, WINDOW_ADDR); dbox2_flash_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE); diff --git a/drivers/mtd/maps/edb7312.c b/drivers/mtd/maps/edb7312.c index 9433738c166..be9e90b4458 100644 --- a/drivers/mtd/maps/edb7312.c +++ b/drivers/mtd/maps/edb7312.c @@ -71,7 +71,7 @@ static const char *probes[] = { "RedBoot", "cmdlinepart", NULL }; static int mtd_parts_nb = 0; static struct mtd_partition *mtd_parts = 0; -int __init init_edb7312nor(void) +static int __init init_edb7312nor(void) { static const char *rom_probe_types[] = PROBETYPES; const char **type; diff --git a/drivers/mtd/maps/esb2rom.c b/drivers/mtd/maps/esb2rom.c index bbbcdd4c8d1..11a2f57df9c 100644 --- a/drivers/mtd/maps/esb2rom.c +++ b/drivers/mtd/maps/esb2rom.c @@ -324,8 +324,8 @@ static int __devinit esb2rom_init_one(struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/fortunet.c b/drivers/mtd/maps/fortunet.c index a8e3fde4cbd..1e43124d498 100644 --- a/drivers/mtd/maps/fortunet.c +++ b/drivers/mtd/maps/fortunet.c @@ -181,7 +181,7 @@ __setup("MTD_Partition=", MTD_New_Partition); /* Backwards-spelling-compatibility */ __setup("MTD_Partion=", MTD_New_Partition); -int __init init_fortunet(void) +static int __init init_fortunet(void) { int ix,iy; for(iy=ix=0;ix<MAX_NUM_REGIONS;ix++) diff --git a/drivers/mtd/maps/h720x-flash.c b/drivers/mtd/maps/h720x-flash.c index 3b959fad1c4..72c724fa8c2 100644 --- a/drivers/mtd/maps/h720x-flash.c +++ b/drivers/mtd/maps/h720x-flash.c @@ -65,7 +65,7 @@ static const char *probes[] = { "cmdlinepart", NULL }; /* * Initialize FLASH support */ -int __init h720x_mtd_init(void) +static int __init h720x_mtd_init(void) { char *part_type = NULL; diff --git a/drivers/mtd/maps/ichxrom.c b/drivers/mtd/maps/ichxrom.c index aeb6c916e23..c32bc28920b 100644 --- a/drivers/mtd/maps/ichxrom.c +++ b/drivers/mtd/maps/ichxrom.c @@ -258,8 +258,8 @@ static int __devinit ichxrom_init_one (struct pci_dev *pdev, /* Trim the size if we are larger than the map */ if (map->mtd->size > map->map.size) { printk(KERN_WARNING MOD_NAME - " rom(%u) larger than window(%lu). fixing...\n", - map->mtd->size, map->map.size); + " rom(%llu) larger than window(%lu). 
fixing...\n", + (unsigned long long)map->mtd->size, map->map.size); map->mtd->size = map->map.size; } if (window->rsrc.parent) { diff --git a/drivers/mtd/maps/impa7.c b/drivers/mtd/maps/impa7.c index 2682ab51a36..998a27da97f 100644 --- a/drivers/mtd/maps/impa7.c +++ b/drivers/mtd/maps/impa7.c @@ -70,7 +70,7 @@ static struct mtd_partition *mtd_parts[NUM_FLASHBANKS]; static const char *probes[] = { "cmdlinepart", NULL }; -int __init init_impa7(void) +static int __init init_impa7(void) { static const char *rom_probe_types[] = PROBETYPES; const char **type; diff --git a/drivers/mtd/maps/ipaq-flash.c b/drivers/mtd/maps/ipaq-flash.c index ed58f6a77bd..748c85f635f 100644 --- a/drivers/mtd/maps/ipaq-flash.c +++ b/drivers/mtd/maps/ipaq-flash.c @@ -202,7 +202,7 @@ static const char *part_probes[] = { "cmdlinepart", "RedBoot", NULL }; static int __init h1900_special_case(void); -int __init ipaq_mtd_init(void) +static int __init ipaq_mtd_init(void) { struct mtd_partition *parts = NULL; int nb_parts = 0; diff --git a/drivers/mtd/maps/mbx860.c b/drivers/mtd/maps/mbx860.c index 706f67394b0..0eb5a7c8538 100644 --- a/drivers/mtd/maps/mbx860.c +++ b/drivers/mtd/maps/mbx860.c @@ -55,7 +55,7 @@ struct map_info mbx_map = { .bankwidth = 4, }; -int __init init_mbx(void) +static int __init init_mbx(void) { printk(KERN_NOTICE "Motorola MBX flash device: 0x%x at 0x%x\n", WINDOW_SIZE*4, WINDOW_ADDR); mbx_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE * 4); diff --git a/drivers/mtd/maps/nettel.c b/drivers/mtd/maps/nettel.c index 965e6c6d6ab..a97133eb9d7 100644 --- a/drivers/mtd/maps/nettel.c +++ b/drivers/mtd/maps/nettel.c @@ -226,7 +226,7 @@ static int __init nettel_init(void) if ((amd_mtd = do_map_probe("jedec_probe", &nettel_amd_map))) { printk(KERN_NOTICE "SNAPGEAR: AMD flash device size = %dK\n", - amd_mtd->size>>10); + (int)(amd_mtd->size>>10)); amd_mtd->owner = THIS_MODULE; @@ -357,13 +357,12 @@ static int __init nettel_init(void) *intel1par = 0; } - printk(KERN_NOTICE "SNAPGEAR: Intel flash device size = %dK\n", - (intel_mtd->size >> 10)); + printk(KERN_NOTICE "SNAPGEAR: Intel flash device size = %lldKiB\n", + (unsigned long long)(intel_mtd->size >> 10)); intel_mtd->owner = THIS_MODULE; - num_intel_partitions = sizeof(nettel_intel_partitions) / - sizeof(nettel_intel_partitions[0]); + num_intel_partitions = ARRAY_SIZE(nettel_intel_partitions); if (intelboot) { /* diff --git a/drivers/mtd/maps/octagon-5066.c b/drivers/mtd/maps/octagon-5066.c index 43e04c1d22a..2b2e4509321 100644 --- a/drivers/mtd/maps/octagon-5066.c +++ b/drivers/mtd/maps/octagon-5066.c @@ -184,7 +184,7 @@ void cleanup_oct5066(void) release_region(PAGE_IO, 1); } -int __init init_oct5066(void) +static int __init init_oct5066(void) { int i; int ret = 0; diff --git a/drivers/mtd/maps/physmap.c b/drivers/mtd/maps/physmap.c index 1db16e549e3..87743661d48 100644 --- a/drivers/mtd/maps/physmap.c +++ b/drivers/mtd/maps/physmap.c @@ -29,7 +29,6 @@ struct physmap_flash_info { struct map_info map[MAX_RESOURCES]; #ifdef CONFIG_MTD_PARTITIONS int nr_parts; - struct mtd_partition *parts; #endif }; @@ -56,14 +55,10 @@ static int physmap_flash_remove(struct platform_device *dev) for (i = 0; i < MAX_RESOURCES; i++) { if (info->mtd[i] != NULL) { #ifdef CONFIG_MTD_PARTITIONS - if (info->nr_parts) { + if (info->nr_parts || physmap_data->nr_parts) del_mtd_partitions(info->mtd[i]); - kfree(info->parts); - } else if (physmap_data->nr_parts) { - del_mtd_partitions(info->mtd[i]); - } else { + else del_mtd_device(info->mtd[i]); - } #else del_mtd_device(info->mtd[i]); 
#endif @@ -73,7 +68,12 @@ static int physmap_flash_remove(struct platform_device *dev) return 0; } -static const char *rom_probe_types[] = { "cfi_probe", "jedec_probe", "map_rom", NULL }; +static const char *rom_probe_types[] = { + "cfi_probe", + "jedec_probe", + "qinfo_probe", + "map_rom", + NULL }; #ifdef CONFIG_MTD_PARTITIONS static const char *part_probe_types[] = { "cmdlinepart", "RedBoot", NULL }; #endif @@ -86,6 +86,9 @@ static int physmap_flash_probe(struct platform_device *dev) int err = 0; int i; int devices_found = 0; +#ifdef CONFIG_MTD_PARTITIONS + struct mtd_partition *parts; +#endif physmap_data = dev->dev.platform_data; if (physmap_data == NULL) @@ -119,6 +122,7 @@ static int physmap_flash_probe(struct platform_device *dev) info->map[i].size = dev->resource[i].end - dev->resource[i].start + 1; info->map[i].bankwidth = physmap_data->width; info->map[i].set_vpp = physmap_data->set_vpp; + info->map[i].pfow_base = physmap_data->pfow_base; info->map[i].virt = devm_ioremap(&dev->dev, info->map[i].phys, info->map[i].size); @@ -163,9 +167,10 @@ static int physmap_flash_probe(struct platform_device *dev) goto err_out; #ifdef CONFIG_MTD_PARTITIONS - err = parse_mtd_partitions(info->cmtd, part_probe_types, &info->parts, 0); + err = parse_mtd_partitions(info->cmtd, part_probe_types, &parts, 0); if (err > 0) { - add_mtd_partitions(info->cmtd, info->parts, err); + add_mtd_partitions(info->cmtd, parts, err); + kfree(parts); return 0; } @@ -251,14 +256,7 @@ static struct platform_driver physmap_flash_driver = { }; -#ifdef CONFIG_MTD_PHYSMAP_LEN -#if CONFIG_MTD_PHYSMAP_LEN != 0 -#warning using PHYSMAP compat code -#define PHYSMAP_COMPAT -#endif -#endif - -#ifdef PHYSMAP_COMPAT +#ifdef CONFIG_MTD_PHYSMAP_COMPAT static struct physmap_flash_data physmap_flash_data = { .width = CONFIG_MTD_PHYSMAP_BANKWIDTH, }; @@ -302,7 +300,7 @@ static int __init physmap_init(void) int err; err = platform_driver_register(&physmap_flash_driver); -#ifdef PHYSMAP_COMPAT +#ifdef CONFIG_MTD_PHYSMAP_COMPAT if (err == 0) platform_device_register(&physmap_flash); #endif @@ -312,7 +310,7 @@ static int __init physmap_init(void) static void __exit physmap_exit(void) { -#ifdef PHYSMAP_COMPAT +#ifdef CONFIG_MTD_PHYSMAP_COMPAT platform_device_unregister(&physmap_flash); #endif platform_driver_unregister(&physmap_flash_driver); @@ -326,8 +324,7 @@ MODULE_AUTHOR("David Woodhouse <dwmw2@infradead.org>"); MODULE_DESCRIPTION("Generic configurable MTD map driver"); /* legacy platform drivers can't hotplug or coldplg */ -#ifndef PHYSMAP_COMPAT +#ifndef CONFIG_MTD_PHYSMAP_COMPAT /* work with hotplug and coldplug */ MODULE_ALIAS("platform:physmap-flash"); #endif - diff --git a/drivers/mtd/maps/pmcmsp-flash.c b/drivers/mtd/maps/pmcmsp-flash.c index f43ba2815cb..4768bd5459d 100644 --- a/drivers/mtd/maps/pmcmsp-flash.c +++ b/drivers/mtd/maps/pmcmsp-flash.c @@ -48,7 +48,7 @@ static int fcnt; #define DEBUG_MARKER printk(KERN_NOTICE "%s[%d]\n", __func__, __LINE__) -int __init init_msp_flash(void) +static int __init init_msp_flash(void) { int i, j; int offset, coff; diff --git a/drivers/mtd/maps/redwood.c b/drivers/mtd/maps/redwood.c index de002eb1a7f..933c0b63b01 100644 --- a/drivers/mtd/maps/redwood.c +++ b/drivers/mtd/maps/redwood.c @@ -122,7 +122,7 @@ struct map_info redwood_flash_map = { static struct mtd_info *redwood_mtd; -int __init init_redwood_flash(void) +static int __init init_redwood_flash(void) { int err; diff --git a/drivers/mtd/maps/rpxlite.c b/drivers/mtd/maps/rpxlite.c index 14d90edb443..3e3ef53d4fd 100644 --- 
a/drivers/mtd/maps/rpxlite.c +++ b/drivers/mtd/maps/rpxlite.c @@ -23,7 +23,7 @@ static struct map_info rpxlite_map = { .phys = WINDOW_ADDR, }; -int __init init_rpxlite(void) +static int __init init_rpxlite(void) { printk(KERN_NOTICE "RPX Lite or CLLF flash device: %x at %x\n", WINDOW_SIZE*4, WINDOW_ADDR); rpxlite_map.virt = ioremap(WINDOW_ADDR, WINDOW_SIZE * 4); diff --git a/drivers/mtd/maps/sbc8240.c b/drivers/mtd/maps/sbc8240.c index 6e1e99cd2b5..d5374cdcb16 100644 --- a/drivers/mtd/maps/sbc8240.c +++ b/drivers/mtd/maps/sbc8240.c @@ -136,7 +136,7 @@ static struct mtd_part_def sbc8240_part_banks[NUM_FLASH_BANKS]; #endif /* CONFIG_MTD_PARTITIONS */ -int __init init_sbc8240_mtd (void) +static int __init init_sbc8240_mtd (void) { static struct _cjs { u_long addr; diff --git a/drivers/mtd/maps/scb2_flash.c b/drivers/mtd/maps/scb2_flash.c index 21169e6d646..7e329f09a54 100644 --- a/drivers/mtd/maps/scb2_flash.c +++ b/drivers/mtd/maps/scb2_flash.c @@ -118,7 +118,8 @@ scb2_fixup_mtd(struct mtd_info *mtd) struct mtd_erase_region_info *region = &mtd->eraseregions[i]; if (region->numblocks * region->erasesize > mtd->size) { - region->numblocks = (mtd->size / region->erasesize); + region->numblocks = ((unsigned long)mtd->size / + region->erasesize); done = 1; } else { region->numblocks = 0; @@ -187,8 +188,9 @@ scb2_flash_probe(struct pci_dev *dev, const struct pci_device_id *ent) return -ENODEV; } - printk(KERN_NOTICE MODNAME ": chip size 0x%x at offset 0x%x\n", - scb2_mtd->size, SCB2_WINDOW - scb2_mtd->size); + printk(KERN_NOTICE MODNAME ": chip size 0x%llx at offset 0x%llx\n", + (unsigned long long)scb2_mtd->size, + (unsigned long long)(SCB2_WINDOW - scb2_mtd->size)); add_mtd_device(scb2_mtd); diff --git a/drivers/mtd/maps/sharpsl-flash.c b/drivers/mtd/maps/sharpsl-flash.c index 026eab02818..b392f096c70 100644 --- a/drivers/mtd/maps/sharpsl-flash.c +++ b/drivers/mtd/maps/sharpsl-flash.c @@ -47,7 +47,7 @@ static struct mtd_partition sharpsl_partitions[1] = { } }; -int __init init_sharpsl(void) +static int __init init_sharpsl(void) { struct mtd_partition *parts; int nb_parts = 0; diff --git a/drivers/mtd/maps/tqm8xxl.c b/drivers/mtd/maps/tqm8xxl.c index a5d3d8531fa..60146984f4b 100644 --- a/drivers/mtd/maps/tqm8xxl.c +++ b/drivers/mtd/maps/tqm8xxl.c @@ -109,7 +109,7 @@ static struct mtd_partition tqm8xxl_fs_partitions[] = { }; #endif -int __init init_tqm_mtd(void) +static int __init init_tqm_mtd(void) { int idx = 0, ret = 0; unsigned long flash_addr, flash_size, mtd_size = 0; diff --git a/drivers/mtd/maps/uclinux.c b/drivers/mtd/maps/uclinux.c index 0dc645f8152..81756e39771 100644 --- a/drivers/mtd/maps/uclinux.c +++ b/drivers/mtd/maps/uclinux.c @@ -51,7 +51,7 @@ int uclinux_point(struct mtd_info *mtd, loff_t from, size_t len, /****************************************************************************/ -int __init uclinux_mtd_init(void) +static int __init uclinux_mtd_init(void) { struct mtd_info *mtd; struct map_info *mapp; @@ -94,7 +94,7 @@ int __init uclinux_mtd_init(void) /****************************************************************************/ -void __exit uclinux_mtd_cleanup(void) +static void __exit uclinux_mtd_cleanup(void) { if (uclinux_ram_mtdinfo) { del_mtd_partitions(uclinux_ram_mtdinfo); diff --git a/drivers/mtd/maps/vmax301.c b/drivers/mtd/maps/vmax301.c index 5a0c9a353b0..6d452dcdfe3 100644 --- a/drivers/mtd/maps/vmax301.c +++ b/drivers/mtd/maps/vmax301.c @@ -146,7 +146,7 @@ static void __exit cleanup_vmax301(void) iounmap((void *)vmax_map[0].map_priv_1 - WINDOW_START); } -int 
__init init_vmax301(void) +static int __init init_vmax301(void) { int i; unsigned long iomapadr; diff --git a/drivers/mtd/maps/wr_sbc82xx_flash.c b/drivers/mtd/maps/wr_sbc82xx_flash.c index 413b0cf9bbd..933a2b6598b 100644 --- a/drivers/mtd/maps/wr_sbc82xx_flash.c +++ b/drivers/mtd/maps/wr_sbc82xx_flash.c @@ -74,7 +74,7 @@ do { \ } \ } while (0); -int __init init_sbc82xx_flash(void) +static int __init init_sbc82xx_flash(void) { volatile memctl_cpm2_t *mc = &cpm2_immr->im_memctl; int bigflash; diff --git a/drivers/mtd/mtdchar.c b/drivers/mtd/mtdchar.c index bcffeda2df3..e9ec59e9a56 100644 --- a/drivers/mtd/mtdchar.c +++ b/drivers/mtd/mtdchar.c @@ -450,16 +450,20 @@ static int mtd_ioctl(struct inode *inode, struct file *file, if (!erase) ret = -ENOMEM; else { + struct erase_info_user einfo; + wait_queue_head_t waitq; DECLARE_WAITQUEUE(wait, current); init_waitqueue_head(&waitq); - if (copy_from_user(&erase->addr, argp, + if (copy_from_user(&einfo, argp, sizeof(struct erase_info_user))) { kfree(erase); return -EFAULT; } + erase->addr = einfo.start; + erase->len = einfo.length; erase->mtd = mtd; erase->callback = mtdchar_erase_callback; erase->priv = (unsigned long)&waitq; diff --git a/drivers/mtd/mtdconcat.c b/drivers/mtd/mtdconcat.c index 1a05cf37851..3dbb1b38db6 100644 --- a/drivers/mtd/mtdconcat.c +++ b/drivers/mtd/mtdconcat.c @@ -197,7 +197,7 @@ concat_writev(struct mtd_info *mtd, const struct kvec *vecs, continue; } - size = min(total_len, (size_t)(subdev->size - to)); + size = min_t(uint64_t, total_len, subdev->size - to); wsize = size; /* store for future use */ entry_high = entry_low; @@ -385,7 +385,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr) struct mtd_concat *concat = CONCAT(mtd); struct mtd_info *subdev; int i, err; - u_int32_t length, offset = 0; + uint64_t length, offset = 0; struct erase_info *erase; if (!(mtd->flags & MTD_WRITEABLE)) @@ -518,7 +518,7 @@ static int concat_erase(struct mtd_info *mtd, struct erase_info *instr) return 0; } -static int concat_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int concat_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_concat *concat = CONCAT(mtd); int i, err = -EINVAL; @@ -528,7 +528,7 @@ static int concat_lock(struct mtd_info *mtd, loff_t ofs, size_t len) for (i = 0; i < concat->num_subdev; i++) { struct mtd_info *subdev = concat->subdev[i]; - size_t size; + uint64_t size; if (ofs >= subdev->size) { size = 0; @@ -556,7 +556,7 @@ static int concat_lock(struct mtd_info *mtd, loff_t ofs, size_t len) return err; } -static int concat_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int concat_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_concat *concat = CONCAT(mtd); int i, err = 0; @@ -566,7 +566,7 @@ static int concat_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) for (i = 0; i < concat->num_subdev; i++) { struct mtd_info *subdev = concat->subdev[i]; - size_t size; + uint64_t size; if (ofs >= subdev->size) { size = 0; @@ -696,7 +696,7 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c int i; size_t size; struct mtd_concat *concat; - u_int32_t max_erasesize, curr_erasesize; + uint32_t max_erasesize, curr_erasesize; int num_erase_region; printk(KERN_NOTICE "Concatenating MTD devices:\n"); @@ -842,12 +842,14 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c concat->mtd.erasesize = curr_erasesize; concat->mtd.numeraseregions = 0; } else { + uint64_t tmp64; + /* * erase block 
size varies across the subdevices: allocate * space to store the data describing the variable erase regions */ struct mtd_erase_region_info *erase_region_p; - u_int32_t begin, position; + uint64_t begin, position; concat->mtd.erasesize = max_erasesize; concat->mtd.numeraseregions = num_erase_region; @@ -879,8 +881,9 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c erase_region_p->offset = begin; erase_region_p->erasesize = curr_erasesize; - erase_region_p->numblocks = - (position - begin) / curr_erasesize; + tmp64 = position - begin; + do_div(tmp64, curr_erasesize); + erase_region_p->numblocks = tmp64; begin = position; curr_erasesize = subdev[i]->erasesize; @@ -897,9 +900,9 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c erase_region_p->offset = begin; erase_region_p->erasesize = curr_erasesize; - erase_region_p->numblocks = - (position - - begin) / curr_erasesize; + tmp64 = position - begin; + do_div(tmp64, curr_erasesize); + erase_region_p->numblocks = tmp64; begin = position; curr_erasesize = @@ -909,14 +912,16 @@ struct mtd_info *mtd_concat_create(struct mtd_info *subdev[], /* subdevices to c } position += subdev[i]->eraseregions[j]. - numblocks * curr_erasesize; + numblocks * (uint64_t)curr_erasesize; } } } /* Now write the final entry */ erase_region_p->offset = begin; erase_region_p->erasesize = curr_erasesize; - erase_region_p->numblocks = (position - begin) / curr_erasesize; + tmp64 = position - begin; + do_div(tmp64, curr_erasesize); + erase_region_p->numblocks = tmp64; } return &concat->mtd; diff --git a/drivers/mtd/mtdcore.c b/drivers/mtd/mtdcore.c index a9d24694982..76fe0a1e7a5 100644 --- a/drivers/mtd/mtdcore.c +++ b/drivers/mtd/mtdcore.c @@ -57,6 +57,19 @@ int add_mtd_device(struct mtd_info *mtd) mtd->index = i; mtd->usecount = 0; + if (is_power_of_2(mtd->erasesize)) + mtd->erasesize_shift = ffs(mtd->erasesize) - 1; + else + mtd->erasesize_shift = 0; + + if (is_power_of_2(mtd->writesize)) + mtd->writesize_shift = ffs(mtd->writesize) - 1; + else + mtd->writesize_shift = 0; + + mtd->erasesize_mask = (1 << mtd->erasesize_shift) - 1; + mtd->writesize_mask = (1 << mtd->writesize_shift) - 1; + /* Some chips always power up locked. 
Unlock them now */ if ((mtd->flags & MTD_WRITEABLE) && (mtd->flags & MTD_POWERUP_LOCK) && mtd->unlock) { @@ -344,7 +357,8 @@ static inline int mtd_proc_info (char *buf, int i) if (!this) return 0; - return sprintf(buf, "mtd%d: %8.8x %8.8x \"%s\"\n", i, this->size, + return sprintf(buf, "mtd%d: %8.8llx %8.8x \"%s\"\n", i, + (unsigned long long)this->size, this->erasesize, this->name); } diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index aebb3b27edb..1a6b3beabe8 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -80,9 +80,9 @@ static int mtdoops_erase_block(struct mtd_info *mtd, int offset) if (ret) { set_current_state(TASK_RUNNING); remove_wait_queue(&wait_q, &wait); - printk (KERN_WARNING "mtdoops: erase of region [0x%x, 0x%x] " + printk (KERN_WARNING "mtdoops: erase of region [0x%llx, 0x%llx] " "on \"%s\" failed\n", - erase.addr, erase.len, mtd->name); + (unsigned long long)erase.addr, (unsigned long long)erase.len, mtd->name); return ret; } @@ -289,7 +289,10 @@ static void mtdoops_notify_add(struct mtd_info *mtd) } cxt->mtd = mtd; - cxt->oops_pages = mtd->size / OOPS_PAGE_SIZE; + if (mtd->size > INT_MAX) + cxt->oops_pages = INT_MAX / OOPS_PAGE_SIZE; + else + cxt->oops_pages = (int)mtd->size / OOPS_PAGE_SIZE; find_next_position(cxt); diff --git a/drivers/mtd/mtdpart.c b/drivers/mtd/mtdpart.c index 3728913fa5f..144e6b613a7 100644 --- a/drivers/mtd/mtdpart.c +++ b/drivers/mtd/mtdpart.c @@ -26,7 +26,7 @@ static LIST_HEAD(mtd_partitions); struct mtd_part { struct mtd_info mtd; struct mtd_info *master; - u_int32_t offset; + uint64_t offset; int index; struct list_head list; int registered; @@ -235,7 +235,7 @@ void mtd_erase_callback(struct erase_info *instr) } EXPORT_SYMBOL_GPL(mtd_erase_callback); -static int part_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int part_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_part *part = PART(mtd); if ((len + ofs) > mtd->size) @@ -243,7 +243,7 @@ static int part_lock(struct mtd_info *mtd, loff_t ofs, size_t len) return part->master->lock(part->master, ofs + part->offset, len); } -static int part_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int part_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { struct mtd_part *part = PART(mtd); if ((len + ofs) > mtd->size) @@ -317,7 +317,7 @@ EXPORT_SYMBOL(del_mtd_partitions); static struct mtd_part *add_one_partition(struct mtd_info *master, const struct mtd_partition *part, int partno, - u_int32_t cur_offset) + uint64_t cur_offset) { struct mtd_part *slave; @@ -395,19 +395,19 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->offset = cur_offset; if (slave->offset == MTDPART_OFS_NXTBLK) { slave->offset = cur_offset; - if ((cur_offset % master->erasesize) != 0) { + if (mtd_mod_by_eb(cur_offset, master) != 0) { /* Round up to next erasesize */ - slave->offset = ((cur_offset / master->erasesize) + 1) * master->erasesize; + slave->offset = (mtd_div_by_eb(cur_offset, master) + 1) * master->erasesize; printk(KERN_NOTICE "Moving partition %d: " - "0x%08x -> 0x%08x\n", partno, - cur_offset, slave->offset); + "0x%012llx -> 0x%012llx\n", partno, + (unsigned long long)cur_offset, (unsigned long long)slave->offset); } } if (slave->mtd.size == MTDPART_SIZ_FULL) slave->mtd.size = master->size - slave->offset; - printk(KERN_NOTICE "0x%08x-0x%08x : \"%s\"\n", slave->offset, - slave->offset + slave->mtd.size, slave->mtd.name); + printk(KERN_NOTICE "0x%012llx-0x%012llx : \"%s\"\n", (unsigned long long)slave->offset, + 
(unsigned long long)(slave->offset + slave->mtd.size), slave->mtd.name); /* let's do some sanity checks */ if (slave->offset >= master->size) { @@ -420,13 +420,13 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, } if (slave->offset + slave->mtd.size > master->size) { slave->mtd.size = master->size - slave->offset; - printk(KERN_WARNING"mtd: partition \"%s\" extends beyond the end of device \"%s\" -- size truncated to %#x\n", - part->name, master->name, slave->mtd.size); + printk(KERN_WARNING"mtd: partition \"%s\" extends beyond the end of device \"%s\" -- size truncated to %#llx\n", + part->name, master->name, (unsigned long long)slave->mtd.size); } if (master->numeraseregions > 1) { /* Deal with variable erase size stuff */ int i, max = master->numeraseregions; - u32 end = slave->offset + slave->mtd.size; + u64 end = slave->offset + slave->mtd.size; struct mtd_erase_region_info *regions = master->eraseregions; /* Find the first erase regions which is part of this @@ -449,7 +449,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, } if ((slave->mtd.flags & MTD_WRITEABLE) && - (slave->offset % slave->mtd.erasesize)) { + mtd_mod_by_eb(slave->offset, &slave->mtd)) { /* Doesn't start on a boundary of major erase size */ /* FIXME: Let it be writable if it is on a boundary of * _minor_ erase size though */ @@ -458,7 +458,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, part->name); } if ((slave->mtd.flags & MTD_WRITEABLE) && - (slave->mtd.size % slave->mtd.erasesize)) { + mtd_mod_by_eb(slave->mtd.size, &slave->mtd)) { slave->mtd.flags &= ~MTD_WRITEABLE; printk(KERN_WARNING"mtd: partition \"%s\" doesn't end on an erase block -- force read-only\n", part->name); @@ -466,7 +466,7 @@ static struct mtd_part *add_one_partition(struct mtd_info *master, slave->mtd.ecclayout = master->ecclayout; if (master->block_isbad) { - uint32_t offs = 0; + uint64_t offs = 0; while (offs < slave->mtd.size) { if (master->block_isbad(master, @@ -501,7 +501,7 @@ int add_mtd_partitions(struct mtd_info *master, int nbparts) { struct mtd_part *slave; - u_int32_t cur_offset = 0; + uint64_t cur_offset = 0; int i; printk(KERN_NOTICE "Creating %d MTD partitions on \"%s\":\n", nbparts, master->name); diff --git a/drivers/mtd/nand/Kconfig b/drivers/mtd/nand/Kconfig index f8ae0400c49..8b12e6e109d 100644 --- a/drivers/mtd/nand/Kconfig +++ b/drivers/mtd/nand/Kconfig @@ -163,6 +163,13 @@ config MTD_NAND_S3C2410_HWECC incorrect ECC generation, and if using these, the default of software ECC is preferable. 
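
The mtdpart.c hunks above replace raw '%' and '/' on what is now a 64-bit partition offset with mtd_mod_by_eb() and mtd_div_by_eb(). The helpers themselves are not part of this excerpt; a minimal sketch of how they can be built on the erasesize_shift/erasesize_mask fields set up in mtdcore.c (an assumption about their shape, not a verbatim copy of the in-tree definitions) looks like:

        #include <linux/mtd/mtd.h>
        #include <asm/div64.h>

        /* Divide a 64-bit device offset by the erase block size.  When the
         * erase size is a power of two the precomputed shift avoids a slow
         * 64-bit division on 32-bit machines. */
        static inline uint32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd)
        {
                if (mtd->erasesize_shift)
                        return sz >> mtd->erasesize_shift;
                do_div(sz, mtd->erasesize);     /* do_div() leaves the quotient in sz */
                return sz;
        }

        /* Remainder of a 64-bit offset within an erase block. */
        static inline uint32_t mtd_mod_by_eb(uint64_t sz, struct mtd_info *mtd)
        {
                if (mtd->erasesize_shift)
                        return sz & mtd->erasesize_mask;
                return do_div(sz, mtd->erasesize);      /* do_div() returns the remainder */
        }

Either way the 64-bit value never reaches a bare '/' or '%', which on 32-bit builds would need libgcc helpers such as __udivdi3 that the kernel does not link in.
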
+config MTD_NAND_NDFC + tristate "NDFC NanD Flash Controller" + depends on 4xx + select MTD_NAND_ECC_SMC + help + NDFC Nand Flash Controllers are integrated in IBM/AMCC's 4xx SoCs + config MTD_NAND_S3C2410_CLKSTOP bool "S3C2410 NAND IDLE clock stop" depends on MTD_NAND_S3C2410 diff --git a/drivers/mtd/nand/alauda.c b/drivers/mtd/nand/alauda.c index 96238039485..6d9649159a1 100644 --- a/drivers/mtd/nand/alauda.c +++ b/drivers/mtd/nand/alauda.c @@ -676,11 +676,11 @@ static int alauda_probe(struct usb_interface *interface, goto error; al->write_out = usb_sndbulkpipe(al->dev, - ep_wr->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); + usb_endpoint_num(ep_wr)); al->bulk_in = usb_rcvbulkpipe(al->dev, - ep_in->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); + usb_endpoint_num(ep_in)); al->bulk_out = usb_sndbulkpipe(al->dev, - ep_out->bEndpointAddress & USB_ENDPOINT_NUMBER_MASK); + usb_endpoint_num(ep_out)); /* second device is identical up to now */ memcpy(al+1, al, sizeof(*al)); diff --git a/drivers/mtd/nand/cafe_nand.c b/drivers/mtd/nand/cafe_nand.c index b8064bf3aee..22a6b2e50e9 100644 --- a/drivers/mtd/nand/cafe_nand.c +++ b/drivers/mtd/nand/cafe_nand.c @@ -90,7 +90,7 @@ static int timing[3]; module_param_array(timing, int, &numtimings, 0644); #ifdef CONFIG_MTD_PARTITIONS -static const char *part_probes[] = { "RedBoot", NULL }; +static const char *part_probes[] = { "cmdlinepart", "RedBoot", NULL }; #endif /* Hrm. Why isn't this already conditional on something in the struct device? */ @@ -805,10 +805,13 @@ static int __devinit cafe_nand_probe(struct pci_dev *pdev, add_mtd_device(mtd); #ifdef CONFIG_MTD_PARTITIONS +#ifdef CONFIG_MTD_CMDLINE_PARTS + mtd->name = "cafe_nand"; +#endif nr_parts = parse_mtd_partitions(mtd, part_probes, &parts, 0); if (nr_parts > 0) { cafe->parts = parts; - dev_info(&cafe->pdev->dev, "%d RedBoot partitions found\n", nr_parts); + dev_info(&cafe->pdev->dev, "%d partitions found\n", nr_parts); add_mtd_partitions(mtd, parts, nr_parts); } #endif diff --git a/drivers/mtd/nand/fsl_elbc_nand.c b/drivers/mtd/nand/fsl_elbc_nand.c index 4aa5bd6158d..65929db2944 100644 --- a/drivers/mtd/nand/fsl_elbc_nand.c +++ b/drivers/mtd/nand/fsl_elbc_nand.c @@ -777,7 +777,9 @@ static int fsl_elbc_chip_init(struct fsl_elbc_mtd *priv) /* Fill in fsl_elbc_mtd structure */ priv->mtd.priv = chip; priv->mtd.owner = THIS_MODULE; - priv->fmr = 0; /* rest filled in later */ + + /* Set the ECCM according to the settings in bootloader.*/ + priv->fmr = in_be32(&lbc->fmr) & FMR_ECCM; /* fill in nand_chip structure */ /* set up function call table */ diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 0a9c9cd33f9..0c3afccde8a 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -2014,13 +2014,14 @@ static int nand_erase(struct mtd_info *mtd, struct erase_info *instr) int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, int allowbbt) { - int page, len, status, pages_per_block, ret, chipnr; + int page, status, pages_per_block, ret, chipnr; struct nand_chip *chip = mtd->priv; - int rewrite_bbt[NAND_MAX_CHIPS]={0}; + loff_t rewrite_bbt[NAND_MAX_CHIPS]={0}; unsigned int bbt_masked_page = 0xffffffff; + loff_t len; - DEBUG(MTD_DEBUG_LEVEL3, "nand_erase: start = 0x%08x, len = %i\n", - (unsigned int)instr->addr, (unsigned int)instr->len); + DEBUG(MTD_DEBUG_LEVEL3, "nand_erase: start = 0x%012llx, len = %llu\n", + (unsigned long long)instr->addr, (unsigned long long)instr->len); /* Start address must align on block boundary */ if (instr->addr & ((1 << 
chip->phys_erase_shift) - 1)) { @@ -2116,7 +2117,8 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, DEBUG(MTD_DEBUG_LEVEL0, "nand_erase: " "Failed erase, page 0x%08x\n", page); instr->state = MTD_ERASE_FAILED; - instr->fail_addr = (page << chip->page_shift); + instr->fail_addr = + ((loff_t)page << chip->page_shift); goto erase_exit; } @@ -2126,7 +2128,8 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, */ if (bbt_masked_page != 0xffffffff && (page & BBT_PAGE_MASK) == bbt_masked_page) - rewrite_bbt[chipnr] = (page << chip->page_shift); + rewrite_bbt[chipnr] = + ((loff_t)page << chip->page_shift); /* Increment page address and decrement length */ len -= (1 << chip->phys_erase_shift); @@ -2173,7 +2176,7 @@ int nand_erase_nand(struct mtd_info *mtd, struct erase_info *instr, continue; /* update the BBT for chip */ DEBUG(MTD_DEBUG_LEVEL0, "nand_erase_nand: nand_update_bbt " - "(%d:0x%0x 0x%0x)\n", chipnr, rewrite_bbt[chipnr], + "(%d:0x%0llx 0x%0x)\n", chipnr, rewrite_bbt[chipnr], chip->bbt_td->pages[chipnr]); nand_update_bbt(mtd, rewrite_bbt[chipnr]); } @@ -2365,7 +2368,7 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd, if (!mtd->name) mtd->name = type->name; - chip->chipsize = type->chipsize << 20; + chip->chipsize = (uint64_t)type->chipsize << 20; /* Newer devices have all the information in additional id bytes */ if (!type->pagesize) { @@ -2423,7 +2426,10 @@ static struct nand_flash_dev *nand_get_flash_type(struct mtd_info *mtd, chip->bbt_erase_shift = chip->phys_erase_shift = ffs(mtd->erasesize) - 1; - chip->chip_shift = ffs(chip->chipsize) - 1; + if (chip->chipsize & 0xffffffff) + chip->chip_shift = ffs((unsigned)chip->chipsize) - 1; + else + chip->chip_shift = ffs((unsigned)(chip->chipsize >> 32)) + 32 - 1; /* Set the bad block position */ chip->badblockpos = mtd->writesize > 512 ? @@ -2517,7 +2523,6 @@ int nand_scan_ident(struct mtd_info *mtd, int maxchips) /** * nand_scan_tail - [NAND Interface] Scan for the NAND device * @mtd: MTD device structure - * @maxchips: Number of chips to scan for * * This is the second phase of the normal nand_scan() function. It * fills out all the uninitialized function pointers with the defaults diff --git a/drivers/mtd/nand/nand_bbt.c b/drivers/mtd/nand/nand_bbt.c index 0b1c48595f1..55c23e5cd21 100644 --- a/drivers/mtd/nand/nand_bbt.c +++ b/drivers/mtd/nand/nand_bbt.c @@ -171,16 +171,16 @@ static int read_bbt(struct mtd_info *mtd, uint8_t *buf, int page, int num, if (tmp == msk) continue; if (reserved_block_code && (tmp == reserved_block_code)) { - printk(KERN_DEBUG "nand_read_bbt: Reserved block at 0x%08x\n", - ((offs << 2) + (act >> 1)) << this->bbt_erase_shift); + printk(KERN_DEBUG "nand_read_bbt: Reserved block at 0x%012llx\n", + (loff_t)((offs << 2) + (act >> 1)) << this->bbt_erase_shift); this->bbt[offs + (act >> 3)] |= 0x2 << (act & 0x06); mtd->ecc_stats.bbtblocks++; continue; } /* Leave it for now, if its matured we can move this * message to MTD_DEBUG_LEVEL0 */ - printk(KERN_DEBUG "nand_read_bbt: Bad block at 0x%08x\n", - ((offs << 2) + (act >> 1)) << this->bbt_erase_shift); + printk(KERN_DEBUG "nand_read_bbt: Bad block at 0x%012llx\n", + (loff_t)((offs << 2) + (act >> 1)) << this->bbt_erase_shift); /* Factory marked bad or worn out ? 
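
With chipsize widened to 64 bits, ffs() alone can no longer recover chip_shift, since it only examines a 32-bit value; the hunk above therefore picks whichever half of the value is non-zero (chipsize is always a power of two, so exactly one half is). A small illustration with a made-up 4 GiB chip:

        uint64_t chipsize = 1ULL << 32;         /* 4 GiB device */
        int chip_shift;

        if (chipsize & 0xffffffff)
                chip_shift = ffs((unsigned)chipsize) - 1;
        else
                /* low word is zero: take the shift from the high word */
                chip_shift = ffs((unsigned)(chipsize >> 32)) + 32 - 1;

        /* ffs(1) == 1, so chip_shift == 32 and (1ULL << chip_shift) == chipsize */
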
*/ if (tmp == 0) this->bbt[offs + (act >> 3)] |= 0x3 << (act & 0x06); @@ -284,7 +284,7 @@ static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, /* Read the primary version, if available */ if (td->options & NAND_BBT_VERSION) { - scan_read_raw(mtd, buf, td->pages[0] << this->page_shift, + scan_read_raw(mtd, buf, (loff_t)td->pages[0] << this->page_shift, mtd->writesize); td->version[0] = buf[mtd->writesize + td->veroffs]; printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", @@ -293,7 +293,7 @@ static int read_abs_bbts(struct mtd_info *mtd, uint8_t *buf, /* Read the mirror version, if available */ if (md && (md->options & NAND_BBT_VERSION)) { - scan_read_raw(mtd, buf, md->pages[0] << this->page_shift, + scan_read_raw(mtd, buf, (loff_t)md->pages[0] << this->page_shift, mtd->writesize); md->version[0] = buf[mtd->writesize + md->veroffs]; printk(KERN_DEBUG "Bad block table at page %d, version 0x%02X\n", @@ -411,7 +411,7 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, numblocks = this->chipsize >> (this->bbt_erase_shift - 1); startblock = chip * numblocks; numblocks += startblock; - from = startblock << (this->bbt_erase_shift - 1); + from = (loff_t)startblock << (this->bbt_erase_shift - 1); } for (i = startblock; i < numblocks;) { @@ -428,8 +428,8 @@ static int create_bbt(struct mtd_info *mtd, uint8_t *buf, if (ret) { this->bbt[i >> 3] |= 0x03 << (i & 0x6); - printk(KERN_WARNING "Bad eraseblock %d at 0x%08x\n", - i >> 1, (unsigned int)from); + printk(KERN_WARNING "Bad eraseblock %d at 0x%012llx\n", + i >> 1, (unsigned long long)from); mtd->ecc_stats.badblocks++; } @@ -495,7 +495,7 @@ static int search_bbt(struct mtd_info *mtd, uint8_t *buf, struct nand_bbt_descr for (block = 0; block < td->maxblocks; block++) { int actblock = startblock + dir * block; - loff_t offs = actblock << this->bbt_erase_shift; + loff_t offs = (loff_t)actblock << this->bbt_erase_shift; /* Read first page */ scan_read_raw(mtd, buf, offs, mtd->writesize); @@ -719,7 +719,7 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, memset(&einfo, 0, sizeof(einfo)); einfo.mtd = mtd; - einfo.addr = (unsigned long)to; + einfo.addr = to; einfo.len = 1 << this->bbt_erase_shift; res = nand_erase_nand(mtd, &einfo, 1); if (res < 0) @@ -729,8 +729,8 @@ static int write_bbt(struct mtd_info *mtd, uint8_t *buf, if (res < 0) goto outerr; - printk(KERN_DEBUG "Bad block table written to 0x%08x, version " - "0x%02X\n", (unsigned int)to, td->version[chip]); + printk(KERN_DEBUG "Bad block table written to 0x%012llx, version " + "0x%02X\n", (unsigned long long)to, td->version[chip]); /* Mark it as used */ td->pages[chip] = page; @@ -910,7 +910,7 @@ static void mark_bbt_region(struct mtd_info *mtd, struct nand_bbt_descr *td) newval = oldval | (0x2 << (block & 0x06)); this->bbt[(block >> 3)] = newval; if ((oldval != newval) && td->reserved_block_code) - nand_update_bbt(mtd, block << (this->bbt_erase_shift - 1)); + nand_update_bbt(mtd, (loff_t)block << (this->bbt_erase_shift - 1)); continue; } update = 0; @@ -931,7 +931,7 @@ static void mark_bbt_region(struct mtd_info *mtd, struct nand_bbt_descr *td) new ones have been marked, then we need to update the stored bbts. This should only happen once. 
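
Most of the nand_bbt.c changes in this area are one repeated fix: a 32-bit block or page number shifted into a byte offset overflows as soon as the result passes 2 GiB, so the value is widened to loff_t before the shift. A small illustration with made-up numbers:

        int block = 40000;
        int bbt_erase_shift = 17;                       /* 128 KiB erase blocks */

        loff_t wrong = block << bbt_erase_shift;        /* shift happens in 32-bit int and overflows */
        loff_t right = (loff_t)block << bbt_erase_shift; /* widen first: 0x138800000, about 5 GiB */
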
*/ if (update && td->reserved_block_code) - nand_update_bbt(mtd, (block - 2) << (this->bbt_erase_shift - 1)); + nand_update_bbt(mtd, (loff_t)(block - 2) << (this->bbt_erase_shift - 1)); } } @@ -1027,7 +1027,6 @@ int nand_update_bbt(struct mtd_info *mtd, loff_t offs) if (!this->bbt || !td) return -EINVAL; - len = mtd->size >> (this->bbt_erase_shift + 2); /* Allocate a temporary buffer for one eraseblock incl. oob */ len = (1 << this->bbt_erase_shift); len += (len >> this->page_shift) * mtd->oobsize; diff --git a/drivers/mtd/nand/nandsim.c b/drivers/mtd/nand/nandsim.c index ae7c57781a6..cd0711b83ac 100644 --- a/drivers/mtd/nand/nandsim.c +++ b/drivers/mtd/nand/nandsim.c @@ -38,6 +38,9 @@ #include <linux/delay.h> #include <linux/list.h> #include <linux/random.h> +#include <linux/sched.h> +#include <linux/fs.h> +#include <linux/pagemap.h> /* Default simulator parameters values */ #if !defined(CONFIG_NANDSIM_FIRST_ID_BYTE) || \ @@ -100,6 +103,7 @@ static unsigned int bitflips = 0; static char *gravepages = NULL; static unsigned int rptwear = 0; static unsigned int overridesize = 0; +static char *cache_file = NULL; module_param(first_id_byte, uint, 0400); module_param(second_id_byte, uint, 0400); @@ -122,12 +126,13 @@ module_param(bitflips, uint, 0400); module_param(gravepages, charp, 0400); module_param(rptwear, uint, 0400); module_param(overridesize, uint, 0400); +module_param(cache_file, charp, 0400); MODULE_PARM_DESC(first_id_byte, "The first byte returned by NAND Flash 'read ID' command (manufacturer ID)"); MODULE_PARM_DESC(second_id_byte, "The second byte returned by NAND Flash 'read ID' command (chip ID)"); MODULE_PARM_DESC(third_id_byte, "The third byte returned by NAND Flash 'read ID' command"); MODULE_PARM_DESC(fourth_id_byte, "The fourth byte returned by NAND Flash 'read ID' command"); -MODULE_PARM_DESC(access_delay, "Initial page access delay (microiseconds)"); +MODULE_PARM_DESC(access_delay, "Initial page access delay (microseconds)"); MODULE_PARM_DESC(programm_delay, "Page programm delay (microseconds"); MODULE_PARM_DESC(erase_delay, "Sector erase delay (milliseconds)"); MODULE_PARM_DESC(output_cycle, "Word output (from flash) time (nanodeconds)"); @@ -153,6 +158,7 @@ MODULE_PARM_DESC(rptwear, "Number of erases inbetween reporting wear, if MODULE_PARM_DESC(overridesize, "Specifies the NAND Flash size overriding the ID bytes. " "The size is specified in erase blocks and as the exponent of a power of two" " e.g. 5 means a size of 32 erase blocks"); +MODULE_PARM_DESC(cache_file, "File to use to cache nand pages instead of memory"); /* The largest possible page size */ #define NS_LARGEST_PAGE_SIZE 2048 @@ -266,6 +272,9 @@ MODULE_PARM_DESC(overridesize, "Specifies the NAND Flash size overriding the I */ #define NS_MAX_PREVSTATES 1 +/* Maximum page cache pages needed to read or write a NAND page to the cache_file */ +#define NS_MAX_HELD_PAGES 16 + /* * A union to represent flash memory contents and flash buffer. 
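
The new cache_file parameter lets nandsim back the simulated flash with an ordinary file instead of vmalloc'ed kernel memory, which is what makes large simulated devices practical. A hypothetical invocation (the path is made up, and the usual geometry parameters can be added alongside it):

        modprobe nandsim cache_file=/var/tmp/nandsim.img
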
*/ @@ -295,6 +304,9 @@ struct nandsim { /* The simulated NAND flash pages array */ union ns_mem *pages; + /* Slab allocator for nand pages */ + struct kmem_cache *nand_pages_slab; + /* Internal buffer of page + OOB size bytes */ union ns_mem buf; @@ -335,6 +347,13 @@ struct nandsim { int ale; /* address Latch Enable */ int wp; /* write Protect */ } lines; + + /* Fields needed when using a cache file */ + struct file *cfile; /* Open file */ + unsigned char *pages_written; /* Which pages have been written */ + void *file_buf; + struct page *held_pages[NS_MAX_HELD_PAGES]; + int held_cnt; }; /* @@ -420,25 +439,69 @@ static struct mtd_info *nsmtd; static u_char ns_verify_buf[NS_LARGEST_PAGE_SIZE]; /* - * Allocate array of page pointers and initialize the array to NULL - * pointers. + * Allocate array of page pointers, create slab allocation for an array + * and initialize the array by NULL pointers. * * RETURNS: 0 if success, -ENOMEM if memory alloc fails. */ static int alloc_device(struct nandsim *ns) { - int i; + struct file *cfile; + int i, err; + + if (cache_file) { + cfile = filp_open(cache_file, O_CREAT | O_RDWR | O_LARGEFILE, 0600); + if (IS_ERR(cfile)) + return PTR_ERR(cfile); + if (!cfile->f_op || (!cfile->f_op->read && !cfile->f_op->aio_read)) { + NS_ERR("alloc_device: cache file not readable\n"); + err = -EINVAL; + goto err_close; + } + if (!cfile->f_op->write && !cfile->f_op->aio_write) { + NS_ERR("alloc_device: cache file not writeable\n"); + err = -EINVAL; + goto err_close; + } + ns->pages_written = vmalloc(ns->geom.pgnum); + if (!ns->pages_written) { + NS_ERR("alloc_device: unable to allocate pages written array\n"); + err = -ENOMEM; + goto err_close; + } + ns->file_buf = kmalloc(ns->geom.pgszoob, GFP_KERNEL); + if (!ns->file_buf) { + NS_ERR("alloc_device: unable to allocate file buf\n"); + err = -ENOMEM; + goto err_free; + } + ns->cfile = cfile; + memset(ns->pages_written, 0, ns->geom.pgnum); + return 0; + } ns->pages = vmalloc(ns->geom.pgnum * sizeof(union ns_mem)); if (!ns->pages) { - NS_ERR("alloc_map: unable to allocate page array\n"); + NS_ERR("alloc_device: unable to allocate page array\n"); return -ENOMEM; } for (i = 0; i < ns->geom.pgnum; i++) { ns->pages[i].byte = NULL; } + ns->nand_pages_slab = kmem_cache_create("nandsim", + ns->geom.pgszoob, 0, 0, NULL); + if (!ns->nand_pages_slab) { + NS_ERR("cache_create: unable to create kmem_cache\n"); + return -ENOMEM; + } return 0; + +err_free: + vfree(ns->pages_written); +err_close: + filp_close(cfile, NULL); + return err; } /* @@ -448,11 +511,20 @@ static void free_device(struct nandsim *ns) { int i; + if (ns->cfile) { + kfree(ns->file_buf); + vfree(ns->pages_written); + filp_close(ns->cfile, NULL); + return; + } + if (ns->pages) { for (i = 0; i < ns->geom.pgnum; i++) { if (ns->pages[i].byte) - kfree(ns->pages[i].byte); + kmem_cache_free(ns->nand_pages_slab, + ns->pages[i].byte); } + kmem_cache_destroy(ns->nand_pages_slab); vfree(ns->pages); } } @@ -464,7 +536,7 @@ static char *get_partition_name(int i) return kstrdup(buf, GFP_KERNEL); } -static u_int64_t divide(u_int64_t n, u_int32_t d) +static uint64_t divide(uint64_t n, uint32_t d) { do_div(n, d); return n; @@ -480,8 +552,8 @@ static int init_nandsim(struct mtd_info *mtd) struct nand_chip *chip = (struct nand_chip *)mtd->priv; struct nandsim *ns = (struct nandsim *)(chip->priv); int i, ret = 0; - u_int64_t remains; - u_int64_t next_offset; + uint64_t remains; + uint64_t next_offset; if (NS_IS_INITIALIZED(ns)) { NS_ERR("init_nandsim: nandsim is already initialized\n"); @@ 
-548,7 +620,7 @@ static int init_nandsim(struct mtd_info *mtd) remains = ns->geom.totsz; next_offset = 0; for (i = 0; i < parts_num; ++i) { - u_int64_t part_sz = (u_int64_t)parts[i] * ns->geom.secsz; + uint64_t part_sz = (uint64_t)parts[i] * ns->geom.secsz; if (!part_sz || part_sz > remains) { NS_ERR("bad partition size.\n"); @@ -1211,6 +1283,97 @@ static int find_operation(struct nandsim *ns, uint32_t flag) return -1; } +static void put_pages(struct nandsim *ns) +{ + int i; + + for (i = 0; i < ns->held_cnt; i++) + page_cache_release(ns->held_pages[i]); +} + +/* Get page cache pages in advance to provide NOFS memory allocation */ +static int get_pages(struct nandsim *ns, struct file *file, size_t count, loff_t pos) +{ + pgoff_t index, start_index, end_index; + struct page *page; + struct address_space *mapping = file->f_mapping; + + start_index = pos >> PAGE_CACHE_SHIFT; + end_index = (pos + count - 1) >> PAGE_CACHE_SHIFT; + if (end_index - start_index + 1 > NS_MAX_HELD_PAGES) + return -EINVAL; + ns->held_cnt = 0; + for (index = start_index; index <= end_index; index++) { + page = find_get_page(mapping, index); + if (page == NULL) { + page = find_or_create_page(mapping, index, GFP_NOFS); + if (page == NULL) { + write_inode_now(mapping->host, 1); + page = find_or_create_page(mapping, index, GFP_NOFS); + } + if (page == NULL) { + put_pages(ns); + return -ENOMEM; + } + unlock_page(page); + } + ns->held_pages[ns->held_cnt++] = page; + } + return 0; +} + +static int set_memalloc(void) +{ + if (current->flags & PF_MEMALLOC) + return 0; + current->flags |= PF_MEMALLOC; + return 1; +} + +static void clear_memalloc(int memalloc) +{ + if (memalloc) + current->flags &= ~PF_MEMALLOC; +} + +static ssize_t read_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + ssize_t tx; + int err, memalloc; + + err = get_pages(ns, file, count, *pos); + if (err) + return err; + old_fs = get_fs(); + set_fs(get_ds()); + memalloc = set_memalloc(); + tx = vfs_read(file, (char __user *)buf, count, pos); + clear_memalloc(memalloc); + set_fs(old_fs); + put_pages(ns); + return tx; +} + +static ssize_t write_file(struct nandsim *ns, struct file *file, void *buf, size_t count, loff_t *pos) +{ + mm_segment_t old_fs; + ssize_t tx; + int err, memalloc; + + err = get_pages(ns, file, count, *pos); + if (err) + return err; + old_fs = get_fs(); + set_fs(get_ds()); + memalloc = set_memalloc(); + tx = vfs_write(file, (char __user *)buf, count, pos); + clear_memalloc(memalloc); + set_fs(old_fs); + put_pages(ns); + return tx; +} + /* * Returns a pointer to the current page. 
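
read_file() and write_file() above follow the usual pattern of this era for doing file I/O on behalf of the kernel itself: widen the address limit so vfs_read()/vfs_write() accept a kernel buffer, and wrap the call in PF_MEMALLOC after pre-pinning the page cache pages so the I/O cannot recurse into writeback while a flash filesystem is holding locks. Stripped of the pinning and flag handling, the core of the helpers is roughly this (a sketch with a made-up name, not a drop-in replacement):

        static ssize_t ns_write_kernel_buf(struct file *file, const void *buf,
                                           size_t count, loff_t *pos)
        {
                mm_segment_t old_fs = get_fs();
                ssize_t ret;

                set_fs(get_ds());       /* allow a kernel pointer where __user is expected */
                ret = vfs_write(file, (const char __user *)buf, count, pos);
                set_fs(old_fs);
                return ret;
        }
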
*/ @@ -1227,6 +1390,38 @@ static inline u_char *NS_PAGE_BYTE_OFF(struct nandsim *ns) return NS_GET_PAGE(ns)->byte + ns->regs.column + ns->regs.off; } +int do_read_error(struct nandsim *ns, int num) +{ + unsigned int page_no = ns->regs.row; + + if (read_error(page_no)) { + int i; + memset(ns->buf.byte, 0xFF, num); + for (i = 0; i < num; ++i) + ns->buf.byte[i] = random32(); + NS_WARN("simulating read error in page %u\n", page_no); + return 1; + } + return 0; +} + +void do_bit_flips(struct nandsim *ns, int num) +{ + if (bitflips && random32() < (1 << 22)) { + int flips = 1; + if (bitflips > 1) + flips = (random32() % (int) bitflips) + 1; + while (flips--) { + int pos = random32() % (num * 8); + ns->buf.byte[pos / 8] ^= (1 << (pos % 8)); + NS_WARN("read_page: flipping bit %d in page %d " + "reading from %d ecc: corrected=%u failed=%u\n", + pos, ns->regs.row, ns->regs.column + ns->regs.off, + nsmtd->ecc_stats.corrected, nsmtd->ecc_stats.failed); + } + } +} + /* * Fill the NAND buffer with data read from the specified page. */ @@ -1234,36 +1429,40 @@ static void read_page(struct nandsim *ns, int num) { union ns_mem *mypage; + if (ns->cfile) { + if (!ns->pages_written[ns->regs.row]) { + NS_DBG("read_page: page %d not written\n", ns->regs.row); + memset(ns->buf.byte, 0xFF, num); + } else { + loff_t pos; + ssize_t tx; + + NS_DBG("read_page: page %d written, reading from %d\n", + ns->regs.row, ns->regs.column + ns->regs.off); + if (do_read_error(ns, num)) + return; + pos = (loff_t)ns->regs.row * ns->geom.pgszoob + ns->regs.column + ns->regs.off; + tx = read_file(ns, ns->cfile, ns->buf.byte, num, &pos); + if (tx != num) { + NS_ERR("read_page: read error for page %d ret %ld\n", ns->regs.row, (long)tx); + return; + } + do_bit_flips(ns, num); + } + return; + } + mypage = NS_GET_PAGE(ns); if (mypage->byte == NULL) { NS_DBG("read_page: page %d not allocated\n", ns->regs.row); memset(ns->buf.byte, 0xFF, num); } else { - unsigned int page_no = ns->regs.row; NS_DBG("read_page: page %d allocated, reading from %d\n", ns->regs.row, ns->regs.column + ns->regs.off); - if (read_error(page_no)) { - int i; - memset(ns->buf.byte, 0xFF, num); - for (i = 0; i < num; ++i) - ns->buf.byte[i] = random32(); - NS_WARN("simulating read error in page %u\n", page_no); + if (do_read_error(ns, num)) return; - } memcpy(ns->buf.byte, NS_PAGE_BYTE_OFF(ns), num); - if (bitflips && random32() < (1 << 22)) { - int flips = 1; - if (bitflips > 1) - flips = (random32() % (int) bitflips) + 1; - while (flips--) { - int pos = random32() % (num * 8); - ns->buf.byte[pos / 8] ^= (1 << (pos % 8)); - NS_WARN("read_page: flipping bit %d in page %d " - "reading from %d ecc: corrected=%u failed=%u\n", - pos, ns->regs.row, ns->regs.column + ns->regs.off, - nsmtd->ecc_stats.corrected, nsmtd->ecc_stats.failed); - } - } + do_bit_flips(ns, num); } } @@ -1275,11 +1474,20 @@ static void erase_sector(struct nandsim *ns) union ns_mem *mypage; int i; + if (ns->cfile) { + for (i = 0; i < ns->geom.pgsec; i++) + if (ns->pages_written[ns->regs.row + i]) { + NS_DBG("erase_sector: freeing page %d\n", ns->regs.row + i); + ns->pages_written[ns->regs.row + i] = 0; + } + return; + } + mypage = NS_GET_PAGE(ns); for (i = 0; i < ns->geom.pgsec; i++) { if (mypage->byte != NULL) { NS_DBG("erase_sector: freeing page %d\n", ns->regs.row+i); - kfree(mypage->byte); + kmem_cache_free(ns->nand_pages_slab, mypage->byte); mypage->byte = NULL; } mypage++; @@ -1295,16 +1503,57 @@ static int prog_page(struct nandsim *ns, int num) union ns_mem *mypage; u_char *pg_off; + if (ns->cfile) 
{ + loff_t off, pos; + ssize_t tx; + int all; + + NS_DBG("prog_page: writing page %d\n", ns->regs.row); + pg_off = ns->file_buf + ns->regs.column + ns->regs.off; + off = (loff_t)ns->regs.row * ns->geom.pgszoob + ns->regs.column + ns->regs.off; + if (!ns->pages_written[ns->regs.row]) { + all = 1; + memset(ns->file_buf, 0xff, ns->geom.pgszoob); + } else { + all = 0; + pos = off; + tx = read_file(ns, ns->cfile, pg_off, num, &pos); + if (tx != num) { + NS_ERR("prog_page: read error for page %d ret %ld\n", ns->regs.row, (long)tx); + return -1; + } + } + for (i = 0; i < num; i++) + pg_off[i] &= ns->buf.byte[i]; + if (all) { + pos = (loff_t)ns->regs.row * ns->geom.pgszoob; + tx = write_file(ns, ns->cfile, ns->file_buf, ns->geom.pgszoob, &pos); + if (tx != ns->geom.pgszoob) { + NS_ERR("prog_page: write error for page %d ret %ld\n", ns->regs.row, (long)tx); + return -1; + } + ns->pages_written[ns->regs.row] = 1; + } else { + pos = off; + tx = write_file(ns, ns->cfile, pg_off, num, &pos); + if (tx != num) { + NS_ERR("prog_page: write error for page %d ret %ld\n", ns->regs.row, (long)tx); + return -1; + } + } + return 0; + } + mypage = NS_GET_PAGE(ns); if (mypage->byte == NULL) { NS_DBG("prog_page: allocating page %d\n", ns->regs.row); /* * We allocate memory with GFP_NOFS because a flash FS may * utilize this. If it is holding an FS lock, then gets here, - * then kmalloc runs writeback which goes to the FS again - * and deadlocks. This was seen in practice. + * then kernel memory alloc runs writeback which goes to the FS + * again and deadlocks. This was seen in practice. */ - mypage->byte = kmalloc(ns->geom.pgszoob, GFP_NOFS); + mypage->byte = kmem_cache_alloc(ns->nand_pages_slab, GFP_NOFS); if (mypage->byte == NULL) { NS_ERR("prog_page: error allocating memory for page %d\n", ns->regs.row); return -1; @@ -1736,13 +1985,17 @@ static void ns_nand_write_byte(struct mtd_info *mtd, u_char byte) /* Check if chip is expecting command */ if (NS_STATE(ns->nxstate) != STATE_UNKNOWN && !(ns->nxstate & STATE_CMD_MASK)) { - /* - * We are in situation when something else (not command) - * was expected but command was input. In this case ignore - * previous command(s)/state(s) and accept the last one. - */ - NS_WARN("write_byte: command (%#x) wasn't expected, expected state is %s, " - "ignore previous states\n", (uint)byte, get_state_name(ns->nxstate)); + /* Do not warn if only 2 id bytes are read */ + if (!(ns->regs.command == NAND_CMD_READID && + NS_STATE(ns->state) == STATE_DATAOUT_ID && ns->regs.count == 2)) { + /* + * We are in situation when something else (not command) + * was expected but command was input. In this case ignore + * previous command(s)/state(s) and accept the last one. 
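
The pg_off[i] &= ns->buf.byte[i] loop in prog_page() models real NAND behaviour: a program operation can only clear bits, never set them, and only an erase returns a page to all 0xff. In miniature, with made-up values:

        unsigned char cell = 0xff;      /* erased state */

        cell &= 0xf0;                   /* first program  -> 0xf0 */
        cell &= 0xcc;                   /* second program -> 0xc0, not 0xcc */

This is also why the file-backed path has to read the previously written page back and AND the new data into it rather than simply overwrite it.
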
+ */ + NS_WARN("write_byte: command (%#x) wasn't expected, expected state is %s, " + "ignore previous states\n", (uint)byte, get_state_name(ns->nxstate)); + } switch_to_ready_state(ns, NS_STATUS_FAILED(ns)); } @@ -2044,7 +2297,7 @@ static int __init ns_init_module(void) } if (overridesize) { - u_int64_t new_size = (u_int64_t)nsmtd->erasesize << overridesize; + uint64_t new_size = (uint64_t)nsmtd->erasesize << overridesize; if (new_size >> overridesize != nsmtd->erasesize) { NS_ERR("overridesize is too big\n"); goto err_exit; diff --git a/drivers/mtd/nand/ndfc.c b/drivers/mtd/nand/ndfc.c index 955959eb02d..582cf80f555 100644 --- a/drivers/mtd/nand/ndfc.c +++ b/drivers/mtd/nand/ndfc.c @@ -2,12 +2,20 @@ * drivers/mtd/ndfc.c * * Overview: - * Platform independend driver for NDFC (NanD Flash Controller) + * Platform independent driver for NDFC (NanD Flash Controller) * integrated into EP440 cores * + * Ported to an OF platform driver by Sean MacLennan + * + * The NDFC supports multiple chips, but this driver only supports a + * single chip since I do not have access to any boards with + * multiple chips. + * * Author: Thomas Gleixner * * Copyright 2006 IBM + * Copyright 2008 PIKA Technologies + * Sean MacLennan <smaclennan@pikatech.com> * * This program is free software; you can redistribute it and/or modify it * under the terms of the GNU General Public License as published by the @@ -21,27 +29,20 @@ #include <linux/mtd/partitions.h> #include <linux/mtd/ndfc.h> #include <linux/mtd/mtd.h> -#include <linux/platform_device.h> - +#include <linux/of_platform.h> #include <asm/io.h> -#ifdef CONFIG_40x -#include <asm/ibm405.h> -#else -#include <asm/ibm44x.h> -#endif - -struct ndfc_nand_mtd { - struct mtd_info mtd; - struct nand_chip chip; - struct platform_nand_chip *pl_chip; -}; -static struct ndfc_nand_mtd ndfc_mtd[NDFC_MAX_BANKS]; struct ndfc_controller { - void __iomem *ndfcbase; - struct nand_hw_control ndfc_control; - atomic_t childs_active; + struct of_device *ofdev; + void __iomem *ndfcbase; + struct mtd_info mtd; + struct nand_chip chip; + int chip_select; + struct nand_hw_control ndfc_control; +#ifdef CONFIG_MTD_PARTITIONS + struct mtd_partition *parts; +#endif }; static struct ndfc_controller ndfc_ctrl; @@ -50,17 +51,14 @@ static void ndfc_select_chip(struct mtd_info *mtd, int chip) { uint32_t ccr; struct ndfc_controller *ndfc = &ndfc_ctrl; - struct nand_chip *nandchip = mtd->priv; - struct ndfc_nand_mtd *nandmtd = nandchip->priv; - struct platform_nand_chip *pchip = nandmtd->pl_chip; - ccr = __raw_readl(ndfc->ndfcbase + NDFC_CCR); + ccr = in_be32(ndfc->ndfcbase + NDFC_CCR); if (chip >= 0) { ccr &= ~NDFC_CCR_BS_MASK; - ccr |= NDFC_CCR_BS(chip + pchip->chip_offset); + ccr |= NDFC_CCR_BS(chip + ndfc->chip_select); } else ccr |= NDFC_CCR_RESET_CE; - __raw_writel(ccr, ndfc->ndfcbase + NDFC_CCR); + out_be32(ndfc->ndfcbase + NDFC_CCR, ccr); } static void ndfc_hwcontrol(struct mtd_info *mtd, int cmd, unsigned int ctrl) @@ -80,7 +78,7 @@ static int ndfc_ready(struct mtd_info *mtd) { struct ndfc_controller *ndfc = &ndfc_ctrl; - return __raw_readl(ndfc->ndfcbase + NDFC_STAT) & NDFC_STAT_IS_READY; + return in_be32(ndfc->ndfcbase + NDFC_STAT) & NDFC_STAT_IS_READY; } static void ndfc_enable_hwecc(struct mtd_info *mtd, int mode) @@ -88,9 +86,9 @@ static void ndfc_enable_hwecc(struct mtd_info *mtd, int mode) uint32_t ccr; struct ndfc_controller *ndfc = &ndfc_ctrl; - ccr = __raw_readl(ndfc->ndfcbase + NDFC_CCR); + ccr = in_be32(ndfc->ndfcbase + NDFC_CCR); ccr |= NDFC_CCR_RESET_ECC; - __raw_writel(ccr, 
ndfc->ndfcbase + NDFC_CCR); + out_be32(ndfc->ndfcbase + NDFC_CCR, ccr); wmb(); } @@ -102,9 +100,10 @@ static int ndfc_calculate_ecc(struct mtd_info *mtd, uint8_t *p = (uint8_t *)&ecc; wmb(); - ecc = __raw_readl(ndfc->ndfcbase + NDFC_ECC); - ecc_code[0] = p[1]; - ecc_code[1] = p[2]; + ecc = in_be32(ndfc->ndfcbase + NDFC_ECC); + /* The NDFC uses Smart Media (SMC) bytes order */ + ecc_code[0] = p[2]; + ecc_code[1] = p[1]; ecc_code[2] = p[3]; return 0; @@ -123,7 +122,7 @@ static void ndfc_read_buf(struct mtd_info *mtd, uint8_t *buf, int len) uint32_t *p = (uint32_t *) buf; for(;len > 0; len -= 4) - *p++ = __raw_readl(ndfc->ndfcbase + NDFC_DATA); + *p++ = in_be32(ndfc->ndfcbase + NDFC_DATA); } static void ndfc_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len) @@ -132,7 +131,7 @@ static void ndfc_write_buf(struct mtd_info *mtd, const uint8_t *buf, int len) uint32_t *p = (uint32_t *) buf; for(;len > 0; len -= 4) - __raw_writel(*p++, ndfc->ndfcbase + NDFC_DATA); + out_be32(ndfc->ndfcbase + NDFC_DATA, *p++); } static int ndfc_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len) @@ -141,7 +140,7 @@ static int ndfc_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len) uint32_t *p = (uint32_t *) buf; for(;len > 0; len -= 4) - if (*p++ != __raw_readl(ndfc->ndfcbase + NDFC_DATA)) + if (*p++ != in_be32(ndfc->ndfcbase + NDFC_DATA)) return -EFAULT; return 0; } @@ -149,10 +148,19 @@ static int ndfc_verify_buf(struct mtd_info *mtd, const uint8_t *buf, int len) /* * Initialize chip structure */ -static void ndfc_chip_init(struct ndfc_nand_mtd *mtd) +static int ndfc_chip_init(struct ndfc_controller *ndfc, + struct device_node *node) { - struct ndfc_controller *ndfc = &ndfc_ctrl; - struct nand_chip *chip = &mtd->chip; +#ifdef CONFIG_MTD_PARTITIONS +#ifdef CONFIG_MTD_CMDLINE_PARTS + static const char *part_types[] = { "cmdlinepart", NULL }; +#else + static const char *part_types[] = { NULL }; +#endif +#endif + struct device_node *flash_np; + struct nand_chip *chip = &ndfc->chip; + int ret; chip->IO_ADDR_R = ndfc->ndfcbase + NDFC_DATA; chip->IO_ADDR_W = ndfc->ndfcbase + NDFC_DATA; @@ -160,8 +168,6 @@ static void ndfc_chip_init(struct ndfc_nand_mtd *mtd) chip->dev_ready = ndfc_ready; chip->select_chip = ndfc_select_chip; chip->chip_delay = 50; - chip->priv = mtd; - chip->options = mtd->pl_chip->options; chip->controller = &ndfc->ndfc_control; chip->read_buf = ndfc_read_buf; chip->write_buf = ndfc_write_buf; @@ -172,143 +178,136 @@ static void ndfc_chip_init(struct ndfc_nand_mtd *mtd) chip->ecc.mode = NAND_ECC_HW; chip->ecc.size = 256; chip->ecc.bytes = 3; - chip->ecclayout = chip->ecc.layout = mtd->pl_chip->ecclayout; - mtd->mtd.priv = chip; - mtd->mtd.owner = THIS_MODULE; -} - -static int ndfc_chip_probe(struct platform_device *pdev) -{ - struct platform_nand_chip *nc = pdev->dev.platform_data; - struct ndfc_chip_settings *settings = nc->priv; - struct ndfc_controller *ndfc = &ndfc_ctrl; - struct ndfc_nand_mtd *nandmtd; - - if (nc->chip_offset >= NDFC_MAX_BANKS || nc->nr_chips > NDFC_MAX_BANKS) - return -EINVAL; - - /* Set the bank settings */ - __raw_writel(settings->bank_settings, - ndfc->ndfcbase + NDFC_BCFG0 + (nc->chip_offset << 2)); - nandmtd = &ndfc_mtd[pdev->id]; - if (nandmtd->pl_chip) - return -EBUSY; + ndfc->mtd.priv = chip; + ndfc->mtd.owner = THIS_MODULE; - nandmtd->pl_chip = nc; - ndfc_chip_init(nandmtd); - - /* Scan for chips */ - if (nand_scan(&nandmtd->mtd, nc->nr_chips)) { - nandmtd->pl_chip = NULL; + flash_np = of_get_next_child(node, NULL); + if (!flash_np) return 
-ENODEV; + + ndfc->mtd.name = kasprintf(GFP_KERNEL, "%s.%s", + ndfc->ofdev->dev.bus_id, flash_np->name); + if (!ndfc->mtd.name) { + ret = -ENOMEM; + goto err; } -#ifdef CONFIG_MTD_PARTITIONS - printk("Number of partitions %d\n", nc->nr_partitions); - if (nc->nr_partitions) { - /* Add the full device, so complete dumps can be made */ - add_mtd_device(&nandmtd->mtd); - add_mtd_partitions(&nandmtd->mtd, nc->partitions, - nc->nr_partitions); + ret = nand_scan(&ndfc->mtd, 1); + if (ret) + goto err; - } else -#else - add_mtd_device(&nandmtd->mtd); +#ifdef CONFIG_MTD_PARTITIONS + ret = parse_mtd_partitions(&ndfc->mtd, part_types, &ndfc->parts, 0); + if (ret < 0) + goto err; + +#ifdef CONFIG_MTD_OF_PARTS + if (ret == 0) { + ret = of_mtd_parse_partitions(&ndfc->ofdev->dev, flash_np, + &ndfc->parts); + if (ret < 0) + goto err; + } #endif - atomic_inc(&ndfc->childs_active); - return 0; -} + if (ret > 0) + ret = add_mtd_partitions(&ndfc->mtd, ndfc->parts, ret); + else +#endif + ret = add_mtd_device(&ndfc->mtd); -static int ndfc_chip_remove(struct platform_device *pdev) -{ - return 0; +err: + of_node_put(flash_np); + if (ret) + kfree(ndfc->mtd.name); + return ret; } -static int ndfc_nand_probe(struct platform_device *pdev) +static int __devinit ndfc_probe(struct of_device *ofdev, + const struct of_device_id *match) { - struct platform_nand_ctrl *nc = pdev->dev.platform_data; - struct ndfc_controller_settings *settings = nc->priv; - struct resource *res = pdev->resource; struct ndfc_controller *ndfc = &ndfc_ctrl; - unsigned long long phys = settings->ndfc_erpn | res->start; + const u32 *reg; + u32 ccr; + int err, len; -#ifndef CONFIG_PHYS_64BIT - ndfc->ndfcbase = ioremap((phys_addr_t)phys, res->end - res->start + 1); -#else - ndfc->ndfcbase = ioremap64(phys, res->end - res->start + 1); -#endif + spin_lock_init(&ndfc->ndfc_control.lock); + init_waitqueue_head(&ndfc->ndfc_control.wq); + ndfc->ofdev = ofdev; + dev_set_drvdata(&ofdev->dev, ndfc); + + /* Read the reg property to get the chip select */ + reg = of_get_property(ofdev->node, "reg", &len); + if (reg == NULL || len != 12) { + dev_err(&ofdev->dev, "unable read reg property (%d)\n", len); + return -ENOENT; + } + ndfc->chip_select = reg[0]; + + ndfc->ndfcbase = of_iomap(ofdev->node, 0); if (!ndfc->ndfcbase) { - printk(KERN_ERR "NDFC: ioremap failed\n"); + dev_err(&ofdev->dev, "failed to get memory\n"); return -EIO; } - __raw_writel(settings->ccr_settings, ndfc->ndfcbase + NDFC_CCR); + ccr = NDFC_CCR_BS(ndfc->chip_select); - spin_lock_init(&ndfc->ndfc_control.lock); - init_waitqueue_head(&ndfc->ndfc_control.wq); + /* It is ok if ccr does not exist - just default to 0 */ + reg = of_get_property(ofdev->node, "ccr", NULL); + if (reg) + ccr |= *reg; - platform_set_drvdata(pdev, ndfc); + out_be32(ndfc->ndfcbase + NDFC_CCR, ccr); - printk("NDFC NAND Driver initialized. 
Chip-Rev: 0x%08x\n", - __raw_readl(ndfc->ndfcbase + NDFC_REVID)); + /* Set the bank settings if given */ + reg = of_get_property(ofdev->node, "bank-settings", NULL); + if (reg) { + int offset = NDFC_BCFG0 + (ndfc->chip_select << 2); + out_be32(ndfc->ndfcbase + offset, *reg); + } + + err = ndfc_chip_init(ndfc, ofdev->node); + if (err) { + iounmap(ndfc->ndfcbase); + return err; + } return 0; } -static int ndfc_nand_remove(struct platform_device *pdev) +static int __devexit ndfc_remove(struct of_device *ofdev) { - struct ndfc_controller *ndfc = platform_get_drvdata(pdev); + struct ndfc_controller *ndfc = dev_get_drvdata(&ofdev->dev); - if (atomic_read(&ndfc->childs_active)) - return -EBUSY; + nand_release(&ndfc->mtd); - if (ndfc) { - platform_set_drvdata(pdev, NULL); - iounmap(ndfc_ctrl.ndfcbase); - ndfc_ctrl.ndfcbase = NULL; - } return 0; } -/* driver device registration */ - -static struct platform_driver ndfc_chip_driver = { - .probe = ndfc_chip_probe, - .remove = ndfc_chip_remove, - .driver = { - .name = "ndfc-chip", - .owner = THIS_MODULE, - }, +static const struct of_device_id ndfc_match[] = { + { .compatible = "ibm,ndfc", }, + {} }; +MODULE_DEVICE_TABLE(of, ndfc_match); -static struct platform_driver ndfc_nand_driver = { - .probe = ndfc_nand_probe, - .remove = ndfc_nand_remove, - .driver = { - .name = "ndfc-nand", - .owner = THIS_MODULE, +static struct of_platform_driver ndfc_driver = { + .driver = { + .name = "ndfc", }, + .match_table = ndfc_match, + .probe = ndfc_probe, + .remove = __devexit_p(ndfc_remove), }; static int __init ndfc_nand_init(void) { - int ret; - - spin_lock_init(&ndfc_ctrl.ndfc_control.lock); - init_waitqueue_head(&ndfc_ctrl.ndfc_control.wq); - - ret = platform_driver_register(&ndfc_nand_driver); - if (!ret) - ret = platform_driver_register(&ndfc_chip_driver); - return ret; + return of_register_platform_driver(&ndfc_driver); } static void __exit ndfc_nand_exit(void) { - platform_driver_unregister(&ndfc_chip_driver); - platform_driver_unregister(&ndfc_nand_driver); + of_unregister_platform_driver(&ndfc_driver); } module_init(ndfc_nand_init); @@ -316,6 +315,4 @@ module_exit(ndfc_nand_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Thomas Gleixner <tglx@linutronix.de>"); -MODULE_DESCRIPTION("Platform driver for NDFC"); -MODULE_ALIAS("platform:ndfc-chip"); -MODULE_ALIAS("platform:ndfc-nand"); +MODULE_DESCRIPTION("OF Platform driver for NDFC"); diff --git a/drivers/mtd/nand/pxa3xx_nand.c b/drivers/mtd/nand/pxa3xx_nand.c index fc414449561..cc55cbc2b30 100644 --- a/drivers/mtd/nand/pxa3xx_nand.c +++ b/drivers/mtd/nand/pxa3xx_nand.c @@ -298,7 +298,7 @@ static struct pxa3xx_nand_flash *builtin_flash_types[] = { #define NDTR1_tAR(c) (min((c), 15) << 0) /* convert nano-seconds to nand flash controller clock cycles */ -#define ns2cycle(ns, clk) (int)(((ns) * (clk / 1000000) / 1000) + 1) +#define ns2cycle(ns, clk) (int)(((ns) * (clk / 1000000) / 1000) - 1) static void pxa3xx_nand_set_timing(struct pxa3xx_nand_info *info, const struct pxa3xx_nand_timing *t) @@ -368,14 +368,14 @@ static int prepare_read_prog_cmd(struct pxa3xx_nand_info *info, /* large block, 2 cycles for column address * row address starts from 3rd cycle */ - info->ndcb1 |= (page_addr << 16) | (column & 0xffff); + info->ndcb1 |= page_addr << 16; if (info->row_addr_cycles == 3) info->ndcb2 = (page_addr >> 16) & 0xff; } else /* small block, 1 cycles for column address * row address starts from 2nd cycle */ - info->ndcb1 = (page_addr << 8) | (column & 0xff); + info->ndcb1 = page_addr << 8; if (cmd == cmdset->program) 
info->ndcb0 |= NDCB0_CMD_TYPE(1) | NDCB0_AUTO_RS; diff --git a/drivers/mtd/nand/sharpsl.c b/drivers/mtd/nand/sharpsl.c index 30a518e211b..54ec7542a7b 100644 --- a/drivers/mtd/nand/sharpsl.c +++ b/drivers/mtd/nand/sharpsl.c @@ -2,6 +2,7 @@ * drivers/mtd/nand/sharpsl.c * * Copyright (C) 2004 Richard Purdie + * Copyright (C) 2008 Dmitry Baryshkov * * Based on Sharp's NAND driver sharp_sl.c * @@ -19,22 +20,31 @@ #include <linux/mtd/nand.h> #include <linux/mtd/nand_ecc.h> #include <linux/mtd/partitions.h> +#include <linux/mtd/sharpsl.h> #include <linux/interrupt.h> +#include <linux/platform_device.h> + #include <asm/io.h> #include <mach/hardware.h> #include <asm/mach-types.h> -static void __iomem *sharpsl_io_base; -static int sharpsl_phys_base = 0x0C000000; +struct sharpsl_nand { + struct mtd_info mtd; + struct nand_chip chip; + + void __iomem *io; +}; + +#define mtd_to_sharpsl(_mtd) container_of(_mtd, struct sharpsl_nand, mtd) /* register offset */ -#define ECCLPLB sharpsl_io_base+0x00 /* line parity 7 - 0 bit */ -#define ECCLPUB sharpsl_io_base+0x04 /* line parity 15 - 8 bit */ -#define ECCCP sharpsl_io_base+0x08 /* column parity 5 - 0 bit */ -#define ECCCNTR sharpsl_io_base+0x0C /* ECC byte counter */ -#define ECCCLRR sharpsl_io_base+0x10 /* cleare ECC */ -#define FLASHIO sharpsl_io_base+0x14 /* Flash I/O */ -#define FLASHCTL sharpsl_io_base+0x18 /* Flash Control */ +#define ECCLPLB 0x00 /* line parity 7 - 0 bit */ +#define ECCLPUB 0x04 /* line parity 15 - 8 bit */ +#define ECCCP 0x08 /* column parity 5 - 0 bit */ +#define ECCCNTR 0x0C /* ECC byte counter */ +#define ECCCLRR 0x10 /* cleare ECC */ +#define FLASHIO 0x14 /* Flash I/O */ +#define FLASHCTL 0x18 /* Flash Control */ /* Flash control bit */ #define FLRYBY (1 << 5) @@ -45,35 +55,6 @@ static int sharpsl_phys_base = 0x0C000000; #define FLCE0 (1 << 0) /* - * MTD structure for SharpSL - */ -static struct mtd_info *sharpsl_mtd = NULL; - -/* - * Define partitions for flash device - */ -#define DEFAULT_NUM_PARTITIONS 3 - -static int nr_partitions; -static struct mtd_partition sharpsl_nand_default_partition_info[] = { - { - .name = "System Area", - .offset = 0, - .size = 7 * 1024 * 1024, - }, - { - .name = "Root Filesystem", - .offset = 7 * 1024 * 1024, - .size = 30 * 1024 * 1024, - }, - { - .name = "Home Filesystem", - .offset = MTDPART_OFS_APPEND, - .size = MTDPART_SIZ_FULL, - }, -}; - -/* * hardware specific access to control-lines * ctrl: * NAND_CNE: bit 0 -> ! 
bit 0 & 4 @@ -84,6 +65,7 @@ static struct mtd_partition sharpsl_nand_default_partition_info[] = { static void sharpsl_nand_hwcontrol(struct mtd_info *mtd, int cmd, unsigned int ctrl) { + struct sharpsl_nand *sharpsl = mtd_to_sharpsl(mtd); struct nand_chip *chip = mtd->priv; if (ctrl & NAND_CTRL_CHANGE) { @@ -93,103 +75,97 @@ static void sharpsl_nand_hwcontrol(struct mtd_info *mtd, int cmd, bits ^= 0x11; - writeb((readb(FLASHCTL) & ~0x17) | bits, FLASHCTL); + writeb((readb(sharpsl->io + FLASHCTL) & ~0x17) | bits, sharpsl->io + FLASHCTL); } if (cmd != NAND_CMD_NONE) writeb(cmd, chip->IO_ADDR_W); } -static uint8_t scan_ff_pattern[] = { 0xff, 0xff }; - -static struct nand_bbt_descr sharpsl_bbt = { - .options = 0, - .offs = 4, - .len = 2, - .pattern = scan_ff_pattern -}; - -static struct nand_bbt_descr sharpsl_akita_bbt = { - .options = 0, - .offs = 4, - .len = 1, - .pattern = scan_ff_pattern -}; - -static struct nand_ecclayout akita_oobinfo = { - .eccbytes = 24, - .eccpos = { - 0x5, 0x1, 0x2, 0x3, 0x6, 0x7, 0x15, 0x11, - 0x12, 0x13, 0x16, 0x17, 0x25, 0x21, 0x22, 0x23, - 0x26, 0x27, 0x35, 0x31, 0x32, 0x33, 0x36, 0x37}, - .oobfree = {{0x08, 0x09}} -}; - static int sharpsl_nand_dev_ready(struct mtd_info *mtd) { - return !((readb(FLASHCTL) & FLRYBY) == 0); + struct sharpsl_nand *sharpsl = mtd_to_sharpsl(mtd); + return !((readb(sharpsl->io + FLASHCTL) & FLRYBY) == 0); } static void sharpsl_nand_enable_hwecc(struct mtd_info *mtd, int mode) { - writeb(0, ECCCLRR); + struct sharpsl_nand *sharpsl = mtd_to_sharpsl(mtd); + writeb(0, sharpsl->io + ECCCLRR); } static int sharpsl_nand_calculate_ecc(struct mtd_info *mtd, const u_char * dat, u_char * ecc_code) { - ecc_code[0] = ~readb(ECCLPUB); - ecc_code[1] = ~readb(ECCLPLB); - ecc_code[2] = (~readb(ECCCP) << 2) | 0x03; - return readb(ECCCNTR) != 0; + struct sharpsl_nand *sharpsl = mtd_to_sharpsl(mtd); + ecc_code[0] = ~readb(sharpsl->io + ECCLPUB); + ecc_code[1] = ~readb(sharpsl->io + ECCLPLB); + ecc_code[2] = (~readb(sharpsl->io + ECCCP) << 2) | 0x03; + return readb(sharpsl->io + ECCCNTR) != 0; } #ifdef CONFIG_MTD_PARTITIONS -const char *part_probes[] = { "cmdlinepart", NULL }; +static const char *part_probes[] = { "cmdlinepart", NULL }; #endif /* * Main initialization routine */ -static int __init sharpsl_nand_init(void) +static int __devinit sharpsl_nand_probe(struct platform_device *pdev) { struct nand_chip *this; +#ifdef CONFIG_MTD_PARTITIONS struct mtd_partition *sharpsl_partition_info; + int nr_partitions; +#endif + struct resource *r; int err = 0; + struct sharpsl_nand *sharpsl; + struct sharpsl_nand_platform_data *data = pdev->dev.platform_data; + + if (!data) { + dev_err(&pdev->dev, "no platform data!\n"); + return -EINVAL; + } /* Allocate memory for MTD device structure and private data */ - sharpsl_mtd = kmalloc(sizeof(struct mtd_info) + sizeof(struct nand_chip), GFP_KERNEL); - if (!sharpsl_mtd) { + sharpsl = kzalloc(sizeof(struct sharpsl_nand), GFP_KERNEL); + if (!sharpsl) { printk("Unable to allocate SharpSL NAND MTD device structure.\n"); return -ENOMEM; } + r = platform_get_resource(pdev, IORESOURCE_MEM, 0); + if (!r) { + dev_err(&pdev->dev, "no io memory resource defined!\n"); + err = -ENODEV; + goto err_get_res; + } + /* map physical address */ - sharpsl_io_base = ioremap(sharpsl_phys_base, 0x1000); - if (!sharpsl_io_base) { + sharpsl->io = ioremap(r->start, resource_size(r)); + if (!sharpsl->io) { printk("ioremap to access Sharp SL NAND chip failed\n"); - kfree(sharpsl_mtd); - return -EIO; + err = -EIO; + goto err_ioremap; } /* Get pointer 
to private data */ - this = (struct nand_chip *)(&sharpsl_mtd[1]); - - /* Initialize structures */ - memset(sharpsl_mtd, 0, sizeof(struct mtd_info)); - memset(this, 0, sizeof(struct nand_chip)); + this = (struct nand_chip *)(&sharpsl->chip); /* Link the private data with the MTD structure */ - sharpsl_mtd->priv = this; - sharpsl_mtd->owner = THIS_MODULE; + sharpsl->mtd.priv = this; + sharpsl->mtd.owner = THIS_MODULE; + + platform_set_drvdata(pdev, sharpsl); /* * PXA initialize */ - writeb(readb(FLASHCTL) | FLWP, FLASHCTL); + writeb(readb(sharpsl->io + FLASHCTL) | FLWP, sharpsl->io + FLASHCTL); /* Set address of NAND IO lines */ - this->IO_ADDR_R = FLASHIO; - this->IO_ADDR_W = FLASHIO; + this->IO_ADDR_R = sharpsl->io + FLASHIO; + this->IO_ADDR_W = sharpsl->io + FLASHIO; /* Set address of hardware control function */ this->cmd_ctrl = sharpsl_nand_hwcontrol; this->dev_ready = sharpsl_nand_dev_ready; @@ -199,68 +175,89 @@ static int __init sharpsl_nand_init(void) this->ecc.mode = NAND_ECC_HW; this->ecc.size = 256; this->ecc.bytes = 3; - this->badblock_pattern = &sharpsl_bbt; - if (machine_is_akita() || machine_is_borzoi()) { - this->badblock_pattern = &sharpsl_akita_bbt; - this->ecc.layout = &akita_oobinfo; - } + this->badblock_pattern = data->badblock_pattern; + this->ecc.layout = data->ecc_layout; this->ecc.hwctl = sharpsl_nand_enable_hwecc; this->ecc.calculate = sharpsl_nand_calculate_ecc; this->ecc.correct = nand_correct_data; /* Scan to find existence of the device */ - err = nand_scan(sharpsl_mtd, 1); - if (err) { - iounmap(sharpsl_io_base); - kfree(sharpsl_mtd); - return err; - } + err = nand_scan(&sharpsl->mtd, 1); + if (err) + goto err_scan; /* Register the partitions */ - sharpsl_mtd->name = "sharpsl-nand"; - nr_partitions = parse_mtd_partitions(sharpsl_mtd, part_probes, &sharpsl_partition_info, 0); - + sharpsl->mtd.name = "sharpsl-nand"; +#ifdef CONFIG_MTD_PARTITIONS + nr_partitions = parse_mtd_partitions(&sharpsl->mtd, part_probes, &sharpsl_partition_info, 0); if (nr_partitions <= 0) { - nr_partitions = DEFAULT_NUM_PARTITIONS; - sharpsl_partition_info = sharpsl_nand_default_partition_info; - if (machine_is_poodle()) { - sharpsl_partition_info[1].size = 22 * 1024 * 1024; - } else if (machine_is_corgi() || machine_is_shepherd()) { - sharpsl_partition_info[1].size = 25 * 1024 * 1024; - } else if (machine_is_husky()) { - sharpsl_partition_info[1].size = 53 * 1024 * 1024; - } else if (machine_is_spitz()) { - sharpsl_partition_info[1].size = 5 * 1024 * 1024; - } else if (machine_is_akita()) { - sharpsl_partition_info[1].size = 58 * 1024 * 1024; - } else if (machine_is_borzoi()) { - sharpsl_partition_info[1].size = 32 * 1024 * 1024; - } + nr_partitions = data->nr_partitions; + sharpsl_partition_info = data->partitions; } - add_mtd_partitions(sharpsl_mtd, sharpsl_partition_info, nr_partitions); + if (nr_partitions > 0) + err = add_mtd_partitions(&sharpsl->mtd, sharpsl_partition_info, nr_partitions); + else +#endif + err = add_mtd_device(&sharpsl->mtd); + if (err) + goto err_add; /* Return happy */ return 0; -} -module_init(sharpsl_nand_init); +err_add: + nand_release(&sharpsl->mtd); + +err_scan: + platform_set_drvdata(pdev, NULL); + iounmap(sharpsl->io); +err_ioremap: +err_get_res: + kfree(sharpsl); + return err; +} /* * Clean up routine */ -static void __exit sharpsl_nand_cleanup(void) +static int __devexit sharpsl_nand_remove(struct platform_device *pdev) { + struct sharpsl_nand *sharpsl = platform_get_drvdata(pdev); + /* Release resources, unregister device */ - 
nand_release(sharpsl_mtd); + nand_release(&sharpsl->mtd); - iounmap(sharpsl_io_base); + platform_set_drvdata(pdev, NULL); + + iounmap(sharpsl->io); /* Free the MTD device structure */ - kfree(sharpsl_mtd); + kfree(sharpsl); + + return 0; +} + +static struct platform_driver sharpsl_nand_driver = { + .driver = { + .name = "sharpsl-nand", + .owner = THIS_MODULE, + }, + .probe = sharpsl_nand_probe, + .remove = __devexit_p(sharpsl_nand_remove), +}; + +static int __init sharpsl_nand_init(void) +{ + return platform_driver_register(&sharpsl_nand_driver); } +module_init(sharpsl_nand_init); -module_exit(sharpsl_nand_cleanup); +static void __exit sharpsl_nand_exit(void) +{ + platform_driver_unregister(&sharpsl_nand_driver); +} +module_exit(sharpsl_nand_exit); MODULE_LICENSE("GPL"); MODULE_AUTHOR("Richard Purdie <rpurdie@rpsys.net>"); diff --git a/drivers/mtd/nftlcore.c b/drivers/mtd/nftlcore.c index 320b929abe7..d1c4546513f 100644 --- a/drivers/mtd/nftlcore.c +++ b/drivers/mtd/nftlcore.c @@ -39,7 +39,7 @@ static void nftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) struct NFTLrecord *nftl; unsigned long temp; - if (mtd->type != MTD_NANDFLASH) + if (mtd->type != MTD_NANDFLASH || mtd->size > UINT_MAX) return; /* OK, this is moderately ugly. But probably safe. Alternatives? */ if (memcmp(mtd->name, "DiskOnChip", 10)) diff --git a/drivers/mtd/nftlmount.c b/drivers/mtd/nftlmount.c index ccc4f209fbb..8b22b1836e9 100644 --- a/drivers/mtd/nftlmount.c +++ b/drivers/mtd/nftlmount.c @@ -51,7 +51,7 @@ static int find_boot_record(struct NFTLrecord *nftl) the mtd device accordingly. We could even get rid of nftl->EraseSize if there were any point in doing so. */ nftl->EraseSize = nftl->mbd.mtd->erasesize; - nftl->nb_blocks = nftl->mbd.mtd->size / nftl->EraseSize; + nftl->nb_blocks = (u32)nftl->mbd.mtd->size / nftl->EraseSize; nftl->MediaUnit = BLOCK_NIL; nftl->SpareMediaUnit = BLOCK_NIL; @@ -168,7 +168,7 @@ device is already correct. 
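
The nftl hunks here (and the rfd_ftl and ssfdc hunks below) all apply the same rule for legacy translation layers that keep 32-bit offsets internally: refuse to bind to an MTD larger than 4 GiB up front, after which the narrowing casts on mtd->size are safe. Condensed into one illustrative helper (the name is made up):

        /* Returns the number of erase blocks, or -EINVAL if the device is
         * too large for a translation layer with 32-bit internal offsets. */
        static int legacy_ftl_block_count(struct mtd_info *mtd)
        {
                if (mtd->size > UINT_MAX)
                        return -EINVAL;

                return (u32)mtd->size / mtd->erasesize;
        }
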
printk(KERN_NOTICE "WARNING: Support for NFTL with UnitSizeFactor 0x%02x is experimental\n", mh->UnitSizeFactor); nftl->EraseSize = nftl->mbd.mtd->erasesize << (0xff - mh->UnitSizeFactor); - nftl->nb_blocks = nftl->mbd.mtd->size / nftl->EraseSize; + nftl->nb_blocks = (u32)nftl->mbd.mtd->size / nftl->EraseSize; } #endif nftl->nb_boot_blocks = le16_to_cpu(mh->FirstPhysicalEUN); diff --git a/drivers/mtd/onenand/onenand_base.c b/drivers/mtd/onenand/onenand_base.c index 90ed319f26e..529af271db1 100644 --- a/drivers/mtd/onenand/onenand_base.c +++ b/drivers/mtd/onenand/onenand_base.c @@ -1772,7 +1772,7 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) int len; int ret = 0; - DEBUG(MTD_DEBUG_LEVEL3, "onenand_erase: start = 0x%08x, len = %i\n", (unsigned int) instr->addr, (unsigned int) instr->len); + DEBUG(MTD_DEBUG_LEVEL3, "onenand_erase: start = 0x%012llx, len = %llu\n", (unsigned long long) instr->addr, (unsigned long long) instr->len); block_size = (1 << this->erase_shift); @@ -1810,7 +1810,7 @@ static int onenand_erase(struct mtd_info *mtd, struct erase_info *instr) /* Check if we have a bad block, we do not erase bad blocks */ if (onenand_block_isbad_nolock(mtd, addr, 0)) { - printk (KERN_WARNING "onenand_erase: attempt to erase a bad block at addr 0x%08x\n", (unsigned int) addr); + printk (KERN_WARNING "onenand_erase: attempt to erase a bad block at addr 0x%012llx\n", (unsigned long long) addr); instr->state = MTD_ERASE_FAILED; goto erase_exit; } @@ -2029,7 +2029,7 @@ static int onenand_do_lock_cmd(struct mtd_info *mtd, loff_t ofs, size_t len, int * * Lock one or more blocks */ -static int onenand_lock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int onenand_lock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; @@ -2047,7 +2047,7 @@ static int onenand_lock(struct mtd_info *mtd, loff_t ofs, size_t len) * * Unlock one or more blocks */ -static int onenand_unlock(struct mtd_info *mtd, loff_t ofs, size_t len) +static int onenand_unlock(struct mtd_info *mtd, loff_t ofs, uint64_t len) { int ret; diff --git a/drivers/mtd/rfd_ftl.c b/drivers/mtd/rfd_ftl.c index e538c0a72ab..d2aa9c46530 100644 --- a/drivers/mtd/rfd_ftl.c +++ b/drivers/mtd/rfd_ftl.c @@ -21,8 +21,6 @@ #include <asm/types.h> -#define const_cpu_to_le16 __constant_cpu_to_le16 - static int block_size = 0; module_param(block_size, int, 0); MODULE_PARM_DESC(block_size, "Block size to use by RFD, defaults to erase unit size"); @@ -156,7 +154,7 @@ static int scan_header(struct partition *part) size_t retlen; sectors_per_block = part->block_size / SECTOR_SIZE; - part->total_blocks = part->mbd.mtd->size / part->block_size; + part->total_blocks = (u32)part->mbd.mtd->size / part->block_size; if (part->total_blocks < 2) return -ENOENT; @@ -276,16 +274,17 @@ static void erase_callback(struct erase_info *erase) part = (struct partition*)erase->priv; - i = erase->addr / part->block_size; - if (i >= part->total_blocks || part->blocks[i].offset != erase->addr) { - printk(KERN_ERR PREFIX "erase callback for unknown offset %x " - "on '%s'\n", erase->addr, part->mbd.mtd->name); + i = (u32)erase->addr / part->block_size; + if (i >= part->total_blocks || part->blocks[i].offset != erase->addr || + erase->addr > UINT_MAX) { + printk(KERN_ERR PREFIX "erase callback for unknown offset %llx " + "on '%s'\n", (unsigned long long)erase->addr, part->mbd.mtd->name); return; } if (erase->state != MTD_ERASE_DONE) { - printk(KERN_WARNING PREFIX "erase failed at 0x%x on '%s', " - "state %d\n", erase->addr, + 
printk(KERN_WARNING PREFIX "erase failed at 0x%llx on '%s', " + "state %d\n", (unsigned long long)erase->addr, part->mbd.mtd->name, erase->state); part->blocks[i].state = BLOCK_FAILED; @@ -297,7 +296,7 @@ static void erase_callback(struct erase_info *erase) return; } - magic = const_cpu_to_le16(RFD_MAGIC); + magic = cpu_to_le16(RFD_MAGIC); part->blocks[i].state = BLOCK_ERASED; part->blocks[i].free_sectors = part->data_sectors_per_block; @@ -345,9 +344,9 @@ static int erase_block(struct partition *part, int block) rc = part->mbd.mtd->erase(part->mbd.mtd, erase); if (rc) { - printk(KERN_ERR PREFIX "erase of region %x,%x on '%s' " - "failed\n", erase->addr, erase->len, - part->mbd.mtd->name); + printk(KERN_ERR PREFIX "erase of region %llx,%llx on '%s' " + "failed\n", (unsigned long long)erase->addr, + (unsigned long long)erase->len, part->mbd.mtd->name); kfree(erase); } @@ -587,7 +586,7 @@ static int mark_sector_deleted(struct partition *part, u_long old_addr) int block, offset, rc; u_long addr; size_t retlen; - u16 del = const_cpu_to_le16(SECTOR_DELETED); + u16 del = cpu_to_le16(SECTOR_DELETED); block = old_addr / part->block_size; offset = (old_addr % part->block_size) / SECTOR_SIZE - @@ -763,7 +762,7 @@ static void rfd_ftl_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) { struct partition *part; - if (mtd->type != MTD_NORFLASH) + if (mtd->type != MTD_NORFLASH || mtd->size > UINT_MAX) return; part = kzalloc(sizeof(struct partition), GFP_KERNEL); diff --git a/drivers/mtd/ssfdc.c b/drivers/mtd/ssfdc.c index 33a5d6ed6f1..3f67e00d98e 100644 --- a/drivers/mtd/ssfdc.c +++ b/drivers/mtd/ssfdc.c @@ -294,7 +294,8 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) int cis_sector; /* Check for small page NAND flash */ - if (mtd->type != MTD_NANDFLASH || mtd->oobsize != OOB_SIZE) + if (mtd->type != MTD_NANDFLASH || mtd->oobsize != OOB_SIZE || + mtd->size > UINT_MAX) return; /* Check for SSDFC format by reading CIS/IDI sector */ @@ -316,7 +317,7 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) ssfdc->cis_block = cis_sector / (mtd->erasesize >> SECTOR_SHIFT); ssfdc->erase_size = mtd->erasesize; - ssfdc->map_len = mtd->size / mtd->erasesize; + ssfdc->map_len = (u32)mtd->size / mtd->erasesize; DEBUG(MTD_DEBUG_LEVEL1, "SSFDC_RO: cis_block=%d,erase_size=%d,map_len=%d,n_zones=%d\n", @@ -327,7 +328,7 @@ static void ssfdcr_add_mtd(struct mtd_blktrans_ops *tr, struct mtd_info *mtd) ssfdc->heads = 16; ssfdc->sectors = 32; get_chs(mtd->size, NULL, &ssfdc->heads, &ssfdc->sectors); - ssfdc->cylinders = (unsigned short)((mtd->size >> SECTOR_SHIFT) / + ssfdc->cylinders = (unsigned short)(((u32)mtd->size >> SECTOR_SHIFT) / ((long)ssfdc->sectors * (long)ssfdc->heads)); DEBUG(MTD_DEBUG_LEVEL1, "SSFDC_RO: using C:%d H:%d S:%d == %ld sects\n", diff --git a/drivers/mtd/tests/Makefile b/drivers/mtd/tests/Makefile new file mode 100644 index 00000000000..c1d50133500 --- /dev/null +++ b/drivers/mtd/tests/Makefile @@ -0,0 +1,7 @@ +obj-$(CONFIG_MTD_TESTS) += mtd_oobtest.o +obj-$(CONFIG_MTD_TESTS) += mtd_pagetest.o +obj-$(CONFIG_MTD_TESTS) += mtd_readtest.o +obj-$(CONFIG_MTD_TESTS) += mtd_speedtest.o +obj-$(CONFIG_MTD_TESTS) += mtd_stresstest.o +obj-$(CONFIG_MTD_TESTS) += mtd_subpagetest.o +obj-$(CONFIG_MTD_TESTS) += mtd_torturetest.o diff --git a/drivers/mtd/tests/mtd_oobtest.c b/drivers/mtd/tests/mtd_oobtest.c new file mode 100644 index 00000000000..afbc3f8126d --- /dev/null +++ b/drivers/mtd/tests/mtd_oobtest.c @@ -0,0 +1,742 @@ +/* + * Copyright (C) 
2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test OOB read and write on MTD device. + * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <asm/div64.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_oobtest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *readbuf; +static unsigned char *writebuf; +static unsigned char *bbt; + +static int ebcnt; +static int pgcnt; +static int errcnt; +static int use_offset; +static int use_len; +static int use_len_max; +static int vary_offset; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static void set_random_data(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; ++i) + buf[i] = simple_rand(); +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int erase_whole_device(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "erasing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + return err; + cond_resched(); + } + printk(PRINT_PREF "erased %u eraseblocks\n", i); + return 0; +} + +static void do_vary_offset(void) +{ + use_len -= 1; + if (use_len < 1) { + use_offset += 1; + if (use_offset >= use_len_max) + use_offset = 0; + use_len = use_len_max - use_offset; + } +} + +static int write_eraseblock(int ebnum) +{ + int i; + struct mtd_oob_ops ops; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + for (i = 0; i < pgcnt; ++i, addr += mtd->writesize) { + set_random_data(writebuf, use_len); + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = use_len; + ops.oobretlen = 0; + ops.ooboffs = use_offset; + ops.datbuf = 0; + ops.oobbuf = writebuf; + err = mtd->write_oob(mtd, addr, &ops); + if (err || ops.oobretlen != use_len) { + printk(PRINT_PREF "error: writeoob failed at %#llx\n", + (long long)addr); + printk(PRINT_PREF "error: use_len %d, use_offset %d\n", + use_len, use_offset); + errcnt += 1; + return err ? 
err : -1; + } + if (vary_offset) + do_vary_offset(); + } + + return err; +} + +static int write_whole_device(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "writing OOBs of whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (err) + return err; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + return 0; +} + +static int verify_eraseblock(int ebnum) +{ + int i; + struct mtd_oob_ops ops; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + for (i = 0; i < pgcnt; ++i, addr += mtd->writesize) { + set_random_data(writebuf, use_len); + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = use_len; + ops.oobretlen = 0; + ops.ooboffs = use_offset; + ops.datbuf = 0; + ops.oobbuf = readbuf; + err = mtd->read_oob(mtd, addr, &ops); + if (err || ops.oobretlen != use_len) { + printk(PRINT_PREF "error: readoob failed at %#llx\n", + (long long)addr); + errcnt += 1; + return err ? err : -1; + } + if (memcmp(readbuf, writebuf, use_len)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too many errors\n"); + return -1; + } + } + if (use_offset != 0 || use_len < mtd->ecclayout->oobavail) { + int k; + + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = readbuf; + err = mtd->read_oob(mtd, addr, &ops); + if (err || ops.oobretlen != mtd->ecclayout->oobavail) { + printk(PRINT_PREF "error: readoob failed at " + "%#llx\n", (long long)addr); + errcnt += 1; + return err ? err : -1; + } + if (memcmp(readbuf + use_offset, writebuf, use_len)) { + printk(PRINT_PREF "error: verify failed at " + "%#llx\n", (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too many " + "errors\n"); + return -1; + } + } + for (k = 0; k < use_offset; ++k) + if (readbuf[k] != 0xff) { + printk(PRINT_PREF "error: verify 0xff " + "failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too " + "many errors\n"); + return -1; + } + } + for (k = use_offset + use_len; + k < mtd->ecclayout->oobavail; ++k) + if (readbuf[k] != 0xff) { + printk(PRINT_PREF "error: verify 0xff " + "failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too " + "many errors\n"); + return -1; + } + } + } + if (vary_offset) + do_vary_offset(); + } + return err; +} + +static int verify_eraseblock_in_one_go(int ebnum) +{ + struct mtd_oob_ops ops; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + size_t len = mtd->ecclayout->oobavail * pgcnt; + + set_random_data(writebuf, len); + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = len; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = readbuf; + err = mtd->read_oob(mtd, addr, &ops); + if (err || ops.oobretlen != len) { + printk(PRINT_PREF "error: readoob failed at %#llx\n", + (long long)addr); + errcnt += 1; + return err ? 
err : -1; + } + if (memcmp(readbuf, writebuf, len)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too many errors\n"); + return -1; + } + } + + return err; +} + +static int verify_all_eraseblocks(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock(i); + if (err) + return err; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + return 0; +} + +static int is_block_bad(int ebnum) +{ + int ret; + loff_t addr = ebnum * mtd->erasesize; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_oobtest_init(void) +{ + int err = 0; + unsigned int i; + uint64_t tmp; + struct mtd_oob_ops ops; + loff_t addr = 0, addr0; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->type != MTD_NANDFLASH) { + printk(PRINT_PREF "this test requires NAND flash\n"); + goto out; + } + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + mtd->writesize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + mtd->erasesize = mtd->erasesize; + readbuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!readbuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + writebuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!writebuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + use_offset = 0; + use_len = mtd->ecclayout->oobavail; + use_len_max = mtd->ecclayout->oobavail; + vary_offset = 0; + + /* First test: write all OOB, read it back and verify */ + printk(PRINT_PREF "test 1 of 5\n"); + + err = erase_whole_device(); + if (err) + goto out; + + simple_srand(1); + err = write_whole_device(); + if (err) + goto out; + + simple_srand(1); + err = verify_all_eraseblocks(); + if (err) + goto out; + + /* + * Second test: write all OOB, a block at a time, read it back and + * verify. 
+ */ + printk(PRINT_PREF "test 2 of 5\n"); + + err = erase_whole_device(); + if (err) + goto out; + + simple_srand(3); + err = write_whole_device(); + if (err) + goto out; + + /* Check all eraseblocks */ + simple_srand(3); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock_in_one_go(i); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + /* + * Third test: write OOB at varying offsets and lengths, read it back + * and verify. + */ + printk(PRINT_PREF "test 3 of 5\n"); + + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks */ + use_offset = 0; + use_len = mtd->ecclayout->oobavail; + use_len_max = mtd->ecclayout->oobavail; + vary_offset = 1; + simple_srand(5); + printk(PRINT_PREF "writing OOBs of whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + /* Check all eraseblocks */ + use_offset = 0; + use_len = mtd->ecclayout->oobavail; + use_len_max = mtd->ecclayout->oobavail; + vary_offset = 1; + simple_srand(5); + err = verify_all_eraseblocks(); + if (err) + goto out; + + use_offset = 0; + use_len = mtd->ecclayout->oobavail; + use_len_max = mtd->ecclayout->oobavail; + vary_offset = 0; + + /* Fourth test: try to write off end of device */ + printk(PRINT_PREF "test 4 of 5\n"); + + err = erase_whole_device(); + if (err) + goto out; + + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) + addr0 += mtd->erasesize; + + /* Attempt to write off end of OOB */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = 1; + ops.oobretlen = 0; + ops.ooboffs = mtd->ecclayout->oobavail; + ops.datbuf = 0; + ops.oobbuf = writebuf; + printk(PRINT_PREF "attempting to start write past end of OOB\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->write_oob(mtd, addr0, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: can write past end of OOB\n"); + errcnt += 1; + } + + /* Attempt to read off end of OOB */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = 1; + ops.oobretlen = 0; + ops.ooboffs = mtd->ecclayout->oobavail; + ops.datbuf = 0; + ops.oobbuf = readbuf; + printk(PRINT_PREF "attempting to start read past end of OOB\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->read_oob(mtd, addr0, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: can read past end of OOB\n"); + errcnt += 1; + } + + if (bbt[ebcnt - 1]) + printk(PRINT_PREF "skipping end of device tests because last " + "block is bad\n"); + else { + /* Attempt to write off end of device */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail + 1; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = writebuf; + printk(PRINT_PREF "attempting to write past end of device\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->write_oob(mtd, mtd->size - mtd->writesize, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: wrote past 
end of device\n"); + errcnt += 1; + } + + /* Attempt to read off end of device */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail + 1; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = readbuf; + printk(PRINT_PREF "attempting to read past end of device\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->read_oob(mtd, mtd->size - mtd->writesize, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: read past end of device\n"); + errcnt += 1; + } + + err = erase_eraseblock(ebcnt - 1); + if (err) + goto out; + + /* Attempt to write off end of device */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail; + ops.oobretlen = 0; + ops.ooboffs = 1; + ops.datbuf = 0; + ops.oobbuf = writebuf; + printk(PRINT_PREF "attempting to write past end of device\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->write_oob(mtd, mtd->size - mtd->writesize, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: wrote past end of device\n"); + errcnt += 1; + } + + /* Attempt to read off end of device */ + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail; + ops.oobretlen = 0; + ops.ooboffs = 1; + ops.datbuf = 0; + ops.oobbuf = readbuf; + printk(PRINT_PREF "attempting to read past end of device\n"); + printk(PRINT_PREF "an error is expected...\n"); + err = mtd->read_oob(mtd, mtd->size - mtd->writesize, &ops); + if (err) { + printk(PRINT_PREF "error occurred as expected\n"); + err = 0; + } else { + printk(PRINT_PREF "error: read past end of device\n"); + errcnt += 1; + } + } + + /* Fifth test: write / read across block boundaries */ + printk(PRINT_PREF "test 5 of 5\n"); + + /* Erase all eraseblocks */ + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks */ + simple_srand(11); + printk(PRINT_PREF "writing OOBs of whole device\n"); + for (i = 0; i < ebcnt - 1; ++i) { + int cnt = 2; + int pg; + size_t sz = mtd->ecclayout->oobavail; + if (bbt[i] || bbt[i + 1]) + continue; + addr = (i + 1) * mtd->erasesize - mtd->writesize; + for (pg = 0; pg < cnt; ++pg) { + set_random_data(writebuf, sz); + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = sz; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = writebuf; + err = mtd->write_oob(mtd, addr, &ops); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock " + "%u\n", i); + cond_resched(); + addr += mtd->writesize; + } + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + /* Check all eraseblocks */ + simple_srand(11); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt - 1; ++i) { + if (bbt[i] || bbt[i + 1]) + continue; + set_random_data(writebuf, mtd->ecclayout->oobavail * 2); + addr = (i + 1) * mtd->erasesize - mtd->writesize; + ops.mode = MTD_OOB_AUTO; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->ecclayout->oobavail * 2; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = readbuf; + err = mtd->read_oob(mtd, addr, &ops); + if (err) + goto out; + if (memcmp(readbuf, writebuf, mtd->ecclayout->oobavail * 2)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + if (errcnt > 1000) { + printk(PRINT_PREF "error: too many errors\n"); + goto out; 
+ } + } + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + printk(PRINT_PREF "finished with %d errors\n", errcnt); +out: + kfree(bbt); + kfree(writebuf); + kfree(readbuf); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_oobtest_init); + +static void __exit mtd_oobtest_exit(void) +{ + return; +} +module_exit(mtd_oobtest_exit); + +MODULE_DESCRIPTION("Out-of-band test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_pagetest.c b/drivers/mtd/tests/mtd_pagetest.c new file mode 100644 index 00000000000..9648818b9e2 --- /dev/null +++ b/drivers/mtd/tests/mtd_pagetest.c @@ -0,0 +1,632 @@ +/* + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test page read and write on MTD device. + * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <asm/div64.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_pagetest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *twopages; +static unsigned char *writebuf; +static unsigned char *boundary; +static unsigned char *bbt; + +static int pgsize; +static int bufsize; +static int ebcnt; +static int pgcnt; +static int errcnt; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static void set_random_data(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; ++i) + buf[i] = simple_rand(); +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int write_eraseblock(int ebnum) +{ + int err = 0; + size_t written = 0; + loff_t addr = ebnum * mtd->erasesize; + + set_random_data(writebuf, mtd->erasesize); + cond_resched(); + err = mtd->write(mtd, addr, mtd->erasesize, &written, writebuf); + if (err || written != mtd->erasesize) + printk(PRINT_PREF "error: write 
failed at %#llx\n", + (long long)addr); + + return err; +} + +static int verify_eraseblock(int ebnum) +{ + uint32_t j; + size_t read = 0; + int err = 0, i; + loff_t addr0, addrn; + loff_t addr = ebnum * mtd->erasesize; + + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) + addr0 += mtd->erasesize; + + addrn = mtd->size; + for (i = 0; bbt[ebcnt - i - 1] && i < ebcnt; ++i) + addrn -= mtd->erasesize; + + set_random_data(writebuf, mtd->erasesize); + for (j = 0; j < pgcnt - 1; ++j, addr += pgsize) { + /* Do a read to set the internal dataRAMs to different data */ + err = mtd->read(mtd, addr0, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err; + } + err = mtd->read(mtd, addrn - bufsize, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)(addrn - bufsize)); + return err; + } + memset(twopages, 0, bufsize); + read = 0; + err = mtd->read(mtd, addr, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + break; + } + if (memcmp(twopages, writebuf + (j * pgsize), bufsize)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + } + } + /* Check boundary between eraseblocks */ + if (addr <= addrn - pgsize - pgsize && !bbt[ebnum + 1]) { + unsigned long oldnext = next; + /* Do a read to set the internal dataRAMs to different data */ + err = mtd->read(mtd, addr0, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err; + } + err = mtd->read(mtd, addrn - bufsize, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)(addrn - bufsize)); + return err; + } + memset(twopages, 0, bufsize); + read = 0; + err = mtd->read(mtd, addr, bufsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != bufsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + return err; + } + memcpy(boundary, writebuf + mtd->erasesize - pgsize, pgsize); + set_random_data(boundary + pgsize, pgsize); + if (memcmp(twopages, boundary, bufsize)) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + } + next = oldnext; + } + return err; +} + +static int crosstest(void) +{ + size_t read = 0; + int err = 0, i; + loff_t addr, addr0, addrn; + unsigned char *pp1, *pp2, *pp3, *pp4; + + printk(PRINT_PREF "crosstest\n"); + pp1 = kmalloc(pgsize * 4, GFP_KERNEL); + if (!pp1) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + pp2 = pp1 + pgsize; + pp3 = pp2 + pgsize; + pp4 = pp3 + pgsize; + memset(pp1, 0, pgsize * 4); + + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) + addr0 += mtd->erasesize; + + addrn = mtd->size; + for (i = 0; bbt[ebcnt - i - 1] && i < ebcnt; ++i) + addrn -= mtd->erasesize; + + /* Read 2nd-to-last page to pp1 */ + read = 0; + addr = addrn - pgsize - pgsize; + err = mtd->read(mtd, addr, pgsize, &read, pp1); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* Read 3rd-to-last page to 
pp1 */ + read = 0; + addr = addrn - pgsize - pgsize - pgsize; + err = mtd->read(mtd, addr, pgsize, &read, pp1); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* Read first page to pp2 */ + read = 0; + addr = addr0; + printk(PRINT_PREF "reading page at %#llx\n", (long long)addr); + err = mtd->read(mtd, addr, pgsize, &read, pp2); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* Read last page to pp3 */ + read = 0; + addr = addrn - pgsize; + printk(PRINT_PREF "reading page at %#llx\n", (long long)addr); + err = mtd->read(mtd, addr, pgsize, &read, pp3); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* Read first page again to pp4 */ + read = 0; + addr = addr0; + printk(PRINT_PREF "reading page at %#llx\n", (long long)addr); + err = mtd->read(mtd, addr, pgsize, &read, pp4); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + kfree(pp1); + return err; + } + + /* pp2 and pp4 should be the same */ + printk(PRINT_PREF "verifying pages read at %#llx match\n", + (long long)addr0); + if (memcmp(pp2, pp4, pgsize)) { + printk(PRINT_PREF "verify failed!\n"); + errcnt += 1; + } else if (!err) + printk(PRINT_PREF "crosstest ok\n"); + kfree(pp1); + return err; +} + +static int erasecrosstest(void) +{ + size_t read = 0, written = 0; + int err = 0, i, ebnum, ok = 1, ebnum2; + loff_t addr0; + char *readbuf = twopages; + + printk(PRINT_PREF "erasecrosstest\n"); + + ebnum = 0; + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) { + addr0 += mtd->erasesize; + ebnum += 1; + } + + ebnum2 = ebcnt - 1; + while (ebnum2 && bbt[ebnum2]) + ebnum2 -= 1; + + printk(PRINT_PREF "erasing block %d\n", ebnum); + err = erase_eraseblock(ebnum); + if (err) + return err; + + printk(PRINT_PREF "writing 1st page of block %d\n", ebnum); + set_random_data(writebuf, pgsize); + strcpy(writebuf, "There is no data like this!"); + err = mtd->write(mtd, addr0, pgsize, &written, writebuf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "reading 1st page of block %d\n", ebnum); + memset(readbuf, 0, pgsize); + err = mtd->read(mtd, addr0, pgsize, &read, readbuf); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "verifying 1st page of block %d\n", ebnum); + if (memcmp(writebuf, readbuf, pgsize)) { + printk(PRINT_PREF "verify failed!\n"); + errcnt += 1; + ok = 0; + return err; + } + + printk(PRINT_PREF "erasing block %d\n", ebnum); + err = erase_eraseblock(ebnum); + if (err) + return err; + + printk(PRINT_PREF "writing 1st page of block %d\n", ebnum); + set_random_data(writebuf, pgsize); + strcpy(writebuf, "There is no data like this!"); + err = mtd->write(mtd, addr0, pgsize, &written, writebuf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr0); + return err ? 
err : -1; + } + + printk(PRINT_PREF "erasing block %d\n", ebnum2); + err = erase_eraseblock(ebnum2); + if (err) + return err; + + printk(PRINT_PREF "reading 1st page of block %d\n", ebnum); + memset(readbuf, 0, pgsize); + err = mtd->read(mtd, addr0, pgsize, &read, readbuf); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "verifying 1st page of block %d\n", ebnum); + if (memcmp(writebuf, readbuf, pgsize)) { + printk(PRINT_PREF "verify failed!\n"); + errcnt += 1; + ok = 0; + } + + if (ok && !err) + printk(PRINT_PREF "erasecrosstest ok\n"); + return err; +} + +static int erasetest(void) +{ + size_t read = 0, written = 0; + int err = 0, i, ebnum, ok = 1; + loff_t addr0; + + printk(PRINT_PREF "erasetest\n"); + + ebnum = 0; + addr0 = 0; + for (i = 0; bbt[i] && i < ebcnt; ++i) { + addr0 += mtd->erasesize; + ebnum += 1; + } + + printk(PRINT_PREF "erasing block %d\n", ebnum); + err = erase_eraseblock(ebnum); + if (err) + return err; + + printk(PRINT_PREF "writing 1st page of block %d\n", ebnum); + set_random_data(writebuf, pgsize); + err = mtd->write(mtd, addr0, pgsize, &written, writebuf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "erasing block %d\n", ebnum); + err = erase_eraseblock(ebnum); + if (err) + return err; + + printk(PRINT_PREF "reading 1st page of block %d\n", ebnum); + err = mtd->read(mtd, addr0, pgsize, &read, twopages); + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr0); + return err ? err : -1; + } + + printk(PRINT_PREF "verifying 1st page of block %d is all 0xff\n", + ebnum); + for (i = 0; i < pgsize; ++i) + if (twopages[i] != 0xff) { + printk(PRINT_PREF "verifying all 0xff failed at %d\n", + i); + errcnt += 1; + ok = 0; + break; + } + + if (ok && !err) + printk(PRINT_PREF "erasetest ok\n"); + + return err; +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 
1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_pagetest_init(void) +{ + int err = 0; + uint64_t tmp; + uint32_t i; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->type != MTD_NANDFLASH) { + printk(PRINT_PREF "this test requires NAND flash\n"); + goto out; + } + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + pgsize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + bufsize = pgsize * 2; + writebuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!writebuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + twopages = kmalloc(bufsize, GFP_KERNEL); + if (!twopages) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + boundary = kmalloc(bufsize, GFP_KERNEL); + if (!boundary) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + /* Erase all eraseblocks */ + printk(PRINT_PREF "erasing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + printk(PRINT_PREF "erased %u eraseblocks\n", i); + + /* Write all eraseblocks */ + simple_srand(1); + printk(PRINT_PREF "writing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + /* Check all eraseblocks */ + simple_srand(1); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock(i); + if (err) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + err = crosstest(); + if (err) + goto out; + + err = erasecrosstest(); + if (err) + goto out; + + err = erasetest(); + if (err) + goto out; + + printk(PRINT_PREF "finished with %d errors\n", errcnt); +out: + + kfree(bbt); + kfree(boundary); + kfree(twopages); + kfree(writebuf); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_pagetest_init); + +static void __exit mtd_pagetest_exit(void) +{ + return; +} +module_exit(mtd_pagetest_exit); + +MODULE_DESCRIPTION("NAND page test"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_readtest.c b/drivers/mtd/tests/mtd_readtest.c new file mode 100644 index 00000000000..645e77fdc63 --- /dev/null +++ b/drivers/mtd/tests/mtd_readtest.c @@ -0,0 +1,253 @@ +/* + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General 
Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Check MTD device read. + * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_readtest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *iobuf; +static unsigned char *iobuf1; +static unsigned char *bbt; + +static int pgsize; +static int ebcnt; +static int pgcnt; + +static int read_eraseblock_by_page(int ebnum) +{ + size_t read = 0; + int i, ret, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + void *oobbuf = iobuf1; + + for (i = 0; i < pgcnt; i++) { + memset(buf, 0 , pgcnt); + ret = mtd->read(mtd, addr, pgsize, &read, buf); + if (ret == -EUCLEAN) + ret = 0; + if (ret || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + if (!err) + err = ret; + if (!err) + err = -EINVAL; + } + if (mtd->oobsize) { + struct mtd_oob_ops ops; + + ops.mode = MTD_OOB_PLACE; + ops.len = 0; + ops.retlen = 0; + ops.ooblen = mtd->oobsize; + ops.oobretlen = 0; + ops.ooboffs = 0; + ops.datbuf = 0; + ops.oobbuf = oobbuf; + ret = mtd->read_oob(mtd, addr, &ops); + if (ret || ops.oobretlen != mtd->oobsize) { + printk(PRINT_PREF "error: read oob failed at " + "%#llx\n", (long long)addr); + if (!err) + err = ret; + if (!err) + err = -EINVAL; + } + oobbuf += mtd->oobsize; + } + addr += pgsize; + buf += pgsize; + } + + return err; +} + +static void dump_eraseblock(int ebnum) +{ + int i, j, n; + char line[128]; + int pg, oob; + + printk(PRINT_PREF "dumping eraseblock %d\n", ebnum); + n = mtd->erasesize; + for (i = 0; i < n;) { + char *p = line; + + p += sprintf(p, "%05x: ", i); + for (j = 0; j < 32 && i < n; j++, i++) + p += sprintf(p, "%02x", (unsigned int)iobuf[i]); + printk(KERN_CRIT "%s\n", line); + cond_resched(); + } + if (!mtd->oobsize) + return; + printk(PRINT_PREF "dumping oob from eraseblock %d\n", ebnum); + n = mtd->oobsize; + for (pg = 0, i = 0; pg < pgcnt; pg++) + for (oob = 0; oob < n;) { + char *p = line; + + p += sprintf(p, "%05x: ", i); + for (j = 0; j < 32 && oob < n; j++, oob++, i++) + p += sprintf(p, "%02x", + (unsigned int)iobuf1[i]); + printk(KERN_CRIT "%s\n", line); + cond_resched(); + } +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 
1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_readtest_init(void) +{ + uint64_t tmp; + int err, i; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: Cannot get MTD device\n"); + return err; + } + + if (mtd->writesize == 1) { + printk(PRINT_PREF "not NAND flash, assume page size is 512 " + "bytes.\n"); + pgsize = 512; + } else + pgsize = mtd->writesize; + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + pgsize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + iobuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!iobuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + iobuf1 = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!iobuf1) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + /* Read all eraseblocks 1 page at a time */ + printk(PRINT_PREF "testing page read\n"); + for (i = 0; i < ebcnt; ++i) { + int ret; + + if (bbt[i]) + continue; + ret = read_eraseblock_by_page(i); + if (ret) { + dump_eraseblock(i); + if (!err) + err = ret; + } + cond_resched(); + } + + if (err) + printk(PRINT_PREF "finished with errors\n"); + else + printk(PRINT_PREF "finished\n"); + +out: + + kfree(iobuf); + kfree(iobuf1); + kfree(bbt); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_readtest_init); + +static void __exit mtd_readtest_exit(void) +{ + return; +} +module_exit(mtd_readtest_exit); + +MODULE_DESCRIPTION("Read test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_speedtest.c b/drivers/mtd/tests/mtd_speedtest.c new file mode 100644 index 00000000000..141363a7e80 --- /dev/null +++ b/drivers/mtd/tests/mtd_speedtest.c @@ -0,0 +1,502 @@ +/* + * Copyright (C) 2007 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test read and write speed of a MTD device. 
+ * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_speedtest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *iobuf; +static unsigned char *bbt; + +static int pgsize; +static int ebcnt; +static int pgcnt; +static int goodebcnt; +static struct timeval start, finish; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static void set_random_data(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; ++i) + buf[i] = simple_rand(); +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int erase_whole_device(void) +{ + int err; + unsigned int i; + + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + return err; + cond_resched(); + } + return 0; +} + +static int write_eraseblock(int ebnum) +{ + size_t written = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + err = mtd->write(mtd, addr, mtd->erasesize, &written, iobuf); + if (err || written != mtd->erasesize) { + printk(PRINT_PREF "error: write failed at %#llx\n", addr); + if (!err) + err = -EINVAL; + } + + return err; +} + +static int write_eraseblock_by_page(int ebnum) +{ + size_t written = 0; + int i, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + + for (i = 0; i < pgcnt; i++) { + err = mtd->write(mtd, addr, pgsize, &written, buf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + break; + } + addr += pgsize; + buf += pgsize; + } + + return err; +} + +static int write_eraseblock_by_2pages(int ebnum) +{ + size_t written = 0, sz = pgsize * 2; + int i, n = pgcnt / 2, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + + for (i = 0; i < n; i++) { + err = mtd->write(mtd, addr, sz, &written, buf); + if (err || written != sz) { + printk(PRINT_PREF "error: write failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + return err; + } + addr += sz; + buf += sz; + } + if (pgcnt % 2) { + err = mtd->write(mtd, addr, pgsize, &written, buf); + if (err || written != pgsize) { + printk(PRINT_PREF "error: write failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + } + } + + return err; +} + +static int read_eraseblock(int ebnum) +{ + size_t read = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + err = mtd->read(mtd, addr, mtd->erasesize, &read, iobuf); + /* Ignore corrected ECC errors */ + if (err == -EUCLEAN) + err = 0; + if (err || read != mtd->erasesize) { + printk(PRINT_PREF "error: read failed at %#llx\n", addr); + if (!err) + err = -EINVAL; + } + + return err; +} + 
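All of the mtd_*test modules in this patch generate their test data with the same tiny linear congruential generator rather than the kernel's random pool. Re-seeding it with the same value reproduces the byte stream exactly, which is how the verify passes recompute what was written without having to buffer it. Below is a minimal userspace sketch of that pattern; it is illustrative only and not part of the patch, and the helper names simply mirror the ones used in the modules above.

#include <stdio.h>
#include <string.h>

static unsigned long next = 1;

/* Same recurrence as in the mtd_*test modules: values in [0, 32767] */
static unsigned int simple_rand(void)
{
        next = next * 1103515245 + 12345;
        return (unsigned int)((next / 65536) % 32768);
}

static void simple_srand(unsigned long seed)
{
        next = seed;
}

/* Fill a buffer from the generator; only the low byte of each value is kept */
static void set_random_data(unsigned char *buf, size_t len)
{
        size_t i;

        for (i = 0; i < len; ++i)
                buf[i] = simple_rand();
}

int main(void)
{
        unsigned char a[16], b[16];

        simple_srand(1);
        set_random_data(a, sizeof(a));
        simple_srand(1);                /* same seed ...           */
        set_random_data(b, sizeof(b));  /* ... regenerates the data */
        printf("streams %s\n", memcmp(a, b, sizeof(a)) ? "differ" : "match");
        return 0;
}

Determinism is the point of this design: in mtd_oobtest, for example, the write pass and the verify pass both start from simple_srand(1) (or 3, 5, 11 for the later tests), so the verifier regenerates the expected bytes on the fly and compares them against what read_oob returns.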
+static int read_eraseblock_by_page(int ebnum) +{ + size_t read = 0; + int i, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + + for (i = 0; i < pgcnt; i++) { + err = mtd->read(mtd, addr, pgsize, &read, buf); + /* Ignore corrected ECC errors */ + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + break; + } + addr += pgsize; + buf += pgsize; + } + + return err; +} + +static int read_eraseblock_by_2pages(int ebnum) +{ + size_t read = 0, sz = pgsize * 2; + int i, n = pgcnt / 2, err = 0; + loff_t addr = ebnum * mtd->erasesize; + void *buf = iobuf; + + for (i = 0; i < n; i++) { + err = mtd->read(mtd, addr, sz, &read, buf); + /* Ignore corrected ECC errors */ + if (err == -EUCLEAN) + err = 0; + if (err || read != sz) { + printk(PRINT_PREF "error: read failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + return err; + } + addr += sz; + buf += sz; + } + if (pgcnt % 2) { + err = mtd->read(mtd, addr, pgsize, &read, buf); + /* Ignore corrected ECC errors */ + if (err == -EUCLEAN) + err = 0; + if (err || read != pgsize) { + printk(PRINT_PREF "error: read failed at %#llx\n", + addr); + if (!err) + err = -EINVAL; + } + } + + return err; +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static inline void start_timing(void) +{ + do_gettimeofday(&start); +} + +static inline void stop_timing(void) +{ + do_gettimeofday(&finish); +} + +static long calc_speed(void) +{ + long ms, k, speed; + + ms = (finish.tv_sec - start.tv_sec) * 1000 + + (finish.tv_usec - start.tv_usec) / 1000; + k = goodebcnt * mtd->erasesize / 1024; + speed = (k * 1000) / ms; + return speed; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 
1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + goodebcnt = ebcnt - bad; + return 0; +} + +static int __init mtd_speedtest_init(void) +{ + int err, i; + long speed; + uint64_t tmp; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->writesize == 1) { + printk(PRINT_PREF "not NAND flash, assume page size is 512 " + "bytes.\n"); + pgsize = 512; + } else + pgsize = mtd->writesize; + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + pgsize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + iobuf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!iobuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + simple_srand(1); + set_random_data(iobuf, mtd->erasesize); + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks, 1 eraseblock at a time */ + printk(PRINT_PREF "testing eraseblock write speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "eraseblock write speed is %ld KiB/s\n", speed); + + /* Read all eraseblocks, 1 eraseblock at a time */ + printk(PRINT_PREF "testing eraseblock read speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = read_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "eraseblock read speed is %ld KiB/s\n", speed); + + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks, 1 page at a time */ + printk(PRINT_PREF "testing page write speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock_by_page(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "page write speed is %ld KiB/s\n", speed); + + /* Read all eraseblocks, 1 page at a time */ + printk(PRINT_PREF "testing page read speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = read_eraseblock_by_page(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "page read speed is %ld KiB/s\n", speed); + + err = erase_whole_device(); + if (err) + goto out; + + /* Write all eraseblocks, 2 pages at a time */ + printk(PRINT_PREF "testing 2 page write speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock_by_2pages(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "2 page write speed is %ld KiB/s\n", speed); + + /* Read all eraseblocks, 2 pages at a time */ + printk(PRINT_PREF "testing 2 page read speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = 
read_eraseblock_by_2pages(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "2 page read speed is %ld KiB/s\n", speed); + + /* Erase all eraseblocks */ + printk(PRINT_PREF "Testing erase speed\n"); + start_timing(); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + stop_timing(); + speed = calc_speed(); + printk(PRINT_PREF "erase speed is %ld KiB/s\n", speed); + + printk(PRINT_PREF "finished\n"); +out: + kfree(iobuf); + kfree(bbt); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_speedtest_init); + +static void __exit mtd_speedtest_exit(void) +{ + return; +} +module_exit(mtd_speedtest_exit); + +MODULE_DESCRIPTION("Speed test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_stresstest.c b/drivers/mtd/tests/mtd_stresstest.c new file mode 100644 index 00000000000..63920476b57 --- /dev/null +++ b/drivers/mtd/tests/mtd_stresstest.c @@ -0,0 +1,330 @@ +/* + * Copyright (C) 2006-2008 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test random reads, writes and erases on MTD device. 
+ * + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> +#include <linux/vmalloc.h> + +#define PRINT_PREF KERN_INFO "mtd_stresstest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static int count = 10000; +module_param(count, int, S_IRUGO); +MODULE_PARM_DESC(count, "Number of operations to do (default is 10000)"); + +static struct mtd_info *mtd; +static unsigned char *writebuf; +static unsigned char *readbuf; +static unsigned char *bbt; +static int *offsets; + +static int pgsize; +static int bufsize; +static int ebcnt; +static int pgcnt; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static int rand_eb(void) +{ + int eb; + +again: + if (ebcnt < 32768) + eb = simple_rand(); + else + eb = (simple_rand() << 15) | simple_rand(); + /* Read or write up 2 eraseblocks at a time - hence 'ebcnt - 1' */ + eb %= (ebcnt - 1); + if (bbt[eb]) + goto again; + return eb; +} + +static int rand_offs(void) +{ + int offs; + + if (bufsize < 32768) + offs = simple_rand(); + else + offs = (simple_rand() << 15) | simple_rand(); + offs %= bufsize; + return offs; +} + +static int rand_len(int offs) +{ + int len; + + if (bufsize < 32768) + len = simple_rand(); + else + len = (simple_rand() << 15) | simple_rand(); + len %= (bufsize - offs); + return len; +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (unlikely(err)) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (unlikely(ei.state == MTD_ERASE_FAILED)) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int do_read(void) +{ + size_t read = 0; + int eb = rand_eb(); + int offs = rand_offs(); + int len = rand_len(offs), err; + loff_t addr; + + if (bbt[eb + 1]) { + if (offs >= mtd->erasesize) + offs -= mtd->erasesize; + if (offs + len > mtd->erasesize) + len = mtd->erasesize - offs; + } + addr = eb * mtd->erasesize + offs; + err = mtd->read(mtd, addr, len, &read, readbuf); + if (err == -EUCLEAN) + err = 0; + if (unlikely(err || read != len)) { + printk(PRINT_PREF "error: read failed at 0x%llx\n", + (long long)addr); + if (!err) + err = -EINVAL; + return err; + } + return 0; +} + +static int do_write(void) +{ + int eb = rand_eb(), offs, err, len; + size_t written = 0; + loff_t addr; + + offs = offsets[eb]; + if (offs >= mtd->erasesize) { + err = erase_eraseblock(eb); + if (err) + return err; + offs = offsets[eb] = 0; + } + len = rand_len(offs); + len = ((len + pgsize - 1) / pgsize) * pgsize; + if (offs + len > mtd->erasesize) { + if (bbt[eb + 1]) + len = mtd->erasesize - offs; + else { + err = erase_eraseblock(eb + 1); + if (err) + return err; + offsets[eb + 1] = 0; + } + } + addr = eb * mtd->erasesize + offs; 
+ err = mtd->write(mtd, addr, len, &written, writebuf); + if (unlikely(err || written != len)) { + printk(PRINT_PREF "error: write failed at 0x%llx\n", + (long long)addr); + if (!err) + err = -EINVAL; + return err; + } + offs += len; + while (offs > mtd->erasesize) { + offsets[eb++] = mtd->erasesize; + offs -= mtd->erasesize; + } + offsets[eb] = offs; + return 0; +} + +static int do_operation(void) +{ + if (simple_rand() & 1) + return do_read(); + else + return do_write(); +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_stresstest_init(void) +{ + int err; + int i, op; + uint64_t tmp; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->writesize == 1) { + printk(PRINT_PREF "not NAND flash, assume page size is 512 " + "bytes.\n"); + pgsize = 512; + } else + pgsize = mtd->writesize; + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, count of eraseblocks %u, pages per " + "eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + pgsize, ebcnt, pgcnt, mtd->oobsize); + + /* Read or write up 2 eraseblocks at a time */ + bufsize = mtd->erasesize * 2; + + err = -ENOMEM; + readbuf = vmalloc(bufsize); + writebuf = vmalloc(bufsize); + offsets = kmalloc(ebcnt * sizeof(int), GFP_KERNEL); + if (!readbuf || !writebuf || !offsets) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + for (i = 0; i < ebcnt; i++) + offsets[i] = mtd->erasesize; + simple_srand(current->pid); + for (i = 0; i < bufsize; i++) + writebuf[i] = simple_rand(); + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + /* Do operations */ + printk(PRINT_PREF "doing operations\n"); + for (op = 0; op < count; op++) { + if ((op & 1023) == 0) + printk(PRINT_PREF "%d operations done\n", op); + err = do_operation(); + if (err) + goto out; + cond_resched(); + } + printk(PRINT_PREF "finished, %d operations done\n", op); + +out: + kfree(offsets); + kfree(bbt); + vfree(writebuf); + vfree(readbuf); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_stresstest_init); + +static void __exit mtd_stresstest_exit(void) +{ + return; +} +module_exit(mtd_stresstest_exit); + +MODULE_DESCRIPTION("Stress test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_subpagetest.c b/drivers/mtd/tests/mtd_subpagetest.c new file mode 100644 index 00000000000..5b889724268 --- /dev/null +++ b/drivers/mtd/tests/mtd_subpagetest.c @@ -0,0 +1,525 @@ +/* + * Copyright (C) 2006-2007 Nokia Corporation + * + * This program is free software; you can redistribute it and/or modify it + * 
under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Test sub-page read and write on MTD device. + * Author: Adrian Hunter <ext-adrian.hunter@nokia.com> + * + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_subpagetest: " + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static struct mtd_info *mtd; +static unsigned char *writebuf; +static unsigned char *readbuf; +static unsigned char *bbt; + +static int subpgsize; +static int bufsize; +static int ebcnt; +static int pgcnt; +static int errcnt; +static unsigned long next = 1; + +static inline unsigned int simple_rand(void) +{ + next = next * 1103515245 + 12345; + return (unsigned int)((next / 65536) % 32768); +} + +static inline void simple_srand(unsigned long seed) +{ + next = seed; +} + +static void set_random_data(unsigned char *buf, size_t len) +{ + size_t i; + + for (i = 0; i < len; ++i) + buf[i] = simple_rand(); +} + +static inline void clear_data(unsigned char *buf, size_t len) +{ + memset(buf, 0, len); +} + +static int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +static int erase_whole_device(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "erasing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = erase_eraseblock(i); + if (err) + return err; + cond_resched(); + } + printk(PRINT_PREF "erased %u eraseblocks\n", i); + return 0; +} + +static int write_eraseblock(int ebnum) +{ + size_t written = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + set_random_data(writebuf, subpgsize); + err = mtd->write(mtd, addr, subpgsize, &written, writebuf); + if (unlikely(err || written != subpgsize)) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr); + if (written != subpgsize) { + printk(PRINT_PREF " write size: %#x\n", subpgsize); + printk(PRINT_PREF " written: %#zx\n", written); + } + return err ? err : -1; + } + + addr += subpgsize; + + set_random_data(writebuf, subpgsize); + err = mtd->write(mtd, addr, subpgsize, &written, writebuf); + if (unlikely(err || written != subpgsize)) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr); + if (written != subpgsize) { + printk(PRINT_PREF " write size: %#x\n", subpgsize); + printk(PRINT_PREF " written: %#zx\n", written); + } + return err ? 
err : -1; + } + + return err; +} + +static int write_eraseblock2(int ebnum) +{ + size_t written = 0; + int err = 0, k; + loff_t addr = ebnum * mtd->erasesize; + + for (k = 1; k < 33; ++k) { + if (addr + (subpgsize * k) > (ebnum + 1) * mtd->erasesize) + break; + set_random_data(writebuf, subpgsize * k); + err = mtd->write(mtd, addr, subpgsize * k, &written, writebuf); + if (unlikely(err || written != subpgsize * k)) { + printk(PRINT_PREF "error: write failed at %#llx\n", + (long long)addr); + if (written != subpgsize) { + printk(PRINT_PREF " write size: %#x\n", + subpgsize * k); + printk(PRINT_PREF " written: %#08zx\n", + written); + } + return err ? err : -1; + } + addr += subpgsize * k; + } + + return err; +} + +static void print_subpage(unsigned char *p) +{ + int i, j; + + for (i = 0; i < subpgsize; ) { + for (j = 0; i < subpgsize && j < 32; ++i, ++j) + printk("%02x", *p++); + printk("\n"); + } +} + +static int verify_eraseblock(int ebnum) +{ + size_t read = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + set_random_data(writebuf, subpgsize); + clear_data(readbuf, subpgsize); + read = 0; + err = mtd->read(mtd, addr, subpgsize, &read, readbuf); + if (unlikely(err || read != subpgsize)) { + if (err == -EUCLEAN && read == subpgsize) { + printk(PRINT_PREF "ECC correction at %#llx\n", + (long long)addr); + err = 0; + } else { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + return err ? err : -1; + } + } + if (unlikely(memcmp(readbuf, writebuf, subpgsize))) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + printk(PRINT_PREF "------------- written----------------\n"); + print_subpage(writebuf); + printk(PRINT_PREF "------------- read ------------------\n"); + print_subpage(readbuf); + printk(PRINT_PREF "-------------------------------------\n"); + errcnt += 1; + } + + addr += subpgsize; + + set_random_data(writebuf, subpgsize); + clear_data(readbuf, subpgsize); + read = 0; + err = mtd->read(mtd, addr, subpgsize, &read, readbuf); + if (unlikely(err || read != subpgsize)) { + if (err == -EUCLEAN && read == subpgsize) { + printk(PRINT_PREF "ECC correction at %#llx\n", + (long long)addr); + err = 0; + } else { + printk(PRINT_PREF "error: read failed at %#llx\n", + (long long)addr); + return err ? err : -1; + } + } + if (unlikely(memcmp(readbuf, writebuf, subpgsize))) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + printk(PRINT_PREF "------------- written----------------\n"); + print_subpage(writebuf); + printk(PRINT_PREF "------------- read ------------------\n"); + print_subpage(readbuf); + printk(PRINT_PREF "-------------------------------------\n"); + errcnt += 1; + } + + return err; +} + +static int verify_eraseblock2(int ebnum) +{ + size_t read = 0; + int err = 0, k; + loff_t addr = ebnum * mtd->erasesize; + + for (k = 1; k < 33; ++k) { + if (addr + (subpgsize * k) > (ebnum + 1) * mtd->erasesize) + break; + set_random_data(writebuf, subpgsize * k); + clear_data(readbuf, subpgsize * k); + read = 0; + err = mtd->read(mtd, addr, subpgsize * k, &read, readbuf); + if (unlikely(err || read != subpgsize * k)) { + if (err == -EUCLEAN && read == subpgsize * k) { + printk(PRINT_PREF "ECC correction at %#llx\n", + (long long)addr); + err = 0; + } else { + printk(PRINT_PREF "error: read failed at " + "%#llx\n", (long long)addr); + return err ? 
err : -1; + } + } + if (unlikely(memcmp(readbuf, writebuf, subpgsize * k))) { + printk(PRINT_PREF "error: verify failed at %#llx\n", + (long long)addr); + errcnt += 1; + } + addr += subpgsize * k; + } + + return err; +} + +static int verify_eraseblock_ff(int ebnum) +{ + uint32_t j; + size_t read = 0; + int err = 0; + loff_t addr = ebnum * mtd->erasesize; + + memset(writebuf, 0xff, subpgsize); + for (j = 0; j < mtd->erasesize / subpgsize; ++j) { + clear_data(readbuf, subpgsize); + read = 0; + err = mtd->read(mtd, addr, subpgsize, &read, readbuf); + if (unlikely(err || read != subpgsize)) { + if (err == -EUCLEAN && read == subpgsize) { + printk(PRINT_PREF "ECC correction at %#llx\n", + (long long)addr); + err = 0; + } else { + printk(PRINT_PREF "error: read failed at " + "%#llx\n", (long long)addr); + return err ? err : -1; + } + } + if (unlikely(memcmp(readbuf, writebuf, subpgsize))) { + printk(PRINT_PREF "error: verify 0xff failed at " + "%#llx\n", (long long)addr); + errcnt += 1; + } + addr += subpgsize; + } + + return err; +} + +static int verify_all_eraseblocks_ff(void) +{ + int err; + unsigned int i; + + printk(PRINT_PREF "verifying all eraseblocks for 0xff\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock_ff(i); + if (err) + return err; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + return 0; +} + +static int is_block_bad(int ebnum) +{ + loff_t addr = ebnum * mtd->erasesize; + int ret; + + ret = mtd->block_isbad(mtd, addr); + if (ret) + printk(PRINT_PREF "block %d is bad\n", ebnum); + return ret; +} + +static int scan_for_bad_eraseblocks(void) +{ + int i, bad = 0; + + bbt = kmalloc(ebcnt, GFP_KERNEL); + if (!bbt) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + return -ENOMEM; + } + memset(bbt, 0 , ebcnt); + + printk(PRINT_PREF "scanning for bad eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + bbt[i] = is_block_bad(i) ? 
1 : 0; + if (bbt[i]) + bad += 1; + cond_resched(); + } + printk(PRINT_PREF "scanned %d eraseblocks, %d are bad\n", i, bad); + return 0; +} + +static int __init mtd_subpagetest_init(void) +{ + int err = 0; + uint32_t i; + uint64_t tmp; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->type != MTD_NANDFLASH) { + printk(PRINT_PREF "this test requires NAND flash\n"); + goto out; + } + + subpgsize = mtd->writesize >> mtd->subpage_sft; + printk(PRINT_PREF "MTD device size %llu, eraseblock size %u, " + "page size %u, subpage size %u, count of eraseblocks %u, " + "pages per eraseblock %u, OOB size %u\n", + (unsigned long long)mtd->size, mtd->erasesize, + mtd->writesize, subpgsize, ebcnt, pgcnt, mtd->oobsize); + + err = -ENOMEM; + bufsize = subpgsize * 32; + writebuf = kmalloc(bufsize, GFP_KERNEL); + if (!writebuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + readbuf = kmalloc(bufsize, GFP_KERNEL); + if (!readbuf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out; + } + + tmp = mtd->size; + do_div(tmp, mtd->erasesize); + ebcnt = tmp; + pgcnt = mtd->erasesize / mtd->writesize; + + err = scan_for_bad_eraseblocks(); + if (err) + goto out; + + err = erase_whole_device(); + if (err) + goto out; + + printk(PRINT_PREF "writing whole device\n"); + simple_srand(1); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock(i); + if (unlikely(err)) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + simple_srand(1); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock(i); + if (unlikely(err)) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + err = erase_whole_device(); + if (err) + goto out; + + err = verify_all_eraseblocks_ff(); + if (err) + goto out; + + /* Write all eraseblocks */ + simple_srand(3); + printk(PRINT_PREF "writing whole device\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = write_eraseblock2(i); + if (unlikely(err)) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "written up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "written %u eraseblocks\n", i); + + /* Check all eraseblocks */ + simple_srand(3); + printk(PRINT_PREF "verifying all eraseblocks\n"); + for (i = 0; i < ebcnt; ++i) { + if (bbt[i]) + continue; + err = verify_eraseblock2(i); + if (unlikely(err)) + goto out; + if (i % 256 == 0) + printk(PRINT_PREF "verified up to eraseblock %u\n", i); + cond_resched(); + } + printk(PRINT_PREF "verified %u eraseblocks\n", i); + + err = erase_whole_device(); + if (err) + goto out; + + err = verify_all_eraseblocks_ff(); + if (err) + goto out; + + printk(PRINT_PREF "finished with %d errors\n", errcnt); + +out: + kfree(bbt); + kfree(readbuf); + kfree(writebuf); + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(mtd_subpagetest_init); + +static void __exit 
mtd_subpagetest_exit(void) +{ + return; +} +module_exit(mtd_subpagetest_exit); + +MODULE_DESCRIPTION("Subpage test module"); +MODULE_AUTHOR("Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/tests/mtd_torturetest.c b/drivers/mtd/tests/mtd_torturetest.c new file mode 100644 index 00000000000..631a0ab3a33 --- /dev/null +++ b/drivers/mtd/tests/mtd_torturetest.c @@ -0,0 +1,530 @@ +/* + * Copyright (C) 2006-2008 Artem Bityutskiy + * Copyright (C) 2006-2008 Jarkko Lavinen + * Copyright (C) 2006-2008 Adrian Hunter + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; see the file COPYING. If not, write to the Free Software + * Foundation, 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. + * + * Authors: Artem Bityutskiy, Jarkko Lavinen, Adria Hunter + * + * WARNING: this test program may kill your flash and your device. Do not + * use it unless you know what you do. Authors are not responsible for any + * damage caused by this program. + */ + +#include <linux/init.h> +#include <linux/module.h> +#include <linux/moduleparam.h> +#include <linux/err.h> +#include <linux/mtd/mtd.h> +#include <linux/sched.h> + +#define PRINT_PREF KERN_INFO "mtd_torturetest: " +#define RETRIES 3 + +static int eb = 8; +module_param(eb, int, S_IRUGO); +MODULE_PARM_DESC(eb, "eraseblock number within the selected MTD device"); + +static int ebcnt = 32; +module_param(ebcnt, int, S_IRUGO); +MODULE_PARM_DESC(ebcnt, "number of consecutive eraseblocks to torture"); + +static int pgcnt; +module_param(pgcnt, int, S_IRUGO); +MODULE_PARM_DESC(pgcnt, "number of pages per eraseblock to torture (0 => all)"); + +static int dev; +module_param(dev, int, S_IRUGO); +MODULE_PARM_DESC(dev, "MTD device number to use"); + +static int gran = 512; +module_param(gran, int, S_IRUGO); +MODULE_PARM_DESC(gran, "how often the status information should be printed"); + +static int check = 1; +module_param(check, int, S_IRUGO); +MODULE_PARM_DESC(check, "if the written data should be checked"); + +static unsigned int cycles_count; +module_param(cycles_count, uint, S_IRUGO); +MODULE_PARM_DESC(cycles_count, "how many erase cycles to do " + "(infinite by default)"); + +static struct mtd_info *mtd; + +/* This buffer contains 0x555555...0xAAAAAA... pattern */ +static unsigned char *patt_5A5; +/* This buffer contains 0xAAAAAA...0x555555... pattern */ +static unsigned char *patt_A5A; +/* This buffer contains all 0xFF bytes */ +static unsigned char *patt_FF; +/* This a temporary buffer is use when checking data */ +static unsigned char *check_buf; +/* How many erase cycles were done */ +static unsigned int erase_cycles; + +static int pgsize; +static struct timeval start, finish; + +static void report_corrupt(unsigned char *read, unsigned char *written); + +static inline void start_timing(void) +{ + do_gettimeofday(&start); +} + +static inline void stop_timing(void) +{ + do_gettimeofday(&finish); +} + +/* + * Erase eraseblock number @ebnum. 
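+ * Returns zero on success or a negative error code on failure.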
+ */ +static inline int erase_eraseblock(int ebnum) +{ + int err; + struct erase_info ei; + loff_t addr = ebnum * mtd->erasesize; + + memset(&ei, 0, sizeof(struct erase_info)); + ei.mtd = mtd; + ei.addr = addr; + ei.len = mtd->erasesize; + + err = mtd->erase(mtd, &ei); + if (err) { + printk(PRINT_PREF "error %d while erasing EB %d\n", err, ebnum); + return err; + } + + if (ei.state == MTD_ERASE_FAILED) { + printk(PRINT_PREF "some erase error occurred at EB %d\n", + ebnum); + return -EIO; + } + + return 0; +} + +/* + * Check that the contents of eraseblock number @enbum is equivalent to the + * @buf buffer. + */ +static inline int check_eraseblock(int ebnum, unsigned char *buf) +{ + int err, retries = 0; + size_t read = 0; + loff_t addr = ebnum * mtd->erasesize; + size_t len = mtd->erasesize; + + if (pgcnt) { + addr = (ebnum + 1) * mtd->erasesize - pgcnt * pgsize; + len = pgcnt * pgsize; + } + +retry: + err = mtd->read(mtd, addr, len, &read, check_buf); + if (err == -EUCLEAN) + printk(PRINT_PREF "single bit flip occurred at EB %d " + "MTD reported that it was fixed.\n", ebnum); + else if (err) { + printk(PRINT_PREF "error %d while reading EB %d, " + "read %zd\n", err, ebnum, read); + return err; + } + + if (read != len) { + printk(PRINT_PREF "failed to read %zd bytes from EB %d, " + "read only %zd, but no error reported\n", + len, ebnum, read); + return -EIO; + } + + if (memcmp(buf, check_buf, len)) { + printk(PRINT_PREF "read wrong data from EB %d\n", ebnum); + report_corrupt(check_buf, buf); + + if (retries++ < RETRIES) { + /* Try read again */ + yield(); + printk(PRINT_PREF "re-try reading data from EB %d\n", + ebnum); + goto retry; + } else { + printk(PRINT_PREF "retried %d times, still errors, " + "give-up\n", RETRIES); + return -EINVAL; + } + } + + if (retries != 0) + printk(PRINT_PREF "only attempt number %d was OK (!!!)\n", + retries); + + return 0; +} + +static inline int write_pattern(int ebnum, void *buf) +{ + int err; + size_t written = 0; + loff_t addr = ebnum * mtd->erasesize; + size_t len = mtd->erasesize; + + if (pgcnt) { + addr = (ebnum + 1) * mtd->erasesize - pgcnt * pgsize; + len = pgcnt * pgsize; + } + err = mtd->write(mtd, addr, len, &written, buf); + if (err) { + printk(PRINT_PREF "error %d while writing EB %d, written %zd" + " bytes\n", err, ebnum, written); + return err; + } + if (written != len) { + printk(PRINT_PREF "written only %zd bytes of %zd, but no error" + " reported\n", written, len); + return -EIO; + } + + return 0; +} + +static int __init tort_init(void) +{ + int err = 0, i, infinite = !cycles_count; + int bad_ebs[ebcnt]; + + printk(KERN_INFO "\n"); + printk(KERN_INFO "=================================================\n"); + printk(PRINT_PREF "Warning: this program is trying to wear out your " + "flash, stop it if this is not wanted.\n"); + printk(PRINT_PREF "MTD device: %d\n", dev); + printk(PRINT_PREF "torture %d eraseblocks (%d-%d) of mtd%d\n", + ebcnt, eb, eb + ebcnt - 1, dev); + if (pgcnt) + printk(PRINT_PREF "torturing just %d pages per eraseblock\n", + pgcnt); + printk(PRINT_PREF "write verify %s\n", check ? 
"enabled" : "disabled"); + + mtd = get_mtd_device(NULL, dev); + if (IS_ERR(mtd)) { + err = PTR_ERR(mtd); + printk(PRINT_PREF "error: cannot get MTD device\n"); + return err; + } + + if (mtd->writesize == 1) { + printk(PRINT_PREF "not NAND flash, assume page size is 512 " + "bytes.\n"); + pgsize = 512; + } else + pgsize = mtd->writesize; + + if (pgcnt && (pgcnt > mtd->erasesize / pgsize || pgcnt < 0)) { + printk(PRINT_PREF "error: invalid pgcnt value %d\n", pgcnt); + goto out_mtd; + } + + err = -ENOMEM; + patt_5A5 = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!patt_5A5) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out_mtd; + } + + patt_A5A = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!patt_A5A) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out_patt_5A5; + } + + patt_FF = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!patt_FF) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out_patt_A5A; + } + + check_buf = kmalloc(mtd->erasesize, GFP_KERNEL); + if (!check_buf) { + printk(PRINT_PREF "error: cannot allocate memory\n"); + goto out_patt_FF; + } + + err = 0; + + /* Initialize patterns */ + memset(patt_FF, 0xFF, mtd->erasesize); + for (i = 0; i < mtd->erasesize / pgsize; i++) { + if (!(i & 1)) { + memset(patt_5A5 + i * pgsize, 0x55, pgsize); + memset(patt_A5A + i * pgsize, 0xAA, pgsize); + } else { + memset(patt_5A5 + i * pgsize, 0xAA, pgsize); + memset(patt_A5A + i * pgsize, 0x55, pgsize); + } + } + + /* + * Check if there is a bad eraseblock among those we are going to test. + */ + memset(&bad_ebs[0], 0, sizeof(int) * ebcnt); + if (mtd->block_isbad) { + for (i = eb; i < eb + ebcnt; i++) { + err = mtd->block_isbad(mtd, + (loff_t)i * mtd->erasesize); + + if (err < 0) { + printk(PRINT_PREF "block_isbad() returned %d " + "for EB %d\n", err, i); + goto out; + } + + if (err) { + printk("EB %d is bad. Skip it.\n", i); + bad_ebs[i - eb] = 1; + } + } + } + + start_timing(); + while (1) { + int i; + void *patt; + + /* Erase all eraseblocks */ + for (i = eb; i < eb + ebcnt; i++) { + if (bad_ebs[i - eb]) + continue; + err = erase_eraseblock(i); + if (err) + goto out; + cond_resched(); + } + + /* Check if the eraseblocks contain only 0xFF bytes */ + if (check) { + for (i = eb; i < eb + ebcnt; i++) { + if (bad_ebs[i - eb]) + continue; + err = check_eraseblock(i, patt_FF); + if (err) { + printk(PRINT_PREF "verify failed" + " for 0xFF... pattern\n"); + goto out; + } + cond_resched(); + } + } + + /* Write the pattern */ + for (i = eb; i < eb + ebcnt; i++) { + if (bad_ebs[i - eb]) + continue; + if ((eb + erase_cycles) & 1) + patt = patt_5A5; + else + patt = patt_A5A; + err = write_pattern(i, patt); + if (err) + goto out; + cond_resched(); + } + + /* Verify what we wrote */ + if (check) { + for (i = eb; i < eb + ebcnt; i++) { + if (bad_ebs[i - eb]) + continue; + if ((eb + erase_cycles) & 1) + patt = patt_5A5; + else + patt = patt_A5A; + err = check_eraseblock(i, patt); + if (err) { + printk(PRINT_PREF "verify failed for %s" + " pattern\n", + ((eb + erase_cycles) & 1) ? + "0x55AA55..." 
: "0xAA55AA..."); + goto out; + } + cond_resched(); + } + } + + erase_cycles += 1; + + if (erase_cycles % gran == 0) { + long ms; + + stop_timing(); + ms = (finish.tv_sec - start.tv_sec) * 1000 + + (finish.tv_usec - start.tv_usec) / 1000; + printk(PRINT_PREF "%08u erase cycles done, took %lu " + "milliseconds (%lu seconds)\n", + erase_cycles, ms, ms / 1000); + start_timing(); + } + + if (!infinite && --cycles_count == 0) + break; + } +out: + + printk(PRINT_PREF "finished after %u erase cycles\n", + erase_cycles); + kfree(check_buf); +out_patt_FF: + kfree(patt_FF); +out_patt_A5A: + kfree(patt_A5A); +out_patt_5A5: + kfree(patt_5A5); +out_mtd: + put_mtd_device(mtd); + if (err) + printk(PRINT_PREF "error %d occurred during torturing\n", err); + printk(KERN_INFO "=================================================\n"); + return err; +} +module_init(tort_init); + +static void __exit tort_exit(void) +{ + return; +} +module_exit(tort_exit); + +static int countdiffs(unsigned char *buf, unsigned char *check_buf, + unsigned offset, unsigned len, unsigned *bytesp, + unsigned *bitsp); +static void print_bufs(unsigned char *read, unsigned char *written, int start, + int len); + +/* + * Report the detailed information about how the read EB differs from what was + * written. + */ +static void report_corrupt(unsigned char *read, unsigned char *written) +{ + int i; + int bytes, bits, pages, first; + int offset, len; + size_t check_len = mtd->erasesize; + + if (pgcnt) + check_len = pgcnt * pgsize; + + bytes = bits = pages = 0; + for (i = 0; i < check_len; i += pgsize) + if (countdiffs(written, read, i, pgsize, &bytes, + &bits) >= 0) + pages++; + + printk(PRINT_PREF "verify fails on %d pages, %d bytes/%d bits\n", + pages, bytes, bits); + printk(PRINT_PREF "The following is a list of all differences between" + " what was read from flash and what was expected\n"); + + for (i = 0; i < check_len; i += pgsize) { + cond_resched(); + bytes = bits = 0; + first = countdiffs(written, read, i, pgsize, &bytes, + &bits); + if (first < 0) + continue; + + printk("-------------------------------------------------------" + "----------------------------------\n"); + + printk(PRINT_PREF "Page %zd has %d bytes/%d bits failing verify," + " starting at offset 0x%x\n", + (mtd->erasesize - check_len + i) / pgsize, + bytes, bits, first); + + offset = first & ~0x7; + len = ((first + bytes) | 0x7) + 1 - offset; + + print_bufs(read, written, offset, len); + } +} + +static void print_bufs(unsigned char *read, unsigned char *written, int start, + int len) +{ + int i = 0, j1, j2; + char *diff; + + printk("Offset Read Written\n"); + while (i < len) { + printk("0x%08x: ", start + i); + diff = " "; + for (j1 = 0; j1 < 8 && i + j1 < len; j1++) { + printk(" %02x", read[start + i + j1]); + if (read[start + i + j1] != written[start + i + j1]) + diff = "***"; + } + + while (j1 < 8) { + printk(" "); + j1 += 1; + } + + printk(" %s ", diff); + + for (j2 = 0; j2 < 8 && i + j2 < len; j2++) + printk(" %02x", written[start + i + j2]); + printk("\n"); + i += 8; + } +} + +/* + * Count the number of differing bytes and bits and return the first differing + * offset. 
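+ * Returns -1 when the two buffers are identical over the checked range.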
+ */ +static int countdiffs(unsigned char *buf, unsigned char *check_buf, + unsigned offset, unsigned len, unsigned *bytesp, + unsigned *bitsp) +{ + unsigned i, bit; + int first = -1; + + for (i = offset; i < offset + len; i++) + if (buf[i] != check_buf[i]) { + first = i; + break; + } + + while (i < offset + len) { + if (buf[i] != check_buf[i]) { + (*bytesp)++; + bit = 1; + while (bit < 256) { + if ((buf[i] & bit) != (check_buf[i] & bit)) + (*bitsp)++; + bit <<= 1; + } + } + i++; + } + + return first; +} + +MODULE_DESCRIPTION("Eraseblock torturing module"); +MODULE_AUTHOR("Artem Bityutskiy, Jarkko Lavinen, Adrian Hunter"); +MODULE_LICENSE("GPL"); diff --git a/drivers/mtd/ubi/build.c b/drivers/mtd/ubi/build.c index 7caf22cd5ad..9082768cc6c 100644 --- a/drivers/mtd/ubi/build.c +++ b/drivers/mtd/ubi/build.c @@ -561,7 +561,7 @@ static int io_init(struct ubi_device *ubi) */ ubi->peb_size = ubi->mtd->erasesize; - ubi->peb_count = ubi->mtd->size / ubi->mtd->erasesize; + ubi->peb_count = mtd_div_by_eb(ubi->mtd->size, ubi->mtd); ubi->flash_size = ubi->mtd->size; if (ubi->mtd->block_isbad && ubi->mtd->block_markbad) diff --git a/drivers/mtd/ubi/gluebi.c b/drivers/mtd/ubi/gluebi.c index 605812bb0b1..6dd4f5e77f8 100644 --- a/drivers/mtd/ubi/gluebi.c +++ b/drivers/mtd/ubi/gluebi.c @@ -215,7 +215,8 @@ static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) struct ubi_volume *vol; struct ubi_device *ubi; - dbg_gen("erase %u bytes at offset %u", instr->len, instr->addr); + dbg_gen("erase %llu bytes at offset %llu", (unsigned long long)instr->len, + (unsigned long long)instr->addr); if (instr->addr < 0 || instr->addr > mtd->size - mtd->erasesize) return -EINVAL; @@ -223,11 +224,11 @@ static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) if (instr->len < 0 || instr->addr + instr->len > mtd->size) return -EINVAL; - if (instr->addr % mtd->writesize || instr->len % mtd->writesize) + if (mtd_mod_by_ws(instr->addr, mtd) || mtd_mod_by_ws(instr->len, mtd)) return -EINVAL; - lnum = instr->addr / mtd->erasesize; - count = instr->len / mtd->erasesize; + lnum = mtd_div_by_eb(instr->addr, mtd); + count = mtd_div_by_eb(instr->len, mtd); vol = container_of(mtd, struct ubi_volume, gluebi_mtd); ubi = vol->ubi; @@ -255,7 +256,7 @@ static int gluebi_erase(struct mtd_info *mtd, struct erase_info *instr) out_err: instr->state = MTD_ERASE_FAILED; - instr->fail_addr = lnum * mtd->erasesize; + instr->fail_addr = (long long)lnum * mtd->erasesize; return err; } @@ -294,7 +295,7 @@ int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol) * bytes. 
*/ if (vol->vol_type == UBI_DYNAMIC_VOLUME) - mtd->size = vol->usable_leb_size * vol->reserved_pebs; + mtd->size = (long long)vol->usable_leb_size * vol->reserved_pebs; else mtd->size = vol->used_bytes; @@ -304,8 +305,8 @@ int ubi_create_gluebi(struct ubi_device *ubi, struct ubi_volume *vol) return -ENFILE; } - dbg_gen("added mtd%d (\"%s\"), size %u, EB size %u", - mtd->index, mtd->name, mtd->size, mtd->erasesize); + dbg_gen("added mtd%d (\"%s\"), size %llu, EB size %u", + mtd->index, mtd->name, (unsigned long long)mtd->size, mtd->erasesize); return 0; } diff --git a/drivers/oprofile/buffer_sync.c b/drivers/oprofile/buffer_sync.c index 65e8294a9e2..9da5a4b8113 100644 --- a/drivers/oprofile/buffer_sync.c +++ b/drivers/oprofile/buffer_sync.c @@ -1,11 +1,12 @@ /** * @file buffer_sync.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> * @author Barry Kasindorf + * @author Robert Richter <robert.richter@amd.com> * * This is the core of the buffer management. Each * CPU buffer is processed and entered into the @@ -315,88 +316,73 @@ static void add_trace_begin(void) add_event_entry(TRACE_BEGIN_CODE); } -#ifdef CONFIG_OPROFILE_IBS - -#define IBS_FETCH_CODE_SIZE 2 -#define IBS_OP_CODE_SIZE 5 - -/* - * Add IBS fetch and op entries to event buffer - */ -static void add_ibs_begin(int cpu, int code, struct mm_struct *mm) +static void add_data(struct op_entry *entry, struct mm_struct *mm) { - unsigned long rip; - int i, count; - unsigned long ibs_cookie = 0; + unsigned long code, pc, val; + unsigned long cookie; off_t offset; - struct op_sample *sample; - - sample = cpu_buffer_read_entry(cpu); - if (!sample) - goto Error; - rip = sample->eip; -#ifdef __LP64__ - rip += sample->event << 32; -#endif + if (!op_cpu_buffer_get_data(entry, &code)) + return; + if (!op_cpu_buffer_get_data(entry, &pc)) + return; + if (!op_cpu_buffer_get_size(entry)) + return; if (mm) { - ibs_cookie = lookup_dcookie(mm, rip, &offset); + cookie = lookup_dcookie(mm, pc, &offset); - if (ibs_cookie == NO_COOKIE) - offset = rip; - if (ibs_cookie == INVALID_COOKIE) { + if (cookie == NO_COOKIE) + offset = pc; + if (cookie == INVALID_COOKIE) { atomic_inc(&oprofile_stats.sample_lost_no_mapping); - offset = rip; + offset = pc; } - if (ibs_cookie != last_cookie) { - add_cookie_switch(ibs_cookie); - last_cookie = ibs_cookie; + if (cookie != last_cookie) { + add_cookie_switch(cookie); + last_cookie = cookie; } } else - offset = rip; + offset = pc; add_event_entry(ESCAPE_CODE); add_event_entry(code); add_event_entry(offset); /* Offset from Dcookie */ - /* we send the Dcookie offset, but send the raw Linear Add also*/ - add_event_entry(sample->eip); - add_event_entry(sample->event); - - if (code == IBS_FETCH_CODE) - count = IBS_FETCH_CODE_SIZE; /*IBS FETCH is 2 int64s*/ - else - count = IBS_OP_CODE_SIZE; /*IBS OP is 5 int64s*/ - - for (i = 0; i < count; i++) { - sample = cpu_buffer_read_entry(cpu); - if (!sample) - goto Error; - add_event_entry(sample->eip); - add_event_entry(sample->event); - } - - return; - -Error: - return; + while (op_cpu_buffer_get_data(entry, &val)) + add_event_entry(val); } -#endif - -static void add_sample_entry(unsigned long offset, unsigned long event) +static inline void add_sample_entry(unsigned long offset, unsigned long event) { add_event_entry(offset); add_event_entry(event); } -static int add_us_sample(struct mm_struct *mm, struct op_sample *s) +/* + * Add a sample to the global event 
buffer. If possible the + * sample is converted into a persistent dentry/offset pair + * for later lookup from userspace. Return 0 on failure. + */ +static int +add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel) { unsigned long cookie; off_t offset; + if (in_kernel) { + add_sample_entry(s->eip, s->event); + return 1; + } + + /* add userspace sample */ + + if (!mm) { + atomic_inc(&oprofile_stats.sample_lost_no_mm); + return 0; + } + cookie = lookup_dcookie(mm, s->eip, &offset); if (cookie == INVALID_COOKIE) { @@ -415,25 +401,6 @@ static int add_us_sample(struct mm_struct *mm, struct op_sample *s) } -/* Add a sample to the global event buffer. If possible the - * sample is converted into a persistent dentry/offset pair - * for later lookup from userspace. - */ -static int -add_sample(struct mm_struct *mm, struct op_sample *s, int in_kernel) -{ - if (in_kernel) { - add_sample_entry(s->eip, s->event); - return 1; - } else if (mm) { - return add_us_sample(mm, s); - } else { - atomic_inc(&oprofile_stats.sample_lost_no_mm); - } - return 0; -} - - static void release_mm(struct mm_struct *mm) { if (!mm) @@ -526,66 +493,69 @@ void sync_buffer(int cpu) { struct mm_struct *mm = NULL; struct mm_struct *oldmm; + unsigned long val; struct task_struct *new; unsigned long cookie = 0; int in_kernel = 1; sync_buffer_state state = sb_buffer_start; unsigned int i; unsigned long available; + unsigned long flags; + struct op_entry entry; + struct op_sample *sample; mutex_lock(&buffer_mutex); add_cpu_switch(cpu); - cpu_buffer_reset(cpu); - available = cpu_buffer_entries(cpu); + op_cpu_buffer_reset(cpu); + available = op_cpu_buffer_entries(cpu); for (i = 0; i < available; ++i) { - struct op_sample *s = cpu_buffer_read_entry(cpu); - if (!s) + sample = op_cpu_buffer_read_entry(&entry, cpu); + if (!sample) break; - if (is_code(s->eip)) { - switch (s->event) { - case 0: - case CPU_IS_KERNEL: + if (is_code(sample->eip)) { + flags = sample->event; + if (flags & TRACE_BEGIN) { + state = sb_bt_start; + add_trace_begin(); + } + if (flags & KERNEL_CTX_SWITCH) { /* kernel/userspace switch */ - in_kernel = s->event; + in_kernel = flags & IS_KERNEL; if (state == sb_buffer_start) state = sb_sample_start; - add_kernel_ctx_switch(s->event); - break; - case CPU_TRACE_BEGIN: - state = sb_bt_start; - add_trace_begin(); - break; -#ifdef CONFIG_OPROFILE_IBS - case IBS_FETCH_BEGIN: - state = sb_bt_start; - add_ibs_begin(cpu, IBS_FETCH_CODE, mm); - break; - case IBS_OP_BEGIN: - state = sb_bt_start; - add_ibs_begin(cpu, IBS_OP_CODE, mm); - break; -#endif - default: + add_kernel_ctx_switch(flags & IS_KERNEL); + } + if (flags & USER_CTX_SWITCH + && op_cpu_buffer_get_data(&entry, &val)) { /* userspace context switch */ + new = (struct task_struct *)val; oldmm = mm; - new = (struct task_struct *)s->event; release_mm(oldmm); mm = take_tasks_mm(new); if (mm != oldmm) cookie = get_exec_dcookie(mm); add_user_ctx_switch(new, cookie); - break; - } - } else if (state >= sb_bt_start && - !add_sample(mm, s, in_kernel)) { - if (state == sb_bt_start) { - state = sb_bt_ignore; - atomic_inc(&oprofile_stats.bt_lost_no_mapping); } + if (op_cpu_buffer_get_size(&entry)) + add_data(&entry, mm); + continue; + } + + if (state < sb_bt_start) + /* ignore sample */ + continue; + + if (add_sample(mm, sample, in_kernel)) + continue; + + /* ignore backtraces if failed to add a sample */ + if (state == sb_bt_start) { + state = sb_bt_ignore; + atomic_inc(&oprofile_stats.bt_lost_no_mapping); } } release_mm(mm); diff --git 
a/drivers/oprofile/cpu_buffer.c b/drivers/oprofile/cpu_buffer.c index 61090969158..2e03b6d796d 100644 --- a/drivers/oprofile/cpu_buffer.c +++ b/drivers/oprofile/cpu_buffer.c @@ -1,11 +1,12 @@ /** * @file cpu_buffer.c * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> * @author Barry Kasindorf <barry.kasindorf@amd.com> + * @author Robert Richter <robert.richter@amd.com> * * Each CPU has a local buffer that stores PC value/event * pairs. We also log context switches when we notice them. @@ -45,8 +46,8 @@ * can be changed to a single buffer solution when the ring buffer * access is implemented as non-locking atomic code. */ -struct ring_buffer *op_ring_buffer_read; -struct ring_buffer *op_ring_buffer_write; +static struct ring_buffer *op_ring_buffer_read; +static struct ring_buffer *op_ring_buffer_write; DEFINE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer); static void wq_sync_buffer(struct work_struct *work); @@ -54,19 +55,9 @@ static void wq_sync_buffer(struct work_struct *work); #define DEFAULT_TIMER_EXPIRE (HZ / 10) static int work_enabled; -void free_cpu_buffers(void) -{ - if (op_ring_buffer_read) - ring_buffer_free(op_ring_buffer_read); - op_ring_buffer_read = NULL; - if (op_ring_buffer_write) - ring_buffer_free(op_ring_buffer_write); - op_ring_buffer_write = NULL; -} - unsigned long oprofile_get_cpu_buffer_size(void) { - return fs_cpu_buffer_size; + return oprofile_cpu_buffer_size; } void oprofile_cpu_buffer_inc_smpl_lost(void) @@ -77,11 +68,21 @@ void oprofile_cpu_buffer_inc_smpl_lost(void) cpu_buf->sample_lost_overflow++; } +void free_cpu_buffers(void) +{ + if (op_ring_buffer_read) + ring_buffer_free(op_ring_buffer_read); + op_ring_buffer_read = NULL; + if (op_ring_buffer_write) + ring_buffer_free(op_ring_buffer_write); + op_ring_buffer_write = NULL; +} + int alloc_cpu_buffers(void) { int i; - unsigned long buffer_size = fs_cpu_buffer_size; + unsigned long buffer_size = oprofile_cpu_buffer_size; op_ring_buffer_read = ring_buffer_alloc(buffer_size, OP_BUFFER_FLAGS); if (!op_ring_buffer_read) @@ -97,8 +98,6 @@ int alloc_cpu_buffers(void) b->last_is_kernel = -1; b->tracing = 0; b->buffer_size = buffer_size; - b->tail_pos = 0; - b->head_pos = 0; b->sample_received = 0; b->sample_lost_overflow = 0; b->backtrace_aborted = 0; @@ -145,47 +144,156 @@ void end_cpu_work(void) flush_scheduled_work(); } -static inline int -add_sample(struct oprofile_cpu_buffer *cpu_buf, - unsigned long pc, unsigned long event) +/* + * This function prepares the cpu buffer to write a sample. + * + * Struct op_entry is used during operations on the ring buffer while + * struct op_sample contains the data that is stored in the ring + * buffer. Struct entry can be uninitialized. The function reserves a + * data array that is specified by size. Use + * op_cpu_buffer_write_commit() after preparing the sample. In case of + * errors a null pointer is returned, otherwise the pointer to the + * sample. 
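+ *
+ * Illustrative sketch only (not a real caller; 'code' and 'pc' are
+ * placeholder values) of reserving a sample with two extra data words:
+ *
+ *	struct op_entry entry;
+ *	struct op_sample *sample;
+ *
+ *	sample = op_cpu_buffer_write_reserve(&entry, 2);
+ *	if (!sample)
+ *		return;
+ *	sample->eip = ESCAPE_CODE;
+ *	sample->event = 0;
+ *	op_cpu_buffer_add_data(&entry, code);
+ *	op_cpu_buffer_add_data(&entry, pc);
+ *	op_cpu_buffer_write_commit(&entry);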
+ * + */ +struct op_sample +*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size) +{ + entry->event = ring_buffer_lock_reserve + (op_ring_buffer_write, sizeof(struct op_sample) + + size * sizeof(entry->sample->data[0]), &entry->irq_flags); + if (entry->event) + entry->sample = ring_buffer_event_data(entry->event); + else + entry->sample = NULL; + + if (!entry->sample) + return NULL; + + entry->size = size; + entry->data = entry->sample->data; + + return entry->sample; +} + +int op_cpu_buffer_write_commit(struct op_entry *entry) +{ + return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event, + entry->irq_flags); +} + +struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu) +{ + struct ring_buffer_event *e; + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + if (e) + goto event; + if (ring_buffer_swap_cpu(op_ring_buffer_read, + op_ring_buffer_write, + cpu)) + return NULL; + e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); + if (e) + goto event; + return NULL; + +event: + entry->event = e; + entry->sample = ring_buffer_event_data(e); + entry->size = (ring_buffer_event_length(e) - sizeof(struct op_sample)) + / sizeof(entry->sample->data[0]); + entry->data = entry->sample->data; + return entry->sample; +} + +unsigned long op_cpu_buffer_entries(int cpu) +{ + return ring_buffer_entries_cpu(op_ring_buffer_read, cpu) + + ring_buffer_entries_cpu(op_ring_buffer_write, cpu); +} + +static int +op_add_code(struct oprofile_cpu_buffer *cpu_buf, unsigned long backtrace, + int is_kernel, struct task_struct *task) { struct op_entry entry; - int ret; + struct op_sample *sample; + unsigned long flags; + int size; + + flags = 0; + + if (backtrace) + flags |= TRACE_BEGIN; + + /* notice a switch from user->kernel or vice versa */ + is_kernel = !!is_kernel; + if (cpu_buf->last_is_kernel != is_kernel) { + cpu_buf->last_is_kernel = is_kernel; + flags |= KERNEL_CTX_SWITCH; + if (is_kernel) + flags |= IS_KERNEL; + } + + /* notice a task switch */ + if (cpu_buf->last_task != task) { + cpu_buf->last_task = task; + flags |= USER_CTX_SWITCH; + } + + if (!flags) + /* nothing to do */ + return 0; + + if (flags & USER_CTX_SWITCH) + size = 1; + else + size = 0; + + sample = op_cpu_buffer_write_reserve(&entry, size); + if (!sample) + return -ENOMEM; - ret = cpu_buffer_write_entry(&entry); - if (ret) - return ret; + sample->eip = ESCAPE_CODE; + sample->event = flags; - entry.sample->eip = pc; - entry.sample->event = event; + if (size) + op_cpu_buffer_add_data(&entry, (unsigned long)task); - ret = cpu_buffer_write_commit(&entry); - if (ret) - return ret; + op_cpu_buffer_write_commit(&entry); return 0; } static inline int -add_code(struct oprofile_cpu_buffer *buffer, unsigned long value) +op_add_sample(struct oprofile_cpu_buffer *cpu_buf, + unsigned long pc, unsigned long event) { - return add_sample(buffer, ESCAPE_CODE, value); + struct op_entry entry; + struct op_sample *sample; + + sample = op_cpu_buffer_write_reserve(&entry, 0); + if (!sample) + return -ENOMEM; + + sample->eip = pc; + sample->event = event; + + return op_cpu_buffer_write_commit(&entry); } -/* This must be safe from any context. It's safe writing here - * because of the head/tail separation of the writer and reader - * of the CPU buffer. +/* + * This must be safe from any context. * * is_kernel is needed because on some architectures you cannot * tell if you are in kernel or user space simply by looking at * pc. 
We tag this in the buffer by generating kernel enter/exit * events whenever is_kernel changes */ -static int log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, - int is_kernel, unsigned long event) +static int +log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, + unsigned long backtrace, int is_kernel, unsigned long event) { - struct task_struct *task; - cpu_buf->sample_received++; if (pc == ESCAPE_CODE) { @@ -193,25 +301,10 @@ static int log_sample(struct oprofile_cpu_buffer *cpu_buf, unsigned long pc, return 0; } - is_kernel = !!is_kernel; - - task = current; - - /* notice a switch from user->kernel or vice versa */ - if (cpu_buf->last_is_kernel != is_kernel) { - cpu_buf->last_is_kernel = is_kernel; - if (add_code(cpu_buf, is_kernel)) - goto fail; - } - - /* notice a task switch */ - if (cpu_buf->last_task != task) { - cpu_buf->last_task = task; - if (add_code(cpu_buf, (unsigned long)task)) - goto fail; - } + if (op_add_code(cpu_buf, backtrace, is_kernel, current)) + goto fail; - if (add_sample(cpu_buf, pc, event)) + if (op_add_sample(cpu_buf, pc, event)) goto fail; return 1; @@ -221,109 +314,102 @@ fail: return 0; } -static int oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) +static inline void oprofile_begin_trace(struct oprofile_cpu_buffer *cpu_buf) { - add_code(cpu_buf, CPU_TRACE_BEGIN); cpu_buf->tracing = 1; - return 1; } -static void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf) +static inline void oprofile_end_trace(struct oprofile_cpu_buffer *cpu_buf) { cpu_buf->tracing = 0; } -void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, - unsigned long event, int is_kernel) +static inline void +__oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, + unsigned long event, int is_kernel) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); - - if (!backtrace_depth) { - log_sample(cpu_buf, pc, is_kernel, event); - return; - } - - if (!oprofile_begin_trace(cpu_buf)) - return; + unsigned long backtrace = oprofile_backtrace_depth; /* * if log_sample() fail we can't backtrace since we lost the * source of this event */ - if (log_sample(cpu_buf, pc, is_kernel, event)) - oprofile_ops.backtrace(regs, backtrace_depth); + if (!log_sample(cpu_buf, pc, backtrace, is_kernel, event)) + /* failed */ + return; + + if (!backtrace) + return; + + oprofile_begin_trace(cpu_buf); + oprofile_ops.backtrace(regs, backtrace); oprofile_end_trace(cpu_buf); } +void oprofile_add_ext_sample(unsigned long pc, struct pt_regs * const regs, + unsigned long event, int is_kernel) +{ + __oprofile_add_ext_sample(pc, regs, event, is_kernel); +} + void oprofile_add_sample(struct pt_regs * const regs, unsigned long event) { int is_kernel = !user_mode(regs); unsigned long pc = profile_pc(regs); - oprofile_add_ext_sample(pc, regs, event, is_kernel); + __oprofile_add_ext_sample(pc, regs, event, is_kernel); } -#ifdef CONFIG_OPROFILE_IBS - -#define MAX_IBS_SAMPLE_SIZE 14 - -void oprofile_add_ibs_sample(struct pt_regs * const regs, - unsigned int * const ibs_sample, int ibs_code) +/* + * Add samples with data to the ring buffer. + * + * Use oprofile_add_data(&entry, val) to add data and + * oprofile_write_commit(&entry) to commit the sample. 
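+ *
+ * A hypothetical call sequence (the sample code and data values below
+ * are placeholders, not an existing user):
+ *
+ *	struct op_entry entry;
+ *
+ *	oprofile_write_reserve(&entry, regs, pc, MY_SAMPLE_CODE, 2);
+ *	oprofile_add_data(&entry, data0);
+ *	oprofile_add_data(&entry, data1);
+ *	oprofile_write_commit(&entry);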
+ */ +void +oprofile_write_reserve(struct op_entry *entry, struct pt_regs * const regs, + unsigned long pc, int code, int size) { + struct op_sample *sample; int is_kernel = !user_mode(regs); struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); - struct task_struct *task; - int fail = 0; cpu_buf->sample_received++; - /* notice a switch from user->kernel or vice versa */ - if (cpu_buf->last_is_kernel != is_kernel) { - if (add_code(cpu_buf, is_kernel)) - goto fail; - cpu_buf->last_is_kernel = is_kernel; - } - - /* notice a task switch */ - if (!is_kernel) { - task = current; - if (cpu_buf->last_task != task) { - if (add_code(cpu_buf, (unsigned long)task)) - goto fail; - cpu_buf->last_task = task; - } - } - - fail = fail || add_code(cpu_buf, ibs_code); - fail = fail || add_sample(cpu_buf, ibs_sample[0], ibs_sample[1]); - fail = fail || add_sample(cpu_buf, ibs_sample[2], ibs_sample[3]); - fail = fail || add_sample(cpu_buf, ibs_sample[4], ibs_sample[5]); - - if (ibs_code == IBS_OP_BEGIN) { - fail = fail || add_sample(cpu_buf, ibs_sample[6], ibs_sample[7]); - fail = fail || add_sample(cpu_buf, ibs_sample[8], ibs_sample[9]); - fail = fail || add_sample(cpu_buf, ibs_sample[10], ibs_sample[11]); - } + /* no backtraces for samples with data */ + if (op_add_code(cpu_buf, 0, is_kernel, current)) + goto fail; - if (fail) + sample = op_cpu_buffer_write_reserve(entry, size + 2); + if (!sample) goto fail; + sample->eip = ESCAPE_CODE; + sample->event = 0; /* no flags */ - if (backtrace_depth) - oprofile_ops.backtrace(regs, backtrace_depth); + op_cpu_buffer_add_data(entry, code); + op_cpu_buffer_add_data(entry, pc); return; fail: cpu_buf->sample_lost_overflow++; - return; } -#endif +int oprofile_add_data(struct op_entry *entry, unsigned long val) +{ + return op_cpu_buffer_add_data(entry, val); +} + +int oprofile_write_commit(struct op_entry *entry) +{ + return op_cpu_buffer_write_commit(entry); +} void oprofile_add_pc(unsigned long pc, int is_kernel, unsigned long event) { struct oprofile_cpu_buffer *cpu_buf = &__get_cpu_var(cpu_buffer); - log_sample(cpu_buf, pc, is_kernel, event); + log_sample(cpu_buf, pc, 0, is_kernel, event); } void oprofile_add_trace(unsigned long pc) @@ -340,7 +426,7 @@ void oprofile_add_trace(unsigned long pc) if (pc == ESCAPE_CODE) goto fail; - if (add_sample(cpu_buf, pc, 0)) + if (op_add_sample(cpu_buf, pc, 0)) goto fail; return; diff --git a/drivers/oprofile/cpu_buffer.h b/drivers/oprofile/cpu_buffer.h index aacb0f0bc56..63f81c44846 100644 --- a/drivers/oprofile/cpu_buffer.h +++ b/drivers/oprofile/cpu_buffer.h @@ -1,10 +1,11 @@ /** * @file cpu_buffer.h * - * @remark Copyright 2002 OProfile authors + * @remark Copyright 2002-2009 OProfile authors * @remark Read the file COPYING * * @author John Levon <levon@movementarian.org> + * @author Robert Richter <robert.richter@amd.com> */ #ifndef OPROFILE_CPU_BUFFER_H @@ -31,17 +32,12 @@ void end_cpu_work(void); struct op_sample { unsigned long eip; unsigned long event; + unsigned long data[0]; }; -struct op_entry { - struct ring_buffer_event *event; - struct op_sample *sample; - unsigned long irq_flags; -}; +struct op_entry; struct oprofile_cpu_buffer { - volatile unsigned long head_pos; - volatile unsigned long tail_pos; unsigned long buffer_size; struct task_struct *last_task; int last_is_kernel; @@ -54,8 +50,6 @@ struct oprofile_cpu_buffer { struct delayed_work work; }; -extern struct ring_buffer *op_ring_buffer_read; -extern struct ring_buffer *op_ring_buffer_write; DECLARE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer); 
/* @@ -64,7 +58,7 @@ DECLARE_PER_CPU(struct oprofile_cpu_buffer, cpu_buffer); * reset these to invalid values; the next sample collected will * populate the buffer with proper values to initialize the buffer */ -static inline void cpu_buffer_reset(int cpu) +static inline void op_cpu_buffer_reset(int cpu) { struct oprofile_cpu_buffer *cpu_buf = &per_cpu(cpu_buffer, cpu); @@ -72,55 +66,48 @@ static inline void cpu_buffer_reset(int cpu) cpu_buf->last_task = NULL; } -static inline int cpu_buffer_write_entry(struct op_entry *entry) -{ - entry->event = ring_buffer_lock_reserve(op_ring_buffer_write, - sizeof(struct op_sample), - &entry->irq_flags); - if (entry->event) - entry->sample = ring_buffer_event_data(entry->event); - else - entry->sample = NULL; - - if (!entry->sample) - return -ENOMEM; - - return 0; -} +struct op_sample +*op_cpu_buffer_write_reserve(struct op_entry *entry, unsigned long size); +int op_cpu_buffer_write_commit(struct op_entry *entry); +struct op_sample *op_cpu_buffer_read_entry(struct op_entry *entry, int cpu); +unsigned long op_cpu_buffer_entries(int cpu); -static inline int cpu_buffer_write_commit(struct op_entry *entry) +/* returns the remaining free size of data in the entry */ +static inline +int op_cpu_buffer_add_data(struct op_entry *entry, unsigned long val) { - return ring_buffer_unlock_commit(op_ring_buffer_write, entry->event, - entry->irq_flags); + if (!entry->size) + return 0; + *entry->data = val; + entry->size--; + entry->data++; + return entry->size; } -static inline struct op_sample *cpu_buffer_read_entry(int cpu) +/* returns the size of data in the entry */ +static inline +int op_cpu_buffer_get_size(struct op_entry *entry) { - struct ring_buffer_event *e; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); - if (e) - return ring_buffer_event_data(e); - if (ring_buffer_swap_cpu(op_ring_buffer_read, - op_ring_buffer_write, - cpu)) - return NULL; - e = ring_buffer_consume(op_ring_buffer_read, cpu, NULL); - if (e) - return ring_buffer_event_data(e); - return NULL; + return entry->size; } -/* "acquire" as many cpu buffer slots as we can */ -static inline unsigned long cpu_buffer_entries(int cpu) +/* returns 0 if empty or the size of data including the current value */ +static inline +int op_cpu_buffer_get_data(struct op_entry *entry, unsigned long *val) { - return ring_buffer_entries_cpu(op_ring_buffer_read, cpu) - + ring_buffer_entries_cpu(op_ring_buffer_write, cpu); + int size = entry->size; + if (!size) + return 0; + *val = *entry->data; + entry->size--; + entry->data++; + return size; } -/* transient events for the CPU buffer -> event buffer */ -#define CPU_IS_KERNEL 1 -#define CPU_TRACE_BEGIN 2 -#define IBS_FETCH_BEGIN 3 -#define IBS_OP_BEGIN 4 +/* extra data flags */ +#define KERNEL_CTX_SWITCH (1UL << 0) +#define IS_KERNEL (1UL << 1) +#define TRACE_BEGIN (1UL << 2) +#define USER_CTX_SWITCH (1UL << 3) #endif /* OPROFILE_CPU_BUFFER_H */ diff --git a/drivers/oprofile/event_buffer.c b/drivers/oprofile/event_buffer.c index 191a3202cec..2b7ae366ceb 100644 --- a/drivers/oprofile/event_buffer.c +++ b/drivers/oprofile/event_buffer.c @@ -73,8 +73,8 @@ int alloc_event_buffer(void) unsigned long flags; spin_lock_irqsave(&oprofilefs_lock, flags); - buffer_size = fs_buffer_size; - buffer_watershed = fs_buffer_watershed; + buffer_size = oprofile_buffer_size; + buffer_watershed = oprofile_buffer_watershed; spin_unlock_irqrestore(&oprofilefs_lock, flags); if (buffer_watershed >= buffer_size) diff --git a/drivers/oprofile/oprof.c b/drivers/oprofile/oprof.c index 
cd375907f26..3cffce90f82 100644 --- a/drivers/oprofile/oprof.c +++ b/drivers/oprofile/oprof.c @@ -23,7 +23,7 @@ struct oprofile_operations oprofile_ops; unsigned long oprofile_started; -unsigned long backtrace_depth; +unsigned long oprofile_backtrace_depth; static unsigned long is_setup; static DEFINE_MUTEX(start_mutex); @@ -172,7 +172,7 @@ int oprofile_set_backtrace(unsigned long val) goto out; } - backtrace_depth = val; + oprofile_backtrace_depth = val; out: mutex_unlock(&start_mutex); diff --git a/drivers/oprofile/oprof.h b/drivers/oprofile/oprof.h index 5df0c21a608..c288d3c24b5 100644 --- a/drivers/oprofile/oprof.h +++ b/drivers/oprofile/oprof.h @@ -21,12 +21,12 @@ void oprofile_stop(void); struct oprofile_operations; -extern unsigned long fs_buffer_size; -extern unsigned long fs_cpu_buffer_size; -extern unsigned long fs_buffer_watershed; +extern unsigned long oprofile_buffer_size; +extern unsigned long oprofile_cpu_buffer_size; +extern unsigned long oprofile_buffer_watershed; extern struct oprofile_operations oprofile_ops; extern unsigned long oprofile_started; -extern unsigned long backtrace_depth; +extern unsigned long oprofile_backtrace_depth; struct super_block; struct dentry; diff --git a/drivers/oprofile/oprofile_files.c b/drivers/oprofile/oprofile_files.c index d8201998b0b..5d36ffc30dd 100644 --- a/drivers/oprofile/oprofile_files.c +++ b/drivers/oprofile/oprofile_files.c @@ -14,17 +14,18 @@ #include "oprofile_stats.h" #include "oprof.h" -#define FS_BUFFER_SIZE_DEFAULT 131072 -#define FS_CPU_BUFFER_SIZE_DEFAULT 8192 -#define FS_BUFFER_WATERSHED_DEFAULT 32768 /* FIXME: tune */ +#define BUFFER_SIZE_DEFAULT 131072 +#define CPU_BUFFER_SIZE_DEFAULT 8192 +#define BUFFER_WATERSHED_DEFAULT 32768 /* FIXME: tune */ -unsigned long fs_buffer_size; -unsigned long fs_cpu_buffer_size; -unsigned long fs_buffer_watershed; +unsigned long oprofile_buffer_size; +unsigned long oprofile_cpu_buffer_size; +unsigned long oprofile_buffer_watershed; static ssize_t depth_read(struct file *file, char __user *buf, size_t count, loff_t *offset) { - return oprofilefs_ulong_to_user(backtrace_depth, buf, count, offset); + return oprofilefs_ulong_to_user(oprofile_backtrace_depth, buf, count, + offset); } @@ -125,16 +126,16 @@ static const struct file_operations dump_fops = { void oprofile_create_files(struct super_block *sb, struct dentry *root) { /* reinitialize default values */ - fs_buffer_size = FS_BUFFER_SIZE_DEFAULT; - fs_cpu_buffer_size = FS_CPU_BUFFER_SIZE_DEFAULT; - fs_buffer_watershed = FS_BUFFER_WATERSHED_DEFAULT; + oprofile_buffer_size = BUFFER_SIZE_DEFAULT; + oprofile_cpu_buffer_size = CPU_BUFFER_SIZE_DEFAULT; + oprofile_buffer_watershed = BUFFER_WATERSHED_DEFAULT; oprofilefs_create_file(sb, root, "enable", &enable_fops); oprofilefs_create_file_perm(sb, root, "dump", &dump_fops, 0666); oprofilefs_create_file(sb, root, "buffer", &event_buffer_fops); - oprofilefs_create_ulong(sb, root, "buffer_size", &fs_buffer_size); - oprofilefs_create_ulong(sb, root, "buffer_watershed", &fs_buffer_watershed); - oprofilefs_create_ulong(sb, root, "cpu_buffer_size", &fs_cpu_buffer_size); + oprofilefs_create_ulong(sb, root, "buffer_size", &oprofile_buffer_size); + oprofilefs_create_ulong(sb, root, "buffer_watershed", &oprofile_buffer_watershed); + oprofilefs_create_ulong(sb, root, "cpu_buffer_size", &oprofile_cpu_buffer_size); oprofilefs_create_file(sb, root, "cpu_type", &cpu_type_fops); oprofilefs_create_file(sb, root, "backtrace_depth", &depth_fops); oprofilefs_create_file(sb, root, "pointer_size", 
&pointer_size_fops); diff --git a/drivers/parisc/asp.c b/drivers/parisc/asp.c index 82136913536..7931133526c 100644 --- a/drivers/parisc/asp.c +++ b/drivers/parisc/asp.c @@ -71,8 +71,7 @@ static void asp_choose_irq(struct parisc_device *dev, void *ctrl) */ #define ASP_INTERRUPT_ADDR 0xf0800000 -int __init -asp_init_chip(struct parisc_device *dev) +static int __init asp_init_chip(struct parisc_device *dev) { struct gsc_irq gsc_irq; int ret; diff --git a/drivers/parisc/ccio-dma.c b/drivers/parisc/ccio-dma.c index dcc1e9958d2..cd4dd7ed2c0 100644 --- a/drivers/parisc/ccio-dma.c +++ b/drivers/parisc/ccio-dma.c @@ -555,7 +555,7 @@ static u32 hint_lookup[] = { * (Load Coherence Index) instruction. The 8 bits used for the virtual * index are bits 12:19 of the value returned by LCI. */ -void CCIO_INLINE +static void CCIO_INLINE ccio_io_pdir_entry(u64 *pdir_ptr, space_t sid, unsigned long vba, unsigned long hints) { @@ -1578,8 +1578,6 @@ static int __init ccio_probe(struct parisc_device *dev) ioc_count++; - parisc_vmerge_boundary = IOVP_SIZE; - parisc_vmerge_max_size = BITS_PER_LONG * IOVP_SIZE; parisc_has_iommu(); return 0; } diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index 77cc8bfef8c..d539d9df88e 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -287,7 +287,7 @@ DINO_PORT_OUT(b, 8, 3) DINO_PORT_OUT(w, 16, 2) DINO_PORT_OUT(l, 32, 0) -struct pci_port_ops dino_port_ops = { +static struct pci_port_ops dino_port_ops = { .inb = dino_in8, .inw = dino_in16, .inl = dino_in32, @@ -690,7 +690,7 @@ dino_fixup_bus(struct pci_bus *bus) } -struct pci_bios_ops dino_bios_ops = { +static struct pci_bios_ops dino_bios_ops = { .init = dino_bios_init, .fixup_bus = dino_fixup_bus }; diff --git a/drivers/parisc/hppb.c b/drivers/parisc/hppb.c index 65eee67aa2a..13856415b43 100644 --- a/drivers/parisc/hppb.c +++ b/drivers/parisc/hppb.c @@ -29,7 +29,7 @@ struct hppb_card { struct hppb_card *next; }; -struct hppb_card hppb_card_head = { +static struct hppb_card hppb_card_head = { .hpa = 0, .next = NULL, }; diff --git a/drivers/parisc/lasi.c b/drivers/parisc/lasi.c index bee510098ce..e65727ca9fc 100644 --- a/drivers/parisc/lasi.c +++ b/drivers/parisc/lasi.c @@ -107,7 +107,7 @@ lasi_init_irq(struct gsc_asic *this_lasi) #else -void __init lasi_led_init(unsigned long lasi_hpa) +static void __init lasi_led_init(unsigned long lasi_hpa) { unsigned long datareg; @@ -163,8 +163,7 @@ static void lasi_power_off(void) gsc_writel(0x02, datareg); } -int __init -lasi_init_chip(struct parisc_device *dev) +static int __init lasi_init_chip(struct parisc_device *dev) { extern void (*chassis_power_off)(void); struct gsc_asic *lasi; diff --git a/drivers/parisc/lba_pci.c b/drivers/parisc/lba_pci.c index a28c8946dea..d8233de8c75 100644 --- a/drivers/parisc/lba_pci.c +++ b/drivers/parisc/lba_pci.c @@ -824,7 +824,7 @@ lba_fixup_bus(struct pci_bus *bus) } -struct pci_bios_ops lba_bios_ops = { +static struct pci_bios_ops lba_bios_ops = { .init = lba_bios_init, .fixup_bus = lba_fixup_bus, }; diff --git a/drivers/parisc/sba_iommu.c b/drivers/parisc/sba_iommu.c index bc73b96346f..3fac8f81d59 100644 --- a/drivers/parisc/sba_iommu.c +++ b/drivers/parisc/sba_iommu.c @@ -561,7 +561,7 @@ typedef unsigned long space_t; * IOMMU uses little endian for the pdir. 
*/ -void SBA_INLINE +static void SBA_INLINE sba_io_pdir_entry(u64 *pdir_ptr, space_t sid, unsigned long vba, unsigned long hint) { @@ -1874,7 +1874,7 @@ static struct parisc_device_id sba_tbl[] = { { 0, } }; -int sba_driver_callback(struct parisc_device *); +static int sba_driver_callback(struct parisc_device *); static struct parisc_driver sba_driver = { .name = MODULE_NAME, @@ -1887,8 +1887,7 @@ static struct parisc_driver sba_driver = { ** If so, initialize the chip and tell other partners in crime they ** have work to do. */ -int -sba_driver_callback(struct parisc_device *dev) +static int sba_driver_callback(struct parisc_device *dev) { struct sba_device *sba_dev; u32 func_class; @@ -1979,8 +1978,6 @@ sba_driver_callback(struct parisc_device *dev) proc_create("sba_iommu-bitmap", 0, root, &sba_proc_bitmap_fops); #endif - parisc_vmerge_boundary = IOVP_SIZE; - parisc_vmerge_max_size = IOVP_SIZE * BITS_PER_LONG; parisc_has_iommu(); return 0; } diff --git a/drivers/parisc/wax.c b/drivers/parisc/wax.c index 892a83bbe73..da9d5ad1353 100644 --- a/drivers/parisc/wax.c +++ b/drivers/parisc/wax.c @@ -68,8 +68,7 @@ wax_init_irq(struct gsc_asic *wax) // gsc_writel(0xFFFFFFFF, base+0x2000); /* RS232-B on Wax */ } -int __init -wax_init_chip(struct parisc_device *dev) +static int __init wax_init_chip(struct parisc_device *dev) { struct gsc_asic *wax; struct parisc_device *parent; diff --git a/drivers/pci/hotplug/acpi_pcihp.c b/drivers/pci/hotplug/acpi_pcihp.c index c62ab8d240a..1c114180106 100644 --- a/drivers/pci/hotplug/acpi_pcihp.c +++ b/drivers/pci/hotplug/acpi_pcihp.c @@ -33,7 +33,6 @@ #include <linux/pci-acpi.h> #include <acpi/acpi.h> #include <acpi/acpi_bus.h> -#include <acpi/actypes.h> #define MY_NAME "acpi_pcihp" diff --git a/drivers/pci/hotplug/pciehp.h b/drivers/pci/hotplug/pciehp.h index 27fd18f019f..db85284ffb6 100644 --- a/drivers/pci/hotplug/pciehp.h +++ b/drivers/pci/hotplug/pciehp.h @@ -217,7 +217,6 @@ struct hpc_ops { #ifdef CONFIG_ACPI #include <acpi/acpi.h> #include <acpi/acpi_bus.h> -#include <acpi/actypes.h> #include <linux/pci-acpi.h> extern void __init pciehp_acpi_slot_detection_init(void); diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c index 3582512e722..deea8a187eb 100644 --- a/drivers/pci/pci-acpi.c +++ b/drivers/pci/pci-acpi.c @@ -13,8 +13,6 @@ #include <linux/module.h> #include <linux/pci-aspm.h> #include <acpi/acpi.h> -#include <acpi/acnamesp.h> -#include <acpi/acresrc.h> #include <acpi/acpi_bus.h> #include <linux/pci-acpi.h> diff --git a/drivers/platform/Kconfig b/drivers/platform/Kconfig new file mode 100644 index 00000000000..9652c3fe7f5 --- /dev/null +++ b/drivers/platform/Kconfig @@ -0,0 +1,5 @@ +# drivers/platform/Kconfig + +if X86 +source "drivers/platform/x86/Kconfig" +endif diff --git a/drivers/platform/Makefile b/drivers/platform/Makefile new file mode 100644 index 00000000000..782953ae4c0 --- /dev/null +++ b/drivers/platform/Makefile @@ -0,0 +1,5 @@ +# +# Makefile for linux/drivers/platform +# + +obj-$(CONFIG_X86) += x86/ diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig new file mode 100644 index 00000000000..e65448e99b4 --- /dev/null +++ b/drivers/platform/x86/Kconfig @@ -0,0 +1,375 @@ +# +# X86 Platform Specific Drivers +# + +menuconfig X86_PLATFORM_DEVICES + bool "X86 Platform Specific Device Drivers" + default y + ---help--- + Say Y here to get to see options for device drivers for various + x86 platforms, including vendor-specific laptop extension drivers. + This option alone does not add any kernel code. 
+ + If you say N, all options in this submenu will be skipped and disabled. + +if X86_PLATFORM_DEVICES + +config ACER_WMI + tristate "Acer WMI Laptop Extras (EXPERIMENTAL)" + depends on EXPERIMENTAL + depends on ACPI + depends on LEDS_CLASS + depends on NEW_LEDS + depends on BACKLIGHT_CLASS_DEVICE + depends on SERIO_I8042 + depends on RFKILL + select ACPI_WMI + ---help--- + This is a driver for newer Acer (and Wistron) laptops. It adds + wireless radio and bluetooth control, and on some laptops, + exposes the mail LED and LCD backlight. + + For more information about this driver see + <file:Documentation/laptops/acer-wmi.txt> + + If you have an ACPI-WMI compatible Acer/ Wistron laptop, say Y or M + here. + +config ASUS_LAPTOP + tristate "Asus Laptop Extras (EXPERIMENTAL)" + depends on ACPI + depends on EXPERIMENTAL && !ACPI_ASUS + depends on LEDS_CLASS + depends on NEW_LEDS + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This is the new Linux driver for Asus laptops. It may also support some + MEDION, JVC or VICTOR laptops. It makes all the extra buttons generate + standard ACPI events that go through /proc/acpi/events. It also adds + support for video output switching, LCD backlight control, Bluetooth and + Wlan control, and most importantly, allows you to blink those fancy LEDs. + + For more information and a userspace daemon for handling the extra + buttons see <http://acpi4asus.sf.net/>. + + If you have an ACPI-compatible ASUS laptop, say Y or M here. + +config FUJITSU_LAPTOP + tristate "Fujitsu Laptop Extras" + depends on ACPI + depends on INPUT + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This is a driver for laptops built by Fujitsu: + + * P2xxx/P5xxx/S6xxx/S7xxx series Lifebooks + * Possibly other Fujitsu laptop models + * Tested with S6410 and S7020 + + It adds support for LCD brightness control and some hotkeys. + + If you have a Fujitsu laptop, say Y or M here. + +config FUJITSU_LAPTOP_DEBUG + bool "Verbose debug mode for Fujitsu Laptop Extras" + depends on FUJITSU_LAPTOP + default n + ---help--- + Enables extra debug output from the fujitsu extras driver, at the + expense of a slight increase in driver size. + + If you are not sure, say N here. + +config TC1100_WMI + tristate "HP Compaq TC1100 Tablet WMI Extras (EXPERIMENTAL)" + depends on !X86_64 + depends on EXPERIMENTAL + depends on ACPI + select ACPI_WMI + ---help--- + This is a driver for the WMI extensions (wireless and bluetooth power + control) of the HP Compaq TC1100 tablet. + +config HP_WMI + tristate "HP WMI extras" + depends on ACPI_WMI + depends on INPUT + depends on RFKILL + help + Say Y here if you want to support WMI-based hotkeys on HP laptops and + to read data from WMI such as docking or ambient light sensor state. + + To compile this driver as a module, choose M here: the module will + be called hp-wmi. + +config MSI_LAPTOP + tristate "MSI Laptop Extras" + depends on ACPI + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This is a driver for laptops built by MSI (MICRO-STAR + INTERNATIONAL): + + MSI MegaBook S270 (MS-1013) + Cytron/TCM/Medion/Tchibo MD96100/SAM2000 + + It adds support for Bluetooth, WLAN and LCD brightness control. + + More information about this driver is available at + <http://0pointer.de/lennart/tchibo.html>. + + If you have an MSI S270 laptop, say Y or M here. 
+ +config PANASONIC_LAPTOP + tristate "Panasonic Laptop Extras" + depends on INPUT && ACPI + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This driver adds support for access to backlight control and hotkeys + on Panasonic Let's Note laptops. + + If you have a Panasonic Let's note laptop (such as the R1(N variant), + R2, R3, R5, T2, W2 and Y2 series), say Y. + +config COMPAL_LAPTOP + tristate "Compal Laptop Extras" + depends on ACPI + depends on BACKLIGHT_CLASS_DEVICE + ---help--- + This is a driver for laptops built by Compal: + + Compal FL90/IFL90 + Compal FL91/IFL91 + Compal FL92/JFL92 + Compal FT00/IFT00 + + It adds support for Bluetooth, WLAN and LCD brightness control. + + If you have an Compal FL9x/IFL9x/FT00 laptop, say Y or M here. + +config SONY_LAPTOP + tristate "Sony Laptop Extras" + depends on ACPI + select BACKLIGHT_CLASS_DEVICE + depends on INPUT + ---help--- + This mini-driver drives the SNC and SPIC devices present in the ACPI + BIOS of the Sony Vaio laptops. + + It gives access to some extra laptop functionalities like Bluetooth, + screen brightness control, Fn keys and allows powering on/off some + devices. + + Read <file:Documentation/laptops/sony-laptop.txt> for more information. + +config SONYPI_COMPAT + bool "Sonypi compatibility" + depends on SONY_LAPTOP + ---help--- + Build the sonypi driver compatibility code into the sony-laptop driver. + +config THINKPAD_ACPI + tristate "ThinkPad ACPI Laptop Extras" + depends on ACPI + select BACKLIGHT_LCD_SUPPORT + select BACKLIGHT_CLASS_DEVICE + select HWMON + select NVRAM + select INPUT + select NEW_LEDS + select LEDS_CLASS + select NET + select RFKILL + ---help--- + This is a driver for the IBM and Lenovo ThinkPad laptops. It adds + support for Fn-Fx key combinations, Bluetooth control, video + output switching, ThinkLight control, UltraBay eject and more. + For more information about this driver see + <file:Documentation/laptops/thinkpad-acpi.txt> and + <http://ibm-acpi.sf.net/> . + + This driver was formerly known as ibm-acpi. + + If you have an IBM or Lenovo ThinkPad laptop, say Y or M here. + +config THINKPAD_ACPI_DEBUG + bool "Verbose debug mode" + depends on THINKPAD_ACPI + default n + ---help--- + Enables extra debugging information, at the expense of a slightly + increase in driver size. + + If you are not sure, say N here. + +config THINKPAD_ACPI_DOCK + bool "Legacy Docking Station Support" + depends on THINKPAD_ACPI + depends on ACPI_DOCK=n + default n + ---help--- + Allows the thinkpad_acpi driver to handle docking station events. + This support was made obsolete by the generic ACPI docking station + support (CONFIG_ACPI_DOCK). It will allow locking and removing the + laptop from the docking station, but will not properly connect PCI + devices. + + If you are not sure, say N here. + +config THINKPAD_ACPI_BAY + bool "Legacy Removable Bay Support" + depends on THINKPAD_ACPI + default y + ---help--- + Allows the thinkpad_acpi driver to handle removable bays. It will + electrically disable the device in the bay, and also generate + notifications when the bay lever is ejected or inserted. + + If you are not sure, say Y here. + +config THINKPAD_ACPI_VIDEO + bool "Video output control support" + depends on THINKPAD_ACPI + default y + ---help--- + Allows the thinkpad_acpi driver to provide an interface to control + the various video output ports. 
+ + This feature often won't work well, depending on ThinkPad model, + display state, video output devices in use, whether there is a X + server running, phase of the moon, and the current mood of + Schroedinger's cat. If you can use X.org's RandR to control + your ThinkPad's video output ports instead of this feature, + don't think twice: do it and say N here to save some memory. + + If you are not sure, say Y here. + +config THINKPAD_ACPI_HOTKEY_POLL + bool "Support NVRAM polling for hot keys" + depends on THINKPAD_ACPI + default y + ---help--- + Some thinkpad models benefit from NVRAM polling to detect a few of + the hot key press events. If you know your ThinkPad model does not + need to do NVRAM polling to support any of the hot keys you use, + unselecting this option will save about 1kB of memory. + + ThinkPads T40 and newer, R52 and newer, and X31 and newer are + unlikely to need NVRAM polling in their latest BIOS versions. + + NVRAM polling can detect at most the following keys: ThinkPad/Access + IBM, Zoom, Switch Display (fn+F7), ThinkLight, Volume up/down/mute, + Brightness up/down, Display Expand (fn+F8), Hibernate (fn+F12). + + If you are not sure, say Y here. The driver enables polling only if + it is strictly necessary to do so. + +config INTEL_MENLOW + tristate "Thermal Management driver for Intel menlow platform" + depends on ACPI_THERMAL + select THERMAL + ---help--- + ACPI thermal management enhancement driver on + Intel Menlow platform. + + If unsure, say N. + +config EEEPC_LAPTOP + tristate "Eee PC Hotkey Driver (EXPERIMENTAL)" + depends on ACPI + depends on EXPERIMENTAL + select BACKLIGHT_CLASS_DEVICE + select HWMON + select RFKILL + ---help--- + This driver supports the Fn-Fx keys on Eee PC laptops. + It also adds the ability to switch camera/wlan on/off. + + If you have an Eee PC laptop, say Y or M here. + + +config ACPI_WMI + tristate "WMI (EXPERIMENTAL)" + depends on ACPI + depends on EXPERIMENTAL + help + This driver adds support for the ACPI-WMI (Windows Management + Instrumentation) mapper device (PNP0C14) found on some systems. + + ACPI-WMI is a proprietary extension to ACPI to expose parts of the + ACPI firmware to userspace - this is done through various vendor + defined methods and data blocks in a PNP0C14 device, which are then + made available for userspace to call. + + The implementation of this in Linux currently only exposes this to + other kernel space drivers. + + This driver is a required dependency to build the firmware specific + drivers needed on many machines, including Acer and HP laptops. + + It is safe to enable this driver even if your DSDT doesn't define + any ACPI-WMI devices. + +config ACPI_ASUS + tristate "ASUS/Medion Laptop Extras" + depends on ACPI + select BACKLIGHT_CLASS_DEVICE + ---help--- + This driver provides support for extra features of ACPI-compatible + ASUS laptops. As some of Medion laptops are made by ASUS, it may also + support some Medion laptops (such as 9675 for example). It makes all + the extra buttons generate standard ACPI events that go through + /proc/acpi/events, and (on some models) adds support for changing the + display brightness and output, switching the LCD backlight on and off, + and most importantly, allows you to blink those fancy LEDs intended + for reporting mail and wireless status. + + Note: display switching code is currently considered EXPERIMENTAL, + toying with these values may even lock your machine. + + All settings are changed via /proc/acpi/asus directory entries. 
Owner + and group for these entries can be set with asus_uid and asus_gid + parameters. + + More information and a userspace daemon for handling the extra buttons + at <http://sourceforge.net/projects/acpi4asus/>. + + If you have an ACPI-compatible ASUS laptop, say Y or M here. This + driver is still under development, so if your laptop is unsupported or + something works not quite as expected, please use the mailing list + available on the above page (acpi4asus-user@lists.sourceforge.net). + + NOTE: This driver is deprecated and will probably be removed soon, + use asus-laptop instead. + +config ACPI_TOSHIBA + tristate "Toshiba Laptop Extras" + depends on ACPI + depends on INPUT + select INPUT_POLLDEV + select NET + select RFKILL + select BACKLIGHT_CLASS_DEVICE + ---help--- + This driver adds support for access to certain system settings + on "legacy free" Toshiba laptops. These laptops can be recognized by + their lack of a BIOS setup menu and APM support. + + On these machines, all system configuration is handled through the + ACPI. This driver is required for access to controls not covered + by the general ACPI drivers, such as LCD brightness, video output, + etc. + + This driver differs from the non-ACPI Toshiba laptop driver (located + under "Processor type and features") in several aspects. + Configuration is accessed by reading and writing text files in the + /proc tree instead of by program interface to /dev. Furthermore, no + power management functions are exposed, as those are handled by the + general ACPI drivers. + + More information about this driver is available at + <http://memebeam.org/toys/ToshibaAcpiDriver>. + + If you have a legacy free Toshiba laptop (such as the Libretto L1 + series), say Y. +endif # X86_PLATFORM_DEVICES diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile new file mode 100644 index 00000000000..1e9de2ae0de --- /dev/null +++ b/drivers/platform/x86/Makefile @@ -0,0 +1,19 @@ +# +# Makefile for linux/drivers/platform/x86 +# x86 Platform-Specific Drivers +# +obj-$(CONFIG_ASUS_LAPTOP) += asus-laptop.o +obj-$(CONFIG_EEEPC_LAPTOP) += eeepc-laptop.o +obj-$(CONFIG_MSI_LAPTOP) += msi-laptop.o +obj-$(CONFIG_COMPAL_LAPTOP) += compal-laptop.o +obj-$(CONFIG_ACER_WMI) += acer-wmi.o +obj-$(CONFIG_HP_WMI) += hp-wmi.o +obj-$(CONFIG_TC1100_WMI) += tc1100-wmi.o +obj-$(CONFIG_SONY_LAPTOP) += sony-laptop.o +obj-$(CONFIG_THINKPAD_ACPI) += thinkpad_acpi.o +obj-$(CONFIG_FUJITSU_LAPTOP) += fujitsu-laptop.o +obj-$(CONFIG_PANASONIC_LAPTOP) += panasonic-laptop.o +obj-$(CONFIG_INTEL_MENLOW) += intel_menlow.o +obj-$(CONFIG_ACPI_WMI) += wmi.o +obj-$(CONFIG_ACPI_ASUS) += asus_acpi.o +obj-$(CONFIG_ACPI_TOSHIBA) += toshiba_acpi.o diff --git a/drivers/misc/acer-wmi.c b/drivers/platform/x86/acer-wmi.c index 94c9f911824..94c9f911824 100644 --- a/drivers/misc/acer-wmi.c +++ b/drivers/platform/x86/acer-wmi.c diff --git a/drivers/misc/asus-laptop.c b/drivers/platform/x86/asus-laptop.c index 8fb8b359104..8fb8b359104 100644 --- a/drivers/misc/asus-laptop.c +++ b/drivers/platform/x86/asus-laptop.c diff --git a/drivers/acpi/asus_acpi.c b/drivers/platform/x86/asus_acpi.c index 1e74988c7b2..1e74988c7b2 100644 --- a/drivers/acpi/asus_acpi.c +++ b/drivers/platform/x86/asus_acpi.c diff --git a/drivers/misc/compal-laptop.c b/drivers/platform/x86/compal-laptop.c index 11003bba10d..11003bba10d 100644 --- a/drivers/misc/compal-laptop.c +++ b/drivers/platform/x86/compal-laptop.c diff --git a/drivers/misc/eeepc-laptop.c b/drivers/platform/x86/eeepc-laptop.c index 
02fe2b8b893..02fe2b8b893 100644 --- a/drivers/misc/eeepc-laptop.c +++ b/drivers/platform/x86/eeepc-laptop.c diff --git a/drivers/misc/fujitsu-laptop.c b/drivers/platform/x86/fujitsu-laptop.c index a7dd3e9fb79..65dc41540c6 100644 --- a/drivers/misc/fujitsu-laptop.c +++ b/drivers/platform/x86/fujitsu-laptop.c @@ -3,6 +3,7 @@ /* Copyright (C) 2007,2008 Jonathan Woithe <jwoithe@physics.adelaide.edu.au> Copyright (C) 2008 Peter Gruber <nokos@gmx.net> + Copyright (C) 2008 Tony Vroon <tony@linx.net> Based on earlier work: Copyright (C) 2003 Shane Spencer <shane@bogomip.com> Adrian Yee <brewt-fujitsu@brewt.org> @@ -65,8 +66,11 @@ #include <linux/kfifo.h> #include <linux/video_output.h> #include <linux/platform_device.h> +#ifdef CONFIG_LEDS_CLASS +#include <linux/leds.h> +#endif -#define FUJITSU_DRIVER_VERSION "0.4.3" +#define FUJITSU_DRIVER_VERSION "0.5.0" #define FUJITSU_LCD_N_LEVELS 8 @@ -83,6 +87,24 @@ #define ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS 0x86 #define ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS 0x87 +/* FUNC interface - command values */ +#define FUNC_RFKILL 0x1000 +#define FUNC_LEDS 0x1001 +#define FUNC_BUTTONS 0x1002 +#define FUNC_BACKLIGHT 0x1004 + +/* FUNC interface - responses */ +#define UNSUPPORTED_CMD 0x80000000 + +#ifdef CONFIG_LEDS_CLASS +/* FUNC interface - LED control */ +#define FUNC_LED_OFF 0x1 +#define FUNC_LED_ON 0x30001 +#define KEYBOARD_LAMPS 0x100 +#define LOGOLAMP_POWERON 0x2000 +#define LOGOLAMP_ALWAYS 0x4000 +#endif + /* Hotkey details */ #define KEY1_CODE 0x410 /* codes for the keys in the GIRB register */ #define KEY2_CODE 0x411 @@ -133,7 +155,6 @@ struct fujitsu_t { static struct fujitsu_t *fujitsu; static int use_alt_lcd_levels = -1; -static int disable_brightness_keys = -1; static int disable_brightness_adjust = -1; /* Device used to access other hotkeys on the laptop */ @@ -145,8 +166,9 @@ struct fujitsu_hotkey_t { struct platform_device *pf_device; struct kfifo *fifo; spinlock_t fifo_lock; - - unsigned int irb; /* info about the pressed buttons */ + int rfkill_state; + int logolamp_registered; + int kblamps_registered; }; static struct fujitsu_hotkey_t *fujitsu_hotkey; @@ -154,12 +176,139 @@ static struct fujitsu_hotkey_t *fujitsu_hotkey; static void acpi_fujitsu_hotkey_notify(acpi_handle handle, u32 event, void *data); +#ifdef CONFIG_LEDS_CLASS +static enum led_brightness logolamp_get(struct led_classdev *cdev); +static void logolamp_set(struct led_classdev *cdev, + enum led_brightness brightness); + +struct led_classdev logolamp_led = { + .name = "fujitsu::logolamp", + .brightness_get = logolamp_get, + .brightness_set = logolamp_set +}; + +static enum led_brightness kblamps_get(struct led_classdev *cdev); +static void kblamps_set(struct led_classdev *cdev, + enum led_brightness brightness); + +struct led_classdev kblamps_led = { + .name = "fujitsu::kblamps", + .brightness_get = kblamps_get, + .brightness_set = kblamps_set +}; +#endif + #ifdef CONFIG_FUJITSU_LAPTOP_DEBUG static u32 dbg_level = 0x03; #endif static void acpi_fujitsu_notify(acpi_handle handle, u32 event, void *data); +/* Fujitsu ACPI interface function */ + +static int call_fext_func(int cmd, int arg0, int arg1, int arg2) +{ + acpi_status status = AE_OK; + union acpi_object params[4] = { + { .type = ACPI_TYPE_INTEGER }, + { .type = ACPI_TYPE_INTEGER }, + { .type = ACPI_TYPE_INTEGER }, + { .type = ACPI_TYPE_INTEGER } + }; + struct acpi_object_list arg_list = { 4, ¶ms[0] }; + struct acpi_buffer output; + union acpi_object out_obj; + acpi_handle handle = NULL; + + status = 
acpi_get_handle(fujitsu_hotkey->acpi_handle, "FUNC", &handle); + if (ACPI_FAILURE(status)) { + vdbg_printk(FUJLAPTOP_DBG_ERROR, + "FUNC interface is not present\n"); + return -ENODEV; + } + + params[0].integer.value = cmd; + params[1].integer.value = arg0; + params[2].integer.value = arg1; + params[3].integer.value = arg2; + + output.length = sizeof(out_obj); + output.pointer = &out_obj; + + status = acpi_evaluate_object(handle, NULL, &arg_list, &output); + if (ACPI_FAILURE(status)) { + vdbg_printk(FUJLAPTOP_DBG_WARN, + "FUNC 0x%x (args 0x%x, 0x%x, 0x%x) call failed\n", + cmd, arg0, arg1, arg2); + return -ENODEV; + } + + if (out_obj.type != ACPI_TYPE_INTEGER) { + vdbg_printk(FUJLAPTOP_DBG_WARN, + "FUNC 0x%x (args 0x%x, 0x%x, 0x%x) did not " + "return an integer\n", + cmd, arg0, arg1, arg2); + return -ENODEV; + } + + vdbg_printk(FUJLAPTOP_DBG_TRACE, + "FUNC 0x%x (args 0x%x, 0x%x, 0x%x) returned 0x%x\n", + cmd, arg0, arg1, arg2, (int)out_obj.integer.value); + return out_obj.integer.value; +} + +#ifdef CONFIG_LEDS_CLASS +/* LED class callbacks */ + +static void logolamp_set(struct led_classdev *cdev, + enum led_brightness brightness) +{ + if (brightness >= LED_FULL) { + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_POWERON, FUNC_LED_ON); + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_ALWAYS, FUNC_LED_ON); + } else if (brightness >= LED_HALF) { + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_POWERON, FUNC_LED_ON); + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_ALWAYS, FUNC_LED_OFF); + } else { + call_fext_func(FUNC_LEDS, 0x1, LOGOLAMP_POWERON, FUNC_LED_OFF); + } +} + +static void kblamps_set(struct led_classdev *cdev, + enum led_brightness brightness) +{ + if (brightness >= LED_FULL) + call_fext_func(FUNC_LEDS, 0x1, KEYBOARD_LAMPS, FUNC_LED_ON); + else + call_fext_func(FUNC_LEDS, 0x1, KEYBOARD_LAMPS, FUNC_LED_OFF); +} + +static enum led_brightness logolamp_get(struct led_classdev *cdev) +{ + enum led_brightness brightness = LED_OFF; + int poweron, always; + + poweron = call_fext_func(FUNC_LEDS, 0x2, LOGOLAMP_POWERON, 0x0); + if (poweron == FUNC_LED_ON) { + brightness = LED_HALF; + always = call_fext_func(FUNC_LEDS, 0x2, LOGOLAMP_ALWAYS, 0x0); + if (always == FUNC_LED_ON) + brightness = LED_FULL; + } + return brightness; +} + +static enum led_brightness kblamps_get(struct led_classdev *cdev) +{ + enum led_brightness brightness = LED_OFF; + + if (call_fext_func(FUNC_LEDS, 0x2, KEYBOARD_LAMPS, 0x0) == FUNC_LED_ON) + brightness = LED_FULL; + + return brightness; +} +#endif + /* Hardware access for LCD brightness control */ static int set_lcd_level(int level) @@ -263,44 +412,34 @@ static int get_max_brightness(void) return fujitsu->max_brightness; } -static int get_lcd_level_alt(void) -{ - unsigned long long state = 0; - acpi_status status = AE_OK; - - vdbg_printk(FUJLAPTOP_DBG_TRACE, "get lcd level via GBLS\n"); - - status = - acpi_evaluate_integer(fujitsu->acpi_handle, "GBLS", NULL, &state); - if (status < 0) - return status; - - fujitsu->brightness_level = state & 0x0fffffff; - - if (state & 0x80000000) - fujitsu->brightness_changed = 1; - else - fujitsu->brightness_changed = 0; - - return fujitsu->brightness_level; -} - /* Backlight device stuff */ static int bl_get_brightness(struct backlight_device *b) { - if (use_alt_lcd_levels) - return get_lcd_level_alt(); - else - return get_lcd_level(); + return get_lcd_level(); } static int bl_update_status(struct backlight_device *b) { + int ret; + if (b->props.power == 4) + ret = call_fext_func(FUNC_BACKLIGHT, 0x1, 0x4, 0x3); + else + ret = call_fext_func(FUNC_BACKLIGHT, 
0x1, 0x4, 0x0); + if (ret != 0) + vdbg_printk(FUJLAPTOP_DBG_ERROR, + "Unable to adjust backlight power, error code %i\n", + ret); + if (use_alt_lcd_levels) - return set_lcd_level_alt(b->props.brightness); + ret = set_lcd_level_alt(b->props.brightness); else - return set_lcd_level(b->props.brightness); + ret = set_lcd_level(b->props.brightness); + if (ret != 0) + vdbg_printk(FUJLAPTOP_DBG_ERROR, + "Unable to adjust LCD brightness, error code %i\n", + ret); + return ret; } static struct backlight_ops fujitsubl_ops = { @@ -344,10 +483,7 @@ static ssize_t show_lcd_level(struct device *dev, int ret; - if (use_alt_lcd_levels) - ret = get_lcd_level_alt(); - else - ret = get_lcd_level(); + ret = get_lcd_level(); if (ret < 0) return ret; @@ -372,52 +508,71 @@ static ssize_t store_lcd_level(struct device *dev, if (ret < 0) return ret; - if (use_alt_lcd_levels) - ret = get_lcd_level_alt(); - else - ret = get_lcd_level(); + ret = get_lcd_level(); if (ret < 0) return ret; return count; } -/* Hardware access for hotkey device */ - -static int get_irb(void) +static ssize_t +ignore_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) { - unsigned long long state = 0; - acpi_status status = AE_OK; - - vdbg_printk(FUJLAPTOP_DBG_TRACE, "Get irb\n"); - - status = - acpi_evaluate_integer(fujitsu_hotkey->acpi_handle, "GIRB", NULL, - &state); - if (status < 0) - return status; + return count; +} - fujitsu_hotkey->irb = state; +static ssize_t +show_lid_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (fujitsu_hotkey->rfkill_state == UNSUPPORTED_CMD) + return sprintf(buf, "unknown\n"); + if (fujitsu_hotkey->rfkill_state & 0x100) + return sprintf(buf, "open\n"); + else + return sprintf(buf, "closed\n"); +} - return fujitsu_hotkey->irb; +static ssize_t +show_dock_state(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (fujitsu_hotkey->rfkill_state == UNSUPPORTED_CMD) + return sprintf(buf, "unknown\n"); + if (fujitsu_hotkey->rfkill_state & 0x200) + return sprintf(buf, "docked\n"); + else + return sprintf(buf, "undocked\n"); } static ssize_t -ignore_store(struct device *dev, - struct device_attribute *attr, const char *buf, size_t count) +show_radios_state(struct device *dev, + struct device_attribute *attr, char *buf) { - return count; + if (fujitsu_hotkey->rfkill_state == UNSUPPORTED_CMD) + return sprintf(buf, "unknown\n"); + if (fujitsu_hotkey->rfkill_state & 0x20) + return sprintf(buf, "on\n"); + else + return sprintf(buf, "killed\n"); } static DEVICE_ATTR(max_brightness, 0444, show_max_brightness, ignore_store); static DEVICE_ATTR(brightness_changed, 0444, show_brightness_changed, ignore_store); static DEVICE_ATTR(lcd_level, 0644, show_lcd_level, store_lcd_level); +static DEVICE_ATTR(lid, 0444, show_lid_state, ignore_store); +static DEVICE_ATTR(dock, 0444, show_dock_state, ignore_store); +static DEVICE_ATTR(radios, 0444, show_radios_state, ignore_store); static struct attribute *fujitsupf_attributes[] = { &dev_attr_brightness_changed.attr, &dev_attr_max_brightness.attr, &dev_attr_lcd_level.attr, + &dev_attr_lid.attr, + &dev_attr_dock.attr, + &dev_attr_radios.attr, NULL }; @@ -435,24 +590,16 @@ static struct platform_driver fujitsupf_driver = { static void dmi_check_cb_common(const struct dmi_system_id *id) { acpi_handle handle; - int have_blnf; printk(KERN_INFO "fujitsu-laptop: Identified laptop model '%s'.\n", id->ident); - have_blnf = ACPI_SUCCESS - (acpi_get_handle(NULL, "\\_SB.PCI0.GFX0.LCD.BLNF", &handle)); if 
(use_alt_lcd_levels == -1) { - vdbg_printk(FUJLAPTOP_DBG_TRACE, "auto-detecting usealt\n"); - use_alt_lcd_levels = 1; - } - if (disable_brightness_keys == -1) { - vdbg_printk(FUJLAPTOP_DBG_TRACE, - "auto-detecting disable_keys\n"); - disable_brightness_keys = have_blnf ? 1 : 0; - } - if (disable_brightness_adjust == -1) { - vdbg_printk(FUJLAPTOP_DBG_TRACE, - "auto-detecting disable_adjust\n"); - disable_brightness_adjust = have_blnf ? 0 : 1; + if (ACPI_SUCCESS(acpi_get_handle(NULL, + "\\_SB.PCI0.LPCB.FJEX.SBL2", &handle))) + use_alt_lcd_levels = 1; + else + use_alt_lcd_levels = 0; + vdbg_printk(FUJLAPTOP_DBG_TRACE, "auto-detected usealt as " + "%i\n", use_alt_lcd_levels); } } @@ -581,19 +728,14 @@ static int acpi_fujitsu_add(struct acpi_device *device) /* do config (detect defaults) */ use_alt_lcd_levels = use_alt_lcd_levels == 1 ? 1 : 0; - disable_brightness_keys = disable_brightness_keys == 1 ? 1 : 0; disable_brightness_adjust = disable_brightness_adjust == 1 ? 1 : 0; vdbg_printk(FUJLAPTOP_DBG_INFO, - "config: [alt interface: %d], [key disable: %d], [adjust disable: %d]\n", - use_alt_lcd_levels, disable_brightness_keys, - disable_brightness_adjust); + "config: [alt interface: %d], [adjust disable: %d]\n", + use_alt_lcd_levels, disable_brightness_adjust); if (get_max_brightness() <= 0) fujitsu->max_brightness = FUJITSU_LCD_N_LEVELS; - if (use_alt_lcd_levels) - get_lcd_level_alt(); - else - get_lcd_level(); + get_lcd_level(); return result; @@ -644,43 +786,23 @@ static void acpi_fujitsu_notify(acpi_handle handle, u32 event, void *data) case ACPI_FUJITSU_NOTIFY_CODE1: keycode = 0; oldb = fujitsu->brightness_level; - get_lcd_level(); /* the alt version always yields changed */ + get_lcd_level(); newb = fujitsu->brightness_level; vdbg_printk(FUJLAPTOP_DBG_TRACE, "brightness button event [%i -> %i (%i)]\n", oldb, newb, fujitsu->brightness_changed); - if (oldb == newb && fujitsu->brightness_changed) { - keycode = 0; - if (disable_brightness_keys != 1) { - if (oldb == 0) { - acpi_bus_generate_proc_event - (fujitsu->dev, - ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS, - 0); - keycode = KEY_BRIGHTNESSDOWN; - } else if (oldb == - (fujitsu->max_brightness) - 1) { - acpi_bus_generate_proc_event - (fujitsu->dev, - ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS, - 0); - keycode = KEY_BRIGHTNESSUP; - } - } - } else if (oldb < newb) { + if (oldb < newb) { if (disable_brightness_adjust != 1) { if (use_alt_lcd_levels) set_lcd_level_alt(newb); else set_lcd_level(newb); } - if (disable_brightness_keys != 1) { - acpi_bus_generate_proc_event(fujitsu->dev, - ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS, 0); - keycode = KEY_BRIGHTNESSUP; - } + acpi_bus_generate_proc_event(fujitsu->dev, + ACPI_VIDEO_NOTIFY_INC_BRIGHTNESS, 0); + keycode = KEY_BRIGHTNESSUP; } else if (oldb > newb) { if (disable_brightness_adjust != 1) { if (use_alt_lcd_levels) @@ -688,13 +810,9 @@ static void acpi_fujitsu_notify(acpi_handle handle, u32 event, void *data) else set_lcd_level(newb); } - if (disable_brightness_keys != 1) { - acpi_bus_generate_proc_event(fujitsu->dev, - ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS, 0); - keycode = KEY_BRIGHTNESSDOWN; - } - } else { - keycode = KEY_UNKNOWN; + acpi_bus_generate_proc_event(fujitsu->dev, + ACPI_VIDEO_NOTIFY_DEC_BRIGHTNESS, 0); + keycode = KEY_BRIGHTNESSDOWN; } break; default: @@ -771,7 +889,8 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) input->id.bustype = BUS_HOST; input->id.product = 0x06; input->dev.parent = &device->dev; - input->evbit[0] = BIT(EV_KEY); + + set_bit(EV_KEY, input->evbit); set_bit(fujitsu->keycode1, 
input->keybit); set_bit(fujitsu->keycode2, input->keybit); set_bit(fujitsu->keycode3, input->keybit); @@ -803,10 +922,44 @@ static int acpi_fujitsu_hotkey_add(struct acpi_device *device) printk(KERN_ERR "_INI Method failed\n"); } - i = 0; /* Discard hotkey ringbuffer */ - while (get_irb() != 0 && (i++) < MAX_HOTKEY_RINGBUFFER_SIZE) ; + i = 0; + while (call_fext_func(FUNC_BUTTONS, 0x1, 0x0, 0x0) != 0 + && (i++) < MAX_HOTKEY_RINGBUFFER_SIZE) + ; /* No action, result is discarded */ vdbg_printk(FUJLAPTOP_DBG_INFO, "Discarded %i ringbuffer entries\n", i); + fujitsu_hotkey->rfkill_state = + call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0); + + /* Suspect this is a keymap of the application panel, print it */ + printk(KERN_INFO "fujitsu-laptop: BTNI: [0x%x]\n", + call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0)); + + #ifdef CONFIG_LEDS_CLASS + if (call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & LOGOLAMP_POWERON) { + result = led_classdev_register(&fujitsu->pf_device->dev, + &logolamp_led); + if (result == 0) { + fujitsu_hotkey->logolamp_registered = 1; + } else { + printk(KERN_ERR "fujitsu-laptop: Could not register " + "LED handler for logo lamp, error %i\n", result); + } + } + + if ((call_fext_func(FUNC_LEDS, 0x0, 0x0, 0x0) & KEYBOARD_LAMPS) && + (call_fext_func(FUNC_BUTTONS, 0x0, 0x0, 0x0) == 0x0)) { + result = led_classdev_register(&fujitsu->pf_device->dev, + &kblamps_led); + if (result == 0) { + fujitsu_hotkey->kblamps_registered = 1; + } else { + printk(KERN_ERR "fujitsu-laptop: Could not register " + "LED handler for keyboard lamps, error %i\n", result); + } + } + #endif + return result; end: @@ -852,16 +1005,15 @@ static void acpi_fujitsu_hotkey_notify(acpi_handle handle, u32 event, input = fujitsu_hotkey->input; - vdbg_printk(FUJLAPTOP_DBG_TRACE, "Hotkey event\n"); + fujitsu_hotkey->rfkill_state = + call_fext_func(FUNC_RFKILL, 0x4, 0x0, 0x0); switch (event) { case ACPI_FUJITSU_NOTIFY_CODE1: i = 0; - while ((irb = get_irb()) != 0 - && (i++) < MAX_HOTKEY_RINGBUFFER_SIZE) { - vdbg_printk(FUJLAPTOP_DBG_TRACE, "GIRB result [%x]\n", - irb); - + while ((irb = + call_fext_func(FUNC_BUTTONS, 0x1, 0x0, 0x0)) != 0 + && (i++) < MAX_HOTKEY_RINGBUFFER_SIZE) { switch (irb & 0x4ff) { case KEY1_CODE: keycode = fujitsu->keycode1; @@ -1035,6 +1187,15 @@ static int __init fujitsu_init(void) goto fail_hotkey1; } + /* Sync backlight power status (needs FUJ02E3 device, hence deferred) */ + + if (!acpi_video_backlight_support()) { + if (call_fext_func(FUNC_BACKLIGHT, 0x2, 0x4, 0x0) == 3) + fujitsu->bl_device->props.power = 4; + else + fujitsu->bl_device->props.power = 0; + } + printk(KERN_INFO "fujitsu-laptop: driver " FUJITSU_DRIVER_VERSION " successfully loaded.\n"); @@ -1074,6 +1235,14 @@ fail_acpi: static void __exit fujitsu_cleanup(void) { + #ifdef CONFIG_LEDS_CLASS + if (fujitsu_hotkey->logolamp_registered != 0) + led_classdev_unregister(&logolamp_led); + + if (fujitsu_hotkey->kblamps_registered != 0) + led_classdev_unregister(&kblamps_led); + #endif + sysfs_remove_group(&fujitsu->pf_device->dev.kobj, &fujitsupf_attribute_group); platform_device_unregister(fujitsu->pf_device); @@ -1098,9 +1267,6 @@ module_exit(fujitsu_cleanup); module_param(use_alt_lcd_levels, uint, 0644); MODULE_PARM_DESC(use_alt_lcd_levels, "Use alternative interface for lcd_levels (needed for Lifebook s6410)."); -module_param(disable_brightness_keys, uint, 0644); -MODULE_PARM_DESC(disable_brightness_keys, - "Disable brightness keys (eg. 
if they are already handled by the generic ACPI_VIDEO device)."); module_param(disable_brightness_adjust, uint, 0644); MODULE_PARM_DESC(disable_brightness_adjust, "Disable brightness adjustment ."); #ifdef CONFIG_FUJITSU_LAPTOP_DEBUG @@ -1108,12 +1274,13 @@ module_param_named(debug, dbg_level, uint, 0644); MODULE_PARM_DESC(debug, "Sets debug level bit-mask"); #endif -MODULE_AUTHOR("Jonathan Woithe, Peter Gruber"); +MODULE_AUTHOR("Jonathan Woithe, Peter Gruber, Tony Vroon"); MODULE_DESCRIPTION("Fujitsu laptop extras support"); MODULE_VERSION(FUJITSU_DRIVER_VERSION); MODULE_LICENSE("GPL"); MODULE_ALIAS("dmi:*:svnFUJITSUSIEMENS:*:pvr:rvnFUJITSU:rnFJNB1D3:*:cvrS6410:*"); +MODULE_ALIAS("dmi:*:svnFUJITSUSIEMENS:*:pvr:rvnFUJITSU:rnFJNB1E6:*:cvrS6420:*"); MODULE_ALIAS("dmi:*:svnFUJITSU:*:pvr:rvnFUJITSU:rnFJNB19C:*:cvrS7020:*"); static struct pnp_device_id pnp_ids[] = { diff --git a/drivers/misc/hp-wmi.c b/drivers/platform/x86/hp-wmi.c index 4b7c24c519c..4b7c24c519c 100644 --- a/drivers/misc/hp-wmi.c +++ b/drivers/platform/x86/hp-wmi.c diff --git a/drivers/misc/intel_menlow.c b/drivers/platform/x86/intel_menlow.c index 27b7662955b..27b7662955b 100644 --- a/drivers/misc/intel_menlow.c +++ b/drivers/platform/x86/intel_menlow.c diff --git a/drivers/misc/msi-laptop.c b/drivers/platform/x86/msi-laptop.c index 759763d18e4..759763d18e4 100644 --- a/drivers/misc/msi-laptop.c +++ b/drivers/platform/x86/msi-laptop.c diff --git a/drivers/misc/panasonic-laptop.c b/drivers/platform/x86/panasonic-laptop.c index 4a1bc64485d..f30db367c82 100644 --- a/drivers/misc/panasonic-laptop.c +++ b/drivers/platform/x86/panasonic-laptop.c @@ -241,8 +241,6 @@ static int acpi_pcc_write_sset(struct pcc_acpi *pcc, int func, int val) }; acpi_status status = AE_OK; - ACPI_FUNCTION_TRACE("acpi_pcc_write_sset"); - status = acpi_evaluate_object(pcc->handle, METHOD_HKEY_SSET, ¶ms, NULL); @@ -254,8 +252,6 @@ static inline int acpi_pcc_get_sqty(struct acpi_device *device) unsigned long long s; acpi_status status; - ACPI_FUNCTION_TRACE("acpi_pcc_get_sqty"); - status = acpi_evaluate_integer(device->handle, METHOD_HKEY_SQTY, NULL, &s); if (ACPI_SUCCESS(status)) @@ -274,8 +270,6 @@ static int acpi_pcc_retrieve_biosdata(struct pcc_acpi *pcc, u32 *sinf) union acpi_object *hkey = NULL; int i; - ACPI_FUNCTION_TRACE("acpi_pcc_retrieve_biosdata"); - status = acpi_evaluate_object(pcc->handle, METHOD_HKEY_SINF, 0, &buffer); if (ACPI_FAILURE(status)) { @@ -501,8 +495,6 @@ static void acpi_pcc_generate_keyinput(struct pcc_acpi *pcc) int key_code, hkey_num; unsigned long long result; - ACPI_FUNCTION_TRACE("acpi_pcc_generate_keyinput"); - rc = acpi_evaluate_integer(pcc->handle, METHOD_HKEY_QUERY, NULL, &result); if (!ACPI_SUCCESS(rc)) { @@ -538,8 +530,6 @@ static void acpi_pcc_hotkey_notify(acpi_handle handle, u32 event, void *data) { struct pcc_acpi *pcc = (struct pcc_acpi *) data; - ACPI_FUNCTION_TRACE("acpi_pcc_hotkey_notify"); - switch (event) { case HKEY_NOTIFY: acpi_pcc_generate_keyinput(pcc); @@ -554,8 +544,6 @@ static int acpi_pcc_init_input(struct pcc_acpi *pcc) { int i, rc; - ACPI_FUNCTION_TRACE("acpi_pcc_init_input"); - pcc->input_dev = input_allocate_device(); if (!pcc->input_dev) { ACPI_DEBUG_PRINT((ACPI_DB_ERROR, @@ -597,8 +585,6 @@ static int acpi_pcc_hotkey_resume(struct acpi_device *device) struct pcc_acpi *pcc = acpi_driver_data(device); acpi_status status = AE_OK; - ACPI_FUNCTION_TRACE("acpi_pcc_hotkey_resume"); - if (device == NULL || pcc == NULL) return -EINVAL; @@ -616,8 +602,6 @@ static int acpi_pcc_hotkey_add(struct acpi_device 
*device) struct pcc_acpi *pcc; int num_sifr, result; - ACPI_FUNCTION_TRACE("acpi_pcc_hotkey_add"); - if (!device) return -EINVAL; @@ -714,8 +698,6 @@ static int __init acpi_pcc_init(void) { int result = 0; - ACPI_FUNCTION_TRACE("acpi_pcc_init"); - if (acpi_disabled) return -ENODEV; @@ -733,8 +715,6 @@ static int acpi_pcc_hotkey_remove(struct acpi_device *device, int type) { struct pcc_acpi *pcc = acpi_driver_data(device); - ACPI_FUNCTION_TRACE("acpi_pcc_hotkey_remove"); - if (!device || !pcc) return -EINVAL; @@ -757,8 +737,6 @@ static int acpi_pcc_hotkey_remove(struct acpi_device *device, int type) static void __exit acpi_pcc_exit(void) { - ACPI_FUNCTION_TRACE("acpi_pcc_exit"); - acpi_bus_unregister_driver(&acpi_pcc_driver); } diff --git a/drivers/misc/sony-laptop.c b/drivers/platform/x86/sony-laptop.c index 571b211608d..537959d0714 100644 --- a/drivers/misc/sony-laptop.c +++ b/drivers/platform/x86/sony-laptop.c @@ -935,14 +935,17 @@ static void sony_acpi_notify(acpi_handle handle, u32 event, void *data) static acpi_status sony_walk_callback(acpi_handle handle, u32 level, void *context, void **return_value) { - struct acpi_namespace_node *node; - union acpi_operand_object *operand; + struct acpi_device_info *info; + struct acpi_buffer buffer = {ACPI_ALLOCATE_BUFFER, NULL}; - node = (struct acpi_namespace_node *)handle; - operand = (union acpi_operand_object *)node->object; + if (ACPI_SUCCESS(acpi_get_object_info(handle, &buffer))) { + info = buffer.pointer; - printk(KERN_WARNING DRV_PFX "method: name: %4.4s, args %X\n", node->name.ascii, - (u32) operand->method.param_count); + printk(KERN_WARNING DRV_PFX "method: name: %4.4s, args %X\n", + (char *)&info->name, info->param_count); + + kfree(buffer.pointer); + } return AE_OK; } diff --git a/drivers/misc/tc1100-wmi.c b/drivers/platform/x86/tc1100-wmi.c index f25e4c974dc..b4a4aa9ee48 100644 --- a/drivers/misc/tc1100-wmi.c +++ b/drivers/platform/x86/tc1100-wmi.c @@ -30,7 +30,6 @@ #include <linux/init.h> #include <linux/types.h> #include <acpi/acpi.h> -#include <acpi/actypes.h> #include <acpi/acpi_bus.h> #include <acpi/acpi_drivers.h> #include <linux/platform_device.h> diff --git a/drivers/misc/thinkpad_acpi.c b/drivers/platform/x86/thinkpad_acpi.c index 899766e16fa..3478453eba7 100644 --- a/drivers/misc/thinkpad_acpi.c +++ b/drivers/platform/x86/thinkpad_acpi.c @@ -76,7 +76,6 @@ #include <linux/workqueue.h> #include <acpi/acpi_drivers.h> -#include <acpi/acnamesp.h> #include <linux/pci_ids.h> diff --git a/drivers/acpi/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c index 40e60fc2e59..40e60fc2e59 100644 --- a/drivers/acpi/toshiba_acpi.c +++ b/drivers/platform/x86/toshiba_acpi.c diff --git a/drivers/acpi/wmi.c b/drivers/platform/x86/wmi.c index 8a8b377712c..8a8b377712c 100644 --- a/drivers/acpi/wmi.c +++ b/drivers/platform/x86/wmi.c diff --git a/drivers/pnp/pnpacpi/core.c b/drivers/pnp/pnpacpi/core.c index 383e47c392a..2834846a185 100644 --- a/drivers/pnp/pnpacpi/core.c +++ b/drivers/pnp/pnpacpi/core.c @@ -23,7 +23,6 @@ #include <linux/pnp.h> #include <linux/mod_devicetable.h> #include <acpi/acpi_bus.h> -#include <acpi/actypes.h> #include "../base.h" #include "pnpacpi.h" diff --git a/drivers/regulator/wm8350-regulator.c b/drivers/regulator/wm8350-regulator.c index c68c496b2c4..7aa35248181 100644 --- a/drivers/regulator/wm8350-regulator.c +++ b/drivers/regulator/wm8350-regulator.c @@ -1412,6 +1412,97 @@ int wm8350_register_regulator(struct wm8350 *wm8350, int reg, } EXPORT_SYMBOL_GPL(wm8350_register_regulator); +/** + * wm8350_register_led - 
Register a WM8350 LED output + * + * @param wm8350 The WM8350 device to configure. + * @param lednum LED device index to create. + * @param dcdc The DCDC to use for the LED. + * @param isink The ISINK to use for the LED. + * @param pdata Configuration for the LED. + * + * The WM8350 supports the use of an ISINK together with a DCDC to + * provide a power-efficient LED driver. This function registers the + * regulators and instantiates the platform device for a LED. The + * operating modes for the LED regulators must be configured using + * wm8350_isink_set_flash(), wm8350_dcdc25_set_mode() and + * wm8350_dcdc_set_slot() prior to calling this function. + */ +int wm8350_register_led(struct wm8350 *wm8350, int lednum, int dcdc, int isink, + struct wm8350_led_platform_data *pdata) +{ + struct wm8350_led *led; + struct platform_device *pdev; + int ret; + + if (lednum > ARRAY_SIZE(wm8350->pmic.led) || lednum < 0) { + dev_err(wm8350->dev, "Invalid LED index %d\n", lednum); + return -ENODEV; + } + + led = &wm8350->pmic.led[lednum]; + + if (led->pdev) { + dev_err(wm8350->dev, "LED %d already allocated\n", lednum); + return -EINVAL; + } + + pdev = platform_device_alloc("wm8350-led", lednum); + if (pdev == NULL) { + dev_err(wm8350->dev, "Failed to allocate LED %d\n", lednum); + return -ENOMEM; + } + + led->isink_consumer.dev = &pdev->dev; + led->isink_consumer.supply = "led_isink"; + led->isink_init.num_consumer_supplies = 1; + led->isink_init.consumer_supplies = &led->isink_consumer; + led->isink_init.constraints.min_uA = 0; + led->isink_init.constraints.max_uA = pdata->max_uA; + led->isink_init.constraints.valid_ops_mask = REGULATOR_CHANGE_CURRENT; + led->isink_init.constraints.valid_modes_mask = REGULATOR_MODE_NORMAL; + ret = wm8350_register_regulator(wm8350, isink, &led->isink_init); + if (ret != 0) { + platform_device_put(pdev); + return ret; + } + + led->dcdc_consumer.dev = &pdev->dev; + led->dcdc_consumer.supply = "led_vcc"; + led->dcdc_init.num_consumer_supplies = 1; + led->dcdc_init.consumer_supplies = &led->dcdc_consumer; + led->dcdc_init.constraints.valid_modes_mask = REGULATOR_MODE_NORMAL; + ret = wm8350_register_regulator(wm8350, dcdc, &led->dcdc_init); + if (ret != 0) { + platform_device_put(pdev); + return ret; + } + + switch (isink) { + case WM8350_ISINK_A: + wm8350->pmic.isink_A_dcdc = dcdc; + break; + case WM8350_ISINK_B: + wm8350->pmic.isink_B_dcdc = dcdc; + break; + } + + pdev->dev.platform_data = pdata; + pdev->dev.parent = wm8350->dev; + ret = platform_device_add(pdev); + if (ret != 0) { + dev_err(wm8350->dev, "Failed to register LED %d: %d\n", + lednum, ret); + platform_device_put(pdev); + return ret; + } + + led->pdev = pdev; + + return 0; +} +EXPORT_SYMBOL_GPL(wm8350_register_led); + static struct platform_driver wm8350_regulator_driver = { .probe = wm8350_regulator_probe, .remove = wm8350_regulator_remove, diff --git a/drivers/rtc/rtc-parisc.c b/drivers/rtc/rtc-parisc.c index 346d633655e..c6bfa6fe1a2 100644 --- a/drivers/rtc/rtc-parisc.c +++ b/drivers/rtc/rtc-parisc.c @@ -34,7 +34,8 @@ static int parisc_get_time(struct device *dev, struct rtc_time *tm) static int parisc_set_time(struct device *dev, struct rtc_time *tm) { struct parisc_rtc *p = dev_get_drvdata(dev); - unsigned long flags, ret; + unsigned long flags; + int ret; spin_lock_irqsave(&p->lock, flags); ret = set_rtc_time(tm); diff --git a/drivers/video/backlight/Kconfig b/drivers/video/backlight/Kconfig index 4a4dd9adc32..72facb9eb7d 100644 --- a/drivers/video/backlight/Kconfig +++ b/drivers/video/backlight/Kconfig 
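A hedged example of how board-support code might consume the wm8350_register_led()
interface documented above. The wm8350_led_platform_data fields, the WM8350_DCDC_5
and WM8350_ISINK_A identifiers and all numeric values are illustrative assumptions,
and the prerequisite wm8350_isink_set_flash()/wm8350_dcdc25_set_mode()/
wm8350_dcdc_set_slot() configuration required by the kernel-doc is omitted:

	static struct wm8350_led_platform_data board_led_pdata = {
		.name		 = "board::status",	/* hypothetical LED name */
		.default_trigger = "heartbeat",
		.max_uA		 = 27899,		/* hypothetical current limit */
	};

	static int board_add_status_led(struct wm8350 *wm8350)
	{
		/* ISINK flash mode, DCDC mode and hardware slot are assumed
		 * to have been configured already, as the kernel-doc requires */
		return wm8350_register_led(wm8350, 0, WM8350_DCDC_5,
					   WM8350_ISINK_A, &board_led_pdata);
	}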
@@ -52,11 +52,11 @@ config LCD_ILI9320 then say y to include a power driver for it. config LCD_TDO24M - tristate "Toppoly TDO24M LCD Panels support" + tristate "Toppoly TDO24M and TDO35S LCD Panels support" depends on LCD_CLASS_DEVICE && SPI_MASTER default n help - If you have a Toppoly TDO24M series LCD panel, say y here to + If you have a Toppoly TDO24M/TDO35S series LCD panel, say y here to include the support for it. config LCD_VGG2432A4 @@ -123,17 +123,14 @@ config BACKLIGHT_ATMEL_PWM To compile this driver as a module, choose M here: the module will be called atmel-pwm-bl. -config BACKLIGHT_CORGI - tristate "Generic (aka Sharp Corgi) Backlight Driver (DEPRECATED)" +config BACKLIGHT_GENERIC + tristate "Generic (aka Sharp Corgi) Backlight Driver" depends on BACKLIGHT_CLASS_DEVICE - default n + default y help Say y to enable the generic platform backlight driver previously known as the Corgi backlight driver. If you have a Sharp Zaurus - SL-C7xx, SL-Cxx00 or SL-6000x say y. Most users can say n. - - Note: this driver is marked as deprecated, try enable SPI and - use the new corgi_lcd driver with integrated backlight control + SL-C7xx, SL-Cxx00 or SL-6000x say y. config BACKLIGHT_LOCOMO tristate "Sharp LOCOMO LCD/Backlight Driver" diff --git a/drivers/video/backlight/Makefile b/drivers/video/backlight/Makefile index 103427de670..363b3cb2f01 100644 --- a/drivers/video/backlight/Makefile +++ b/drivers/video/backlight/Makefile @@ -11,7 +11,7 @@ obj-$(CONFIG_LCD_TOSA) += tosa_lcd.o obj-$(CONFIG_BACKLIGHT_CLASS_DEVICE) += backlight.o obj-$(CONFIG_BACKLIGHT_ATMEL_PWM) += atmel-pwm-bl.o -obj-$(CONFIG_BACKLIGHT_CORGI) += corgi_bl.o +obj-$(CONFIG_BACKLIGHT_GENERIC) += generic_bl.o obj-$(CONFIG_BACKLIGHT_HP680) += hp680_bl.o obj-$(CONFIG_BACKLIGHT_LOCOMO) += locomolcd.o obj-$(CONFIG_BACKLIGHT_OMAP1) += omap1_bl.o diff --git a/drivers/video/backlight/backlight.c b/drivers/video/backlight/backlight.c index 0664fc03223..157057c79ca 100644 --- a/drivers/video/backlight/backlight.c +++ b/drivers/video/backlight/backlight.c @@ -40,6 +40,10 @@ static int fb_notifier_callback(struct notifier_block *self, if (!bd->ops->check_fb || bd->ops->check_fb(evdata->info)) { bd->props.fb_blank = *(int *)evdata->data; + if (bd->props.fb_blank == FB_BLANK_UNBLANK) + bd->props.state &= ~BL_CORE_FBBLANK; + else + bd->props.state |= BL_CORE_FBBLANK; backlight_update_status(bd); } mutex_unlock(&bd->ops_lock); @@ -80,20 +84,18 @@ static ssize_t backlight_show_power(struct device *dev, static ssize_t backlight_store_power(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int rc = -ENXIO; - char *endp; + int rc; struct backlight_device *bd = to_backlight_device(dev); - int power = simple_strtoul(buf, &endp, 0); - size_t size = endp - buf; + unsigned long power; - if (*endp && isspace(*endp)) - size++; - if (size != count) - return -EINVAL; + rc = strict_strtoul(buf, 0, &power); + if (rc) + return rc; + rc = -ENXIO; mutex_lock(&bd->ops_lock); if (bd->ops) { - pr_debug("backlight: set power to %d\n", power); + pr_debug("backlight: set power to %lu\n", power); if (bd->props.power != power) { bd->props.power = power; backlight_update_status(bd); @@ -116,28 +118,25 @@ static ssize_t backlight_show_brightness(struct device *dev, static ssize_t backlight_store_brightness(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - int rc = -ENXIO; - char *endp; + int rc; struct backlight_device *bd = to_backlight_device(dev); - int brightness = simple_strtoul(buf, &endp, 
0); - size_t size = endp - buf; + unsigned long brightness; + + rc = strict_strtoul(buf, 0, &brightness); + if (rc) + return rc; - if (*endp && isspace(*endp)) - size++; - if (size != count) - return -EINVAL; + rc = -ENXIO; mutex_lock(&bd->ops_lock); if (bd->ops) { if (brightness > bd->props.max_brightness) rc = -EINVAL; else { - pr_debug("backlight: set brightness to %d\n", + pr_debug("backlight: set brightness to %lu\n", brightness); - if (bd->props.brightness != brightness) { - bd->props.brightness = brightness; - backlight_update_status(bd); - } + bd->props.brightness = brightness; + backlight_update_status(bd); rc = count; } } @@ -170,6 +169,34 @@ static ssize_t backlight_show_actual_brightness(struct device *dev, static struct class *backlight_class; +static int backlight_suspend(struct device *dev, pm_message_t state) +{ + struct backlight_device *bd = to_backlight_device(dev); + + if (bd->ops->options & BL_CORE_SUSPENDRESUME) { + mutex_lock(&bd->ops_lock); + bd->props.state |= BL_CORE_SUSPENDED; + backlight_update_status(bd); + mutex_unlock(&bd->ops_lock); + } + + return 0; +} + +static int backlight_resume(struct device *dev) +{ + struct backlight_device *bd = to_backlight_device(dev); + + if (bd->ops->options & BL_CORE_SUSPENDRESUME) { + mutex_lock(&bd->ops_lock); + bd->props.state &= ~BL_CORE_SUSPENDED; + backlight_update_status(bd); + mutex_unlock(&bd->ops_lock); + } + + return 0; +} + static void bl_device_release(struct device *dev) { struct backlight_device *bd = to_backlight_device(dev); @@ -286,6 +313,8 @@ static int __init backlight_class_init(void) } backlight_class->dev_attrs = bl_device_attributes; + backlight_class->suspend = backlight_suspend; + backlight_class->resume = backlight_resume; return 0; } diff --git a/drivers/video/backlight/corgi_bl.c b/drivers/video/backlight/corgi_bl.c deleted file mode 100644 index 4d4d037e3ec..00000000000 --- a/drivers/video/backlight/corgi_bl.c +++ /dev/null @@ -1,169 +0,0 @@ -/* - * Backlight Driver for Sharp Zaurus Handhelds (various models) - * - * Copyright (c) 2004-2006 Richard Purdie - * - * Based on Sharp's 2.4 Backlight Driver - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 as - * published by the Free Software Foundation. 
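The BL_CORE_* state bits and the BL_CORE_SUSPENDRESUME option introduced above move
fb-blank and suspend bookkeeping into the backlight core; the generic_bl driver added
later in this patch is the in-tree user. As a condensed sketch (the hardware hook is
hypothetical), a driver now only needs to honour props.state in its update_status
callback and opt in to core-driven suspend/resume:

	static int mydrv_update_status(struct backlight_device *bd)
	{
		int brightness = bd->props.brightness;

		/* the core sets these bits on fb blank and on class suspend */
		if (bd->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK))
			brightness = 0;

		return mydrv_set_hw_brightness(brightness);	/* hypothetical */
	}

	static struct backlight_ops mydrv_ops = {
		.options	= BL_CORE_SUSPENDRESUME, /* core drives suspend/resume */
		.update_status	= mydrv_update_status,
	};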
- * - */ - -#include <linux/module.h> -#include <linux/kernel.h> -#include <linux/init.h> -#include <linux/platform_device.h> -#include <linux/mutex.h> -#include <linux/fb.h> -#include <linux/backlight.h> - -static int corgibl_intensity; -static struct backlight_properties corgibl_data; -static struct backlight_device *corgi_backlight_device; -static struct generic_bl_info *bl_machinfo; - -static unsigned long corgibl_flags; -#define CORGIBL_SUSPENDED 0x01 -#define CORGIBL_BATTLOW 0x02 - -static int corgibl_send_intensity(struct backlight_device *bd) -{ - int intensity = bd->props.brightness; - - if (bd->props.power != FB_BLANK_UNBLANK) - intensity = 0; - if (bd->props.fb_blank != FB_BLANK_UNBLANK) - intensity = 0; - if (corgibl_flags & CORGIBL_SUSPENDED) - intensity = 0; - if (corgibl_flags & CORGIBL_BATTLOW) - intensity &= bl_machinfo->limit_mask; - - bl_machinfo->set_bl_intensity(intensity); - - corgibl_intensity = intensity; - - if (bl_machinfo->kick_battery) - bl_machinfo->kick_battery(); - - return 0; -} - -#ifdef CONFIG_PM -static int corgibl_suspend(struct platform_device *pdev, pm_message_t state) -{ - struct backlight_device *bd = platform_get_drvdata(pdev); - - corgibl_flags |= CORGIBL_SUSPENDED; - backlight_update_status(bd); - return 0; -} - -static int corgibl_resume(struct platform_device *pdev) -{ - struct backlight_device *bd = platform_get_drvdata(pdev); - - corgibl_flags &= ~CORGIBL_SUSPENDED; - backlight_update_status(bd); - return 0; -} -#else -#define corgibl_suspend NULL -#define corgibl_resume NULL -#endif - -static int corgibl_get_intensity(struct backlight_device *bd) -{ - return corgibl_intensity; -} - -/* - * Called when the battery is low to limit the backlight intensity. - * If limit==0 clear any limit, otherwise limit the intensity - */ -void corgibl_limit_intensity(int limit) -{ - if (limit) - corgibl_flags |= CORGIBL_BATTLOW; - else - corgibl_flags &= ~CORGIBL_BATTLOW; - backlight_update_status(corgi_backlight_device); -} -EXPORT_SYMBOL(corgibl_limit_intensity); - - -static struct backlight_ops corgibl_ops = { - .get_brightness = corgibl_get_intensity, - .update_status = corgibl_send_intensity, -}; - -static int corgibl_probe(struct platform_device *pdev) -{ - struct generic_bl_info *machinfo = pdev->dev.platform_data; - const char *name = "generic-bl"; - - bl_machinfo = machinfo; - if (!machinfo->limit_mask) - machinfo->limit_mask = -1; - - if (machinfo->name) - name = machinfo->name; - - corgi_backlight_device = backlight_device_register (name, - &pdev->dev, NULL, &corgibl_ops); - if (IS_ERR (corgi_backlight_device)) - return PTR_ERR (corgi_backlight_device); - - platform_set_drvdata(pdev, corgi_backlight_device); - - corgi_backlight_device->props.max_brightness = machinfo->max_intensity; - corgi_backlight_device->props.power = FB_BLANK_UNBLANK; - corgi_backlight_device->props.brightness = machinfo->default_intensity; - backlight_update_status(corgi_backlight_device); - - printk("Corgi Backlight Driver Initialized.\n"); - return 0; -} - -static int corgibl_remove(struct platform_device *pdev) -{ - struct backlight_device *bd = platform_get_drvdata(pdev); - - corgibl_data.power = 0; - corgibl_data.brightness = 0; - backlight_update_status(bd); - - backlight_device_unregister(bd); - - printk("Corgi Backlight Driver Unloaded\n"); - return 0; -} - -static struct platform_driver corgibl_driver = { - .probe = corgibl_probe, - .remove = corgibl_remove, - .suspend = corgibl_suspend, - .resume = corgibl_resume, - .driver = { - .name = "generic-bl", - }, -}; - 
-static int __init corgibl_init(void) -{ - return platform_driver_register(&corgibl_driver); -} - -static void __exit corgibl_exit(void) -{ - platform_driver_unregister(&corgibl_driver); -} - -module_init(corgibl_init); -module_exit(corgibl_exit); - -MODULE_AUTHOR("Richard Purdie <rpurdie@rpsys.net>"); -MODULE_DESCRIPTION("Corgi Backlight Driver"); -MODULE_LICENSE("GPL"); diff --git a/drivers/video/backlight/cr_bllcd.c b/drivers/video/backlight/cr_bllcd.c index 26add889860..b9fe62b475c 100644 --- a/drivers/video/backlight/cr_bllcd.c +++ b/drivers/video/backlight/cr_bllcd.c @@ -259,22 +259,18 @@ static int __init cr_backlight_init(void) { int ret = platform_driver_register(&cr_backlight_driver); - if (!ret) { - crp = platform_device_alloc("cr_backlight", -1); - if (!crp) - return -ENOMEM; + if (ret) + return ret; - ret = platform_device_add(crp); - - if (ret) { - platform_device_put(crp); - platform_driver_unregister(&cr_backlight_driver); - } + crp = platform_device_register_simple("cr_backlight", -1, NULL, 0); + if (IS_ERR(crp)) { + platform_driver_unregister(&cr_backlight_driver); + return PTR_ERR(crp); } printk("Carillo Ranch Backlight Driver Initialized.\n"); - return ret; + return 0; } static void __exit cr_backlight_exit(void) diff --git a/drivers/video/backlight/generic_bl.c b/drivers/video/backlight/generic_bl.c new file mode 100644 index 00000000000..6d27f62fdcd --- /dev/null +++ b/drivers/video/backlight/generic_bl.c @@ -0,0 +1,147 @@ +/* + * Generic Backlight Driver + * + * Copyright (c) 2004-2008 Richard Purdie + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include <linux/module.h> +#include <linux/kernel.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/mutex.h> +#include <linux/fb.h> +#include <linux/backlight.h> + +static int genericbl_intensity; +static struct backlight_device *generic_backlight_device; +static struct generic_bl_info *bl_machinfo; + +/* Flag to signal when the battery is low */ +#define GENERICBL_BATTLOW BL_CORE_DRIVER1 + +static int genericbl_send_intensity(struct backlight_device *bd) +{ + int intensity = bd->props.brightness; + + if (bd->props.power != FB_BLANK_UNBLANK) + intensity = 0; + if (bd->props.state & BL_CORE_FBBLANK) + intensity = 0; + if (bd->props.state & BL_CORE_SUSPENDED) + intensity = 0; + if (bd->props.state & GENERICBL_BATTLOW) + intensity &= bl_machinfo->limit_mask; + + bl_machinfo->set_bl_intensity(intensity); + + genericbl_intensity = intensity; + + if (bl_machinfo->kick_battery) + bl_machinfo->kick_battery(); + + return 0; +} + +static int genericbl_get_intensity(struct backlight_device *bd) +{ + return genericbl_intensity; +} + +/* + * Called when the battery is low to limit the backlight intensity. 
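The cr_bllcd init conversion just above (and the matching hp680_bl and progear_bl conversions later in this patch) replaces the open-coded platform_device_alloc()/platform_device_add() pair with platform_device_register_simple(), collapsing the error handling into a single IS_ERR() check. A minimal sketch of the resulting idiom, with the example_* names standing in for a real driver:

#include <linux/platform_device.h>
#include <linux/err.h>

static struct platform_driver example_driver;	/* .probe/.remove filled in elsewhere */
static struct platform_device *example_device;

static int __init example_init(void)
{
	int ret = platform_driver_register(&example_driver);

	if (ret)
		return ret;

	example_device = platform_device_register_simple("example", -1, NULL, 0);
	if (IS_ERR(example_device)) {
		/* unwind the driver registration before reporting the error */
		platform_driver_unregister(&example_driver);
		return PTR_ERR(example_device);
	}
	return 0;
}

The unregister-before-return ordering is the same in all three converted drivers.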
+ * If limit==0 clear any limit, otherwise limit the intensity + */ +void corgibl_limit_intensity(int limit) +{ + struct backlight_device *bd = generic_backlight_device; + + mutex_lock(&bd->ops_lock); + if (limit) + bd->props.state |= GENERICBL_BATTLOW; + else + bd->props.state &= ~GENERICBL_BATTLOW; + backlight_update_status(generic_backlight_device); + mutex_unlock(&bd->ops_lock); +} +EXPORT_SYMBOL(corgibl_limit_intensity); + +static struct backlight_ops genericbl_ops = { + .options = BL_CORE_SUSPENDRESUME, + .get_brightness = genericbl_get_intensity, + .update_status = genericbl_send_intensity, +}; + +static int genericbl_probe(struct platform_device *pdev) +{ + struct generic_bl_info *machinfo = pdev->dev.platform_data; + const char *name = "generic-bl"; + struct backlight_device *bd; + + bl_machinfo = machinfo; + if (!machinfo->limit_mask) + machinfo->limit_mask = -1; + + if (machinfo->name) + name = machinfo->name; + + bd = backlight_device_register (name, + &pdev->dev, NULL, &genericbl_ops); + if (IS_ERR (bd)) + return PTR_ERR (bd); + + platform_set_drvdata(pdev, bd); + + bd->props.max_brightness = machinfo->max_intensity; + bd->props.power = FB_BLANK_UNBLANK; + bd->props.brightness = machinfo->default_intensity; + backlight_update_status(bd); + + generic_backlight_device = bd; + + printk("Generic Backlight Driver Initialized.\n"); + return 0; +} + +static int genericbl_remove(struct platform_device *pdev) +{ + struct backlight_device *bd = platform_get_drvdata(pdev); + + bd->props.power = 0; + bd->props.brightness = 0; + backlight_update_status(bd); + + backlight_device_unregister(bd); + + printk("Generic Backlight Driver Unloaded\n"); + return 0; +} + +static struct platform_driver genericbl_driver = { + .probe = genericbl_probe, + .remove = genericbl_remove, + .driver = { + .name = "generic-bl", + }, +}; + +static int __init genericbl_init(void) +{ + return platform_driver_register(&genericbl_driver); +} + +static void __exit genericbl_exit(void) +{ + platform_driver_unregister(&genericbl_driver); +} + +module_init(genericbl_init); +module_exit(genericbl_exit); + +MODULE_AUTHOR("Richard Purdie <rpurdie@rpsys.net>"); +MODULE_DESCRIPTION("Generic Backlight Driver"); +MODULE_LICENSE("GPL"); diff --git a/drivers/video/backlight/hp680_bl.c b/drivers/video/backlight/hp680_bl.c index d4cfed0b26d..5be55a20d8c 100644 --- a/drivers/video/backlight/hp680_bl.c +++ b/drivers/video/backlight/hp680_bl.c @@ -151,19 +151,15 @@ static int __init hp680bl_init(void) int ret; ret = platform_driver_register(&hp680bl_driver); - if (!ret) { - hp680bl_device = platform_device_alloc("hp680-bl", -1); - if (!hp680bl_device) - return -ENOMEM; - - ret = platform_device_add(hp680bl_device); - - if (ret) { - platform_device_put(hp680bl_device); - platform_driver_unregister(&hp680bl_driver); - } + if (ret) + return ret; + hp680bl_device = platform_device_register_simple("hp680-bl", -1, + NULL, 0); + if (IS_ERR(hp680bl_device)) { + platform_driver_unregister(&hp680bl_driver); + return PTR_ERR(hp680bl_device); } - return ret; + return 0; } static void __exit hp680bl_exit(void) diff --git a/drivers/video/backlight/mbp_nvidia_bl.c b/drivers/video/backlight/mbp_nvidia_bl.c index 06964af761c..65864c50045 100644 --- a/drivers/video/backlight/mbp_nvidia_bl.c +++ b/drivers/video/backlight/mbp_nvidia_bl.c @@ -70,6 +70,7 @@ static int mbp_get_intensity(struct backlight_device *bd) } static struct backlight_ops mbp_ops = { + .options = BL_CORE_SUSPENDRESUME, .get_brightness = mbp_get_intensity, .update_status = 
mbp_send_intensity, }; diff --git a/drivers/video/backlight/progear_bl.c b/drivers/video/backlight/progear_bl.c index 15fb4d58b5b..9edaf24fd82 100644 --- a/drivers/video/backlight/progear_bl.c +++ b/drivers/video/backlight/progear_bl.c @@ -119,20 +119,16 @@ static int __init progearbl_init(void) { int ret = platform_driver_register(&progearbl_driver); - if (!ret) { - progearbl_device = platform_device_alloc("progear-bl", -1); - if (!progearbl_device) - return -ENOMEM; - - ret = platform_device_add(progearbl_device); - - if (ret) { - platform_device_put(progearbl_device); - platform_driver_unregister(&progearbl_driver); - } + if (ret) + return ret; + progearbl_device = platform_device_register_simple("progear-bl", -1, + NULL, 0); + if (IS_ERR(progearbl_device)) { + platform_driver_unregister(&progearbl_driver); + return PTR_ERR(progearbl_device); } - return ret; + return 0; } static void __exit progearbl_exit(void) diff --git a/drivers/video/backlight/tdo24m.c b/drivers/video/backlight/tdo24m.c index 8427669162e..1dae7f8f3c6 100644 --- a/drivers/video/backlight/tdo24m.c +++ b/drivers/video/backlight/tdo24m.c @@ -14,6 +14,7 @@ #include <linux/init.h> #include <linux/device.h> #include <linux/spi/spi.h> +#include <linux/spi/tdo24m.h> #include <linux/fb.h> #include <linux/lcd.h> @@ -31,6 +32,9 @@ struct tdo24m { struct spi_transfer xfer; uint8_t *buf; + int (*adj_mode)(struct tdo24m *lcd, int mode); + int color_invert; + int power; int mode; }; @@ -66,7 +70,7 @@ static uint32_t lcd_panel_off[] = { CMD_NULL, }; -static uint32_t lcd_vga_pass_through[] = { +static uint32_t lcd_vga_pass_through_tdo24m[] = { CMD1(0xB0, 0x16), CMD1(0xBC, 0x80), CMD1(0xE1, 0x00), @@ -75,7 +79,7 @@ static uint32_t lcd_vga_pass_through[] = { CMD_NULL, }; -static uint32_t lcd_qvga_pass_through[] = { +static uint32_t lcd_qvga_pass_through_tdo24m[] = { CMD1(0xB0, 0x16), CMD1(0xBC, 0x81), CMD1(0xE1, 0x00), @@ -84,7 +88,7 @@ static uint32_t lcd_qvga_pass_through[] = { CMD_NULL, }; -static uint32_t lcd_vga_transfer[] = { +static uint32_t lcd_vga_transfer_tdo24m[] = { CMD1(0xcf, 0x02), /* Blanking period control (1) */ CMD2(0xd0, 0x08, 0x04), /* Blanking period control (2) */ CMD1(0xd1, 0x01), /* CKV timing control on/off */ @@ -110,6 +114,35 @@ static uint32_t lcd_qvga_transfer[] = { CMD_NULL, }; +static uint32_t lcd_vga_pass_through_tdo35s[] = { + CMD1(0xB0, 0x16), + CMD1(0xBC, 0x80), + CMD1(0xE1, 0x00), + CMD1(0x3B, 0x00), + CMD_NULL, +}; + +static uint32_t lcd_qvga_pass_through_tdo35s[] = { + CMD1(0xB0, 0x16), + CMD1(0xBC, 0x81), + CMD1(0xE1, 0x00), + CMD1(0x3B, 0x22), + CMD_NULL, +}; + +static uint32_t lcd_vga_transfer_tdo35s[] = { + CMD1(0xcf, 0x02), /* Blanking period control (1) */ + CMD2(0xd0, 0x08, 0x04), /* Blanking period control (2) */ + CMD1(0xd1, 0x01), /* CKV timing control on/off */ + CMD2(0xd2, 0x00, 0x1e), /* CKV 1,2 timing control */ + CMD2(0xd3, 0x14, 0x28), /* OEV timing control */ + CMD2(0xd4, 0x28, 0x64), /* ASW timing control (1) */ + CMD1(0xd5, 0x28), /* ASW timing control (2) */ + CMD0(0x21), /* Invert for normally black display */ + CMD0(0x29), /* Display on */ + CMD_NULL, +}; + static uint32_t lcd_panel_config[] = { CMD2(0xb8, 0xff, 0xf9), /* Output control */ CMD0(0x11), /* sleep out */ @@ -148,6 +181,8 @@ static int tdo24m_writes(struct tdo24m *lcd, uint32_t *array) int nparams, err = 0; for (; *p != CMD_NULL; p++) { + if (!lcd->color_invert && *p == CMD0(0x21)) + continue; nparams = (*p >> 30) & 0x3; @@ -184,12 +219,33 @@ static int tdo24m_adj_mode(struct tdo24m *lcd, int mode) { switch (mode) 
{ case MODE_VGA: - tdo24m_writes(lcd, lcd_vga_pass_through); + tdo24m_writes(lcd, lcd_vga_pass_through_tdo24m); tdo24m_writes(lcd, lcd_panel_config); - tdo24m_writes(lcd, lcd_vga_transfer); + tdo24m_writes(lcd, lcd_vga_transfer_tdo24m); break; case MODE_QVGA: - tdo24m_writes(lcd, lcd_qvga_pass_through); + tdo24m_writes(lcd, lcd_qvga_pass_through_tdo24m); + tdo24m_writes(lcd, lcd_panel_config); + tdo24m_writes(lcd, lcd_qvga_transfer); + break; + default: + return -EINVAL; + } + + lcd->mode = mode; + return 0; +} + +static int tdo35s_adj_mode(struct tdo24m *lcd, int mode) +{ + switch (mode) { + case MODE_VGA: + tdo24m_writes(lcd, lcd_vga_pass_through_tdo35s); + tdo24m_writes(lcd, lcd_panel_config); + tdo24m_writes(lcd, lcd_vga_transfer_tdo35s); + break; + case MODE_QVGA: + tdo24m_writes(lcd, lcd_qvga_pass_through_tdo35s); tdo24m_writes(lcd, lcd_panel_config); tdo24m_writes(lcd, lcd_qvga_transfer); break; @@ -213,7 +269,7 @@ static int tdo24m_power_on(struct tdo24m *lcd) if (err) goto out; - err = tdo24m_adj_mode(lcd, lcd->mode); + err = lcd->adj_mode(lcd, lcd->mode); out: return err; } @@ -262,7 +318,7 @@ static int tdo24m_set_mode(struct lcd_device *ld, struct fb_videomode *m) if (lcd->mode == mode) return 0; - return tdo24m_adj_mode(lcd, mode); + return lcd->adj_mode(lcd, mode); } static struct lcd_ops tdo24m_ops = { @@ -276,8 +332,16 @@ static int __devinit tdo24m_probe(struct spi_device *spi) struct tdo24m *lcd; struct spi_message *m; struct spi_transfer *x; + struct tdo24m_platform_data *pdata; + enum tdo24m_model model; int err; + pdata = spi->dev.platform_data; + if (pdata) + model = pdata->model; + else + model = TDO24M; + spi->bits_per_word = 8; spi->mode = SPI_MODE_3; err = spi_setup(spi); @@ -306,6 +370,20 @@ static int __devinit tdo24m_probe(struct spi_device *spi) x->tx_buf = &lcd->buf[0]; spi_message_add_tail(x, m); + switch (model) { + case TDO24M: + lcd->color_invert = 1; + lcd->adj_mode = tdo24m_adj_mode; + break; + case TDO35S: + lcd->adj_mode = tdo35s_adj_mode; + lcd->color_invert = 0; + break; + default: + dev_err(&spi->dev, "Unsupported model"); + goto out_free; + } + lcd->lcd_dev = lcd_device_register("tdo24m", &spi->dev, lcd, &tdo24m_ops); if (IS_ERR(lcd->lcd_dev)) { diff --git a/drivers/video/backlight/tosa_lcd.c b/drivers/video/backlight/tosa_lcd.c index 57a26649f1a..b7fbc75a62f 100644 --- a/drivers/video/backlight/tosa_lcd.c +++ b/drivers/video/backlight/tosa_lcd.c @@ -39,6 +39,7 @@ struct tosa_lcd_data { struct i2c_client *i2c; int lcd_power; + bool is_vga; }; static int tosa_tg_send(struct spi_device *spi, int adrs, uint8_t data) @@ -81,8 +82,12 @@ static void tosa_lcd_tg_init(struct tosa_lcd_data *data) static void tosa_lcd_tg_on(struct tosa_lcd_data *data) { struct spi_device *spi = data->spi; - const int value = TG_REG0_COLOR | TG_REG0_UD | TG_REG0_LR; - tosa_tg_send(spi, TG_PNLCTL, value | TG_REG0_VQV); /* this depends on mode */ + int value = TG_REG0_COLOR | TG_REG0_UD | TG_REG0_LR; + + if (data->is_vga) + value |= TG_REG0_VQV; + + tosa_tg_send(spi, TG_PNLCTL, value); /* TG LCD pannel power up */ tosa_tg_send(spi, TG_PINICTL,0x4); @@ -142,9 +147,25 @@ static int tosa_lcd_get_power(struct lcd_device *lcd) return data->lcd_power; } +static int tosa_lcd_set_mode(struct lcd_device *lcd, struct fb_videomode *mode) +{ + struct tosa_lcd_data *data = lcd_get_data(lcd); + + if (mode->xres == 320 || mode->yres == 320) + data->is_vga = false; + else + data->is_vga = true; + + if (POWER_IS_ON(data->lcd_power)) + tosa_lcd_tg_on(data); + + return 0; +} + static struct 
lcd_ops tosa_lcd_ops = { .set_power = tosa_lcd_set_power, .get_power = tosa_lcd_get_power, + .set_mode = tosa_lcd_set_mode, }; static int __devinit tosa_lcd_probe(struct spi_device *spi) @@ -156,6 +177,8 @@ static int __devinit tosa_lcd_probe(struct spi_device *spi) if (!data) return -ENOMEM; + data->is_vga = true; /* defaut to VGA mode */ + /* * bits_per_word cannot be configured in platform data */ diff --git a/drivers/video/backlight/vgg2432a4.c b/drivers/video/backlight/vgg2432a4.c index 593c7687d54..8e653b8a6f1 100644 --- a/drivers/video/backlight/vgg2432a4.c +++ b/drivers/video/backlight/vgg2432a4.c @@ -137,7 +137,7 @@ static int vgg2432a4_lcd_init(struct ili9320 *lcd, ili9320_write(lcd, ILI9320_RGB_IF1, cfg->rgb_if1); ili9320_write(lcd, ILI9320_FRAMEMAKER, 0x0); - ili9320_write(lcd, ILI9320_RGB_IF2, ILI9320_RGBIF2_DPL); + ili9320_write(lcd, ILI9320_RGB_IF2, cfg->rgb_if2); ret = ili9320_write_regs(lcd, vgg_init1, ARRAY_SIZE(vgg_init1)); if (ret != 0) diff --git a/fs/Kconfig b/fs/Kconfig index 32883589ee5..02cff86af1b 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -269,6 +269,25 @@ config OCFS2_FS_POSIX_ACL Posix Access Control Lists (ACLs) support permissions for users and groups beyond the owner/group/world scheme. +config BTRFS_FS + tristate "Btrfs filesystem (EXPERIMENTAL) Unstable disk format" + depends on EXPERIMENTAL + select LIBCRC32C + select ZLIB_INFLATE + select ZLIB_DEFLATE + help + Btrfs is a new filesystem with extents, writable snapshotting, + support for multiple devices and many more features. + + Btrfs is highly experimental, and THE DISK FORMAT IS NOT YET + FINALIZED. You should say N here unless you are interested in + testing Btrfs with non-critical data. + + To compile this file system support as a module, choose M here. The + module will be called btrfs. + + If unsure, say N. + endif # BLOCK source "fs/notify/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index c830611550d..bc4e14df108 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -119,4 +119,5 @@ obj-$(CONFIG_HOSTFS) += hostfs/ obj-$(CONFIG_HPPFS) += hppfs/ obj-$(CONFIG_DEBUG_FS) += debugfs/ obj-$(CONFIG_OCFS2_FS) += ocfs2/ +obj-$(CONFIG_BTRFS_FS) += btrfs/ obj-$(CONFIG_GFS2_FS) += gfs2/ diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile new file mode 100644 index 00000000000..d2cf5a54a4b --- /dev/null +++ b/fs/btrfs/Makefile @@ -0,0 +1,25 @@ +ifneq ($(KERNELRELEASE),) +# kbuild part of makefile + +obj-$(CONFIG_BTRFS_FS) := btrfs.o +btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ + file-item.o inode-item.o inode-map.o disk-io.o \ + transaction.o inode.o file.o tree-defrag.o \ + extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \ + extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \ + ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \ + compression.o +else + +# Normal Makefile + +KERNELDIR := /lib/modules/`uname -r`/build +all: + $(MAKE) -C $(KERNELDIR) M=`pwd` CONFIG_BTRFS_FS=m modules + +modules_install: + $(MAKE) -C $(KERNELDIR) M=`pwd` modules_install +clean: + $(MAKE) -C $(KERNELDIR) M=`pwd` clean + +endif diff --git a/fs/btrfs/acl.c b/fs/btrfs/acl.c new file mode 100644 index 00000000000..1d53b62dbba --- /dev/null +++ b/fs/btrfs/acl.c @@ -0,0 +1,351 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
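Stepping back to the tdo24m changes a little earlier in this patch: tdo24m_probe() now reads an optional platform data block from the SPI device to choose between the TDO24M and TDO35S command sequences, defaulting to TDO24M when none is supplied. Board-side wiring would look roughly like the sketch below. This assumes the new <linux/spi/tdo24m.h> header added by this patch (not shown in this hunk) exposes a platform-data struct with a 'model' field, as the probe code implies; the spi_board_info numbers are placeholders.

#include <linux/spi/spi.h>
#include <linux/spi/tdo24m.h>

static struct tdo24m_platform_data example_lcd_pdata = {
	.model = TDO35S,		/* select the TDO35S init sequences */
};

static struct spi_board_info example_board_info[] __initdata = {
	{
		.modalias	= "tdo24m",
		.platform_data	= &example_lcd_pdata,
		.max_speed_hz	= 1000000,	/* placeholder */
		.bus_num	= 1,		/* placeholder */
		.chip_select	= 0,		/* placeholder */
	},
};

/* registered from board init code via spi_register_board_info() */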
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/string.h> +#include <linux/xattr.h> +#include <linux/posix_acl_xattr.h> +#include <linux/posix_acl.h> +#include <linux/sched.h> + +#include "ctree.h" +#include "btrfs_inode.h" +#include "xattr.h" + +#ifdef CONFIG_FS_POSIX_ACL + +static void btrfs_update_cached_acl(struct inode *inode, + struct posix_acl **p_acl, + struct posix_acl *acl) +{ + spin_lock(&inode->i_lock); + if (*p_acl && *p_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(*p_acl); + *p_acl = posix_acl_dup(acl); + spin_unlock(&inode->i_lock); +} + +static struct posix_acl *btrfs_get_acl(struct inode *inode, int type) +{ + int size; + const char *name; + char *value = NULL; + struct posix_acl *acl = NULL, **p_acl; + + switch (type) { + case ACL_TYPE_ACCESS: + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return ERR_PTR(-EINVAL); + } + + spin_lock(&inode->i_lock); + if (*p_acl != BTRFS_ACL_NOT_CACHED) + acl = posix_acl_dup(*p_acl); + spin_unlock(&inode->i_lock); + + if (acl) + return acl; + + + size = __btrfs_getxattr(inode, name, "", 0); + if (size > 0) { + value = kzalloc(size, GFP_NOFS); + if (!value) + return ERR_PTR(-ENOMEM); + size = __btrfs_getxattr(inode, name, value, size); + if (size > 0) { + acl = posix_acl_from_xattr(value, size); + btrfs_update_cached_acl(inode, p_acl, acl); + } + kfree(value); + } else if (size == -ENOENT) { + acl = NULL; + btrfs_update_cached_acl(inode, p_acl, acl); + } + + return acl; +} + +static int btrfs_xattr_get_acl(struct inode *inode, int type, + void *value, size_t size) +{ + struct posix_acl *acl; + int ret = 0; + + acl = btrfs_get_acl(inode, type); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl == NULL) + return -ENODATA; + ret = posix_acl_to_xattr(acl, value, size); + posix_acl_release(acl); + + return ret; +} + +/* + * Needs to be called with fs_mutex held + */ +static int btrfs_set_acl(struct inode *inode, struct posix_acl *acl, int type) +{ + int ret, size = 0; + const char *name; + struct posix_acl **p_acl; + char *value = NULL; + mode_t mode; + + if (acl) { + ret = posix_acl_valid(acl); + if (ret < 0) + return ret; + ret = 0; + } + + switch (type) { + case ACL_TYPE_ACCESS: + mode = inode->i_mode; + ret = posix_acl_equiv_mode(acl, &mode); + if (ret < 0) + return ret; + ret = 0; + inode->i_mode = mode; + name = POSIX_ACL_XATTR_ACCESS; + p_acl = &BTRFS_I(inode)->i_acl; + break; + case ACL_TYPE_DEFAULT: + if (!S_ISDIR(inode->i_mode)) + return acl ? 
-EINVAL : 0; + name = POSIX_ACL_XATTR_DEFAULT; + p_acl = &BTRFS_I(inode)->i_default_acl; + break; + default: + return -EINVAL; + } + + if (acl) { + size = posix_acl_xattr_size(acl->a_count); + value = kmalloc(size, GFP_NOFS); + if (!value) { + ret = -ENOMEM; + goto out; + } + + ret = posix_acl_to_xattr(acl, value, size); + if (ret < 0) + goto out; + } + + ret = __btrfs_setxattr(inode, name, value, size, 0); + +out: + kfree(value); + + if (!ret) + btrfs_update_cached_acl(inode, p_acl, acl); + + return ret; +} + +static int btrfs_xattr_set_acl(struct inode *inode, int type, + const void *value, size_t size) +{ + int ret = 0; + struct posix_acl *acl = NULL; + + if (value) { + acl = posix_acl_from_xattr(value, size); + if (acl == NULL) { + value = NULL; + size = 0; + } else if (IS_ERR(acl)) { + return PTR_ERR(acl); + } + } + + ret = btrfs_set_acl(inode, acl, type); + + posix_acl_release(acl); + + return ret; +} + + +static int btrfs_xattr_acl_access_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_access_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_ACCESS, value, size); +} + +static int btrfs_xattr_acl_default_get(struct inode *inode, const char *name, + void *value, size_t size) +{ + return btrfs_xattr_get_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +static int btrfs_xattr_acl_default_set(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + return btrfs_xattr_set_acl(inode, ACL_TYPE_DEFAULT, value, size); +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + struct posix_acl *acl; + int error = -EAGAIN; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + + if (IS_ERR(acl)) + return PTR_ERR(acl); + if (acl) { + error = posix_acl_permission(inode, acl, mask); + posix_acl_release(acl); + } + + return error; +} + +/* + * btrfs_init_acl is already generally called under fs_mutex, so the locking + * stuff has been fixed to work with that. If the locking stuff changes, we + * need to re-evaluate the acl locking stuff. 
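btrfs_check_acl() above follows the usual check_acl contract: -EAGAIN means "no ACL applies, fall back to the ordinary mode bits", anything else is the final answer. btrfs's permission op lives in inode.c and is not part of this hunk, so treat the sketch below as an assumption about how the callback is wired into generic_permission(), not a quote of the real code.

#include <linux/fs.h>

int btrfs_check_acl(struct inode *inode, int mask);	/* from acl.c above */

/* hedged sketch of a ->permission implementation using the ACL callback */
static int example_permission(struct inode *inode, int mask)
{
	return generic_permission(inode, mask, btrfs_check_acl);
}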
+ */ +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + struct posix_acl *acl = NULL; + int ret = 0; + + /* this happens with subvols */ + if (!dir) + return 0; + + if (!S_ISLNK(inode->i_mode)) { + if (IS_POSIXACL(dir)) { + acl = btrfs_get_acl(dir, ACL_TYPE_DEFAULT); + if (IS_ERR(acl)) + return PTR_ERR(acl); + } + + if (!acl) + inode->i_mode &= ~current->fs->umask; + } + + if (IS_POSIXACL(dir) && acl) { + struct posix_acl *clone; + mode_t mode; + + if (S_ISDIR(inode->i_mode)) { + ret = btrfs_set_acl(inode, acl, ACL_TYPE_DEFAULT); + if (ret) + goto failed; + } + clone = posix_acl_clone(acl, GFP_NOFS); + ret = -ENOMEM; + if (!clone) + goto failed; + + mode = inode->i_mode; + ret = posix_acl_create_masq(clone, &mode); + if (ret >= 0) { + inode->i_mode = mode; + if (ret > 0) { + /* we need an acl */ + ret = btrfs_set_acl(inode, clone, + ACL_TYPE_ACCESS); + } + } + } +failed: + posix_acl_release(acl); + + return ret; +} + +int btrfs_acl_chmod(struct inode *inode) +{ + struct posix_acl *acl, *clone; + int ret = 0; + + if (S_ISLNK(inode->i_mode)) + return -EOPNOTSUPP; + + if (!IS_POSIXACL(inode)) + return 0; + + acl = btrfs_get_acl(inode, ACL_TYPE_ACCESS); + if (IS_ERR(acl) || !acl) + return PTR_ERR(acl); + + clone = posix_acl_clone(acl, GFP_KERNEL); + posix_acl_release(acl); + if (!clone) + return -ENOMEM; + + ret = posix_acl_chmod_masq(clone, inode->i_mode); + if (!ret) + ret = btrfs_set_acl(inode, clone, ACL_TYPE_ACCESS); + + posix_acl_release(clone); + + return ret; +} + +struct xattr_handler btrfs_xattr_acl_default_handler = { + .prefix = POSIX_ACL_XATTR_DEFAULT, + .get = btrfs_xattr_acl_default_get, + .set = btrfs_xattr_acl_default_set, +}; + +struct xattr_handler btrfs_xattr_acl_access_handler = { + .prefix = POSIX_ACL_XATTR_ACCESS, + .get = btrfs_xattr_acl_access_get, + .set = btrfs_xattr_acl_access_set, +}; + +#else /* CONFIG_FS_POSIX_ACL */ + +int btrfs_acl_chmod(struct inode *inode) +{ + return 0; +} + +int btrfs_init_acl(struct inode *inode, struct inode *dir) +{ + return 0; +} + +int btrfs_check_acl(struct inode *inode, int mask) +{ + return 0; +} + +#endif /* CONFIG_FS_POSIX_ACL */ diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c new file mode 100644 index 00000000000..8e2fec05dbe --- /dev/null +++ b/fs/btrfs/async-thread.c @@ -0,0 +1,419 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/version.h> +#include <linux/kthread.h> +#include <linux/list.h> +#include <linux/spinlock.h> +# include <linux/freezer.h> +#include "async-thread.h" + +#define WORK_QUEUED_BIT 0 +#define WORK_DONE_BIT 1 +#define WORK_ORDER_DONE_BIT 2 + +/* + * container for the kthread task pointer and the list of pending work + * One of these is allocated per thread. 
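The two xattr_handler structures at the end of acl.c above only become reachable once they are listed in the filesystem's s_xattr table, which the generic getxattr/setxattr code walks by prefix. btrfs's real table lives in xattr.c elsewhere in this patch; the sketch below only illustrates the shape of such a table and is not a copy of it.

#include <linux/xattr.h>

extern struct xattr_handler btrfs_xattr_acl_access_handler;
extern struct xattr_handler btrfs_xattr_acl_default_handler;

/* illustrative only; the full table also carries the non-ACL handlers */
static struct xattr_handler *example_xattr_handlers[] = {
	&btrfs_xattr_acl_access_handler,
	&btrfs_xattr_acl_default_handler,
	NULL,				/* the VFS stops at the NULL entry */
};

/* wired up at mount time, roughly: sb->s_xattr = example_xattr_handlers; */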
+ */ +struct btrfs_worker_thread { + /* pool we belong to */ + struct btrfs_workers *workers; + + /* list of struct btrfs_work that are waiting for service */ + struct list_head pending; + + /* list of worker threads from struct btrfs_workers */ + struct list_head worker_list; + + /* kthread */ + struct task_struct *task; + + /* number of things on the pending list */ + atomic_t num_pending; + + unsigned long sequence; + + /* protects the pending list. */ + spinlock_t lock; + + /* set to non-zero when this thread is already awake and kicking */ + int working; + + /* are we currently idle */ + int idle; +}; + +/* + * helper function to move a thread onto the idle list after it + * has finished some requests. + */ +static void check_idle_worker(struct btrfs_worker_thread *worker) +{ + if (!worker->idle && atomic_read(&worker->num_pending) < + worker->workers->idle_thresh / 2) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 1; + list_move(&worker->worker_list, &worker->workers->idle_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +/* + * helper function to move a thread off the idle list after new + * pending work is added. + */ +static void check_busy_worker(struct btrfs_worker_thread *worker) +{ + if (worker->idle && atomic_read(&worker->num_pending) >= + worker->workers->idle_thresh) { + unsigned long flags; + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } +} + +static noinline int run_ordered_completions(struct btrfs_workers *workers, + struct btrfs_work *work) +{ + unsigned long flags; + + if (!workers->ordered) + return 0; + + set_bit(WORK_DONE_BIT, &work->flags); + + spin_lock_irqsave(&workers->lock, flags); + + while (!list_empty(&workers->order_list)) { + work = list_entry(workers->order_list.next, + struct btrfs_work, order_list); + + if (!test_bit(WORK_DONE_BIT, &work->flags)) + break; + + /* we are going to call the ordered done function, but + * we leave the work item on the list as a barrier so + * that later work items that are done don't have their + * functions called before this one returns + */ + if (test_and_set_bit(WORK_ORDER_DONE_BIT, &work->flags)) + break; + + spin_unlock_irqrestore(&workers->lock, flags); + + work->ordered_func(work); + + /* now take the lock again and call the freeing code */ + spin_lock_irqsave(&workers->lock, flags); + list_del(&work->order_list); + work->ordered_free(work); + } + + spin_unlock_irqrestore(&workers->lock, flags); + return 0; +} + +/* + * main loop for servicing work items + */ +static int worker_loop(void *arg) +{ + struct btrfs_worker_thread *worker = arg; + struct list_head *cur; + struct btrfs_work *work; + do { + spin_lock_irq(&worker->lock); + while (!list_empty(&worker->pending)) { + cur = worker->pending.next; + work = list_entry(cur, struct btrfs_work, list); + list_del(&work->list); + clear_bit(WORK_QUEUED_BIT, &work->flags); + + work->worker = worker; + spin_unlock_irq(&worker->lock); + + work->func(work); + + atomic_dec(&worker->num_pending); + /* + * unless this is an ordered work queue, + * 'work' was probably freed by func above. 
+ */ + run_ordered_completions(worker->workers, work); + + spin_lock_irq(&worker->lock); + check_idle_worker(worker); + + } + worker->working = 0; + if (freezing(current)) { + refrigerator(); + } else { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irq(&worker->lock); + if (!kthread_should_stop()) + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +/* + * this will wait for all the worker threads to shutdown + */ +int btrfs_stop_workers(struct btrfs_workers *workers) +{ + struct list_head *cur; + struct btrfs_worker_thread *worker; + + list_splice_init(&workers->idle_list, &workers->worker_list); + while (!list_empty(&workers->worker_list)) { + cur = workers->worker_list.next; + worker = list_entry(cur, struct btrfs_worker_thread, + worker_list); + kthread_stop(worker->task); + list_del(&worker->worker_list); + kfree(worker); + } + return 0; +} + +/* + * simple init on struct btrfs_workers + */ +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max) +{ + workers->num_workers = 0; + INIT_LIST_HEAD(&workers->worker_list); + INIT_LIST_HEAD(&workers->idle_list); + INIT_LIST_HEAD(&workers->order_list); + spin_lock_init(&workers->lock); + workers->max_workers = max; + workers->idle_thresh = 32; + workers->name = name; + workers->ordered = 0; +} + +/* + * starts new worker threads. This does not enforce the max worker + * count in case you need to temporarily go past it. + */ +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers) +{ + struct btrfs_worker_thread *worker; + int ret = 0; + int i; + + for (i = 0; i < num_workers; i++) { + worker = kzalloc(sizeof(*worker), GFP_NOFS); + if (!worker) { + ret = -ENOMEM; + goto fail; + } + + INIT_LIST_HEAD(&worker->pending); + INIT_LIST_HEAD(&worker->worker_list); + spin_lock_init(&worker->lock); + atomic_set(&worker->num_pending, 0); + worker->task = kthread_run(worker_loop, worker, + "btrfs-%s-%d", workers->name, + workers->num_workers + i); + worker->workers = workers; + if (IS_ERR(worker->task)) { + kfree(worker); + ret = PTR_ERR(worker->task); + goto fail; + } + + spin_lock_irq(&workers->lock); + list_add_tail(&worker->worker_list, &workers->idle_list); + worker->idle = 1; + workers->num_workers++; + spin_unlock_irq(&workers->lock); + } + return 0; +fail: + btrfs_stop_workers(workers); + return ret; +} + +/* + * run through the list and find a worker thread that doesn't have a lot + * to do right now. This can return null if we aren't yet at the thread + * count limit and all of the threads are busy. + */ +static struct btrfs_worker_thread *next_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + struct list_head *next; + int enforce_min = workers->num_workers < workers->max_workers; + + /* + * if we find an idle thread, don't move it to the end of the + * idle list. This improves the chance that the next submission + * will reuse the same thread, and maybe catch it while it is still + * working + */ + if (!list_empty(&workers->idle_list)) { + next = workers->idle_list.next; + worker = list_entry(next, struct btrfs_worker_thread, + worker_list); + return worker; + } + if (enforce_min || list_empty(&workers->worker_list)) + return NULL; + + /* + * if we pick a busy task, move the task to the end of the list. + * hopefully this will keep things somewhat evenly balanced. + * Do the move in batches based on the sequence number. This groups + * requests submitted at roughly the same time onto the same worker. 
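By this point async-thread.c provides everything a user of the pool needs: btrfs_init_workers() and btrfs_start_workers() to bring a pool up, btrfs_stop_workers() to tear it down, and btrfs_queue_worker() (defined a little further down in this file) to hand it work. The usage model, also spelled out in async-thread.h later in this patch, is to embed a struct btrfs_work in the caller's own structure and recover it with container_of() inside the work function. A hedged usage sketch with arbitrary names and pool size; the ordered_func/ordered_free hooks are omitted because this example does not set workers->ordered:

#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/types.h>
#include "async-thread.h"

static struct btrfs_workers example_workers;

struct example_job {
	struct btrfs_work work;		/* must be embedded, not pointed to */
	u64 start;			/* caller-specific payload */
};

static void example_job_func(struct btrfs_work *work)
{
	struct example_job *job = container_of(work, struct example_job, work);

	/* ... process job->start ... */

	/* safe for non-ordered pools; worker_loop notes 'work' may be freed here */
	kfree(job);
}

static int example_pool_setup(void)
{
	/* up to 8 kthreads named btrfs-example-N; more are started on demand */
	btrfs_init_workers(&example_workers, "example", 8);
	return btrfs_start_workers(&example_workers, 1);
}

static int example_submit(u64 start)
{
	struct example_job *job = kzalloc(sizeof(*job), GFP_NOFS);

	if (!job)
		return -ENOMEM;
	job->start = start;
	job->work.func = example_job_func;	/* flags already zeroed by kzalloc */
	return btrfs_queue_worker(&example_workers, &job->work);
}

static void example_pool_teardown(void)
{
	btrfs_stop_workers(&example_workers);
}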
+ */ + next = workers->worker_list.next; + worker = list_entry(next, struct btrfs_worker_thread, worker_list); + atomic_inc(&worker->num_pending); + worker->sequence++; + + if (worker->sequence % workers->idle_thresh == 0) + list_move_tail(next, &workers->worker_list); + return worker; +} + +/* + * selects a worker thread to take the next job. This will either find + * an idle worker, start a new worker up to the max count, or just return + * one of the existing busy workers. + */ +static struct btrfs_worker_thread *find_worker(struct btrfs_workers *workers) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + +again: + spin_lock_irqsave(&workers->lock, flags); + worker = next_worker(workers); + spin_unlock_irqrestore(&workers->lock, flags); + + if (!worker) { + spin_lock_irqsave(&workers->lock, flags); + if (workers->num_workers >= workers->max_workers) { + struct list_head *fallback = NULL; + /* + * we have failed to find any workers, just + * return the force one + */ + if (!list_empty(&workers->worker_list)) + fallback = workers->worker_list.next; + if (!list_empty(&workers->idle_list)) + fallback = workers->idle_list.next; + BUG_ON(!fallback); + worker = list_entry(fallback, + struct btrfs_worker_thread, worker_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + spin_unlock_irqrestore(&workers->lock, flags); + /* we're below the limit, start another worker */ + btrfs_start_workers(workers, 1); + goto again; + } + } + return worker; +} + +/* + * btrfs_requeue_work just puts the work item back on the tail of the list + * it was taken from. It is intended for use with long running work functions + * that make some progress and want to give the cpu up for others. + */ +int btrfs_requeue_work(struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker = work->worker; + unsigned long flags; + + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + list_add_tail(&work->list, &worker->pending); + + /* by definition we're busy, take ourselves off the idle + * list + */ + if (worker->idle) { + spin_lock_irqsave(&worker->workers->lock, flags); + worker->idle = 0; + list_move_tail(&worker->worker_list, + &worker->workers->worker_list); + spin_unlock_irqrestore(&worker->workers->lock, flags); + } + + spin_unlock_irqrestore(&worker->lock, flags); + +out: + return 0; +} + +/* + * places a struct btrfs_work into the pending queue of one of the kthreads + */ +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work) +{ + struct btrfs_worker_thread *worker; + unsigned long flags; + int wake = 0; + + /* don't requeue something already on a list */ + if (test_and_set_bit(WORK_QUEUED_BIT, &work->flags)) + goto out; + + worker = find_worker(workers); + if (workers->ordered) { + spin_lock_irqsave(&workers->lock, flags); + list_add_tail(&work->order_list, &workers->order_list); + spin_unlock_irqrestore(&workers->lock, flags); + } else { + INIT_LIST_HEAD(&work->order_list); + } + + spin_lock_irqsave(&worker->lock, flags); + atomic_inc(&worker->num_pending); + check_busy_worker(worker); + list_add_tail(&work->list, &worker->pending); + + /* + * avoid calling into wake_up_process if this thread has already + * been kicked + */ + if (!worker->working) + wake = 1; + worker->working = 1; + + spin_unlock_irqrestore(&worker->lock, flags); + + if (wake) + wake_up_process(worker->task); +out: + return 0; +} diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h 
new file mode 100644 index 00000000000..31be4ed8b63 --- /dev/null +++ b/fs/btrfs/async-thread.h @@ -0,0 +1,101 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ASYNC_THREAD_ +#define __BTRFS_ASYNC_THREAD_ + +struct btrfs_worker_thread; + +/* + * This is similar to a workqueue, but it is meant to spread the operations + * across all available cpus instead of just the CPU that was used to + * queue the work. There is also some batching introduced to try and + * cut down on context switches. + * + * By default threads are added on demand up to 2 * the number of cpus. + * Changing struct btrfs_workers->max_workers is one way to prevent + * demand creation of kthreads. + * + * the basic model of these worker threads is to embed a btrfs_work + * structure in your own data struct, and use container_of in a + * work function to get back to your data struct. + */ +struct btrfs_work { + /* + * func should be set to the function you want called + * your work struct is passed as the only arg + * + * ordered_func must be set for work sent to an ordered work queue, + * and it is called to complete a given work item in the same + * order they were sent to the queue. + */ + void (*func)(struct btrfs_work *work); + void (*ordered_func)(struct btrfs_work *work); + void (*ordered_free)(struct btrfs_work *work); + + /* + * flags should be set to zero. It is used to make sure the + * struct is only inserted once into the list. + */ + unsigned long flags; + + /* don't touch these */ + struct btrfs_worker_thread *worker; + struct list_head list; + struct list_head order_list; +}; + +struct btrfs_workers { + /* current number of running workers */ + int num_workers; + + /* max number of workers allowed. changed by btrfs_start_workers */ + int max_workers; + + /* once a worker has this many requests or fewer, it is idle */ + int idle_thresh; + + /* force completions in the order they were queued */ + int ordered; + + /* list with all the work threads. The workers on the idle thread + * may be actively servicing jobs, but they haven't yet hit the + * idle thresh limit above. 
+ */ + struct list_head worker_list; + struct list_head idle_list; + + /* + * when operating in ordered mode, this maintains the list + * of work items waiting for completion + */ + struct list_head order_list; + + /* lock for finding the next worker thread to queue on */ + spinlock_t lock; + + /* extra name for this worker, used for current->name */ + char *name; +}; + +int btrfs_queue_worker(struct btrfs_workers *workers, struct btrfs_work *work); +int btrfs_start_workers(struct btrfs_workers *workers, int num_workers); +int btrfs_stop_workers(struct btrfs_workers *workers); +void btrfs_init_workers(struct btrfs_workers *workers, char *name, int max); +int btrfs_requeue_work(struct btrfs_work *work); +#endif diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h new file mode 100644 index 00000000000..a8c9693b75a --- /dev/null +++ b/fs/btrfs/btrfs_inode.h @@ -0,0 +1,131 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_I__ +#define __BTRFS_I__ + +#include "extent_map.h" +#include "extent_io.h" +#include "ordered-data.h" + +/* in memory btrfs inode */ +struct btrfs_inode { + /* which subvolume this inode belongs to */ + struct btrfs_root *root; + + /* key used to find this inode on disk. This is used by the code + * to read in roots of subvolumes + */ + struct btrfs_key location; + + /* the extent_tree has caches of all the extent mappings to disk */ + struct extent_map_tree extent_tree; + + /* the io_tree does range state (DIRTY, LOCKED etc) */ + struct extent_io_tree io_tree; + + /* special utility tree used to record which mirrors have already been + * tried when checksums fail for a given block + */ + struct extent_io_tree io_failure_tree; + + /* held while inesrting or deleting extents from files */ + struct mutex extent_mutex; + + /* held while logging the inode in tree-log.c */ + struct mutex log_mutex; + + /* used to order data wrt metadata */ + struct btrfs_ordered_inode_tree ordered_tree; + + /* standard acl pointers */ + struct posix_acl *i_acl; + struct posix_acl *i_default_acl; + + /* for keeping track of orphaned inodes */ + struct list_head i_orphan; + + /* list of all the delalloc inodes in the FS. There are times we need + * to write all the delalloc pages to disk, and this list is used + * to walk them all. + */ + struct list_head delalloc_inodes; + + /* full 64 bit generation number, struct vfs_inode doesn't have a big + * enough field for this. + */ + u64 generation; + + /* sequence number for NFS changes */ + u64 sequence; + + /* + * transid of the trans_handle that last modified this inode + */ + u64 last_trans; + /* + * transid that last logged this inode + */ + u64 logged_trans; + + /* + * trans that last made a change that should be fully fsync'd. 
This + * gets reset to zero each time the inode is logged + */ + u64 log_dirty_trans; + + /* total number of bytes pending delalloc, used by stat to calc the + * real block usage of the file + */ + u64 delalloc_bytes; + + /* + * the size of the file stored in the metadata on disk. data=ordered + * means the in-memory i_size might be larger than the size on disk + * because not all the blocks are written yet. + */ + u64 disk_i_size; + + /* flags field from the on disk inode */ + u32 flags; + + /* + * if this is a directory then index_cnt is the counter for the index + * number for new files that are created + */ + u64 index_cnt; + + /* the start of block group preferred for allocations. */ + u64 block_group; + + struct inode vfs_inode; +}; + +static inline struct btrfs_inode *BTRFS_I(struct inode *inode) +{ + return container_of(inode, struct btrfs_inode, vfs_inode); +} + +static inline void btrfs_i_size_write(struct inode *inode, u64 size) +{ + inode->i_size = size; + BTRFS_I(inode)->disk_i_size = size; +} + + +#endif diff --git a/fs/btrfs/compat.h b/fs/btrfs/compat.h new file mode 100644 index 00000000000..7c4503ef6ef --- /dev/null +++ b/fs/btrfs/compat.h @@ -0,0 +1,7 @@ +#ifndef _COMPAT_H_ +#define _COMPAT_H_ + +#define btrfs_drop_nlink(inode) drop_nlink(inode) +#define btrfs_inc_nlink(inode) inc_nlink(inode) + +#endif /* _COMPAT_H_ */ diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c new file mode 100644 index 00000000000..ee848d8585d --- /dev/null +++ b/fs/btrfs/compression.c @@ -0,0 +1,709 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
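btrfs_inode.h above uses the standard in-memory inode embedding pattern: the VFS struct inode sits at the tail of struct btrfs_inode and BTRFS_I() recovers the container with container_of(). A small illustrative snippet showing how code moves from a plain struct inode * to the btrfs-specific fields; the helper names here are hypothetical, only BTRFS_I() and btrfs_i_size_write() come from the header above.

#include <linux/fs.h>
#include "btrfs_inode.h"

/* hypothetical helper: how much delalloc is still outstanding on this inode */
static u64 example_outstanding_delalloc(struct inode *inode)
{
	return BTRFS_I(inode)->delalloc_bytes;
}

/* hypothetical helper: growing a file updates the VFS size and disk_i_size together */
static void example_extend(struct inode *inode, u64 new_size)
{
	btrfs_i_size_write(inode, new_size);
}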
+ */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/bit_spinlock.h> +#include <linux/version.h> +#include <linux/pagevec.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "ordered-data.h" +#include "compression.h" +#include "extent_io.h" +#include "extent_map.h" + +struct compressed_bio { + /* number of bios pending for this compressed extent */ + atomic_t pending_bios; + + /* the pages with the compressed data on them */ + struct page **compressed_pages; + + /* inode that owns this data */ + struct inode *inode; + + /* starting offset in the inode for our pages */ + u64 start; + + /* number of bytes in the inode we're working on */ + unsigned long len; + + /* number of bytes on disk */ + unsigned long compressed_len; + + /* number of compressed pages in the array */ + unsigned long nr_pages; + + /* IO errors */ + int errors; + int mirror_num; + + /* for reads, this is the bio we are copying the data into */ + struct bio *orig_bio; + + /* + * the start of a variable length array of checksums only + * used by reads + */ + u32 sums; +}; + +static inline int compressed_bio_size(struct btrfs_root *root, + unsigned long disk_size) +{ + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + return sizeof(struct compressed_bio) + + ((disk_size + root->sectorsize - 1) / root->sectorsize) * + csum_size; +} + +static struct bio *compressed_bio_alloc(struct block_device *bdev, + u64 first_byte, gfp_t gfp_flags) +{ + struct bio *bio; + int nr_vecs; + + nr_vecs = bio_get_nr_vecs(bdev); + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_byte >> 9; + } + return bio; +} + +static int check_compressed_csum(struct inode *inode, + struct compressed_bio *cb, + u64 disk_start) +{ + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page *page; + unsigned long i; + char *kaddr; + u32 csum; + u32 *cb_sum = &cb->sums; + + if (btrfs_test_flag(inode, NODATASUM)) + return 0; + + for (i = 0; i < cb->nr_pages; i++) { + page = cb->compressed_pages[i]; + csum = ~(u32)0; + + kaddr = kmap_atomic(page, KM_USER0); + csum = btrfs_csum_data(root, kaddr, csum, PAGE_CACHE_SIZE); + btrfs_csum_final(csum, (char *)&csum); + kunmap_atomic(kaddr, KM_USER0); + + if (csum != *cb_sum) { + printk(KERN_INFO "btrfs csum failed ino %lu " + "extent %llu csum %u " + "wanted %u mirror %d\n", inode->i_ino, + (unsigned long long)disk_start, + csum, *cb_sum, cb->mirror_num); + ret = -EIO; + goto fail; + } + cb_sum++; + + } + ret = 0; +fail: + return ret; +} + +/* when we finish reading compressed pages from the disk, we + * decompress them and then run the bio end_io routines on the + * decompressed pages (in the inode address space). 
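compressed_bio_size() above sizes the allocation so that a per-sector checksum array can ride at the end of struct compressed_bio, starting at the 'sums' member: one checksum for every sectorsize chunk of the compressed extent, rounded up. A worked example with illustrative numbers (a 4 KiB sectorsize and the 4-byte crc32c checksums btrfs currently uses):

	/* illustrative values only */
	unsigned long disk_size = 96 * 1024;			 /* compressed bytes on disk */
	unsigned long nr_csums  = (disk_size + 4096 - 1) / 4096; /* 24 sectors */
	size_t alloc = sizeof(struct compressed_bio) + nr_csums * 4; /* struct + 96 bytes of csums */

So the kmalloc() calls later in this file get the struct plus 96 trailing checksum bytes, and the read path treats &cb->sums as the first element of that trailing array.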
+ * + * This allows the checksumming and other IO error handling routines + * to work normally + * + * The compressed pages are freed here, and it must be run + * in process context + */ +static void end_compressed_bio_read(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + int ret; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + inode = cb->inode; + ret = check_compressed_csum(inode, cb, (u64)bio->bi_sector << 9); + if (ret) + goto csum_failed; + + /* ok, we're the last bio for this extent, lets start + * the decompression. + */ + tree = &BTRFS_I(inode)->io_tree; + ret = btrfs_zlib_decompress_biovec(cb->compressed_pages, + cb->start, + cb->orig_bio->bi_io_vec, + cb->orig_bio->bi_vcnt, + cb->compressed_len); +csum_failed: + if (ret) + cb->errors = 1; + + /* release the compressed pages */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* do io completion on the original bio */ + if (cb->errors) { + bio_io_error(cb->orig_bio); + } else { + int bio_index = 0; + struct bio_vec *bvec = cb->orig_bio->bi_io_vec; + + /* + * we have verified the checksum already, set page + * checked so the end_io handlers know about it + */ + while (bio_index < cb->orig_bio->bi_vcnt) { + SetPageChecked(bvec->bv_page); + bvec++; + bio_index++; + } + bio_endio(cb->orig_bio, 0); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * Clear the writeback bits on all of the file + * pages for a compressed write + */ +static noinline int end_compressed_writeback(struct inode *inode, u64 start, + unsigned long ram_size) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = (start + ram_size - 1) >> PAGE_CACHE_SHIFT; + struct page *pages[16]; + unsigned long nr_pages = end_index - index + 1; + int i; + int ret; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + nr_pages -= 1; + index += 1; + continue; + } + for (i = 0; i < ret; i++) { + end_page_writeback(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + } + /* the inode may be gone now */ + return 0; +} + +/* + * do the cleanup once all the compressed pages hit the disk. + * This will clear writeback on the file pages and free the compressed + * pages. + * + * This also calls the writeback end hooks for the file pages so that + * metadata and checksums can be updated in the file. 
+ */ +static void end_compressed_bio_write(struct bio *bio, int err) +{ + struct extent_io_tree *tree; + struct compressed_bio *cb = bio->bi_private; + struct inode *inode; + struct page *page; + unsigned long index; + + if (err) + cb->errors = 1; + + /* if there are more bios still pending for this compressed + * extent, just exit + */ + if (!atomic_dec_and_test(&cb->pending_bios)) + goto out; + + /* ok, we're the last bio for this extent, step one is to + * call back into the FS and do all the end_io operations + */ + inode = cb->inode; + tree = &BTRFS_I(inode)->io_tree; + cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + tree->ops->writepage_end_io_hook(cb->compressed_pages[0], + cb->start, + cb->start + cb->len - 1, + NULL, 1); + cb->compressed_pages[0]->mapping = NULL; + + end_compressed_writeback(inode, cb->start, cb->len); + /* note, our inode could be gone now */ + + /* + * release the compressed pages, these came from alloc_page and + * are not attached to the inode at all + */ + index = 0; + for (index = 0; index < cb->nr_pages; index++) { + page = cb->compressed_pages[index]; + page->mapping = NULL; + page_cache_release(page); + } + + /* finally free the cb struct */ + kfree(cb->compressed_pages); + kfree(cb); +out: + bio_put(bio); +} + +/* + * worker function to build and submit bios for previously compressed pages. + * The corresponding pages in the inode should be marked for writeback + * and the compressed pages should have a reference on them for dropping + * when the IO is complete. + * + * This also checksums the file bytes and gets things ready for + * the end io hooks. + */ +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages) +{ + struct bio *bio = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct compressed_bio *cb; + unsigned long bytes_left; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int page_index = 0; + struct page *page; + u64 first_byte = disk_start; + struct block_device *bdev; + int ret; + + WARN_ON(start & ((u64)PAGE_CACHE_SIZE - 1)); + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->start = start; + cb->len = len; + cb->mirror_num = 0; + cb->compressed_pages = compressed_pages; + cb->compressed_len = compressed_len; + cb->orig_bio = NULL; + cb->nr_pages = nr_pages; + + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + atomic_inc(&cb->pending_bios); + + /* create and submit bios for the compressed pages */ + bytes_left = compressed_len; + for (page_index = 0; page_index < cb->nr_pages; page_index++) { + page = compressed_pages[page_index]; + page->mapping = inode->i_mapping; + if (bio->bi_size) + ret = io_tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(bio); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. 
Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + + bio = compressed_bio_alloc(bdev, first_byte, GFP_NOFS); + bio->bi_private = cb; + bio->bi_end_io = end_compressed_bio_write; + bio_add_page(bio, page, PAGE_CACHE_SIZE, 0); + } + if (bytes_left < PAGE_CACHE_SIZE) { + printk("bytes left %lu compress len %lu nr %lu\n", + bytes_left, cb->compressed_len, cb->nr_pages); + } + bytes_left -= PAGE_CACHE_SIZE; + first_byte += PAGE_CACHE_SIZE; + cond_resched(); + } + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + ret = btrfs_csum_one_bio(root, inode, bio, start, 1); + BUG_ON(ret); + + ret = btrfs_map_bio(root, WRITE, bio, 0, 1); + BUG_ON(ret); + + bio_put(bio); + return 0; +} + +static noinline int add_ra_bio_pages(struct inode *inode, + u64 compressed_end, + struct compressed_bio *cb) +{ + unsigned long end_index; + unsigned long page_index; + u64 last_offset; + u64 isize = i_size_read(inode); + int ret; + struct page *page; + unsigned long nr_pages = 0; + struct extent_map *em; + struct address_space *mapping = inode->i_mapping; + struct pagevec pvec; + struct extent_map_tree *em_tree; + struct extent_io_tree *tree; + u64 end; + int misses = 0; + + page = cb->orig_bio->bi_io_vec[cb->orig_bio->bi_vcnt - 1].bv_page; + last_offset = (page_offset(page) + PAGE_CACHE_SIZE); + em_tree = &BTRFS_I(inode)->extent_tree; + tree = &BTRFS_I(inode)->io_tree; + + if (isize == 0) + return 0; + + end_index = (i_size_read(inode) - 1) >> PAGE_CACHE_SHIFT; + + pagevec_init(&pvec, 0); + while (last_offset < compressed_end) { + page_index = last_offset >> PAGE_CACHE_SHIFT; + + if (page_index > end_index) + break; + + rcu_read_lock(); + page = radix_tree_lookup(&mapping->page_tree, page_index); + rcu_read_unlock(); + if (page) { + misses++; + if (misses > 4) + break; + goto next; + } + + page = alloc_page(mapping_gfp_mask(mapping) | GFP_NOFS); + if (!page) + break; + + page->index = page_index; + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (add_to_page_cache(page, mapping, + page->index, GFP_NOFS)) { + page_cache_release(page); + goto next; + } + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + + end = last_offset + PAGE_CACHE_SIZE - 1; + /* + * at this point, we have a locked page in the page cache + * for these bytes in the file. But, we have to make + * sure they map to this compressed extent on disk. 
+ */ + set_page_extent_mapped(page); + lock_extent(tree, last_offset, end, GFP_NOFS); + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, last_offset, + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + if (!em || last_offset < em->start || + (last_offset + PAGE_CACHE_SIZE > extent_map_end(em)) || + (em->block_start >> 9) != cb->orig_bio->bi_sector) { + free_extent_map(em); + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } + free_extent_map(em); + + if (page->index == end_index) { + char *userpage; + size_t zero_offset = isize & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + int zeros; + zeros = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, zeros); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + + ret = bio_add_page(cb->orig_bio, page, + PAGE_CACHE_SIZE, 0); + + if (ret == PAGE_CACHE_SIZE) { + nr_pages++; + page_cache_release(page); + } else { + unlock_extent(tree, last_offset, end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + break; + } +next: + last_offset += PAGE_CACHE_SIZE; + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + return 0; +} + +/* + * for a compressed read, the bio we get passed has all the inode pages + * in it. We don't actually do IO on those pages but allocate new ones + * to hold the compressed pages on disk. + * + * bio->bi_sector points to the compressed extent on disk + * bio->bi_io_vec points to all of the inode pages + * bio->bi_vcnt is a count of pages + * + * After the compressed pages are read, we copy the bytes into the + * bio we were passed and then call the bio end_io calls + */ +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *em_tree; + struct compressed_bio *cb; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + unsigned long compressed_len; + unsigned long nr_pages; + unsigned long page_index; + struct page *page; + struct block_device *bdev; + struct bio *comp_bio; + u64 cur_disk_byte = (u64)bio->bi_sector << 9; + u64 em_len; + u64 em_start; + struct extent_map *em; + int ret; + u32 *sums; + + tree = &BTRFS_I(inode)->io_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + + /* we need the actual starting offset of this extent in the file */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, + page_offset(bio->bi_io_vec->bv_page), + PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + + compressed_len = em->block_len; + cb = kmalloc(compressed_bio_size(root, compressed_len), GFP_NOFS); + atomic_set(&cb->pending_bios, 0); + cb->errors = 0; + cb->inode = inode; + cb->mirror_num = mirror_num; + sums = &cb->sums; + + cb->start = em->orig_start; + em_len = em->len; + em_start = em->start; + + free_extent_map(em); + em = NULL; + + cb->len = uncompressed_len; + cb->compressed_len = compressed_len; + cb->orig_bio = bio; + + nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + cb->compressed_pages = kmalloc(sizeof(struct page *) * nr_pages, + GFP_NOFS); + bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + for (page_index = 0; page_index < nr_pages; page_index++) { + cb->compressed_pages[page_index] = alloc_page(GFP_NOFS | + __GFP_HIGHMEM); + } + cb->nr_pages = nr_pages; + + add_ra_bio_pages(inode, em_start + em_len, cb); + + 
/* include any pages we added in add_ra-bio_pages */ + uncompressed_len = bio->bi_vcnt * PAGE_CACHE_SIZE; + cb->len = uncompressed_len; + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + atomic_inc(&cb->pending_bios); + + for (page_index = 0; page_index < nr_pages; page_index++) { + page = cb->compressed_pages[page_index]; + page->mapping = inode->i_mapping; + page->index = em_start >> PAGE_CACHE_SHIFT; + + if (comp_bio->bi_size) + ret = tree->ops->merge_bio_hook(page, 0, + PAGE_CACHE_SIZE, + comp_bio, 0); + else + ret = 0; + + page->mapping = NULL; + if (ret || bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0) < + PAGE_CACHE_SIZE) { + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + /* + * inc the count before we submit the bio so + * we know the end IO handler won't happen before + * we inc the count. Otherwise, the cb might get + * freed before we're done setting it up + */ + atomic_inc(&cb->pending_bios); + + if (!btrfs_test_flag(inode, NODATASUM)) { + btrfs_lookup_bio_sums(root, inode, comp_bio, + sums); + } + sums += (comp_bio->bi_size + root->sectorsize - 1) / + root->sectorsize; + + ret = btrfs_map_bio(root, READ, comp_bio, + mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + + comp_bio = compressed_bio_alloc(bdev, cur_disk_byte, + GFP_NOFS); + comp_bio->bi_private = cb; + comp_bio->bi_end_io = end_compressed_bio_read; + + bio_add_page(comp_bio, page, PAGE_CACHE_SIZE, 0); + } + cur_disk_byte += PAGE_CACHE_SIZE; + } + bio_get(comp_bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + BUG_ON(ret); + + if (!btrfs_test_flag(inode, NODATASUM)) + btrfs_lookup_bio_sums(root, inode, comp_bio, sums); + + ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); + BUG_ON(ret); + + bio_put(comp_bio); + return 0; +} diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h new file mode 100644 index 00000000000..421f5b4aa71 --- /dev/null +++ b/fs/btrfs/compression.h @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_COMPRESSION_ +#define __BTRFS_COMPRESSION_ + +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen); +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out); +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen); +void btrfs_zlib_exit(void); +int btrfs_submit_compressed_write(struct inode *inode, u64 start, + unsigned long len, u64 disk_start, + unsigned long compressed_len, + struct page **compressed_pages, + unsigned long nr_pages); +int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); +#endif diff --git a/fs/btrfs/crc32c.h b/fs/btrfs/crc32c.h new file mode 100644 index 00000000000..6e1b3de3670 --- /dev/null +++ b/fs/btrfs/crc32c.h @@ -0,0 +1,29 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_CRC32C__ +#define __BTRFS_CRC32C__ +#include <linux/crc32c.h> + +/* + * this file used to do more for selecting the HW version of crc32c, + * perhaps it will one day again soon. + */ +#define btrfs_crc32c(seed, data, length) crc32c(seed, data, length) +#endif + diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c new file mode 100644 index 00000000000..9e46c077681 --- /dev/null +++ b/fs/btrfs/ctree.c @@ -0,0 +1,3953 @@ +/* + * Copyright (C) 2007,2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "locking.h" + +static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int level); +static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, int extend); +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty); +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst_buf, + struct extent_buffer *src_buf); +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot); + +inline void btrfs_init_path(struct btrfs_path *p) +{ + memset(p, 0, sizeof(*p)); +} + +struct btrfs_path *btrfs_alloc_path(void) +{ + struct btrfs_path *path; + path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); + if (path) { + btrfs_init_path(path); + path->reada = 1; + } + return path; +} + +/* this also releases the path */ +void btrfs_free_path(struct btrfs_path *p) +{ + btrfs_release_path(NULL, p); + kmem_cache_free(btrfs_path_cachep, p); +} + +/* + * path release drops references on the extent buffers in the path + * and it drops any locks held by this path + * + * It is safe to call this on paths that no locks or extent buffers held. + */ +noinline void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p) +{ + int i; + + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + p->slots[i] = 0; + if (!p->nodes[i]) + continue; + if (p->locks[i]) { + btrfs_tree_unlock(p->nodes[i]); + p->locks[i] = 0; + } + free_extent_buffer(p->nodes[i]); + p->nodes[i] = NULL; + } +} + +/* + * safely gets a reference on the root node of a tree. A lock + * is not taken, so a concurrent writer may put a different node + * at the root of the tree. See btrfs_lock_root_node for the + * looping required. + * + * The extent buffer returned by this has a reference taken, so + * it won't disappear. It may stop being the root of the tree + * at any time because there are no locks held. + */ +struct extent_buffer *btrfs_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + spin_lock(&root->node_lock); + eb = root->node; + extent_buffer_get(eb); + spin_unlock(&root->node_lock); + return eb; +} + +/* loop around taking references on and locking the root node of the + * tree until you end up with a lock on the root. A locked buffer + * is returned, with a reference held. + */ +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) +{ + struct extent_buffer *eb; + + while (1) { + eb = btrfs_root_node(root); + btrfs_tree_lock(eb); + + spin_lock(&root->node_lock); + if (eb == root->node) { + spin_unlock(&root->node_lock); + break; + } + spin_unlock(&root->node_lock); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + return eb; +} + +/* cowonly root (everything not a reference counted cow subvolume), just get + * put onto a simple dirty list. transaction.c walks this to make sure they + * get properly updated on disk. + */ +static void add_root_to_dirty_list(struct btrfs_root *root) +{ + if (root->track_dirty && list_empty(&root->dirty_list)) { + list_add(&root->dirty_list, + &root->fs_info->dirty_cowonly_roots); + } +} + +/* + * used by snapshot creation to make a copy of a root for a tree with + * a given objectid. 
The buffer with the new root node is returned in + * cow_ret, and this func returns zero on success or a negative error code. + */ +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid) +{ + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + struct btrfs_root *new_root; + + new_root = kmalloc(sizeof(*new_root), GFP_NOFS); + if (!new_root) + return -ENOMEM; + + memcpy(new_root, root, sizeof(*new_root)); + new_root->root_key.objectid = new_root_objectid; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + cow = btrfs_alloc_free_block(trans, new_root, buf->len, 0, + new_root_objectid, trans->transid, + level, buf->start, 0); + if (IS_ERR(cow)) { + kfree(new_root); + return PTR_ERR(cow); + } + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, new_root_objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + ret = btrfs_inc_ref(trans, new_root, buf, cow, NULL); + kfree(new_root); + + if (ret) + return ret; + + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * does the dirty work in cow of a single block. The parent block (if + * supplied) is updated to point to the new cow copy. The new buffer is marked + * dirty and returned locked. If you modify the block it needs to be marked + * dirty again. + * + * search_start -- an allocation hint for the new block + * + * empty_size -- a hint that you plan on doing more cow. This is the size in + * bytes the allocator should try to find free next to the block it returns. + * This is just a hint and may be ignored by the allocator. + * + * prealloc_dest -- if you have already reserved a destination for the cow, + * this uses that block instead of allocating a new one. + * btrfs_alloc_reserved_extent is used to finish the allocation. 
+ */ +static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, + u64 search_start, u64 empty_size, + u64 prealloc_dest) +{ + u64 parent_start; + struct extent_buffer *cow; + u32 nritems; + int ret = 0; + int level; + int unlock_orig = 0; + + if (*cow_ret == buf) + unlock_orig = 1; + + WARN_ON(!btrfs_tree_locked(buf)); + + if (parent) + parent_start = parent->start; + else + parent_start = 0; + + WARN_ON(root->ref_cows && trans->transid != + root->fs_info->running_transaction->transid); + WARN_ON(root->ref_cows && trans->transid != root->last_trans); + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (prealloc_dest) { + struct btrfs_key ins; + + ins.objectid = prealloc_dest; + ins.offset = buf->len; + ins.type = BTRFS_EXTENT_ITEM_KEY; + + ret = btrfs_alloc_reserved_extent(trans, root, parent_start, + root->root_key.objectid, + trans->transid, level, &ins); + BUG_ON(ret); + cow = btrfs_init_new_buffer(trans, root, prealloc_dest, + buf->len); + } else { + cow = btrfs_alloc_free_block(trans, root, buf->len, + parent_start, + root->root_key.objectid, + trans->transid, level, + search_start, empty_size); + } + if (IS_ERR(cow)) + return PTR_ERR(cow); + + copy_extent_buffer(cow, buf, 0, 0, cow->len); + btrfs_set_header_bytenr(cow, cow->start); + btrfs_set_header_generation(cow, trans->transid); + btrfs_set_header_owner(cow, root->root_key.objectid); + btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN); + + write_extent_buffer(cow, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(cow), + BTRFS_FSID_SIZE); + + WARN_ON(btrfs_header_generation(buf) > trans->transid); + if (btrfs_header_generation(buf) != trans->transid) { + u32 nr_extents; + ret = btrfs_inc_ref(trans, root, buf, cow, &nr_extents); + if (ret) + return ret; + + ret = btrfs_cache_ref(trans, root, buf, nr_extents); + WARN_ON(ret); + } else if (btrfs_header_owner(buf) == BTRFS_TREE_RELOC_OBJECTID) { + /* + * There are only two places that can drop reference to + * tree blocks owned by living reloc trees, one is here, + * the other place is btrfs_drop_subtree. In both places, + * we check reference count while tree block is locked. + * Furthermore, if reference count is one, it won't get + * increased by someone else. 
+ */ + u32 refs; + ret = btrfs_lookup_extent_ref(trans, root, buf->start, + buf->len, &refs); + BUG_ON(ret); + if (refs == 1) { + ret = btrfs_update_ref(trans, root, buf, cow, + 0, nritems); + clean_tree_block(trans, root, buf); + } else { + ret = btrfs_inc_ref(trans, root, buf, cow, NULL); + } + BUG_ON(ret); + } else { + ret = btrfs_update_ref(trans, root, buf, cow, 0, nritems); + if (ret) + return ret; + clean_tree_block(trans, root, buf); + } + + if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { + ret = btrfs_reloc_tree_cache_ref(trans, root, cow, buf->start); + WARN_ON(ret); + } + + if (buf == root->node) { + WARN_ON(parent && parent != buf); + + spin_lock(&root->node_lock); + root->node = cow; + extent_buffer_get(cow); + spin_unlock(&root->node_lock); + + if (buf != root->commit_root) { + btrfs_free_extent(trans, root, buf->start, + buf->len, buf->start, + root->root_key.objectid, + btrfs_header_generation(buf), + level, 1); + } + free_extent_buffer(buf); + add_root_to_dirty_list(root); + } else { + btrfs_set_node_blockptr(parent, parent_slot, + cow->start); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(parent, parent_slot, + trans->transid); + btrfs_mark_buffer_dirty(parent); + WARN_ON(btrfs_header_generation(parent) != trans->transid); + btrfs_free_extent(trans, root, buf->start, buf->len, + parent_start, btrfs_header_owner(parent), + btrfs_header_generation(parent), level, 1); + } + if (unlock_orig) + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + btrfs_mark_buffer_dirty(cow); + *cow_ret = cow; + return 0; +} + +/* + * cows a single block, see __btrfs_cow_block for the real work. + * This version of it has extra checks so that a block isn't cow'd more than + * once per transaction, as long as it hasn't been written yet + */ +noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, u64 prealloc_dest) +{ + u64 search_start; + int ret; + + if (trans->transaction != root->fs_info->running_transaction) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long) + root->fs_info->running_transaction->transid); + WARN_ON(1); + } + if (trans->transid != root->fs_info->generation) { + printk(KERN_CRIT "trans %llu running %llu\n", + (unsigned long long)trans->transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(buf) == trans->transid && + btrfs_header_owner(buf) == root->root_key.objectid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + *cow_ret = buf; + spin_unlock(&root->fs_info->hash_lock); + WARN_ON(prealloc_dest); + return 0; + } + spin_unlock(&root->fs_info->hash_lock); + search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1); + ret = __btrfs_cow_block(trans, root, buf, parent, + parent_slot, cow_ret, search_start, 0, + prealloc_dest); + return ret; +} + +/* + * helper function for defrag to decide if two blocks pointed to by a + * node are actually close by + */ +static int close_blocks(u64 blocknr, u64 other, u32 blocksize) +{ + if (blocknr < other && other - (blocknr + blocksize) < 32768) + return 1; + if (blocknr > other && blocknr - (other + blocksize) < 32768) + return 1; + return 0; +} + +/* + * compare two keys in a memcmp fashion + */ +static int comp_keys(struct btrfs_disk_key *disk, struct btrfs_key *k2) +{ + struct btrfs_key k1; + + 
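+	/* the disk key is stored little-endian; convert it to cpu byte order before comparing */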
btrfs_disk_key_to_cpu(&k1, disk); + + if (k1.objectid > k2->objectid) + return 1; + if (k1.objectid < k2->objectid) + return -1; + if (k1.type > k2->type) + return 1; + if (k1.type < k2->type) + return -1; + if (k1.offset > k2->offset) + return 1; + if (k1.offset < k2->offset) + return -1; + return 0; +} + +/* + * same as comp_keys only with two btrfs_key's + */ +static int comp_cpu_keys(struct btrfs_key *k1, struct btrfs_key *k2) +{ + if (k1->objectid > k2->objectid) + return 1; + if (k1->objectid < k2->objectid) + return -1; + if (k1->type > k2->type) + return 1; + if (k1->type < k2->type) + return -1; + if (k1->offset > k2->offset) + return 1; + if (k1->offset < k2->offset) + return -1; + return 0; +} + +/* + * this is used by the defrag code to go through all the + * leaves pointed to by a node and reallocate them so that + * disk order is close to key order + */ +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress) +{ + struct extent_buffer *cur; + u64 blocknr; + u64 gen; + u64 search_start = *last_ret; + u64 last_block = 0; + u64 other; + u32 parent_nritems; + int end_slot; + int i; + int err = 0; + int parent_level; + int uptodate; + u32 blocksize; + int progress_passed = 0; + struct btrfs_disk_key disk_key; + + parent_level = btrfs_header_level(parent); + if (cache_only && parent_level != 1) + return 0; + + if (trans->transaction != root->fs_info->running_transaction) + WARN_ON(1); + if (trans->transid != root->fs_info->generation) + WARN_ON(1); + + parent_nritems = btrfs_header_nritems(parent); + blocksize = btrfs_level_size(root, parent_level - 1); + end_slot = parent_nritems; + + if (parent_nritems == 1) + return 0; + + for (i = start_slot; i < end_slot; i++) { + int close = 1; + + if (!parent->map_token) { + map_extent_buffer(parent, + btrfs_node_key_ptr_offset(i), + sizeof(struct btrfs_key_ptr), + &parent->map_token, &parent->kaddr, + &parent->map_start, &parent->map_len, + KM_USER1); + } + btrfs_node_key(parent, &disk_key, i); + if (!progress_passed && comp_keys(&disk_key, progress) < 0) + continue; + + progress_passed = 1; + blocknr = btrfs_node_blockptr(parent, i); + gen = btrfs_node_ptr_generation(parent, i); + if (last_block == 0) + last_block = blocknr; + + if (i > 0) { + other = btrfs_node_blockptr(parent, i - 1); + close = close_blocks(blocknr, other, blocksize); + } + if (!close && i < end_slot - 2) { + other = btrfs_node_blockptr(parent, i + 1); + close = close_blocks(blocknr, other, blocksize); + } + if (close) { + last_block = blocknr; + continue; + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + + cur = btrfs_find_tree_block(root, blocknr, blocksize); + if (cur) + uptodate = btrfs_buffer_uptodate(cur, gen); + else + uptodate = 0; + if (!cur || !uptodate) { + if (cache_only) { + free_extent_buffer(cur); + continue; + } + if (!cur) { + cur = read_tree_block(root, blocknr, + blocksize, gen); + } else if (!uptodate) { + btrfs_read_buffer(cur, gen); + } + } + if (search_start == 0) + search_start = last_block; + + btrfs_tree_lock(cur); + err = __btrfs_cow_block(trans, root, cur, parent, i, + &cur, search_start, + min(16 * blocksize, + (end_slot - i) * blocksize), 0); + if (err) { + btrfs_tree_unlock(cur); + free_extent_buffer(cur); + break; + } + search_start = cur->start; + last_block = cur->start; + *last_ret = search_start; + btrfs_tree_unlock(cur); + 
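+		/* finished relocating this block, drop our reference on it */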
free_extent_buffer(cur); + } + if (parent->map_token) { + unmap_extent_buffer(parent, parent->map_token, + KM_USER1); + parent->map_token = NULL; + } + return err; +} + +/* + * The leaf data grows from end-to-front in the node. + * this returns the address of the start of the last item, + * which is the stop of the leaf data stack + */ +static inline unsigned int leaf_data_end(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + u32 nr = btrfs_header_nritems(leaf); + if (nr == 0) + return BTRFS_LEAF_DATA_SIZE(root); + return btrfs_item_offset_nr(leaf, nr - 1); +} + +/* + * extra debugging checks to make sure all the items in a key are + * well formed and in the proper order + */ +static int check_node(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *parent = NULL; + struct extent_buffer *node = path->nodes[level]; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key node_key; + int parent_slot; + int slot; + struct btrfs_key cpukey; + u32 nritems = btrfs_header_nritems(node); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + slot = path->slots[level]; + BUG_ON(nritems == 0); + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_node_key(node, &node_key, 0); + BUG_ON(memcmp(&parent_key, &node_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(node)); + } + BUG_ON(nritems > BTRFS_NODEPTRS_PER_BLOCK(root)); + if (slot != 0) { + btrfs_node_key_to_cpu(node, &cpukey, slot - 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) <= 0); + } + if (slot < nritems - 1) { + btrfs_node_key_to_cpu(node, &cpukey, slot + 1); + btrfs_node_key(node, &node_key, slot); + BUG_ON(comp_keys(&node_key, &cpukey) >= 0); + } + return 0; +} + +/* + * extra checking to make sure all the items in a leaf are + * well formed and in the proper order + */ +static int check_leaf(struct btrfs_root *root, struct btrfs_path *path, + int level) +{ + struct extent_buffer *leaf = path->nodes[level]; + struct extent_buffer *parent = NULL; + int parent_slot; + struct btrfs_key cpukey; + struct btrfs_disk_key parent_key; + struct btrfs_disk_key leaf_key; + int slot = path->slots[0]; + + u32 nritems = btrfs_header_nritems(leaf); + + if (path->nodes[level + 1]) + parent = path->nodes[level + 1]; + + if (nritems == 0) + return 0; + + if (parent) { + parent_slot = path->slots[level + 1]; + btrfs_node_key(parent, &parent_key, parent_slot); + btrfs_item_key(leaf, &leaf_key, 0); + + BUG_ON(memcmp(&parent_key, &leaf_key, + sizeof(struct btrfs_disk_key))); + BUG_ON(btrfs_node_blockptr(parent, parent_slot) != + btrfs_header_bytenr(leaf)); + } + if (slot != 0 && slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot - 1); + if (comp_keys(&leaf_key, &cpukey) <= 0) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad key\n", slot); + BUG_ON(1); + } + if (btrfs_item_offset_nr(leaf, slot - 1) != + btrfs_item_end_nr(leaf, slot)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", slot); + BUG_ON(1); + } + } + if (slot < nritems - 1) { + btrfs_item_key(leaf, &leaf_key, slot); + btrfs_item_key_to_cpu(leaf, &cpukey, slot + 1); + BUG_ON(comp_keys(&leaf_key, &cpukey) >= 0); + if (btrfs_item_offset_nr(leaf, slot) != + btrfs_item_end_nr(leaf, slot + 1)) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d offset bad\n", 
slot); + BUG_ON(1); + } + } + BUG_ON(btrfs_item_offset_nr(leaf, 0) + + btrfs_item_size_nr(leaf, 0) != BTRFS_LEAF_DATA_SIZE(root)); + return 0; +} + +static noinline int check_block(struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + return 0; + if (level == 0) + return check_leaf(root, path, level); + return check_node(root, path, level); +} + +/* + * search for key in the extent_buffer. The items start at offset p, + * and they are item_size apart. There are 'max' items in p. + * + * the slot in the array is returned via slot, and it points to + * the place where you would insert key if it is not found in + * the array. + * + * slot may point to max if the key is bigger than all of the keys + */ +static noinline int generic_bin_search(struct extent_buffer *eb, + unsigned long p, + int item_size, struct btrfs_key *key, + int max, int *slot) +{ + int low = 0; + int high = max; + int mid; + int ret; + struct btrfs_disk_key *tmp = NULL; + struct btrfs_disk_key unaligned; + unsigned long offset; + char *map_token = NULL; + char *kaddr = NULL; + unsigned long map_start = 0; + unsigned long map_len = 0; + int err; + + while (low < high) { + mid = (low + high) / 2; + offset = p + mid * item_size; + + if (!map_token || offset < map_start || + (offset + sizeof(struct btrfs_disk_key)) > + map_start + map_len) { + if (map_token) { + unmap_extent_buffer(eb, map_token, KM_USER0); + map_token = NULL; + } + + err = map_private_extent_buffer(eb, offset, + sizeof(struct btrfs_disk_key), + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + + if (!err) { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } else { + read_extent_buffer(eb, &unaligned, + offset, sizeof(unaligned)); + tmp = &unaligned; + } + + } else { + tmp = (struct btrfs_disk_key *)(kaddr + offset - + map_start); + } + ret = comp_keys(tmp, key); + + if (ret < 0) + low = mid + 1; + else if (ret > 0) + high = mid; + else { + *slot = mid; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 0; + } + } + *slot = low; + if (map_token) + unmap_extent_buffer(eb, map_token, KM_USER0); + return 1; +} + +/* + * simple bin_search frontend that does the right thing for + * leaves vs nodes + */ +static int bin_search(struct extent_buffer *eb, struct btrfs_key *key, + int level, int *slot) +{ + if (level == 0) { + return generic_bin_search(eb, + offsetof(struct btrfs_leaf, items), + sizeof(struct btrfs_item), + key, btrfs_header_nritems(eb), + slot); + } else { + return generic_bin_search(eb, + offsetof(struct btrfs_node, ptrs), + sizeof(struct btrfs_key_ptr), + key, btrfs_header_nritems(eb), + slot); + } + return -1; +} + +/* given a node and slot number, this reads the blocks it points to. The + * extent buffer is returned with a reference taken (but unlocked). + * NULL is returned on error. + */ +static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, + struct extent_buffer *parent, int slot) +{ + int level = btrfs_header_level(parent); + if (slot < 0) + return NULL; + if (slot >= btrfs_header_nritems(parent)) + return NULL; + + BUG_ON(level == 0); + + return read_tree_block(root, btrfs_node_blockptr(parent, slot), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(parent, slot)); +} + +/* + * node level balancing, used to make sure nodes are in proper order for + * item deletion. We balance from the top down, so we have to make sure + * that a deletion won't leave an node completely empty later on. 
+ */ +static noinline int balance_level(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + int err_on_enospc = 0; + u64 orig_ptr; + + if (level == 0) + return 0; + + mid = path->nodes[level]; + WARN_ON(!path->locks[level]); + WARN_ON(btrfs_header_generation(mid) != trans->transid); + + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + /* + * deal with the case where there is only one pointer in the root + * by promoting the node below to a root + */ + if (!parent) { + struct extent_buffer *child; + + if (btrfs_header_nritems(mid) != 1) + return 0; + + /* promote the child to a root */ + child = read_node_slot(root, mid, 0); + btrfs_tree_lock(child); + BUG_ON(!child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); + BUG_ON(ret); + + spin_lock(&root->node_lock); + root->node = child; + spin_unlock(&root->node_lock); + + ret = btrfs_update_extent_ref(trans, root, child->start, + mid->start, child->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + + add_root_to_dirty_list(root); + btrfs_tree_unlock(child); + path->locks[level] = 0; + path->nodes[level] = NULL; + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + /* once for the path */ + free_extent_buffer(mid); + ret = btrfs_free_extent(trans, root, mid->start, mid->len, + mid->start, root->root_key.objectid, + btrfs_header_generation(mid), + level, 1); + /* once for the root ptr */ + free_extent_buffer(mid); + return ret; + } + if (btrfs_header_nritems(mid) > + BTRFS_NODEPTRS_PER_BLOCK(root) / 4) + return 0; + + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + + left = read_node_slot(root, parent, pslot - 1); + if (left) { + btrfs_tree_lock(left); + wret = btrfs_cow_block(trans, root, left, + parent, pslot - 1, &left, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + right = read_node_slot(root, parent, pslot + 1); + if (right) { + btrfs_tree_lock(right); + wret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, &right, 0); + if (wret) { + ret = wret; + goto enospc; + } + } + + /* first, try to make some room in the middle buffer */ + if (left) { + orig_slot += btrfs_header_nritems(left); + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + if (btrfs_header_nritems(mid) < 2) + err_on_enospc = 1; + } + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + wret = push_node_left(trans, root, mid, right, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + if (btrfs_header_nritems(right) == 0) { + u64 bytenr = right->start; + u64 generation = btrfs_header_generation(parent); + u32 blocksize = right->len; + + clean_tree_block(trans, root, right); + btrfs_tree_unlock(right); + free_extent_buffer(right); + right = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot + + 1); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + generation, level, 1); + if (wret) + ret = wret; + } else { + struct btrfs_disk_key right_key; + btrfs_node_key(right, &right_key, 0); + btrfs_set_node_key(parent, &right_key, pslot + 1); + 
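+			/* right still has items, record its new first key in the parent */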
btrfs_mark_buffer_dirty(parent); + } + } + if (btrfs_header_nritems(mid) == 1) { + /* + * we're not allowed to leave a node with one item in the + * tree during a delete. A deletion from lower in the tree + * could try to delete the only pointer in this node. + * So, pull some keys from the left. + * There has to be a left pointer at this point because + * otherwise we would have pulled some pointers from the + * right + */ + BUG_ON(!left); + wret = balance_node_right(trans, root, mid, left); + if (wret < 0) { + ret = wret; + goto enospc; + } + if (wret == 1) { + wret = push_node_left(trans, root, left, mid, 1); + if (wret < 0) + ret = wret; + } + BUG_ON(wret == 1); + } + if (btrfs_header_nritems(mid) == 0) { + /* we've managed to empty the middle node, drop it */ + u64 root_gen = btrfs_header_generation(parent); + u64 bytenr = mid->start; + u32 blocksize = mid->len; + + clean_tree_block(trans, root, mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + mid = NULL; + wret = del_ptr(trans, root, path, level + 1, pslot); + if (wret) + ret = wret; + wret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, + btrfs_header_owner(parent), + root_gen, level, 1); + if (wret) + ret = wret; + } else { + /* update the parent key to reflect our changes */ + struct btrfs_disk_key mid_key; + btrfs_node_key(mid, &mid_key, 0); + btrfs_set_node_key(parent, &mid_key, pslot); + btrfs_mark_buffer_dirty(parent); + } + + /* update the path */ + if (left) { + if (btrfs_header_nritems(left) > orig_slot) { + extent_buffer_get(left); + /* left was locked after cow */ + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + if (mid) { + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } + } else { + orig_slot -= btrfs_header_nritems(left); + path->slots[level] = orig_slot; + } + } + /* double check we haven't messed things up */ + check_block(root, path, level); + if (orig_ptr != + btrfs_node_blockptr(path->nodes[level], path->slots[level])) + BUG(); +enospc: + if (right) { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + if (left) { + if (path->nodes[level] != left) + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return ret; +} + +/* Node balancing for insertion. Here we only split or push nodes around + * when they are completely full. This is also done top down, so we + * have to be pessimistic. 
+ */ +static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *right = NULL; + struct extent_buffer *mid; + struct extent_buffer *left = NULL; + struct extent_buffer *parent = NULL; + int ret = 0; + int wret; + int pslot; + int orig_slot = path->slots[level]; + u64 orig_ptr; + + if (level == 0) + return 1; + + mid = path->nodes[level]; + WARN_ON(btrfs_header_generation(mid) != trans->transid); + orig_ptr = btrfs_node_blockptr(mid, orig_slot); + + if (level < BTRFS_MAX_LEVEL - 1) + parent = path->nodes[level + 1]; + pslot = path->slots[level + 1]; + + if (!parent) + return 1; + + left = read_node_slot(root, parent, pslot - 1); + + /* first, try to make some room in the middle buffer */ + if (left) { + u32 left_nr; + + btrfs_tree_lock(left); + left_nr = btrfs_header_nritems(left); + if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, left, parent, + pslot - 1, &left, 0); + if (ret) + wret = 1; + else { + wret = push_node_left(trans, root, + left, mid, 0); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + orig_slot += left_nr; + btrfs_node_key(mid, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot); + btrfs_mark_buffer_dirty(parent); + if (btrfs_header_nritems(left) > orig_slot) { + path->nodes[level] = left; + path->slots[level + 1] -= 1; + path->slots[level] = orig_slot; + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + orig_slot -= + btrfs_header_nritems(left); + path->slots[level] = orig_slot; + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + return 0; + } + btrfs_tree_unlock(left); + free_extent_buffer(left); + } + right = read_node_slot(root, parent, pslot + 1); + + /* + * then try to empty the right most buffer into the middle + */ + if (right) { + u32 right_nr; + btrfs_tree_lock(right); + right_nr = btrfs_header_nritems(right); + if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(root) - 1) { + wret = 1; + } else { + ret = btrfs_cow_block(trans, root, right, + parent, pslot + 1, + &right, 0); + if (ret) + wret = 1; + else { + wret = balance_node_right(trans, root, + right, mid); + } + } + if (wret < 0) + ret = wret; + if (wret == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(right, &disk_key, 0); + btrfs_set_node_key(parent, &disk_key, pslot + 1); + btrfs_mark_buffer_dirty(parent); + + if (btrfs_header_nritems(mid) <= orig_slot) { + path->nodes[level] = right; + path->slots[level + 1] += 1; + path->slots[level] = orig_slot - + btrfs_header_nritems(mid); + btrfs_tree_unlock(mid); + free_extent_buffer(mid); + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + } + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 1; +} + +/* + * readahead one full node of leaves, finding things that are close + * to the block in 'slot', and triggering ra on them. 
+ */ +static noinline void reada_for_search(struct btrfs_root *root, + struct btrfs_path *path, + int level, int slot, u64 objectid) +{ + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + u32 nritems; + u64 search; + u64 lowest_read; + u64 highest_read; + u64 nread = 0; + int direction = path->reada; + struct extent_buffer *eb; + u32 nr; + u32 blocksize; + u32 nscan = 0; + + if (level != 1) + return; + + if (!path->nodes[level]) + return; + + node = path->nodes[level]; + + search = btrfs_node_blockptr(node, slot); + blocksize = btrfs_level_size(root, level - 1); + eb = btrfs_find_tree_block(root, search, blocksize); + if (eb) { + free_extent_buffer(eb); + return; + } + + highest_read = search; + lowest_read = search; + + nritems = btrfs_header_nritems(node); + nr = slot; + while (1) { + if (direction < 0) { + if (nr == 0) + break; + nr--; + } else if (direction > 0) { + nr++; + if (nr >= nritems) + break; + } + if (path->reada < 0 && objectid) { + btrfs_node_key(node, &disk_key, nr); + if (btrfs_disk_key_objectid(&disk_key) != objectid) + break; + } + search = btrfs_node_blockptr(node, nr); + if ((search >= lowest_read && search <= highest_read) || + (search < lowest_read && lowest_read - search <= 16384) || + (search > highest_read && search - highest_read <= 16384)) { + readahead_tree_block(root, search, blocksize, + btrfs_node_ptr_generation(node, nr)); + nread += blocksize; + } + nscan++; + if (path->reada < 2 && (nread > (64 * 1024) || nscan > 32)) + break; + + if (nread > (256 * 1024) || nscan > 128) + break; + + if (search < lowest_read) + lowest_read = search; + if (search > highest_read) + highest_read = search; + } +} + +/* + * when we walk down the tree, it is usually safe to unlock the higher layers + * in the tree. The exceptions are when our path goes through slot 0, because + * operations on the tree might require changing key pointers higher up in the + * tree. + * + * callers might also have set path->keep_locks, which tells this code to keep + * the lock if the path points to the last slot in the block. This is part of + * walking through the tree, and selecting the next slot in the higher block. + * + * lowest_unlock sets the lowest level in the tree we're allowed to unlock. so + * if lowest_unlock is 1, level 0 won't be unlocked + */ +static noinline void unlock_up(struct btrfs_path *path, int level, + int lowest_unlock) +{ + int i; + int skip_level = level; + int no_skips = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + if (!path->nodes[i]) + break; + if (!path->locks[i]) + break; + if (!no_skips && path->slots[i] == 0) { + skip_level = i + 1; + continue; + } + if (!no_skips && path->keep_locks) { + u32 nritems; + t = path->nodes[i]; + nritems = btrfs_header_nritems(t); + if (nritems < 1 || path->slots[i] >= nritems - 1) { + skip_level = i + 1; + continue; + } + } + if (skip_level < i && i >= lowest_unlock) + no_skips = 1; + + t = path->nodes[i]; + if (i >= lowest_unlock && i > skip_level && path->locks[i]) { + btrfs_tree_unlock(t); + path->locks[i] = 0; + } + } +} + +/* + * look for key in the tree. path is filled in with nodes along the way + * if key is found, we return zero and you can find the item in the leaf + * level of the path (level 0) + * + * If the key isn't found, the path points to the slot where it should + * be inserted, and 1 is returned. If there are other errors during the + * search a negative error number is returned. + * + * if ins_len > 0, nodes and leaves will be split as we walk down the + * tree. 
if ins_len < 0, nodes will be merged as we walk down the tree (if + * possible) + */ +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow) +{ + struct extent_buffer *b; + struct extent_buffer *tmp; + int slot; + int ret; + int level; + int should_reada = p->reada; + int lowest_unlock = 1; + int blocksize; + u8 lowest_level = 0; + u64 blocknr; + u64 gen; + struct btrfs_key prealloc_block; + + lowest_level = p->lowest_level; + WARN_ON(lowest_level && ins_len > 0); + WARN_ON(p->nodes[0] != NULL); + + if (ins_len < 0) + lowest_unlock = 2; + + prealloc_block.objectid = 0; + +again: + if (p->skip_locking) + b = btrfs_root_node(root); + else + b = btrfs_lock_root_node(root); + + while (b) { + level = btrfs_header_level(b); + + /* + * setup the path here so we can release it under lock + * contention with the cow code + */ + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + if (cow) { + int wret; + + /* is a cow on this block not required */ + spin_lock(&root->fs_info->hash_lock); + if (btrfs_header_generation(b) == trans->transid && + btrfs_header_owner(b) == root->root_key.objectid && + !btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) { + spin_unlock(&root->fs_info->hash_lock); + goto cow_done; + } + spin_unlock(&root->fs_info->hash_lock); + + /* ok, we have to cow, is our old prealloc the right + * size? + */ + if (prealloc_block.objectid && + prealloc_block.offset != b->len) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + prealloc_block.objectid = 0; + } + + /* + * for higher level blocks, try not to allocate blocks + * with the block and the parent locks held. + */ + if (level > 1 && !prealloc_block.objectid && + btrfs_path_lock_waiting(p, level)) { + u32 size = b->len; + u64 hint = b->start; + + btrfs_release_path(root, p); + ret = btrfs_reserve_extent(trans, root, + size, size, 0, + hint, (u64)-1, + &prealloc_block, 0); + BUG_ON(ret); + goto again; + } + + wret = btrfs_cow_block(trans, root, b, + p->nodes[level + 1], + p->slots[level + 1], + &b, prealloc_block.objectid); + prealloc_block.objectid = 0; + if (wret) { + free_extent_buffer(b); + ret = wret; + goto done; + } + } +cow_done: + BUG_ON(!cow && ins_len); + if (level != btrfs_header_level(b)) + WARN_ON(1); + level = btrfs_header_level(b); + + p->nodes[level] = b; + if (!p->skip_locking) + p->locks[level] = 1; + + ret = check_block(root, p, level); + if (ret) { + ret = -1; + goto done; + } + + ret = bin_search(b, key, level, &slot); + if (level != 0) { + if (ret && slot > 0) + slot -= 1; + p->slots[level] = slot; + if ((p->search_for_split || ins_len > 0) && + btrfs_header_nritems(b) >= + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) { + int sret = split_node(trans, root, p, level); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + slot = p->slots[level]; + } else if (ins_len < 0) { + int sret = balance_level(trans, root, p, + level); + if (sret) { + ret = sret; + goto done; + } + b = p->nodes[level]; + if (!b) { + btrfs_release_path(NULL, p); + goto again; + } + slot = p->slots[level]; + BUG_ON(btrfs_header_nritems(b) == 1); + } + unlock_up(p, level, lowest_unlock); + + /* this is only true while dropping a snapshot */ + if (level == lowest_level) { + ret = 0; + goto done; + } + + blocknr = btrfs_node_blockptr(b, slot); + gen = btrfs_node_ptr_generation(b, slot); + blocksize = btrfs_level_size(root, level - 1); + + tmp = 
btrfs_find_tree_block(root, blocknr, blocksize); + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + b = tmp; + } else { + /* + * reduce lock contention at high levels + * of the btree by dropping locks before + * we read. + */ + if (level > 1) { + btrfs_release_path(NULL, p); + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + + tmp = read_tree_block(root, blocknr, + blocksize, gen); + if (tmp) + free_extent_buffer(tmp); + goto again; + } else { + if (tmp) + free_extent_buffer(tmp); + if (should_reada) + reada_for_search(root, p, + level, slot, + key->objectid); + b = read_node_slot(root, b, slot); + } + } + if (!p->skip_locking) + btrfs_tree_lock(b); + } else { + p->slots[level] = slot; + if (ins_len > 0 && + btrfs_leaf_free_space(root, b) < ins_len) { + int sret = split_leaf(trans, root, key, + p, ins_len, ret == 0); + BUG_ON(sret > 0); + if (sret) { + ret = sret; + goto done; + } + } + if (!p->search_for_split) + unlock_up(p, level, lowest_unlock); + goto done; + } + } + ret = 1; +done: + if (prealloc_block.objectid) { + btrfs_free_reserved_extent(root, + prealloc_block.objectid, + prealloc_block.offset); + } + + return ret; +} + +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level) +{ + struct extent_buffer *eb; + struct extent_buffer *parent; + struct btrfs_key key; + u64 bytenr; + u64 generation; + u32 blocksize; + int level; + int slot; + int key_match; + int ret; + + eb = btrfs_lock_root_node(root); + ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0); + BUG_ON(ret); + + parent = eb; + while (1) { + level = btrfs_header_level(parent); + if (level == 0 || level <= lowest_level) + break; + + ret = bin_search(parent, &node_keys[lowest_level], level, + &slot); + if (ret && slot > 0) + slot--; + + bytenr = btrfs_node_blockptr(parent, slot); + if (nodes[level - 1] == bytenr) + break; + + blocksize = btrfs_level_size(root, level - 1); + generation = btrfs_node_ptr_generation(parent, slot); + btrfs_node_key_to_cpu(eb, &key, slot); + key_match = !memcmp(&key, &node_keys[level - 1], sizeof(key)); + + if (generation == trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + /* + * if node keys match and node pointer hasn't been modified + * in the running transaction, we can merge the path. for + * blocks owened by reloc trees, the node pointer check is + * skipped, this is because these blocks are fully controlled + * by the space balance code, no one else can modify them. 
+ */ + if (!nodes[level - 1] || !key_match || + (generation == trans->transid && + btrfs_header_owner(eb) != BTRFS_TREE_RELOC_OBJECTID)) { + if (level == 1 || level == lowest_level + 1) { + if (generation == trans->transid) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + break; + } + + if (generation != trans->transid) { + eb = read_tree_block(root, bytenr, blocksize, + generation); + btrfs_tree_lock(eb); + } + + ret = btrfs_cow_block(trans, root, eb, parent, slot, + &eb, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_TREE_RELOC_OBJECTID) { + if (!nodes[level - 1]) { + nodes[level - 1] = eb->start; + memcpy(&node_keys[level - 1], &key, + sizeof(node_keys[0])); + } else { + WARN_ON(1); + } + } + + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + parent = eb; + continue; + } + + btrfs_set_node_blockptr(parent, slot, nodes[level - 1]); + btrfs_set_node_ptr_generation(parent, slot, trans->transid); + btrfs_mark_buffer_dirty(parent); + + ret = btrfs_inc_extent_ref(trans, root, + nodes[level - 1], + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1); + BUG_ON(ret); + + /* + * If the block was created in the running transaction, + * it's possible this is the last reference to it, so we + * should drop the subtree. + */ + if (generation == trans->transid) { + ret = btrfs_drop_subtree(trans, root, eb, parent); + BUG_ON(ret); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } else { + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + level - 1, 1); + BUG_ON(ret); + } + break; + } + btrfs_tree_unlock(parent); + free_extent_buffer(parent); + return 0; +} + +/* + * adjust the pointers going up the tree, starting at level + * making sure the right key of each node is points to 'key'. + * This is used after shifting pointers to the left, so it stops + * fixing up pointers when a given leaf/node is not in slot 0 of the + * higher levels + * + * If this fails to write a tree block, it returns -1, but continues + * fixing up the blocks in ram so the tree is consistent. + */ +static int fixup_low_keys(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_disk_key *key, int level) +{ + int i; + int ret = 0; + struct extent_buffer *t; + + for (i = level; i < BTRFS_MAX_LEVEL; i++) { + int tslot = path->slots[i]; + if (!path->nodes[i]) + break; + t = path->nodes[i]; + btrfs_set_node_key(t, key, tslot); + btrfs_mark_buffer_dirty(path->nodes[i]); + if (tslot != 0) + break; + } + return ret; +} + +/* + * update item key. + * + * This function isn't completely safe. 
It's the caller's responsibility + * that the new key won't break the order + */ +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *eb; + int slot; + + eb = path->nodes[0]; + slot = path->slots[0]; + if (slot > 0) { + btrfs_item_key(eb, &disk_key, slot - 1); + if (comp_keys(&disk_key, new_key) >= 0) + return -1; + } + if (slot < btrfs_header_nritems(eb) - 1) { + btrfs_item_key(eb, &disk_key, slot + 1); + if (comp_keys(&disk_key, new_key) <= 0) + return -1; + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(eb, &disk_key, slot); + btrfs_mark_buffer_dirty(eb); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + return 0; +} + +/* + * try to push data from one node into the next node left in the + * tree. + * + * returns 0 if some ptrs were pushed left, < 0 if there was some horrible + * error, and > 0 if there was no room in the left hand block. + */ +static int push_node_left(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *dst, + struct extent_buffer *src, int empty) +{ + int push_items = 0; + int src_nritems; + int dst_nritems; + int ret = 0; + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + if (!empty && src_nritems <= 8) + return 1; + + if (push_items <= 0) + return 1; + + if (empty) { + push_items = min(src_nritems, push_items); + if (push_items < src_nritems) { + /* leave at least 8 pointers in the node if + * we aren't going to empty it + */ + if (src_nritems - push_items < 8) { + if (push_items <= 8) + return 1; + push_items -= 8; + } + } + } else + push_items = min(src_nritems - 8, push_items); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(dst_nritems), + btrfs_node_key_ptr_offset(0), + push_items * sizeof(struct btrfs_key_ptr)); + + if (push_items < src_nritems) { + memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(push_items), + (src_nritems - push_items) * + sizeof(struct btrfs_key_ptr)); + } + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + ret = btrfs_update_ref(trans, root, src, dst, dst_nritems, push_items); + BUG_ON(ret); + + return ret; +} + +/* + * try to push data from one node into the next node right in the + * tree. + * + * returns 0 if some ptrs were pushed, < 0 if there was some horrible + * error, and > 0 if there was no room in the right hand block. 
+ * + * this will only push up to 1/2 the contents of the left node over + */ +static int balance_node_right(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *dst, + struct extent_buffer *src) +{ + int push_items = 0; + int max_push; + int src_nritems; + int dst_nritems; + int ret = 0; + + WARN_ON(btrfs_header_generation(src) != trans->transid); + WARN_ON(btrfs_header_generation(dst) != trans->transid); + + src_nritems = btrfs_header_nritems(src); + dst_nritems = btrfs_header_nritems(dst); + push_items = BTRFS_NODEPTRS_PER_BLOCK(root) - dst_nritems; + if (push_items <= 0) + return 1; + + if (src_nritems < 4) + return 1; + + max_push = src_nritems / 2 + 1; + /* don't try to empty the node */ + if (max_push >= src_nritems) + return 1; + + if (max_push < push_items) + push_items = max_push; + + memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), + btrfs_node_key_ptr_offset(0), + (dst_nritems) * + sizeof(struct btrfs_key_ptr)); + + copy_extent_buffer(dst, src, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(src_nritems - push_items), + push_items * sizeof(struct btrfs_key_ptr)); + + btrfs_set_header_nritems(src, src_nritems - push_items); + btrfs_set_header_nritems(dst, dst_nritems + push_items); + + btrfs_mark_buffer_dirty(src); + btrfs_mark_buffer_dirty(dst); + + ret = btrfs_update_ref(trans, root, src, dst, 0, push_items); + BUG_ON(ret); + + return ret; +} + +/* + * helper function to insert a new root level in the tree. + * A new node is allocated, and a single item is inserted to + * point to the existing root + * + * returns zero on success or < 0 on failure. + */ +static noinline int insert_new_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + u64 lower_gen; + struct extent_buffer *lower; + struct extent_buffer *c; + struct extent_buffer *old; + struct btrfs_disk_key lower_key; + int ret; + + BUG_ON(path->nodes[level]); + BUG_ON(path->nodes[level-1] != root->node); + + lower = path->nodes[level-1]; + if (level == 1) + btrfs_item_key(lower, &lower_key, 0); + else + btrfs_node_key(lower, &lower_key, 0); + + c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, + root->root_key.objectid, trans->transid, + level, root->node->start, 0); + if (IS_ERR(c)) + return PTR_ERR(c); + + memset_extent_buffer(c, 0, 0, root->nodesize); + btrfs_set_header_nritems(c, 1); + btrfs_set_header_level(c, level); + btrfs_set_header_bytenr(c, c->start); + btrfs_set_header_generation(c, trans->transid); + btrfs_set_header_owner(c, root->root_key.objectid); + + write_extent_buffer(c, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(c), + BTRFS_FSID_SIZE); + + write_extent_buffer(c, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(c), + BTRFS_UUID_SIZE); + + btrfs_set_node_key(c, &lower_key, 0); + btrfs_set_node_blockptr(c, 0, lower->start); + lower_gen = btrfs_header_generation(lower); + WARN_ON(lower_gen != trans->transid); + + btrfs_set_node_ptr_generation(c, 0, lower_gen); + + btrfs_mark_buffer_dirty(c); + + spin_lock(&root->node_lock); + old = root->node; + root->node = c; + spin_unlock(&root->node_lock); + + ret = btrfs_update_extent_ref(trans, root, lower->start, + lower->start, c->start, + root->root_key.objectid, + trans->transid, level - 1); + BUG_ON(ret); + + /* the super has an extra ref to root->node */ + free_extent_buffer(old); + + add_root_to_dirty_list(root); + extent_buffer_get(c); + path->nodes[level] = c; + path->locks[level] = 1; + 
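+	/* the new root has a single pointer to the old root, so the path uses slot 0 */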
path->slots[level] = 0; + return 0; +} + +/* + * worker function to insert a single pointer in a node. + * the node should have enough room for the pointer already + * + * slot and level indicate where you want the key to go, and + * blocknr is the block the key points to. + * + * returns zero on success and < 0 on any error + */ +static int insert_ptr(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, struct btrfs_disk_key + *key, u64 bytenr, int slot, int level) +{ + struct extent_buffer *lower; + int nritems; + + BUG_ON(!path->nodes[level]); + lower = path->nodes[level]; + nritems = btrfs_header_nritems(lower); + if (slot > nritems) + BUG(); + if (nritems == BTRFS_NODEPTRS_PER_BLOCK(root)) + BUG(); + if (slot != nritems) { + memmove_extent_buffer(lower, + btrfs_node_key_ptr_offset(slot + 1), + btrfs_node_key_ptr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_key_ptr)); + } + btrfs_set_node_key(lower, key, slot); + btrfs_set_node_blockptr(lower, slot, bytenr); + WARN_ON(trans->transid == 0); + btrfs_set_node_ptr_generation(lower, slot, trans->transid); + btrfs_set_header_nritems(lower, nritems + 1); + btrfs_mark_buffer_dirty(lower); + return 0; +} + +/* + * split the node at the specified level in path in two. + * The path is corrected to point to the appropriate node after the split + * + * Before splitting this tries to make some room in the node by pushing + * left and right, if either one works, it returns right away. + * + * returns 0 on success and < 0 on failure + */ +static noinline int split_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int level) +{ + struct extent_buffer *c; + struct extent_buffer *split; + struct btrfs_disk_key disk_key; + int mid; + int ret; + int wret; + u32 c_nritems; + + c = path->nodes[level]; + WARN_ON(btrfs_header_generation(c) != trans->transid); + if (c == root->node) { + /* trying to split the root, lets make a new one */ + ret = insert_new_root(trans, root, path, level + 1); + if (ret) + return ret; + } else { + ret = push_nodes_for_insert(trans, root, path, level); + c = path->nodes[level]; + if (!ret && btrfs_header_nritems(c) < + BTRFS_NODEPTRS_PER_BLOCK(root) - 3) + return 0; + if (ret < 0) + return ret; + } + + c_nritems = btrfs_header_nritems(c); + + split = btrfs_alloc_free_block(trans, root, root->nodesize, + path->nodes[level + 1]->start, + root->root_key.objectid, + trans->transid, level, c->start, 0); + if (IS_ERR(split)) + return PTR_ERR(split); + + btrfs_set_header_flags(split, btrfs_header_flags(c)); + btrfs_set_header_level(split, btrfs_header_level(c)); + btrfs_set_header_bytenr(split, split->start); + btrfs_set_header_generation(split, trans->transid); + btrfs_set_header_owner(split, root->root_key.objectid); + btrfs_set_header_flags(split, 0); + write_extent_buffer(split, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(split), + BTRFS_FSID_SIZE); + write_extent_buffer(split, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(split), + BTRFS_UUID_SIZE); + + mid = (c_nritems + 1) / 2; + + copy_extent_buffer(split, c, + btrfs_node_key_ptr_offset(0), + btrfs_node_key_ptr_offset(mid), + (c_nritems - mid) * sizeof(struct btrfs_key_ptr)); + btrfs_set_header_nritems(split, c_nritems - mid); + btrfs_set_header_nritems(c, mid); + ret = 0; + + btrfs_mark_buffer_dirty(c); + btrfs_mark_buffer_dirty(split); + + btrfs_node_key(split, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, split->start, + 
path->slots[level + 1] + 1, + level + 1); + if (wret) + ret = wret; + + ret = btrfs_update_ref(trans, root, c, split, 0, c_nritems - mid); + BUG_ON(ret); + + if (path->slots[level] >= mid) { + path->slots[level] -= mid; + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = split; + path->slots[level + 1] += 1; + } else { + btrfs_tree_unlock(split); + free_extent_buffer(split); + } + return ret; +} + +/* + * how many bytes are required to store the items in a leaf. start + * and nr indicate which items in the leaf to check. This totals up the + * space used both by the item structs and the item data + */ +static int leaf_space_used(struct extent_buffer *l, int start, int nr) +{ + int data_len; + int nritems = btrfs_header_nritems(l); + int end = min(nritems, start + nr) - 1; + + if (!nr) + return 0; + data_len = btrfs_item_end_nr(l, start); + data_len = data_len - btrfs_item_offset_nr(l, end); + data_len += sizeof(struct btrfs_item) * nr; + WARN_ON(data_len < 0); + return data_len; +} + +/* + * The space between the end of the leaf items and + * the start of the leaf data. IOW, how much room + * the leaf has left for both items and data + */ +noinline int btrfs_leaf_free_space(struct btrfs_root *root, + struct extent_buffer *leaf) +{ + int nritems = btrfs_header_nritems(leaf); + int ret; + ret = BTRFS_LEAF_DATA_SIZE(root) - leaf_space_used(leaf, 0, nritems); + if (ret < 0) { + printk(KERN_CRIT "leaf free space ret %d, leaf data size %lu, " + "used %d nritems %d\n", + ret, (unsigned long) BTRFS_LEAF_DATA_SIZE(root), + leaf_space_used(leaf, 0, nritems), nritems); + } + return ret; +} + +/* + * push some data in the path leaf to the right, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + * + * returns 1 if the push failed because the other node didn't have enough + * room, 0 if everything worked out and < 0 if there were major errors. 
+ */ +static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct extent_buffer *left = path->nodes[0]; + struct extent_buffer *right; + struct extent_buffer *upper; + struct btrfs_disk_key disk_key; + int slot; + u32 i; + int free_space; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 left_nritems; + u32 nr; + u32 right_nritems; + u32 data_end; + u32 this_item_size; + int ret; + + slot = path->slots[1]; + if (!path->nodes[1]) + return 1; + + upper = path->nodes[1]; + if (slot >= btrfs_header_nritems(upper) - 1) + return 1; + + WARN_ON(!btrfs_tree_locked(path->nodes[1])); + + right = read_node_slot(root, upper, slot + 1); + btrfs_tree_lock(right); + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, right, upper, + slot + 1, &right, 0); + if (ret) + goto out_unlock; + + free_space = btrfs_leaf_free_space(root, right); + if (free_space < data_size) + goto out_unlock; + + left_nritems = btrfs_header_nritems(left); + if (left_nritems == 0) + goto out_unlock; + + if (empty) + nr = 0; + else + nr = 1; + + if (path->slots[0] >= left_nritems) + push_space += data_size; + + i = left_nritems - 1; + while (i >= nr) { + item = btrfs_item_nr(left, i); + + if (!empty && push_items > 0) { + if (path->slots[0] > i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, left); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + this_item_size = btrfs_item_size(left, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + if (i == 0) + break; + i--; + } + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + if (push_items == 0) + goto out_unlock; + + if (!empty && push_items == left_nritems) + WARN_ON(1); + + /* push left to right */ + right_nritems = btrfs_header_nritems(right); + + push_space = btrfs_item_end_nr(left, left_nritems - push_items); + push_space -= leaf_data_end(root, left); + + /* make room in the right data area */ + data_end = leaf_data_end(root, right); + memmove_extent_buffer(right, + btrfs_leaf_data(right) + data_end - push_space, + btrfs_leaf_data(right) + data_end, + BTRFS_LEAF_DATA_SIZE(root) - data_end); + + /* copy from the left data area */ + copy_extent_buffer(right, left, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(left) + leaf_data_end(root, left), + push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(push_items), + btrfs_item_nr_offset(0), + right_nritems * sizeof(struct btrfs_item)); + + /* copy the items from left to right */ + copy_extent_buffer(right, left, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(left_nritems - push_items), + push_items * sizeof(struct btrfs_item)); + + /* update the item pointers */ + right_nritems += push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, 
(unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + push_space -= btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + left_nritems -= push_items; + btrfs_set_header_nritems(left, left_nritems); + + if (left_nritems) + btrfs_mark_buffer_dirty(left); + btrfs_mark_buffer_dirty(right); + + ret = btrfs_update_ref(trans, root, left, right, 0, push_items); + BUG_ON(ret); + + btrfs_item_key(right, &disk_key, 0); + btrfs_set_node_key(upper, &disk_key, slot + 1); + btrfs_mark_buffer_dirty(upper); + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] >= left_nritems) { + path->slots[0] -= left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + return 0; + +out_unlock: + btrfs_tree_unlock(right); + free_extent_buffer(right); + return 1; +} + +/* + * push some data in the path leaf to the left, trying to free up at + * least data_size bytes. returns zero if the push worked, nonzero otherwise + */ +static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, int data_size, + int empty) +{ + struct btrfs_disk_key disk_key; + struct extent_buffer *right = path->nodes[0]; + struct extent_buffer *left; + int slot; + int i; + int free_space; + int push_space = 0; + int push_items = 0; + struct btrfs_item *item; + u32 old_left_nritems; + u32 right_nritems; + u32 nr; + int ret = 0; + int wret; + u32 this_item_size; + u32 old_left_item_size; + + slot = path->slots[1]; + if (slot == 0) + return 1; + if (!path->nodes[1]) + return 1; + + right_nritems = btrfs_header_nritems(right); + if (right_nritems == 0) + return 1; + + WARN_ON(!btrfs_tree_locked(path->nodes[1])); + + left = read_node_slot(root, path->nodes[1], slot - 1); + btrfs_tree_lock(left); + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + /* cow and double check */ + ret = btrfs_cow_block(trans, root, left, + path->nodes[1], slot - 1, &left, 0); + if (ret) { + /* we hit -ENOSPC, but it isn't fatal here */ + ret = 1; + goto out; + } + + free_space = btrfs_leaf_free_space(root, left); + if (free_space < data_size) { + ret = 1; + goto out; + } + + if (empty) + nr = right_nritems; + else + nr = right_nritems - 1; + + for (i = 0; i < nr; i++) { + item = btrfs_item_nr(right, i); + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + if (!empty && push_items > 0) { + if (path->slots[0] < i) + break; + if (path->slots[0] == i) { + int space = btrfs_leaf_free_space(root, right); + if (space + push_space * 2 > free_space) + break; + } + } + + if (path->slots[0] == i) + push_space += data_size; + + this_item_size = btrfs_item_size(right, item); + if (this_item_size + sizeof(*item) + push_space > free_space) + break; + + push_items++; + push_space += this_item_size + sizeof(*item); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + if 
(push_items == 0) { + ret = 1; + goto out; + } + if (!empty && push_items == btrfs_header_nritems(right)) + WARN_ON(1); + + /* push data from right to left */ + copy_extent_buffer(left, right, + btrfs_item_nr_offset(btrfs_header_nritems(left)), + btrfs_item_nr_offset(0), + push_items * sizeof(struct btrfs_item)); + + push_space = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_offset_nr(right, push_items - 1); + + copy_extent_buffer(left, right, btrfs_leaf_data(left) + + leaf_data_end(root, left) - push_space, + btrfs_leaf_data(right) + + btrfs_item_offset_nr(right, push_items - 1), + push_space); + old_left_nritems = btrfs_header_nritems(left); + BUG_ON(old_left_nritems <= 0); + + old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1); + for (i = old_left_nritems; i < old_left_nritems + push_items; i++) { + u32 ioff; + + item = btrfs_item_nr(left, i); + if (!left->map_token) { + map_extent_buffer(left, (unsigned long)item, + sizeof(struct btrfs_item), + &left->map_token, &left->kaddr, + &left->map_start, &left->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(left, item); + btrfs_set_item_offset(left, item, + ioff - (BTRFS_LEAF_DATA_SIZE(root) - old_left_item_size)); + } + btrfs_set_header_nritems(left, old_left_nritems + push_items); + if (left->map_token) { + unmap_extent_buffer(left, left->map_token, KM_USER1); + left->map_token = NULL; + } + + /* fixup right node */ + if (push_items > right_nritems) { + printk(KERN_CRIT "push items %d nr %u\n", push_items, + right_nritems); + WARN_ON(1); + } + + if (push_items < right_nritems) { + push_space = btrfs_item_offset_nr(right, push_items - 1) - + leaf_data_end(root, right); + memmove_extent_buffer(right, btrfs_leaf_data(right) + + BTRFS_LEAF_DATA_SIZE(root) - push_space, + btrfs_leaf_data(right) + + leaf_data_end(root, right), push_space); + + memmove_extent_buffer(right, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(push_items), + (btrfs_header_nritems(right) - push_items) * + sizeof(struct btrfs_item)); + } + right_nritems -= push_items; + btrfs_set_header_nritems(right, right_nritems); + push_space = BTRFS_LEAF_DATA_SIZE(root); + for (i = 0; i < right_nritems; i++) { + item = btrfs_item_nr(right, i); + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + push_space = push_space - btrfs_item_size(right, item); + btrfs_set_item_offset(right, item, push_space); + } + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_mark_buffer_dirty(left); + if (right_nritems) + btrfs_mark_buffer_dirty(right); + + ret = btrfs_update_ref(trans, root, right, left, + old_left_nritems, push_items); + BUG_ON(ret); + + btrfs_item_key(right, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, 1); + if (wret) + ret = wret; + + /* then fixup the leaf pointer in the path */ + if (path->slots[0] < push_items) { + path->slots[0] += old_left_nritems; + if (btrfs_header_nritems(path->nodes[0]) == 0) + clean_tree_block(trans, root, path->nodes[0]); + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = left; + path->slots[1] -= 1; + } else { + btrfs_tree_unlock(left); + free_extent_buffer(left); + path->slots[0] -= push_items; + } + BUG_ON(path->slots[0] < 0); + return ret; +out: + btrfs_tree_unlock(left); + free_extent_buffer(left); + return ret; +} + +/* + * split the path's leaf in 
two, making sure there is at least data_size + * available for the resulting leaf level of the path. + * + * returns 0 if all went well and < 0 on failure. + */ +static noinline int split_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *ins_key, + struct btrfs_path *path, int data_size, + int extend) +{ + struct extent_buffer *l; + u32 nritems; + int mid; + int slot; + struct extent_buffer *right; + int data_copy_size; + int rt_data_off; + int i; + int ret = 0; + int wret; + int double_split; + int num_doubles = 0; + struct btrfs_disk_key disk_key; + + /* first try to make some room by pushing left and right */ + if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) { + wret = push_leaf_right(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + if (wret) { + wret = push_leaf_left(trans, root, path, data_size, 0); + if (wret < 0) + return wret; + } + l = path->nodes[0]; + + /* did the pushes work? */ + if (btrfs_leaf_free_space(root, l) >= data_size) + return 0; + } + + if (!path->nodes[1]) { + ret = insert_new_root(trans, root, path, 1); + if (ret) + return ret; + } +again: + double_split = 0; + l = path->nodes[0]; + slot = path->slots[0]; + nritems = btrfs_header_nritems(l); + mid = (nritems + 1) / 2; + + right = btrfs_alloc_free_block(trans, root, root->leafsize, + path->nodes[1]->start, + root->root_key.objectid, + trans->transid, 0, l->start, 0); + if (IS_ERR(right)) { + BUG_ON(1); + return PTR_ERR(right); + } + + memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); + btrfs_set_header_bytenr(right, right->start); + btrfs_set_header_generation(right, trans->transid); + btrfs_set_header_owner(right, root->root_key.objectid); + btrfs_set_header_level(right, 0); + write_extent_buffer(right, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(right), + BTRFS_FSID_SIZE); + + write_extent_buffer(right, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(right), + BTRFS_UUID_SIZE); + if (mid <= slot) { + if (nritems == 1 || + leaf_space_used(l, mid, nritems - mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (slot >= nritems) { + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + path->slots[1] += 1; + btrfs_mark_buffer_dirty(right); + return ret; + } + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + } + } + } else { + if (leaf_space_used(l, 0, mid) + data_size > + BTRFS_LEAF_DATA_SIZE(root)) { + if (!extend && data_size && slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, ins_key); + btrfs_set_header_nritems(right, 0); + wret = insert_ptr(trans, root, path, + &disk_key, + right->start, + path->slots[1], 1); + if (wret) + ret = wret; + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] = 0; + if (path->slots[1] == 0) { + wret = fixup_low_keys(trans, root, + path, &disk_key, 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(right); + return ret; + } else if ((extend || !data_size) && slot == 0) { + mid = 1; + } else { + mid = slot; + if (mid != nritems && + leaf_space_used(l, mid, nritems - mid) + + data_size > BTRFS_LEAF_DATA_SIZE(root)) { + double_split = 1; + 
} + } + } + } + nritems = nritems - mid; + btrfs_set_header_nritems(right, nritems); + data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l); + + copy_extent_buffer(right, l, btrfs_item_nr_offset(0), + btrfs_item_nr_offset(mid), + nritems * sizeof(struct btrfs_item)); + + copy_extent_buffer(right, l, + btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) - + data_copy_size, btrfs_leaf_data(l) + + leaf_data_end(root, l), data_copy_size); + + rt_data_off = BTRFS_LEAF_DATA_SIZE(root) - + btrfs_item_end_nr(l, mid); + + for (i = 0; i < nritems; i++) { + struct btrfs_item *item = btrfs_item_nr(right, i); + u32 ioff; + + if (!right->map_token) { + map_extent_buffer(right, (unsigned long)item, + sizeof(struct btrfs_item), + &right->map_token, &right->kaddr, + &right->map_start, &right->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(right, item); + btrfs_set_item_offset(right, item, ioff + rt_data_off); + } + + if (right->map_token) { + unmap_extent_buffer(right, right->map_token, KM_USER1); + right->map_token = NULL; + } + + btrfs_set_header_nritems(l, mid); + ret = 0; + btrfs_item_key(right, &disk_key, 0); + wret = insert_ptr(trans, root, path, &disk_key, right->start, + path->slots[1] + 1, 1); + if (wret) + ret = wret; + + btrfs_mark_buffer_dirty(right); + btrfs_mark_buffer_dirty(l); + BUG_ON(path->slots[0] != slot); + + ret = btrfs_update_ref(trans, root, l, right, 0, nritems); + BUG_ON(ret); + + if (mid <= slot) { + btrfs_tree_unlock(path->nodes[0]); + free_extent_buffer(path->nodes[0]); + path->nodes[0] = right; + path->slots[0] -= mid; + path->slots[1] += 1; + } else { + btrfs_tree_unlock(right); + free_extent_buffer(right); + } + + BUG_ON(path->slots[0] < 0); + + if (double_split) { + BUG_ON(num_doubles != 0); + num_doubles++; + goto again; + } + return ret; +} + +/* + * This function splits a single item into two items, + * giving 'new_key' to the new item and splitting the + * old one at split_offset (from the start of the item). + * + * The path may be released by this operation. After + * the split, the path is pointing to the old item. The + * new item is going to be in the same node as the old one. + * + * Note, the item being split must be smaller enough to live alone on + * a tree block with room for one extra struct btrfs_item + * + * This allows us to split the item in place, keeping a lock on the + * leaf the entire time. 
+ */ +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset) +{ + u32 item_size; + struct extent_buffer *leaf; + struct btrfs_key orig_key; + struct btrfs_item *item; + struct btrfs_item *new_item; + int ret = 0; + int slot; + u32 nritems; + u32 orig_offset; + struct btrfs_disk_key disk_key; + char *buf; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &orig_key, path->slots[0]); + if (btrfs_leaf_free_space(root, leaf) >= sizeof(struct btrfs_item)) + goto split; + + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + btrfs_release_path(root, path); + + path->search_for_split = 1; + path->keep_locks = 1; + + ret = btrfs_search_slot(trans, root, &orig_key, path, 0, 1); + path->search_for_split = 0; + + /* if our item isn't there or got smaller, return now */ + if (ret != 0 || item_size != btrfs_item_size_nr(path->nodes[0], + path->slots[0])) { + path->keep_locks = 0; + return -EAGAIN; + } + + ret = split_leaf(trans, root, &orig_key, path, + sizeof(struct btrfs_item), 1); + path->keep_locks = 0; + BUG_ON(ret); + + leaf = path->nodes[0]; + BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item)); + +split: + item = btrfs_item_nr(leaf, path->slots[0]); + orig_offset = btrfs_item_offset(leaf, item); + item_size = btrfs_item_size(leaf, item); + + + buf = kmalloc(item_size, GFP_NOFS); + read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf, + path->slots[0]), item_size); + slot = path->slots[0] + 1; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + + if (slot != nritems) { + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + } + + btrfs_cpu_key_to_disk(&disk_key, new_key); + btrfs_set_item_key(leaf, &disk_key, slot); + + new_item = btrfs_item_nr(leaf, slot); + + btrfs_set_item_offset(leaf, new_item, orig_offset); + btrfs_set_item_size(leaf, new_item, item_size - split_offset); + + btrfs_set_item_offset(leaf, item, + orig_offset + item_size - split_offset); + btrfs_set_item_size(leaf, item, split_offset); + + btrfs_set_header_nritems(leaf, nritems + 1); + + /* write the data for the start of the original item */ + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, path->slots[0]), + split_offset); + + /* write the data for the new item */ + write_extent_buffer(leaf, buf + split_offset, + btrfs_item_ptr_offset(leaf, slot), + item_size - split_offset); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + kfree(buf); + return ret; +} + +/* + * make the item pointed to by the path smaller. new_size indicates + * how small to make it, and from_end tells us if we just chop bytes + * off the end of the item or if we shift the item to chop bytes off + * the front. 
+ */ +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data_start; + unsigned int old_size; + unsigned int size_diff; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + slot = path->slots[0]; + + old_size = btrfs_item_size_nr(leaf, slot); + if (old_size == new_size) + return 0; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + old_data_start = btrfs_item_offset_nr(leaf, slot); + + size_diff = old_size - new_size; + + BUG_ON(slot < 0); + BUG_ON(slot >= nritems); + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + size_diff); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + if (from_end) { + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start + new_size - data_end); + } else { + struct btrfs_disk_key disk_key; + u64 offset; + + btrfs_item_key(leaf, &disk_key, slot); + + if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) { + unsigned long ptr; + struct btrfs_file_extent_item *fi; + + fi = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + fi = (struct btrfs_file_extent_item *)( + (unsigned long)fi - size_diff); + + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) { + ptr = btrfs_item_ptr_offset(leaf, slot); + memmove_extent_buffer(leaf, ptr, + (unsigned long)fi, + offsetof(struct btrfs_file_extent_item, + disk_bytenr)); + } + } + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + size_diff, btrfs_leaf_data(leaf) + + data_end, old_data_start - data_end); + + offset = btrfs_disk_key_offset(&disk_key); + btrfs_set_disk_key_offset(&disk_key, offset + size_diff); + btrfs_set_item_key(leaf, &disk_key, slot); + if (slot == 0) + fixup_low_keys(trans, root, path, &disk_key, 1); + } + + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, new_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * make the item pointed to by the path bigger, data_size is the new size. 
+ */ +int btrfs_extend_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u32 data_size) +{ + int ret = 0; + int slot; + int slot_orig; + struct extent_buffer *leaf; + struct btrfs_item *item; + u32 nritems; + unsigned int data_end; + unsigned int old_data; + unsigned int old_size; + int i; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < data_size) { + btrfs_print_leaf(root, leaf); + BUG(); + } + slot = path->slots[0]; + old_data = btrfs_item_end_nr(leaf, slot); + + BUG_ON(slot < 0); + if (slot >= nritems) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d too large, nritems %d\n", + slot, nritems); + BUG_ON(1); + } + + /* + * item0..itemN ... dataN.offset..dataN.size .. data0.size + */ + /* first correct the data pointers */ + for (i = slot; i < nritems; i++) { + u32 ioff; + item = btrfs_item_nr(leaf, i); + + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - data_size); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - data_size, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + + data_end = old_data; + old_size = btrfs_item_size_nr(leaf, slot); + item = btrfs_item_nr(leaf, slot); + btrfs_set_item_size(leaf, item, old_size + data_size); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + * Returns the number of keys that were inserted. 
+ */ +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int i; + u32 nritems; + u32 total_data = 0; + u32 total_size = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + struct btrfs_key found_key; + + for (i = 0; i < nr; i++) { + if (total_size + data_size[i] + sizeof(struct btrfs_item) > + BTRFS_LEAF_DATA_SIZE(root)) { + break; + nr = i; + } + total_data += data_size[i]; + total_size += data_size[i] + sizeof(struct btrfs_item); + } + BUG_ON(nr == 0); + + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + for (i = nr; i >= 0; i--) { + total_data -= data_size[i]; + total_size -= data_size[i] + sizeof(struct btrfs_item); + if (total_size < btrfs_leaf_free_space(root, leaf)) + break; + } + nr = i; + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* figure out how many keys we can insert in here */ + total_data = data_size[0]; + for (i = 1; i < nr; i++) { + if (comp_cpu_keys(&found_key, cpu_key + i) <= 0) + break; + total_data += data_size[i]; + } + nr = i; + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } else { + /* + * this sucks but it has to be done, if we are inserting at + * the end of the leaf only insert 1 of the items, since we + * have no way of knowing whats on the next leaf and we'd have + * to drop our current locks to figure it out + */ + nr = 1; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + if (!ret) + ret = nr; + return ret; +} + +/* + * Given a key and some data, insert items into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int ret = 0; + int slot; + int slot_orig; + int i; + u32 nritems; + u32 total_size = 0; + u32 total_data = 0; + unsigned int data_end; + struct btrfs_disk_key disk_key; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + + total_size = total_data + (nr * sizeof(struct btrfs_item)); + ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1); + if (ret == 0) + return -EEXIST; + if (ret < 0) + goto out; + + slot_orig = path->slots[0]; + leaf = path->nodes[0]; + + nritems = btrfs_header_nritems(leaf); + data_end = leaf_data_end(root, leaf); + + if (btrfs_leaf_free_space(root, leaf) < total_size) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "not enough freespace need %u have %d\n", + total_size, btrfs_leaf_free_space(root, leaf)); + BUG(); + } + + slot = path->slots[0]; + BUG_ON(slot < 0); + + if (slot != nritems) { + unsigned int old_data = btrfs_item_end_nr(leaf, slot); + + if (old_data < data_end) { + btrfs_print_leaf(root, leaf); + printk(KERN_CRIT "slot %d old_data %d data_end %d\n", + slot, old_data, data_end); + BUG_ON(1); + } + /* + * item0..itemN ... dataN.offset..dataN.size .. 
data0.size + */ + /* first correct the data pointers */ + WARN_ON(leaf->map_token); + for (i = slot; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff - total_data); + } + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + /* shift the items */ + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr), + btrfs_item_nr_offset(slot), + (nritems - slot) * sizeof(struct btrfs_item)); + + /* shift the data */ + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end - total_data, btrfs_leaf_data(leaf) + + data_end, old_data - data_end); + data_end = old_data; + } + + /* setup the item for the new data */ + for (i = 0; i < nr; i++) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); + btrfs_set_item_key(leaf, &disk_key, slot + i); + item = btrfs_item_nr(leaf, slot + i); + btrfs_set_item_offset(leaf, item, data_end - data_size[i]); + data_end -= data_size[i]; + btrfs_set_item_size(leaf, item, data_size[i]); + } + btrfs_set_header_nritems(leaf, nritems + nr); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; + if (slot == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + ret = fixup_low_keys(trans, root, path, &disk_key, 1); + } + + if (btrfs_leaf_free_space(root, leaf) < 0) { + btrfs_print_leaf(root, leaf); + BUG(); + } +out: + return ret; +} + +/* + * Given a key and some data, insert an item into the tree. + * This does all the path init required, making room in the tree if needed. + */ +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *cpu_key, void *data, u32 + data_size) +{ + int ret = 0; + struct btrfs_path *path; + struct extent_buffer *leaf; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (!ret) { + leaf = path->nodes[0]; + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, data, ptr, data_size); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_free_path(path); + return ret; +} + +/* + * delete the pointer from a given node. + * + * the tree should have been previously balanced so the deletion does not + * empty a node. + */ +static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int level, int slot) +{ + struct extent_buffer *parent = path->nodes[level]; + u32 nritems; + int ret = 0; + int wret; + + nritems = btrfs_header_nritems(parent); + if (slot != nritems - 1) { + memmove_extent_buffer(parent, + btrfs_node_key_ptr_offset(slot), + btrfs_node_key_ptr_offset(slot + 1), + sizeof(struct btrfs_key_ptr) * + (nritems - slot - 1)); + } + nritems--; + btrfs_set_header_nritems(parent, nritems); + if (nritems == 0 && parent == root->node) { + BUG_ON(btrfs_header_level(root->node) != 1); + /* just turn the root into a leaf and break */ + btrfs_set_header_level(root->node, 0); + } else if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_node_key(parent, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, &disk_key, level + 1); + if (wret) + ret = wret; + } + btrfs_mark_buffer_dirty(parent); + return ret; +} + +/* + * a helper function to delete the leaf pointed to by path->slots[1] and + * path->nodes[1]. 
bytenr is the node block pointer, but since the callers + * already know it, it is faster to have them pass it down than to + * read it out of the node again. + * + * This deletes the pointer in path->nodes[1] and frees the leaf + * block extent. zero is returned if it all worked out, < 0 otherwise. + * + * The path must have already been setup for deleting the leaf, including + * all the proper balancing. path->nodes[1] must be locked. + */ +noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr) +{ + int ret; + u64 root_gen = btrfs_header_generation(path->nodes[1]); + + ret = del_ptr(trans, root, path, 1, path->slots[1]); + if (ret) + return ret; + + ret = btrfs_free_extent(trans, root, bytenr, + btrfs_level_size(root, 0), + path->nodes[1]->start, + btrfs_header_owner(path->nodes[1]), + root_gen, 0, 1); + return ret; +} +/* + * delete the item at the leaf level in path. If that empties + * the leaf, remove it from the tree + */ +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr) +{ + struct extent_buffer *leaf; + struct btrfs_item *item; + int last_off; + int dsize = 0; + int ret = 0; + int wret; + int i; + u32 nritems; + + leaf = path->nodes[0]; + last_off = btrfs_item_offset_nr(leaf, slot + nr - 1); + + for (i = 0; i < nr; i++) + dsize += btrfs_item_size_nr(leaf, slot + i); + + nritems = btrfs_header_nritems(leaf); + + if (slot + nr != nritems) { + int data_end = leaf_data_end(root, leaf); + + memmove_extent_buffer(leaf, btrfs_leaf_data(leaf) + + data_end + dsize, + btrfs_leaf_data(leaf) + data_end, + last_off - data_end); + + for (i = slot + nr; i < nritems; i++) { + u32 ioff; + + item = btrfs_item_nr(leaf, i); + if (!leaf->map_token) { + map_extent_buffer(leaf, (unsigned long)item, + sizeof(struct btrfs_item), + &leaf->map_token, &leaf->kaddr, + &leaf->map_start, &leaf->map_len, + KM_USER1); + } + ioff = btrfs_item_offset(leaf, item); + btrfs_set_item_offset(leaf, item, ioff + dsize); + } + + if (leaf->map_token) { + unmap_extent_buffer(leaf, leaf->map_token, KM_USER1); + leaf->map_token = NULL; + } + + memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot), + btrfs_item_nr_offset(slot + nr), + sizeof(struct btrfs_item) * + (nritems - slot - nr)); + } + btrfs_set_header_nritems(leaf, nritems - nr); + nritems -= nr; + + /* delete the leaf if we've emptied it */ + if (nritems == 0) { + if (leaf == root->node) { + btrfs_set_header_level(leaf, 0); + } else { + ret = btrfs_del_leaf(trans, root, path, leaf->start); + BUG_ON(ret); + } + } else { + int used = leaf_space_used(leaf, 0, nritems); + if (slot == 0) { + struct btrfs_disk_key disk_key; + + btrfs_item_key(leaf, &disk_key, 0); + wret = fixup_low_keys(trans, root, path, + &disk_key, 1); + if (wret) + ret = wret; + } + + /* delete the leaf if it is mostly empty */ + if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) { + /* push_leaf_left fixes the path. 
+ * make sure the path still points to our leaf + * for possible call to del_ptr below + */ + slot = path->slots[1]; + extent_buffer_get(leaf); + + wret = push_leaf_left(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + + if (path->nodes[0] == leaf && + btrfs_header_nritems(leaf)) { + wret = push_leaf_right(trans, root, path, 1, 1); + if (wret < 0 && wret != -ENOSPC) + ret = wret; + } + + if (btrfs_header_nritems(leaf) == 0) { + path->slots[1] = slot; + ret = btrfs_del_leaf(trans, root, path, + leaf->start); + BUG_ON(ret); + free_extent_buffer(leaf); + } else { + /* if we're still in the path, make sure + * we're dirty. Otherwise, one of the + * push_leaf functions must have already + * dirtied this buffer + */ + if (path->nodes[0] == leaf) + btrfs_mark_buffer_dirty(leaf); + free_extent_buffer(leaf); + } + } else { + btrfs_mark_buffer_dirty(leaf); + } + } + return ret; +} + +/* + * search the tree again to find a leaf with lesser keys + * returns 0 if it found something or 1 if there are no lesser leaves. + * returns < 0 on io errors. + * + * This may release the path, and so you may lose any locks held at the + * time you call it. + */ +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + struct btrfs_key key; + struct btrfs_disk_key found_key; + int ret; + + btrfs_item_key_to_cpu(path->nodes[0], &key, 0); + + if (key.offset > 0) + key.offset--; + else if (key.type > 0) + key.type--; + else if (key.objectid > 0) + key.objectid--; + else + return 1; + + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + return ret; + btrfs_item_key(path->nodes[0], &found_key, 0); + ret = comp_keys(&found_key, &key); + if (ret < 0) + return 0; + return 1; +} + +/* + * A helper function to walk down the tree starting at min_key, and looking + * for nodes or leaves that are either in cache or have a minimum + * transaction id. This is used by the btree defrag code, and tree logging + * + * This does not cow, but it does stuff the starting key it finds back + * into min_key, so you can call btrfs_search_slot with cow=1 on the + * key and get a writable path. + * + * This does lock as it descends, and path->keep_locks should be set + * to 1 by the caller. + * + * This honors path->lowest_level to prevent descent past a given level + * of the tree. + * + * min_trans indicates the oldest transaction that you are interested + * in walking through. Any nodes or leaves older than min_trans are + * skipped over (without reading them). + * + * returns zero if something useful was found, < 0 on error and 1 if there + * was nothing in the tree that matched the search criteria. 
+ */ +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans) +{ + struct extent_buffer *cur; + struct btrfs_key found_key; + int slot; + int sret; + u32 nritems; + int level; + int ret = 1; + + WARN_ON(!path->keep_locks); +again: + cur = btrfs_lock_root_node(root); + level = btrfs_header_level(cur); + WARN_ON(path->nodes[level]); + path->nodes[level] = cur; + path->locks[level] = 1; + + if (btrfs_header_generation(cur) < min_trans) { + ret = 1; + goto out; + } + while (1) { + nritems = btrfs_header_nritems(cur); + level = btrfs_header_level(cur); + sret = bin_search(cur, min_key, level, &slot); + + /* at the lowest level, we're done, setup the path and exit */ + if (level == path->lowest_level) { + if (slot >= nritems) + goto find_next_key; + ret = 0; + path->slots[level] = slot; + btrfs_item_key_to_cpu(cur, &found_key, slot); + goto out; + } + if (sret && slot > 0) + slot--; + /* + * check this node pointer against the cache_only and + * min_trans parameters. If it isn't in cache or is too + * old, skip to the next one. + */ + while (slot < nritems) { + u64 blockptr; + u64 gen; + struct extent_buffer *tmp; + struct btrfs_disk_key disk_key; + + blockptr = btrfs_node_blockptr(cur, slot); + gen = btrfs_node_ptr_generation(cur, slot); + if (gen < min_trans) { + slot++; + continue; + } + if (!cache_only) + break; + + if (max_key) { + btrfs_node_key(cur, &disk_key, slot); + if (comp_keys(&disk_key, max_key) >= 0) { + ret = 1; + goto out; + } + } + + tmp = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + + if (tmp && btrfs_buffer_uptodate(tmp, gen)) { + free_extent_buffer(tmp); + break; + } + if (tmp) + free_extent_buffer(tmp); + slot++; + } +find_next_key: + /* + * we didn't find a candidate key in this node, walk forward + * and find another one + */ + if (slot >= nritems) { + path->slots[level] = slot; + sret = btrfs_find_next_key(root, path, min_key, level, + cache_only, min_trans); + if (sret == 0) { + btrfs_release_path(root, path); + goto again; + } else { + goto out; + } + } + /* save our key for returning back */ + btrfs_node_key_to_cpu(cur, &found_key, slot); + path->slots[level] = slot; + if (level == path->lowest_level) { + ret = 0; + unlock_up(path, level, 1); + goto out; + } + cur = read_node_slot(root, cur, slot); + + btrfs_tree_lock(cur); + path->locks[level - 1] = 1; + path->nodes[level - 1] = cur; + unlock_up(path, level, 1); + } +out: + if (ret == 0) + memcpy(min_key, &found_key, sizeof(found_key)); + return ret; +} + +/* + * this is similar to btrfs_next_leaf, but does not try to preserve + * and fixup the path. It looks for and returns the next key in the + * tree based on the current path and the cache_only and min_trans + * parameters. + * + * 0 is returned if another key is found, < 0 if there are any errors + * and 1 is returned if there are no higher keys in the tree + * + * path->keep_locks should be set to 1 on the search made before + * calling this function. 
+ */ +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans) +{ + int level = lowest_level; + int slot; + struct extent_buffer *c; + + WARN_ON(!path->keep_locks); + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; +next: + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) + return 1; + continue; + } + if (level == 0) + btrfs_item_key_to_cpu(c, key, slot); + else { + u64 blockptr = btrfs_node_blockptr(c, slot); + u64 gen = btrfs_node_ptr_generation(c, slot); + + if (cache_only) { + struct extent_buffer *cur; + cur = btrfs_find_tree_block(root, blockptr, + btrfs_level_size(root, level - 1)); + if (!cur || !btrfs_buffer_uptodate(cur, gen)) { + slot++; + if (cur) + free_extent_buffer(cur); + goto next; + } + free_extent_buffer(cur); + } + if (gen < min_trans) { + slot++; + goto next; + } + btrfs_node_key_to_cpu(c, key, slot); + } + return 0; + } + return 1; +} + +/* + * search the tree again to find a leaf with greater keys + * returns 0 if it found something or 1 if there are no greater leaves. + * returns < 0 on io errors. + */ +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) +{ + int slot; + int level = 1; + struct extent_buffer *c; + struct extent_buffer *next = NULL; + struct btrfs_key key; + u32 nritems; + int ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (nritems == 0) + return 1; + + btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1); + + btrfs_release_path(root, path); + path->keep_locks = 1; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + path->keep_locks = 0; + + if (ret < 0) + return ret; + + nritems = btrfs_header_nritems(path->nodes[0]); + /* + * by releasing the path above we dropped all our locks. A balance + * could have added more items next to the key that used to be + * at the very end of the block. So, check again here and + * advance the path if there are now more items available. 
+ */ + if (nritems > 0 && path->slots[0] < nritems - 1) { + path->slots[0]++; + goto done; + } + + while (level < BTRFS_MAX_LEVEL) { + if (!path->nodes[level]) + return 1; + + slot = path->slots[level] + 1; + c = path->nodes[level]; + if (slot >= btrfs_header_nritems(c)) { + level++; + if (level == BTRFS_MAX_LEVEL) + return 1; + continue; + } + + if (next) { + btrfs_tree_unlock(next); + free_extent_buffer(next); + } + + if (level == 1 && (path->locks[1] || path->skip_locking) && + path->reada) + reada_for_search(root, path, level, slot, 0); + + next = read_node_slot(root, c, slot); + if (!path->skip_locking) { + WARN_ON(!btrfs_tree_locked(c)); + btrfs_tree_lock(next); + } + break; + } + path->slots[level] = slot; + while (1) { + level--; + c = path->nodes[level]; + if (path->locks[level]) + btrfs_tree_unlock(c); + free_extent_buffer(c); + path->nodes[level] = next; + path->slots[level] = 0; + if (!path->skip_locking) + path->locks[level] = 1; + if (!level) + break; + if (level == 1 && path->locks[1] && path->reada) + reada_for_search(root, path, level, slot, 0); + next = read_node_slot(root, next, 0); + if (!path->skip_locking) { + WARN_ON(!btrfs_tree_locked(path->nodes[level])); + btrfs_tree_lock(next); + } + } +done: + unlock_up(path, 0, 1); + return 0; +} + +/* + * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps + * searching until it gets past min_objectid or finds an item of 'type' + * + * returns 0 if something is found, 1 if nothing was found and < 0 on error + */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type) +{ + struct btrfs_key found_key; + struct extent_buffer *leaf; + u32 nritems; + int ret; + + while (1) { + if (path->slots[0] == 0) { + ret = btrfs_prev_leaf(root, path); + if (ret != 0) + return ret; + } else { + path->slots[0]--; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (nritems == 0) + return 1; + if (path->slots[0] == nritems) + path->slots[0]--; + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.type == type) + return 0; + if (found_key.objectid < min_objectid) + break; + if (found_key.objectid == min_objectid && + found_key.type < type) + break; + } + return 1; +} diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h new file mode 100644 index 00000000000..eee060f8811 --- /dev/null +++ b/fs/btrfs/ctree.h @@ -0,0 +1,2129 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_CTREE__ +#define __BTRFS_CTREE__ + +#include <linux/version.h> +#include <linux/mm.h> +#include <linux/highmem.h> +#include <linux/fs.h> +#include <linux/completion.h> +#include <linux/backing-dev.h> +#include <linux/wait.h> +#include <asm/kmap_types.h> +#include "extent_io.h" +#include "extent_map.h" +#include "async-thread.h" + +struct btrfs_trans_handle; +struct btrfs_transaction; +extern struct kmem_cache *btrfs_trans_handle_cachep; +extern struct kmem_cache *btrfs_transaction_cachep; +extern struct kmem_cache *btrfs_bit_radix_cachep; +extern struct kmem_cache *btrfs_path_cachep; +struct btrfs_ordered_sum; + +#define BTRFS_MAGIC "_BHRfS_M" + +#define BTRFS_ACL_NOT_CACHED ((void *)-1) + +#ifdef CONFIG_LOCKDEP +# define BTRFS_MAX_LEVEL 7 +#else +# define BTRFS_MAX_LEVEL 8 +#endif + +/* holds pointers to all of the tree roots */ +#define BTRFS_ROOT_TREE_OBJECTID 1ULL + +/* stores information about which extents are in use, and reference counts */ +#define BTRFS_EXTENT_TREE_OBJECTID 2ULL + +/* + * chunk tree stores translations from logical -> physical block numbering + * the super block points to the chunk tree + */ +#define BTRFS_CHUNK_TREE_OBJECTID 3ULL + +/* + * stores information about which areas of a given device are in use. + * one per device. The tree of tree roots points to the device tree + */ +#define BTRFS_DEV_TREE_OBJECTID 4ULL + +/* one per subvolume, storing files and directories */ +#define BTRFS_FS_TREE_OBJECTID 5ULL + +/* directory objectid inside the root tree */ +#define BTRFS_ROOT_TREE_DIR_OBJECTID 6ULL + +/* holds checksums of all the data extents */ +#define BTRFS_CSUM_TREE_OBJECTID 7ULL + +/* orhpan objectid for tracking unlinked/truncated files */ +#define BTRFS_ORPHAN_OBJECTID -5ULL + +/* does write ahead logging to speed up fsyncs */ +#define BTRFS_TREE_LOG_OBJECTID -6ULL +#define BTRFS_TREE_LOG_FIXUP_OBJECTID -7ULL + +/* for space balancing */ +#define BTRFS_TREE_RELOC_OBJECTID -8ULL +#define BTRFS_DATA_RELOC_TREE_OBJECTID -9ULL + +/* + * extent checksums all have this objectid + * this allows them to share the logging tree + * for fsyncs + */ +#define BTRFS_EXTENT_CSUM_OBJECTID -10ULL + +/* dummy objectid represents multiple objectids */ +#define BTRFS_MULTIPLE_OBJECTIDS -255ULL + +/* + * All files have objectids in this range. + */ +#define BTRFS_FIRST_FREE_OBJECTID 256ULL +#define BTRFS_LAST_FREE_OBJECTID -256ULL +#define BTRFS_FIRST_CHUNK_TREE_OBJECTID 256ULL + + +/* + * the device items go into the chunk tree. The key is in the form + * [ 1 BTRFS_DEV_ITEM_KEY device_id ] + */ +#define BTRFS_DEV_ITEMS_OBJECTID 1ULL + +/* + * we can actually store much bigger names, but lets not confuse the rest + * of linux + */ +#define BTRFS_NAME_LEN 255 + +/* 32 bytes in various csum fields */ +#define BTRFS_CSUM_SIZE 32 + +/* csum types */ +#define BTRFS_CSUM_TYPE_CRC32 0 + +static int btrfs_csum_sizes[] = { 4, 0 }; + +/* four bytes for CRC32 */ +#define BTRFS_EMPTY_DIR_SIZE 0 + +#define BTRFS_FT_UNKNOWN 0 +#define BTRFS_FT_REG_FILE 1 +#define BTRFS_FT_DIR 2 +#define BTRFS_FT_CHRDEV 3 +#define BTRFS_FT_BLKDEV 4 +#define BTRFS_FT_FIFO 5 +#define BTRFS_FT_SOCK 6 +#define BTRFS_FT_SYMLINK 7 +#define BTRFS_FT_XATTR 8 +#define BTRFS_FT_MAX 9 + +/* + * the key defines the order in the tree, and so it also defines (optimal) + * block layout. objectid corresonds to the inode number. The flags + * tells us things about the object, and is a kind of stream selector. 
+ * so for a given inode, keys with flags of 1 might refer to the inode + * data, flags of 2 may point to file data in the btree and flags == 3 + * may point to extents. + * + * offset is the starting byte offset for this key in the stream. + * + * btrfs_disk_key is in disk byte order. struct btrfs_key is always + * in cpu native order. Otherwise they are identical and their sizes + * should be the same (ie both packed) + */ +struct btrfs_disk_key { + __le64 objectid; + u8 type; + __le64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_key { + u64 objectid; + u8 type; + u64 offset; +} __attribute__ ((__packed__)); + +struct btrfs_mapping_tree { + struct extent_map_tree map_tree; +}; + +#define BTRFS_UUID_SIZE 16 +struct btrfs_dev_item { + /* the internal btrfs device id */ + __le64 devid; + + /* size of the device */ + __le64 total_bytes; + + /* bytes used */ + __le64 bytes_used; + + /* optimal io alignment for this device */ + __le32 io_align; + + /* optimal io width for this device */ + __le32 io_width; + + /* minimal io size for this device */ + __le32 sector_size; + + /* type and info about this device */ + __le64 type; + + /* expected generation for this device */ + __le64 generation; + + /* + * starting byte of this partition on the device, + * to allowr for stripe alignment in the future + */ + __le64 start_offset; + + /* grouping information for allocation decisions */ + __le32 dev_group; + + /* seek speed 0-100 where 100 is fastest */ + u8 seek_speed; + + /* bandwidth 0-100 where 100 is fastest */ + u8 bandwidth; + + /* btrfs generated uuid for this device */ + u8 uuid[BTRFS_UUID_SIZE]; + + /* uuid of FS who owns this device */ + u8 fsid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_stripe { + __le64 devid; + __le64 offset; + u8 dev_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_chunk { + /* size of this chunk in bytes */ + __le64 length; + + /* objectid of the root referencing this chunk */ + __le64 owner; + + __le64 stripe_len; + __le64 type; + + /* optimal io alignment for this chunk */ + __le32 io_align; + + /* optimal io width for this chunk */ + __le32 io_width; + + /* minimal io size for this chunk */ + __le32 sector_size; + + /* 2^16 stripes is quite a lot, a second limit is the size of a single + * item in the btree + */ + __le16 num_stripes; + + /* sub stripes only matter for raid10 */ + __le16 sub_stripes; + struct btrfs_stripe stripe; + /* additional stripes go here */ +} __attribute__ ((__packed__)); + +static inline unsigned long btrfs_chunk_item_size(int num_stripes) +{ + BUG_ON(num_stripes == 0); + return sizeof(struct btrfs_chunk) + + sizeof(struct btrfs_stripe) * (num_stripes - 1); +} + +#define BTRFS_FSID_SIZE 16 +#define BTRFS_HEADER_FLAG_WRITTEN (1 << 0) + +/* + * every tree block (leaf or node) starts with this header. 
+ */ +struct btrfs_header { + /* these first four must match the super block */ + u8 csum[BTRFS_CSUM_SIZE]; + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* which block this node is supposed to live in */ + __le64 flags; + + /* allowed to be different from the super from here on down */ + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + __le64 generation; + __le64 owner; + __le32 nritems; + u8 level; +} __attribute__ ((__packed__)); + +#define BTRFS_NODEPTRS_PER_BLOCK(r) (((r)->nodesize - \ + sizeof(struct btrfs_header)) / \ + sizeof(struct btrfs_key_ptr)) +#define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) - \ + sizeof(struct btrfs_file_extent_item)) + +#define BTRFS_SUPER_FLAG_SEEDING (1ULL << 32) + +/* + * this is a very generous portion of the super block, giving us + * room to translate 14 chunks with 3 stripes each. + */ +#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048 +#define BTRFS_LABEL_SIZE 256 + +/* + * the super block basically lists the main trees of the FS + * it currently lacks any block count etc etc + */ +struct btrfs_super_block { + u8 csum[BTRFS_CSUM_SIZE]; + /* the first 4 fields must match struct btrfs_header */ + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + __le64 bytenr; /* this block number */ + __le64 flags; + + /* allowed to be different from the btrfs_header from here own down */ + __le64 magic; + __le64 generation; + __le64 root; + __le64 chunk_root; + __le64 log_root; + + /* this will help find the new super based on the log root */ + __le64 log_root_transid; + __le64 total_bytes; + __le64 bytes_used; + __le64 root_dir_objectid; + __le64 num_devices; + __le32 sectorsize; + __le32 nodesize; + __le32 leafsize; + __le32 stripesize; + __le32 sys_chunk_array_size; + __le64 chunk_root_generation; + __le64 compat_flags; + __le64 compat_ro_flags; + __le64 incompat_flags; + __le16 csum_type; + u8 root_level; + u8 chunk_root_level; + u8 log_root_level; + struct btrfs_dev_item dev_item; + + char label[BTRFS_LABEL_SIZE]; + + /* future expansion */ + __le64 reserved[32]; + u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE]; +} __attribute__ ((__packed__)); + +/* + * Compat flags that we support. If any incompat flags are set other than the + * ones specified below then we will fail to mount + */ +#define BTRFS_FEATURE_COMPAT_SUPP 0x0 +#define BTRFS_FEATURE_COMPAT_RO_SUPP 0x0 +#define BTRFS_FEATURE_INCOMPAT_SUPP 0x0 + +/* + * A leaf is full of items. offset and size tell us where to find + * the item in the leaf (relative to the start of the data area) + */ +struct btrfs_item { + struct btrfs_disk_key key; + __le32 offset; + __le32 size; +} __attribute__ ((__packed__)); + +/* + * leaves have an item area and a data area: + * [item0, item1....itemN] [free space] [dataN...data1, data0] + * + * The data is separate from the items to get the keys closer together + * during searches. 
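+ *
+ * (editor's note, added for clarity: item N's data starts at
+ * btrfs_leaf_data(leaf) + btrfs_item_offset_nr(leaf, N) and is
+ * btrfs_item_size_nr(leaf, N) bytes long, so the data area grows
+ * downward toward the item array as items are inserted)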
+ */ +struct btrfs_leaf { + struct btrfs_header header; + struct btrfs_item items[]; +} __attribute__ ((__packed__)); + +/* + * all non-leaf blocks are nodes, they hold only keys and pointers to + * other blocks + */ +struct btrfs_key_ptr { + struct btrfs_disk_key key; + __le64 blockptr; + __le64 generation; +} __attribute__ ((__packed__)); + +struct btrfs_node { + struct btrfs_header header; + struct btrfs_key_ptr ptrs[]; +} __attribute__ ((__packed__)); + +/* + * btrfs_paths remember the path taken from the root down to the leaf. + * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point + * to any other levels that are present. + * + * The slots array records the index of the item or block pointer + * used while walking the tree. + */ +struct btrfs_path { + struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; + int slots[BTRFS_MAX_LEVEL]; + /* if there is real range locking, this locks field will change */ + int locks[BTRFS_MAX_LEVEL]; + int reada; + /* keep some upper locks as we walk down */ + int keep_locks; + int skip_locking; + int lowest_level; + + /* + * set by btrfs_split_item, tells search_slot to keep all locks + * and to force calls to keep space in the nodes + */ + int search_for_split; +}; + +/* + * items in the extent btree are used to record the objectid of the + * owner of the block and the number of references + */ +struct btrfs_extent_item { + __le32 refs; +} __attribute__ ((__packed__)); + +struct btrfs_extent_ref { + __le64 root; + __le64 generation; + __le64 objectid; + __le32 num_refs; +} __attribute__ ((__packed__)); + +/* dev extents record free space on individual devices. The owner + * field points back to the chunk allocation mapping tree that allocated + * the extent. The chunk tree uuid field is a way to double check the owner + */ +struct btrfs_dev_extent { + __le64 chunk_tree; + __le64 chunk_objectid; + __le64 chunk_offset; + __le64 length; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; +} __attribute__ ((__packed__)); + +struct btrfs_inode_ref { + __le64 index; + __le16 name_len; + /* name goes here */ +} __attribute__ ((__packed__)); + +struct btrfs_timespec { + __le64 sec; + __le32 nsec; +} __attribute__ ((__packed__)); + +typedef enum { + BTRFS_COMPRESS_NONE = 0, + BTRFS_COMPRESS_ZLIB = 1, + BTRFS_COMPRESS_LAST = 2, +} btrfs_compression_type; + +/* we don't understand any encryption methods right now */ +typedef enum { + BTRFS_ENCRYPTION_NONE = 0, + BTRFS_ENCRYPTION_LAST = 1, +} btrfs_encryption_type; + +struct btrfs_inode_item { + /* nfs style generation number */ + __le64 generation; + /* transid that last touched this inode */ + __le64 transid; + __le64 size; + __le64 nbytes; + __le64 block_group; + __le32 nlink; + __le32 uid; + __le32 gid; + __le32 mode; + __le64 rdev; + __le64 flags; + + /* modification sequence number for NFS */ + __le64 sequence; + + /* + * a little future expansion, for more than this we can + * just grow the inode item and version it + */ + __le64 reserved[4]; + struct btrfs_timespec atime; + struct btrfs_timespec ctime; + struct btrfs_timespec mtime; + struct btrfs_timespec otime; +} __attribute__ ((__packed__)); + +struct btrfs_dir_log_item { + __le64 end; +} __attribute__ ((__packed__)); + +struct btrfs_dir_item { + struct btrfs_disk_key location; + __le64 transid; + __le16 data_len; + __le16 name_len; + u8 type; +} __attribute__ ((__packed__)); + +struct btrfs_root_item { + struct btrfs_inode_item inode; + __le64 generation; + __le64 root_dirid; + __le64 bytenr; + __le64 byte_limit; + __le64 bytes_used; + __le64 
last_snapshot;
+	__le64 flags;
+	__le32 refs;
+	struct btrfs_disk_key drop_progress;
+	u8 drop_level;
+	u8 level;
+} __attribute__ ((__packed__));
+
+/*
+ * this is used for both forward and backward root refs
+ */
+struct btrfs_root_ref {
+	__le64 dirid;
+	__le64 sequence;
+	__le16 name_len;
+} __attribute__ ((__packed__));
+
+#define BTRFS_FILE_EXTENT_INLINE 0
+#define BTRFS_FILE_EXTENT_REG 1
+#define BTRFS_FILE_EXTENT_PREALLOC 2
+
+struct btrfs_file_extent_item {
+	/*
+	 * transaction id that created this extent
+	 */
+	__le64 generation;
+	/*
+	 * max number of bytes to hold this extent in ram
+	 * when we split a compressed extent we can't know how big
+	 * each of the resulting pieces will be. So, this is
+	 * an upper limit on the size of the extent in ram instead of
+	 * an exact limit.
+	 */
+	__le64 ram_bytes;
+
+	/*
+	 * 32 bits for the various ways we might encode the data,
+	 * including compression and encryption. If any of these
+	 * are set to something a given disk format doesn't understand
+	 * it is treated like an incompat flag for reading and writing,
+	 * but not for stat.
+	 */
+	u8 compression;
+	u8 encryption;
+	__le16 other_encoding; /* spare for later use */
+
+	/* are we inline data or a real extent? */
+	u8 type;
+
+	/*
+	 * disk space consumed by the extent, checksum blocks are included
+	 * in these numbers
+	 */
+	__le64 disk_bytenr;
+	__le64 disk_num_bytes;
+	/*
+	 * the logical offset in file blocks (no csums)
+	 * this extent record is for. This allows a file extent to point
+	 * into the middle of an existing extent on disk, sharing it
+	 * between two snapshots (useful if some bytes in the middle of the
+	 * extent have changed)
+	 */
+	__le64 offset;
+	/*
+	 * the logical number of file blocks (no csums included). This
+	 * always reflects the size uncompressed and without encoding.
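+	 * (editor's example, added for clarity: an extent written as a
+	 * single 1MiB chunk that compresses to 64KiB on disk has
+	 * num_bytes == ram_bytes == 1MiB and disk_num_bytes == 64KiB)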
+ */ + __le64 num_bytes; + +} __attribute__ ((__packed__)); + +struct btrfs_csum_item { + u8 csum; +} __attribute__ ((__packed__)); + +/* different types of block groups (and chunks) */ +#define BTRFS_BLOCK_GROUP_DATA (1 << 0) +#define BTRFS_BLOCK_GROUP_SYSTEM (1 << 1) +#define BTRFS_BLOCK_GROUP_METADATA (1 << 2) +#define BTRFS_BLOCK_GROUP_RAID0 (1 << 3) +#define BTRFS_BLOCK_GROUP_RAID1 (1 << 4) +#define BTRFS_BLOCK_GROUP_DUP (1 << 5) +#define BTRFS_BLOCK_GROUP_RAID10 (1 << 6) + +struct btrfs_block_group_item { + __le64 used; + __le64 chunk_objectid; + __le64 flags; +} __attribute__ ((__packed__)); + +struct btrfs_space_info { + u64 flags; + u64 total_bytes; + u64 bytes_used; + u64 bytes_pinned; + u64 bytes_reserved; + u64 bytes_readonly; + int full; + int force_alloc; + struct list_head list; + + /* for block groups in our same type */ + struct list_head block_groups; + spinlock_t lock; + struct rw_semaphore groups_sem; +}; + +struct btrfs_free_space { + struct rb_node bytes_index; + struct rb_node offset_index; + u64 offset; + u64 bytes; +}; + +struct btrfs_block_group_cache { + struct btrfs_key key; + struct btrfs_block_group_item item; + spinlock_t lock; + struct mutex alloc_mutex; + struct mutex cache_mutex; + u64 pinned; + u64 reserved; + u64 flags; + int cached; + int ro; + int dirty; + + struct btrfs_space_info *space_info; + + /* free space cache stuff */ + struct rb_root free_space_bytes; + struct rb_root free_space_offset; + + /* block group cache stuff */ + struct rb_node cache_node; + + /* for block groups in the same raid type */ + struct list_head list; + + /* usage count */ + atomic_t count; +}; + +struct btrfs_leaf_ref_tree { + struct rb_root root; + struct list_head list; + spinlock_t lock; +}; + +struct btrfs_device; +struct btrfs_fs_devices; +struct btrfs_fs_info { + u8 fsid[BTRFS_FSID_SIZE]; + u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; + struct btrfs_root *extent_root; + struct btrfs_root *tree_root; + struct btrfs_root *chunk_root; + struct btrfs_root *dev_root; + struct btrfs_root *fs_root; + struct btrfs_root *csum_root; + + /* the log root tree is a directory of all the other log roots */ + struct btrfs_root *log_root_tree; + struct radix_tree_root fs_roots_radix; + + /* block group cache stuff */ + spinlock_t block_group_cache_lock; + struct rb_root block_group_cache_tree; + + struct extent_io_tree pinned_extents; + struct extent_io_tree pending_del; + struct extent_io_tree extent_ins; + + /* logical->physical extent mapping */ + struct btrfs_mapping_tree mapping_tree; + + u64 generation; + u64 last_trans_committed; + u64 last_trans_new_blockgroup; + u64 open_ioctl_trans; + unsigned long mount_opt; + u64 max_extent; + u64 max_inline; + u64 alloc_start; + struct btrfs_transaction *running_transaction; + wait_queue_head_t transaction_throttle; + wait_queue_head_t transaction_wait; + + wait_queue_head_t async_submit_wait; + wait_queue_head_t tree_log_wait; + + struct btrfs_super_block super_copy; + struct btrfs_super_block super_for_commit; + struct block_device *__bdev; + struct super_block *sb; + struct inode *btree_inode; + struct backing_dev_info bdi; + spinlock_t hash_lock; + struct mutex trans_mutex; + struct mutex tree_log_mutex; + struct mutex transaction_kthread_mutex; + struct mutex cleaner_mutex; + struct mutex extent_ins_mutex; + struct mutex pinned_mutex; + struct mutex chunk_mutex; + struct mutex drop_mutex; + struct mutex volume_mutex; + struct mutex tree_reloc_mutex; + struct list_head trans_list; + struct list_head hashers; + struct list_head dead_roots; + 
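+	/*
+	 * (editor's note, added for clarity: the atomic counters below
+	 * track async checksumming and bio submission work that is still
+	 * queued to the worker pools declared later in this struct)
+	 */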
+ atomic_t nr_async_submits; + atomic_t async_submit_draining; + atomic_t nr_async_bios; + atomic_t async_delalloc_pages; + atomic_t tree_log_writers; + atomic_t tree_log_commit; + unsigned long tree_log_batch; + u64 tree_log_transid; + + /* + * this is used by the balancing code to wait for all the pending + * ordered extents + */ + spinlock_t ordered_extent_lock; + struct list_head ordered_extents; + struct list_head delalloc_inodes; + + /* + * there is a pool of worker threads for checksumming during writes + * and a pool for checksumming after reads. This is because readers + * can run with FS locks held, and the writers may be waiting for + * those locks. We don't want ordering in the pending list to cause + * deadlocks, and so the two are serviced separately. + * + * A third pool does submit_bio to avoid deadlocking with the other + * two + */ + struct btrfs_workers workers; + struct btrfs_workers delalloc_workers; + struct btrfs_workers endio_workers; + struct btrfs_workers endio_meta_workers; + struct btrfs_workers endio_meta_write_workers; + struct btrfs_workers endio_write_workers; + struct btrfs_workers submit_workers; + /* + * fixup workers take dirty pages that didn't properly go through + * the cow mechanism and make them safe to write. It happens + * for the sys_munmap function call path + */ + struct btrfs_workers fixup_workers; + struct task_struct *transaction_kthread; + struct task_struct *cleaner_kthread; + int thread_pool_size; + + /* tree relocation relocated fields */ + struct list_head dead_reloc_roots; + struct btrfs_leaf_ref_tree reloc_ref_tree; + struct btrfs_leaf_ref_tree shared_ref_tree; + + struct kobject super_kobj; + struct completion kobj_unregister; + int do_barriers; + int closing; + int log_root_recovering; + atomic_t throttles; + atomic_t throttle_gen; + + u64 total_pinned; + struct list_head dirty_cowonly_roots; + + struct btrfs_fs_devices *fs_devices; + struct list_head space_info; + spinlock_t delalloc_lock; + spinlock_t new_trans_lock; + u64 delalloc_bytes; + u64 last_alloc; + u64 last_data_alloc; + + spinlock_t ref_cache_lock; + u64 total_ref_cache_size; + + u64 avail_data_alloc_bits; + u64 avail_metadata_alloc_bits; + u64 avail_system_alloc_bits; + u64 data_alloc_profile; + u64 metadata_alloc_profile; + u64 system_alloc_profile; + + void *bdev_holder; +}; + +/* + * in ram representation of the tree. extent_root is used for all allocations + * and for the extent tree extent_root root. 
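+ *
+ * (editor's note, added for clarity: there is one struct btrfs_root for
+ * each tree: the root tree, extent tree, chunk tree, device tree, csum
+ * tree, log tree and one per subvolume; they all point at the same
+ * struct btrfs_fs_info)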
+ */ +struct btrfs_dirty_root; +struct btrfs_root { + struct extent_buffer *node; + + /* the node lock is held while changing the node pointer */ + spinlock_t node_lock; + + struct extent_buffer *commit_root; + struct btrfs_leaf_ref_tree *ref_tree; + struct btrfs_leaf_ref_tree ref_tree_struct; + struct btrfs_dirty_root *dirty_root; + struct btrfs_root *log_root; + struct btrfs_root *reloc_root; + + struct btrfs_root_item root_item; + struct btrfs_key root_key; + struct btrfs_fs_info *fs_info; + struct extent_io_tree dirty_log_pages; + + struct kobject root_kobj; + struct completion kobj_unregister; + struct mutex objectid_mutex; + struct mutex log_mutex; + + u64 objectid; + u64 last_trans; + + /* data allocations are done in sectorsize units */ + u32 sectorsize; + + /* node allocations are done in nodesize units */ + u32 nodesize; + + /* leaf allocations are done in leafsize units */ + u32 leafsize; + + u32 stripesize; + + u32 type; + u64 highest_inode; + u64 last_inode_alloc; + int ref_cows; + int track_dirty; + u64 defrag_trans_start; + struct btrfs_key defrag_progress; + struct btrfs_key defrag_max; + int defrag_running; + int defrag_level; + char *name; + int in_sysfs; + + /* the dirty list is only used by non-reference counted roots */ + struct list_head dirty_list; + + spinlock_t list_lock; + struct list_head dead_list; + struct list_head orphan_list; + + /* + * right now this just gets used so that a root has its own devid + * for stat. It may be used for more later + */ + struct super_block anon_super; +}; + +/* + + * inode items have the data typically returned from stat and store other + * info about object characteristics. There is one for every file and dir in + * the FS + */ +#define BTRFS_INODE_ITEM_KEY 1 +#define BTRFS_INODE_REF_KEY 12 +#define BTRFS_XATTR_ITEM_KEY 24 +#define BTRFS_ORPHAN_ITEM_KEY 48 +/* reserve 2-15 close to the inode for later flexibility */ + +/* + * dir items are the name -> inode pointers in a directory. There is one + * for every name in a directory. + */ +#define BTRFS_DIR_LOG_ITEM_KEY 60 +#define BTRFS_DIR_LOG_INDEX_KEY 72 +#define BTRFS_DIR_ITEM_KEY 84 +#define BTRFS_DIR_INDEX_KEY 96 +/* + * extent data is for file data + */ +#define BTRFS_EXTENT_DATA_KEY 108 + +/* + * extent csums are stored in a separate tree and hold csums for + * an entire extent on disk. + */ +#define BTRFS_EXTENT_CSUM_KEY 128 + +/* + * root items point to tree roots. There are typically in the root + * tree used by the super block to find all the other trees + */ +#define BTRFS_ROOT_ITEM_KEY 132 + +/* + * root backrefs tie subvols and snapshots to the directory entries that + * reference them + */ +#define BTRFS_ROOT_BACKREF_KEY 144 + +/* + * root refs make a fast index for listing all of the snapshots and + * subvolumes referenced by a given root. They point directly to the + * directory item in the root that references the subvol + */ +#define BTRFS_ROOT_REF_KEY 156 + +/* + * extent items are in the extent map tree. These record which blocks + * are used, and how many references there are to each block + */ +#define BTRFS_EXTENT_ITEM_KEY 168 +#define BTRFS_EXTENT_REF_KEY 180 + +/* + * block groups give us hints into the extent allocation trees. Which + * blocks are free etc etc + */ +#define BTRFS_BLOCK_GROUP_ITEM_KEY 192 + +#define BTRFS_DEV_EXTENT_KEY 204 +#define BTRFS_DEV_ITEM_KEY 216 +#define BTRFS_CHUNK_ITEM_KEY 228 + +/* + * string items are for debugging. 
They just store a short string of + * data in the FS + */ +#define BTRFS_STRING_ITEM_KEY 253 + +#define BTRFS_MOUNT_NODATASUM (1 << 0) +#define BTRFS_MOUNT_NODATACOW (1 << 1) +#define BTRFS_MOUNT_NOBARRIER (1 << 2) +#define BTRFS_MOUNT_SSD (1 << 3) +#define BTRFS_MOUNT_DEGRADED (1 << 4) +#define BTRFS_MOUNT_COMPRESS (1 << 5) + +#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) +#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) +#define btrfs_test_opt(root, opt) ((root)->fs_info->mount_opt & \ + BTRFS_MOUNT_##opt) +/* + * Inode flags + */ +#define BTRFS_INODE_NODATASUM (1 << 0) +#define BTRFS_INODE_NODATACOW (1 << 1) +#define BTRFS_INODE_READONLY (1 << 2) +#define BTRFS_INODE_NOCOMPRESS (1 << 3) +#define BTRFS_INODE_PREALLOC (1 << 4) +#define btrfs_clear_flag(inode, flag) (BTRFS_I(inode)->flags &= \ + ~BTRFS_INODE_##flag) +#define btrfs_set_flag(inode, flag) (BTRFS_I(inode)->flags |= \ + BTRFS_INODE_##flag) +#define btrfs_test_flag(inode, flag) (BTRFS_I(inode)->flags & \ + BTRFS_INODE_##flag) +/* some macros to generate set/get funcs for the struct fields. This + * assumes there is a lefoo_to_cpu for every type, so lets make a simple + * one for u8: + */ +#define le8_to_cpu(v) (v) +#define cpu_to_le8(v) (v) +#define __le8 u8 + +#define read_eb_member(eb, ptr, type, member, result) ( \ + read_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#define write_eb_member(eb, ptr, type, member, result) ( \ + write_extent_buffer(eb, (char *)(result), \ + ((unsigned long)(ptr)) + \ + offsetof(type, member), \ + sizeof(((type *)0)->member))) + +#ifndef BTRFS_SETGET_FUNCS +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); +#endif + +#define BTRFS_SETGET_HEADER_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(struct extent_buffer *eb) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + u##bits res = le##bits##_to_cpu(p->member); \ + kunmap_atomic(p, KM_USER0); \ + return res; \ +} \ +static inline void btrfs_set_##name(struct extent_buffer *eb, \ + u##bits val) \ +{ \ + type *p = kmap_atomic(eb->first_page, KM_USER0); \ + p->member = cpu_to_le##bits(val); \ + kunmap_atomic(p, KM_USER0); \ +} + +#define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ +static inline u##bits btrfs_##name(type *s) \ +{ \ + return le##bits##_to_cpu(s->member); \ +} \ +static inline void btrfs_set_##name(type *s, u##bits val) \ +{ \ + s->member = cpu_to_le##bits(val); \ +} + +BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64); +BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64); +BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32); +BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32); +BTRFS_SETGET_FUNCS(device_start_offset, struct btrfs_dev_item, + start_offset, 64); +BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32); +BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_FUNCS(device_group, struct btrfs_dev_item, dev_group, 32); +BTRFS_SETGET_FUNCS(device_seek_speed, struct btrfs_dev_item, seek_speed, 8); +BTRFS_SETGET_FUNCS(device_bandwidth, struct btrfs_dev_item, bandwidth, 8); +BTRFS_SETGET_FUNCS(device_generation, struct btrfs_dev_item, 
generation, 64); + +BTRFS_SETGET_STACK_FUNCS(stack_device_type, struct btrfs_dev_item, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_total_bytes, struct btrfs_dev_item, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_bytes_used, struct btrfs_dev_item, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_align, struct btrfs_dev_item, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_io_width, struct btrfs_dev_item, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_sector_size, struct btrfs_dev_item, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_id, struct btrfs_dev_item, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_device_group, struct btrfs_dev_item, + dev_group, 32); +BTRFS_SETGET_STACK_FUNCS(stack_device_seek_speed, struct btrfs_dev_item, + seek_speed, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_bandwidth, struct btrfs_dev_item, + bandwidth, 8); +BTRFS_SETGET_STACK_FUNCS(stack_device_generation, struct btrfs_dev_item, + generation, 64); + +static inline char *btrfs_device_uuid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, uuid); +} + +static inline char *btrfs_device_fsid(struct btrfs_dev_item *d) +{ + return (char *)d + offsetof(struct btrfs_dev_item, fsid); +} + +BTRFS_SETGET_FUNCS(chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64); +BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32); +BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32); +BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32); +BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16); +BTRFS_SETGET_FUNCS(chunk_sub_stripes, struct btrfs_chunk, sub_stripes, 16); +BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64); + +static inline char *btrfs_stripe_dev_uuid(struct btrfs_stripe *s) +{ + return (char *)s + offsetof(struct btrfs_stripe, dev_uuid); +} + +BTRFS_SETGET_STACK_FUNCS(stack_chunk_length, struct btrfs_chunk, length, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk, + stripe_len, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk, + io_align, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk, + io_width, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk, + sector_size, 32); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk, + num_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_chunk_sub_stripes, struct btrfs_chunk, + sub_stripes, 16); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64); + +static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c, + int nr) +{ + unsigned long offset = (unsigned long)c; + offset += offsetof(struct btrfs_chunk, stripe); + offset += nr * sizeof(struct btrfs_stripe); + return (struct btrfs_stripe *)offset; +} + +static inline char *btrfs_stripe_dev_uuid_nr(struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_dev_uuid(btrfs_stripe_nr(c, nr)); +} + +static inline u64 
btrfs_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val); +} + +static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr) +{ + return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr)); +} + +static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb, + struct btrfs_chunk *c, int nr, + u64 val) +{ + btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val); +} + +/* struct btrfs_block_group_item */ +BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item, + used, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); + +BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid, + struct btrfs_block_group_item, chunk_objectid, 64); +BTRFS_SETGET_FUNCS(disk_block_group_flags, + struct btrfs_block_group_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(block_group_flags, + struct btrfs_block_group_item, flags, 64); + +/* struct btrfs_inode_ref */ +BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16); +BTRFS_SETGET_FUNCS(inode_ref_index, struct btrfs_inode_ref, index, 64); + +/* struct btrfs_inode_item */ +BTRFS_SETGET_FUNCS(inode_generation, struct btrfs_inode_item, generation, 64); +BTRFS_SETGET_FUNCS(inode_sequence, struct btrfs_inode_item, sequence, 64); +BTRFS_SETGET_FUNCS(inode_transid, struct btrfs_inode_item, transid, 64); +BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, size, 64); +BTRFS_SETGET_FUNCS(inode_nbytes, struct btrfs_inode_item, nbytes, 64); +BTRFS_SETGET_FUNCS(inode_block_group, struct btrfs_inode_item, block_group, 64); +BTRFS_SETGET_FUNCS(inode_nlink, struct btrfs_inode_item, nlink, 32); +BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32); +BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32); +BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32); +BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64); +BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 64); + +static inline struct btrfs_timespec * +btrfs_inode_atime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, atime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_mtime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, mtime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_ctime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, ctime); + return (struct btrfs_timespec *)ptr; +} + +static inline struct btrfs_timespec * +btrfs_inode_otime(struct btrfs_inode_item *inode_item) +{ + unsigned long ptr = (unsigned long)inode_item; + ptr += offsetof(struct btrfs_inode_item, otime); + return (struct btrfs_timespec *)ptr; +} + +BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64); +BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32); + +/* struct btrfs_dev_extent */ +BTRFS_SETGET_FUNCS(dev_extent_chunk_tree, struct 
btrfs_dev_extent, + chunk_tree, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_objectid, struct btrfs_dev_extent, + chunk_objectid, 64); +BTRFS_SETGET_FUNCS(dev_extent_chunk_offset, struct btrfs_dev_extent, + chunk_offset, 64); +BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64); + +static inline u8 *btrfs_dev_extent_chunk_tree_uuid(struct btrfs_dev_extent *dev) +{ + unsigned long ptr = offsetof(struct btrfs_dev_extent, chunk_tree_uuid); + return (u8 *)((unsigned long)dev + ptr); +} + +/* struct btrfs_extent_ref */ +BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64); +BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64); +BTRFS_SETGET_FUNCS(ref_objectid, struct btrfs_extent_ref, objectid, 64); +BTRFS_SETGET_FUNCS(ref_num_refs, struct btrfs_extent_ref, num_refs, 32); + +BTRFS_SETGET_STACK_FUNCS(stack_ref_root, struct btrfs_extent_ref, root, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_generation, struct btrfs_extent_ref, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_objectid, struct btrfs_extent_ref, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(stack_ref_num_refs, struct btrfs_extent_ref, + num_refs, 32); + +/* struct btrfs_extent_item */ +BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(stack_extent_refs, struct btrfs_extent_item, + refs, 32); + +/* struct btrfs_node */ +BTRFS_SETGET_FUNCS(key_blockptr, struct btrfs_key_ptr, blockptr, 64); +BTRFS_SETGET_FUNCS(key_generation, struct btrfs_key_ptr, generation, 64); + +static inline u64 btrfs_node_blockptr(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_blockptr(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_blockptr(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_blockptr(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline u64 btrfs_node_ptr_generation(struct extent_buffer *eb, int nr) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + return btrfs_key_generation(eb, (struct btrfs_key_ptr *)ptr); +} + +static inline void btrfs_set_node_ptr_generation(struct extent_buffer *eb, + int nr, u64 val) +{ + unsigned long ptr; + ptr = offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; + btrfs_set_key_generation(eb, (struct btrfs_key_ptr *)ptr, val); +} + +static inline unsigned long btrfs_node_key_ptr_offset(int nr) +{ + return offsetof(struct btrfs_node, ptrs) + + sizeof(struct btrfs_key_ptr) * nr; +} + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr); + +static inline void btrfs_set_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr; + ptr = btrfs_node_key_ptr_offset(nr); + write_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} + +/* struct btrfs_item */ +BTRFS_SETGET_FUNCS(item_offset, struct btrfs_item, offset, 32); +BTRFS_SETGET_FUNCS(item_size, struct btrfs_item, size, 32); + +static inline unsigned long btrfs_item_nr_offset(int nr) +{ + return offsetof(struct btrfs_leaf, items) + + sizeof(struct btrfs_item) * nr; +} + +static inline struct btrfs_item *btrfs_item_nr(struct extent_buffer *eb, + int nr) +{ + return (struct btrfs_item *)btrfs_item_nr_offset(nr); +} + +static inline u32 
btrfs_item_end(struct extent_buffer *eb, + struct btrfs_item *item) +{ + return btrfs_item_offset(eb, item) + btrfs_item_size(eb, item); +} + +static inline u32 btrfs_item_end_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_end(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_offset_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_offset(eb, btrfs_item_nr(eb, nr)); +} + +static inline u32 btrfs_item_size_nr(struct extent_buffer *eb, int nr) +{ + return btrfs_item_size(eb, btrfs_item_nr(eb, nr)); +} + +static inline void btrfs_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + read_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +static inline void btrfs_set_item_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + struct btrfs_item *item = btrfs_item_nr(eb, nr); + write_eb_member(eb, item, struct btrfs_item, key, disk_key); +} + +BTRFS_SETGET_FUNCS(dir_log_end, struct btrfs_dir_log_item, end, 64); + +/* + * struct btrfs_root_ref + */ +BTRFS_SETGET_FUNCS(root_ref_dirid, struct btrfs_root_ref, dirid, 64); +BTRFS_SETGET_FUNCS(root_ref_sequence, struct btrfs_root_ref, sequence, 64); +BTRFS_SETGET_FUNCS(root_ref_name_len, struct btrfs_root_ref, name_len, 16); + +/* struct btrfs_dir_item */ +BTRFS_SETGET_FUNCS(dir_data_len, struct btrfs_dir_item, data_len, 16); +BTRFS_SETGET_FUNCS(dir_type, struct btrfs_dir_item, type, 8); +BTRFS_SETGET_FUNCS(dir_name_len, struct btrfs_dir_item, name_len, 16); +BTRFS_SETGET_FUNCS(dir_transid, struct btrfs_dir_item, transid, 64); + +static inline void btrfs_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + read_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +static inline void btrfs_set_dir_item_key(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_disk_key *key) +{ + write_eb_member(eb, item, struct btrfs_dir_item, location, key); +} + +/* struct btrfs_disk_key */ +BTRFS_SETGET_STACK_FUNCS(disk_key_objectid, struct btrfs_disk_key, + objectid, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_offset, struct btrfs_disk_key, offset, 64); +BTRFS_SETGET_STACK_FUNCS(disk_key_type, struct btrfs_disk_key, type, 8); + +static inline void btrfs_disk_key_to_cpu(struct btrfs_key *cpu, + struct btrfs_disk_key *disk) +{ + cpu->offset = le64_to_cpu(disk->offset); + cpu->type = disk->type; + cpu->objectid = le64_to_cpu(disk->objectid); +} + +static inline void btrfs_cpu_key_to_disk(struct btrfs_disk_key *disk, + struct btrfs_key *cpu) +{ + disk->offset = cpu_to_le64(cpu->offset); + disk->type = cpu->type; + disk->objectid = cpu_to_le64(cpu->objectid); +} + +static inline void btrfs_node_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_node_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_key *key, int nr) +{ + struct btrfs_disk_key disk_key; + btrfs_item_key(eb, &disk_key, nr); + btrfs_disk_key_to_cpu(key, &disk_key); +} + +static inline void btrfs_dir_item_key_to_cpu(struct extent_buffer *eb, + struct btrfs_dir_item *item, + struct btrfs_key *key) +{ + struct btrfs_disk_key disk_key; + btrfs_dir_item_key(eb, item, &disk_key); + btrfs_disk_key_to_cpu(key, &disk_key); +} + + +static inline u8 btrfs_key_type(struct btrfs_key *key) +{ + return key->type; +} + +static inline 
void btrfs_set_key_type(struct btrfs_key *key, u8 val) +{ + key->type = val; +} + +/* struct btrfs_header */ +BTRFS_SETGET_HEADER_FUNCS(header_bytenr, struct btrfs_header, bytenr, 64); +BTRFS_SETGET_HEADER_FUNCS(header_generation, struct btrfs_header, + generation, 64); +BTRFS_SETGET_HEADER_FUNCS(header_owner, struct btrfs_header, owner, 64); +BTRFS_SETGET_HEADER_FUNCS(header_nritems, struct btrfs_header, nritems, 32); +BTRFS_SETGET_HEADER_FUNCS(header_flags, struct btrfs_header, flags, 64); +BTRFS_SETGET_HEADER_FUNCS(header_level, struct btrfs_header, level, 8); + +static inline int btrfs_header_flag(struct extent_buffer *eb, u64 flag) +{ + return (btrfs_header_flags(eb) & flag) == flag; +} + +static inline int btrfs_set_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags | flag); + return (flags & flag) == flag; +} + +static inline int btrfs_clear_header_flag(struct extent_buffer *eb, u64 flag) +{ + u64 flags = btrfs_header_flags(eb); + btrfs_set_header_flags(eb, flags & ~flag); + return (flags & flag) == flag; +} + +static inline u8 *btrfs_header_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_chunk_tree_uuid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, chunk_tree_uuid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_super_fsid(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_super_block, fsid); + return (u8 *)ptr; +} + +static inline u8 *btrfs_header_csum(struct extent_buffer *eb) +{ + unsigned long ptr = offsetof(struct btrfs_header, csum); + return (u8 *)ptr; +} + +static inline struct btrfs_node *btrfs_buffer_node(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_leaf *btrfs_buffer_leaf(struct extent_buffer *eb) +{ + return NULL; +} + +static inline struct btrfs_header *btrfs_buffer_header(struct extent_buffer *eb) +{ + return NULL; +} + +static inline int btrfs_is_leaf(struct extent_buffer *eb) +{ + return btrfs_header_level(eb) == 0; +} + +/* struct btrfs_root_item */ +BTRFS_SETGET_FUNCS(disk_root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_FUNCS(disk_root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_FUNCS(disk_root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_FUNCS(disk_root_level, struct btrfs_root_item, level, 8); + +BTRFS_SETGET_STACK_FUNCS(root_generation, struct btrfs_root_item, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(root_bytenr, struct btrfs_root_item, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(root_level, struct btrfs_root_item, level, 8); +BTRFS_SETGET_STACK_FUNCS(root_dirid, struct btrfs_root_item, root_dirid, 64); +BTRFS_SETGET_STACK_FUNCS(root_refs, struct btrfs_root_item, refs, 32); +BTRFS_SETGET_STACK_FUNCS(root_flags, struct btrfs_root_item, flags, 64); +BTRFS_SETGET_STACK_FUNCS(root_used, struct btrfs_root_item, bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(root_limit, struct btrfs_root_item, byte_limit, 64); +BTRFS_SETGET_STACK_FUNCS(root_last_snapshot, struct btrfs_root_item, + last_snapshot, 64); + +/* struct btrfs_super_block */ + +BTRFS_SETGET_STACK_FUNCS(super_bytenr, struct btrfs_super_block, bytenr, 64); +BTRFS_SETGET_STACK_FUNCS(super_flags, struct btrfs_super_block, flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block, + generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64); 
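+/*
+ * (editor's illustration, not part of the original patch) each
+ * BTRFS_SETGET_STACK_FUNCS() invocation in this file expands to a pair of
+ * trivial accessors that convert the named little-endian member of an
+ * in-memory copy of the structure to and from cpu byte order, e.g. the
+ * super_root line above generates roughly:
+ *
+ *	static inline u64 btrfs_super_root(struct btrfs_super_block *s)
+ *	{
+ *		return le64_to_cpu(s->root);
+ *	}
+ *	static inline void btrfs_set_super_root(struct btrfs_super_block *s,
+ *						u64 val)
+ *	{
+ *		s->root = cpu_to_le64(val);
+ *	}
+ */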
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size, + struct btrfs_super_block, sys_chunk_array_size, 32); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_generation, + struct btrfs_super_block, chunk_root_generation, 64); +BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block, + root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block, + chunk_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block, + chunk_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_log_root, struct btrfs_super_block, + log_root, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_transid, struct btrfs_super_block, + log_root_transid, 64); +BTRFS_SETGET_STACK_FUNCS(super_log_root_level, struct btrfs_super_block, + log_root_level, 8); +BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block, + total_bytes, 64); +BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block, + bytes_used, 64); +BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, + sectorsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, + nodesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, + leafsize, 32); +BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, + stripesize, 32); +BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, + root_dir_objectid, 64); +BTRFS_SETGET_STACK_FUNCS(super_num_devices, struct btrfs_super_block, + num_devices, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_compat_ro_flags, struct btrfs_super_block, + compat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_incompat_flags, struct btrfs_super_block, + incompat_flags, 64); +BTRFS_SETGET_STACK_FUNCS(super_csum_type, struct btrfs_super_block, + csum_type, 16); + +static inline int btrfs_super_csum_size(struct btrfs_super_block *s) +{ + int t = btrfs_super_csum_type(s); + BUG_ON(t >= ARRAY_SIZE(btrfs_csum_sizes)); + return btrfs_csum_sizes[t]; +} + +static inline unsigned long btrfs_leaf_data(struct extent_buffer *l) +{ + return offsetof(struct btrfs_leaf, items); +} + +/* struct btrfs_file_extent_item */ +BTRFS_SETGET_FUNCS(file_extent_type, struct btrfs_file_extent_item, type, 8); + +static inline unsigned long +btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) +{ + unsigned long offset = (unsigned long)e; + offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); + return offset; +} + +static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) +{ + return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; +} + +BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, + disk_bytenr, 64); +BTRFS_SETGET_FUNCS(file_extent_generation, struct btrfs_file_extent_item, + generation, 64); +BTRFS_SETGET_FUNCS(file_extent_disk_num_bytes, struct btrfs_file_extent_item, + disk_num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_offset, struct btrfs_file_extent_item, + offset, 64); +BTRFS_SETGET_FUNCS(file_extent_num_bytes, struct btrfs_file_extent_item, + num_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_ram_bytes, struct btrfs_file_extent_item, + ram_bytes, 64); +BTRFS_SETGET_FUNCS(file_extent_compression, struct btrfs_file_extent_item, + compression, 8); +BTRFS_SETGET_FUNCS(file_extent_encryption, struct btrfs_file_extent_item, + encryption, 8); +BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, + other_encoding, 16); + +/* this returns the number of 
file bytes represented by the inline item. + * If an item is compressed, this is the uncompressed size + */ +static inline u32 btrfs_file_extent_inline_len(struct extent_buffer *eb, + struct btrfs_file_extent_item *e) +{ + return btrfs_file_extent_ram_bytes(eb, e); +} + +/* + * this returns the number of bytes used by the item on disk, minus the + * size of any extent headers. If a file is compressed on disk, this is + * the compressed size + */ +static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, + struct btrfs_item *e) +{ + unsigned long offset; + offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); + return btrfs_item_size(eb, e) - offset; +} + +static inline struct btrfs_root *btrfs_sb(struct super_block *sb) +{ + return sb->s_fs_info; +} + +static inline int btrfs_set_root_name(struct btrfs_root *root, + const char *name, int len) +{ + /* if we already have a name just free it */ + kfree(root->name); + + root->name = kmalloc(len+1, GFP_KERNEL); + if (!root->name) + return -ENOMEM; + + memcpy(root->name, name, len); + root->name[len] = '\0'; + + return 0; +} + +static inline u32 btrfs_level_size(struct btrfs_root *root, int level) +{ + if (level == 0) + return root->leafsize; + return root->nodesize; +} + +/* helper function to cast into the data area of the leaf. */ +#define btrfs_item_ptr(leaf, slot, type) \ + ((type *)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +#define btrfs_item_ptr_offset(leaf, slot) \ + ((unsigned long)(btrfs_leaf_data(leaf) + \ + btrfs_item_offset_nr(leaf, slot))) + +static inline struct dentry *fdentry(struct file *file) +{ + return file->f_path.dentry; +} + +/* extent-tree.c */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs); +int btrfs_update_pinned_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int pin); +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr); +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy); +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr); +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner); +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size); +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_bytes, + u64 root_objectid, u64 ref_generation, + u64 owner, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data); +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins); +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 
owner, struct btrfs_key *ins); +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data); +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *orig_buf, struct extent_buffer *buf, + u32 *nr_extents); +int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, u32 nr_extents); +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr); +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin); +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len); +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_io_tree *unpin); +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid); +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid); +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr); +int btrfs_free_block_groups(struct btrfs_fs_info *info); +int btrfs_read_block_groups(struct btrfs_root *root); +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size); +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start); +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start); +int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_drop_dead_reloc_roots(struct btrfs_root *root); +int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, u64 orig_start); +int btrfs_add_dead_reloc_root(struct btrfs_root *root); +int btrfs_cleanup_reloc_trees(struct btrfs_root *root); +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +/* ctree.c */ +int btrfs_previous_item(struct btrfs_root *root, + struct btrfs_path *path, u64 min_objectid, + int type); +int btrfs_merge_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *node_keys, + u64 *nodes, int lowest_level); +int btrfs_set_item_key_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *new_key); +struct extent_buffer *btrfs_root_node(struct btrfs_root *root); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, + struct btrfs_key *key, int lowest_level, + int cache_only, u64 min_trans); +int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, + struct btrfs_key *max_key, + struct btrfs_path *path, int cache_only, + u64 min_trans); +int 
btrfs_cow_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf, + struct extent_buffer *parent, int parent_slot, + struct extent_buffer **cow_ret, u64 prealloc_dest); +int btrfs_copy_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, + struct extent_buffer **cow_ret, u64 new_root_objectid); +int btrfs_extend_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, u32 data_size); +int btrfs_truncate_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u32 new_size, int from_end); +int btrfs_split_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *new_key, + unsigned long split_offset); +int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_path *p, int + ins_len, int cow); +int btrfs_realloc_node(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *parent, + int start_slot, int cache_only, u64 *last_ret, + struct btrfs_key *progress); +void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); +struct btrfs_path *btrfs_alloc_path(void); +void btrfs_free_path(struct btrfs_path *p); +void btrfs_init_path(struct btrfs_path *p); +int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_path *path, int slot, int nr); +int btrfs_del_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 bytenr); +static inline int btrfs_del_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + return btrfs_del_items(trans, root, path, path->slots[0], 1); +} + +int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, void *data, u32 data_size); +int btrfs_insert_some_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, + int nr); +int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, u32 *data_size, int nr); + +static inline int btrfs_insert_empty_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u32 data_size) +{ + return btrfs_insert_empty_items(trans, root, path, key, &data_size, 1); +} + +int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path); +int btrfs_leaf_free_space(struct btrfs_root *root, struct extent_buffer *leaf); +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root); +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent); +/* root-item.c */ +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id); +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id, + u64 dirid, u64 sequence, + const char *name, int name_len); +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key); +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct 
btrfs_root_item + *item); +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item); +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, struct + btrfs_root_item *item, struct btrfs_key *key); +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid); +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest_root); +/* dir-item.c */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index); +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod); +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod); +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len); +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di); +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir); +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod); + +/* orphan.c */ +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset); + +/* inode-map.c */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *fs_root, + u64 dirid, u64 *objectid); +int btrfs_find_highest_inode(struct btrfs_root *fs_root, u64 *objectid); + +/* inode-item.c */ +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 index); +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index); +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod); + +/* file-item.c */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len); +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst); +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding); +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 bytenr, int mod); +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct 
btrfs_ordered_sum *sums); +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig); +int btrfs_csum_file_bytes(struct btrfs_root *root, struct inode *inode, + u64 start, unsigned long len); +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow); +int btrfs_csum_truncate(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct btrfs_path *path, + u64 isize); +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, + u64 end, struct list_head *list); +/* inode.c */ + +/* RHEL and EL kernels have a patch that renames PG_checked to FsMisc */ +#if defined(ClearPageFsMisc) && !defined(ClearPageChecked) +#define ClearPageChecked ClearPageFsMisc +#define SetPageChecked SetPageFsMisc +#define PageChecked PageFsMisc +#endif + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry); +int btrfs_set_inode_index(struct inode *dir, u64 *index); +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len); +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index); +int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 new_size, + u32 min_type); + +int btrfs_start_delalloc_inodes(struct btrfs_root *root); +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end); +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc); +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, struct dentry *dentry, + u64 new_dirid, u64 alloc_hint); +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, unsigned long bio_flags); + +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index); +int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, + int for_del); +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); +int btrfs_readpage(struct file *file, struct page *page); +void btrfs_delete_inode(struct inode *inode); +void btrfs_put_inode(struct inode *inode); +void btrfs_read_locked_inode(struct inode *inode); +int btrfs_write_inode(struct inode *inode, int wait); +void btrfs_dirty_inode(struct inode *inode); +struct inode *btrfs_alloc_inode(struct super_block *sb); +void btrfs_destroy_inode(struct inode *inode); +int btrfs_init_cachep(void); +void btrfs_destroy_cachep(void); +long btrfs_ioctl_trans_end(struct file *file); +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait); +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root); +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new); +int btrfs_commit_write(struct file *file, struct page *page, + unsigned from, unsigned to); +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t page_offset, u64 start, u64 end, + int create); +int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode); +int btrfs_orphan_add(struct btrfs_trans_handle 
*trans, struct inode *inode); +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode); +void btrfs_orphan_cleanup(struct btrfs_root *root); +int btrfs_cont_expand(struct inode *inode, loff_t size); + +/* ioctl.c */ +long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); + +/* file.c */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync); +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned); +int btrfs_check_file(struct btrfs_root *root, struct inode *inode); +extern struct file_operations btrfs_file_operations; +int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 inline_limit, u64 *hint_block); +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end); +int btrfs_release_file(struct inode *inode, struct file *file); + +/* tree-defrag.c */ +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only); + +/* sysfs.c */ +int btrfs_init_sysfs(void); +void btrfs_exit_sysfs(void); +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs); +int btrfs_sysfs_add_root(struct btrfs_root *root); +void btrfs_sysfs_del_root(struct btrfs_root *root); +void btrfs_sysfs_del_super(struct btrfs_fs_info *root); + +/* xattr.c */ +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size); + +/* super.c */ +u64 btrfs_parse_size(char *str); +int btrfs_parse_options(struct btrfs_root *root, char *options); +int btrfs_sync_fs(struct super_block *sb, int wait); + +/* acl.c */ +int btrfs_check_acl(struct inode *inode, int mask); +int btrfs_init_acl(struct inode *inode, struct inode *dir); +int btrfs_acl_chmod(struct inode *inode); + +/* free-space-cache.c */ +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes); +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 bytenr, u64 size); +int btrfs_remove_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes); +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache + *block_group); +struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes); +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes); +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group); +#endif diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c new file mode 100644 index 00000000000..926a0b287a7 --- /dev/null +++ b/fs/btrfs/dir-item.c @@ -0,0 +1,386 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "hash.h" +#include "transaction.h" + +/* + * insert a name into a directory, doing overflow properly if there is a hash + * collision. data_size indicates how big the item inserted should be. On + * success a struct btrfs_dir_item pointer is returned, otherwise it is + * an ERR_PTR. + * + * The name is not copied into the dir item, you have to do that yourself. + */ +static struct btrfs_dir_item *insert_with_overflow(struct btrfs_trans_handle + *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *cpu_key, + u32 data_size, + const char *name, + int name_len) +{ + int ret; + char *ptr; + struct btrfs_item *item; + struct extent_buffer *leaf; + + ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size); + if (ret == -EEXIST) { + struct btrfs_dir_item *di; + di = btrfs_match_dir_item_name(root, path, name, name_len); + if (di) + return ERR_PTR(-EEXIST); + ret = btrfs_extend_item(trans, root, path, data_size); + WARN_ON(ret > 0); + } + if (ret < 0) + return ERR_PTR(ret); + WARN_ON(ret > 0); + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr(leaf, path->slots[0], char); + BUG_ON(data_size > btrfs_item_size(leaf, item)); + ptr += btrfs_item_size(leaf, item) - data_size; + return (struct btrfs_dir_item *)ptr; +} + +/* + * xattrs work a lot like directories, this inserts an xattr item + * into the tree + */ +int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, const char *name, + u16 name_len, const void *data, u16 data_len, + u64 dir) +{ + int ret = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + unsigned long name_ptr, data_ptr; + struct btrfs_key key, location; + struct btrfs_disk_key disk_key; + struct extent_buffer *leaf; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + if (name_len + data_len + sizeof(struct btrfs_dir_item) > + BTRFS_LEAF_DATA_SIZE(root) - sizeof(struct btrfs_item)) + return -ENOSPC; + + data_size = sizeof(*dir_item) + name_len + data_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + /* + * FIXME: at some point we should handle xattr's that are larger than + * what we can fit in our leaf. We set location to NULL b/c we arent + * pointing at anything else, that will change if we store the xattr + * data in a separate inode. + */ + BUG_ON(IS_ERR(dir_item)); + memset(&location, 0, sizeof(location)); + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, &location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + btrfs_set_dir_data_len(leaf, dir_item, data_len); + name_ptr = (unsigned long)(dir_item + 1); + data_ptr = (unsigned long)((char *)name_ptr + name_len); + + write_extent_buffer(leaf, name, name_ptr, name_len); + write_extent_buffer(leaf, data, data_ptr, data_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_free_path(path); + return ret; +} + +/* + * insert a directory item in the tree, doing all the magic for + * both indexes. 
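 * The first index is a BTRFS_DIR_ITEM_KEY item whose key offset is the
 * hash of the name, used for lookups by name.  The second is a
 * BTRFS_DIR_INDEX_KEY item whose key offset is the caller supplied
 * sequence number, presumably so entries can be returned in insertion
 * order for readdir.  The tree root is special cased below and only
 * gets the first index.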
'dir' indicates which objectid to insert it into, + * 'location' is the key to stuff into the directory item, 'type' is the + * type of the inode we're pointing to, and 'index' is the sequence number + * to use for the second index (if one is created). + */ +int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root + *root, const char *name, int name_len, u64 dir, + struct btrfs_key *location, u8 type, u64 index) +{ + int ret = 0; + int ret2 = 0; + struct btrfs_path *path; + struct btrfs_dir_item *dir_item; + struct extent_buffer *leaf; + unsigned long name_ptr; + struct btrfs_key key; + struct btrfs_disk_key disk_key; + u32 data_size; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + path = btrfs_alloc_path(); + data_size = sizeof(*dir_item) + name_len; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret = PTR_ERR(dir_item); + if (ret == -EEXIST) + goto second_insert; + goto out; + } + + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + +second_insert: + /* FIXME, use some real flag for selecting the extra index */ + if (root == root->fs_info->tree_root) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = index; + dir_item = insert_with_overflow(trans, root, path, &key, data_size, + name, name_len); + if (IS_ERR(dir_item)) { + ret2 = PTR_ERR(dir_item); + goto out; + } + leaf = path->nodes[0]; + btrfs_cpu_key_to_disk(&disk_key, location); + btrfs_set_dir_item_key(leaf, dir_item, &disk_key); + btrfs_set_dir_type(leaf, dir_item, type); + btrfs_set_dir_data_len(leaf, dir_item, 0); + btrfs_set_dir_name_len(leaf, dir_item, name_len); + btrfs_set_dir_transid(leaf, dir_item, trans->transid); + name_ptr = (unsigned long)(dir_item + 1); + write_extent_buffer(leaf, name, name_ptr, name_len); + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + if (ret) + return ret; + if (ret2) + return ret2; + return 0; +} + +/* + * lookup a directory item based on name. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + */ +struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + + key.offset = btrfs_name_hash(name, name_len); + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_DIR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * lookup a directory item based on index. 'dir' is the objectid + * we're searching in, and 'mod' tells us if you plan on deleting the + * item (use mod < 0) or changing the options (use mod > 0) + * + * The name is used to make sure the index really points to the name you were + * looking for. + */ +struct btrfs_dir_item * +btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + u64 objectid, const char *name, int name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = objectid; + + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 dir, + const char *name, u16 name_len, + int mod) +{ + int ret; + struct btrfs_key key; + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + key.objectid = dir; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = btrfs_name_hash(name, name_len); + ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) { + if (path->slots[0] == 0) + return NULL; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != dir || + btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY || + found_key.offset != key.offset) + return NULL; + + return btrfs_match_dir_item_name(root, path, name, name_len); +} + +/* + * helper function to look at the directory item pointed to by 'path' + * this walks through all the entries in a dir item and finds one + * for a specific name. 
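 * Because insert_with_overflow() extends an existing item on a hash
 * collision, a single leaf item may hold several btrfs_dir_item structs
 * packed back to back, each one immediately followed by its name (and,
 * for xattrs, its value).  total_len below is the size of the whole
 * leaf item and this_len is the size of the entry currently examined.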
+ */ +struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_root *root, + struct btrfs_path *path, + const char *name, int name_len) +{ + struct btrfs_dir_item *dir_item; + unsigned long name_ptr; + u32 total_len; + u32 cur = 0; + u32 this_len; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + dir_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dir_item); + total_len = btrfs_item_size_nr(leaf, path->slots[0]); + while (cur < total_len) { + this_len = sizeof(*dir_item) + + btrfs_dir_name_len(leaf, dir_item) + + btrfs_dir_data_len(leaf, dir_item); + name_ptr = (unsigned long)(dir_item + 1); + + if (btrfs_dir_name_len(leaf, dir_item) == name_len && + memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) + return dir_item; + + cur += this_len; + dir_item = (struct btrfs_dir_item *)((char *)dir_item + + this_len); + } + return NULL; +} + +/* + * given a pointer into a directory item, delete it. This + * handles items that have more than one entry in them. + */ +int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_dir_item *di) +{ + + struct extent_buffer *leaf; + u32 sub_item_len; + u32 item_len; + int ret = 0; + + leaf = path->nodes[0]; + sub_item_len = sizeof(*di) + btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di); + item_len = btrfs_item_size_nr(leaf, path->slots[0]); + if (sub_item_len == item_len) { + ret = btrfs_del_item(trans, root, path); + } else { + /* MARKER */ + unsigned long ptr = (unsigned long)di; + unsigned long start; + + start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_len - (ptr + sub_item_len - start)); + ret = btrfs_truncate_item(trans, root, path, + item_len - sub_item_len, 1); + } + return 0; +} diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c new file mode 100644 index 00000000000..81a313874ae --- /dev/null +++ b/fs/btrfs/disk-io.c @@ -0,0 +1,2343 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/version.h> +#include <linux/fs.h> +#include <linux/blkdev.h> +#include <linux/scatterlist.h> +#include <linux/swap.h> +#include <linux/radix-tree.h> +#include <linux/writeback.h> +#include <linux/buffer_head.h> +#include <linux/workqueue.h> +#include <linux/kthread.h> +#include <linux/freezer.h> +#include "compat.h" +#include "crc32c.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "volumes.h" +#include "print-tree.h" +#include "async-thread.h" +#include "locking.h" +#include "ref-cache.h" +#include "tree-log.h" + +static struct extent_io_ops btree_extent_io_ops; +static void end_workqueue_fn(struct btrfs_work *work); + +/* + * end_io_wq structs are used to do processing in task context when an IO is + * complete. 
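 * btrfs_bio_wq_end_io() saves the bio's original bi_private and
 * bi_end_io here and points the bio at end_workqueue_bio(), which
 * records the error and queues this struct on one of the endio worker
 * pools; end_workqueue_fn() later restores the saved callback and calls
 * bio_endio() from task context.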
This is used during reads to verify checksums, and it is used + * by writes to insert metadata for new file extents after IO is complete. + */ +struct end_io_wq { + struct bio *bio; + bio_end_io_t *end_io; + void *private; + struct btrfs_fs_info *info; + int error; + int metadata; + struct list_head list; + struct btrfs_work work; +}; + +/* + * async submit bios are used to offload expensive checksumming + * onto the worker threads. They checksum file and metadata bios + * just before they are sent down the IO stack. + */ +struct async_submit_bio { + struct inode *inode; + struct bio *bio; + struct list_head list; + extent_submit_bio_hook_t *submit_bio_start; + extent_submit_bio_hook_t *submit_bio_done; + int rw; + int mirror_num; + unsigned long bio_flags; + struct btrfs_work work; +}; + +/* + * extents on the btree inode are pretty simple, there's one extent + * that covers the entire device + */ +static struct extent_map *btree_get_extent(struct inode *inode, + struct page *page, size_t page_offset, u64 start, u64 len, + int create) +{ + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_map *em; + int ret; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + em->bdev = + BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); + goto out; + } + spin_unlock(&em_tree->lock); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + em = ERR_PTR(-ENOMEM); + goto out; + } + em->start = 0; + em->len = (u64)-1; + em->block_len = (u64)-1; + em->block_start = 0; + em->bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; + + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + if (ret == -EEXIST) { + u64 failed_start = em->start; + u64 failed_len = em->len; + + free_extent_map(em); + em = lookup_extent_mapping(em_tree, start, len); + if (em) { + ret = 0; + } else { + em = lookup_extent_mapping(em_tree, failed_start, + failed_len); + ret = -EIO; + } + } else if (ret) { + free_extent_map(em); + em = NULL; + } + spin_unlock(&em_tree->lock); + + if (ret) + em = ERR_PTR(ret); +out: + return em; +} + +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len) +{ + return btrfs_crc32c(seed, data, len); +} + +void btrfs_csum_final(u32 crc, char *result) +{ + *(__le32 *)result = ~cpu_to_le32(crc); +} + +/* + * compute the csum for a btree block, and either verify it or write it + * into the csum field of the block. 
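 * The checksum covers everything past the first BTRFS_CSUM_SIZE bytes
 * of the block; the crc32c is seeded with ~0 and finished by
 * btrfs_csum_final(), which stores the inverted crc little-endian.
 * Returns 0 on success, 1 on a verify mismatch or a mapping/allocation
 * failure.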
+ */ +static int csum_tree_block(struct btrfs_root *root, struct extent_buffer *buf, + int verify) +{ + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + char *result = NULL; + unsigned long len; + unsigned long cur_len; + unsigned long offset = BTRFS_CSUM_SIZE; + char *map_token = NULL; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + int err; + u32 crc = ~(u32)0; + unsigned long inline_result; + + len = buf->len - offset; + while (len > 0) { + err = map_private_extent_buffer(buf, offset, 32, + &map_token, &kaddr, + &map_start, &map_len, KM_USER0); + if (err) + return 1; + cur_len = min(len, map_len - (offset - map_start)); + crc = btrfs_csum_data(root, kaddr + offset - map_start, + crc, cur_len); + len -= cur_len; + offset += cur_len; + unmap_extent_buffer(buf, map_token, KM_USER0); + } + if (csum_size > sizeof(inline_result)) { + result = kzalloc(csum_size * sizeof(char), GFP_NOFS); + if (!result) + return 1; + } else { + result = (char *)&inline_result; + } + + btrfs_csum_final(crc, result); + + if (verify) { + if (memcmp_extent_buffer(buf, result, 0, csum_size)) { + u32 val; + u32 found = 0; + memcpy(&found, result, csum_size); + + read_extent_buffer(buf, &val, 0, csum_size); + printk(KERN_INFO "btrfs: %s checksum verify failed " + "on %llu wanted %X found %X level %d\n", + root->fs_info->sb->s_id, + buf->start, val, found, btrfs_header_level(buf)); + if (result != (char *)&inline_result) + kfree(result); + return 1; + } + } else { + write_extent_buffer(buf, result, 0, csum_size); + } + if (result != (char *)&inline_result) + kfree(result); + return 0; +} + +/* + * we can't consider a given block up to date unless the transid of the + * block matches the transid in the parent node's pointer. This is how we + * detect blocks that either didn't get written at all or got written + * in the wrong place. + */ +static int verify_parent_transid(struct extent_io_tree *io_tree, + struct extent_buffer *eb, u64 parent_transid) +{ + int ret; + + if (!parent_transid || btrfs_header_generation(eb) == parent_transid) + return 0; + + lock_extent(io_tree, eb->start, eb->start + eb->len - 1, GFP_NOFS); + if (extent_buffer_uptodate(io_tree, eb) && + btrfs_header_generation(eb) == parent_transid) { + ret = 0; + goto out; + } + printk("parent transid verify failed on %llu wanted %llu found %llu\n", + (unsigned long long)eb->start, + (unsigned long long)parent_transid, + (unsigned long long)btrfs_header_generation(eb)); + ret = 1; + clear_extent_buffer_uptodate(io_tree, eb); +out: + unlock_extent(io_tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + return ret; +} + +/* + * helper to read a given tree block, doing retries as required when + * the checksums don't match and we have alternate mirrors to try. + */ +static int btree_read_extent_buffer_pages(struct btrfs_root *root, + struct extent_buffer *eb, + u64 start, u64 parent_transid) +{ + struct extent_io_tree *io_tree; + int ret; + int num_copies = 0; + int mirror_num = 0; + + io_tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + while (1) { + ret = read_extent_buffer_pages(io_tree, eb, start, 1, + btree_get_extent, mirror_num); + if (!ret && + !verify_parent_transid(io_tree, eb, parent_transid)) + return ret; + + num_copies = btrfs_num_copies(&root->fs_info->mapping_tree, + eb->start, eb->len); + if (num_copies == 1) + return ret; + + mirror_num++; + if (mirror_num > num_copies) + return ret; + } + return -EIO; +} + +/* + * checksum a dirty tree block before IO. 
This has extra checks to make sure + * we only fill in the checksum field in the first page of a multi-page block + */ + +static int csum_dirty_buffer(struct btrfs_root *root, struct page *page) +{ + struct extent_io_tree *tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + ret = btree_read_extent_buffer_pages(root, eb, start + PAGE_CACHE_SIZE, + btrfs_header_generation(eb)); + BUG_ON(ret); + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + WARN_ON(1); + goto err; + } + if (eb->first_page != page) { + WARN_ON(1); + goto err; + } + if (!PageUptodate(page)) { + WARN_ON(1); + goto err; + } + found_level = btrfs_header_level(eb); + + csum_tree_block(root, eb, 0); +err: + free_extent_buffer(eb); +out: + return 0; +} + +static int check_tree_block_fsid(struct btrfs_root *root, + struct extent_buffer *eb) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + u8 fsid[BTRFS_UUID_SIZE]; + int ret = 1; + + read_extent_buffer(eb, fsid, (unsigned long)btrfs_header_fsid(eb), + BTRFS_FSID_SIZE); + while (fs_devices) { + if (!memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE)) { + ret = 0; + break; + } + fs_devices = fs_devices->seed; + } + return ret; +} + +static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct extent_io_tree *tree; + u64 found_start; + int found_level; + unsigned long len; + struct extent_buffer *eb; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + int ret = 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + if (!page->private) + goto out; + + len = page->private >> 2; + WARN_ON(len == 0); + + eb = alloc_extent_buffer(tree, start, len, page, GFP_NOFS); + + found_start = btrfs_header_bytenr(eb); + if (found_start != start) { + printk(KERN_INFO "btrfs bad tree block start %llu %llu\n", + (unsigned long long)found_start, + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + if (eb->first_page != page) { + printk(KERN_INFO "btrfs bad first page %lu %lu\n", + eb->first_page->index, page->index); + WARN_ON(1); + ret = -EIO; + goto err; + } + if (check_tree_block_fsid(root, eb)) { + printk(KERN_INFO "btrfs bad fsid on block %llu\n", + (unsigned long long)eb->start); + ret = -EIO; + goto err; + } + found_level = btrfs_header_level(eb); + + ret = csum_tree_block(root, eb, 1); + if (ret) + ret = -EIO; + + end = min_t(u64, eb->len, PAGE_CACHE_SIZE); + end = eb->start + end - 1; +err: + free_extent_buffer(eb); +out: + return ret; +} + +static void end_workqueue_bio(struct bio *bio, int err) +{ + struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_fs_info *fs_info; + + fs_info = end_io_wq->info; + end_io_wq->error = err; + end_io_wq->work.func = end_workqueue_fn; + end_io_wq->work.flags = 0; + + if (bio->bi_rw & (1 << BIO_RW)) { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_write_workers, + &end_io_wq->work); + else + btrfs_queue_worker(&fs_info->endio_write_workers, + &end_io_wq->work); + } else { + if (end_io_wq->metadata) + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + else + 
btrfs_queue_worker(&fs_info->endio_workers, + &end_io_wq->work); + } +} + +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata) +{ + struct end_io_wq *end_io_wq; + end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + if (!end_io_wq) + return -ENOMEM; + + end_io_wq->private = bio->bi_private; + end_io_wq->end_io = bio->bi_end_io; + end_io_wq->info = info; + end_io_wq->error = 0; + end_io_wq->bio = bio; + end_io_wq->metadata = metadata; + + bio->bi_private = end_io_wq; + bio->bi_end_io = end_workqueue_bio; + return 0; +} + +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info) +{ + unsigned long limit = min_t(unsigned long, + info->workers.max_workers, + info->fs_devices->open_devices); + return 256 * limit; +} + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone) +{ + return atomic_read(&info->nr_async_bios) > + btrfs_async_submit_limit(info); +} + +static void run_one_async_start(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + async->submit_bio_start(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_done(struct btrfs_work *work) +{ + struct btrfs_fs_info *fs_info; + struct async_submit_bio *async; + int limit; + + async = container_of(work, struct async_submit_bio, work); + fs_info = BTRFS_I(async->inode)->root->fs_info; + + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + + atomic_dec(&fs_info->nr_async_submits); + + if (atomic_read(&fs_info->nr_async_submits) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + async->submit_bio_done(async->inode, async->rw, async->bio, + async->mirror_num, async->bio_flags); +} + +static void run_one_async_free(struct btrfs_work *work) +{ + struct async_submit_bio *async; + + async = container_of(work, struct async_submit_bio, work); + kfree(async); +} + +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done) +{ + struct async_submit_bio *async; + + async = kmalloc(sizeof(*async), GFP_NOFS); + if (!async) + return -ENOMEM; + + async->inode = inode; + async->rw = rw; + async->bio = bio; + async->mirror_num = mirror_num; + async->submit_bio_start = submit_bio_start; + async->submit_bio_done = submit_bio_done; + + async->work.func = run_one_async_start; + async->work.ordered_func = run_one_async_done; + async->work.ordered_free = run_one_async_free; + + async->work.flags = 0; + async->bio_flags = bio_flags; + + atomic_inc(&fs_info->nr_async_submits); + btrfs_queue_worker(&fs_info->workers, &async->work); +#if 0 + int limit = btrfs_async_submit_limit(fs_info); + if (atomic_read(&fs_info->nr_async_submits) > limit) { + wait_event_timeout(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) < limit), + HZ/10); + + wait_event_timeout(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_bios) < limit), + HZ/10); + } +#endif + while (atomic_read(&fs_info->async_submit_draining) && + atomic_read(&fs_info->nr_async_submits)) { + wait_event(fs_info->async_submit_wait, + (atomic_read(&fs_info->nr_async_submits) == 0)); + } + + return 0; +} + +static int btree_csum_one_bio(struct bio *bio) +{ + struct bio_vec *bvec 
= bio->bi_io_vec; + int bio_index = 0; + struct btrfs_root *root; + + WARN_ON(bio->bi_vcnt <= 0); + while (bio_index < bio->bi_vcnt) { + root = BTRFS_I(bvec->bv_page->mapping->host)->root; + csum_dirty_buffer(root, bvec->bv_page); + bio_index++; + bvec++; + } + return 0; +} + +static int __btree_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + btree_csum_one_bio(bio); + return 0; +} + +static int __btree_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + /* + * when we're called for a write, we're already in the async + * submission context. Just jump into btrfs_map_bio + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, mirror_num, 1); +} + +static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + int ret; + + ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, + bio, 1); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + /* + * called for a read, do the setup so that checksum validation + * can happen in the async kernel threads + */ + return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, + mirror_num, 0); + } + /* + * kthread helpers are used to submit writes so that checksumming + * can happen in parallel across all CPUs + */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, 0, + __btree_submit_bio_start, + __btree_submit_bio_done); +} + +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + return extent_write_full_page(tree, page, btree_get_extent, wbc); +} + +static int btree_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + if (wbc->sync_mode == WB_SYNC_NONE) { + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 32 * 1024 * 1024; + + if (wbc->for_kupdate) + return 0; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty < thresh) + return 0; + } + return extent_writepages(tree, mapping, btree_get_extent, wbc); +} + +static int btree_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btree_get_extent); +} + +static int btree_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + if (PageWriteback(page) || PageDirty(page)) + return 0; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + + ret = try_release_extent_state(map, tree, page, gfp_flags); + if (!ret) + return 0; + + ret = try_release_extent_buffer(tree, page); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + + return ret; +} + +static void btree_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + extent_invalidatepage(tree, page, offset); + btree_releasepage(page, GFP_NOFS); + if (PagePrivate(page)) { + 
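		/*
		 * btree_releasepage() above normally detaches the extent
		 * buffer state; if page->private is still set here, warn
		 * and drop the leftover private reference ourselves.
		 */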
printk(KERN_WARNING "btrfs warning page private not zero " + "on page %llu\n", (unsigned long long)page_offset(page)); + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +#if 0 +static int btree_writepage(struct page *page, struct writeback_control *wbc) +{ + struct buffer_head *bh; + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct buffer_head *head; + if (!page_has_buffers(page)) { + create_empty_buffers(page, root->fs_info->sb->s_blocksize, + (1 << BH_Dirty)|(1 << BH_Uptodate)); + } + head = page_buffers(page); + bh = head; + do { + if (buffer_dirty(bh)) + csum_tree_block(root, bh, 0); + bh = bh->b_this_page; + } while (bh != head); + return block_write_full_page(page, btree_get_block, wbc); +} +#endif + +static struct address_space_operations btree_aops = { + .readpage = btree_readpage, + .writepage = btree_writepage, + .writepages = btree_writepages, + .releasepage = btree_releasepage, + .invalidatepage = btree_invalidatepage, + .sync_page = block_sync_page, +}; + +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + int ret = 0; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return 0; + read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, + buf, 0, 0, btree_get_extent, 0); + free_extent_buffer(buf); + return ret; +} + +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + eb = find_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, GFP_NOFS); + return eb; +} + +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_buffer *eb; + + eb = alloc_extent_buffer(&BTRFS_I(btree_inode)->io_tree, + bytenr, blocksize, NULL, GFP_NOFS); + return eb; +} + + +int btrfs_write_tree_block(struct extent_buffer *buf) +{ + return btrfs_fdatawrite_range(buf->first_page->mapping, buf->start, + buf->start + buf->len - 1, WB_SYNC_ALL); +} + +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) +{ + return btrfs_wait_on_page_writeback_range(buf->first_page->mapping, + buf->start, buf->start + buf->len - 1); +} + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid) +{ + struct extent_buffer *buf = NULL; + struct inode *btree_inode = root->fs_info->btree_inode; + struct extent_io_tree *io_tree; + int ret; + + io_tree = &BTRFS_I(btree_inode)->io_tree; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return NULL; + + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + + if (ret == 0) + buf->flags |= EXTENT_UPTODATE; + else + WARN_ON(1); + return buf; + +} + +int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf) +{ + struct inode *btree_inode = root->fs_info->btree_inode; + if (btrfs_header_generation(buf) == + root->fs_info->running_transaction->transid) { + WARN_ON(!btrfs_tree_locked(buf)); + clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, + buf); + } + return 0; +} + +static int __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, + u32 stripesize, struct btrfs_root *root, + struct btrfs_fs_info *fs_info, + u64 
objectid) +{ + root->node = NULL; + root->commit_root = NULL; + root->ref_tree = NULL; + root->sectorsize = sectorsize; + root->nodesize = nodesize; + root->leafsize = leafsize; + root->stripesize = stripesize; + root->ref_cows = 0; + root->track_dirty = 0; + + root->fs_info = fs_info; + root->objectid = objectid; + root->last_trans = 0; + root->highest_inode = 0; + root->last_inode_alloc = 0; + root->name = NULL; + root->in_sysfs = 0; + + INIT_LIST_HEAD(&root->dirty_list); + INIT_LIST_HEAD(&root->orphan_list); + INIT_LIST_HEAD(&root->dead_list); + spin_lock_init(&root->node_lock); + spin_lock_init(&root->list_lock); + mutex_init(&root->objectid_mutex); + mutex_init(&root->log_mutex); + extent_io_tree_init(&root->dirty_log_pages, + fs_info->btree_inode->i_mapping, GFP_NOFS); + + btrfs_leaf_ref_tree_init(&root->ref_tree_struct); + root->ref_tree = &root->ref_tree_struct; + + memset(&root->root_key, 0, sizeof(root->root_key)); + memset(&root->root_item, 0, sizeof(root->root_item)); + memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); + memset(&root->root_kobj, 0, sizeof(root->root_kobj)); + root->defrag_trans_start = fs_info->generation; + init_completion(&root->kobj_unregister); + root->defrag_running = 0; + root->defrag_level = 0; + root->root_key.objectid = objectid; + root->anon_super.s_root = NULL; + root->anon_super.s_dev = 0; + INIT_LIST_HEAD(&root->anon_super.s_list); + INIT_LIST_HEAD(&root->anon_super.s_instances); + init_rwsem(&root->anon_super.s_umount); + + return 0; +} + +static int find_and_setup_root(struct btrfs_root *tree_root, + struct btrfs_fs_info *fs_info, + u64 objectid, + struct btrfs_root *root) +{ + int ret; + u32 blocksize; + u64 generation; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, objectid); + ret = btrfs_find_last_root(tree_root, objectid, + &root->root_item, &root->root_key); + BUG_ON(ret); + + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + BUG_ON(!root->node); + return 0; +} + +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct extent_buffer *eb; + struct btrfs_root *log_root_tree = fs_info->log_root_tree; + u64 start = 0; + u64 end = 0; + int ret; + + if (!log_root_tree) + return 0; + + while (1) { + ret = find_first_extent_bit(&log_root_tree->dirty_log_pages, + 0, &start, &end, EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(&log_root_tree->dirty_log_pages, + start, end, GFP_NOFS); + } + eb = fs_info->log_root_tree->node; + + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) != 0); + + ret = btrfs_free_reserved_extent(fs_info->tree_root, + eb->start, eb->len); + BUG_ON(ret); + + free_extent_buffer(eb); + kfree(fs_info->log_root_tree); + fs_info->log_root_tree = NULL; + return 0; +} + +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_root *root; + struct btrfs_root *tree_root = fs_info->tree_root; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return -ENOMEM; + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; + root->root_key.type = BTRFS_ROOT_ITEM_KEY; + 
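	/*
	 * the key offset repeats the tree-log objectid, so the log root's
	 * key ends up as (TREE_LOG, ROOT_ITEM, TREE_LOG)
	 */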
root->root_key.offset = BTRFS_TREE_LOG_OBJECTID; + root->ref_cows = 0; + + root->node = btrfs_alloc_free_block(trans, root, root->leafsize, + 0, BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + + btrfs_set_header_nritems(root->node, 0); + btrfs_set_header_level(root->node, 0); + btrfs_set_header_bytenr(root->node, root->node->start); + btrfs_set_header_generation(root->node, trans->transid); + btrfs_set_header_owner(root->node, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(root->node, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(root->node), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(root->node); + btrfs_tree_unlock(root->node); + fs_info->log_root_tree = root; + return 0; +} + +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location) +{ + struct btrfs_root *root; + struct btrfs_fs_info *fs_info = tree_root->fs_info; + struct btrfs_path *path; + struct extent_buffer *l; + u64 highest_inode; + u64 generation; + u32 blocksize; + int ret = 0; + + root = kzalloc(sizeof(*root), GFP_NOFS); + if (!root) + return ERR_PTR(-ENOMEM); + if (location->offset == (u64)-1) { + ret = find_and_setup_root(tree_root, fs_info, + location->objectid, root); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + goto insert; + } + + __setup_root(tree_root->nodesize, tree_root->leafsize, + tree_root->sectorsize, tree_root->stripesize, + root, fs_info, location->objectid); + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, tree_root, location, path, 0, 0); + if (ret != 0) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + l = path->nodes[0]; + read_extent_buffer(l, &root->root_item, + btrfs_item_ptr_offset(l, path->slots[0]), + sizeof(root->root_item)); + memcpy(&root->root_key, location, sizeof(*location)); + ret = 0; +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + if (ret) { + kfree(root); + return ERR_PTR(ret); + } + generation = btrfs_root_generation(&root->root_item); + blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); + root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), + blocksize, generation); + BUG_ON(!root->node); +insert: + if (location->objectid != BTRFS_TREE_LOG_OBJECTID) { + root->ref_cows = 1; + ret = btrfs_find_highest_inode(root, &highest_inode); + if (ret == 0) { + root->highest_inode = highest_inode; + root->last_inode_alloc = highest_inode; + } + } + return root; +} + +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid) +{ + struct btrfs_root *root; + + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)root_objectid); + return root; +} + +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location) +{ + struct btrfs_root *root; + int ret; + + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return fs_info->tree_root; + if (location->objectid == BTRFS_EXTENT_TREE_OBJECTID) + return fs_info->extent_root; + if (location->objectid == BTRFS_CHUNK_TREE_OBJECTID) + return fs_info->chunk_root; + if (location->objectid == BTRFS_DEV_TREE_OBJECTID) + return fs_info->dev_root; + if (location->objectid == BTRFS_CSUM_TREE_OBJECTID) + return fs_info->csum_root; + + root = radix_tree_lookup(&fs_info->fs_roots_radix, + (unsigned long)location->objectid); + if (root) + return root; 
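	/*
	 * not in the radix tree yet: read the root from the tree of tree
	 * roots, give it an anonymous super and insert it into
	 * fs_roots_radix so the lookup above hits next time.  For
	 * writable mounts, dead roots and orphans are cleaned up here too.
	 */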
+ + root = btrfs_read_fs_root_no_radix(fs_info->tree_root, location); + if (IS_ERR(root)) + return root; + + set_anon_super(&root->anon_super, NULL); + + ret = radix_tree_insert(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + root); + if (ret) { + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_find_dead_roots(fs_info->tree_root, + root->root_key.objectid, root); + BUG_ON(ret); + btrfs_orphan_cleanup(root); + } + return root; +} + +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen) +{ + struct btrfs_root *root; + int ret; + + root = btrfs_read_fs_root_no_name(fs_info, location); + if (!root) + return NULL; + + if (root->in_sysfs) + return root; + + ret = btrfs_set_root_name(root, name, namelen); + if (ret) { + free_extent_buffer(root->node); + kfree(root); + return ERR_PTR(ret); + } +#if 0 + ret = btrfs_sysfs_add_root(root); + if (ret) { + free_extent_buffer(root->node); + kfree(root->name); + kfree(root); + return ERR_PTR(ret); + } +#endif + root->in_sysfs = 1; + return root; +} + +static int btrfs_congested_fn(void *congested_data, int bdi_bits) +{ + struct btrfs_fs_info *info = (struct btrfs_fs_info *)congested_data; + int ret = 0; + struct list_head *cur; + struct btrfs_device *device; + struct backing_dev_info *bdi; +#if 0 + if ((bdi_bits & (1 << BDI_write_congested)) && + btrfs_congested_async(info, 0)) + return 1; +#endif + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi && bdi_congested(bdi, bdi_bits)) { + ret = 1; + break; + } + } + return ret; +} + +/* + * this unplugs every device on the box, and it is only used when page + * is null + */ +static void __unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct list_head *cur; + struct btrfs_device *device; + struct btrfs_fs_info *info; + + info = (struct btrfs_fs_info *)bdi->unplug_io_data; + list_for_each(cur, &info->fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (!device->bdev) + continue; + + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, page); + } +} + +static void btrfs_unplug_io_fn(struct backing_dev_info *bdi, struct page *page) +{ + struct inode *inode; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct address_space *mapping; + u64 offset; + + /* the generic O_DIRECT read code does this */ + if (1 || !page) { + __unplug_io_fn(bdi, page); + return; + } + + /* + * page->mapping may change at any time. 
Get a consistent copy + * and use that for everything below + */ + smp_mb(); + mapping = page->mapping; + if (!mapping) + return; + + inode = mapping->host; + + /* + * don't do the expensive searching for a small number of + * devices + */ + if (BTRFS_I(inode)->root->fs_info->fs_devices->open_devices <= 2) { + __unplug_io_fn(bdi, page); + return; + } + + offset = page_offset(page); + + em_tree = &BTRFS_I(inode)->extent_tree; + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, offset, PAGE_CACHE_SIZE); + spin_unlock(&em_tree->lock); + if (!em) { + __unplug_io_fn(bdi, page); + return; + } + + if (em->block_start >= EXTENT_MAP_LAST_BYTE) { + free_extent_map(em); + __unplug_io_fn(bdi, page); + return; + } + offset = offset - em->start; + btrfs_unplug_page(&BTRFS_I(inode)->root->fs_info->mapping_tree, + em->block_start + offset, page); + free_extent_map(em); +} + +static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) +{ + bdi_init(bdi); + bdi->ra_pages = default_backing_dev_info.ra_pages; + bdi->state = 0; + bdi->capabilities = default_backing_dev_info.capabilities; + bdi->unplug_io_fn = btrfs_unplug_io_fn; + bdi->unplug_io_data = info; + bdi->congested_fn = btrfs_congested_fn; + bdi->congested_data = info; + return 0; +} + +static int bio_ready_for_csum(struct bio *bio) +{ + u64 length = 0; + u64 buf_len = 0; + u64 start = 0; + struct page *page; + struct extent_io_tree *io_tree = NULL; + struct btrfs_fs_info *info = NULL; + struct bio_vec *bvec; + int i; + int ret; + + bio_for_each_segment(bvec, bio, i) { + page = bvec->bv_page; + if (page->private == EXTENT_PAGE_PRIVATE) { + length += bvec->bv_len; + continue; + } + if (!page->private) { + length += bvec->bv_len; + continue; + } + length = bvec->bv_len; + buf_len = page->private >> 2; + start = page_offset(page) + bvec->bv_offset; + io_tree = &BTRFS_I(page->mapping->host)->io_tree; + info = BTRFS_I(page->mapping->host)->root->fs_info; + } + /* are we fully contained in this bio? */ + if (buf_len <= length) + return 1; + + ret = extent_range_uptodate(io_tree, start + length, + start + buf_len - 1); + if (ret == 1) + return ret; + return ret; +} + +/* + * called by the kthread helper functions to finally call the bio end_io + * functions. This is where read checksum verification actually happens + */ +static void end_workqueue_fn(struct btrfs_work *work) +{ + struct bio *bio; + struct end_io_wq *end_io_wq; + struct btrfs_fs_info *fs_info; + int error; + + end_io_wq = container_of(work, struct end_io_wq, work); + bio = end_io_wq->bio; + fs_info = end_io_wq->info; + + /* metadata bio reads are special because the whole tree block must + * be checksummed at once. This makes sure the entire block is in + * ram and up to date before trying to verify things. 
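 * If bio_ready_for_csum() finds that only part of the block has
 * completed so far, the work item is just requeued on the
 * endio_meta_workers pool and retried later.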
For + * blocksize <= pagesize, it is basically a noop + */ + if (!(bio->bi_rw & (1 << BIO_RW)) && end_io_wq->metadata && + !bio_ready_for_csum(bio)) { + btrfs_queue_worker(&fs_info->endio_meta_workers, + &end_io_wq->work); + return; + } + error = end_io_wq->error; + bio->bi_private = end_io_wq->private; + bio->bi_end_io = end_io_wq->end_io; + kfree(end_io_wq); + bio_endio(bio, error); +} + +static int cleaner_kthread(void *arg) +{ + struct btrfs_root *root = arg; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + smp_mb(); + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule(); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +static int transaction_kthread(void *arg) +{ + struct btrfs_root *root = arg; + struct btrfs_trans_handle *trans; + struct btrfs_transaction *cur; + unsigned long now; + unsigned long delay; + int ret; + + do { + smp_mb(); + if (root->fs_info->closing) + break; + + delay = HZ * 30; + vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + + if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) { + printk(KERN_INFO "btrfs: total reference cache " + "size %llu\n", + root->fs_info->total_ref_cache_size); + } + + mutex_lock(&root->fs_info->trans_mutex); + cur = root->fs_info->running_transaction; + if (!cur) { + mutex_unlock(&root->fs_info->trans_mutex); + goto sleep; + } + + now = get_seconds(); + if (now < cur->start_time || now - cur->start_time < 30) { + mutex_unlock(&root->fs_info->trans_mutex); + delay = HZ * 5; + goto sleep; + } + mutex_unlock(&root->fs_info->trans_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); +sleep: + wake_up_process(root->fs_info->cleaner_kthread); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); + + if (freezing(current)) { + refrigerator(); + } else { + if (root->fs_info->closing) + break; + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(delay); + __set_current_state(TASK_RUNNING); + } + } while (!kthread_should_stop()); + return 0; +} + +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options) +{ + u32 sectorsize; + u32 nodesize; + u32 leafsize; + u32 blocksize; + u32 stripesize; + u64 generation; + u64 features; + struct btrfs_key location; + struct buffer_head *bh; + struct btrfs_root *extent_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *csum_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_fs_info *fs_info = kzalloc(sizeof(*fs_info), + GFP_NOFS); + struct btrfs_root *chunk_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *dev_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + struct btrfs_root *log_tree_root; + + int ret; + int err = -EINVAL; + + struct btrfs_super_block *disk_super; + + if (!extent_root || !tree_root || !fs_info || + !chunk_root || !dev_root || !csum_root) { + err = -ENOMEM; + goto fail; + } + INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_NOFS); + INIT_LIST_HEAD(&fs_info->trans_list); + INIT_LIST_HEAD(&fs_info->dead_roots); + 
INIT_LIST_HEAD(&fs_info->hashers); + INIT_LIST_HEAD(&fs_info->delalloc_inodes); + spin_lock_init(&fs_info->hash_lock); + spin_lock_init(&fs_info->delalloc_lock); + spin_lock_init(&fs_info->new_trans_lock); + spin_lock_init(&fs_info->ref_cache_lock); + + init_completion(&fs_info->kobj_unregister); + fs_info->tree_root = tree_root; + fs_info->extent_root = extent_root; + fs_info->csum_root = csum_root; + fs_info->chunk_root = chunk_root; + fs_info->dev_root = dev_root; + fs_info->fs_devices = fs_devices; + INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); + INIT_LIST_HEAD(&fs_info->space_info); + btrfs_mapping_init(&fs_info->mapping_tree); + atomic_set(&fs_info->nr_async_submits, 0); + atomic_set(&fs_info->async_delalloc_pages, 0); + atomic_set(&fs_info->async_submit_draining, 0); + atomic_set(&fs_info->nr_async_bios, 0); + atomic_set(&fs_info->throttles, 0); + atomic_set(&fs_info->throttle_gen, 0); + fs_info->sb = sb; + fs_info->max_extent = (u64)-1; + fs_info->max_inline = 8192 * 1024; + setup_bdi(fs_info, &fs_info->bdi); + fs_info->btree_inode = new_inode(sb); + fs_info->btree_inode->i_ino = 1; + fs_info->btree_inode->i_nlink = 1; + + fs_info->thread_pool_size = min_t(unsigned long, + num_online_cpus() + 2, 8); + + INIT_LIST_HEAD(&fs_info->ordered_extents); + spin_lock_init(&fs_info->ordered_extent_lock); + + sb->s_blocksize = 4096; + sb->s_blocksize_bits = blksize_bits(4096); + + /* + * we set the i_size on the btree inode to the max possible int. + * the real end of the address space is determined by all of + * the devices in the system + */ + fs_info->btree_inode->i_size = OFFSET_MAX; + fs_info->btree_inode->i_mapping->a_ops = &btree_aops; + fs_info->btree_inode->i_mapping->backing_dev_info = &fs_info->bdi; + + extent_io_tree_init(&BTRFS_I(fs_info->btree_inode)->io_tree, + fs_info->btree_inode->i_mapping, + GFP_NOFS); + extent_map_tree_init(&BTRFS_I(fs_info->btree_inode)->extent_tree, + GFP_NOFS); + + BTRFS_I(fs_info->btree_inode)->io_tree.ops = &btree_extent_io_ops; + + spin_lock_init(&fs_info->block_group_cache_lock); + fs_info->block_group_cache_tree.rb_node = NULL; + + extent_io_tree_init(&fs_info->pinned_extents, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->pending_del, + fs_info->btree_inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&fs_info->extent_ins, + fs_info->btree_inode->i_mapping, GFP_NOFS); + fs_info->do_barriers = 1; + + INIT_LIST_HEAD(&fs_info->dead_reloc_roots); + btrfs_leaf_ref_tree_init(&fs_info->reloc_ref_tree); + btrfs_leaf_ref_tree_init(&fs_info->shared_ref_tree); + + BTRFS_I(fs_info->btree_inode)->root = tree_root; + memset(&BTRFS_I(fs_info->btree_inode)->location, 0, + sizeof(struct btrfs_key)); + insert_inode_hash(fs_info->btree_inode); + + mutex_init(&fs_info->trans_mutex); + mutex_init(&fs_info->tree_log_mutex); + mutex_init(&fs_info->drop_mutex); + mutex_init(&fs_info->extent_ins_mutex); + mutex_init(&fs_info->pinned_mutex); + mutex_init(&fs_info->chunk_mutex); + mutex_init(&fs_info->transaction_kthread_mutex); + mutex_init(&fs_info->cleaner_mutex); + mutex_init(&fs_info->volume_mutex); + mutex_init(&fs_info->tree_reloc_mutex); + init_waitqueue_head(&fs_info->transaction_throttle); + init_waitqueue_head(&fs_info->transaction_wait); + init_waitqueue_head(&fs_info->async_submit_wait); + init_waitqueue_head(&fs_info->tree_log_wait); + atomic_set(&fs_info->tree_log_commit, 0); + atomic_set(&fs_info->tree_log_writers, 0); + fs_info->tree_log_transid = 0; + + __setup_root(4096, 4096, 4096, 4096, tree_root, + fs_info, 
BTRFS_ROOT_TREE_OBJECTID); + + + bh = btrfs_read_dev_super(fs_devices->latest_bdev); + if (!bh) + goto fail_iput; + + memcpy(&fs_info->super_copy, bh->b_data, sizeof(fs_info->super_copy)); + memcpy(&fs_info->super_for_commit, &fs_info->super_copy, + sizeof(fs_info->super_for_commit)); + brelse(bh); + + memcpy(fs_info->fsid, fs_info->super_copy.fsid, BTRFS_FSID_SIZE); + + disk_super = &fs_info->super_copy; + if (!btrfs_super_root(disk_super)) + goto fail_iput; + + ret = btrfs_parse_options(tree_root, options); + if (ret) { + err = ret; + goto fail_iput; + } + + features = btrfs_super_incompat_flags(disk_super) & + ~BTRFS_FEATURE_INCOMPAT_SUPP; + if (features) { + printk(KERN_ERR "BTRFS: couldn't mount because of " + "unsupported optional features (%Lx).\n", + features); + err = -EINVAL; + goto fail_iput; + } + + features = btrfs_super_compat_ro_flags(disk_super) & + ~BTRFS_FEATURE_COMPAT_RO_SUPP; + if (!(sb->s_flags & MS_RDONLY) && features) { + printk(KERN_ERR "BTRFS: couldn't mount RDWR because of " + "unsupported option features (%Lx).\n", + features); + err = -EINVAL; + goto fail_iput; + } + + /* + * we need to start all the end_io workers up front because the + * queue work function gets called at interrupt time, and so it + * cannot dynamically grow. + */ + btrfs_init_workers(&fs_info->workers, "worker", + fs_info->thread_pool_size); + + btrfs_init_workers(&fs_info->delalloc_workers, "delalloc", + fs_info->thread_pool_size); + + btrfs_init_workers(&fs_info->submit_workers, "submit", + min_t(u64, fs_devices->num_devices, + fs_info->thread_pool_size)); + + /* a higher idle thresh on the submit workers makes it much more + * likely that bios will be send down in a sane order to the + * devices + */ + fs_info->submit_workers.idle_thresh = 64; + + fs_info->workers.idle_thresh = 16; + fs_info->workers.ordered = 1; + + fs_info->delalloc_workers.idle_thresh = 2; + fs_info->delalloc_workers.ordered = 1; + + btrfs_init_workers(&fs_info->fixup_workers, "fixup", 1); + btrfs_init_workers(&fs_info->endio_workers, "endio", + fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_meta_workers, "endio-meta", + fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_meta_write_workers, + "endio-meta-write", fs_info->thread_pool_size); + btrfs_init_workers(&fs_info->endio_write_workers, "endio-write", + fs_info->thread_pool_size); + + /* + * endios are largely parallel and should have a very + * low idle thresh + */ + fs_info->endio_workers.idle_thresh = 4; + fs_info->endio_write_workers.idle_thresh = 64; + fs_info->endio_meta_write_workers.idle_thresh = 64; + + btrfs_start_workers(&fs_info->workers, 1); + btrfs_start_workers(&fs_info->submit_workers, 1); + btrfs_start_workers(&fs_info->delalloc_workers, 1); + btrfs_start_workers(&fs_info->fixup_workers, 1); + btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_meta_workers, + fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_meta_write_workers, + fs_info->thread_pool_size); + btrfs_start_workers(&fs_info->endio_write_workers, + fs_info->thread_pool_size); + + fs_info->bdi.ra_pages *= btrfs_super_num_devices(disk_super); + fs_info->bdi.ra_pages = max(fs_info->bdi.ra_pages, + 4 * 1024 * 1024 / PAGE_CACHE_SIZE); + + nodesize = btrfs_super_nodesize(disk_super); + leafsize = btrfs_super_leafsize(disk_super); + sectorsize = btrfs_super_sectorsize(disk_super); + stripesize = btrfs_super_stripesize(disk_super); + tree_root->nodesize = nodesize; + tree_root->leafsize = 
leafsize; + tree_root->sectorsize = sectorsize; + tree_root->stripesize = stripesize; + + sb->s_blocksize = sectorsize; + sb->s_blocksize_bits = blksize_bits(sectorsize); + + if (strncmp((char *)(&disk_super->magic), BTRFS_MAGIC, + sizeof(disk_super->magic))) { + printk(KERN_INFO "btrfs: valid FS not found on %s\n", sb->s_id); + goto fail_sb_buffer; + } + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_sys_array(tree_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read the system " + "array on %s\n", sb->s_id); + goto fail_sys_array; + } + + blocksize = btrfs_level_size(tree_root, + btrfs_super_chunk_root_level(disk_super)); + generation = btrfs_super_chunk_root_generation(disk_super); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + + chunk_root->node = read_tree_block(chunk_root, + btrfs_super_chunk_root(disk_super), + blocksize, generation); + BUG_ON(!chunk_root->node); + + read_extent_buffer(chunk_root->node, fs_info->chunk_tree_uuid, + (unsigned long)btrfs_header_chunk_tree_uuid(chunk_root->node), + BTRFS_UUID_SIZE); + + mutex_lock(&fs_info->chunk_mutex); + ret = btrfs_read_chunk_tree(chunk_root); + mutex_unlock(&fs_info->chunk_mutex); + if (ret) { + printk(KERN_WARNING "btrfs: failed to read chunk tree on %s\n", + sb->s_id); + goto fail_chunk_root; + } + + btrfs_close_extra_devices(fs_devices); + + blocksize = btrfs_level_size(tree_root, + btrfs_super_root_level(disk_super)); + generation = btrfs_super_generation(disk_super); + + tree_root->node = read_tree_block(tree_root, + btrfs_super_root(disk_super), + blocksize, generation); + if (!tree_root->node) + goto fail_chunk_root; + + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_EXTENT_TREE_OBJECTID, extent_root); + if (ret) + goto fail_tree_root; + extent_root->track_dirty = 1; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_DEV_TREE_OBJECTID, dev_root); + dev_root->track_dirty = 1; + + if (ret) + goto fail_extent_root; + + ret = find_and_setup_root(tree_root, fs_info, + BTRFS_CSUM_TREE_OBJECTID, csum_root); + if (ret) + goto fail_extent_root; + + csum_root->track_dirty = 1; + + btrfs_read_block_groups(extent_root); + + fs_info->generation = generation; + fs_info->last_trans_committed = generation; + fs_info->data_alloc_profile = (u64)-1; + fs_info->metadata_alloc_profile = (u64)-1; + fs_info->system_alloc_profile = fs_info->metadata_alloc_profile; + fs_info->cleaner_kthread = kthread_run(cleaner_kthread, tree_root, + "btrfs-cleaner"); + if (!fs_info->cleaner_kthread) + goto fail_csum_root; + + fs_info->transaction_kthread = kthread_run(transaction_kthread, + tree_root, + "btrfs-transaction"); + if (!fs_info->transaction_kthread) + goto fail_cleaner; + + if (btrfs_super_log_root(disk_super) != 0) { + u64 bytenr = btrfs_super_log_root(disk_super); + + if (fs_devices->rw_devices == 0) { + printk(KERN_WARNING "Btrfs log replay required " + "on RO media\n"); + err = -EIO; + goto fail_trans_kthread; + } + blocksize = + btrfs_level_size(tree_root, + btrfs_super_log_root_level(disk_super)); + + log_tree_root = kzalloc(sizeof(struct btrfs_root), + GFP_NOFS); + + __setup_root(nodesize, leafsize, sectorsize, stripesize, + log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); + + log_tree_root->node = read_tree_block(tree_root, bytenr, + blocksize, + generation + 1); + ret = btrfs_recover_log_trees(log_tree_root); + BUG_ON(ret); + + if (sb->s_flags & MS_RDONLY) { + ret = btrfs_commit_super(tree_root); + 
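+ /*
+ * log replay has modified the trees; a read-only mount will never
+ * commit a transaction later, so the commit above writes the
+ * replayed state out now.
+ */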
BUG_ON(ret); + } + } + + if (!(sb->s_flags & MS_RDONLY)) { + ret = btrfs_cleanup_reloc_trees(tree_root); + BUG_ON(ret); + } + + location.objectid = BTRFS_FS_TREE_OBJECTID; + location.type = BTRFS_ROOT_ITEM_KEY; + location.offset = (u64)-1; + + fs_info->fs_root = btrfs_read_fs_root_no_name(fs_info, &location); + if (!fs_info->fs_root) + goto fail_trans_kthread; + return tree_root; + +fail_trans_kthread: + kthread_stop(fs_info->transaction_kthread); +fail_cleaner: + kthread_stop(fs_info->cleaner_kthread); + + /* + * make sure we're done with the btree inode before we stop our + * kthreads + */ + filemap_write_and_wait(fs_info->btree_inode->i_mapping); + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + +fail_csum_root: + free_extent_buffer(csum_root->node); +fail_extent_root: + free_extent_buffer(extent_root->node); +fail_tree_root: + free_extent_buffer(tree_root->node); +fail_chunk_root: + free_extent_buffer(chunk_root->node); +fail_sys_array: + free_extent_buffer(dev_root->node); +fail_sb_buffer: + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); +fail_iput: + invalidate_inode_pages2(fs_info->btree_inode->i_mapping); + iput(fs_info->btree_inode); +fail: + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + kfree(extent_root); + kfree(tree_root); + bdi_destroy(&fs_info->bdi); + kfree(fs_info); + kfree(chunk_root); + kfree(dev_root); + kfree(csum_root); + return ERR_PTR(err); +} + +static void btrfs_end_buffer_write_sync(struct buffer_head *bh, int uptodate) +{ + char b[BDEVNAME_SIZE]; + + if (uptodate) { + set_buffer_uptodate(bh); + } else { + if (!buffer_eopnotsupp(bh) && printk_ratelimit()) { + printk(KERN_WARNING "lost page write due to " + "I/O error on %s\n", + bdevname(bh->b_bdev, b)); + } + /* note, we dont' set_buffer_write_io_error because we have + * our own ways of dealing with the IO errors + */ + clear_buffer_uptodate(bh); + } + unlock_buffer(bh); + put_bh(bh); +} + +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev) +{ + struct buffer_head *bh; + struct buffer_head *latest = NULL; + struct btrfs_super_block *super; + int i; + u64 transid = 0; + u64 bytenr; + + /* we would like to check all the supers, but that would make + * a btrfs mount succeed after a mkfs from a different FS. 
+ * So, we need to add a special mount option to scan for + * later supers, using BTRFS_SUPER_MIRROR_MAX instead + */ + for (i = 0; i < 1; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + 4096 >= i_size_read(bdev->bd_inode)) + break; + bh = __bread(bdev, bytenr / 4096, 4096); + if (!bh) + continue; + + super = (struct btrfs_super_block *)bh->b_data; + if (btrfs_super_bytenr(super) != bytenr || + strncmp((char *)(&super->magic), BTRFS_MAGIC, + sizeof(super->magic))) { + brelse(bh); + continue; + } + + if (!latest || btrfs_super_generation(super) > transid) { + brelse(latest); + latest = bh; + transid = btrfs_super_generation(super); + } else { + brelse(bh); + } + } + return latest; +} + +static int write_dev_supers(struct btrfs_device *device, + struct btrfs_super_block *sb, + int do_barriers, int wait, int max_mirrors) +{ + struct buffer_head *bh; + int i; + int ret; + int errors = 0; + u32 crc; + u64 bytenr; + int last_barrier = 0; + + if (max_mirrors == 0) + max_mirrors = BTRFS_SUPER_MIRROR_MAX; + + /* make sure only the last submit_bh does a barrier */ + if (do_barriers) { + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->total_bytes) + break; + last_barrier = i; + } + } + + for (i = 0; i < max_mirrors; i++) { + bytenr = btrfs_sb_offset(i); + if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + break; + + if (wait) { + bh = __find_get_block(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + BUG_ON(!bh); + brelse(bh); + wait_on_buffer(bh); + if (buffer_uptodate(bh)) { + brelse(bh); + continue; + } + } else { + btrfs_set_super_bytenr(sb, bytenr); + + crc = ~(u32)0; + crc = btrfs_csum_data(NULL, (char *)sb + + BTRFS_CSUM_SIZE, crc, + BTRFS_SUPER_INFO_SIZE - + BTRFS_CSUM_SIZE); + btrfs_csum_final(crc, sb->csum); + + bh = __getblk(device->bdev, bytenr / 4096, + BTRFS_SUPER_INFO_SIZE); + memcpy(bh->b_data, sb, BTRFS_SUPER_INFO_SIZE); + + set_buffer_uptodate(bh); + get_bh(bh); + lock_buffer(bh); + bh->b_end_io = btrfs_end_buffer_write_sync; + } + + if (i == last_barrier && do_barriers && device->barriers) { + ret = submit_bh(WRITE_BARRIER, bh); + if (ret == -EOPNOTSUPP) { + printk("btrfs: disabling barriers on dev %s\n", + device->name); + set_buffer_uptodate(bh); + device->barriers = 0; + get_bh(bh); + lock_buffer(bh); + ret = submit_bh(WRITE, bh); + } + } else { + ret = submit_bh(WRITE, bh); + } + + if (!ret && wait) { + wait_on_buffer(bh); + if (!buffer_uptodate(bh)) + errors++; + } else if (ret) { + errors++; + } + if (wait) + brelse(bh); + } + return errors < i ? 
0 : -1; +} + +int write_all_supers(struct btrfs_root *root, int max_mirrors) +{ + struct list_head *cur; + struct list_head *head = &root->fs_info->fs_devices->devices; + struct btrfs_device *dev; + struct btrfs_super_block *sb; + struct btrfs_dev_item *dev_item; + int ret; + int do_barriers; + int max_errors; + int total_errors = 0; + u64 flags; + + max_errors = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + do_barriers = !btrfs_test_opt(root, NOBARRIER); + + sb = &root->fs_info->super_for_commit; + dev_item = &sb->dev_item; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) { + total_errors++; + continue; + } + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + btrfs_set_stack_device_generation(dev_item, 0); + btrfs_set_stack_device_type(dev_item, dev->type); + btrfs_set_stack_device_id(dev_item, dev->devid); + btrfs_set_stack_device_total_bytes(dev_item, dev->total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + btrfs_set_stack_device_io_align(dev_item, dev->io_align); + btrfs_set_stack_device_io_width(dev_item, dev->io_width); + btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); + memcpy(dev_item->uuid, dev->uuid, BTRFS_UUID_SIZE); + memcpy(dev_item->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE); + + flags = btrfs_super_flags(sb); + btrfs_set_super_flags(sb, flags | BTRFS_HEADER_FLAG_WRITTEN); + + ret = write_dev_supers(dev, sb, do_barriers, 0, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + + total_errors = 0; + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (!dev->bdev) + continue; + if (!dev->in_fs_metadata || !dev->writeable) + continue; + + ret = write_dev_supers(dev, sb, do_barriers, 1, max_mirrors); + if (ret) + total_errors++; + } + if (total_errors > max_errors) { + printk(KERN_ERR "btrfs: %d errors while writing supers\n", + total_errors); + BUG(); + } + return 0; +} + +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors) +{ + int ret; + + ret = write_all_supers(root, max_mirrors); + return ret; +} + +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root) +{ + radix_tree_delete(&fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid); + if (root->anon_super.s_dev) { + down_write(&root->anon_super.s_umount); + kill_anon_super(&root->anon_super); + } + if (root->node) + free_extent_buffer(root->node); + if (root->commit_root) + free_extent_buffer(root->commit_root); + kfree(root->name); + kfree(root); + return 0; +} + +static int del_fs_roots(struct btrfs_fs_info *fs_info) +{ + int ret; + struct btrfs_root *gang[8]; + int i; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, 0, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) + btrfs_free_fs_root(fs_info, gang[i]); + } + return 0; +} + +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info) +{ + u64 root_objectid = 0; + struct btrfs_root *gang[8]; + int i; + int ret; + + while (1) { + ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix, + (void **)gang, root_objectid, + ARRAY_SIZE(gang)); + if (!ret) + break; + for (i = 0; i < ret; i++) { + root_objectid = gang[i]->root_key.objectid; + ret = btrfs_find_dead_roots(fs_info->tree_root, + root_objectid, gang[i]); + BUG_ON(ret); + btrfs_orphan_cleanup(gang[i]); + 
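+ /*
+ * root_objectid now holds the objectid of the root just handled;
+ * after this batch it is incremented so the next gang lookup
+ * resumes past everything already processed.
+ */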
} + root_objectid++; + } + return 0; +} + +int btrfs_commit_super(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + int ret; + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(root); + mutex_unlock(&root->fs_info->cleaner_mutex); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + /* run commit again to drop the original snapshot */ + trans = btrfs_start_transaction(root, 1); + btrfs_commit_transaction(trans, root); + ret = btrfs_write_and_wait_transaction(NULL, root); + BUG_ON(ret); + + ret = write_ctree_super(NULL, root, 0); + return ret; +} + +int close_ctree(struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + int ret; + + fs_info->closing = 1; + smp_mb(); + + kthread_stop(root->fs_info->transaction_kthread); + kthread_stop(root->fs_info->cleaner_kthread); + + if (!(fs_info->sb->s_flags & MS_RDONLY)) { + ret = btrfs_commit_super(root); + if (ret) + printk(KERN_ERR "btrfs: commit super ret %d\n", ret); + } + + if (fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs: at unmount delalloc count %llu\n", + fs_info->delalloc_bytes); + } + if (fs_info->total_ref_cache_size) { + printk(KERN_INFO "btrfs: at umount reference cache size %llu\n", + (unsigned long long)fs_info->total_ref_cache_size); + } + + if (fs_info->extent_root->node) + free_extent_buffer(fs_info->extent_root->node); + + if (fs_info->tree_root->node) + free_extent_buffer(fs_info->tree_root->node); + + if (root->fs_info->chunk_root->node) + free_extent_buffer(root->fs_info->chunk_root->node); + + if (root->fs_info->dev_root->node) + free_extent_buffer(root->fs_info->dev_root->node); + + if (root->fs_info->csum_root->node) + free_extent_buffer(root->fs_info->csum_root->node); + + btrfs_free_block_groups(root->fs_info); + + del_fs_roots(fs_info); + + iput(fs_info->btree_inode); + + btrfs_stop_workers(&fs_info->fixup_workers); + btrfs_stop_workers(&fs_info->delalloc_workers); + btrfs_stop_workers(&fs_info->workers); + btrfs_stop_workers(&fs_info->endio_workers); + btrfs_stop_workers(&fs_info->endio_meta_workers); + btrfs_stop_workers(&fs_info->endio_meta_write_workers); + btrfs_stop_workers(&fs_info->endio_write_workers); + btrfs_stop_workers(&fs_info->submit_workers); + +#if 0 + while (!list_empty(&fs_info->hashers)) { + struct btrfs_hasher *hasher; + hasher = list_entry(fs_info->hashers.next, struct btrfs_hasher, + hashers); + list_del(&hasher->hashers); + crypto_free_hash(&fs_info->hash_tfm); + kfree(hasher); + } +#endif + btrfs_close_devices(fs_info->fs_devices); + btrfs_mapping_tree_free(&fs_info->mapping_tree); + + bdi_destroy(&fs_info->bdi); + + kfree(fs_info->extent_root); + kfree(fs_info->tree_root); + kfree(fs_info->chunk_root); + kfree(fs_info->dev_root); + kfree(fs_info->csum_root); + return 0; +} + +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid) +{ + int ret; + struct inode *btree_inode = buf->first_page->mapping->host; + + ret = extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, buf); + if (!ret) + return ret; + + ret = verify_parent_transid(&BTRFS_I(btree_inode)->io_tree, buf, + parent_transid); + return !ret; +} + +int btrfs_set_buffer_uptodate(struct extent_buffer *buf) +{ + struct inode *btree_inode = buf->first_page->mapping->host; + return set_extent_buffer_uptodate(&BTRFS_I(btree_inode)->io_tree, + buf); +} + +void btrfs_mark_buffer_dirty(struct extent_buffer *buf) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + u64 transid = 
btrfs_header_generation(buf); + struct inode *btree_inode = root->fs_info->btree_inode; + + WARN_ON(!btrfs_tree_locked(buf)); + if (transid != root->fs_info->generation) { + printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " + "found %llu running %llu\n", + (unsigned long long)buf->start, + (unsigned long long)transid, + (unsigned long long)root->fs_info->generation); + WARN_ON(1); + } + set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf); +} + +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr) +{ + /* + * looks as though older kernels can get into trouble with + * this code, they end up stuck in balance_dirty_pages forever + */ + struct extent_io_tree *tree; + u64 num_dirty; + u64 start = 0; + unsigned long thresh = 32 * 1024 * 1024; + tree = &BTRFS_I(root->fs_info->btree_inode)->io_tree; + + if (current_is_pdflush() || current->flags & PF_MEMALLOC) + return; + + num_dirty = count_range_bits(tree, &start, (u64)-1, + thresh, EXTENT_DIRTY); + if (num_dirty > thresh) { + balance_dirty_pages_ratelimited_nr( + root->fs_info->btree_inode->i_mapping, 1); + } + return; +} + +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) +{ + struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root; + int ret; + ret = btree_read_extent_buffer_pages(root, buf, 0, parent_transid); + if (ret == 0) + buf->flags |= EXTENT_UPTODATE; + return ret; +} + +int btree_lock_page_hook(struct page *page) +{ + struct inode *inode = page->mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_buffer *eb; + unsigned long len; + u64 bytenr = page_offset(page); + + if (page->private == EXTENT_PAGE_PRIVATE) + goto out; + + len = page->private >> 2; + eb = find_extent_buffer(io_tree, bytenr, len, GFP_NOFS); + if (!eb) + goto out; + + btrfs_tree_lock(eb); + spin_lock(&root->fs_info->hash_lock); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); + spin_unlock(&root->fs_info->hash_lock); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); +out: + lock_page(page); + return 0; +} + +static struct extent_io_ops btree_extent_io_ops = { + .write_cache_pages_lock_hook = btree_lock_page_hook, + .readpage_end_io_hook = btree_readpage_end_io_hook, + .submit_bio_hook = btree_submit_bio_hook, + /* note we're sharing with inode.c for the merge bio hook */ + .merge_bio_hook = btrfs_merge_bio_hook, +}; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h new file mode 100644 index 00000000000..c0ff404c31b --- /dev/null +++ b/fs/btrfs/disk-io.h @@ -0,0 +1,102 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __DISKIO__ +#define __DISKIO__ + +#define BTRFS_SUPER_INFO_OFFSET (64 * 1024) +#define BTRFS_SUPER_INFO_SIZE 4096 + +#define BTRFS_SUPER_MIRROR_MAX 3 +#define BTRFS_SUPER_MIRROR_SHIFT 12 + +static inline u64 btrfs_sb_offset(int mirror) +{ + u64 start = 16 * 1024; + if (mirror) + return start << (BTRFS_SUPER_MIRROR_SHIFT * mirror); + return BTRFS_SUPER_INFO_OFFSET; +} + +struct btrfs_device; +struct btrfs_fs_devices; + +struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, + u32 blocksize, u64 parent_transid); +int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, + u64 parent_transid); +struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +int clean_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *buf); +struct btrfs_root *open_ctree(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + char *options); +int close_ctree(struct btrfs_root *root); +int write_ctree_super(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int max_mirrors); +struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); +int btrfs_commit_super(struct btrfs_root *root); +struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, + u64 bytenr, u32 blocksize); +struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, + u64 root_objectid); +struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info, + struct btrfs_key *location, + const char *name, int namelen); +struct btrfs_root *btrfs_read_fs_root_no_radix(struct btrfs_root *tree_root, + struct btrfs_key *location); +struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info, + struct btrfs_key *location); +int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info); +int btrfs_insert_dev_radix(struct btrfs_root *root, + struct block_device *bdev, + u64 device_id, + u64 block_start, + u64 num_blocks); +void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr); +int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); +void btrfs_mark_buffer_dirty(struct extent_buffer *buf); +int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid); +int btrfs_set_buffer_uptodate(struct extent_buffer *buf); +int wait_on_tree_block_writeback(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); +u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len); +void btrfs_csum_final(u32 crc, char *result); +int btrfs_open_device(struct btrfs_device *dev); +int btrfs_verify_block_csum(struct btrfs_root *root, + struct extent_buffer *buf); +int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, + int metadata); +int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, + int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags, + extent_submit_bio_hook_t *submit_bio_start, + extent_submit_bio_hook_t *submit_bio_done); + +int btrfs_congested_async(struct btrfs_fs_info *info, int iodone); +unsigned long btrfs_async_submit_limit(struct btrfs_fs_info *info); +int btrfs_write_tree_block(struct extent_buffer *buf); +int btrfs_wait_tree_block_writeback(struct extent_buffer *buf); +int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info); +int 
btree_lock_page_hook(struct page *page); +#endif diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c new file mode 100644 index 00000000000..85315d2c90d --- /dev/null +++ b/fs/btrfs/export.c @@ -0,0 +1,203 @@ +#include <linux/fs.h> +#include <linux/types.h> +#include "ctree.h" +#include "disk-io.h" +#include "btrfs_inode.h" +#include "print-tree.h" +#include "export.h" +#include "compat.h" + +#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE (offsetof(struct btrfs_fid, \ + parent_root_objectid) / 4) +#define BTRFS_FID_SIZE_CONNECTABLE_ROOT (sizeof(struct btrfs_fid) / 4) + +static int btrfs_encode_fh(struct dentry *dentry, u32 *fh, int *max_len, + int connectable) +{ + struct btrfs_fid *fid = (struct btrfs_fid *)fh; + struct inode *inode = dentry->d_inode; + int len = *max_len; + int type; + + if ((len < BTRFS_FID_SIZE_NON_CONNECTABLE) || + (connectable && len < BTRFS_FID_SIZE_CONNECTABLE)) + return 255; + + len = BTRFS_FID_SIZE_NON_CONNECTABLE; + type = FILEID_BTRFS_WITHOUT_PARENT; + + fid->objectid = BTRFS_I(inode)->location.objectid; + fid->root_objectid = BTRFS_I(inode)->root->objectid; + fid->gen = inode->i_generation; + + if (connectable && !S_ISDIR(inode->i_mode)) { + struct inode *parent; + u64 parent_root_id; + + spin_lock(&dentry->d_lock); + + parent = dentry->d_parent->d_inode; + fid->parent_objectid = BTRFS_I(parent)->location.objectid; + fid->parent_gen = parent->i_generation; + parent_root_id = BTRFS_I(parent)->root->objectid; + + spin_unlock(&dentry->d_lock); + + if (parent_root_id != fid->root_objectid) { + fid->parent_root_objectid = parent_root_id; + len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + type = FILEID_BTRFS_WITH_PARENT_ROOT; + } else { + len = BTRFS_FID_SIZE_CONNECTABLE; + type = FILEID_BTRFS_WITH_PARENT; + } + } + + *max_len = len; + return type; +} + +static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, + u64 root_objectid, u32 generation) +{ + struct btrfs_root *root; + struct inode *inode; + struct btrfs_key key; + + key.objectid = root_objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = (u64)-1; + + root = btrfs_read_fs_root_no_name(btrfs_sb(sb)->fs_info, &key); + if (IS_ERR(root)) + return ERR_CAST(root); + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + inode = btrfs_iget(sb, &key, root, NULL); + if (IS_ERR(inode)) + return (void *)inode; + + if (generation != inode->i_generation) { + iput(inode); + return ERR_PTR(-ESTALE); + } + + return d_obtain_alias(inode); +} + +static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if (fh_type == FILEID_BTRFS_WITH_PARENT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE) + return NULL; + root_objectid = fid->root_objectid; + } else if (fh_type == FILEID_BTRFS_WITH_PARENT_ROOT) { + if (fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return NULL; + root_objectid = fid->parent_root_objectid; + } else + return NULL; + + objectid = fid->parent_objectid; + generation = fid->parent_gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, + int fh_len, int fh_type) +{ + struct btrfs_fid *fid = (struct btrfs_fid *) fh; + u64 objectid, root_objectid; + u32 generation; + + if ((fh_type != 
FILEID_BTRFS_WITH_PARENT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE) && + (fh_type != FILEID_BTRFS_WITH_PARENT_ROOT || + fh_len != BTRFS_FID_SIZE_CONNECTABLE_ROOT) && + (fh_type != FILEID_BTRFS_WITHOUT_PARENT || + fh_len != BTRFS_FID_SIZE_NON_CONNECTABLE)) + return NULL; + + objectid = fid->objectid; + root_objectid = fid->root_objectid; + generation = fid->gen; + + return btrfs_get_dentry(sb, objectid, root_objectid, generation); +} + +static struct dentry *btrfs_get_parent(struct dentry *child) +{ + struct inode *dir = child->d_inode; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int slot; + u64 objectid; + int ret; + + path = btrfs_alloc_path(); + + key.objectid = dir->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + /* Error */ + btrfs_free_path(path); + return ERR_PTR(ret); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + if (ret) { + /* btrfs_search_slot() returns the slot where we'd want to + insert a backref for parent inode #0xFFFFFFFFFFFFFFFF. + The _real_ backref, telling us what the parent inode + _actually_ is, will be in the slot _before_ the one + that btrfs_search_slot() returns. */ + if (!slot) { + /* Unless there is _no_ key in the tree before... */ + btrfs_free_path(path); + return ERR_PTR(-EIO); + } + slot--; + } + + btrfs_item_key_to_cpu(leaf, &key, slot); + btrfs_free_path(path); + + if (key.objectid != dir->i_ino || key.type != BTRFS_INODE_REF_KEY) + return ERR_PTR(-EINVAL); + + objectid = key.offset; + + /* If we are already at the root of a subvol, return the real root */ + if (objectid == dir->i_ino) + return dget(dir->i_sb->s_root); + + /* Build a new key for the inode item */ + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + return d_obtain_alias(btrfs_iget(root->fs_info->sb, &key, root, NULL)); +} + +const struct export_operations btrfs_export_ops = { + .encode_fh = btrfs_encode_fh, + .fh_to_dentry = btrfs_fh_to_dentry, + .fh_to_parent = btrfs_fh_to_parent, + .get_parent = btrfs_get_parent, +}; diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h new file mode 100644 index 00000000000..074348a9584 --- /dev/null +++ b/fs/btrfs/export.h @@ -0,0 +1,19 @@ +#ifndef BTRFS_EXPORT_H +#define BTRFS_EXPORT_H + +#include <linux/exportfs.h> + +extern const struct export_operations btrfs_export_ops; + +struct btrfs_fid { + u64 objectid; + u64 root_objectid; + u32 gen; + + u64 parent_objectid; + u32 parent_gen; + + u64 parent_root_objectid; +} __attribute__ ((packed)); + +#endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c new file mode 100644 index 00000000000..293da650873 --- /dev/null +++ b/fs/btrfs/extent-tree.c @@ -0,0 +1,5986 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/writeback.h> +#include <linux/blkdev.h> +#include <linux/version.h> +#include "compat.h" +#include "hash.h" +#include "crc32c.h" +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "volumes.h" +#include "locking.h" +#include "ref-cache.h" +#include "compat.h" + +#define PENDING_EXTENT_INSERT 0 +#define PENDING_EXTENT_DELETE 1 +#define PENDING_BACKREF_UPDATE 2 + +struct pending_extent_op { + int type; + u64 bytenr; + u64 num_bytes; + u64 parent; + u64 orig_parent; + u64 generation; + u64 orig_generation; + int level; + struct list_head list; + int del; +}; + +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all); +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data); +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free); + +static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) +{ + return (cache->flags & bits) == bits; +} + +/* + * this adds the block group to the fs_info rb tree for the block group + * cache + */ +static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, + struct btrfs_block_group_cache *block_group) +{ + struct rb_node **p; + struct rb_node *parent = NULL; + struct btrfs_block_group_cache *cache; + + spin_lock(&info->block_group_cache_lock); + p = &info->block_group_cache_tree.rb_node; + + while (*p) { + parent = *p; + cache = rb_entry(parent, struct btrfs_block_group_cache, + cache_node); + if (block_group->key.objectid < cache->key.objectid) { + p = &(*p)->rb_left; + } else if (block_group->key.objectid > cache->key.objectid) { + p = &(*p)->rb_right; + } else { + spin_unlock(&info->block_group_cache_lock); + return -EEXIST; + } + } + + rb_link_node(&block_group->cache_node, parent, p); + rb_insert_color(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + return 0; +} + +/* + * This will return the block group at or after bytenr if contains is 0, else + * it will return the block group that contains the bytenr + */ +static struct btrfs_block_group_cache * +block_group_cache_tree_search(struct btrfs_fs_info *info, u64 bytenr, + int contains) +{ + struct btrfs_block_group_cache *cache, *ret = NULL; + struct rb_node *n; + u64 end, start; + + spin_lock(&info->block_group_cache_lock); + n = info->block_group_cache_tree.rb_node; + + while (n) { + cache = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + end = cache->key.objectid + cache->key.offset - 1; + start = cache->key.objectid; + + if (bytenr < start) { + if (!contains && (!ret || start < ret->key.objectid)) + ret = cache; + n = n->rb_left; + } else if (bytenr > start) { + if (contains && bytenr <= end) { + ret = cache; + break; + } + n = n->rb_right; + } else { + ret = cache; + break; + } + } + if (ret) + atomic_inc(&ret->count); + spin_unlock(&info->block_group_cache_lock); + + return ret; +} + +/* + * this is only called by cache_block_group, 
since we could have freed extents + * we need to check the pinned_extents for any extents that can't be used yet + * since their free space will be released as soon as the transaction commits. + */ +static int add_new_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_fs_info *info, u64 start, u64 end) +{ + u64 extent_start, extent_end, size; + int ret; + + mutex_lock(&info->pinned_mutex); + while (start < end) { + ret = find_first_extent_bit(&info->pinned_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY); + if (ret) + break; + + if (extent_start == start) { + start = extent_end + 1; + } else if (extent_start > start && extent_start < end) { + size = extent_start - start; + ret = btrfs_add_free_space(block_group, start, + size); + BUG_ON(ret); + start = extent_end + 1; + } else { + break; + } + } + + if (start < end) { + size = end - start; + ret = btrfs_add_free_space(block_group, start, size); + BUG_ON(ret); + } + mutex_unlock(&info->pinned_mutex); + + return 0; +} + +static int remove_sb_from_cache(struct btrfs_root *root, + struct btrfs_block_group_cache *cache) +{ + u64 bytenr; + u64 *logical; + int stripe_len; + int i, nr, ret; + + for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { + bytenr = btrfs_sb_offset(i); + ret = btrfs_rmap_block(&root->fs_info->mapping_tree, + cache->key.objectid, bytenr, 0, + &logical, &nr, &stripe_len); + BUG_ON(ret); + while (nr--) { + btrfs_remove_free_space(cache, logical[nr], + stripe_len); + } + kfree(logical); + } + return 0; +} + +static int cache_block_group(struct btrfs_root *root, + struct btrfs_block_group_cache *block_group) +{ + struct btrfs_path *path; + int ret = 0; + struct btrfs_key key; + struct extent_buffer *leaf; + int slot; + u64 last; + + if (!block_group) + return 0; + + root = root->fs_info->extent_root; + + if (block_group->cached) + return 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 2; + /* + * we get into deadlocks with paths held by callers of this function. 
+ * since the alloc_mutex is protecting things right now, just + * skip the locking here + */ + path->skip_locking = 1; + last = max_t(u64, block_group->key.objectid, BTRFS_SUPER_INFO_OFFSET); + key.objectid = last; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto err; + if (ret == 0) + continue; + else + break; + } + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid < block_group->key.objectid) + goto next; + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_ITEM_KEY) { + add_new_free_space(block_group, root->fs_info, last, + key.objectid); + + last = key.objectid + key.offset; + } +next: + path->slots[0]++; + } + + add_new_free_space(block_group, root->fs_info, last, + block_group->key.objectid + + block_group->key.offset); + + remove_sb_from_cache(root, block_group); + block_group->cached = 1; + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +/* + * return the block group that starts at or after bytenr + */ +static struct btrfs_block_group_cache * +btrfs_lookup_first_block_group(struct btrfs_fs_info *info, u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 0); + + return cache; +} + +/* + * return the block group that contains teh given bytenr + */ +struct btrfs_block_group_cache *btrfs_lookup_block_group( + struct btrfs_fs_info *info, + u64 bytenr) +{ + struct btrfs_block_group_cache *cache; + + cache = block_group_cache_tree_search(info, bytenr, 1); + + return cache; +} + +static inline void put_block_group(struct btrfs_block_group_cache *cache) +{ + if (atomic_dec_and_test(&cache->count)) + kfree(cache); +} + +static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, + u64 flags) +{ + struct list_head *head = &info->space_info; + struct list_head *cur; + struct btrfs_space_info *found; + list_for_each(cur, head) { + found = list_entry(cur, struct btrfs_space_info, list); + if (found->flags == flags) + return found; + } + return NULL; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +u64 btrfs_find_block_group(struct btrfs_root *root, + u64 search_start, u64 search_hint, int owner) +{ + struct btrfs_block_group_cache *cache; + u64 used; + u64 last = max(search_hint, search_start); + u64 group_start = 0; + int full_search = 0; + int factor = 9; + int wrapped = 0; +again: + while (1) { + cache = btrfs_lookup_first_block_group(root->fs_info, last); + if (!cache) + break; + + spin_lock(&cache->lock); + last = cache->key.objectid + cache->key.offset; + used = btrfs_block_group_used(&cache->item); + + if ((full_search || !cache->ro) && + block_group_bits(cache, BTRFS_BLOCK_GROUP_METADATA)) { + if (used + cache->pinned + cache->reserved < + div_factor(cache->key.offset, factor)) { + group_start = cache->key.objectid; + spin_unlock(&cache->lock); + put_block_group(cache); + goto found; + } + } + spin_unlock(&cache->lock); + put_block_group(cache); + cond_resched(); + } + if (!wrapped) { + last = search_start; + wrapped = 1; + goto again; + } + if (!full_search && factor < 10) { + last = search_start; + full_search = 1; + factor = 10; + goto again; + } 
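+ /*
+ * falling through to here means no suitable group was found even
+ * after wrapping the search and retrying with full_search set;
+ * group_start is still zero in that case.
+ */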
+found: + return group_start; +} + +/* simple helper to search for an existing extent at a given offset */ +int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len) +{ + int ret; + struct btrfs_key key; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = start; + key.offset = len; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(NULL, root->fs_info->extent_root, &key, path, + 0, 0); + btrfs_free_path(path); + return ret; +} + +/* + * Back reference rules. Back refs have three main goals: + * + * 1) differentiate between all holders of references to an extent so that + * when a reference is dropped we can make sure it was a valid reference + * before freeing the extent. + * + * 2) Provide enough information to quickly find the holders of an extent + * if we notice a given block is corrupted or bad. + * + * 3) Make it easy to migrate blocks for FS shrinking or storage pool + * maintenance. This is actually the same as #2, but with a slightly + * different use case. + * + * File extents can be referenced by: + * + * - multiple snapshots, subvolumes, or different generations in one subvol + * - different files inside a single subvolume + * - different offsets inside a file (bookend extents in file.c) + * + * The extent ref structure has fields for: + * + * - Objectid of the subvolume root + * - Generation number of the tree holding the reference + * - objectid of the file holding the reference + * - number of references holding by parent node (alway 1 for tree blocks) + * + * Btree leaf may hold multiple references to a file extent. In most cases, + * these references are from same file and the corresponding offsets inside + * the file are close together. + * + * When a file extent is allocated the fields are filled in: + * (root_key.objectid, trans->transid, inode objectid, 1) + * + * When a leaf is cow'd new references are added for every file extent found + * in the leaf. It looks similar to the create case, but trans->transid will + * be different when the block is cow'd. + * + * (root_key.objectid, trans->transid, inode objectid, + * number of references in the leaf) + * + * When a file extent is removed either during snapshot deletion or + * file truncation, we find the corresponding back reference and check + * the following fields: + * + * (btrfs_header_owner(leaf), btrfs_header_generation(leaf), + * inode objectid) + * + * Btree extents can be referenced by: + * + * - Different subvolumes + * - Different generations of the same subvolume + * + * When a tree block is created, back references are inserted: + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a tree block is cow'd, new back references are added for all the + * blocks it points to. If the tree block isn't in reference counted root, + * the old back references are removed. These new back references are of + * the form (trans->transid will have increased since creation): + * + * (root->root_key.objectid, trans->transid, level, 1) + * + * When a backref is in deleting, the following fields are checked: + * + * if backref was for a tree root: + * (btrfs_header_owner(itself), btrfs_header_generation(itself), level) + * else + * (btrfs_header_owner(parent), btrfs_header_generation(parent), level) + * + * Back Reference Key composing: + * + * The key objectid corresponds to the first byte in the extent, the key + * type is set to BTRFS_EXTENT_REF_KEY, and the key offset is the first + * byte of parent extent. 
If a extent is tree root, the key offset is set + * to the key objectid. + */ + +static noinline int lookup_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid, int del) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct extent_buffer *leaf; + u64 ref_objectid; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; + + ret = btrfs_search_slot(trans, root, &key, path, del ? -1 : 0, 1); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation || + (ref_objectid != owner_objectid && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + ret = -EIO; + WARN_ON(1); + goto out; + } + ret = 0; +out: + return ret; +} + +/* + * updates all the backrefs that are pending on update_list for the + * extent_root + */ +static noinline int update_backrefs(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *update_list) +{ + struct btrfs_key key; + struct btrfs_extent_ref *ref; + struct btrfs_fs_info *info = extent_root->fs_info; + struct pending_extent_op *op; + struct extent_buffer *leaf; + int ret = 0; + struct list_head *cur = update_list->next; + u64 ref_objectid; + u64 ref_root = extent_root->root_key.objectid; + + op = list_entry(cur, struct pending_extent_op, list); + +search: + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = op->orig_parent; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1); + BUG_ON(ret); + + leaf = path->nodes[0]; + +loop: + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + + ref_objectid = btrfs_ref_objectid(leaf, ref); + + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != op->orig_generation || + (ref_objectid != op->level && + ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) { + printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, " + "root %llu, owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)op->orig_parent, + (unsigned long long)ref_root, op->level); + btrfs_print_leaf(extent_root, leaf); + BUG(); + } + + key.objectid = op->bytenr; + key.offset = op->parent; + key.type = BTRFS_EXTENT_REF_KEY; + ret = btrfs_set_item_key_safe(trans, extent_root, path, &key); + BUG_ON(ret); + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + btrfs_set_ref_generation(leaf, ref, op->generation); + + cur = cur->next; + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + + if (cur == update_list) { + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto out; + } + + op = list_entry(cur, struct pending_extent_op, list); + + path->slots[0]++; + while (path->slots[0] < btrfs_header_nritems(leaf)) { + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid == op->bytenr && + key.type == BTRFS_EXTENT_REF_KEY) + goto loop; + path->slots[0]++; + } + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(extent_root, path); + goto search; + +out: + return 0; +} + +static noinline int insert_extents(struct btrfs_trans_handle 
*trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct list_head *insert_list, int nr) +{ + struct btrfs_key *keys; + u32 *data_size; + struct pending_extent_op *op; + struct extent_buffer *leaf; + struct list_head *cur = insert_list->next; + struct btrfs_fs_info *info = extent_root->fs_info; + u64 ref_root = extent_root->root_key.objectid; + int i = 0, last = 0, ret; + int total = nr * 2; + + if (!nr) + return 0; + + keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS); + if (!keys) + return -ENOMEM; + + data_size = kzalloc(total * sizeof(u32), GFP_NOFS); + if (!data_size) { + kfree(keys); + return -ENOMEM; + } + + list_for_each_entry(op, insert_list, list) { + keys[i].objectid = op->bytenr; + keys[i].offset = op->num_bytes; + keys[i].type = BTRFS_EXTENT_ITEM_KEY; + data_size[i] = sizeof(struct btrfs_extent_item); + i++; + + keys[i].objectid = op->bytenr; + keys[i].offset = op->parent; + keys[i].type = BTRFS_EXTENT_REF_KEY; + data_size[i] = sizeof(struct btrfs_extent_ref); + i++; + } + + op = list_entry(cur, struct pending_extent_op, list); + i = 0; + while (i < total) { + int c; + ret = btrfs_insert_some_items(trans, extent_root, path, + keys+i, data_size+i, total-i); + BUG_ON(ret < 0); + + if (last && ret > 1) + BUG(); + + leaf = path->nodes[0]; + for (c = 0; c < ret; c++) { + int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY; + + /* + * if the first item we inserted was a backref, then + * the EXTENT_ITEM will be the odd c's, else it will + * be the even c's + */ + if ((ref_first && (c % 2)) || + (!ref_first && !(c % 2))) { + struct btrfs_extent_item *itm; + + itm = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], itm, 1); + op->del++; + } else { + struct btrfs_extent_ref *ref; + + ref = btrfs_item_ptr(leaf, path->slots[0] + c, + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, + op->generation); + btrfs_set_ref_objectid(leaf, ref, op->level); + btrfs_set_ref_num_refs(leaf, ref, 1); + op->del++; + } + + /* + * using del to see when its ok to free up the + * pending_extent_op. In the case where we insert the + * last item on the list in order to help do batching + * we need to not free the extent op until we actually + * insert the extent_item + */ + if (op->del == 2) { + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, + GFP_NOFS); + cur = cur->next; + list_del_init(&op->list); + kfree(op); + if (cur != insert_list) + op = list_entry(cur, + struct pending_extent_op, + list); + } + } + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); + + /* + * Ok backref's and items usually go right next to eachother, + * but if we could only insert 1 item that means that we + * inserted on the end of a leaf, and we have no idea what may + * be on the next leaf so we just play it safe. In order to + * try and help this case we insert the last thing on our + * insert list so hopefully it will end up being the last + * thing on the leaf and everything else will be before it, + * which will let us insert a whole bunch of items at the same + * time. 
+ */ + if (ret == 1 && !last && (i + ret < total)) { + /* + * last: where we will pick up the next time around + * i: our current key to insert, will be total - 1 + * cur: the current op we are screwing with + * op: duh + */ + last = i + ret; + i = total - 1; + cur = insert_list->prev; + op = list_entry(cur, struct pending_extent_op, list); + } else if (last) { + /* + * ok we successfully inserted the last item on the + * list, lets reset everything + * + * i: our current key to insert, so where we left off + * last time + * last: done with this + * cur: the op we are messing with + * op: duh + * total: since we inserted the last key, we need to + * decrement total so we dont overflow + */ + i = last; + last = 0; + total--; + if (i < total) { + cur = insert_list->next; + op = list_entry(cur, struct pending_extent_op, + list); + } + } else { + i += ret; + } + + cond_resched(); + } + ret = 0; + kfree(keys); + kfree(data_size); + return ret; +} + +static noinline int insert_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + struct btrfs_key key; + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref; + u32 num_refs; + int ret; + + key.objectid = bytenr; + key.type = BTRFS_EXTENT_REF_KEY; + key.offset = parent; + + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*ref)); + if (ret == 0) { + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + btrfs_set_ref_root(leaf, ref, ref_root); + btrfs_set_ref_generation(leaf, ref, ref_generation); + btrfs_set_ref_objectid(leaf, ref, owner_objectid); + btrfs_set_ref_num_refs(leaf, ref, 1); + } else if (ret == -EEXIST) { + u64 existing_owner; + BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID); + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + if (btrfs_ref_root(leaf, ref) != ref_root || + btrfs_ref_generation(leaf, ref) != ref_generation) { + ret = -EIO; + WARN_ON(1); + goto out; + } + + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + btrfs_set_ref_num_refs(leaf, ref, num_refs + 1); + + existing_owner = btrfs_ref_objectid(leaf, ref); + if (existing_owner != owner_objectid && + existing_owner != BTRFS_MULTIPLE_OBJECTIDS) { + btrfs_set_ref_objectid(leaf, ref, + BTRFS_MULTIPLE_OBJECTIDS); + } + ret = 0; + } else { + goto out; + } + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_release_path(root, path); + return ret; +} + +static noinline int remove_extent_backref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref; + u32 num_refs; + int ret = 0; + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref); + num_refs = btrfs_ref_num_refs(leaf, ref); + BUG_ON(num_refs == 0); + num_refs -= 1; + if (num_refs == 0) { + ret = btrfs_del_item(trans, root, path); + } else { + btrfs_set_ref_num_refs(leaf, ref, num_refs); + btrfs_mark_buffer_dirty(leaf); + } + btrfs_release_path(root, path); + return ret; +} + +#ifdef BIO_RW_DISCARD +static void btrfs_issue_discard(struct block_device *bdev, + u64 start, u64 len) +{ + blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); +} +#endif + +static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr, + u64 num_bytes) +{ +#ifdef BIO_RW_DISCARD + int ret; + u64 map_length = num_bytes; + struct 
btrfs_multi_bio *multi = NULL; + + /* Tell the block device(s) that the sectors can be discarded */ + ret = btrfs_map_block(&root->fs_info->mapping_tree, READ, + bytenr, &map_length, &multi, 0); + if (!ret) { + struct btrfs_bio_stripe *stripe = multi->stripes; + int i; + + if (map_length > num_bytes) + map_length = num_bytes; + + for (i = 0; i < multi->num_stripes; i++, stripe++) { + btrfs_issue_discard(stripe->dev->bdev, + stripe->physical, + map_length); + } + kfree(multi); + } + + return ret; +#else + return 0; +#endif +} + +static noinline int free_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct list_head *del_list) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct btrfs_key key, found_key; + struct extent_buffer *leaf; + struct list_head *cur; + struct pending_extent_op *op; + struct btrfs_extent_item *ei; + int ret, num_to_del, extent_slot = 0, found_extent = 0; + u32 refs; + u64 bytes_freed = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 1; + +search: + /* search for the backref for the current ref we want to delete */ + cur = del_list->next; + op = list_entry(cur, struct pending_extent_op, list); + ret = lookup_extent_backref(trans, extent_root, path, op->bytenr, + op->orig_parent, + extent_root->root_key.objectid, + op->orig_generation, op->level, 1); + if (ret) { + printk(KERN_ERR "btrfs unable to find backref byte nr %llu " + "root %llu gen %llu owner %u\n", + (unsigned long long)op->bytenr, + (unsigned long long)extent_root->root_key.objectid, + (unsigned long long)op->orig_generation, op->level); + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + goto out; + } + + extent_slot = path->slots[0]; + num_to_del = 1; + found_extent = 0; + + /* + * if we aren't the first item on the leaf we can move back one and see + * if our ref is right next to our extent item + */ + if (likely(extent_slot)) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid == op->bytenr && + found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == op->num_bytes) { + num_to_del++; + found_extent = 1; + } + } + + /* + * if we didn't find the extent we need to delete the backref and then + * search for the extent item key so we can update its ref count + */ + if (!found_extent) { + key.objectid = op->bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = op->num_bytes; + + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1); + BUG_ON(ret); + extent_slot = path->slots[0]; + } + + /* this is where we update the ref count for the extent */ + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs--; + btrfs_set_extent_refs(leaf, ei, refs); + + btrfs_mark_buffer_dirty(leaf); + + /* + * This extent needs deleting. The reason cur_slot is extent_slot + + * num_to_del is because extent_slot points to the slot where the extent + * is, and if the backref was not right next to the extent we will be + * deleting at least 1 item, and will want to start searching at the + * slot directly next to extent_slot. 
However if we did find the + * backref next to the extent item then we will be deleting at least 2 + * items and will want to start searching directly after the ref slot + */ + if (!refs) { + struct list_head *pos, *n, *end; + int cur_slot = extent_slot+num_to_del; + u64 super_used; + u64 root_used; + + path->slots[0] = extent_slot; + bytes_freed = op->num_bytes; + + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, op->bytenr, + op->num_bytes, op->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + op->del = ret; + + /* + * we need to see if we can delete multiple things at once, so + * start looping through the list of extents we are wanting to + * delete and see if their extent/backrefs are right next to + * each other and the extents only have 1 ref + */ + for (pos = cur->next; pos != del_list; pos = pos->next) { + struct pending_extent_op *tmp; + + tmp = list_entry(pos, struct pending_extent_op, list); + + /* we only want to delete extent+ref at this stage */ + if (cur_slot >= btrfs_header_nritems(leaf) - 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY || + found_key.offset != tmp->num_bytes) + break; + + /* check to make sure this extent only has one ref */ + ei = btrfs_item_ptr(leaf, cur_slot, + struct btrfs_extent_item); + if (btrfs_extent_refs(leaf, ei) != 1) + break; + + btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1); + if (found_key.objectid != tmp->bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY || + found_key.offset != tmp->orig_parent) + break; + + /* + * the ref is right next to the extent, we can set the + * ref count to 0 since we will delete them both now + */ + btrfs_set_extent_refs(leaf, ei, 0); + + /* pin down the bytes for this extent */ + mutex_lock(&info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, tmp->bytenr, + tmp->num_bytes, tmp->level >= + BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&info->pinned_mutex); + BUG_ON(ret < 0); + + /* + * use the del field to tell if we need to go ahead and + * free up the extent when we delete the item or not.
+ */ + tmp->del = ret; + bytes_freed += tmp->num_bytes; + + num_to_del += 2; + cur_slot += 2; + } + end = pos; + + /* update the free space counters */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - bytes_freed); + + root_used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + root_used - bytes_freed); + spin_unlock(&info->delalloc_lock); + + /* delete the items */ + ret = btrfs_del_items(trans, extent_root, path, + path->slots[0], num_to_del); + BUG_ON(ret); + + /* + * loop through the extents we deleted and do the cleanup work + * on them + */ + for (pos = cur, n = pos->next; pos != end; + pos = n, n = pos->next) { + struct pending_extent_op *tmp; + tmp = list_entry(pos, struct pending_extent_op, list); + + /* + * remember tmp->del tells us whether or not we pinned + * down the extent + */ + ret = update_block_group(trans, extent_root, + tmp->bytenr, tmp->num_bytes, 0, + tmp->del); + BUG_ON(ret); + + list_del_init(&tmp->list); + unlock_extent(&info->extent_ins, tmp->bytenr, + tmp->bytenr + tmp->num_bytes - 1, + GFP_NOFS); + kfree(tmp); + } + } else if (refs && found_extent) { + /* + * the ref and extent were right next to each other, but the + * extent still has a ref, so just free the backref and keep + * going + */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } else { + /* + * the extent has multiple refs and the backref we were looking + * for was not right next to it, so just unlock and go next, + * we're good to go + */ + list_del_init(&op->list); + unlock_extent(&info->extent_ins, op->bytenr, + op->bytenr + op->num_bytes - 1, GFP_NOFS); + kfree(op); + } + + btrfs_release_path(extent_root, path); + if (!list_empty(del_list)) + goto search; + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + + if (root == root->fs_info->extent_root) { + struct pending_extent_op *extent_op; + u64 num_bytes; + + BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL); + num_bytes = btrfs_level_size(root, (int)owner_objectid); + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + BUG_ON(extent_op->parent != orig_parent); + BUG_ON(extent_op->generation != orig_generation); + + extent_op->parent = parent; + extent_op->generation = ref_generation; + } else { + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_BACKREF_UPDATE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = orig_parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = orig_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + +
set_extent_bits(&root->fs_info->extent_ins, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->extent_ins, + bytenr, (unsigned long)extent_op); + } + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, orig_parent, orig_root, + orig_generation, owner_objectid, 1); + if (ret) + goto out; + ret = remove_extent_backref(trans, extent_root, path); + if (ret) + goto out; + ret = insert_extent_backref(trans, extent_root, path, bytenr, + parent, ref_root, ref_generation, + owner_objectid); + BUG_ON(ret); + finish_current_insert(trans, extent_root, 0); + del_pending_extents(trans, extent_root, 0); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_update_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent, + parent, ref_root, ref_root, + ref_generation, ref_generation, + owner_objectid); + return ret; +} + +static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 orig_parent, u64 parent, + u64 orig_root, u64 ref_root, + u64 orig_generation, u64 ref_generation, + u64 owner_objectid) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + u32 refs; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + key.objectid = bytenr; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 1); + if (ret < 0) + return ret; + BUG_ON(ret == 0 || path->slots[0] == 0); + + path->slots[0]--; + l = path->nodes[0]; + + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + if (key.objectid != bytenr) { + btrfs_print_leaf(root->fs_info->extent_root, path->nodes[0]); + printk(KERN_ERR "btrfs wanted %llu found %llu\n", + (unsigned long long)bytenr, + (unsigned long long)key.objectid); + BUG(); + } + BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY); + + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + refs = btrfs_extent_refs(l, item); + btrfs_set_extent_refs(l, item, refs + 1); + btrfs_mark_buffer_dirty(path->nodes[0]); + + btrfs_release_path(root->fs_info->extent_root, path); + + path->reada = 1; + ret = insert_extent_backref(trans, root->fs_info->extent_root, + path, bytenr, parent, + ref_root, ref_generation, + owner_objectid); + BUG_ON(ret); + finish_current_insert(trans, root->fs_info->extent_root, 0); + del_pending_extents(trans, root->fs_info->extent_root, 0); + + btrfs_free_path(path); + return 0; +} + +int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 ref_root, u64 ref_generation, + u64 owner_objectid) +{ + int ret; + if (ref_root == BTRFS_TREE_LOG_OBJECTID && + owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + return 0; + ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent, + 0, ref_root, 0, ref_generation, + owner_objectid); + return ret; +} + +int btrfs_extent_post_op(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + finish_current_insert(trans, 
root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); + return 0; +} + +int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, + u64 num_bytes, u32 *refs) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct extent_buffer *l; + struct btrfs_extent_item *item; + + WARN_ON(num_bytes < root->sectorsize); + path = btrfs_alloc_path(); + path->reada = 1; + key.objectid = bytenr; + key.offset = num_bytes; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path, + 0, 0); + if (ret < 0) + goto out; + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_INFO "btrfs failed to find block number %llu\n", + (unsigned long long)bytenr); + BUG(); + } + l = path->nodes[0]; + item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item); + *refs = btrfs_extent_refs(l, item); +out: + btrfs_free_path(path); + return 0; +} + +int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid, u64 bytenr) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_extent_ref *ref_item; + struct btrfs_key key; + struct btrfs_key found_key; + u64 ref_root; + u64 last_snapshot; + u32 nritems; + int ret; + + key.objectid = bytenr; + key.offset = (u64)-1; + key.type = BTRFS_EXTENT_ITEM_KEY; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + ret = -ENOENT; + if (path->slots[0] == 0) + goto out; + + path->slots[0]--; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_ITEM_KEY) + goto out; + + last_snapshot = btrfs_root_last_snapshot(&root->root_item); + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret == 0) + continue; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr) + break; + + if (found_key.type != BTRFS_EXTENT_REF_KEY) { + path->slots[0]++; + continue; + } + + ref_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_root = btrfs_ref_root(leaf, ref_item); + if ((ref_root != root->root_key.objectid && + ref_root != BTRFS_TREE_LOG_OBJECTID) || + objectid != btrfs_ref_objectid(leaf, ref_item)) { + ret = 1; + goto out; + } + if (btrfs_ref_generation(leaf, ref_item) <= last_snapshot) { + ret = 1; + goto out; + } + + path->slots[0]++; + } + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_cache_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *buf, u32 nr_extents) +{ + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + u64 root_gen; + u32 nritems; + int i; + int level; + int ret = 0; + int shared = 0; + + if (!root->ref_cows) + return 0; + + if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) { + shared = 0; + root_gen = root->root_key.offset; + } else { + shared = 1; + root_gen = trans->transid - 1; + } + + level = btrfs_header_level(buf); + nritems = btrfs_header_nritems(buf); + + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_extent_info *info; + + ref = btrfs_alloc_leaf_ref(root, nr_extents); + if (!ref) { 
+ ret = -ENOMEM; + goto out; + } + + ref->root_gen = root_gen; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ref->nritems = nr_extents; + info = ref->extents; + + for (i = 0; nr_extents > 0 && i < nritems; i++) { + u64 disk_bytenr; + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + disk_bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (disk_bytenr == 0) + continue; + + info->bytenr = disk_bytenr; + info->num_bytes = + btrfs_file_extent_disk_num_bytes(buf, fi); + info->objectid = key.objectid; + info->offset = key.offset; + info++; + } + + ret = btrfs_add_leaf_ref(root, ref, shared); + if (ret == -EEXIST && shared) { + struct btrfs_leaf_ref *old; + old = btrfs_lookup_leaf_ref(root, ref->bytenr); + BUG_ON(!old); + btrfs_remove_leaf_ref(root, old); + btrfs_free_leaf_ref(root, old); + ret = btrfs_add_leaf_ref(root, ref, shared); + } + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } +out: + return ret; +} + +int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct extent_buffer *orig_buf, struct extent_buffer *buf, + u32 *nr_extents) +{ + u64 bytenr; + u64 ref_root; + u64 orig_root; + u64 ref_generation; + u64 orig_generation; + u32 nritems; + u32 nr_file_extents = 0; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int level; + int ret = 0; + int faili = 0; + int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, + u64, u64, u64, u64, u64, u64, u64, u64); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + + nritems = btrfs_header_nritems(buf); + level = btrfs_header_level(buf); + + if (root->ref_cows) { + process_func = __btrfs_inc_extent_ref; + } else { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + goto out; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + goto out; + process_func = __btrfs_update_extent_ref; + } + + for (i = 0; i < nritems; i++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + + nr_file_extents++; + + ret = process_func(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + + if (ret) { + faili = i; + WARN_ON(1); + goto fail; + } + } else { + bytenr = btrfs_node_blockptr(buf, i); + ret = process_func(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) { + faili = i; + WARN_ON(1); + goto fail; + } + } + } +out: + if (nr_extents) { + if (level == 0) + *nr_extents = nr_file_extents; + else + *nr_extents = nritems; + } + return 0; +fail: + WARN_ON(1); + return ret; +} + +int btrfs_update_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *orig_buf, + struct extent_buffer *buf, int start_slot, int nr) + +{ + u64 bytenr; + u64 ref_root; + 
u64 orig_root; + u64 ref_generation; + u64 orig_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int ret; + int slot; + int level; + + BUG_ON(start_slot < 0); + BUG_ON(start_slot + nr > btrfs_header_nritems(buf)); + + ref_root = btrfs_header_owner(buf); + ref_generation = btrfs_header_generation(buf); + orig_root = btrfs_header_owner(orig_buf); + orig_generation = btrfs_header_generation(orig_buf); + level = btrfs_header_level(buf); + + if (!root->ref_cows) { + if (level == 0 && + root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + return 0; + if (level != 0 && + root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + } + + for (i = 0, slot = start_slot; i < nr; i++, slot++) { + cond_resched(); + if (level == 0) { + btrfs_item_key_to_cpu(buf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(buf, slot, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(buf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(buf, fi); + if (bytenr == 0) + continue; + ret = __btrfs_update_extent_ref(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + key.objectid); + if (ret) + goto fail; + } else { + bytenr = btrfs_node_blockptr(buf, slot); + ret = __btrfs_update_extent_ref(trans, root, bytenr, + orig_buf->start, buf->start, + orig_root, ref_root, + orig_generation, ref_generation, + level - 1); + if (ret) + goto fail; + } + } + return 0; +fail: + WARN_ON(1); + return -1; +} + +static int write_one_cache_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_block_group_cache *cache) +{ + int ret; + int pending_ret; + struct btrfs_root *extent_root = root->fs_info->extent_root; + unsigned long bi; + struct extent_buffer *leaf; + + ret = btrfs_search_slot(trans, extent_root, &cache->key, path, 0, 1); + if (ret < 0) + goto fail; + BUG_ON(ret); + + leaf = path->nodes[0]; + bi = btrfs_item_ptr_offset(leaf, path->slots[0]); + write_extent_buffer(leaf, &cache->item, bi, sizeof(cache->item)); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(extent_root, path); +fail: + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); + if (ret) + return ret; + if (pending_ret) + return pending_ret; + return 0; + +} + +int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_block_group_cache *cache, *entry; + struct rb_node *n; + int err = 0; + int werr = 0; + struct btrfs_path *path; + u64 last = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + cache = NULL; + spin_lock(&root->fs_info->block_group_cache_lock); + for (n = rb_first(&root->fs_info->block_group_cache_tree); + n; n = rb_next(n)) { + entry = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + if (entry->dirty) { + cache = entry; + break; + } + } + spin_unlock(&root->fs_info->block_group_cache_lock); + + if (!cache) + break; + + cache->dirty = 0; + last += cache->key.offset; + + err = write_one_cache_group(trans, root, + path, cache); + /* + * if we fail to write the cache group, we want + * to keep it marked dirty in hopes that a later + * write will work + */ + if (err) { + werr = err; + continue; + } + } + btrfs_free_path(path); + return werr; +} + +int btrfs_extent_readonly(struct btrfs_root *root, u64 bytenr) +{ + struct btrfs_block_group_cache 
*block_group; + int readonly = 0; + + block_group = btrfs_lookup_block_group(root->fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = 1; + if (block_group) + put_block_group(block_group); + return readonly; +} + +static int update_space_info(struct btrfs_fs_info *info, u64 flags, + u64 total_bytes, u64 bytes_used, + struct btrfs_space_info **space_info) +{ + struct btrfs_space_info *found; + + found = __find_space_info(info, flags); + if (found) { + spin_lock(&found->lock); + found->total_bytes += total_bytes; + found->bytes_used += bytes_used; + found->full = 0; + spin_unlock(&found->lock); + *space_info = found; + return 0; + } + found = kzalloc(sizeof(*found), GFP_NOFS); + if (!found) + return -ENOMEM; + + list_add(&found->list, &info->space_info); + INIT_LIST_HEAD(&found->block_groups); + init_rwsem(&found->groups_sem); + spin_lock_init(&found->lock); + found->flags = flags; + found->total_bytes = total_bytes; + found->bytes_used = bytes_used; + found->bytes_pinned = 0; + found->bytes_reserved = 0; + found->bytes_readonly = 0; + found->full = 0; + found->force_alloc = 0; + *space_info = found; + return 0; +} + +static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags) +{ + u64 extra_flags = flags & (BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP); + if (extra_flags) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + fs_info->avail_data_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_METADATA) + fs_info->avail_metadata_alloc_bits |= extra_flags; + if (flags & BTRFS_BLOCK_GROUP_SYSTEM) + fs_info->avail_system_alloc_bits |= extra_flags; + } +} + +static void set_block_group_readonly(struct btrfs_block_group_cache *cache) +{ + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (!cache->ro) { + cache->space_info->bytes_readonly += cache->key.offset - + btrfs_block_group_used(&cache->item); + cache->ro = 1; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); +} + +u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) +{ + u64 num_devices = root->fs_info->fs_devices->rw_devices; + + if (num_devices == 1) + flags &= ~(BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID0); + if (num_devices < 4) + flags &= ~BTRFS_BLOCK_GROUP_RAID10; + + if ((flags & BTRFS_BLOCK_GROUP_DUP) && + (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10))) { + flags &= ~BTRFS_BLOCK_GROUP_DUP; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID1) && + (flags & BTRFS_BLOCK_GROUP_RAID10)) { + flags &= ~BTRFS_BLOCK_GROUP_RAID1; + } + + if ((flags & BTRFS_BLOCK_GROUP_RAID0) && + ((flags & BTRFS_BLOCK_GROUP_RAID1) | + (flags & BTRFS_BLOCK_GROUP_RAID10) | + (flags & BTRFS_BLOCK_GROUP_DUP))) + flags &= ~BTRFS_BLOCK_GROUP_RAID0; + return flags; +} + +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force) +{ + struct btrfs_space_info *space_info; + u64 thresh; + int ret = 0; + + mutex_lock(&extent_root->fs_info->chunk_mutex); + + flags = btrfs_reduce_alloc_profile(extent_root, flags); + + space_info = __find_space_info(extent_root->fs_info, flags); + if (!space_info) { + ret = update_space_info(extent_root->fs_info, flags, + 0, 0, &space_info); + BUG_ON(ret); + } + BUG_ON(!space_info); + + spin_lock(&space_info->lock); + if (space_info->force_alloc) { + force = 1; + space_info->force_alloc = 0; + } + if (space_info->full) { + spin_unlock(&space_info->lock); + goto out; + } + + thresh = 
space_info->total_bytes - space_info->bytes_readonly; + thresh = div_factor(thresh, 6); + if (!force && + (space_info->bytes_used + space_info->bytes_pinned + + space_info->bytes_reserved + alloc_bytes) < thresh) { + spin_unlock(&space_info->lock); + goto out; + } + spin_unlock(&space_info->lock); + + ret = btrfs_alloc_chunk(trans, extent_root, flags); + if (ret) + space_info->full = 1; +out: + mutex_unlock(&extent_root->fs_info->chunk_mutex); + return ret; +} + +static int update_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int alloc, + int mark_free) +{ + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + u64 total = num_bytes; + u64 old_val; + u64 byte_in_group; + + while (total) { + cache = btrfs_lookup_block_group(info, bytenr); + if (!cache) + return -1; + byte_in_group = bytenr - cache->key.objectid; + WARN_ON(byte_in_group > cache->key.offset); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->dirty = 1; + old_val = btrfs_block_group_used(&cache->item); + num_bytes = min(total, cache->key.offset - byte_in_group); + if (alloc) { + old_val += num_bytes; + cache->space_info->bytes_used += num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly -= num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + } else { + old_val -= num_bytes; + cache->space_info->bytes_used -= num_bytes; + if (cache->ro) + cache->space_info->bytes_readonly += num_bytes; + btrfs_set_block_group_used(&cache->item, old_val); + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + if (mark_free) { + int ret; + + ret = btrfs_discard_extent(root, bytenr, + num_bytes); + WARN_ON(ret); + + ret = btrfs_add_free_space(cache, bytenr, + num_bytes); + WARN_ON(ret); + } + } + put_block_group(cache); + total -= num_bytes; + bytenr += num_bytes; + } + return 0; +} + +static u64 first_logical_byte(struct btrfs_root *root, u64 search_start) +{ + struct btrfs_block_group_cache *cache; + u64 bytenr; + + cache = btrfs_lookup_first_block_group(root->fs_info, search_start); + if (!cache) + return 0; + + bytenr = cache->key.objectid; + put_block_group(cache); + + return bytenr; +} + +int btrfs_update_pinned_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int pin) +{ + u64 len; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; + + WARN_ON(!mutex_is_locked(&root->fs_info->pinned_mutex)); + if (pin) { + set_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + num - 1, GFP_NOFS); + } else { + clear_extent_dirty(&fs_info->pinned_extents, + bytenr, bytenr + num - 1, GFP_NOFS); + } + while (num > 0) { + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); + len = min(num, cache->key.offset - + (bytenr - cache->key.objectid)); + if (pin) { + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned += len; + cache->space_info->bytes_pinned += len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fs_info->total_pinned += len; + } else { + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + cache->pinned -= len; + cache->space_info->bytes_pinned -= len; + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + fs_info->total_pinned -= len; + if (cache->cached) + btrfs_add_free_space(cache, bytenr, len); + } + put_block_group(cache); + bytenr += len; + num -= len; + } + return 0; +} + 
+static int update_reserved_extents(struct btrfs_root *root, + u64 bytenr, u64 num, int reserve) +{ + u64 len; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *fs_info = root->fs_info; + + while (num > 0) { + cache = btrfs_lookup_block_group(fs_info, bytenr); + BUG_ON(!cache); + len = min(num, cache->key.offset - + (bytenr - cache->key.objectid)); + + spin_lock(&cache->space_info->lock); + spin_lock(&cache->lock); + if (reserve) { + cache->reserved += len; + cache->space_info->bytes_reserved += len; + } else { + cache->reserved -= len; + cache->space_info->bytes_reserved -= len; + } + spin_unlock(&cache->lock); + spin_unlock(&cache->space_info->lock); + put_block_group(cache); + bytenr += len; + num -= len; + } + return 0; +} + +int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy) +{ + u64 last = 0; + u64 start; + u64 end; + struct extent_io_tree *pinned_extents = &root->fs_info->pinned_extents; + int ret; + + mutex_lock(&root->fs_info->pinned_mutex); + while (1) { + ret = find_first_extent_bit(pinned_extents, last, + &start, &end, EXTENT_DIRTY); + if (ret) + break; + set_extent_dirty(copy, start, end, GFP_NOFS); + last = end + 1; + } + mutex_unlock(&root->fs_info->pinned_mutex); + return 0; +} + +int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_io_tree *unpin) +{ + u64 start; + u64 end; + int ret; + + mutex_lock(&root->fs_info->pinned_mutex); + while (1) { + ret = find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + ret = btrfs_discard_extent(root, start, end + 1 - start); + + btrfs_update_pinned_extents(root, start, end + 1 - start, 0); + clear_extent_dirty(unpin, start, end, GFP_NOFS); + + if (need_resched()) { + mutex_unlock(&root->fs_info->pinned_mutex); + cond_resched(); + mutex_lock(&root->fs_info->pinned_mutex); + } + } + mutex_unlock(&root->fs_info->pinned_mutex); + return ret; +} + +static int finish_current_insert(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + u64 start; + u64 end; + u64 priv; + u64 search = 0; + u64 skipped = 0; + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_path *path; + struct pending_extent_op *extent_op, *tmp; + struct list_head insert_list, update_list; + int ret; + int num_inserts = 0, max_inserts; + + path = btrfs_alloc_path(); + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + + max_inserts = extent_root->leafsize / + (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) + + sizeof(struct btrfs_extent_ref) + + sizeof(struct btrfs_extent_item)); +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(&info->extent_ins, search, &start, + &end, EXTENT_WRITEBACK); + if (ret) { + if (skipped && all && !num_inserts) { + skipped = 0; + search = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } + + ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); + if (!ret) { + skipped = 1; + search = end + 1; + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + continue; + } + + ret = get_state_private(&info->extent_ins, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long) priv; + + if (extent_op->type == PENDING_EXTENT_INSERT) { + num_inserts++; + list_add_tail(&extent_op->list, &insert_list); + search = end + 1; + if (num_inserts == max_inserts) { + mutex_unlock(&info->extent_ins_mutex); 
+ break; + } + } else if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &update_list); + search = end + 1; + } else { + BUG(); + } + } + + /* + * process the update list, clear the writeback bit for it, and if + * somebody marked this thing for deletion then just unlock it and be + * done, the free_extents will handle it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &update_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + kfree(extent_op); + } + } + mutex_unlock(&info->extent_ins_mutex); + + /* + * still have things left on the update list, go ahead an update + * everything + */ + if (!list_empty(&update_list)) { + ret = update_backrefs(trans, extent_root, path, &update_list); + BUG_ON(ret); + } + + /* + * if no inserts need to be done, but we skipped some extents and we + * need to make sure everything is cleaned then reset everything and + * go back to the beginning + */ + if (!num_inserts && all && skipped) { + search = 0; + skipped = 0; + INIT_LIST_HEAD(&update_list); + INIT_LIST_HEAD(&insert_list); + goto again; + } else if (!num_inserts) { + goto out; + } + + /* + * process the insert extents list. Again if we are deleting this + * extent, then just unlock it, pin down the bytes if need be, and be + * done with it. Saves us from having to actually insert the extent + * into the tree and then subsequently come along and delete it + */ + mutex_lock(&info->extent_ins_mutex); + list_for_each_entry_safe(extent_op, tmp, &insert_list, list) { + clear_extent_bits(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + if (extent_op->del) { + u64 used; + list_del_init(&extent_op->list); + unlock_extent(&info->extent_ins, extent_op->bytenr, + extent_op->bytenr + extent_op->num_bytes + - 1, GFP_NOFS); + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + spin_lock(&info->delalloc_lock); + used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + used - extent_op->num_bytes); + used = btrfs_root_used(&extent_root->root_item); + btrfs_set_root_used(&extent_root->root_item, + used - extent_op->num_bytes); + spin_unlock(&info->delalloc_lock); + + ret = update_block_group(trans, extent_root, + extent_op->bytenr, + extent_op->num_bytes, + 0, ret > 0); + BUG_ON(ret); + kfree(extent_op); + num_inserts--; + } + } + mutex_unlock(&info->extent_ins_mutex); + + ret = insert_extents(trans, extent_root, path, &insert_list, + num_inserts); + BUG_ON(ret); + + /* + * if we broke out of the loop in order to insert stuff because we hit + * the maximum number of inserts at a time we can handle, then loop + * back and pick up where we left off + */ + if (num_inserts == max_inserts) { + INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + num_inserts = 0; + goto again; + } + + /* + * again, if we need to make absolutely sure there are no more pending + * extent operations left and we know that we skipped some, go back to + * the beginning and do it all again + */ + if (all && skipped) { + 
INIT_LIST_HEAD(&insert_list); + INIT_LIST_HEAD(&update_list); + search = 0; + skipped = 0; + num_inserts = 0; + goto again; + } +out: + btrfs_free_path(path); + return 0; +} + +static int pin_down_bytes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, int is_data) +{ + int err = 0; + struct extent_buffer *buf; + + if (is_data) + goto pinit; + + buf = btrfs_find_tree_block(root, bytenr, num_bytes); + if (!buf) + goto pinit; + + /* we can reuse a block if it hasn't been written + * and it is from this transaction. We can't + * reuse anything from the tree log root because + * it has tiny sub-transactions. + */ + if (btrfs_buffer_uptodate(buf, 0) && + btrfs_try_tree_lock(buf)) { + u64 header_owner = btrfs_header_owner(buf); + u64 header_transid = btrfs_header_generation(buf); + if (header_owner != BTRFS_TREE_LOG_OBJECTID && + header_owner != BTRFS_TREE_RELOC_OBJECTID && + header_transid == trans->transid && + !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) { + clean_tree_block(NULL, root, buf); + btrfs_tree_unlock(buf); + free_extent_buffer(buf); + return 1; + } + btrfs_tree_unlock(buf); + } + free_extent_buffer(buf); +pinit: + btrfs_update_pinned_extents(root, bytenr, num_bytes, 1); + + BUG_ON(err < 0); + return 0; +} + +/* + * remove an extent from the root, returns 0 on success + */ +static int __free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin, int mark_free) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct extent_buffer *leaf; + int ret; + int extent_slot = 0; + int found_extent = 0; + int num_to_del = 1; + struct btrfs_extent_item *ei; + u32 refs; + + key.objectid = bytenr; + btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY); + key.offset = num_bytes; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + path->reada = 1; + ret = lookup_extent_backref(trans, extent_root, path, + bytenr, parent, root_objectid, + ref_generation, owner_objectid, 1); + if (ret == 0) { + struct btrfs_key found_key; + extent_slot = path->slots[0]; + while (extent_slot > 0) { + extent_slot--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + extent_slot); + if (found_key.objectid != bytenr) + break; + if (found_key.type == BTRFS_EXTENT_ITEM_KEY && + found_key.offset == num_bytes) { + found_extent = 1; + break; + } + if (path->slots[0] - extent_slot > 5) + break; + } + if (!found_extent) { + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, + &key, path, -1, 1); + if (ret) { + printk(KERN_ERR "umm, got %d back from search" + ", was looking for %llu\n", ret, + (unsigned long long)bytenr); + btrfs_print_leaf(extent_root, path->nodes[0]); + } + BUG_ON(ret); + extent_slot = path->slots[0]; + } + } else { + btrfs_print_leaf(extent_root, path->nodes[0]); + WARN_ON(1); + printk(KERN_ERR "btrfs unable to find ref byte nr %llu " + "root %llu gen %llu owner %llu\n", + (unsigned long long)bytenr, + (unsigned long long)root_objectid, + (unsigned long long)ref_generation, + (unsigned long long)owner_objectid); + } + + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, extent_slot, + struct btrfs_extent_item); + refs = btrfs_extent_refs(leaf, ei); + BUG_ON(refs == 0); + refs -= 1; + btrfs_set_extent_refs(leaf, ei, 
refs); + + btrfs_mark_buffer_dirty(leaf); + + if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) { + struct btrfs_extent_ref *ref; + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1); + /* if the back ref and the extent are next to each other + * they get deleted below in one shot + */ + path->slots[0] = extent_slot; + num_to_del = 2; + } else if (found_extent) { + /* otherwise delete the extent back ref */ + ret = remove_extent_backref(trans, extent_root, path); + BUG_ON(ret); + /* if refs are 0, we need to setup the path for deletion */ + if (refs == 0) { + btrfs_release_path(extent_root, path); + ret = btrfs_search_slot(trans, extent_root, &key, path, + -1, 1); + BUG_ON(ret); + } + } + + if (refs == 0) { + u64 super_used; + u64 root_used; + + if (pin) { + mutex_lock(&root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, root, bytenr, num_bytes, + owner_objectid >= BTRFS_FIRST_FREE_OBJECTID); + mutex_unlock(&root->fs_info->pinned_mutex); + if (ret > 0) + mark_free = 1; + BUG_ON(ret < 0); + } + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, + super_used - num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, + root_used - num_bytes); + spin_unlock(&info->delalloc_lock); + ret = btrfs_del_items(trans, extent_root, path, path->slots[0], + num_to_del); + BUG_ON(ret); + btrfs_release_path(extent_root, path); + + if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_del_csums(trans, root, bytenr, num_bytes); + BUG_ON(ret); + } + + ret = update_block_group(trans, root, bytenr, num_bytes, 0, + mark_free); + BUG_ON(ret); + } + btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + return ret; +} + +/* + * find all the blocks marked as pending in the radix tree and remove + * them from the extent map + */ +static int del_pending_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, int all) +{ + int ret; + int err = 0; + u64 start; + u64 end; + u64 priv; + u64 search = 0; + int nr = 0, skipped = 0; + struct extent_io_tree *pending_del; + struct extent_io_tree *extent_ins; + struct pending_extent_op *extent_op; + struct btrfs_fs_info *info = extent_root->fs_info; + struct list_head delete_list; + + INIT_LIST_HEAD(&delete_list); + extent_ins = &extent_root->fs_info->extent_ins; + pending_del = &extent_root->fs_info->pending_del; + +again: + mutex_lock(&info->extent_ins_mutex); + while (1) { + ret = find_first_extent_bit(pending_del, search, &start, &end, + EXTENT_WRITEBACK); + if (ret) { + if (all && skipped && !nr) { + search = 0; + continue; + } + mutex_unlock(&info->extent_ins_mutex); + break; + } + + ret = try_lock_extent(extent_ins, start, end, GFP_NOFS); + if (!ret) { + search = end+1; + skipped = 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + + continue; + } + BUG_ON(ret < 0); + + ret = get_state_private(pending_del, start, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *)(unsigned long)priv; + + clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK, + GFP_NOFS); + if (!test_range_bit(extent_ins, start, end, + EXTENT_WRITEBACK, 0)) { + list_add_tail(&extent_op->list, &delete_list); + nr++; + } else { + kfree(extent_op); + + ret = 
get_state_private(&info->extent_ins, start, + &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + clear_extent_bits(&info->extent_ins, start, end, + EXTENT_WRITEBACK, GFP_NOFS); + + if (extent_op->type == PENDING_BACKREF_UPDATE) { + list_add_tail(&extent_op->list, &delete_list); + search = end + 1; + nr++; + continue; + } + + mutex_lock(&extent_root->fs_info->pinned_mutex); + ret = pin_down_bytes(trans, extent_root, start, + end + 1 - start, 0); + mutex_unlock(&extent_root->fs_info->pinned_mutex); + + ret = update_block_group(trans, extent_root, start, + end + 1 - start, 0, ret > 0); + + unlock_extent(extent_ins, start, end, GFP_NOFS); + BUG_ON(ret); + kfree(extent_op); + } + if (ret) + err = ret; + + search = end + 1; + + if (need_resched()) { + mutex_unlock(&info->extent_ins_mutex); + cond_resched(); + mutex_lock(&info->extent_ins_mutex); + } + } + + if (nr) { + ret = free_extents(trans, extent_root, &delete_list); + BUG_ON(ret); + } + + if (all && skipped) { + INIT_LIST_HEAD(&delete_list); + search = 0; + nr = 0; + goto again; + } + + return err; +} + +/* + * remove an extent from the root, returns 0 on success + */ +static int __btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) +{ + struct btrfs_root *extent_root = root->fs_info->extent_root; + int pending_ret; + int ret; + + WARN_ON(num_bytes < root->sectorsize); + if (root == extent_root) { + struct pending_extent_op *extent_op = NULL; + + mutex_lock(&root->fs_info->extent_ins_mutex); + if (test_range_bit(&root->fs_info->extent_ins, bytenr, + bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) { + u64 priv; + ret = get_state_private(&root->fs_info->extent_ins, + bytenr, &priv); + BUG_ON(ret); + extent_op = (struct pending_extent_op *) + (unsigned long)priv; + + extent_op->del = 1; + if (extent_op->type == PENDING_EXTENT_INSERT) { + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + } + + if (extent_op) { + ref_generation = extent_op->orig_generation; + parent = extent_op->orig_parent; + } + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_DELETE; + extent_op->bytenr = bytenr; + extent_op->num_bytes = num_bytes; + extent_op->parent = parent; + extent_op->orig_parent = parent; + extent_op->generation = ref_generation; + extent_op->orig_generation = ref_generation; + extent_op->level = (int)owner_objectid; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + set_extent_bits(&root->fs_info->pending_del, + bytenr, bytenr + num_bytes - 1, + EXTENT_WRITEBACK, GFP_NOFS); + set_state_private(&root->fs_info->pending_del, + bytenr, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + return 0; + } + /* if metadata always pin */ + if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + struct btrfs_block_group_cache *cache; + + /* btrfs_free_reserved_extent */ + cache = btrfs_lookup_block_group(root->fs_info, bytenr); + BUG_ON(!cache); + btrfs_add_free_space(cache, bytenr, num_bytes); + put_block_group(cache); + update_reserved_extents(root, bytenr, num_bytes, 0); + return 0; + } + pin = 1; + } + + /* if data pin when any transaction has committed this */ + if (ref_generation != trans->transid) + pin = 1; + + ret = __free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + 
owner_objectid, pin, pin == 0); + + finish_current_insert(trans, root->fs_info->extent_root, 0); + pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0); + return ret ? ret : pending_ret; +} + +int btrfs_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u64 num_bytes, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, int pin) +{ + int ret; + + ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent, + root_objectid, ref_generation, + owner_objectid, pin); + return ret; +} + +static u64 stripe_align(struct btrfs_root *root, u64 val) +{ + u64 mask = ((u64)root->stripesize - 1); + u64 ret = (val + mask) & ~mask; + return ret; +} + +/* + * walks the btree of allocated extents and find a hole of a given size. + * The key ins is changed to record the hole: + * ins->objectid == block start + * ins->flags = BTRFS_EXTENT_ITEM_KEY + * ins->offset == number of blocks + * Any available blocks before search_start are skipped. + */ +static noinline int find_free_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *orig_root, + u64 num_bytes, u64 empty_size, + u64 search_start, u64 search_end, + u64 hint_byte, struct btrfs_key *ins, + u64 exclude_start, u64 exclude_nr, + int data) +{ + int ret = 0; + struct btrfs_root *root = orig_root->fs_info->extent_root; + u64 total_needed = num_bytes; + u64 *last_ptr = NULL; + u64 last_wanted = 0; + struct btrfs_block_group_cache *block_group = NULL; + int chunk_alloc_done = 0; + int empty_cluster = 2 * 1024 * 1024; + int allowed_chunk_alloc = 0; + struct list_head *head = NULL, *cur = NULL; + int loop = 0; + int extra_loop = 0; + struct btrfs_space_info *space_info; + + WARN_ON(num_bytes < root->sectorsize); + btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->objectid = 0; + ins->offset = 0; + + if (orig_root->ref_cows || empty_size) + allowed_chunk_alloc = 1; + + if (data & BTRFS_BLOCK_GROUP_METADATA) { + last_ptr = &root->fs_info->last_alloc; + empty_cluster = 64 * 1024; + } + + if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) + last_ptr = &root->fs_info->last_data_alloc; + + if (last_ptr) { + if (*last_ptr) { + hint_byte = *last_ptr; + last_wanted = *last_ptr; + } else + empty_size += empty_cluster; + } else { + empty_cluster = 0; + } + search_start = max(search_start, first_logical_byte(root, 0)); + search_start = max(search_start, hint_byte); + + if (last_wanted && search_start != last_wanted) { + last_wanted = 0; + empty_size += empty_cluster; + } + + total_needed += empty_size; + block_group = btrfs_lookup_block_group(root->fs_info, search_start); + if (!block_group) + block_group = btrfs_lookup_first_block_group(root->fs_info, + search_start); + space_info = __find_space_info(root->fs_info, data); + + down_read(&space_info->groups_sem); + while (1) { + struct btrfs_free_space *free_space; + /* + * the only way this happens if our hint points to a block + * group thats not of the proper type, while looping this + * should never happen + */ + if (empty_size) + extra_loop = 1; + + if (!block_group) + goto new_group_no_lock; + + if (unlikely(!block_group->cached)) { + mutex_lock(&block_group->cache_mutex); + ret = cache_block_group(root, block_group); + mutex_unlock(&block_group->cache_mutex); + if (ret) + break; + } + + mutex_lock(&block_group->alloc_mutex); + if (unlikely(!block_group_bits(block_group, data))) + goto new_group; + + if (unlikely(block_group->ro)) + goto new_group; + + free_space = btrfs_find_free_space(block_group, 
search_start, + total_needed); + if (free_space) { + u64 start = block_group->key.objectid; + u64 end = block_group->key.objectid + + block_group->key.offset; + + search_start = stripe_align(root, free_space->offset); + + /* move on to the next group */ + if (search_start + num_bytes >= search_end) + goto new_group; + + /* move on to the next group */ + if (search_start + num_bytes > end) + goto new_group; + + if (last_wanted && search_start != last_wanted) { + total_needed += empty_cluster; + empty_size += empty_cluster; + last_wanted = 0; + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= start && + search_start < end) { + mutex_unlock(&block_group->alloc_mutex); + continue; + } + + /* else we go to the next block group */ + goto new_group; + } + + if (exclude_nr > 0 && + (search_start + num_bytes > exclude_start && + search_start < exclude_start + exclude_nr)) { + search_start = exclude_start + exclude_nr; + /* + * if search_start is still in this block group + * then we just re-search this block group + */ + if (search_start >= start && + search_start < end) { + mutex_unlock(&block_group->alloc_mutex); + last_wanted = 0; + continue; + } + + /* else we go to the next block group */ + goto new_group; + } + + ins->objectid = search_start; + ins->offset = num_bytes; + + btrfs_remove_free_space_lock(block_group, search_start, + num_bytes); + /* we are all good, let's return */ + mutex_unlock(&block_group->alloc_mutex); + break; + } +new_group: + mutex_unlock(&block_group->alloc_mutex); + put_block_group(block_group); + block_group = NULL; +new_group_no_lock: + /* don't try to compare new allocations against the + * last allocation any more + */ + last_wanted = 0; + + /* + * Here's how this works. + * loop == 0: we were searching a block group via a hint + * and didn't find anything, so we start at + * the head of the block groups and keep searching + * loop == 1: we're searching through all of the block groups + * if we hit the head again we have searched + * all of the block groups for this space and we + * need to try and allocate; if we can't, error out. + * loop == 2: we allocated more space and are looping through + * all of the block groups again. + */ + if (loop == 0) { + head = &space_info->block_groups; + cur = head->next; + loop++; + } else if (loop == 1 && cur == head) { + int keep_going; + + /* at this point we give up on the empty_size + * allocations and just try to allocate the min + * space. + * + * The extra_loop field was set if an empty_size + * allocation was attempted above, and if this + * is true we need to try the loop again without + * the additional empty_size.
+ */ + total_needed -= empty_size; + empty_size = 0; + keep_going = extra_loop; + loop++; + + if (allowed_chunk_alloc && !chunk_alloc_done) { + up_read(&space_info->groups_sem); + ret = do_chunk_alloc(trans, root, num_bytes + + 2 * 1024 * 1024, data, 1); + down_read(&space_info->groups_sem); + if (ret < 0) + goto loop_check; + head = &space_info->block_groups; + /* + * we've allocated a new chunk, keep + * trying + */ + keep_going = 1; + chunk_alloc_done = 1; + } else if (!allowed_chunk_alloc) { + space_info->force_alloc = 1; + } +loop_check: + if (keep_going) { + cur = head->next; + extra_loop = 0; + } else { + break; + } + } else if (cur == head) { + break; + } + + block_group = list_entry(cur, struct btrfs_block_group_cache, + list); + atomic_inc(&block_group->count); + + search_start = block_group->key.objectid; + cur = cur->next; + } + + /* we found what we needed */ + if (ins->objectid) { + if (!(data & BTRFS_BLOCK_GROUP_DATA)) + trans->block_group = block_group->key.objectid; + + if (last_ptr) + *last_ptr = ins->objectid + ins->offset; + ret = 0; + } else if (!ret) { + printk(KERN_ERR "btrfs searching for %llu bytes, " + "num_bytes %llu, loop %d, allowed_alloc %d\n", + (unsigned long long)total_needed, + (unsigned long long)num_bytes, + loop, allowed_chunk_alloc); + ret = -ENOSPC; + } + if (block_group) + put_block_group(block_group); + + up_read(&space_info->groups_sem); + return ret; +} + +static void dump_space_info(struct btrfs_space_info *info, u64 bytes) +{ + struct btrfs_block_group_cache *cache; + struct list_head *l; + + printk(KERN_INFO "space_info has %llu free, is %sfull\n", + (unsigned long long)(info->total_bytes - info->bytes_used - + info->bytes_pinned - info->bytes_reserved), + (info->full) ? "" : "not "); + + down_read(&info->groups_sem); + list_for_each(l, &info->block_groups) { + cache = list_entry(l, struct btrfs_block_group_cache, list); + spin_lock(&cache->lock); + printk(KERN_INFO "block group %llu has %llu bytes, %llu used " + "%llu pinned %llu reserved\n", + (unsigned long long)cache->key.objectid, + (unsigned long long)cache->key.offset, + (unsigned long long)btrfs_block_group_used(&cache->item), + (unsigned long long)cache->pinned, + (unsigned long long)cache->reserved); + btrfs_dump_free_space(cache, bytes); + spin_unlock(&cache->lock); + } + up_read(&info->groups_sem); +} + +static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + u64 search_start = 0; + u64 alloc_profile; + struct btrfs_fs_info *info = root->fs_info; + + if (data) { + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; + } else if (root == root->fs_info->chunk_root) { + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; + } else { + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; + } +again: + data = btrfs_reduce_alloc_profile(root, data); + /* + * the only place that sets empty_size is btrfs_realloc_node, which + * is not called recursively on allocations + */ + if (empty_size || root->ref_cows) { + if (!(data & BTRFS_BLOCK_GROUP_METADATA)) { + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, + BTRFS_BLOCK_GROUP_METADATA | + (info->metadata_alloc_profile 
& + info->avail_metadata_alloc_bits), 0); + } + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes + 2 * 1024 * 1024, data, 0); + } + + WARN_ON(num_bytes < root->sectorsize); + ret = find_free_extent(trans, root, num_bytes, empty_size, + search_start, search_end, hint_byte, ins, + trans->alloc_exclude_start, + trans->alloc_exclude_nr, data); + + if (ret == -ENOSPC && num_bytes > min_alloc_size) { + num_bytes = num_bytes >> 1; + num_bytes = num_bytes & ~(root->sectorsize - 1); + num_bytes = max(num_bytes, min_alloc_size); + do_chunk_alloc(trans, root->fs_info->extent_root, + num_bytes, data, 1); + goto again; + } + if (ret) { + struct btrfs_space_info *sinfo; + + sinfo = __find_space_info(root->fs_info, data); + printk(KERN_ERR "btrfs allocation failed flags %llu, " + "wanted %llu\n", (unsigned long long)data, + (unsigned long long)num_bytes); + dump_space_info(sinfo, num_bytes); + BUG(); + } + + return ret; +} + +int btrfs_free_reserved_extent(struct btrfs_root *root, u64 start, u64 len) +{ + struct btrfs_block_group_cache *cache; + int ret = 0; + + cache = btrfs_lookup_block_group(root->fs_info, start); + if (!cache) { + printk(KERN_ERR "Unable to find block group for %llu\n", + (unsigned long long)start); + return -ENOSPC; + } + + ret = btrfs_discard_extent(root, start, len); + + btrfs_add_free_space(cache, start, len); + put_block_group(cache); + update_reserved_extents(root, start, len, 0); + + return ret; +} + +int btrfs_reserve_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 min_alloc_size, + u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, + u64 data) +{ + int ret; + ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size, + empty_size, hint_byte, search_end, ins, + data); + update_reserved_extents(root, ins->objectid, ins->offset, 1); + return ret; +} + +static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + int pending_ret; + u64 super_used; + u64 root_used; + u64 num_bytes = ins->offset; + u32 sizes[2]; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_root *extent_root = info->extent_root; + struct btrfs_extent_item *extent_item; + struct btrfs_extent_ref *ref; + struct btrfs_path *path; + struct btrfs_key keys[2]; + + if (parent == 0) + parent = ins->objectid; + + /* block accounting for super block */ + spin_lock(&info->delalloc_lock); + super_used = btrfs_super_bytes_used(&info->super_copy); + btrfs_set_super_bytes_used(&info->super_copy, super_used + num_bytes); + + /* block accounting for root item */ + root_used = btrfs_root_used(&root->root_item); + btrfs_set_root_used(&root->root_item, root_used + num_bytes); + spin_unlock(&info->delalloc_lock); + + if (root == extent_root) { + struct pending_extent_op *extent_op; + + extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS); + BUG_ON(!extent_op); + + extent_op->type = PENDING_EXTENT_INSERT; + extent_op->bytenr = ins->objectid; + extent_op->num_bytes = ins->offset; + extent_op->parent = parent; + extent_op->orig_parent = 0; + extent_op->generation = ref_generation; + extent_op->orig_generation = 0; + extent_op->level = (int)owner; + INIT_LIST_HEAD(&extent_op->list); + extent_op->del = 0; + + mutex_lock(&root->fs_info->extent_ins_mutex); + set_extent_bits(&root->fs_info->extent_ins, ins->objectid, + ins->objectid + ins->offset - 1, + EXTENT_WRITEBACK, GFP_NOFS); + 
set_state_private(&root->fs_info->extent_ins, + ins->objectid, (unsigned long)extent_op); + mutex_unlock(&root->fs_info->extent_ins_mutex); + goto update_block; + } + + memcpy(&keys[0], ins, sizeof(*ins)); + keys[1].objectid = ins->objectid; + keys[1].type = BTRFS_EXTENT_REF_KEY; + keys[1].offset = parent; + sizes[0] = sizeof(*extent_item); + sizes[1] = sizeof(*ref); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + ret = btrfs_insert_empty_items(trans, extent_root, path, keys, + sizes, 2); + BUG_ON(ret); + + extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_extent_item); + btrfs_set_extent_refs(path->nodes[0], extent_item, 1); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_extent_ref); + + btrfs_set_ref_root(path->nodes[0], ref, root_objectid); + btrfs_set_ref_generation(path->nodes[0], ref, ref_generation); + btrfs_set_ref_objectid(path->nodes[0], ref, owner); + btrfs_set_ref_num_refs(path->nodes[0], ref, 1); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + trans->alloc_exclude_start = 0; + trans->alloc_exclude_nr = 0; + btrfs_free_path(path); + finish_current_insert(trans, extent_root, 0); + pending_ret = del_pending_extents(trans, extent_root, 0); + + if (ret) + goto out; + if (pending_ret) { + ret = pending_ret; + goto out; + } + +update_block: + ret = update_block_group(trans, root, ins->objectid, + ins->offset, 1, 0); + if (ret) { + printk(KERN_ERR "btrfs update block group failed for %llu " + "%llu\n", (unsigned long long)ins->objectid, + (unsigned long long)ins->offset); + BUG(); + } +out: + return ret; +} + +int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + + if (root_objectid == BTRFS_TREE_LOG_OBJECTID) + return 0; + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + update_reserved_extents(root, ins->objectid, ins->offset, 0); + return ret; +} + +/* + * this is used by the tree logging recovery code. It records that + * an extent has been allocated and makes sure to clear the free + * space cache bits as well + */ +int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, u64 ref_generation, + u64 owner, struct btrfs_key *ins) +{ + int ret; + struct btrfs_block_group_cache *block_group; + + block_group = btrfs_lookup_block_group(root->fs_info, ins->objectid); + mutex_lock(&block_group->cache_mutex); + cache_block_group(root, block_group); + mutex_unlock(&block_group->cache_mutex); + + ret = btrfs_remove_free_space(block_group, ins->objectid, + ins->offset); + BUG_ON(ret); + put_block_group(block_group); + ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid, + ref_generation, owner, ins); + return ret; +} + +/* + * finds a free extent and does all the dirty work required for allocation + * returns the key for the extent through ins, and a tree buffer for + * the first block of the extent through buf. + * + * returns 0 if everything worked, non-zero otherwise. 
+ */ +int btrfs_alloc_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 num_bytes, u64 parent, u64 min_alloc_size, + u64 root_objectid, u64 ref_generation, + u64 owner_objectid, u64 empty_size, u64 hint_byte, + u64 search_end, struct btrfs_key *ins, u64 data) +{ + int ret; + + ret = __btrfs_reserve_extent(trans, root, num_bytes, + min_alloc_size, empty_size, hint_byte, + search_end, ins, data); + BUG_ON(ret); + if (root_objectid != BTRFS_TREE_LOG_OBJECTID) { + ret = __btrfs_alloc_reserved_extent(trans, root, parent, + root_objectid, ref_generation, + owner_objectid, ins); + BUG_ON(ret); + + } else { + update_reserved_extents(root, ins->objectid, ins->offset, 1); + } + return ret; +} + +struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 bytenr, u32 blocksize) +{ + struct extent_buffer *buf; + + buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + if (!buf) + return ERR_PTR(-ENOMEM); + btrfs_set_header_generation(buf, trans->transid); + btrfs_tree_lock(buf); + clean_tree_block(trans, root, buf); + btrfs_set_buffer_uptodate(buf); + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + set_extent_dirty(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } else { + set_extent_dirty(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, GFP_NOFS); + } + trans->blocks_used++; + return buf; +} + +/* + * helper function to allocate a block for a given tree + * returns the tree buffer or NULL. + */ +struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u32 blocksize, u64 parent, + u64 root_objectid, + u64 ref_generation, + int level, + u64 hint, + u64 empty_size) +{ + struct btrfs_key ins; + int ret; + struct extent_buffer *buf; + + ret = btrfs_alloc_extent(trans, root, blocksize, parent, blocksize, + root_objectid, ref_generation, level, + empty_size, hint, (u64)-1, &ins, 0); + if (ret) { + BUG_ON(ret > 0); + return ERR_PTR(ret); + } + + buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); + return buf; +} + +int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct extent_buffer *leaf) +{ + u64 leaf_owner; + u64 leaf_generation; + struct btrfs_key key; + struct btrfs_file_extent_item *fi; + int i; + int nritems; + int ret; + + BUG_ON(!btrfs_is_leaf(leaf)); + nritems = btrfs_header_nritems(leaf); + leaf_owner = btrfs_header_owner(leaf); + leaf_generation = btrfs_header_generation(leaf); + + for (i = 0; i < nritems; i++) { + u64 disk_bytenr; + cond_resched(); + + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + /* + * FIXME make sure to insert a trans record that + * repeats the snapshot del on crash + */ + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + if (disk_bytenr == 0) + continue; + + ret = __btrfs_free_extent(trans, root, disk_bytenr, + btrfs_file_extent_disk_num_bytes(leaf, fi), + leaf->start, leaf_owner, leaf_generation, + key.objectid, 0); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + } + return 0; +} + +static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_leaf_ref *ref) +{ + int i; + int ret; + 
struct btrfs_extent_info *info = ref->extents; + + for (i = 0; i < ref->nritems; i++) { + ret = __btrfs_free_extent(trans, root, info->bytenr, + info->num_bytes, ref->bytenr, + ref->owner, ref->generation, + info->objectid, 0); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + BUG_ON(ret); + info++; + } + + return 0; +} + +static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start, + u64 len, u32 *refs) +{ + int ret; + + ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + +#if 0 /* some debugging code in case we see problems here */ + /* if the refs count is one, it won't get increased again. But + * if the ref count is > 1, someone may be decreasing it at + * the same time we are. + */ + if (*refs != 1) { + struct extent_buffer *eb = NULL; + eb = btrfs_find_create_tree_block(root, start, len); + if (eb) + btrfs_tree_lock(eb); + + mutex_lock(&root->fs_info->alloc_mutex); + ret = lookup_extent_ref(NULL, root, start, len, refs); + BUG_ON(ret); + mutex_unlock(&root->fs_info->alloc_mutex); + + if (eb) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + } + if (*refs == 1) { + printk(KERN_ERR "btrfs block %llu went down to one " + "during drop_snap\n", (unsigned long long)start); + } + + } +#endif + + cond_resched(); + return ret; +} + +/* + * helper function for drop_snapshot, this walks down the tree dropping ref + * counts as it goes. + */ +static noinline int walk_down_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + struct btrfs_leaf_ref *ref; + u32 blocksize; + int ret; + u32 refs; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start, + path->nodes[*level]->len, &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + /* + * walk down to the last node level and free all the leaves + */ + while (*level >= 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + break; + } + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs); + BUG_ON(ret); + if (refs != 1) { + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + path->slots[*level]++; + + ret = __btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + root_owner, root_gen, + *level - 1, 1); + BUG_ON(ret); + + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + cond_resched(); + + continue; + } + /* + * at this point, we have a single ref, and since the + * only place referencing this extent is a dead root + * the reference count should never go higher. 
+ * So, we don't need to check it again + */ + if (*level == 1) { + ref = btrfs_lookup_leaf_ref(root, bytenr); + if (ref && ref->generation != ptr_gen) { + btrfs_free_leaf_ref(root, ref); + ref = NULL; + } + if (ref) { + ret = cache_drop_leaf_ref(trans, root, ref); + BUG_ON(ret); + btrfs_remove_leaf_ref(root, ref); + btrfs_free_leaf_ref(root, ref); + *level = 0; + break; + } + } + next = btrfs_find_tree_block(root, bytenr, blocksize); + if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) { + free_extent_buffer(next); + + next = read_tree_block(root, bytenr, blocksize, + ptr_gen); + cond_resched(); +#if 0 + /* + * this is a debugging check and can go away + * the ref should never go all the way down to 1 + * at this point + */ + ret = lookup_extent_ref(NULL, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + WARN_ON(refs != 1); +#endif + } + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } +out: + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) { + parent = path->nodes[*level]; + bytenr = path->nodes[*level]->start; + } else { + parent = path->nodes[*level + 1]; + bytenr = btrfs_node_blockptr(parent, path->slots[*level + 1]); + } + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + ret = __btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, root_owner, root_gen, + *level, 1); + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + BUG_ON(ret); + + cond_resched(); + return 0; +} + +/* + * helper function for drop_subtree, this function is similar to + * walk_down_tree. The main difference is that it checks reference + * counts while tree blocks are locked. 
+ */ +static noinline int walk_down_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level) +{ + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u64 bytenr; + u64 ptr_gen; + u32 blocksize; + u32 refs; + int ret; + + cur = path->nodes[*level]; + ret = btrfs_lookup_extent_ref(trans, root, cur->start, cur->len, + &refs); + BUG_ON(ret); + if (refs > 1) + goto out; + + while (*level >= 0) { + cur = path->nodes[*level]; + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, cur); + BUG_ON(ret); + clean_tree_block(trans, root, cur); + break; + } + if (path->slots[*level] >= btrfs_header_nritems(cur)) { + clean_tree_block(trans, root, cur); + break; + } + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + + next = read_tree_block(root, bytenr, blocksize, ptr_gen); + btrfs_tree_lock(next); + + ret = btrfs_lookup_extent_ref(trans, root, bytenr, blocksize, + &refs); + BUG_ON(ret); + if (refs > 1) { + parent = path->nodes[*level]; + ret = btrfs_free_extent(trans, root, bytenr, + blocksize, parent->start, + btrfs_header_owner(parent), + btrfs_header_generation(parent), + *level - 1, 1); + BUG_ON(ret); + path->slots[*level]++; + btrfs_tree_unlock(next); + free_extent_buffer(next); + continue; + } + + *level = btrfs_header_level(next); + path->nodes[*level] = next; + path->slots[*level] = 0; + path->locks[*level] = 1; + cond_resched(); + } +out: + parent = path->nodes[*level + 1]; + bytenr = path->nodes[*level]->start; + blocksize = path->nodes[*level]->len; + + ret = btrfs_free_extent(trans, root, bytenr, blocksize, + parent->start, btrfs_header_owner(parent), + btrfs_header_generation(parent), *level, 1); + BUG_ON(ret); + + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + cond_resched(); + return 0; +} + +/* + * helper for dropping snapshots. 
This walks back up the tree in the path + * to find the first node higher up where we haven't yet gone through + * all the slots + */ +static noinline int walk_up_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + int *level, int max_level) +{ + u64 root_owner; + u64 root_gen; + struct btrfs_root_item *root_item = &root->root_item; + int i; + int slot; + int ret; + + for (i = *level; i < max_level && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + struct btrfs_disk_key disk_key; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + btrfs_node_key(node, &disk_key, path->slots[i]); + memcpy(&root_item->drop_progress, + &disk_key, sizeof(disk_key)); + root_item->drop_level = i; + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + clean_tree_block(trans, root, path->nodes[*level]); + ret = btrfs_free_extent(trans, root, + path->nodes[*level]->start, + path->nodes[*level]->len, + parent->start, root_owner, + root_gen, *level, 1); + BUG_ON(ret); + if (path->locks[*level]) { + btrfs_tree_unlock(path->nodes[*level]); + path->locks[*level] = 0; + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. + */ +int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root + *root) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + struct btrfs_root_item *root_item = &root->root_item; + + WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex)); + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(root->node); + orig_level = level; + if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) { + path->nodes[level] = root->node; + extent_buffer_get(root->node); + path->slots[level] = 0; + } else { + struct btrfs_key key; + struct btrfs_disk_key found_key; + struct extent_buffer *node; + + btrfs_disk_key_to_cpu(&key, &root_item->drop_progress); + level = root_item->drop_level; + path->lowest_level = level; + wret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (wret < 0) { + ret = wret; + goto out; + } + node = path->nodes[level]; + btrfs_node_key(node, &found_key, path->slots[level]); + WARN_ON(memcmp(&found_key, &root_item->drop_progress, + sizeof(found_key))); + /* + * unlock our path, this is safe because only this + * function is allowed to delete this snapshot + */ + for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + if (path->nodes[i] && path->locks[i]) { + path->locks[i] = 0; + btrfs_tree_unlock(path->nodes[i]); + } + } + } + while (1) { + wret = walk_down_tree(trans, root, path, &level); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_tree(trans, root, path, &level, + BTRFS_MAX_LEVEL); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + if (trans->transaction->in_commit) { + ret = -EAGAIN; + break; + } + atomic_inc(&root->fs_info->throttle_gen); + wake_up(&root->fs_info->transaction_throttle); + } + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + 
free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_drop_subtree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *node, + struct extent_buffer *parent) +{ + struct btrfs_path *path; + int level; + int parent_level; + int ret = 0; + int wret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + BUG_ON(!btrfs_tree_locked(parent)); + parent_level = btrfs_header_level(parent); + extent_buffer_get(parent); + path->nodes[parent_level] = parent; + path->slots[parent_level] = btrfs_header_nritems(parent); + + BUG_ON(!btrfs_tree_locked(node)); + level = btrfs_header_level(node); + extent_buffer_get(node); + path->nodes[level] = node; + path->slots[level] = 0; + + while (1) { + wret = walk_down_subtree(trans, root, path, &level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + + wret = walk_up_tree(trans, root, path, &level, parent_level); + if (wret < 0) + ret = wret; + if (wret != 0) + break; + } + + btrfs_free_path(path); + return ret; +} + +static unsigned long calc_ra(unsigned long start, unsigned long last, + unsigned long nr) +{ + return min(last, start + nr - 1); +} + +static noinline int relocate_inode_pages(struct inode *inode, u64 start, + u64 len) +{ + u64 page_start; + u64 page_end; + unsigned long first_index; + unsigned long last_index; + unsigned long i; + struct page *page; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct file_ra_state *ra; + struct btrfs_ordered_extent *ordered; + unsigned int total_read = 0; + unsigned int total_dirty = 0; + int ret = 0; + + ra = kzalloc(sizeof(*ra), GFP_NOFS); + + mutex_lock(&inode->i_mutex); + first_index = start >> PAGE_CACHE_SHIFT; + last_index = (start + len - 1) >> PAGE_CACHE_SHIFT; + + /* make sure the dirty trick played by the caller work */ + ret = invalidate_inode_pages2_range(inode->i_mapping, + first_index, last_index); + if (ret) + goto out_unlock; + + file_ra_state_init(ra, inode->i_mapping); + + for (i = first_index ; i <= last_index; i++) { + if (total_read % ra->ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, ra, NULL, i, + calc_ra(i, last_index, ra->ra_pages)); + } + total_read++; +again: + if (((u64)i << PAGE_CACHE_SHIFT) > i_size_read(inode)) + BUG_ON(1); + page = grab_cache_page(inode->i_mapping, i); + if (!page) { + ret = -ENOMEM; + goto out_unlock; + } + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + if (i == first_index) + set_extent_bits(io_tree, page_start, page_end, + EXTENT_BOUNDARY, GFP_NOFS); + btrfs_set_extent_delalloc(inode, page_start, page_end); + + set_page_dirty(page); + total_dirty++; + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + } + +out_unlock: + kfree(ra); + mutex_unlock(&inode->i_mutex); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 
total_dirty); + return ret; +} + +static noinline int relocate_data_extent(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(reloc_inode)->extent_tree; + struct extent_map *em; + u64 start = extent_key->objectid - offset; + u64 end = start + extent_key->offset - 1; + + em = alloc_extent_map(GFP_NOFS); + BUG_ON(!em || IS_ERR(em)); + + em->start = start; + em->len = extent_key->offset; + em->block_len = extent_key->offset; + em->block_start = extent_key->objectid; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + /* setup extent map to cheat btrfs_readpage */ + lock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + while (1) { + int ret; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(reloc_inode, start, end, 0); + } + unlock_extent(&BTRFS_I(reloc_inode)->io_tree, start, end, GFP_NOFS); + + return relocate_inode_pages(reloc_inode, start, extent_key->offset); +} + +struct btrfs_ref_path { + u64 extent_start; + u64 nodes[BTRFS_MAX_LEVEL]; + u64 root_objectid; + u64 root_generation; + u64 owner_objectid; + u32 num_refs; + int lowest_level; + int current_level; + int shared_level; + + struct btrfs_key node_keys[BTRFS_MAX_LEVEL]; + u64 new_nodes[BTRFS_MAX_LEVEL]; +}; + +struct disk_extent { + u64 ram_bytes; + u64 disk_bytenr; + u64 disk_num_bytes; + u64 offset; + u64 num_bytes; + u8 compression; + u8 encryption; + u16 other_encoding; +}; + +static int is_cowonly_root(u64 root_objectid) +{ + if (root_objectid == BTRFS_ROOT_TREE_OBJECTID || + root_objectid == BTRFS_EXTENT_TREE_OBJECTID || + root_objectid == BTRFS_CHUNK_TREE_OBJECTID || + root_objectid == BTRFS_DEV_TREE_OBJECTID || + root_objectid == BTRFS_TREE_LOG_OBJECTID || + root_objectid == BTRFS_CSUM_TREE_OBJECTID) + return 1; + return 0; +} + +static noinline int __next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + int first_time) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_extent_ref *ref; + struct btrfs_key key; + struct btrfs_key found_key; + u64 bytenr; + u32 nritems; + int level; + int ret = 1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + if (first_time) { + ref_path->lowest_level = -1; + ref_path->current_level = -1; + ref_path->shared_level = -1; + goto walk_up; + } +walk_down: + level = ref_path->current_level - 1; + while (level >= -1) { + u64 parent; + if (level < ref_path->lowest_level) + break; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + BUG_ON(bytenr == 0); + + parent = ref_path->nodes[level + 1]; + ref_path->nodes[level + 1] = 0; + ref_path->current_level = level; + BUG_ON(parent == 0); + + key.objectid = bytenr; + key.offset = parent + 1; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + BUG_ON(ret == 0); + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) + goto next; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid == bytenr && + found_key.type == 
BTRFS_EXTENT_REF_KEY) { + if (level < ref_path->shared_level) + ref_path->shared_level = level; + goto found; + } +next: + level--; + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached lowest level */ + ret = 1; + goto out; +walk_up: + level = ref_path->current_level; + while (level < BTRFS_MAX_LEVEL - 1) { + u64 ref_objectid; + + if (level >= 0) + bytenr = ref_path->nodes[level]; + else + bytenr = ref_path->extent_start; + + BUG_ON(bytenr == 0); + + key.objectid = bytenr; + key.offset = 0; + key.type = BTRFS_EXTENT_REF_KEY; + + ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 0); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(extent_root, path); + if (ret < 0) + goto out; + if (ret > 0) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) + goto out; + btrfs_release_path(extent_root, path); + goto walk_down; + } + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != bytenr || + found_key.type != BTRFS_EXTENT_REF_KEY) { + /* the extent was freed by someone */ + if (ref_path->lowest_level == level) { + ret = 1; + goto out; + } + btrfs_release_path(extent_root, path); + goto walk_down; + } +found: + ref = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_extent_ref); + ref_objectid = btrfs_ref_objectid(leaf, ref); + if (ref_objectid < BTRFS_FIRST_FREE_OBJECTID) { + if (first_time) { + level = (int)ref_objectid; + BUG_ON(level >= BTRFS_MAX_LEVEL); + ref_path->lowest_level = level; + ref_path->current_level = level; + ref_path->nodes[level] = bytenr; + } else { + WARN_ON(ref_objectid != level); + } + } else { + WARN_ON(level != -1); + } + first_time = 0; + + if (ref_path->lowest_level == level) { + ref_path->owner_objectid = ref_objectid; + ref_path->num_refs = btrfs_ref_num_refs(leaf, ref); + } + + /* + * the block is tree root or the block isn't in reference + * counted tree. + */ + if (found_key.objectid == found_key.offset || + is_cowonly_root(btrfs_ref_root(leaf, ref))) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + if (level < 0) { + /* special reference from the tree log */ + ref_path->nodes[0] = found_key.offset; + ref_path->current_level = 0; + } + ret = 0; + goto out; + } + + level++; + BUG_ON(ref_path->nodes[level] != 0); + ref_path->nodes[level] = found_key.offset; + ref_path->current_level = level; + + /* + * the reference was created in the running transaction, + * no need to continue walking up. + */ + if (btrfs_ref_generation(leaf, ref) == trans->transid) { + ref_path->root_objectid = btrfs_ref_root(leaf, ref); + ref_path->root_generation = + btrfs_ref_generation(leaf, ref); + ret = 0; + goto out; + } + + btrfs_release_path(extent_root, path); + cond_resched(); + } + /* reached max tree level, but no tree root found. 
*/ + BUG(); +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_first_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path, + u64 extent_start) +{ + memset(ref_path, 0, sizeof(*ref_path)); + ref_path->extent_start = extent_start; + + return __next_ref_path(trans, extent_root, ref_path, 1); +} + +static int btrfs_next_ref_path(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_ref_path *ref_path) +{ + return __next_ref_path(trans, extent_root, ref_path, 0); +} + +static noinline int get_new_locations(struct inode *reloc_inode, + struct btrfs_key *extent_key, + u64 offset, int no_fragment, + struct disk_extent **extents, + int *nr_extents) +{ + struct btrfs_root *root = BTRFS_I(reloc_inode)->root; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct disk_extent *exts = *extents; + struct btrfs_key found_key; + u64 cur_pos; + u64 last_byte; + u32 nritems; + int nr = 0; + int max = *nr_extents; + int ret; + + WARN_ON(!no_fragment && *extents); + if (!exts) { + max = 1; + exts = kmalloc(sizeof(*exts) * max, GFP_NOFS); + if (!exts) + return -ENOMEM; + } + + path = btrfs_alloc_path(); + BUG_ON(!path); + + cur_pos = extent_key->objectid - offset; + last_byte = extent_key->objectid + extent_key->offset; + ret = btrfs_lookup_file_extent(NULL, root, path, reloc_inode->i_ino, + cur_pos, 0); + if (ret < 0) + goto out; + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.offset != cur_pos || + found_key.type != BTRFS_EXTENT_DATA_KEY || + found_key.objectid != reloc_inode->i_ino) + break; + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != + BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + break; + + if (nr == max) { + struct disk_extent *old = exts; + max *= 2; + exts = kzalloc(sizeof(*exts) * max, GFP_NOFS); + memcpy(exts, old, sizeof(*exts) * nr); + if (old != *extents) + kfree(old); + } + + exts[nr].disk_bytenr = + btrfs_file_extent_disk_bytenr(leaf, fi); + exts[nr].disk_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, fi); + exts[nr].offset = btrfs_file_extent_offset(leaf, fi); + exts[nr].num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + exts[nr].ram_bytes = btrfs_file_extent_ram_bytes(leaf, fi); + exts[nr].compression = btrfs_file_extent_compression(leaf, fi); + exts[nr].encryption = btrfs_file_extent_encryption(leaf, fi); + exts[nr].other_encoding = btrfs_file_extent_other_encoding(leaf, + fi); + BUG_ON(exts[nr].offset > 0); + BUG_ON(exts[nr].compression || exts[nr].encryption); + BUG_ON(exts[nr].num_bytes != exts[nr].disk_num_bytes); + + cur_pos += exts[nr].num_bytes; + nr++; + + if (cur_pos + offset >= last_byte) + break; + + if (no_fragment) { + ret = 1; + goto out; + } + path->slots[0]++; + } + + BUG_ON(cur_pos + offset > last_byte); + if (cur_pos + offset < last_byte) { + ret = -ENOENT; + goto out; + } + ret = 0; +out: + btrfs_free_path(path); + if (ret) { + if (exts != *extents) + kfree(exts); + } else { + *extents = exts; + *nr_extents = nr; + } + return ret; +} + +static noinline int replace_one_extent(struct btrfs_trans_handle 
*trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_key *leaf_key, + struct btrfs_ref_path *ref_path, + struct disk_extent *new_extents, + int nr_extents) +{ + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + struct inode *inode = NULL; + struct btrfs_key key; + u64 lock_start = 0; + u64 lock_end = 0; + u64 num_bytes; + u64 ext_offset; + u64 first_pos; + u32 nritems; + int nr_scaned = 0; + int extent_locked = 0; + int extent_type; + int ret; + + memcpy(&key, leaf_key, sizeof(key)); + first_pos = INT_LIMIT(loff_t) - extent_key->offset; + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if (key.objectid < ref_path->owner_objectid || + (key.objectid == ref_path->owner_objectid && + key.type < BTRFS_EXTENT_DATA_KEY)) { + key.objectid = ref_path->owner_objectid; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + } + } + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); +next: + if (extent_locked && ret > 0) { + /* + * the file extent item was modified by someone + * before the extent got locked. + */ + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } + + if (path->slots[0] >= nritems) { + if (++nr_scaned > 2) + break; + + BUG_ON(extent_locked); + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS) { + if ((key.objectid > ref_path->owner_objectid) || + (key.objectid == ref_path->owner_objectid && + key.type > BTRFS_EXTENT_DATA_KEY) || + (key.offset >= first_pos + extent_key->offset)) + break; + } + + if (inode && key.objectid != inode->i_ino) { + BUG_ON(extent_locked); + btrfs_release_path(root, path); + mutex_unlock(&inode->i_mutex); + iput(inode); + inode = NULL; + continue; + } + + if (key.type != BTRFS_EXTENT_DATA_KEY) { + path->slots[0]++; + ret = 1; + goto next; + } + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + if ((extent_type != BTRFS_FILE_EXTENT_REG && + extent_type != BTRFS_FILE_EXTENT_PREALLOC) || + (btrfs_file_extent_disk_bytenr(leaf, fi) != + extent_key->objectid)) { + path->slots[0]++; + ret = 1; + goto next; + } + + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + ext_offset = btrfs_file_extent_offset(leaf, fi); + + if (first_pos > key.offset - ext_offset) + first_pos = key.offset - ext_offset; + + if (!extent_locked) { + lock_start = key.offset; + lock_end = lock_start + num_bytes - 1; + } else { + if (lock_start > key.offset || + lock_end + 1 < key.offset + num_bytes) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + extent_locked = 0; + } + } + + if (!inode) { + btrfs_release_path(root, path); + + inode = btrfs_iget_locked(root->fs_info->sb, + key.objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = + key.objectid; + BTRFS_I(inode)->location.type = + BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + /* + * some code call btrfs_commit_transaction while + * holding the i_mutex, so we can't use mutex_lock + * here. 
+ */ + if (is_bad_inode(inode) || + !mutex_trylock(&inode->i_mutex)) { + iput(inode); + inode = NULL; + key.offset = (u64)-1; + goto skip; + } + } + + if (!extent_locked) { + struct btrfs_ordered_extent *ordered; + + btrfs_release_path(root, path); + + lock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + lock_end); + if (ordered && + ordered->file_offset <= lock_end && + ordered->file_offset + ordered->len > lock_start) { + unlock_extent(&BTRFS_I(inode)->io_tree, + lock_start, lock_end, GFP_NOFS); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + key.offset += num_bytes; + goto skip; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + extent_locked = 1; + continue; + } + + if (nr_extents == 1) { + /* update extent pointer in place */ + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[0].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[0].disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[0].disk_bytenr, + new_extents[0].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, + key.objectid); + BUG_ON(ret); + + ret = btrfs_free_extent(trans, root, + extent_key->objectid, + extent_key->offset, + leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + + btrfs_release_path(root, path); + key.offset += num_bytes; + } else { + BUG_ON(1); +#if 0 + u64 alloc_hint; + u64 extent_len; + int i; + /* + * drop old extent pointer at first, then insert the + * new pointers one bye one + */ + btrfs_release_path(root, path); + ret = btrfs_drop_extents(trans, root, inode, key.offset, + key.offset + num_bytes, + key.offset, &alloc_hint); + BUG_ON(ret); + + for (i = 0; i < nr_extents; i++) { + if (ext_offset >= new_extents[i].num_bytes) { + ext_offset -= new_extents[i].num_bytes; + continue; + } + extent_len = min(new_extents[i].num_bytes - + ext_offset, num_bytes); + + ret = btrfs_insert_empty_item(trans, root, + path, &key, + sizeof(*fi)); + BUG_ON(ret); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, + trans->transid); + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extents[i].disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extents[i].disk_num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, + new_extents[i].ram_bytes); + + btrfs_set_file_extent_compression(leaf, fi, + new_extents[i].compression); + btrfs_set_file_extent_encryption(leaf, fi, + new_extents[i].encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, + new_extents[i].other_encoding); + + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_len); + ext_offset += new_extents[i].offset; + btrfs_set_file_extent_offset(leaf, fi, + ext_offset); + btrfs_mark_buffer_dirty(leaf); + + btrfs_drop_extent_cache(inode, key.offset, + key.offset + extent_len - 1, 0); + + ret = btrfs_inc_extent_ref(trans, root, + new_extents[i].disk_bytenr, + new_extents[i].disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + btrfs_release_path(root, path); + + inode_add_bytes(inode, extent_len); + + ext_offset = 0; + num_bytes -= extent_len; + key.offset += 
extent_len; + + if (num_bytes == 0) + break; + } + BUG_ON(i >= nr_extents); +#endif + } + + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + extent_locked = 0; + } +skip: + if (ref_path->owner_objectid != BTRFS_MULTIPLE_OBJECTIDS && + key.offset >= first_pos + extent_key->offset) + break; + + cond_resched(); + } + ret = 0; +out: + btrfs_release_path(root, path); + if (inode) { + mutex_unlock(&inode->i_mutex); + if (extent_locked) { + unlock_extent(&BTRFS_I(inode)->io_tree, lock_start, + lock_end, GFP_NOFS); + } + iput(inode); + } + return ret; +} + +int btrfs_reloc_tree_cache_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *buf, u64 orig_start) +{ + int level; + int ret; + + BUG_ON(btrfs_header_generation(buf) != trans->transid); + BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); + + level = btrfs_header_level(buf); + if (level == 0) { + struct btrfs_leaf_ref *ref; + struct btrfs_leaf_ref *orig_ref; + + orig_ref = btrfs_lookup_leaf_ref(root, orig_start); + if (!orig_ref) + return -ENOENT; + + ref = btrfs_alloc_leaf_ref(root, orig_ref->nritems); + if (!ref) { + btrfs_free_leaf_ref(root, orig_ref); + return -ENOMEM; + } + + ref->nritems = orig_ref->nritems; + memcpy(ref->extents, orig_ref->extents, + sizeof(ref->extents[0]) * ref->nritems); + + btrfs_free_leaf_ref(root, orig_ref); + + ref->root_gen = trans->transid; + ref->bytenr = buf->start; + ref->owner = btrfs_header_owner(buf); + ref->generation = btrfs_header_generation(buf); + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } + return 0; +} + +static noinline int invalidate_extent_cache(struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct btrfs_root *target_root) +{ + struct btrfs_key key; + struct inode *inode = NULL; + struct btrfs_file_extent_item *fi; + u64 num_bytes; + u64 skip_objectid = 0; + u32 nritems; + u32 i; + + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (key.objectid == skip_objectid || + key.type != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + if (btrfs_file_extent_disk_bytenr(leaf, fi) == 0) + continue; + if (!inode || inode->i_ino != key.objectid) { + iput(inode); + inode = btrfs_ilookup(target_root->fs_info->sb, + key.objectid, target_root, 1); + } + if (!inode) { + skip_objectid = key.objectid; + continue; + } + num_bytes = btrfs_file_extent_num_bytes(leaf, fi); + + lock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + btrfs_drop_extent_cache(inode, key.offset, + key.offset + num_bytes - 1, 1); + unlock_extent(&BTRFS_I(inode)->io_tree, key.offset, + key.offset + num_bytes - 1, GFP_NOFS); + cond_resched(); + } + iput(inode); + return 0; +} + +static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct extent_buffer *leaf, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_key key; + struct btrfs_key extent_key; + struct btrfs_file_extent_item *fi; + struct btrfs_leaf_ref *ref; + struct disk_extent *new_extent; + u64 bytenr; + u64 num_bytes; + u32 nritems; + u32 i; + int ext_index; + int nr_extent; + int ret; + + new_extent = kmalloc(sizeof(*new_extent), GFP_NOFS); + BUG_ON(!new_extent); 
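+	/*
+	 * the cached leaf ref holds one record per data extent in this
+	 * leaf; it is updated in lockstep with the file extent items
+	 * rewritten below so the cache never goes stale.
+	 */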
+ + ref = btrfs_lookup_leaf_ref(root, leaf->start); + BUG_ON(!ref); + + ext_index = -1; + nritems = btrfs_header_nritems(leaf); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(leaf, &key, i); + if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + continue; + fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) == + BTRFS_FILE_EXTENT_INLINE) + continue; + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + if (bytenr == 0) + continue; + + ext_index++; + if (bytenr >= group->key.objectid + group->key.offset || + bytenr + num_bytes <= group->key.objectid) + continue; + + extent_key.objectid = bytenr; + extent_key.offset = num_bytes; + extent_key.type = BTRFS_EXTENT_ITEM_KEY; + nr_extent = 1; + ret = get_new_locations(reloc_inode, &extent_key, + group->key.objectid, 1, + &new_extent, &nr_extent); + if (ret > 0) + continue; + BUG_ON(ret < 0); + + BUG_ON(ref->extents[ext_index].bytenr != bytenr); + BUG_ON(ref->extents[ext_index].num_bytes != num_bytes); + ref->extents[ext_index].bytenr = new_extent->disk_bytenr; + ref->extents[ext_index].num_bytes = new_extent->disk_num_bytes; + + btrfs_set_file_extent_disk_bytenr(leaf, fi, + new_extent->disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, + new_extent->disk_num_bytes); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_inc_extent_ref(trans, root, + new_extent->disk_bytenr, + new_extent->disk_num_bytes, + leaf->start, + root->root_key.objectid, + trans->transid, key.objectid); + BUG_ON(ret); + ret = btrfs_free_extent(trans, root, + bytenr, num_bytes, leaf->start, + btrfs_header_owner(leaf), + btrfs_header_generation(leaf), + key.objectid, 0); + BUG_ON(ret); + cond_resched(); + } + kfree(new_extent); + BUG_ON(ext_index + 1 != ref->nritems); + btrfs_free_leaf_ref(root, ref); + return 0; +} + +int btrfs_free_reloc_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + int ret; + + if (root->reloc_root) { + reloc_root = root->reloc_root; + root->reloc_root = NULL; + list_add(&reloc_root->dead_list, + &root->fs_info->dead_reloc_roots); + + btrfs_set_root_bytenr(&reloc_root->root_item, + reloc_root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(reloc_root->node)); + memset(&reloc_root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + reloc_root->root_item.drop_level = 0; + + ret = btrfs_update_root(trans, root->fs_info->tree_root, + &reloc_root->root_key, + &reloc_root->root_item); + BUG_ON(ret); + } + return 0; +} + +int btrfs_drop_dead_reloc_roots(struct btrfs_root *root) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *reloc_root; + struct btrfs_root *prev_root = NULL; + struct list_head dead_roots; + int ret; + unsigned long nr; + + INIT_LIST_HEAD(&dead_roots); + list_splice_init(&root->fs_info->dead_reloc_roots, &dead_roots); + + while (!list_empty(&dead_roots)) { + reloc_root = list_entry(dead_roots.prev, + struct btrfs_root, dead_list); + list_del_init(&reloc_root->dead_list); + + BUG_ON(reloc_root->commit_root != NULL); + while (1) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, reloc_root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + } + + 
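+		/*
+		 * anything other than -EAGAIN ends the retry loop above, so
+		 * the snapshot under this reloc root is gone (or dropping it
+		 * failed fatally).  drop_mutex and the transaction are still
+		 * held; release the root node and delete the root item before
+		 * ending the transaction.
+		 */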
free_extent_buffer(reloc_root->node); + + ret = btrfs_del_root(trans, root->fs_info->tree_root, + &reloc_root->root_key); + BUG_ON(ret); + mutex_unlock(&root->fs_info->drop_mutex); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); + + kfree(prev_root); + prev_root = reloc_root; + } + if (prev_root) { + btrfs_remove_leaf_refs(prev_root, (u64)-1, 0); + kfree(prev_root); + } + return 0; +} + +int btrfs_add_dead_reloc_root(struct btrfs_root *root) +{ + list_add(&root->dead_list, &root->fs_info->dead_reloc_roots); + return 0; +} + +int btrfs_cleanup_reloc_trees(struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct btrfs_trans_handle *trans; + struct btrfs_key location; + int found; + int ret; + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = btrfs_find_dead_roots(root, BTRFS_TREE_RELOC_OBJECTID, NULL); + BUG_ON(ret); + found = !list_empty(&root->fs_info->dead_reloc_roots); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + + if (found) { + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + ret = btrfs_commit_transaction(trans, root); + BUG_ON(ret); + } + + location.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + location.offset = (u64)-1; + location.type = BTRFS_ROOT_ITEM_KEY; + + reloc_root = btrfs_read_fs_root_no_name(root->fs_info, &location); + BUG_ON(!reloc_root); + btrfs_orphan_cleanup(reloc_root); + return 0; +} + +static noinline int init_reloc_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb; + struct btrfs_root_item *root_item; + struct btrfs_key root_key; + int ret; + + BUG_ON(!root->ref_cows); + if (root->reloc_root) + return 0; + + root_item = kmalloc(sizeof(*root_item), GFP_NOFS); + BUG_ON(!root_item); + + ret = btrfs_copy_root(trans, root, root->commit_root, + &eb, BTRFS_TREE_RELOC_OBJECTID); + BUG_ON(ret); + + root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; + root_key.offset = root->root_key.objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + + memcpy(root_item, &root->root_item, sizeof(root_item)); + btrfs_set_root_refs(root_item, 0); + btrfs_set_root_bytenr(root_item, eb->start); + btrfs_set_root_level(root_item, btrfs_header_level(eb)); + btrfs_set_root_generation(root_item, trans->transid); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + ret = btrfs_insert_root(trans, root->fs_info->tree_root, + &root_key, root_item); + BUG_ON(ret); + kfree(root_item); + + reloc_root = btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &root_key); + BUG_ON(!reloc_root); + reloc_root->last_trans = trans->transid; + reloc_root->commit_root = NULL; + reloc_root->ref_tree = &root->fs_info->reloc_ref_tree; + + root->reloc_root = reloc_root; + return 0; +} + +/* + * Core function of space balance. + * + * The idea is using reloc trees to relocate tree blocks in reference + * counted roots. There is one reloc tree for each subvol, and all + * reloc trees share same root key objectid. Reloc trees are snapshots + * of the latest committed roots of subvols (root->commit_root). + * + * To relocate a tree block referenced by a subvol, there are two steps. + * COW the block through subvol's reloc tree, then update block pointer + * in the subvol to point to the new block. Since all reloc trees share + * same root key objectid, doing special handing for tree blocks owned + * by them is easy. 
Once a tree block has been COWed in one reloc tree, + * we can use the resulting new block directly when the same block is + * required to COW again through other reloc trees. By this way, relocated + * tree blocks are shared between reloc trees, so they are also shared + * between subvols. + */ +static noinline int relocate_one_path(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode) +{ + struct btrfs_root *reloc_root; + struct extent_buffer *eb = NULL; + struct btrfs_key *keys; + u64 *nodes; + int level; + int shared_level; + int lowest_level = 0; + int ret; + + if (ref_path->owner_objectid < BTRFS_FIRST_FREE_OBJECTID) + lowest_level = ref_path->owner_objectid; + + if (!root->ref_cows) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, root, first_key, path, 0, 1); + BUG_ON(ret < 0); + path->lowest_level = 0; + btrfs_release_path(root, path); + return 0; + } + + mutex_lock(&root->fs_info->tree_reloc_mutex); + ret = init_reloc_tree(trans, root); + BUG_ON(ret); + reloc_root = root->reloc_root; + + shared_level = ref_path->shared_level; + ref_path->shared_level = BTRFS_MAX_LEVEL - 1; + + keys = ref_path->node_keys; + nodes = ref_path->new_nodes; + memset(&keys[shared_level + 1], 0, + sizeof(*keys) * (BTRFS_MAX_LEVEL - shared_level - 1)); + memset(&nodes[shared_level + 1], 0, + sizeof(*nodes) * (BTRFS_MAX_LEVEL - shared_level - 1)); + + if (nodes[lowest_level] == 0) { + path->lowest_level = lowest_level; + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 1); + BUG_ON(ret); + for (level = lowest_level; level < BTRFS_MAX_LEVEL; level++) { + eb = path->nodes[level]; + if (!eb || eb == reloc_root->node) + break; + nodes[level] = eb->start; + if (level == 0) + btrfs_item_key_to_cpu(eb, &keys[level], 0); + else + btrfs_node_key_to_cpu(eb, &keys[level], 0); + } + if (nodes[0] && + ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + eb = path->nodes[0]; + ret = replace_extents_in_leaf(trans, reloc_root, eb, + group, reloc_inode); + BUG_ON(ret); + } + btrfs_release_path(reloc_root, path); + } else { + ret = btrfs_merge_path(trans, reloc_root, keys, nodes, + lowest_level); + BUG_ON(ret); + } + + /* + * replace tree blocks in the fs tree with tree blocks in + * the reloc tree. 
+ */ + ret = btrfs_merge_path(trans, root, keys, nodes, lowest_level); + BUG_ON(ret < 0); + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + ret = btrfs_search_slot(trans, reloc_root, first_key, path, + 0, 0); + BUG_ON(ret); + extent_buffer_get(path->nodes[0]); + eb = path->nodes[0]; + btrfs_release_path(reloc_root, path); + ret = invalidate_extent_cache(reloc_root, eb, group, root); + BUG_ON(ret); + free_extent_buffer(eb); + } + + mutex_unlock(&root->fs_info->tree_reloc_mutex); + path->lowest_level = 0; + return 0; +} + +static noinline int relocate_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *first_key, + struct btrfs_ref_path *ref_path) +{ + int ret; + + ret = relocate_one_path(trans, root, path, first_key, + ref_path, NULL, NULL); + BUG_ON(ret); + + if (root == root->fs_info->extent_root) + btrfs_extent_post_op(trans, root); + + return 0; +} + +static noinline int del_extent_zero(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key) +{ + int ret; + + ret = btrfs_search_slot(trans, extent_root, extent_key, path, -1, 1); + if (ret) + goto out; + ret = btrfs_del_item(trans, extent_root, path); +out: + btrfs_release_path(extent_root, path); + return ret; +} + +static noinline struct btrfs_root *read_ref_root(struct btrfs_fs_info *fs_info, + struct btrfs_ref_path *ref_path) +{ + struct btrfs_key root_key; + + root_key.objectid = ref_path->root_objectid; + root_key.type = BTRFS_ROOT_ITEM_KEY; + if (is_cowonly_root(ref_path->root_objectid)) + root_key.offset = 0; + else + root_key.offset = (u64)-1; + + return btrfs_read_fs_root_no_name(fs_info, &root_key); +} + +static noinline int relocate_one_extent(struct btrfs_root *extent_root, + struct btrfs_path *path, + struct btrfs_key *extent_key, + struct btrfs_block_group_cache *group, + struct inode *reloc_inode, int pass) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *found_root; + struct btrfs_ref_path *ref_path = NULL; + struct disk_extent *new_extents = NULL; + int nr_extents = 0; + int loops; + int ret; + int level; + struct btrfs_key first_key; + u64 prev_block = 0; + + + trans = btrfs_start_transaction(extent_root, 1); + BUG_ON(!trans); + + if (extent_key->objectid == 0) { + ret = del_extent_zero(trans, extent_root, path, extent_key); + goto out; + } + + ref_path = kmalloc(sizeof(*ref_path), GFP_NOFS); + if (!ref_path) { + ret = -ENOMEM; + goto out; + } + + for (loops = 0; ; loops++) { + if (loops == 0) { + ret = btrfs_first_ref_path(trans, extent_root, ref_path, + extent_key->objectid); + } else { + ret = btrfs_next_ref_path(trans, extent_root, ref_path); + } + if (ret < 0) + goto out; + if (ret > 0) + break; + + if (ref_path->root_objectid == BTRFS_TREE_LOG_OBJECTID || + ref_path->root_objectid == BTRFS_TREE_RELOC_OBJECTID) + continue; + + found_root = read_ref_root(extent_root->fs_info, ref_path); + BUG_ON(!found_root); + /* + * for reference counted tree, only process reference paths + * rooted at the latest committed root. 
+ */ + if (found_root->ref_cows && + ref_path->root_generation != found_root->root_key.offset) + continue; + + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + if (pass == 0) { + /* + * copy data extents to new locations + */ + u64 group_start = group->key.objectid; + ret = relocate_data_extent(reloc_inode, + extent_key, + group_start); + if (ret < 0) + goto out; + break; + } + level = 0; + } else { + level = ref_path->owner_objectid; + } + + if (prev_block != ref_path->nodes[level]) { + struct extent_buffer *eb; + u64 block_start = ref_path->nodes[level]; + u64 block_size = btrfs_level_size(found_root, level); + + eb = read_tree_block(found_root, block_start, + block_size, 0); + btrfs_tree_lock(eb); + BUG_ON(level != btrfs_header_level(eb)); + + if (level == 0) + btrfs_item_key_to_cpu(eb, &first_key, 0); + else + btrfs_node_key_to_cpu(eb, &first_key, 0); + + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + prev_block = block_start; + } + + btrfs_record_root_in_trans(found_root); + if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { + /* + * try to update data extent references while + * keeping metadata shared between snapshots. + */ + if (pass == 1) { + ret = relocate_one_path(trans, found_root, + path, &first_key, ref_path, + group, reloc_inode); + if (ret < 0) + goto out; + continue; + } + /* + * use fallback method to process the remaining + * references. + */ + if (!new_extents) { + u64 group_start = group->key.objectid; + new_extents = kmalloc(sizeof(*new_extents), + GFP_NOFS); + nr_extents = 1; + ret = get_new_locations(reloc_inode, + extent_key, + group_start, 1, + &new_extents, + &nr_extents); + if (ret) + goto out; + } + ret = replace_one_extent(trans, found_root, + path, extent_key, + &first_key, ref_path, + new_extents, nr_extents); + } else { + ret = relocate_tree_block(trans, found_root, path, + &first_key, ref_path); + } + if (ret < 0) + goto out; + } + ret = 0; +out: + btrfs_end_transaction(trans, extent_root); + kfree(new_extents); + kfree(ref_path); + return ret; +} + +static u64 update_block_group_flags(struct btrfs_root *root, u64 flags) +{ + u64 num_devices; + u64 stripped = BTRFS_BLOCK_GROUP_RAID0 | + BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_RAID10; + + num_devices = root->fs_info->fs_devices->rw_devices; + if (num_devices == 1) { + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* turn raid0 into single device chunks */ + if (flags & BTRFS_BLOCK_GROUP_RAID0) + return stripped; + + /* turn mirroring into duplication */ + if (flags & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10)) + return stripped | BTRFS_BLOCK_GROUP_DUP; + return flags; + } else { + /* they already had raid on here, just return */ + if (flags & stripped) + return flags; + + stripped |= BTRFS_BLOCK_GROUP_DUP; + stripped = flags & ~stripped; + + /* switch duplicated blocks with raid1 */ + if (flags & BTRFS_BLOCK_GROUP_DUP) + return stripped | BTRFS_BLOCK_GROUP_RAID1; + + /* turn single device chunks into raid0 */ + return stripped | BTRFS_BLOCK_GROUP_RAID0; + } + return flags; +} + +static int __alloc_chunk_for_shrink(struct btrfs_root *root, + struct btrfs_block_group_cache *shrink_block_group, + int force) +{ + struct btrfs_trans_handle *trans; + u64 new_alloc_flags; + u64 calc; + + spin_lock(&shrink_block_group->lock); + if (btrfs_block_group_used(&shrink_block_group->item) > 0) { + spin_unlock(&shrink_block_group->lock); + + trans = btrfs_start_transaction(root, 1); + spin_lock(&shrink_block_group->lock); + + new_alloc_flags = 
update_block_group_flags(root, + shrink_block_group->flags); + if (new_alloc_flags != shrink_block_group->flags) { + calc = + btrfs_block_group_used(&shrink_block_group->item); + } else { + calc = shrink_block_group->key.offset; + } + spin_unlock(&shrink_block_group->lock); + + do_chunk_alloc(trans, root->fs_info->extent_root, + calc + 2 * 1024 * 1024, new_alloc_flags, force); + + btrfs_end_transaction(trans, root); + } else + spin_unlock(&shrink_block_group->lock); + return 0; +} + +static int __insert_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 size) +{ + struct btrfs_path *path; + struct btrfs_inode_item *item; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_inode(trans, root, path, objectid); + if (ret) + goto out; + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); + memset_extent_buffer(leaf, 0, (unsigned long)item, sizeof(*item)); + btrfs_set_inode_generation(leaf, item, 1); + btrfs_set_inode_size(leaf, item, size); + btrfs_set_inode_mode(leaf, item, S_IFREG | 0600); + btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS); + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); +out: + btrfs_free_path(path); + return ret; +} + +static noinline struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, + struct btrfs_block_group_cache *group) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + struct btrfs_key root_key; + u64 objectid = BTRFS_FIRST_FREE_OBJECTID; + int err = 0; + + root_key.objectid = BTRFS_DATA_RELOC_TREE_OBJECTID; + root_key.type = BTRFS_ROOT_ITEM_KEY; + root_key.offset = (u64)-1; + root = btrfs_read_fs_root_no_name(fs_info, &root_key); + if (IS_ERR(root)) + return ERR_CAST(root); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + err = btrfs_find_free_objectid(trans, root, objectid, &objectid); + if (err) + goto out; + + err = __insert_orphan_inode(trans, root, objectid, group->key.offset); + BUG_ON(err); + + err = btrfs_insert_file_extent(trans, root, objectid, 0, 0, 0, + group->key.offset, 0, group->key.offset, + 0, 0, 0); + BUG_ON(err); + + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + BUG_ON(is_bad_inode(inode)); + } else { + BUG_ON(1); + } + BTRFS_I(inode)->index_cnt = group->key.objectid; + + err = btrfs_orphan_add(trans, inode); +out: + btrfs_end_transaction(trans, root); + if (err) { + if (inode) + iput(inode); + inode = ERR_PTR(err); + } + return inode; +} + +int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len) +{ + + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct list_head list; + size_t offset; + int ret; + u64 disk_bytenr; + + INIT_LIST_HEAD(&list); + + ordered = btrfs_lookup_ordered_extent(inode, file_pos); + BUG_ON(ordered->file_offset != file_pos || ordered->len != len); + + disk_bytenr = file_pos + BTRFS_I(inode)->index_cnt; + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, disk_bytenr, + disk_bytenr + len - 1, &list); + + while (!list_empty(&list)) { + sums = 
list_entry(list.next, struct btrfs_ordered_sum, list); + list_del_init(&sums->list); + + sector_sum = sums->sums; + sums->bytenr = ordered->start; + + offset = 0; + while (offset < sums->len) { + sector_sum->bytenr += ordered->start - disk_bytenr; + sector_sum++; + offset += root->sectorsize; + } + + btrfs_add_ordered_sum(inode, ordered, sums); + } + btrfs_put_ordered_extent(ordered); + return 0; +} + +int btrfs_relocate_block_group(struct btrfs_root *root, u64 group_start) +{ + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct btrfs_fs_info *info = root->fs_info; + struct extent_buffer *leaf; + struct inode *reloc_inode; + struct btrfs_block_group_cache *block_group; + struct btrfs_key key; + u64 skipped; + u64 cur_byte; + u64 total_found; + u32 nritems; + int ret; + int progress; + int pass = 0; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(info, group_start); + BUG_ON(!block_group); + + printk(KERN_INFO "btrfs relocating block group %llu flags %llu\n", + (unsigned long long)block_group->key.objectid, + (unsigned long long)block_group->flags); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + reloc_inode = create_reloc_inode(info, block_group); + BUG_ON(IS_ERR(reloc_inode)); + + __alloc_chunk_for_shrink(root, block_group, 1); + set_block_group_readonly(block_group); + + btrfs_start_delalloc_inodes(info->tree_root); + btrfs_wait_ordered_extents(info->tree_root, 0); +again: + skipped = 0; + total_found = 0; + progress = 0; + key.objectid = block_group->key.objectid; + key.offset = 0; + key.type = 0; + cur_byte = key.objectid; + + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + + mutex_lock(&root->fs_info->cleaner_mutex); + btrfs_clean_old_snapshots(info->tree_root); + btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1); + mutex_unlock(&root->fs_info->cleaner_mutex); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; +next: + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret == 1) { + ret = 0; + break; + } + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid >= block_group->key.objectid + + block_group->key.offset) + break; + + if (progress && need_resched()) { + btrfs_release_path(root, path); + cond_resched(); + progress = 0; + continue; + } + progress = 1; + + if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY || + key.objectid + key.offset <= cur_byte) { + path->slots[0]++; + goto next; + } + + total_found++; + cur_byte = key.objectid + key.offset; + btrfs_release_path(root, path); + + __alloc_chunk_for_shrink(root, block_group, 0); + ret = relocate_one_extent(root, path, &key, block_group, + reloc_inode, pass); + BUG_ON(ret < 0); + if (ret > 0) + skipped++; + + key.objectid = cur_byte; + key.type = 0; + key.offset = 0; + } + + btrfs_release_path(root, path); + + if (pass == 0) { + btrfs_wait_ordered_range(reloc_inode, 0, (u64)-1); + invalidate_mapping_pages(reloc_inode->i_mapping, 0, -1); + } + + if (total_found > 0) { + printk(KERN_INFO "btrfs found %llu extents in pass %d\n", + (unsigned long long)total_found, pass); + pass++; + if (total_found == skipped && pass > 2) { + iput(reloc_inode); + reloc_inode = create_reloc_inode(info, block_group); + pass = 0; + } + goto again; + } + + /* delete reloc_inode */ + 
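/* the relocation inode was added to the orphan list by + * create_reloc_inode(), so dropping our reference here lets it be + * cleaned up now that every relocation pass has completed */ +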
iput(reloc_inode); + + /* unpin extents in this range */ + trans = btrfs_start_transaction(info->tree_root, 1); + btrfs_commit_transaction(trans, info->tree_root); + + spin_lock(&block_group->lock); + WARN_ON(block_group->pinned > 0); + WARN_ON(block_group->reserved > 0); + WARN_ON(btrfs_block_group_used(&block_group->item) > 0); + spin_unlock(&block_group->lock); + put_block_group(block_group); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int find_first_block_group(struct btrfs_root *root, + struct btrfs_path *path, struct btrfs_key *key) +{ + int ret = 0; + struct btrfs_key found_key; + struct extent_buffer *leaf; + int slot; + + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret < 0) + goto out; + + while (1) { + slot = path->slots[0]; + leaf = path->nodes[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto out; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid >= key->objectid && + found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) { + ret = 0; + goto out; + } + path->slots[0]++; + } + ret = -ENOENT; +out: + return ret; +} + +int btrfs_free_block_groups(struct btrfs_fs_info *info) +{ + struct btrfs_block_group_cache *block_group; + struct rb_node *n; + + spin_lock(&info->block_group_cache_lock); + while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { + block_group = rb_entry(n, struct btrfs_block_group_cache, + cache_node); + rb_erase(&block_group->cache_node, + &info->block_group_cache_tree); + spin_unlock(&info->block_group_cache_lock); + + btrfs_remove_free_space_cache(block_group); + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + WARN_ON(atomic_read(&block_group->count) != 1); + kfree(block_group); + + spin_lock(&info->block_group_cache_lock); + } + spin_unlock(&info->block_group_cache_lock); + return 0; +} + +int btrfs_read_block_groups(struct btrfs_root *root) +{ + struct btrfs_path *path; + int ret; + struct btrfs_block_group_cache *cache; + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *space_info; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf; + + root = info->extent_root; + key.objectid = 0; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + while (1) { + ret = find_first_block_group(root, path, &key); + if (ret > 0) { + ret = 0; + goto error; + } + if (ret != 0) + goto error; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) { + ret = -ENOMEM; + break; + } + + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + mutex_init(&cache->alloc_mutex); + mutex_init(&cache->cache_mutex); + INIT_LIST_HEAD(&cache->list); + read_extent_buffer(leaf, &cache->item, + btrfs_item_ptr_offset(leaf, path->slots[0]), + sizeof(cache->item)); + memcpy(&cache->key, &found_key, sizeof(found_key)); + + key.objectid = found_key.objectid + found_key.offset; + btrfs_release_path(root, path); + cache->flags = btrfs_block_group_flags(&cache->item); + + ret = update_space_info(info, cache->flags, found_key.offset, + btrfs_block_group_used(&cache->item), + &space_info); + BUG_ON(ret); + cache->space_info = space_info; + down_write(&space_info->groups_sem); + list_add_tail(&cache->list, 
&space_info->block_groups); + up_write(&space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + set_avail_alloc_bits(root->fs_info, cache->flags); + if (btrfs_chunk_readonly(root, cache->key.objectid)) + set_block_group_readonly(cache); + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_make_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytes_used, + u64 type, u64 chunk_objectid, u64 chunk_offset, + u64 size) +{ + int ret; + struct btrfs_root *extent_root; + struct btrfs_block_group_cache *cache; + + extent_root = root->fs_info->extent_root; + + root->fs_info->last_trans_new_blockgroup = trans->transid; + + cache = kzalloc(sizeof(*cache), GFP_NOFS); + if (!cache) + return -ENOMEM; + + cache->key.objectid = chunk_offset; + cache->key.offset = size; + cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + atomic_set(&cache->count, 1); + spin_lock_init(&cache->lock); + mutex_init(&cache->alloc_mutex); + mutex_init(&cache->cache_mutex); + INIT_LIST_HEAD(&cache->list); + + btrfs_set_block_group_used(&cache->item, bytes_used); + btrfs_set_block_group_chunk_objectid(&cache->item, chunk_objectid); + cache->flags = type; + btrfs_set_block_group_flags(&cache->item, type); + + ret = update_space_info(root->fs_info, cache->flags, size, bytes_used, + &cache->space_info); + BUG_ON(ret); + down_write(&cache->space_info->groups_sem); + list_add_tail(&cache->list, &cache->space_info->block_groups); + up_write(&cache->space_info->groups_sem); + + ret = btrfs_add_block_group_cache(root->fs_info, cache); + BUG_ON(ret); + + ret = btrfs_insert_item(trans, extent_root, &cache->key, &cache->item, + sizeof(cache->item)); + BUG_ON(ret); + + finish_current_insert(trans, extent_root, 0); + ret = del_pending_extents(trans, extent_root, 0); + BUG_ON(ret); + set_avail_alloc_bits(extent_root->fs_info, type); + + return 0; +} + +int btrfs_remove_block_group(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 group_start) +{ + struct btrfs_path *path; + struct btrfs_block_group_cache *block_group; + struct btrfs_key key; + int ret; + + root = root->fs_info->extent_root; + + block_group = btrfs_lookup_block_group(root->fs_info, group_start); + BUG_ON(!block_group); + BUG_ON(!block_group->ro); + + memcpy(&key, &block_group->key, sizeof(key)); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + btrfs_remove_free_space_cache(block_group); + rb_erase(&block_group->cache_node, + &root->fs_info->block_group_cache_tree); + down_write(&block_group->space_info->groups_sem); + list_del(&block_group->list); + up_write(&block_group->space_info->groups_sem); + + spin_lock(&block_group->space_info->lock); + block_group->space_info->total_bytes -= block_group->key.offset; + block_group->space_info->bytes_readonly -= block_group->key.offset; + spin_unlock(&block_group->space_info->lock); + block_group->space_info->full = 0; + + put_block_group(block_group); + put_block_group(block_group); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) + ret = -EIO; + if (ret < 0) + goto out; + + ret = btrfs_del_item(trans, root, path); +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c new file mode 100644 index 00000000000..e086d407f1f --- /dev/null +++ b/fs/btrfs/extent_io.c @@ -0,0 +1,3717 @@ +#include <linux/bitops.h> +#include <linux/slab.h> +#include <linux/bio.h> +#include <linux/mm.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include 
<linux/page-flags.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/blkdev.h> +#include <linux/swap.h> +#include <linux/version.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "extent_io.h" +#include "extent_map.h" +#include "compat.h" +#include "ctree.h" +#include "btrfs_inode.h" + +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + +static struct kmem_cache *extent_state_cache; +static struct kmem_cache *extent_buffer_cache; + +static LIST_HEAD(buffers); +static LIST_HEAD(states); + +#define LEAK_DEBUG 0 +#ifdef LEAK_DEBUG +static DEFINE_SPINLOCK(leak_lock); +#endif + +#define BUFFER_LRU_MAX 64 + +struct tree_entry { + u64 start; + u64 end; + struct rb_node rb_node; +}; + +struct extent_page_data { + struct bio *bio; + struct extent_io_tree *tree; + get_extent_t *get_extent; + + /* tells writepage not to lock the state bits for this range + * it still does the unlocking + */ + int extent_locked; +}; + +int __init extent_io_init(void) +{ + extent_state_cache = btrfs_cache_create("extent_state", + sizeof(struct extent_state), 0, + NULL); + if (!extent_state_cache) + return -ENOMEM; + + extent_buffer_cache = btrfs_cache_create("extent_buffers", + sizeof(struct extent_buffer), 0, + NULL); + if (!extent_buffer_cache) + goto free_state_cache; + return 0; + +free_state_cache: + kmem_cache_destroy(extent_state_cache); + return -ENOMEM; +} + +void extent_io_exit(void) +{ + struct extent_state *state; + struct extent_buffer *eb; + + while (!list_empty(&states)) { + state = list_entry(states.next, struct extent_state, leak_list); + printk(KERN_ERR "btrfs state leak: start %llu end %llu " + "state %lu in tree %p refs %d\n", + (unsigned long long)state->start, + (unsigned long long)state->end, + state->state, state->tree, atomic_read(&state->refs)); + list_del(&state->leak_list); + kmem_cache_free(extent_state_cache, state); + + } + + while (!list_empty(&buffers)) { + eb = list_entry(buffers.next, struct extent_buffer, leak_list); + printk(KERN_ERR "btrfs buffer leak start %llu len %lu " + "refs %d\n", (unsigned long long)eb->start, + eb->len, atomic_read(&eb->refs)); + list_del(&eb->leak_list); + kmem_cache_free(extent_buffer_cache, eb); + } + if (extent_state_cache) + kmem_cache_destroy(extent_state_cache); + if (extent_buffer_cache) + kmem_cache_destroy(extent_buffer_cache); +} + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask) +{ + tree->state.rb_node = NULL; + tree->buffer.rb_node = NULL; + tree->ops = NULL; + tree->dirty_bytes = 0; + spin_lock_init(&tree->lock); + spin_lock_init(&tree->buffer_lock); + tree->mapping = mapping; +} + +static struct extent_state *alloc_extent_state(gfp_t mask) +{ + struct extent_state *state; +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + + state = kmem_cache_alloc(extent_state_cache, mask); + if (!state) + return state; + state->state = 0; + state->private = 0; + state->tree = NULL; +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&state->leak_list, &states); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&state->refs, 1); + init_waitqueue_head(&state->wq); + return state; +} + +static void free_extent_state(struct extent_state *state) +{ + if (!state) + return; + if (atomic_dec_and_test(&state->refs)) { +#ifdef LEAK_DEBUG + unsigned long flags; +#endif 
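+ /* a state must already be detached from its io tree by the time the last reference is dropped */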
+ WARN_ON(state->tree); +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_del(&state->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_state_cache, state); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct tree_entry *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct tree_entry, rb_node); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset > entry->end) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct tree_entry, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *__etree_search(struct extent_io_tree *tree, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_root *root = &tree->state; + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct tree_entry *entry; + struct tree_entry *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct tree_entry, rb_node); + prev = n; + prev_entry = entry; + + if (offset < entry->start) + n = n->rb_left; + else if (offset > entry->end) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset > prev_entry->end) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct tree_entry, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +static inline struct rb_node *tree_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_node *prev = NULL; + struct rb_node *ret; + + ret = __etree_search(tree, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +static struct extent_buffer *buffer_tree_insert(struct extent_io_tree *tree, + u64 offset, struct rb_node *node) +{ + struct rb_root *root = &tree->buffer; + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_buffer *eb; + + while (*p) { + parent = *p; + eb = rb_entry(parent, struct extent_buffer, rb_node); + + if (offset < eb->start) + p = &(*p)->rb_left; + else if (offset > eb->start) + p = &(*p)->rb_right; + else + return eb; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct extent_buffer *buffer_search(struct extent_io_tree *tree, + u64 offset) +{ + struct rb_root *root = &tree->buffer; + struct rb_node *n = root->rb_node; + struct extent_buffer *eb; + + while (n) { + eb = rb_entry(n, struct extent_buffer, rb_node); + if (offset < eb->start) + n = n->rb_left; + else if (offset > eb->start) + n = n->rb_right; + else + return eb; + } + return NULL; +} + +/* + * utility function to look for merge candidates inside a given range. + * Any extents with matching state are merged together into a single + * extent in the tree. Extents with EXTENT_IO in their state field + * are not merged because the end_io handlers need to be able to do + * operations on them without sleeping (or doing allocations/splits). + * + * This should be called with the tree lock held. 
+ */ +static int merge_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *other; + struct rb_node *other_node; + + if (state->state & (EXTENT_IOBITS | EXTENT_BOUNDARY)) + return 0; + + other_node = rb_prev(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->end == state->start - 1 && + other->state == state->state) { + state->start = other->start; + other->tree = NULL; + rb_erase(&other->rb_node, &tree->state); + free_extent_state(other); + } + } + other_node = rb_next(&state->rb_node); + if (other_node) { + other = rb_entry(other_node, struct extent_state, rb_node); + if (other->start == state->end + 1 && + other->state == state->state) { + other->start = state->start; + state->tree = NULL; + rb_erase(&state->rb_node, &tree->state); + free_extent_state(state); + } + } + return 0; +} + +static void set_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->set_bit_hook) { + tree->ops->set_bit_hook(tree->mapping->host, state->start, + state->end, state->state, bits); + } +} + +static void clear_state_cb(struct extent_io_tree *tree, + struct extent_state *state, + unsigned long bits) +{ + if (tree->ops && tree->ops->clear_bit_hook) { + tree->ops->clear_bit_hook(tree->mapping->host, state->start, + state->end, state->state, bits); + } +} + +/* + * insert an extent_state struct into the tree. 'bits' are set on the + * struct before it is inserted. + * + * This may return -EEXIST if the extent is already there, in which case the + * state struct is freed. + * + * The tree lock is not taken internally. This is a utility function and + * probably isn't what you want to call (see set/clear_extent_bit). + */ +static int insert_state(struct extent_io_tree *tree, + struct extent_state *state, u64 start, u64 end, + int bits) +{ + struct rb_node *node; + + if (end < start) { + printk(KERN_ERR "btrfs end < start %llu %llu\n", + (unsigned long long)end, + (unsigned long long)start); + WARN_ON(1); + } + if (bits & EXTENT_DIRTY) + tree->dirty_bytes += end - start + 1; + set_state_cb(tree, state, bits); + state->state |= bits; + state->start = start; + state->end = end; + node = tree_insert(&tree->state, end, &state->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + printk(KERN_ERR "btrfs found node %llu %llu on insert of " + "%llu %llu\n", (unsigned long long)found->start, + (unsigned long long)found->end, + (unsigned long long)start, (unsigned long long)end); + free_extent_state(state); + return -EEXIST; + } + state->tree = tree; + merge_state(tree, state); + return 0; +} + +/* + * split a given extent state struct in two, inserting the preallocated + * struct 'prealloc' as the newly created second half. 'split' indicates an + * offset inside 'orig' where it should be split. + * + * Before calling, + * the tree has 'orig' at [orig->start, orig->end]. After calling, there + * are two extent state structs in the tree: + * prealloc: [orig->start, split - 1] + * orig: [ split, orig->end ] + * + * The tree locks are not taken by this function. They need to be held + * by the caller. 
+ */ +static int split_state(struct extent_io_tree *tree, struct extent_state *orig, + struct extent_state *prealloc, u64 split) +{ + struct rb_node *node; + prealloc->start = orig->start; + prealloc->end = split - 1; + prealloc->state = orig->state; + orig->start = split; + + node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); + if (node) { + struct extent_state *found; + found = rb_entry(node, struct extent_state, rb_node); + free_extent_state(prealloc); + return -EEXIST; + } + prealloc->tree = tree; + return 0; +} + +/* + * utility function to clear some bits in an extent state struct. + * it will optionally wake up any one waiting on this state (wake == 1), or + * forcibly remove the state from the tree (delete == 1). + * + * If no bits are set on the state struct after clearing things, the + * struct is freed and removed from the tree + */ +static int clear_state_bit(struct extent_io_tree *tree, + struct extent_state *state, int bits, int wake, + int delete) +{ + int ret = state->state & bits; + + if ((bits & EXTENT_DIRTY) && (state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + WARN_ON(range > tree->dirty_bytes); + tree->dirty_bytes -= range; + } + clear_state_cb(tree, state, bits); + state->state &= ~bits; + if (wake) + wake_up(&state->wq); + if (delete || state->state == 0) { + if (state->tree) { + clear_state_cb(tree, state, state->state); + rb_erase(&state->rb_node, &tree->state); + state->tree = NULL; + free_extent_state(state); + } else { + WARN_ON(1); + } + } else { + merge_state(tree, state); + } + return ret; +} + +/* + * clear some bits on a range in the tree. This may require splitting + * or inserting elements in the tree, so the gfp mask is used to + * indicate which allocations or sleeping are allowed. + * + * pass 'wake' == 1 to kick any sleepers, and 'delete' == 1 to remove + * the given range from the tree regardless of state (ie for truncate). + * + * the range [start, end] is inclusive. + * + * This takes the tree lock, and returns < 0 on error, > 0 if any of the + * bits were already set, or zero if none of the bits were already set. + */ +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err; + int set = 0; + +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + goto out; + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > end) + goto out; + WARN_ON(state->end < start); + + /* + * | ---- desired range ---- | + * | state | or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip + * bits on second half. + * + * If the extent we found extends past our range, we + * just split and search again. It'll get split again + * the next time though. + * + * If the extent we found is inside our range, we clear + * the desired bit on it. 
+ */ + + if (state->start < start) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, + wake, delete); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and clear the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + if (!prealloc) + prealloc = alloc_extent_state(GFP_ATOMIC); + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + if (wake) + wake_up(&state->wq); + set |= clear_state_bit(tree, prealloc, bits, + wake, delete); + prealloc = NULL; + goto out; + } + + start = state->end + 1; + set |= clear_state_bit(tree, state, bits, wake, delete); + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return set; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +static int wait_on_state(struct extent_io_tree *tree, + struct extent_state *state) + __releases(tree->lock) + __acquires(tree->lock) +{ + DEFINE_WAIT(wait); + prepare_to_wait(&state->wq, &wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&tree->lock); + schedule(); + spin_lock(&tree->lock); + finish_wait(&state->wq, &wait); + return 0; +} + +/* + * waits for one or more bits to clear on a range in the state tree. + * The range [start, end] is inclusive. + * The tree lock is taken by this function + */ +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits) +{ + struct extent_state *state; + struct rb_node *node; + + spin_lock(&tree->lock); +again: + while (1) { + /* + * this search will find all the extents that end after + * our range starts + */ + node = tree_search(tree, start); + if (!node) + break; + + state = rb_entry(node, struct extent_state, rb_node); + + if (state->start > end) + goto out; + + if (state->state & bits) { + start = state->start; + atomic_inc(&state->refs); + wait_on_state(tree, state); + free_extent_state(state); + goto again; + } + start = state->end + 1; + + if (start > end) + break; + + if (need_resched()) { + spin_unlock(&tree->lock); + cond_resched(); + spin_lock(&tree->lock); + } + } +out: + spin_unlock(&tree->lock); + return 0; +} + +static void set_state_bits(struct extent_io_tree *tree, + struct extent_state *state, + int bits) +{ + if ((bits & EXTENT_DIRTY) && !(state->state & EXTENT_DIRTY)) { + u64 range = state->end - state->start + 1; + tree->dirty_bytes += range; + } + set_state_cb(tree, state, bits); + state->state |= bits; +} + +/* + * set some bits on a range in the tree. This may require allocations + * or sleeping, so the gfp mask is used to indicate what is allowed. + * + * If 'exclusive' == 1, this will fail with -EEXIST if some part of the + * range already has the desired bits set. The start of the existing + * range is returned in failed_start in this case. + * + * [start, end] is inclusive + * This takes the tree lock. 
+ */ +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int exclusive, u64 *failed_start, + gfp_t mask) +{ + struct extent_state *state; + struct extent_state *prealloc = NULL; + struct rb_node *node; + int err = 0; + int set; + u64 last_start; + u64 last_end; +again: + if (!prealloc && (mask & __GFP_WAIT)) { + prealloc = alloc_extent_state(mask); + if (!prealloc) + return -ENOMEM; + } + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + err = insert_state(tree, prealloc, start, end, bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + goto out; + } + + state = rb_entry(node, struct extent_state, rb_node); + last_start = state->start; + last_end = state->end; + + /* + * | ---- desired range ---- | + * | state | + * + * Just lock what we found and keep going + */ + if (state->start == start && state->end <= end) { + set = state->state & bits; + if (set && exclusive) { + *failed_start = state->start; + err = -EEXIST; + goto out; + } + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + goto search_again; + } + + /* + * | ---- desired range ---- | + * | state | + * or + * | ------------- state -------------- | + * + * We need to split the extent we found, and may flip bits on + * second half. + * + * If the extent we found extends past our + * range, we just split and search again. It'll get split + * again the next time though. + * + * If the extent we found is inside our range, we set the + * desired bit on it. + */ + if (state->start < start) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, start); + BUG_ON(err == -EEXIST); + prealloc = NULL; + if (err) + goto out; + if (state->end <= end) { + set_state_bits(tree, state, bits); + start = state->end + 1; + merge_state(tree, state); + } else { + start = state->start; + } + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | or | state | + * + * There's a hole, we need to insert something in it and + * ignore the extent we found. 
+ */ + if (state->start > start) { + u64 this_end; + if (end < last_start) + this_end = end; + else + this_end = last_start - 1; + err = insert_state(tree, prealloc, start, this_end, + bits); + prealloc = NULL; + BUG_ON(err == -EEXIST); + if (err) + goto out; + start = this_end + 1; + goto search_again; + } + /* + * | ---- desired range ---- | + * | state | + * We need to split the extent, and set the bit + * on the first half + */ + if (state->start <= end && state->end > end) { + set = state->state & bits; + if (exclusive && set) { + *failed_start = start; + err = -EEXIST; + goto out; + } + err = split_state(tree, state, prealloc, end + 1); + BUG_ON(err == -EEXIST); + + set_state_bits(tree, prealloc, bits); + merge_state(tree, prealloc); + prealloc = NULL; + goto out; + } + + goto search_again; + +out: + spin_unlock(&tree->lock); + if (prealloc) + free_extent_state(prealloc); + + return err; + +search_again: + if (start > end) + goto out; + spin_unlock(&tree->lock); + if (mask & __GFP_WAIT) + cond_resched(); + goto again; +} + +/* wrappers around set/clear extent bit */ +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_DIRTY, 0, NULL, + mask); +} + +int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask); +} + +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return set_extent_bit(tree, start, end, bits, 0, NULL, + mask); +} + +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, bits, 0, 0, mask); +} + +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, + EXTENT_DELALLOC | EXTENT_DIRTY, + 0, NULL, mask); +} + +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC, 0, 0, mask); +} + +int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask); +} + +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_NEW, 0, NULL, + mask); +} + +static int clear_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_NEW, 0, 0, mask); +} + +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, NULL, + mask); +} + +static int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_UPTODATE, 0, 0, mask); +} + +static int set_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return set_extent_bit(tree, start, end, EXTENT_WRITEBACK, + 0, NULL, mask); +} + +static int clear_extent_writeback(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_WRITEBACK, 1, 0, mask); +} + +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + return wait_extent_bit(tree, start, end, EXTENT_WRITEBACK); +} + +/* + * either insert or lock state struct between 
start and end use mask to tell + * us if waiting is desired. + */ +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask) +{ + int err; + u64 failed_start; + while (1) { + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST && (mask & __GFP_WAIT)) { + wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED); + start = failed_start; + } else { + break; + } + WARN_ON(start > end); + } + return err; +} + +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + int err; + u64 failed_start; + + err = set_extent_bit(tree, start, end, EXTENT_LOCKED, 1, + &failed_start, mask); + if (err == -EEXIST) { + if (failed_start > start) + clear_extent_bit(tree, start, failed_start - 1, + EXTENT_LOCKED, 1, 0, mask); + return 0; + } + return 1; +} + +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask) +{ + return clear_extent_bit(tree, start, end, EXTENT_LOCKED, 1, 0, mask); +} + +/* + * helper function to set pages and extents in the tree dirty + */ +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + __set_page_dirty_nobuffers(page); + page_cache_release(page); + index++; + } + set_extent_dirty(tree, start, end, GFP_NOFS); + return 0; +} + +/* + * helper function to set both pages and extents in the tree writeback + */ +static int set_range_writeback(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + BUG_ON(!page); + set_page_writeback(page); + page_cache_release(page); + index++; + } + set_extent_writeback(tree, start, end, GFP_NOFS); + return 0; +} + +/* + * find the first offset in the io tree with 'bits' set. zero is + * returned if we find something, and *start_ret and *end_ret are + * set to reflect the state struct that was found. + * + * If nothing was found, 1 is returned, < 0 on error + */ +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 1; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) { + *start_ret = state->start; + *end_ret = state->end; + ret = 0; + break; + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return ret; +} + +/* find the first state struct with 'bits' set after 'start', and + * return it. tree->lock must be held. NULL will returned if + * nothing was found after 'start' + */ +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits) +{ + struct rb_node *node; + struct extent_state *state; + + /* + * this search will find all the extents that end after + * our range starts. 
+ */ + node = tree_search(tree, start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->end >= start && (state->state & bits)) + return state; + + node = rb_next(node); + if (!node) + break; + } +out: + return NULL; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_delalloc_range(struct extent_io_tree *tree, + u64 *start, u64 *end, u64 max_bytes) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 found = 0; + u64 total_bytes = 0; + + spin_lock(&tree->lock); + + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) { + if (!found) + *end = (u64)-1; + goto out; + } + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (found && (state->start != cur_start || + (state->state & EXTENT_BOUNDARY))) { + goto out; + } + if (!(state->state & EXTENT_DELALLOC)) { + if (!found) + *end = state->end; + goto out; + } + if (!found) + *start = state->start; + found++; + *end = state->end; + cur_start = state->end + 1; + node = rb_next(node); + if (!node) + break; + total_bytes += state->end - state->start + 1; + if (total_bytes >= max_bytes) + break; + } +out: + spin_unlock(&tree->lock); + return found; +} + +static noinline int __unlock_for_delalloc(struct inode *inode, + struct page *locked_page, + u64 start, u64 end) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + + if (index == locked_page->index && end_index == index) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, nr_pages, + ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] != locked_page) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +static noinline int lock_delalloc_pages(struct inode *inode, + struct page *locked_page, + u64 delalloc_start, + u64 delalloc_end) +{ + unsigned long index = delalloc_start >> PAGE_CACHE_SHIFT; + unsigned long start_index = index; + unsigned long end_index = delalloc_end >> PAGE_CACHE_SHIFT; + unsigned long pages_locked = 0; + struct page *pages[16]; + unsigned long nrpages; + int ret; + int i; + + /* the caller is responsible for locking the start index */ + if (index == locked_page->index && index == end_index) + return 0; + + /* skip the page at the start index */ + nrpages = end_index - index + 1; + while (nrpages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nrpages, ARRAY_SIZE(pages)), pages); + if (ret == 0) { + ret = -EAGAIN; + goto done; + } + /* now we have an array of pages, lock them all */ + for (i = 0; i < ret; i++) { + /* + * the caller is taking responsibility for + * locked_page + */ + if (pages[i] != locked_page) { + lock_page(pages[i]); + if (!PageDirty(pages[i]) || + pages[i]->mapping != inode->i_mapping) { + ret = -EAGAIN; + unlock_page(pages[i]); + page_cache_release(pages[i]); + goto done; + } + } + page_cache_release(pages[i]); + pages_locked++; + } + nrpages -= ret; + index += ret; + cond_resched(); + } + ret = 0; 
+done: + if (ret && pages_locked) { + __unlock_for_delalloc(inode, locked_page, + delalloc_start, + ((u64)(start_index + pages_locked - 1)) << + PAGE_CACHE_SHIFT); + } + return ret; +} + +/* + * find a contiguous range of bytes in the file marked as delalloc, not + * more than 'max_bytes'. start and end are used to return the range, + * + * 1 is returned if we find something, 0 if nothing was in the tree + */ +static noinline u64 find_lock_delalloc_range(struct inode *inode, + struct extent_io_tree *tree, + struct page *locked_page, + u64 *start, u64 *end, + u64 max_bytes) +{ + u64 delalloc_start; + u64 delalloc_end; + u64 found; + int ret; + int loops = 0; + +again: + /* step one, find a bunch of delalloc bytes starting at start */ + delalloc_start = *start; + delalloc_end = 0; + found = find_delalloc_range(tree, &delalloc_start, &delalloc_end, + max_bytes); + if (!found || delalloc_end <= *start) { + *start = delalloc_start; + *end = delalloc_end; + return found; + } + + /* + * start comes from the offset of locked_page. We have to lock + * pages in order, so we can't process delalloc bytes before + * locked_page + */ + if (delalloc_start < *start) + delalloc_start = *start; + + /* + * make sure to limit the number of pages we try to lock down + * if we're looping. + */ + if (delalloc_end + 1 - delalloc_start > max_bytes && loops) + delalloc_end = delalloc_start + PAGE_CACHE_SIZE - 1; + + /* step two, lock all the pages after the page that has start */ + ret = lock_delalloc_pages(inode, locked_page, + delalloc_start, delalloc_end); + if (ret == -EAGAIN) { + /* some of the pages are gone, lets avoid looping by + * shortening the size of the delalloc range we're searching + */ + if (!loops) { + unsigned long offset = (*start) & (PAGE_CACHE_SIZE - 1); + max_bytes = PAGE_CACHE_SIZE - offset; + loops = 1; + goto again; + } else { + found = 0; + goto out_failed; + } + } + BUG_ON(ret); + + /* step three, lock the state bits for the whole range */ + lock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + + /* then test to make sure it is all still delalloc */ + ret = test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, 1); + if (!ret) { + unlock_extent(tree, delalloc_start, delalloc_end, GFP_NOFS); + __unlock_for_delalloc(inode, locked_page, + delalloc_start, delalloc_end); + cond_resched(); + goto again; + } + *start = delalloc_start; + *end = delalloc_end; +out_failed: + return found; +} + +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, + int unlock_pages, + int clear_unlock, + int clear_delalloc, int clear_dirty, + int set_writeback, + int end_writeback) +{ + int ret; + struct page *pages[16]; + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + unsigned long nr_pages = end_index - index + 1; + int i; + int clear_bits = 0; + + if (clear_unlock) + clear_bits |= EXTENT_LOCKED; + if (clear_dirty) + clear_bits |= EXTENT_DIRTY; + + if (clear_delalloc) + clear_bits |= EXTENT_DELALLOC; + + clear_extent_bit(tree, start, end, clear_bits, 1, 0, GFP_NOFS); + if (!(unlock_pages || clear_dirty || set_writeback || end_writeback)) + return 0; + + while (nr_pages > 0) { + ret = find_get_pages_contig(inode->i_mapping, index, + min_t(unsigned long, + nr_pages, ARRAY_SIZE(pages)), pages); + for (i = 0; i < ret; i++) { + if (pages[i] == locked_page) { + page_cache_release(pages[i]); + continue; + } + if (clear_dirty) + clear_page_dirty_for_io(pages[i]); 
+ if (set_writeback) + set_page_writeback(pages[i]); + if (end_writeback) + end_page_writeback(pages[i]); + if (unlock_pages) + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + nr_pages -= ret; + index += ret; + cond_resched(); + } + return 0; +} + +/* + * count the number of bytes in the tree that have a given bit(s) + * set. This can be fairly slow, except for EXTENT_DIRTY which is + * cached. The total number found is returned. + */ +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + unsigned long bits) +{ + struct rb_node *node; + struct extent_state *state; + u64 cur_start = *start; + u64 total_bytes = 0; + int found = 0; + + if (search_end <= cur_start) { + WARN_ON(1); + return 0; + } + + spin_lock(&tree->lock); + if (cur_start == 0 && bits == EXTENT_DIRTY) { + total_bytes = tree->dirty_bytes; + goto out; + } + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, cur_start); + if (!node) + goto out; + + while (1) { + state = rb_entry(node, struct extent_state, rb_node); + if (state->start > search_end) + break; + if (state->end >= cur_start && (state->state & bits)) { + total_bytes += min(search_end, state->end) + 1 - + max(cur_start, state->start); + if (total_bytes >= max_bytes) + break; + if (!found) { + *start = state->start; + found = 1; + } + } + node = rb_next(node); + if (!node) + break; + } +out: + spin_unlock(&tree->lock); + return total_bytes; +} + +#if 0 +/* + * helper function to lock both pages and extents in the tree. + * pages must be locked first. + */ +static int lock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + int err; + + while (index <= end_index) { + page = grab_cache_page(tree->mapping, index); + if (!page) { + err = -ENOMEM; + goto failed; + } + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto failed; + } + index++; + } + lock_extent(tree, start, end, GFP_NOFS); + return 0; + +failed: + /* + * we failed above in getting the page at 'index', so we undo here + * up to but not including the page at 'index' + */ + end_index = index; + index = start >> PAGE_CACHE_SHIFT; + while (index < end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + return err; +} + +/* + * helper function to unlock both pages and extents in the tree. + */ +static int unlock_range(struct extent_io_tree *tree, u64 start, u64 end) +{ + unsigned long index = start >> PAGE_CACHE_SHIFT; + unsigned long end_index = end >> PAGE_CACHE_SHIFT; + struct page *page; + + while (index <= end_index) { + page = find_get_page(tree->mapping, index); + unlock_page(page); + page_cache_release(page); + index++; + } + unlock_extent(tree, start, end, GFP_NOFS); + return 0; +} +#endif + +/* + * set the private field for a given byte offset in the tree. If there isn't + * an extent_state there already, this does nothing. + */ +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. 
+ */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + state->private = private; +out: + spin_unlock(&tree->lock); + return ret; +} + +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private) +{ + struct rb_node *node; + struct extent_state *state; + int ret = 0; + + spin_lock(&tree->lock); + /* + * this search will find all the extents that end after + * our range starts. + */ + node = tree_search(tree, start); + if (!node) { + ret = -ENOENT; + goto out; + } + state = rb_entry(node, struct extent_state, rb_node); + if (state->start != start) { + ret = -ENOENT; + goto out; + } + *private = state->private; +out: + spin_unlock(&tree->lock); + return ret; +} + +/* + * searches a range in the state tree for a given mask. + * If 'filled' == 1, this returns 1 only if every extent in the tree + * has the bits set. Otherwise, 1 is returned if any bit in the + * range is found set. + */ +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled) +{ + struct extent_state *state = NULL; + struct rb_node *node; + int bitset = 0; + + spin_lock(&tree->lock); + node = tree_search(tree, start); + while (node && start <= end) { + state = rb_entry(node, struct extent_state, rb_node); + + if (filled && state->start > start) { + bitset = 0; + break; + } + + if (state->start > end) + break; + + if (state->state & bits) { + bitset = 1; + if (!filled) + break; + } else if (filled) { + bitset = 0; + break; + } + start = state->end + 1; + if (start > end) + break; + node = rb_next(node); + if (!node) { + if (filled) + bitset = 0; + break; + } + } + spin_unlock(&tree->lock); + return bitset; +} + +/* + * helper function to set a given page up to date if all the + * extents in the tree for that page are up to date + */ +static int check_page_uptodate(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, EXTENT_UPTODATE, 1)) + SetPageUptodate(page); + return 0; +} + +/* + * helper function to unlock a page if all the extents in the tree + * for that page are unlocked + */ +static int check_page_locked(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_LOCKED, 0)) + unlock_page(page); + return 0; +} + +/* + * helper function to end page writeback if all the extents + * in the tree for that page are done with writeback + */ +static int check_page_writeback(struct extent_io_tree *tree, + struct page *page) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + if (!test_range_bit(tree, start, end, EXTENT_WRITEBACK, 0)) + end_page_writeback(page); + return 0; +} + +/* lots and lots of room for performance fixes in the end_bio funcs */ + +/* + * after a writepage IO is done, we need to: + * clear the uptodate bits on error + * clear the writeback bits in the extent tree for this IO + * end_page_writeback if the page has no more pending IO + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. 
+ */ +static void end_bio_extent_writepage(struct bio *bio, int err) +{ + int uptodate = err == 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + if (tree->ops && tree->ops->writepage_end_io_hook) { + ret = tree->ops->writepage_end_io_hook(page, start, + end, NULL, uptodate); + if (ret) + uptodate = 0; + } + + if (!uptodate && tree->ops && + tree->ops->writepage_io_failed_hook) { + ret = tree->ops->writepage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = (err == 0); + continue; + } + } + + if (!uptodate) { + clear_extent_uptodate(tree, start, end, GFP_ATOMIC); + ClearPageUptodate(page); + SetPageError(page); + } + + clear_extent_writeback(tree, start, end, GFP_ATOMIC); + + if (whole_page) + end_page_writeback(page); + else + check_page_writeback(tree, page); + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * after a readpage IO is done, we need to: + * clear the uptodate bits on error + * set the uptodate bits if things worked + * set the page up to date if all extents in the tree are uptodate + * clear the lock bit in the extent tree + * unlock the page if there are no other extents locked for it + * + * Scheduling is not allowed, so the extent state tree is expected + * to have one and only one object corresponding to this IO. + */ +static void end_bio_extent_readpage(struct bio *bio, int err) +{ + int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + int whole_page; + int ret; + + if (err) + uptodate = 0; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (bvec->bv_offset == 0 && bvec->bv_len == PAGE_CACHE_SIZE) + whole_page = 1; + else + whole_page = 0; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate && tree->ops && tree->ops->readpage_end_io_hook) { + ret = tree->ops->readpage_end_io_hook(page, start, end, + NULL); + if (ret) + uptodate = 0; + } + if (!uptodate && tree->ops && + tree->ops->readpage_io_failed_hook) { + ret = tree->ops->readpage_io_failed_hook(bio, page, + start, end, NULL); + if (ret == 0) { + uptodate = + test_bit(BIO_UPTODATE, &bio->bi_flags); + if (err) + uptodate = 0; + continue; + } + } + + if (uptodate) { + set_extent_uptodate(tree, start, end, + GFP_ATOMIC); + } + unlock_extent(tree, start, end, GFP_ATOMIC); + + if (whole_page) { + if (uptodate) { + SetPageUptodate(page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + unlock_page(page); + } else { + if (uptodate) { + check_page_uptodate(tree, page); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + check_page_locked(tree, page); + } + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +/* + * IO done from prepare_write is pretty simple, we just unlock + * the structs in the extent tree when done, and set the uptodate bits + * as appropriate. 
+ */ +static void end_bio_extent_preparewrite(struct bio *bio, int err) +{ + const int uptodate = test_bit(BIO_UPTODATE, &bio->bi_flags); + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct extent_io_tree *tree; + u64 start; + u64 end; + + do { + struct page *page = bvec->bv_page; + tree = &BTRFS_I(page->mapping->host)->io_tree; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + if (--bvec >= bio->bi_io_vec) + prefetchw(&bvec->bv_page->flags); + + if (uptodate) { + set_extent_uptodate(tree, start, end, GFP_ATOMIC); + } else { + ClearPageUptodate(page); + SetPageError(page); + } + + unlock_extent(tree, start, end, GFP_ATOMIC); + + } while (bvec >= bio->bi_io_vec); + + bio_put(bio); +} + +static struct bio * +extent_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, + gfp_t gfp_flags) +{ + struct bio *bio; + + bio = bio_alloc(gfp_flags, nr_vecs); + + if (bio == NULL && (current->flags & PF_MEMALLOC)) { + while (!bio && (nr_vecs /= 2)) + bio = bio_alloc(gfp_flags, nr_vecs); + } + + if (bio) { + bio->bi_size = 0; + bio->bi_bdev = bdev; + bio->bi_sector = first_sector; + } + return bio; +} + +static int submit_one_bio(int rw, struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + int ret = 0; + struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; + struct page *page = bvec->bv_page; + struct extent_io_tree *tree = bio->bi_private; + u64 start; + u64 end; + + start = ((u64)page->index << PAGE_CACHE_SHIFT) + bvec->bv_offset; + end = start + bvec->bv_len - 1; + + bio->bi_private = NULL; + + bio_get(bio); + + if (tree->ops && tree->ops->submit_bio_hook) + tree->ops->submit_bio_hook(page->mapping->host, rw, bio, + mirror_num, bio_flags); + else + submit_bio(rw, bio); + if (bio_flagged(bio, BIO_EOPNOTSUPP)) + ret = -EOPNOTSUPP; + bio_put(bio); + return ret; +} + +static int submit_extent_page(int rw, struct extent_io_tree *tree, + struct page *page, sector_t sector, + size_t size, unsigned long offset, + struct block_device *bdev, + struct bio **bio_ret, + unsigned long max_pages, + bio_end_io_t end_io_func, + int mirror_num, + unsigned long prev_bio_flags, + unsigned long bio_flags) +{ + int ret = 0; + struct bio *bio; + int nr; + int contig = 0; + int this_compressed = bio_flags & EXTENT_BIO_COMPRESSED; + int old_compressed = prev_bio_flags & EXTENT_BIO_COMPRESSED; + size_t page_size = min_t(size_t, size, PAGE_CACHE_SIZE); + + if (bio_ret && *bio_ret) { + bio = *bio_ret; + if (old_compressed) + contig = bio->bi_sector == sector; + else + contig = bio->bi_sector + (bio->bi_size >> 9) == + sector; + + if (prev_bio_flags != bio_flags || !contig || + (tree->ops && tree->ops->merge_bio_hook && + tree->ops->merge_bio_hook(page, offset, page_size, bio, + bio_flags)) || + bio_add_page(bio, page, page_size, offset) < page_size) { + ret = submit_one_bio(rw, bio, mirror_num, + prev_bio_flags); + bio = NULL; + } else { + return 0; + } + } + if (this_compressed) + nr = BIO_MAX_PAGES; + else + nr = bio_get_nr_vecs(bdev); + + bio = extent_bio_alloc(bdev, sector, nr, GFP_NOFS | __GFP_HIGH); + + bio_add_page(bio, page, page_size, offset); + bio->bi_end_io = end_io_func; + bio->bi_private = tree; + + if (bio_ret) + *bio_ret = bio; + else + ret = submit_one_bio(rw, bio, mirror_num, bio_flags); + + return ret; +} + +void set_page_extent_mapped(struct page *page) +{ + if (!PagePrivate(page)) { + SetPagePrivate(page); + page_cache_get(page); + set_page_private(page, EXTENT_PAGE_PRIVATE); + } +} + +static void 
set_page_extent_head(struct page *page, unsigned long len) +{ + set_page_private(page, EXTENT_PAGE_PRIVATE_FIRST_PAGE | len << 2); +} + +/* + * basic readpage implementation. Locked extent state structs are inserted + * into the tree that are removed when the IO is done (by the end_io + * handlers) + */ +static int __extent_read_full_page(struct extent_io_tree *tree, + struct page *page, + get_extent_t *get_extent, + struct bio **bio, int mirror_num, + unsigned long *bio_flags) +{ + struct inode *inode = page->mapping->host; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 cur_end; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t page_offset = 0; + size_t iosize; + size_t disk_io_size; + size_t blocksize = inode->i_sb->s_blocksize; + unsigned long this_bio_flag = 0; + + set_page_extent_mapped(page); + + end = page_end; + lock_extent(tree, start, end, GFP_NOFS); + + if (page->index == last_byte >> PAGE_CACHE_SHIFT) { + char *userpage; + size_t zero_offset = last_byte & (PAGE_CACHE_SIZE - 1); + + if (zero_offset) { + iosize = PAGE_CACHE_SIZE - zero_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + zero_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + } + } + while (cur <= end) { + if (cur >= last_byte) { + char *userpage; + iosize = PAGE_CACHE_SIZE - page_offset; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + break; + } + em = get_extent(inode, page, page_offset, cur, + end - cur + 1, 0); + if (IS_ERR(em) || !em) { + SetPageError(page); + unlock_extent(tree, cur, end, GFP_NOFS); + break; + } + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) + this_bio_flag = EXTENT_BIO_COMPRESSED; + + iosize = min(extent_map_end(em) - cur, end - cur + 1); + cur_end = min(extent_map_end(em) - 1, end); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + if (this_bio_flag & EXTENT_BIO_COMPRESSED) { + disk_io_size = em->block_len; + sector = em->block_start >> 9; + } else { + sector = (em->block_start + extent_offset) >> 9; + disk_io_size = iosize; + } + bdev = em->bdev; + block_start = em->block_start; + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + block_start = EXTENT_MAP_HOLE; + free_extent_map(em); + em = NULL; + + /* we've found a hole, just zero and go on */ + if (block_start == EXTENT_MAP_HOLE) { + char *userpage; + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + page_offset, 0, iosize); + flush_dcache_page(page); + kunmap_atomic(userpage, KM_USER0); + + set_extent_uptodate(tree, cur, cur + iosize - 1, + GFP_NOFS); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* the get_extent function already copied into the page */ + if (test_range_bit(tree, cur, cur_end, EXTENT_UPTODATE, 1)) { + check_page_uptodate(tree, page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + /* we have an inline extent but it didn't get marked up + * to date. 
Error out + */ + if (block_start == EXTENT_MAP_INLINE) { + SetPageError(page); + unlock_extent(tree, cur, cur + iosize - 1, GFP_NOFS); + cur = cur + iosize; + page_offset += iosize; + continue; + } + + ret = 0; + if (tree->ops && tree->ops->readpage_io_hook) { + ret = tree->ops->readpage_io_hook(page, cur, + cur + iosize - 1); + } + if (!ret) { + unsigned long pnr = (last_byte >> PAGE_CACHE_SHIFT) + 1; + pnr -= page->index; + ret = submit_extent_page(READ, tree, page, + sector, disk_io_size, page_offset, + bdev, bio, pnr, + end_bio_extent_readpage, mirror_num, + *bio_flags, + this_bio_flag); + nr++; + *bio_flags = this_bio_flag; + } + if (ret) + SetPageError(page); + cur = cur + iosize; + page_offset += iosize; + } + if (!nr) { + if (!PageError(page)) + SetPageUptodate(page); + unlock_page(page); + } + return 0; +} + +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent) +{ + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = __extent_read_full_page(tree, page, get_extent, &bio, 0, + &bio_flags); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return ret; +} + +/* + * the writepage semantics are similar to regular writepage. extent + * records are inserted to lock ranges in the tree, and as dirty areas + * are found, they are marked writeback. Then the lock bits are removed + * and the end_io handler clears the writeback ranges + */ +static int __extent_writepage(struct page *page, struct writeback_control *wbc, + void *data) +{ + struct inode *inode = page->mapping->host; + struct extent_page_data *epd = data; + struct extent_io_tree *tree = epd->tree; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 delalloc_start; + u64 page_end = start + PAGE_CACHE_SIZE - 1; + u64 end; + u64 cur = start; + u64 extent_offset; + u64 last_byte = i_size_read(inode); + u64 block_start; + u64 iosize; + u64 unlock_start; + sector_t sector; + struct extent_map *em; + struct block_device *bdev; + int ret; + int nr = 0; + size_t pg_offset = 0; + size_t blocksize; + loff_t i_size = i_size_read(inode); + unsigned long end_index = i_size >> PAGE_CACHE_SHIFT; + u64 nr_delalloc; + u64 delalloc_end; + int page_started; + int compressed; + unsigned long nr_written = 0; + + WARN_ON(!PageLocked(page)); + pg_offset = i_size & (PAGE_CACHE_SIZE - 1); + if (page->index > end_index || + (page->index == end_index && !pg_offset)) { + page->mapping->a_ops->invalidatepage(page, 0); + unlock_page(page); + return 0; + } + + if (page->index == end_index) { + char *userpage; + + userpage = kmap_atomic(page, KM_USER0); + memset(userpage + pg_offset, 0, + PAGE_CACHE_SIZE - pg_offset); + kunmap_atomic(userpage, KM_USER0); + flush_dcache_page(page); + } + pg_offset = 0; + + set_page_extent_mapped(page); + + delalloc_start = start; + delalloc_end = 0; + page_started = 0; + if (!epd->extent_locked) { + while (delalloc_end < page_end) { + nr_delalloc = find_lock_delalloc_range(inode, tree, + page, + &delalloc_start, + &delalloc_end, + 128 * 1024 * 1024); + if (nr_delalloc == 0) { + delalloc_start = delalloc_end + 1; + continue; + } + tree->ops->fill_delalloc(inode, page, delalloc_start, + delalloc_end, &page_started, + &nr_written); + delalloc_start = delalloc_end + 1; + } + + /* did the fill delalloc function already unlock and start + * the IO? 
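+ * If so, there is nothing more for this writepage call to submit.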
+ */ + if (page_started) { + ret = 0; + goto update_nr_written; + } + } + lock_extent(tree, start, page_end, GFP_NOFS); + + unlock_start = start; + + if (tree->ops && tree->ops->writepage_start_hook) { + ret = tree->ops->writepage_start_hook(page, start, + page_end); + if (ret == -EAGAIN) { + unlock_extent(tree, start, page_end, GFP_NOFS); + redirty_page_for_writepage(wbc, page); + unlock_page(page); + ret = 0; + goto update_nr_written; + } + } + + nr_written++; + + end = page_end; + if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) + printk(KERN_ERR "btrfs delalloc bits after lock_extent\n"); + + if (last_byte <= start) { + clear_extent_dirty(tree, start, page_end, GFP_NOFS); + unlock_extent(tree, start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + page_end, NULL, 1); + unlock_start = page_end + 1; + goto done; + } + + set_extent_uptodate(tree, start, page_end, GFP_NOFS); + blocksize = inode->i_sb->s_blocksize; + + while (cur <= end) { + if (cur >= last_byte) { + clear_extent_dirty(tree, cur, page_end, GFP_NOFS); + unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + page_end, NULL, 1); + unlock_start = page_end + 1; + break; + } + em = epd->get_extent(inode, page, pg_offset, cur, + end - cur + 1, 1); + if (IS_ERR(em) || !em) { + SetPageError(page); + break; + } + + extent_offset = cur - em->start; + BUG_ON(extent_map_end(em) <= cur); + BUG_ON(end < cur); + iosize = min(extent_map_end(em) - cur, end - cur + 1); + iosize = (iosize + blocksize - 1) & ~((u64)blocksize - 1); + sector = (em->block_start + extent_offset) >> 9; + bdev = em->bdev; + block_start = em->block_start; + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + free_extent_map(em); + em = NULL; + + /* + * compressed and inline extents are written through other + * paths in the FS + */ + if (compressed || block_start == EXTENT_MAP_HOLE || + block_start == EXTENT_MAP_INLINE) { + clear_extent_dirty(tree, cur, + cur + iosize - 1, GFP_NOFS); + + unlock_extent(tree, unlock_start, cur + iosize - 1, + GFP_NOFS); + + /* + * end_io notification does not happen here for + * compressed extents + */ + if (!compressed && tree->ops && + tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, cur, + cur + iosize - 1, + NULL, 1); + else if (compressed) { + /* we don't want to end_page_writeback on + * a compressed extent. 
this happens + * elsewhere + */ + nr++; + } + + cur += iosize; + pg_offset += iosize; + unlock_start = cur; + continue; + } + /* leave this out until we have a page_mkwrite call */ + if (0 && !test_range_bit(tree, cur, cur + iosize - 1, + EXTENT_DIRTY, 0)) { + cur = cur + iosize; + pg_offset += iosize; + continue; + } + + clear_extent_dirty(tree, cur, cur + iosize - 1, GFP_NOFS); + if (tree->ops && tree->ops->writepage_io_hook) { + ret = tree->ops->writepage_io_hook(page, cur, + cur + iosize - 1); + } else { + ret = 0; + } + if (ret) { + SetPageError(page); + } else { + unsigned long max_nr = end_index + 1; + + set_range_writeback(tree, cur, cur + iosize - 1); + if (!PageWriteback(page)) { + printk(KERN_ERR "btrfs warning page %lu not " + "writeback, cur %llu end %llu\n", + page->index, (unsigned long long)cur, + (unsigned long long)end); + } + + ret = submit_extent_page(WRITE, tree, page, sector, + iosize, pg_offset, bdev, + &epd->bio, max_nr, + end_bio_extent_writepage, + 0, 0, 0); + if (ret) + SetPageError(page); + } + cur = cur + iosize; + pg_offset += iosize; + nr++; + } +done: + if (nr == 0) { + /* make sure the mapping tag for page dirty gets cleared */ + set_page_writeback(page); + end_page_writeback(page); + } + if (unlock_start <= page_end) + unlock_extent(tree, unlock_start, page_end, GFP_NOFS); + unlock_page(page); + +update_nr_written: + wbc->nr_to_write -= nr_written; + if (wbc->range_cyclic || (wbc->nr_to_write > 0 && + wbc->range_start == 0 && wbc->range_end == LLONG_MAX)) + page->mapping->writeback_index = page->index + nr_written; + return 0; +} + +/** + * write_cache_pages - walk the list of dirty pages of the given address space and write all of them. + * @mapping: address space structure to write + * @wbc: subtract the number of written pages from *@wbc->nr_to_write + * @writepage: function called for each page + * @data: data passed to writepage function + * + * If a page is already under I/O, write_cache_pages() skips it, even + * if it's dirty. This is desirable behaviour for memory-cleaning writeback, + * but it is INCORRECT for data-integrity system calls such as fsync(). fsync() + * and msync() need to guarantee that all the data which was dirty at the time + * the call was made get new I/O started against them. If wbc->sync_mode is + * WB_SYNC_ALL then we were called for data integrity and we must wait for + * existing IO to complete. 
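+ * (A local variant of write_cache_pages() that adds a bio flush callback and
+ * an optional per-page lock hook.)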
+ */ +static int extent_write_cache_pages(struct extent_io_tree *tree, + struct address_space *mapping, + struct writeback_control *wbc, + writepage_t writepage, void *data, + void (*flush_fn)(void *)) +{ + struct backing_dev_info *bdi = mapping->backing_dev_info; + int ret = 0; + int done = 0; + struct pagevec pvec; + int nr_pages; + pgoff_t index; + pgoff_t end; /* Inclusive */ + int scanned = 0; + int range_whole = 0; + + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + return 0; + } + + pagevec_init(&pvec, 0); + if (wbc->range_cyclic) { + index = mapping->writeback_index; /* Start from prev offset */ + end = -1; + } else { + index = wbc->range_start >> PAGE_CACHE_SHIFT; + end = wbc->range_end >> PAGE_CACHE_SHIFT; + if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) + range_whole = 1; + scanned = 1; + } +retry: + while (!done && (index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_DIRTY, min(end - index, + (pgoff_t)PAGEVEC_SIZE-1) + 1))) { + unsigned i; + + scanned = 1; + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* + * At this point we hold neither mapping->tree_lock nor + * lock on the page itself: the page may be truncated or + * invalidated (changing page->mapping to NULL), or even + * swizzled back from swapper_space to tmpfs file + * mapping + */ + if (tree->ops && tree->ops->write_cache_pages_lock_hook) + tree->ops->write_cache_pages_lock_hook(page); + else + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + unlock_page(page); + continue; + } + + if (!wbc->range_cyclic && page->index > end) { + done = 1; + unlock_page(page); + continue; + } + + if (wbc->sync_mode != WB_SYNC_NONE) { + if (PageWriteback(page)) + flush_fn(data); + wait_on_page_writeback(page); + } + + if (PageWriteback(page) || + !clear_page_dirty_for_io(page)) { + unlock_page(page); + continue; + } + + ret = (*writepage)(page, wbc, data); + + if (unlikely(ret == AOP_WRITEPAGE_ACTIVATE)) { + unlock_page(page); + ret = 0; + } + if (ret || wbc->nr_to_write <= 0) + done = 1; + if (wbc->nonblocking && bdi_write_congested(bdi)) { + wbc->encountered_congestion = 1; + done = 1; + } + } + pagevec_release(&pvec); + cond_resched(); + } + if (!scanned && !done) { + /* + * We hit the last page and there is more work to be done: wrap + * back to the start of the file + */ + scanned = 1; + index = 0; + goto retry; + } + return ret; +} + +static noinline void flush_write_bio(void *data) +{ + struct extent_page_data *epd = data; + if (epd->bio) { + submit_one_bio(WRITE, epd->bio, 0, 0); + epd->bio = NULL; + } +} + +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret; + struct address_space *mapping = page->mapping; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + }; + struct writeback_control wbc_writepages = { + .bdi = wbc->bdi, + .sync_mode = WB_SYNC_NONE, + .older_than_this = NULL, + .nr_to_write = 64, + .range_start = page_offset(page) + PAGE_CACHE_SIZE, + .range_end = (loff_t)-1, + }; + + + ret = __extent_writepage(page, wbc, &epd); + + extent_write_cache_pages(tree, mapping, &wbc_writepages, + __extent_writepage, &epd, flush_write_bio); + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t 
*get_extent, + int mode) +{ + int ret = 0; + struct address_space *mapping = inode->i_mapping; + struct page *page; + unsigned long nr_pages = (end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 1, + }; + struct writeback_control wbc_writepages = { + .bdi = inode->i_mapping->backing_dev_info, + .sync_mode = mode, + .older_than_this = NULL, + .nr_to_write = nr_pages * 2, + .range_start = start, + .range_end = end + 1, + }; + + while (start <= end) { + page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + if (clear_page_dirty_for_io(page)) + ret = __extent_writepage(page, &wbc_writepages, &epd); + else { + if (tree->ops && tree->ops->writepage_end_io_hook) + tree->ops->writepage_end_io_hook(page, start, + start + PAGE_CACHE_SIZE - 1, + NULL, 1); + unlock_page(page); + } + page_cache_release(page); + start += PAGE_CACHE_SIZE; + } + + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc) +{ + int ret = 0; + struct extent_page_data epd = { + .bio = NULL, + .tree = tree, + .get_extent = get_extent, + .extent_locked = 0, + }; + + ret = extent_write_cache_pages(tree, mapping, wbc, + __extent_writepage, &epd, + flush_write_bio); + if (epd.bio) + submit_one_bio(WRITE, epd.bio, 0, 0); + return ret; +} + +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head *pages, unsigned nr_pages, + get_extent_t get_extent) +{ + struct bio *bio = NULL; + unsigned page_idx; + struct pagevec pvec; + unsigned long bio_flags = 0; + + pagevec_init(&pvec, 0); + for (page_idx = 0; page_idx < nr_pages; page_idx++) { + struct page *page = list_entry(pages->prev, struct page, lru); + + prefetchw(&page->flags); + list_del(&page->lru); + /* + * what we want to do here is call add_to_page_cache_lru, + * but that isn't exported, so we reproduce it here + */ + if (!add_to_page_cache(page, mapping, + page->index, GFP_KERNEL)) { + + /* open coding of lru_cache_add, also not exported */ + page_cache_get(page); + if (!pagevec_add(&pvec, page)) + __pagevec_lru_add_file(&pvec); + __extent_read_full_page(tree, page, get_extent, + &bio, 0, &bio_flags); + } + page_cache_release(page); + } + if (pagevec_count(&pvec)) + __pagevec_lru_add_file(&pvec); + BUG_ON(!list_empty(pages)); + if (bio) + submit_one_bio(READ, bio, 0, bio_flags); + return 0; +} + +/* + * basic invalidatepage code, this waits on any locked or writeback + * ranges corresponding to the page, and then deletes any extent state + * records from the tree + */ +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset) +{ + u64 start = ((u64)page->index << PAGE_CACHE_SHIFT); + u64 end = start + PAGE_CACHE_SIZE - 1; + size_t blocksize = page->mapping->host->i_sb->s_blocksize; + + start += (offset + blocksize - 1) & ~(blocksize - 1); + if (start > end) + return 0; + + lock_extent(tree, start, end, GFP_NOFS); + wait_on_extent_writeback(tree, start, end); + clear_extent_bit(tree, start, end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC, + 1, 1, GFP_NOFS); + return 0; +} + +/* + * simple commit_write call, set_range_dirty is used to mark both + * the pages and the extent records as dirty + */ +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned 
to) +{ + loff_t pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to; + + set_page_extent_mapped(page); + set_page_dirty(page); + + if (pos > inode->i_size) { + i_size_write(inode, pos); + mark_inode_dirty(inode); + } + return 0; +} + +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent) +{ + u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + u64 block_start; + u64 orig_block_start; + u64 block_end; + u64 cur_end; + struct extent_map *em; + unsigned blocksize = 1 << inode->i_blkbits; + size_t page_offset = 0; + size_t block_off_start; + size_t block_off_end; + int err = 0; + int iocount = 0; + int ret = 0; + int isnew; + + set_page_extent_mapped(page); + + block_start = (page_start + from) & ~((u64)blocksize - 1); + block_end = (page_start + to - 1) | (blocksize - 1); + orig_block_start = block_start; + + lock_extent(tree, page_start, page_end, GFP_NOFS); + while (block_start <= block_end) { + em = get_extent(inode, page, page_offset, block_start, + block_end - block_start + 1, 1); + if (IS_ERR(em) || !em) + goto err; + + cur_end = min(block_end, extent_map_end(em) - 1); + block_off_start = block_start & (PAGE_CACHE_SIZE - 1); + block_off_end = block_off_start + blocksize; + isnew = clear_extent_new(tree, block_start, cur_end, GFP_NOFS); + + if (!PageUptodate(page) && isnew && + (block_off_end > to || block_off_start < from)) { + void *kaddr; + + kaddr = kmap_atomic(page, KM_USER0); + if (block_off_end > to) + memset(kaddr + to, 0, block_off_end - to); + if (block_off_start < from) + memset(kaddr + block_off_start, 0, + from - block_off_start); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + } + if ((em->block_start != EXTENT_MAP_HOLE && + em->block_start != EXTENT_MAP_INLINE) && + !isnew && !PageUptodate(page) && + (block_off_end > to || block_off_start < from) && + !test_range_bit(tree, block_start, cur_end, + EXTENT_UPTODATE, 1)) { + u64 sector; + u64 extent_offset = block_start - em->start; + size_t iosize; + sector = (em->block_start + extent_offset) >> 9; + iosize = (cur_end - block_start + blocksize) & + ~((u64)blocksize - 1); + /* + * we've already got the extent locked, but we + * need to split the state such that our end_bio + * handler can clear the lock. + */ + set_extent_bit(tree, block_start, + block_start + iosize - 1, + EXTENT_LOCKED, 0, NULL, GFP_NOFS); + ret = submit_extent_page(READ, tree, page, + sector, iosize, page_offset, em->bdev, + NULL, 1, + end_bio_extent_preparewrite, 0, + 0, 0); + iocount++; + block_start = block_start + iosize; + } else { + set_extent_uptodate(tree, block_start, cur_end, + GFP_NOFS); + unlock_extent(tree, block_start, cur_end, GFP_NOFS); + block_start = cur_end + 1; + } + page_offset = block_start & (PAGE_CACHE_SIZE - 1); + free_extent_map(em); + } + if (iocount) { + wait_extent_bit(tree, orig_block_start, + block_end, EXTENT_LOCKED); + } + check_page_uptodate(tree, page); +err: + /* FIXME, zero out newly allocated blocks on error */ + return err; +} + +/* + * a helper for releasepage, this tests for areas of the page that + * are locked or under IO and drops the related state bits if it is safe + * to drop the page. 
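+ * Returns 1 if the state could be dropped, 0 if the range is still busy
+ * (locked, under writeback or ordered).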
+ */ +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + int ret = 1; + + if (test_range_bit(tree, start, end, + EXTENT_IOBITS | EXTENT_ORDERED, 0)) + ret = 0; + else { + if ((mask & GFP_NOFS) == GFP_NOFS) + mask = GFP_NOFS; + clear_extent_bit(tree, start, end, EXTENT_UPTODATE, + 1, 1, mask); + } + return ret; +} + +/* + * a helper for releasepage. As long as there are no locked extents + * in the range corresponding to the page, both state records and extent + * map records are removed + */ +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask) +{ + struct extent_map *em; + u64 start = (u64)page->index << PAGE_CACHE_SHIFT; + u64 end = start + PAGE_CACHE_SIZE - 1; + + if ((mask & __GFP_WAIT) && + page->mapping->host->i_size > 16 * 1024 * 1024) { + u64 len; + while (start <= end) { + len = end - start + 1; + spin_lock(&map->lock); + em = lookup_extent_mapping(map, start, len); + if (!em || IS_ERR(em)) { + spin_unlock(&map->lock); + break; + } + if (test_bit(EXTENT_FLAG_PINNED, &em->flags) || + em->start != start) { + spin_unlock(&map->lock); + free_extent_map(em); + break; + } + if (!test_range_bit(tree, em->start, + extent_map_end(em) - 1, + EXTENT_LOCKED | EXTENT_WRITEBACK | + EXTENT_ORDERED, + 0)) { + remove_extent_mapping(map, em); + /* once for the rb tree */ + free_extent_map(em); + } + start = extent_map_end(em); + spin_unlock(&map->lock); + + /* once for us */ + free_extent_map(em); + } + } + return try_release_extent_state(map, tree, page, mask); +} + +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent) +{ + struct inode *inode = mapping->host; + u64 start = iblock << inode->i_blkbits; + sector_t sector = 0; + size_t blksize = (1 << inode->i_blkbits); + struct extent_map *em; + + lock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + em = get_extent(inode, NULL, 0, start, blksize, 0); + unlock_extent(&BTRFS_I(inode)->io_tree, start, start + blksize - 1, + GFP_NOFS); + if (!em || IS_ERR(em)) + return 0; + + if (em->block_start > EXTENT_MAP_LAST_BYTE) + goto out; + + sector = (em->block_start + start - em->start) >> inode->i_blkbits; +out: + free_extent_map(em); + return sector; +} + +static inline struct page *extent_buffer_page(struct extent_buffer *eb, + unsigned long i) +{ + struct page *p; + struct address_space *mapping; + + if (i == 0) + return eb->first_page; + i += eb->start >> PAGE_CACHE_SHIFT; + mapping = eb->first_page->mapping; + if (!mapping) + return NULL; + + /* + * extent_buffer_page is only called after pinning the page + * by increasing the reference count. So we know the page must + * be in the radix tree. 
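+ * A plain radix tree lookup under rcu_read_lock() is therefore sufficient.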
+ */ + rcu_read_lock(); + p = radix_tree_lookup(&mapping->page_tree, i); + rcu_read_unlock(); + + return p; +} + +static inline unsigned long num_extent_pages(u64 start, u64 len) +{ + return ((start + len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT) - + (start >> PAGE_CACHE_SHIFT); +} + +static struct extent_buffer *__alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, + unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb = NULL; +#ifdef LEAK_DEBUG + unsigned long flags; +#endif + + eb = kmem_cache_zalloc(extent_buffer_cache, mask); + eb->start = start; + eb->len = len; + mutex_init(&eb->mutex); +#ifdef LEAK_DEBUG + spin_lock_irqsave(&leak_lock, flags); + list_add(&eb->leak_list, &buffers); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + atomic_set(&eb->refs, 1); + + return eb; +} + +static void __free_extent_buffer(struct extent_buffer *eb) +{ +#ifdef LEAK_DEBUG + unsigned long flags; + spin_lock_irqsave(&leak_lock, flags); + list_del(&eb->leak_list); + spin_unlock_irqrestore(&leak_lock, flags); +#endif + kmem_cache_free(extent_buffer_cache, eb); +} + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask) +{ + unsigned long num_pages = num_extent_pages(start, len); + unsigned long i; + unsigned long index = start >> PAGE_CACHE_SHIFT; + struct extent_buffer *eb; + struct extent_buffer *exists = NULL; + struct page *p; + struct address_space *mapping = tree->mapping; + int uptodate = 1; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) { + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + mark_page_accessed(eb->first_page); + return eb; + } + spin_unlock(&tree->buffer_lock); + + eb = __alloc_extent_buffer(tree, start, len, mask); + if (!eb) + return NULL; + + if (page0) { + eb->first_page = page0; + i = 1; + index++; + page_cache_get(page0); + mark_page_accessed(page0); + set_page_extent_mapped(page0); + set_page_extent_head(page0, len); + uptodate = PageUptodate(page0); + } else { + i = 0; + } + for (; i < num_pages; i++, index++) { + p = find_or_create_page(mapping, index, mask | __GFP_HIGHMEM); + if (!p) { + WARN_ON(1); + goto free_eb; + } + set_page_extent_mapped(p); + mark_page_accessed(p); + if (i == 0) { + eb->first_page = p; + set_page_extent_head(p, len); + } else { + set_page_private(p, EXTENT_PAGE_PRIVATE); + } + if (!PageUptodate(p)) + uptodate = 0; + unlock_page(p); + } + if (uptodate) + eb->flags |= EXTENT_UPTODATE; + eb->flags |= EXTENT_BUFFER_FILLED; + + spin_lock(&tree->buffer_lock); + exists = buffer_tree_insert(tree, start, &eb->rb_node); + if (exists) { + /* add one reference for the caller */ + atomic_inc(&exists->refs); + spin_unlock(&tree->buffer_lock); + goto free_eb; + } + spin_unlock(&tree->buffer_lock); + + /* add one reference for the tree */ + atomic_inc(&eb->refs); + return eb; + +free_eb: + if (!atomic_dec_and_test(&eb->refs)) + return exists; + for (index = 1; index < i; index++) + page_cache_release(extent_buffer_page(eb, index)); + page_cache_release(extent_buffer_page(eb, 0)); + __free_extent_buffer(eb); + return exists; +} + +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask) +{ + struct extent_buffer *eb; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (eb) + atomic_inc(&eb->refs); + spin_unlock(&tree->buffer_lock); + + if (eb) + mark_page_accessed(eb->first_page); + + return eb; +} + +void 
free_extent_buffer(struct extent_buffer *eb) +{ + if (!eb) + return; + + if (!atomic_dec_and_test(&eb->refs)) + return; + + WARN_ON(1); +} + +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int set; + unsigned long i; + unsigned long num_pages; + struct page *page; + + u64 start = eb->start; + u64 end = start + eb->len - 1; + + set = clear_extent_dirty(tree, start, end, GFP_NOFS); + num_pages = num_extent_pages(eb->start, eb->len); + + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!set && !PageDirty(page)) + continue; + + lock_page(page); + if (i == 0) + set_page_extent_head(page, eb->len); + else + set_page_private(page, EXTENT_PAGE_PRIVATE); + + /* + * if we're on the last page or the first page and the + * block isn't aligned on a page boundary, do extra checks + * to make sure we don't clean page that is partially dirty + */ + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + start = (u64)page->index << PAGE_CACHE_SHIFT; + end = start + PAGE_CACHE_SIZE - 1; + if (test_range_bit(tree, start, end, + EXTENT_DIRTY, 0)) { + unlock_page(page); + continue; + } + } + clear_page_dirty_for_io(page); + spin_lock_irq(&page->mapping->tree_lock); + if (!PageDirty(page)) { + radix_tree_tag_clear(&page->mapping->page_tree, + page_index(page), + PAGECACHE_TAG_DIRTY); + } + spin_unlock_irq(&page->mapping->tree_lock); + unlock_page(page); + } + return 0; +} + +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + return wait_on_extent_writeback(tree, eb->start, + eb->start + eb->len - 1); +} + +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + struct page *page = extent_buffer_page(eb, i); + /* writepage may need to do something special for the + * first page, we have to make sure page->private is + * properly set. releasepage may drop page->private + * on us if the page isn't already dirty. 
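+ * The page lock is taken below while page->private is fixed up.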
+ */ + lock_page(page); + if (i == 0) { + set_page_extent_head(page, eb->len); + } else if (PagePrivate(page) && + page->private != EXTENT_PAGE_PRIVATE) { + set_page_extent_mapped(page); + } + __set_page_dirty_nobuffers(extent_buffer_page(eb, i)); + set_extent_dirty(tree, page_offset(page), + page_offset(page) + PAGE_CACHE_SIZE - 1, + GFP_NOFS); + unlock_page(page); + } + return 0; +} + +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + eb->flags &= ~EXTENT_UPTODATE; + + clear_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (page) + ClearPageUptodate(page); + } + return 0; +} + +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + unsigned long i; + struct page *page; + unsigned long num_pages; + + num_pages = num_extent_pages(eb->start, eb->len); + + set_extent_uptodate(tree, eb->start, eb->start + eb->len - 1, + GFP_NOFS); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) || + ((i == num_pages - 1) && + ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) { + check_page_uptodate(tree, page); + continue; + } + SetPageUptodate(page); + } + return 0; +} + +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end) +{ + struct page *page; + int ret; + int pg_uptodate = 1; + int uptodate; + unsigned long index; + + ret = test_range_bit(tree, start, end, EXTENT_UPTODATE, 1); + if (ret) + return 1; + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + page = find_get_page(tree->mapping, index); + uptodate = PageUptodate(page); + page_cache_release(page); + if (!uptodate) { + pg_uptodate = 0; + break; + } + start += PAGE_CACHE_SIZE; + } + return pg_uptodate; +} + +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb) +{ + int ret = 0; + unsigned long num_pages; + unsigned long i; + struct page *page; + int pg_uptodate = 1; + + if (eb->flags & EXTENT_UPTODATE) + return 1; + + ret = test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1); + if (ret) + return ret; + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!PageUptodate(page)) { + pg_uptodate = 0; + break; + } + } + return pg_uptodate; +} + +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, + u64 start, int wait, + get_extent_t *get_extent, int mirror_num) +{ + unsigned long i; + unsigned long start_i; + struct page *page; + int err; + int ret = 0; + int locked_pages = 0; + int all_uptodate = 1; + int inc_all_pages = 0; + unsigned long num_pages; + struct bio *bio = NULL; + unsigned long bio_flags = 0; + + if (eb->flags & EXTENT_UPTODATE) + return 0; + + if (test_range_bit(tree, eb->start, eb->start + eb->len - 1, + EXTENT_UPTODATE, 1)) { + return 0; + } + + if (start) { + WARN_ON(start < eb->start); + start_i = (start >> PAGE_CACHE_SHIFT) - + (eb->start >> PAGE_CACHE_SHIFT); + } else { + start_i = 0; + } + + num_pages = num_extent_pages(eb->start, eb->len); + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (!wait) { + if (!trylock_page(page)) + goto unlock_exit; + } else { + lock_page(page); + } + locked_pages++; + if (!PageUptodate(page)) + 
all_uptodate = 0; + } + if (all_uptodate) { + if (start_i == 0) + eb->flags |= EXTENT_UPTODATE; + goto unlock_exit; + } + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + if (inc_all_pages) + page_cache_get(page); + if (!PageUptodate(page)) { + if (start_i == 0) + inc_all_pages = 1; + ClearPageError(page); + err = __extent_read_full_page(tree, page, + get_extent, &bio, + mirror_num, &bio_flags); + if (err) + ret = err; + } else { + unlock_page(page); + } + } + + if (bio) + submit_one_bio(READ, bio, mirror_num, bio_flags); + + if (ret || !wait) + return ret; + + for (i = start_i; i < num_pages; i++) { + page = extent_buffer_page(eb, i); + wait_on_page_locked(page); + if (!PageUptodate(page)) + ret = -EIO; + } + + if (!ret) + eb->flags |= EXTENT_UPTODATE; + return ret; + +unlock_exit: + i = start_i; + while (locked_pages > 0) { + page = extent_buffer_page(eb, i); + i++; + unlock_page(page); + locked_pages--; + } + return ret; +} + +void read_extent_buffer(struct extent_buffer *eb, void *dstv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *dst = (char *)dstv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(dst, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER1); + + dst += cur; + len -= cur; + offset = 0; + i++; + } +} + +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + size_t offset = start & (PAGE_CACHE_SIZE - 1); + char *kaddr; + struct page *p; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + unsigned long end_i = (start_offset + start + min_len - 1) >> + PAGE_CACHE_SHIFT; + + if (i != end_i) + return -EINVAL; + + if (i == 0) { + offset = start_offset; + *map_start = 0; + } else { + offset = 0; + *map_start = ((u64)i << PAGE_CACHE_SHIFT) - start_offset; + } + + if (start + min_len > eb->len) { + printk(KERN_ERR "btrfs bad mapping eb start %llu len %lu, " + "wanted %lu %lu\n", (unsigned long long)eb->start, + eb->len, start, min_len); + WARN_ON(1); + } + + p = extent_buffer_page(eb, i); + kaddr = kmap_atomic(p, km); + *token = kaddr; + *map = kaddr + offset; + *map_len = PAGE_CACHE_SIZE - offset; + return 0; +} + +int map_extent_buffer(struct extent_buffer *eb, unsigned long start, + unsigned long min_len, + char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km) +{ + int err; + int save = 0; + if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, km); + eb->map_token = NULL; + save = 1; + WARN_ON(!mutex_is_locked(&eb->mutex)); + } + err = map_private_extent_buffer(eb, start, min_len, token, map, + map_start, map_len, km); + if (!err && save) { + eb->map_token = *token; + eb->kaddr = *map; + eb->map_start = *map_start; + eb->map_len = *map_len; + } + return err; +} + +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km) +{ + kunmap_atomic(token, km); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void 
*ptrv, + unsigned long start, + unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *ptr = (char *)ptrv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + int ret = 0; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + + cur = min(len, (PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + ret = memcmp(ptr, kaddr + offset, cur); + kunmap_atomic(kaddr, KM_USER0); + if (ret) + break; + + ptr += cur; + len -= cur; + offset = 0; + i++; + } + return ret; +} + +void write_extent_buffer(struct extent_buffer *eb, const void *srcv, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + char *src = (char *)srcv; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER1); + memcpy(kaddr + offset, src, cur); + kunmap_atomic(kaddr, KM_USER1); + + src += cur; + len -= cur; + offset = 0; + i++; + } +} + +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len) +{ + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = eb->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + start) >> PAGE_CACHE_SHIFT; + + WARN_ON(start > eb->len); + WARN_ON(start + len > eb->start + eb->len); + + offset = (start_offset + start) & ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(eb, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, PAGE_CACHE_SIZE - offset); + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, c, cur); + kunmap_atomic(kaddr, KM_USER0); + + len -= cur; + offset = 0; + i++; + } +} + +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len) +{ + u64 dst_len = dst->len; + size_t cur; + size_t offset; + struct page *page; + char *kaddr; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + + WARN_ON(src->len != dst_len); + + offset = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + while (len > 0) { + page = extent_buffer_page(dst, i); + WARN_ON(!PageUptodate(page)); + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); + + kaddr = kmap_atomic(page, KM_USER0); + read_extent_buffer(src, kaddr + offset, src_offset, cur); + kunmap_atomic(kaddr, KM_USER0); + + src_offset += cur; + len -= cur; + offset = 0; + i++; + } +} + +static void move_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + if (dst_page == src_page) { + memmove(dst_kaddr + dst_off, dst_kaddr + src_off, len); + } else { + char *src_kaddr = kmap_atomic(src_page, KM_USER1); + char *p = dst_kaddr + dst_off + len; + char *s = 
src_kaddr + src_off + len; + + while (len--) + *--p = *--s; + + kunmap_atomic(src_kaddr, KM_USER1); + } + kunmap_atomic(dst_kaddr, KM_USER0); +} + +static void copy_pages(struct page *dst_page, struct page *src_page, + unsigned long dst_off, unsigned long src_off, + unsigned long len) +{ + char *dst_kaddr = kmap_atomic(dst_page, KM_USER0); + char *src_kaddr; + + if (dst_page != src_page) + src_kaddr = kmap_atomic(src_page, KM_USER1); + else + src_kaddr = dst_kaddr; + + memcpy(dst_kaddr + dst_off, src_kaddr + src_off, len); + kunmap_atomic(dst_kaddr, KM_USER0); + if (dst_page != src_page) + kunmap_atomic(src_kaddr, KM_USER1); +} + +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu dst len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu dst len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + + while (len > 0) { + dst_off_in_page = (start_offset + dst_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_offset) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + dst_i = (start_offset + dst_offset) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_offset) >> PAGE_CACHE_SHIFT; + + cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - + src_off_in_page)); + cur = min_t(unsigned long, cur, + (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); + + copy_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page, src_off_in_page, cur); + + src_offset += cur; + dst_offset += cur; + len -= cur; + } +} + +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len) +{ + size_t cur; + size_t dst_off_in_page; + size_t src_off_in_page; + unsigned long dst_end = dst_offset + len - 1; + unsigned long src_end = src_offset + len - 1; + size_t start_offset = dst->start & ((u64)PAGE_CACHE_SIZE - 1); + unsigned long dst_i; + unsigned long src_i; + + if (src_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus src_offset %lu move " + "len %lu len %lu\n", src_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset + len > dst->len) { + printk(KERN_ERR "btrfs memmove bogus dst_offset %lu move " + "len %lu len %lu\n", dst_offset, len, dst->len); + BUG_ON(1); + } + if (dst_offset < src_offset) { + memcpy_extent_buffer(dst, dst_offset, src_offset, len); + return; + } + while (len > 0) { + dst_i = (start_offset + dst_end) >> PAGE_CACHE_SHIFT; + src_i = (start_offset + src_end) >> PAGE_CACHE_SHIFT; + + dst_off_in_page = (start_offset + dst_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + src_off_in_page = (start_offset + src_end) & + ((unsigned long)PAGE_CACHE_SIZE - 1); + + cur = min_t(unsigned long, len, src_off_in_page + 1); + cur = min(cur, dst_off_in_page + 1); + move_pages(extent_buffer_page(dst, dst_i), + extent_buffer_page(dst, src_i), + dst_off_in_page - cur + 1, + src_off_in_page - cur + 1, cur); + + dst_end -= cur; + src_end -= cur; + len -= cur; + } +} + +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page) +{ + u64 start = page_offset(page); + struct extent_buffer 
*eb; + int ret = 1; + unsigned long i; + unsigned long num_pages; + + spin_lock(&tree->buffer_lock); + eb = buffer_search(tree, start); + if (!eb) + goto out; + + if (atomic_read(&eb->refs) > 1) { + ret = 0; + goto out; + } + /* at this point we can safely release the extent buffer */ + num_pages = num_extent_pages(eb->start, eb->len); + for (i = 0; i < num_pages; i++) + page_cache_release(extent_buffer_page(eb, i)); + rb_erase(&eb->rb_node, &tree->buffer); + __free_extent_buffer(eb); +out: + spin_unlock(&tree->buffer_lock); + return ret; +} diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h new file mode 100644 index 00000000000..c5b483a7913 --- /dev/null +++ b/fs/btrfs/extent_io.h @@ -0,0 +1,269 @@ +#ifndef __EXTENTIO__ +#define __EXTENTIO__ + +#include <linux/rbtree.h> + +/* bits for the extent state */ +#define EXTENT_DIRTY 1 +#define EXTENT_WRITEBACK (1 << 1) +#define EXTENT_UPTODATE (1 << 2) +#define EXTENT_LOCKED (1 << 3) +#define EXTENT_NEW (1 << 4) +#define EXTENT_DELALLOC (1 << 5) +#define EXTENT_DEFRAG (1 << 6) +#define EXTENT_DEFRAG_DONE (1 << 7) +#define EXTENT_BUFFER_FILLED (1 << 8) +#define EXTENT_ORDERED (1 << 9) +#define EXTENT_ORDERED_METADATA (1 << 10) +#define EXTENT_BOUNDARY (1 << 11) +#define EXTENT_NODATASUM (1 << 12) +#define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK) + +/* flags for bio submission */ +#define EXTENT_BIO_COMPRESSED 1 + +/* + * page->private values. Every page that is controlled by the extent + * map has page->private set to one. + */ +#define EXTENT_PAGE_PRIVATE 1 +#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 + +struct extent_state; + +typedef int (extent_submit_bio_hook_t)(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags); +struct extent_io_ops { + int (*fill_delalloc)(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written); + int (*writepage_start_hook)(struct page *page, u64 start, u64 end); + int (*writepage_io_hook)(struct page *page, u64 start, u64 end); + extent_submit_bio_hook_t *submit_bio_hook; + int (*merge_bio_hook)(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags); + int (*readpage_io_hook)(struct page *page, u64 start, u64 end); + int (*readpage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*writepage_io_failed_hook)(struct bio *bio, struct page *page, + u64 start, u64 end, + struct extent_state *state); + int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state); + int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate); + int (*set_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); + int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits); + int (*write_cache_pages_lock_hook)(struct page *page); +}; + +struct extent_io_tree { + struct rb_root state; + struct rb_root buffer; + struct address_space *mapping; + u64 dirty_bytes; + spinlock_t lock; + spinlock_t buffer_lock; + struct extent_io_ops *ops; +}; + +struct extent_state { + u64 start; + u64 end; /* inclusive */ + struct rb_node rb_node; + struct extent_io_tree *tree; + wait_queue_head_t wq; + atomic_t refs; + unsigned long state; + + /* for use by the FS */ + u64 private; + + struct list_head leak_list; +}; + +struct extent_buffer { + u64 start; + unsigned long len; + char 
*map_token; + char *kaddr; + unsigned long map_start; + unsigned long map_len; + struct page *first_page; + atomic_t refs; + int flags; + struct list_head leak_list; + struct rb_node rb_node; + struct mutex mutex; +}; + +struct extent_map_tree; + +static inline struct extent_state *extent_state_next(struct extent_state *state) +{ + struct rb_node *node; + node = rb_next(&state->rb_node); + if (!node) + return NULL; + return rb_entry(node, struct extent_state, rb_node); +} + +typedef struct extent_map *(get_extent_t)(struct inode *inode, + struct page *page, + size_t page_offset, + u64 start, u64 len, + int create); + +void extent_io_tree_init(struct extent_io_tree *tree, + struct address_space *mapping, gfp_t mask); +int try_release_extent_mapping(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page); +int try_release_extent_state(struct extent_map_tree *map, + struct extent_io_tree *tree, struct page *page, + gfp_t mask); +int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, gfp_t mask); +int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int extent_read_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent); +int __init extent_io_init(void); +void extent_io_exit(void); + +u64 count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, unsigned long bits); + +int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int filled); +int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + int bits, int wake, int delete, gfp_t mask); +int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + int bits, gfp_t mask); +int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start, + u64 end, gfp_t mask); +int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end, + gfp_t mask); +int find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, int bits); +struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree, + u64 start, int bits); +int extent_invalidatepage(struct extent_io_tree *tree, + struct page *page, unsigned long offset); +int extent_write_full_page(struct extent_io_tree *tree, struct page *page, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_write_locked_range(struct extent_io_tree *tree, struct inode *inode, + u64 start, u64 end, get_extent_t *get_extent, + int mode); +int extent_writepages(struct extent_io_tree *tree, + struct address_space *mapping, + get_extent_t *get_extent, + struct writeback_control *wbc); +int extent_readpages(struct extent_io_tree *tree, + struct address_space *mapping, + struct list_head 
*pages, unsigned nr_pages, + get_extent_t get_extent); +int extent_prepare_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to, get_extent_t *get_extent); +int extent_commit_write(struct extent_io_tree *tree, + struct inode *inode, struct page *page, + unsigned from, unsigned to); +sector_t extent_bmap(struct address_space *mapping, sector_t iblock, + get_extent_t *get_extent); +int set_range_dirty(struct extent_io_tree *tree, u64 start, u64 end); +int set_state_private(struct extent_io_tree *tree, u64 start, u64 private); +int get_state_private(struct extent_io_tree *tree, u64 start, u64 *private); +void set_page_extent_mapped(struct page *page); + +struct extent_buffer *alloc_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + struct page *page0, + gfp_t mask); +struct extent_buffer *find_extent_buffer(struct extent_io_tree *tree, + u64 start, unsigned long len, + gfp_t mask); +void free_extent_buffer(struct extent_buffer *eb); +int read_extent_buffer_pages(struct extent_io_tree *tree, + struct extent_buffer *eb, u64 start, int wait, + get_extent_t *get_extent, int mirror_num); + +static inline void extent_buffer_get(struct extent_buffer *eb) +{ + atomic_inc(&eb->refs); +} + +int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, + unsigned long start, + unsigned long len); +void read_extent_buffer(struct extent_buffer *eb, void *dst, + unsigned long start, + unsigned long len); +void write_extent_buffer(struct extent_buffer *eb, const void *src, + unsigned long start, unsigned long len); +void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, + unsigned long dst_offset, unsigned long src_offset, + unsigned long len); +void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, + unsigned long src_offset, unsigned long len); +void memset_extent_buffer(struct extent_buffer *eb, char c, + unsigned long start, unsigned long len); +int wait_on_extent_buffer_writeback(struct extent_io_tree *tree, + struct extent_buffer *eb); +int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end); +int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits); +int clear_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_dirty(struct extent_io_tree *tree, + struct extent_buffer *eb); +int set_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int clear_extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int extent_buffer_uptodate(struct extent_io_tree *tree, + struct extent_buffer *eb); +int map_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +int map_private_extent_buffer(struct extent_buffer *eb, unsigned long offset, + unsigned long min_len, char **token, char **map, + unsigned long *map_start, + unsigned long *map_len, int km); +void unmap_extent_buffer(struct extent_buffer *eb, char *token, int km); +int release_extent_buffer_tail_pages(struct extent_buffer *eb); +int extent_range_uptodate(struct extent_io_tree *tree, + u64 start, u64 end); +int extent_clear_unlock_delalloc(struct inode *inode, + struct extent_io_tree *tree, + u64 start, u64 end, struct page *locked_page, 
+ int unlock_page, + int clear_unlock, + int clear_delalloc, int clear_dirty, + int set_writeback, + int end_writeback); +#endif diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c new file mode 100644 index 00000000000..4a83e33ada3 --- /dev/null +++ b/fs/btrfs/extent_map.c @@ -0,0 +1,351 @@ +#include <linux/err.h> +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/module.h> +#include <linux/spinlock.h> +#include <linux/version.h> +#include <linux/hardirq.h> +#include "extent_map.h" + +/* temporary define until extent_map moves out of btrfs */ +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *, struct kmem_cache *, + unsigned long)); + +static struct kmem_cache *extent_map_cache; + +int __init extent_map_init(void) +{ + extent_map_cache = btrfs_cache_create("extent_map", + sizeof(struct extent_map), 0, + NULL); + if (!extent_map_cache) + return -ENOMEM; + return 0; +} + +void extent_map_exit(void) +{ + if (extent_map_cache) + kmem_cache_destroy(extent_map_cache); +} + +/** + * extent_map_tree_init - initialize extent map tree + * @tree: tree to initialize + * @mask: flags for memory allocations during tree operations + * + * Initialize the extent tree @tree. Should be called for each new inode + * or other user of the extent_map interface. + */ +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask) +{ + tree->map.rb_node = NULL; + spin_lock_init(&tree->lock); +} +EXPORT_SYMBOL(extent_map_tree_init); + +/** + * alloc_extent_map - allocate new extent map structure + * @mask: memory allocation flags + * + * Allocate a new extent_map structure. The new structure is + * returned with a reference count of one and needs to be + * freed using free_extent_map() + */ +struct extent_map *alloc_extent_map(gfp_t mask) +{ + struct extent_map *em; + em = kmem_cache_alloc(extent_map_cache, mask); + if (!em || IS_ERR(em)) + return em; + em->in_tree = 0; + em->flags = 0; + atomic_set(&em->refs, 1); + return em; +} +EXPORT_SYMBOL(alloc_extent_map); + +/** + * free_extent_map - drop reference count of an extent_map + * @em: extent map beeing releasead + * + * Drops the reference out on @em by one and free the structure + * if the reference count hits zero. + */ +void free_extent_map(struct extent_map *em) +{ + if (!em) + return; + WARN_ON(atomic_read(&em->refs) == 0); + if (atomic_dec_and_test(&em->refs)) { + WARN_ON(em->in_tree); + kmem_cache_free(extent_map_cache, em); + } +} +EXPORT_SYMBOL(free_extent_map); + +static struct rb_node *tree_insert(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct extent_map *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct extent_map, rb_node); + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + p = &(*p)->rb_left; + else if (offset >= extent_map_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct extent_map, rb_node); + entry->in_tree = 1; + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * search through the tree for an extent_map with a given offset. 
If + * it can't be found, try to find some neighboring extents + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_ret, + struct rb_node **next_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *orig_prev = NULL; + struct extent_map *entry; + struct extent_map *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct extent_map, rb_node); + prev = n; + prev_entry = entry; + + WARN_ON(!entry->in_tree); + + if (offset < entry->start) + n = n->rb_left; + else if (offset >= extent_map_end(entry)) + n = n->rb_right; + else + return n; + } + + if (prev_ret) { + orig_prev = prev; + while (prev && offset >= extent_map_end(prev_entry)) { + prev = rb_next(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *prev_ret = prev; + prev = orig_prev; + } + + if (next_ret) { + prev_entry = rb_entry(prev, struct extent_map, rb_node); + while (prev && offset < prev_entry->start) { + prev = rb_prev(prev); + prev_entry = rb_entry(prev, struct extent_map, rb_node); + } + *next_ret = prev; + } + return NULL; +} + +/* + * look for an offset in the tree, and if it can't be found, return + * the first offset we can find smaller than 'offset'. + */ +static inline struct rb_node *tree_search(struct rb_root *root, u64 offset) +{ + struct rb_node *prev; + struct rb_node *ret; + ret = __tree_search(root, offset, &prev, NULL); + if (!ret) + return prev; + return ret; +} + +/* check to see if two extent_map structs are adjacent and safe to merge */ +static int mergable_maps(struct extent_map *prev, struct extent_map *next) +{ + if (test_bit(EXTENT_FLAG_PINNED, &prev->flags)) + return 0; + + /* + * don't merge compressed extents, we need to know their + * actual size + */ + if (test_bit(EXTENT_FLAG_COMPRESSED, &prev->flags)) + return 0; + + if (extent_map_end(prev) == next->start && + prev->flags == next->flags && + prev->bdev == next->bdev && + ((next->block_start == EXTENT_MAP_HOLE && + prev->block_start == EXTENT_MAP_HOLE) || + (next->block_start == EXTENT_MAP_INLINE && + prev->block_start == EXTENT_MAP_INLINE) || + (next->block_start == EXTENT_MAP_DELALLOC && + prev->block_start == EXTENT_MAP_DELALLOC) || + (next->block_start < EXTENT_MAP_LAST_BYTE - 1 && + next->block_start == extent_map_block_end(prev)))) { + return 1; + } + return 0; +} + +/** + * add_extent_mapping - add new extent map to the extent tree + * @tree: tree to insert new map in + * @em: map to insert + * + * Insert @em into @tree or perform a simple forward/backward merge with + * existing mappings. The extent_map struct passed in will be inserted + * into the tree directly, with an additional reference taken, or a + * reference dropped if the merge attempt was sucessfull. 
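+ *
+ * Editorial usage sketch (not part of the original patch; names are the
+ * ones used in this file): callers must hold tree->lock, e.g.
+ *
+ *     spin_lock(&tree->lock);
+ *     ret = add_extent_mapping(tree, em);
+ *     spin_unlock(&tree->lock);
+ *
+ * A return of -EEXIST means an overlapping mapping already exists and the
+ * caller still owns its original reference on 'em'.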
+ */ +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em) +{ + int ret = 0; + struct extent_map *merge = NULL; + struct rb_node *rb; + struct extent_map *exist; + + exist = lookup_extent_mapping(tree, em->start, em->len); + if (exist) { + free_extent_map(exist); + ret = -EEXIST; + goto out; + } + assert_spin_locked(&tree->lock); + rb = tree_insert(&tree->map, em->start, &em->rb_node); + if (rb) { + ret = -EEXIST; + free_extent_map(merge); + goto out; + } + atomic_inc(&em->refs); + if (em->start != 0) { + rb = rb_prev(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(merge, em)) { + em->start = merge->start; + em->len += merge->len; + em->block_len += merge->block_len; + em->block_start = merge->block_start; + merge->in_tree = 0; + rb_erase(&merge->rb_node, &tree->map); + free_extent_map(merge); + } + } + rb = rb_next(&em->rb_node); + if (rb) + merge = rb_entry(rb, struct extent_map, rb_node); + if (rb && mergable_maps(em, merge)) { + em->len += merge->len; + em->block_len += merge->len; + rb_erase(&merge->rb_node, &tree->map); + merge->in_tree = 0; + free_extent_map(merge); + } +out: + return ret; +} +EXPORT_SYMBOL(add_extent_mapping); + +/* simple helper to do math around the end of an extent, handling wrap */ +static u64 range_end(u64 start, u64 len) +{ + if (start + len < start) + return (u64)-1; + return start + len; +} + +/** + * lookup_extent_mapping - lookup extent_map + * @tree: tree to lookup in + * @start: byte offset to start the search + * @len: length of the lookup range + * + * Find and return the first extent_map struct in @tree that intersects the + * [start, len] range. There may be additional objects in the tree that + * intersect, so check the object returned carefully to make sure that no + * additional lookups are needed. + */ +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) +{ + struct extent_map *em; + struct rb_node *rb_node; + struct rb_node *prev = NULL; + struct rb_node *next = NULL; + u64 end = range_end(start, len); + + assert_spin_locked(&tree->lock); + rb_node = __tree_search(&tree->map, start, &prev, &next); + if (!rb_node && prev) { + em = rb_entry(prev, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node && next) { + em = rb_entry(next, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + } + if (!rb_node) { + em = NULL; + goto out; + } + if (IS_ERR(rb_node)) { + em = ERR_PTR(PTR_ERR(rb_node)); + goto out; + } + em = rb_entry(rb_node, struct extent_map, rb_node); + if (end > em->start && start < extent_map_end(em)) + goto found; + + em = NULL; + goto out; + +found: + atomic_inc(&em->refs); +out: + return em; +} +EXPORT_SYMBOL(lookup_extent_mapping); + +/** + * remove_extent_mapping - removes an extent_map from the extent tree + * @tree: extent tree to remove from + * @em: extent map beeing removed + * + * Removes @em from @tree. 
No reference counts are dropped, and no checks + * are done to see if the range is in use + */ +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em) +{ + int ret = 0; + + WARN_ON(test_bit(EXTENT_FLAG_PINNED, &em->flags)); + assert_spin_locked(&tree->lock); + rb_erase(&em->rb_node, &tree->map); + em->in_tree = 0; + return ret; +} +EXPORT_SYMBOL(remove_extent_mapping); diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h new file mode 100644 index 00000000000..fb6eeef06bb --- /dev/null +++ b/fs/btrfs/extent_map.h @@ -0,0 +1,62 @@ +#ifndef __EXTENTMAP__ +#define __EXTENTMAP__ + +#include <linux/rbtree.h> + +#define EXTENT_MAP_LAST_BYTE (u64)-4 +#define EXTENT_MAP_HOLE (u64)-3 +#define EXTENT_MAP_INLINE (u64)-2 +#define EXTENT_MAP_DELALLOC (u64)-1 + +/* bits for the flags field */ +#define EXTENT_FLAG_PINNED 0 /* this entry not yet on disk, don't free it */ +#define EXTENT_FLAG_COMPRESSED 1 +#define EXTENT_FLAG_VACANCY 2 /* no file extent item found */ +#define EXTENT_FLAG_PREALLOC 3 /* pre-allocated extent */ + +struct extent_map { + struct rb_node rb_node; + + /* all of these are in bytes */ + u64 start; + u64 len; + u64 orig_start; + u64 block_start; + u64 block_len; + unsigned long flags; + struct block_device *bdev; + atomic_t refs; + int in_tree; +}; + +struct extent_map_tree { + struct rb_root map; + spinlock_t lock; +}; + +static inline u64 extent_map_end(struct extent_map *em) +{ + if (em->start + em->len < em->start) + return (u64)-1; + return em->start + em->len; +} + +static inline u64 extent_map_block_end(struct extent_map *em) +{ + if (em->block_start + em->block_len < em->block_start) + return (u64)-1; + return em->block_start + em->block_len; +} + +void extent_map_tree_init(struct extent_map_tree *tree, gfp_t mask); +struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +int add_extent_mapping(struct extent_map_tree *tree, + struct extent_map *em); +int remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em); + +struct extent_map *alloc_extent_map(gfp_t mask); +void free_extent_map(struct extent_map *em); +int __init extent_map_init(void); +void extent_map_exit(void); +#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c new file mode 100644 index 00000000000..964652435fd --- /dev/null +++ b/fs/btrfs/file-item.c @@ -0,0 +1,831 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/bio.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" + +#define MAX_CSUM_ITEMS(r, size) ((((BTRFS_LEAF_DATA_SIZE(r) - \ + sizeof(struct btrfs_item) * 2) / \ + size) - 1)) + +#define MAX_ORDERED_SUM_BYTES(r) ((PAGE_SIZE - \ + sizeof(struct btrfs_ordered_sum)) / \ + sizeof(struct btrfs_sector_sum) * \ + (r)->sectorsize - (r)->sectorsize) + +int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 objectid, u64 pos, + u64 disk_offset, u64 disk_num_bytes, + u64 num_bytes, u64 offset, u64 ram_bytes, + u8 compression, u8 encryption, u16 other_encoding) +{ + int ret = 0; + struct btrfs_file_extent_item *item; + struct btrfs_key file_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + file_key.objectid = objectid; + file_key.offset = pos; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + sizeof(*item)); + if (ret < 0) + goto out; + BUG_ON(ret); + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_disk_bytenr(leaf, item, disk_offset); + btrfs_set_file_extent_disk_num_bytes(leaf, item, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, item, offset); + btrfs_set_file_extent_num_bytes(leaf, item, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, item, ram_bytes); + btrfs_set_file_extent_generation(leaf, item, trans->transid); + btrfs_set_file_extent_type(leaf, item, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_compression(leaf, item, compression); + btrfs_set_file_extent_encryption(leaf, item, encryption); + btrfs_set_file_extent_other_encoding(leaf, item, other_encoding); + + btrfs_mark_buffer_dirty(leaf); +out: + btrfs_free_path(path); + return ret; +} + +struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 bytenr, int cow) +{ + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + struct btrfs_csum_item *item; + struct extent_buffer *leaf; + u64 csum_offset = 0; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int csums_in_item; + + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); + if (ret < 0) + goto fail; + leaf = path->nodes[0]; + if (ret > 0) { + ret = 1; + if (path->slots[0] == 0) + goto fail; + path->slots[0]--; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + goto fail; + + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + csums_in_item = btrfs_item_size_nr(leaf, path->slots[0]); + csums_in_item /= csum_size; + + if (csum_offset >= csums_in_item) { + ret = -EFBIG; + goto fail; + } + } + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); + return item; +fail: + if (ret > 0) + ret = -ENOENT; + return ERR_PTR(ret); +} + + +int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid, + u64 offset, int mod) +{ + int ret; + struct btrfs_key file_key; + int ins_len = mod < 0 ? 
-1 : 0; + int cow = mod != 0; + + file_key.objectid = objectid; + file_key.offset = offset; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); + return ret; +} + + +int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u32 *dst) +{ + u32 sum; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + u64 offset; + u64 item_start_offset = 0; + u64 item_last_offset = 0; + u64 disk_bytenr; + u32 diff; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int ret; + struct btrfs_path *path; + struct btrfs_csum_item *item = NULL; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + + path = btrfs_alloc_path(); + if (bio->bi_size > PAGE_CACHE_SIZE * 8) + path->reada = 2; + + WARN_ON(bio->bi_vcnt <= 0); + + disk_bytenr = (u64)bio->bi_sector << 9; + while (bio_index < bio->bi_vcnt) { + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + ret = btrfs_find_ordered_sum(inode, offset, disk_bytenr, &sum); + if (ret == 0) + goto found; + + if (!item || disk_bytenr < item_start_offset || + disk_bytenr >= item_last_offset) { + struct btrfs_key found_key; + u32 item_size; + + if (item) + btrfs_release_path(root, path); + item = btrfs_lookup_csum(NULL, root->fs_info->csum_root, + path, disk_bytenr, 0); + if (IS_ERR(item)) { + ret = PTR_ERR(item); + if (ret == -ENOENT || ret == -EFBIG) + ret = 0; + sum = 0; + if (BTRFS_I(inode)->root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + set_extent_bits(io_tree, offset, + offset + bvec->bv_len - 1, + EXTENT_NODATASUM, GFP_NOFS); + } else { + printk(KERN_INFO "btrfs no csum found " + "for inode %lu start %llu\n", + inode->i_ino, + (unsigned long long)offset); + } + item = NULL; + btrfs_release_path(root, path); + goto found; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + item_start_offset = found_key.offset; + item_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + item_last_offset = item_start_offset + + (item_size / csum_size) * + root->sectorsize; + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + } + /* + * this byte range must be able to fit inside + * a single leaf so it will also fit inside a u32 + */ + diff = disk_bytenr - item_start_offset; + diff = diff / root->sectorsize; + diff = diff * csum_size; + + read_extent_buffer(path->nodes[0], &sum, + ((unsigned long)item) + diff, + csum_size); +found: + if (dst) + *dst++ = sum; + else + set_state_private(io_tree, offset, sum); + disk_bytenr += bvec->bv_len; + bio_index++; + bvec++; + } + btrfs_free_path(path); + return 0; +} + +int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, + struct list_head *list) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_csum_item *item; + unsigned long offset; + int ret; + size_t size; + u64 csum_end; + u16 csum_size = btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = start; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto fail; + if (ret > 0 && path->slots[0] > 0) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1); + if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID && + key.type == 
BTRFS_EXTENT_CSUM_KEY) { + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + if (offset * csum_size < + btrfs_item_size_nr(leaf, path->slots[0] - 1)) + path->slots[0]--; + } + } + + while (start <= end) { + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto fail; + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) + break; + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.offset > end) + break; + + if (key.offset > start) + start = key.offset; + + size = btrfs_item_size_nr(leaf, path->slots[0]); + csum_end = key.offset + (size / csum_size) * root->sectorsize; + if (csum_end <= start) { + path->slots[0]++; + continue; + } + + csum_end = min(csum_end, end + 1); + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_csum_item); + while (start < csum_end) { + size = min_t(size_t, csum_end - start, + MAX_ORDERED_SUM_BYTES(root)); + sums = kzalloc(btrfs_ordered_sum_size(root, size), + GFP_NOFS); + BUG_ON(!sums); + + sector_sum = sums->sums; + sums->bytenr = start; + sums->len = size; + + offset = (start - key.offset) >> + root->fs_info->sb->s_blocksize_bits; + offset *= csum_size; + + while (size > 0) { + read_extent_buffer(path->nodes[0], + §or_sum->sum, + ((unsigned long)item) + + offset, csum_size); + sector_sum->bytenr = start; + + size -= root->sectorsize; + start += root->sectorsize; + offset += csum_size; + sector_sum++; + } + list_add_tail(&sums->list, list); + } + path->slots[0]++; + } + ret = 0; +fail: + btrfs_free_path(path); + return ret; +} + +int btrfs_csum_one_bio(struct btrfs_root *root, struct inode *inode, + struct bio *bio, u64 file_start, int contig) +{ + struct btrfs_ordered_sum *sums; + struct btrfs_sector_sum *sector_sum; + struct btrfs_ordered_extent *ordered; + char *data; + struct bio_vec *bvec = bio->bi_io_vec; + int bio_index = 0; + unsigned long total_bytes = 0; + unsigned long this_sum_bytes = 0; + u64 offset; + u64 disk_bytenr; + + WARN_ON(bio->bi_vcnt <= 0); + sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS); + if (!sums) + return -ENOMEM; + + sector_sum = sums->sums; + disk_bytenr = (u64)bio->bi_sector << 9; + sums->len = bio->bi_size; + INIT_LIST_HEAD(&sums->list); + + if (contig) + offset = file_start; + else + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + + while (bio_index < bio->bi_vcnt) { + if (!contig) + offset = page_offset(bvec->bv_page) + bvec->bv_offset; + + if (!contig && (offset >= ordered->file_offset + ordered->len || + offset < ordered->file_offset)) { + unsigned long bytes_left; + sums->len = this_sum_bytes; + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + + bytes_left = bio->bi_size - total_bytes; + + sums = kzalloc(btrfs_ordered_sum_size(root, bytes_left), + GFP_NOFS); + BUG_ON(!sums); + sector_sum = sums->sums; + sums->len = bytes_left; + ordered = btrfs_lookup_ordered_extent(inode, offset); + BUG_ON(!ordered); + sums->bytenr = ordered->start; + } + + data = kmap_atomic(bvec->bv_page, KM_USER0); + sector_sum->sum = ~(u32)0; + sector_sum->sum = btrfs_csum_data(root, + data + bvec->bv_offset, + sector_sum->sum, + bvec->bv_len); + kunmap_atomic(data, KM_USER0); + 
btrfs_csum_final(sector_sum->sum, + (char *)§or_sum->sum); + sector_sum->bytenr = disk_bytenr; + + sector_sum++; + bio_index++; + total_bytes += bvec->bv_len; + this_sum_bytes += bvec->bv_len; + disk_bytenr += bvec->bv_len; + offset += bvec->bv_len; + bvec++; + } + this_sum_bytes = 0; + btrfs_add_ordered_sum(inode, ordered, sums); + btrfs_put_ordered_extent(ordered); + return 0; +} + +/* + * helper function for csum removal, this expects the + * key to describe the csum pointed to by the path, and it expects + * the csum to overlap the range [bytenr, len] + * + * The csum should not be entirely contained in the range and the + * range should not be entirely contained in the csum. + * + * This calls btrfs_truncate_item with the correct args based on the + * overlap, and fixes up the key as required. + */ +static noinline int truncate_one_csum(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct btrfs_key *key, + u64 bytenr, u64 len) +{ + struct extent_buffer *leaf; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + u64 csum_end; + u64 end_byte = bytenr + len; + u32 blocksize_bits = root->fs_info->sb->s_blocksize_bits; + int ret; + + leaf = path->nodes[0]; + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= root->fs_info->sb->s_blocksize_bits; + csum_end += key->offset; + + if (key->offset < bytenr && csum_end <= end_byte) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * A simple truncate off the end of the item + */ + u32 new_size = (bytenr - key->offset) >> blocksize_bits; + new_size *= csum_size; + ret = btrfs_truncate_item(trans, root, path, new_size, 1); + BUG_ON(ret); + } else if (key->offset >= bytenr && csum_end > end_byte && + end_byte > key->offset) { + /* + * [ bytenr - len ] + * [ ] + * [csum ] + * we need to truncate from the beginning of the csum + */ + u32 new_size = (csum_end - end_byte) >> blocksize_bits; + new_size *= csum_size; + + ret = btrfs_truncate_item(trans, root, path, new_size, 0); + BUG_ON(ret); + + key->offset = end_byte; + ret = btrfs_set_item_key_safe(trans, root, path, key); + BUG_ON(ret); + } else { + BUG(); + } + return 0; +} + +/* + * deletes the csum items from the csum tree for a given + * range of bytes. 
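+ *
+ * Editorial note (not part of the original patch): the expected caller is
+ * the data-extent freeing path, roughly
+ *
+ *     ret = btrfs_del_csums(trans, root, disk_bytenr, num_bytes);
+ *
+ * where disk_bytenr/num_bytes describe the extent being freed.  Any root
+ * may be passed; the csum root is looked up from root->fs_info below.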
+ */ +int btrfs_del_csums(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 bytenr, u64 len) +{ + struct btrfs_path *path; + struct btrfs_key key; + u64 end_byte = bytenr + len; + u64 csum_end; + struct extent_buffer *leaf; + int ret; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + int blocksize_bits = root->fs_info->sb->s_blocksize_bits; + + root = root->fs_info->csum_root; + + path = btrfs_alloc_path(); + + while (1) { + key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + key.offset = end_byte - 1; + key.type = BTRFS_EXTENT_CSUM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + + if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + key.type != BTRFS_EXTENT_CSUM_KEY) { + break; + } + + if (key.offset >= end_byte) + break; + + csum_end = btrfs_item_size_nr(leaf, path->slots[0]) / csum_size; + csum_end <<= blocksize_bits; + csum_end += key.offset; + + /* this csum ends before we start, we're done */ + if (csum_end <= bytenr) + break; + + /* delete the entire item, it is inside our range */ + if (key.offset >= bytenr && csum_end <= end_byte) { + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + if (key.offset == bytenr) + break; + } else if (key.offset < bytenr && csum_end > end_byte) { + unsigned long offset; + unsigned long shift_len; + unsigned long item_offset; + /* + * [ bytenr - len ] + * [csum ] + * + * Our bytes are in the middle of the csum, + * we need to split this item and insert a new one. + * + * But we can't drop the path because the + * csum could change, get removed, extended etc. + * + * The trick here is the max size of a csum item leaves + * enough room in the tree block for a single + * item header. So, we split the item in place, + * adding a new header pointing to the existing + * bytes. Then we loop around again and we have + * a nicely formed csum item that we can neatly + * truncate. 
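+ *
+ * (Editorial note, not part of the original patch: the spare room relied
+ * on here comes from MAX_CSUM_ITEMS above, which sizes a full csum item
+ * smaller than the leaf data area.)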
+ */ + offset = (bytenr - key.offset) >> blocksize_bits; + offset *= csum_size; + + shift_len = (len >> blocksize_bits) * csum_size; + + item_offset = btrfs_item_ptr_offset(leaf, + path->slots[0]); + + memset_extent_buffer(leaf, 0, item_offset + offset, + shift_len); + key.offset = bytenr; + + /* + * btrfs_split_item returns -EAGAIN when the + * item changed size or key + */ + ret = btrfs_split_item(trans, root, path, &key, offset); + BUG_ON(ret && ret != -EAGAIN); + + key.offset = end_byte - 1; + } else { + ret = truncate_one_csum(trans, root, path, + &key, bytenr, len); + BUG_ON(ret); + if (key.offset < bytenr) + break; + } + btrfs_release_path(root, path); + } +out: + btrfs_free_path(path); + return 0; +} + +int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_ordered_sum *sums) +{ + u64 bytenr; + int ret; + struct btrfs_key file_key; + struct btrfs_key found_key; + u64 next_offset; + u64 total_bytes = 0; + int found_next; + struct btrfs_path *path; + struct btrfs_csum_item *item; + struct btrfs_csum_item *item_end; + struct extent_buffer *leaf = NULL; + u64 csum_offset; + struct btrfs_sector_sum *sector_sum; + u32 nritems; + u32 ins_size; + char *eb_map; + char *eb_token; + unsigned long map_len; + unsigned long map_start; + u16 csum_size = + btrfs_super_csum_size(&root->fs_info->super_copy); + + path = btrfs_alloc_path(); + BUG_ON(!path); + sector_sum = sums->sums; +again: + next_offset = (u64)-1; + found_next = 0; + file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; + file_key.offset = sector_sum->bytenr; + bytenr = sector_sum->bytenr; + btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + + item = btrfs_lookup_csum(trans, root, path, sector_sum->bytenr, 1); + if (!IS_ERR(item)) { + leaf = path->nodes[0]; + ret = 0; + goto found; + } + ret = PTR_ERR(item); + if (ret == -EFBIG) { + u32 item_size; + /* we found one, but it isn't big enough yet */ + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + if ((item_size / csum_size) >= + MAX_CSUM_ITEMS(root, csum_size)) { + /* already at max size, make a new one */ + goto insert; + } + } else { + int slot = path->slots[0] + 1; + /* we didn't find a csum item, insert one */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret == 1) + found_next = 1; + if (ret != 0) + goto insert; + slot = 0; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) { + found_next = 1; + goto insert; + } + next_offset = found_key.offset; + found_next = 1; + goto insert; + } + + /* + * at this point, we know the tree has an item, but it isn't big + * enough yet to put our csum in. 
Grow it + */ + btrfs_release_path(root, path); + ret = btrfs_search_slot(trans, root, &file_key, path, + csum_size, 1); + if (ret < 0) + goto fail_unlock; + + if (ret > 0) { + if (path->slots[0] == 0) + goto insert; + path->slots[0]--; + } + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + csum_offset = (bytenr - found_key.offset) >> + root->fs_info->sb->s_blocksize_bits; + + if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { + goto insert; + } + + if (csum_offset >= btrfs_item_size_nr(leaf, path->slots[0]) / + csum_size) { + u32 diff = (csum_offset + 1) * csum_size; + + /* + * is the item big enough already? we dropped our lock + * before and need to recheck + */ + if (diff < btrfs_item_size_nr(leaf, path->slots[0])) + goto csum; + + diff = diff - btrfs_item_size_nr(leaf, path->slots[0]); + if (diff != csum_size) + goto insert; + + ret = btrfs_extend_item(trans, root, path, diff); + BUG_ON(ret); + goto csum; + } + +insert: + btrfs_release_path(root, path); + csum_offset = 0; + if (found_next) { + u64 tmp = total_bytes + root->sectorsize; + u64 next_sector = sector_sum->bytenr; + struct btrfs_sector_sum *next = sector_sum + 1; + + while (tmp < sums->len) { + if (next_sector + root->sectorsize != next->bytenr) + break; + tmp += root->sectorsize; + next_sector = next->bytenr; + next++; + } + tmp = min(tmp, next_offset - file_key.offset); + tmp >>= root->fs_info->sb->s_blocksize_bits; + tmp = max((u64)1, tmp); + tmp = min(tmp, (u64)MAX_CSUM_ITEMS(root, csum_size)); + ins_size = csum_size * tmp; + } else { + ins_size = csum_size; + } + ret = btrfs_insert_empty_item(trans, root, path, &file_key, + ins_size); + if (ret < 0) + goto fail_unlock; + if (ret != 0) { + WARN_ON(1); + goto fail_unlock; + } +csum: + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + ret = 0; + item = (struct btrfs_csum_item *)((unsigned char *)item + + csum_offset * csum_size); +found: + item_end = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_csum_item); + item_end = (struct btrfs_csum_item *)((unsigned char *)item_end + + btrfs_item_size_nr(leaf, path->slots[0])); + eb_token = NULL; + cond_resched(); +next_sector: + + if (!eb_token || + (unsigned long)item + csum_size >= map_start + map_len) { + int err; + + if (eb_token) + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + err = map_private_extent_buffer(leaf, (unsigned long)item, + csum_size, + &eb_token, &eb_map, + &map_start, &map_len, KM_USER1); + if (err) + eb_token = NULL; + } + if (eb_token) { + memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)), + §or_sum->sum, csum_size); + } else { + write_extent_buffer(leaf, §or_sum->sum, + (unsigned long)item, csum_size); + } + + total_bytes += root->sectorsize; + sector_sum++; + if (total_bytes < sums->len) { + item = (struct btrfs_csum_item *)((char *)item + + csum_size); + if (item < item_end && bytenr + PAGE_CACHE_SIZE == + sector_sum->bytenr) { + bytenr = sector_sum->bytenr; + goto next_sector; + } + } + if (eb_token) { + unmap_extent_buffer(leaf, eb_token, KM_USER1); + eb_token = NULL; + } + btrfs_mark_buffer_dirty(path->nodes[0]); + cond_resched(); + if (total_bytes < sums->len) { + btrfs_release_path(root, path); + goto again; + } +out: + btrfs_free_path(path); + return ret; + +fail_unlock: + goto out; +} diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c new file mode 100644 index 
00000000000..90268334145 --- /dev/null +++ b/fs/btrfs/file.c @@ -0,0 +1,1288 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/version.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "tree-log.h" +#include "locking.h" +#include "compat.h" + + +/* simple helper to fault in pages and copy. This should go away + * and be replaced with calls into generic code. + */ +static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, + int write_bytes, + struct page **prepared_pages, + const char __user *buf) +{ + long page_fault = 0; + int i; + int offset = pos & (PAGE_CACHE_SIZE - 1); + + for (i = 0; i < num_pages && write_bytes > 0; i++, offset = 0) { + size_t count = min_t(size_t, + PAGE_CACHE_SIZE - offset, write_bytes); + struct page *page = prepared_pages[i]; + fault_in_pages_readable(buf, count); + + /* Copy data from userspace to the current page */ + kmap(page); + page_fault = __copy_from_user(page_address(page) + offset, + buf, count); + /* Flush processor's dcache for this page */ + flush_dcache_page(page); + kunmap(page); + buf += count; + write_bytes -= count; + + if (page_fault) + break; + } + return page_fault ? -EFAULT : 0; +} + +/* + * unlocks pages after btrfs_file_write is done with them + */ +static noinline void btrfs_drop_pages(struct page **pages, size_t num_pages) +{ + size_t i; + for (i = 0; i < num_pages; i++) { + if (!pages[i]) + break; + /* page checked is some magic around finding pages that + * have been modified without going through btrfs_set_page_dirty + * clear it here + */ + ClearPageChecked(pages[i]); + unlock_page(pages[i]); + mark_page_accessed(pages[i]); + page_cache_release(pages[i]); + } +} + +/* + * after copy_from_user, pages need to be dirtied and we need to make + * sure holes are created between the current EOF and the start of + * any next extents (if required). + * + * this also makes the decision about creating an inline extent vs + * doing real data extents, marking pages dirty and delalloc as required. 
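+ *
+ * Editorial note (not part of the original patch): in the write path in
+ * this file, btrfs_file_write() locks the pages with prepare_pages(),
+ * copies the user data in with btrfs_copy_from_user(), and then calls this
+ * helper to set the delalloc bits and, when the write extends the file,
+ * update i_size.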
+ */ +static noinline int dirty_and_release_pages(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct file *file, + struct page **pages, + size_t num_pages, + loff_t pos, + size_t write_bytes) +{ + int err = 0; + int i; + struct inode *inode = fdentry(file)->d_inode; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 hint_byte; + u64 num_bytes; + u64 start_pos; + u64 end_of_last_block; + u64 end_pos = pos + write_bytes; + loff_t isize = i_size_read(inode); + + start_pos = pos & ~((u64)root->sectorsize - 1); + num_bytes = (write_bytes + pos - start_pos + + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + end_of_last_block = start_pos + num_bytes - 1; + + lock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + trans = btrfs_join_transaction(root, 1); + if (!trans) { + err = -ENOMEM; + goto out_unlock; + } + btrfs_set_trans_block_group(trans, inode); + hint_byte = 0; + + set_extent_uptodate(io_tree, start_pos, end_of_last_block, GFP_NOFS); + + /* check for reserved extents on each page, we don't want + * to reset the delalloc bit on things that already have + * extents reserved. + */ + btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block); + for (i = 0; i < num_pages; i++) { + struct page *p = pages[i]; + SetPageUptodate(p); + ClearPageChecked(p); + set_page_dirty(p); + } + if (end_pos > isize) { + i_size_write(inode, end_pos); + btrfs_update_inode(trans, root, inode); + } + err = btrfs_end_transaction(trans, root); +out_unlock: + unlock_extent(io_tree, start_pos, end_of_last_block, GFP_NOFS); + return err; +} + +/* + * this drops all the extents in the cache that intersect the range + * [start, end]. Existing extents are split as required. + */ +int btrfs_drop_extent_cache(struct inode *inode, u64 start, u64 end, + int skip_pinned) +{ + struct extent_map *em; + struct extent_map *split = NULL; + struct extent_map *split2 = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + u64 len = end - start + 1; + int ret; + int testend = 1; + unsigned long flags; + int compressed = 0; + + WARN_ON(end < start); + if (end == (u64)-1) { + len = (u64)-1; + testend = 0; + } + while (1) { + if (!split) + split = alloc_extent_map(GFP_NOFS); + if (!split2) + split2 = alloc_extent_map(GFP_NOFS); + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (!em) { + spin_unlock(&em_tree->lock); + break; + } + flags = em->flags; + if (skip_pinned && test_bit(EXTENT_FLAG_PINNED, &em->flags)) { + spin_unlock(&em_tree->lock); + if (em->start <= start && + (!testend || em->start + em->len >= start + len)) { + free_extent_map(em); + break; + } + if (start < em->start) { + len = em->start - start; + } else { + len = start + len - (em->start + em->len); + start = em->start + em->len; + } + free_extent_map(em); + continue; + } + compressed = test_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + clear_bit(EXTENT_FLAG_PINNED, &em->flags); + remove_extent_mapping(em_tree, em); + + if (em->block_start < EXTENT_MAP_LAST_BYTE && + em->start < start) { + split->start = em->start; + split->len = start - em->start; + split->orig_start = em->orig_start; + split->block_start = em->block_start; + + if (compressed) + split->block_len = em->block_len; + else + split->block_len = split->len; + + split->bdev = em->bdev; + split->flags = flags; + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = split2; + split2 = NULL; + } + if (em->block_start < EXTENT_MAP_LAST_BYTE && + testend && em->start + 
em->len > start + len) { + u64 diff = start + len - em->start; + + split->start = start + len; + split->len = em->start + em->len - (start + len); + split->bdev = em->bdev; + split->flags = flags; + + if (compressed) { + split->block_len = em->block_len; + split->block_start = em->block_start; + split->orig_start = em->orig_start; + } else { + split->block_len = split->len; + split->block_start = em->block_start + diff; + split->orig_start = split->start; + } + + ret = add_extent_mapping(em_tree, split); + BUG_ON(ret); + free_extent_map(split); + split = NULL; + } + spin_unlock(&em_tree->lock); + + /* once for us */ + free_extent_map(em); + /* once for the tree*/ + free_extent_map(em); + } + if (split) + free_extent_map(split); + if (split2) + free_extent_map(split2); + return 0; +} + +int btrfs_check_file(struct btrfs_root *root, struct inode *inode) +{ + return 0; +#if 0 + struct btrfs_path *path; + struct btrfs_key found_key; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + u64 last_offset = 0; + int nritems; + int slot; + int found_type; + int ret; + int err = 0; + u64 extent_end = 0; + + path = btrfs_alloc_path(); + ret = btrfs_lookup_file_extent(NULL, root, path, inode->i_ino, + last_offset, 0); + while (1) { + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + nritems = btrfs_header_nritems(path->nodes[0]); + } + slot = path->slots[0]; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid != inode->i_ino) + break; + if (found_key.type != BTRFS_EXTENT_DATA_KEY) + goto out; + + if (found_key.offset < last_offset) { + WARN_ON(1); + btrfs_print_leaf(root, leaf); + printk(KERN_ERR "inode %lu found offset %llu " + "expected %llu\n", inode->i_ino, + (unsigned long long)found_key.offset, + (unsigned long long)last_offset); + err = 1; + goto out; + } + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + if (found_type == BTRFS_FILE_EXTENT_REG) { + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, extent); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + struct btrfs_item *item; + item = btrfs_item_nr(leaf, slot); + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, extent); + extent_end = (extent_end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + last_offset = extent_end; + path->slots[0]++; + } + if (0 && last_offset < inode->i_size) { + WARN_ON(1); + btrfs_print_leaf(root, leaf); + printk(KERN_ERR "inode %lu found offset %llu size %llu\n", + inode->i_ino, (unsigned long long)last_offset, + (unsigned long long)inode->i_size); + err = 1; + + } +out: + btrfs_free_path(path); + return err; +#endif +} + +/* + * this is very complex, but the basic idea is to drop all extents + * in the range start - end. hint_block is filled in with a block number + * that would be a good hint to the block allocator for this file. + * + * If an extent intersects the range but is not entirely inside the range + * it is either truncated or split. Anything entirely inside the range + * is deleted from the tree. + * + * inline_limit is used to tell this code which offsets in the file to keep + * if they contain inline extents. 
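+ *
+ * Editorial usage sketch (not part of the original patch; variable names
+ * are illustrative): callers typically clear the range before writing a
+ * replacement extent,
+ *
+ *     ret = btrfs_drop_extents(trans, root, inode, start, end,
+ *                              inline_limit, &hint_byte);
+ *
+ * and then feed hint_byte back to the block allocator as a starting point.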
+ */ +noinline int btrfs_drop_extents(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, u64 end, u64 inline_limit, u64 *hint_byte) +{ + u64 extent_end = 0; + u64 locked_end = end; + u64 search_start = start; + u64 leaf_start; + u64 ram_bytes = 0; + u64 orig_parent = 0; + u64 disk_bytenr = 0; + u8 compression; + u8 encryption; + u16 other_encoding = 0; + u64 root_gen; + u64 root_owner; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *extent; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_file_extent_item old; + int keep; + int slot; + int bookend; + int found_type = 0; + int found_extent; + int found_inline; + int recow; + int ret; + + inline_limit = 0; + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + while (1) { + recow = 0; + btrfs_release_path(root, path); + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + search_start, -1); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) { + ret = 0; + goto out; + } + path->slots[0]--; + } +next_slot: + keep = 0; + bookend = 0; + found_extent = 0; + found_inline = 0; + leaf_start = 0; + root_gen = 0; + root_owner = 0; + compression = 0; + encryption = 0; + extent = NULL; + leaf = path->nodes[0]; + slot = path->slots[0]; + ret = 0; + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY && + key.offset >= end) { + goto out; + } + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != inode->i_ino) { + goto out; + } + if (recow) { + search_start = max(key.offset, start); + continue; + } + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(leaf, extent); + compression = btrfs_file_extent_compression(leaf, + extent); + encryption = btrfs_file_extent_encryption(leaf, + extent); + other_encoding = btrfs_file_extent_other_encoding(leaf, + extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = + btrfs_file_extent_disk_bytenr(leaf, + extent); + if (extent_end) + *hint_byte = extent_end; + + extent_end = key.offset + + btrfs_file_extent_num_bytes(leaf, extent); + ram_bytes = btrfs_file_extent_ram_bytes(leaf, + extent); + found_extent = 1; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + found_inline = 1; + extent_end = key.offset + + btrfs_file_extent_inline_len(leaf, extent); + } + } else { + extent_end = search_start; + } + + /* we found nothing we can drop */ + if ((!found_extent && !found_inline) || + search_start >= extent_end) { + int nextret; + u32 nritems; + nritems = btrfs_header_nritems(leaf); + if (slot >= nritems - 1) { + nextret = btrfs_next_leaf(root, path); + if (nextret) + goto out; + recow = 1; + } else { + path->slots[0]++; + } + goto next_slot; + } + + if (end <= extent_end && start >= key.offset && found_inline) + *hint_byte = EXTENT_MAP_INLINE; + + if (found_extent) { + read_extent_buffer(leaf, &old, (unsigned long)extent, + sizeof(old)); + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + leaf_start = leaf->start; + } + + if (end < extent_end && end >= key.offset) { + bookend = 1; + if (found_inline && start <= key.offset) + keep = 1; + } + + if (bookend && found_extent) { + if (locked_end < extent_end) { + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, + 
GFP_NOFS); + if (!ret) { + btrfs_release_path(root, path); + lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, + GFP_NOFS); + locked_end = extent_end; + continue; + } + locked_end = extent_end; + } + orig_parent = path->nodes[0]->start; + disk_bytenr = le64_to_cpu(old.disk_bytenr); + if (disk_bytenr != 0) { + ret = btrfs_inc_extent_ref(trans, root, + disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } + } + + if (found_inline) { + u64 mask = root->sectorsize - 1; + search_start = (extent_end + mask) & ~mask; + } else + search_start = extent_end; + + /* truncate existing extent */ + if (start > key.offset) { + u64 new_num; + u64 old_num; + keep = 1; + WARN_ON(start & (root->sectorsize - 1)); + if (found_extent) { + new_num = start - key.offset; + old_num = btrfs_file_extent_num_bytes(leaf, + extent); + *hint_byte = + btrfs_file_extent_disk_bytenr(leaf, + extent); + if (btrfs_file_extent_disk_bytenr(leaf, + extent)) { + inode_sub_bytes(inode, old_num - + new_num); + } + btrfs_set_file_extent_num_bytes(leaf, + extent, new_num); + btrfs_mark_buffer_dirty(leaf); + } else if (key.offset < inline_limit && + (end > extent_end) && + (inline_limit < extent_end)) { + u32 new_size; + new_size = btrfs_file_extent_calc_inline_size( + inline_limit - key.offset); + inode_sub_bytes(inode, extent_end - + inline_limit); + btrfs_set_file_extent_ram_bytes(leaf, extent, + new_size); + if (!compression && !encryption) { + btrfs_truncate_item(trans, root, path, + new_size, 1); + } + } + } + /* delete the entire extent */ + if (!keep) { + if (found_inline) + inode_sub_bytes(inode, extent_end - + key.offset); + ret = btrfs_del_item(trans, root, path); + /* TODO update progress marker and return */ + BUG_ON(ret); + extent = NULL; + btrfs_release_path(root, path); + /* the extent will be freed later */ + } + if (bookend && found_inline && start <= key.offset) { + u32 new_size; + new_size = btrfs_file_extent_calc_inline_size( + extent_end - end); + inode_sub_bytes(inode, end - key.offset); + btrfs_set_file_extent_ram_bytes(leaf, extent, + new_size); + if (!compression && !encryption) + ret = btrfs_truncate_item(trans, root, path, + new_size, 0); + BUG_ON(ret); + } + /* create bookend, splitting the extent in two */ + if (bookend && found_extent) { + struct btrfs_key ins; + ins.objectid = inode->i_ino; + ins.offset = end; + btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY); + + btrfs_release_path(root, path); + ret = btrfs_insert_empty_item(trans, root, path, &ins, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + write_extent_buffer(leaf, &old, + (unsigned long)extent, sizeof(old)); + + btrfs_set_file_extent_compression(leaf, extent, + compression); + btrfs_set_file_extent_encryption(leaf, extent, + encryption); + btrfs_set_file_extent_other_encoding(leaf, extent, + other_encoding); + btrfs_set_file_extent_offset(leaf, extent, + le64_to_cpu(old.offset) + end - key.offset); + WARN_ON(le64_to_cpu(old.num_bytes) < + (extent_end - end)); + btrfs_set_file_extent_num_bytes(leaf, extent, + extent_end - end); + + /* + * set the ram bytes to the size of the full extent + * before splitting. 
This is a worst case flag, + * but its the best we can do because we don't know + * how splitting affects compression + */ + btrfs_set_file_extent_ram_bytes(leaf, extent, + ram_bytes); + btrfs_set_file_extent_type(leaf, extent, found_type); + + btrfs_mark_buffer_dirty(path->nodes[0]); + + if (disk_bytenr != 0) { + ret = btrfs_update_extent_ref(trans, root, + disk_bytenr, orig_parent, + leaf->start, + root->root_key.objectid, + trans->transid, ins.objectid); + + BUG_ON(ret); + } + btrfs_release_path(root, path); + if (disk_bytenr != 0) + inode_add_bytes(inode, extent_end - end); + } + + if (found_extent && !keep) { + u64 old_disk_bytenr = le64_to_cpu(old.disk_bytenr); + + if (old_disk_bytenr != 0) { + inode_sub_bytes(inode, + le64_to_cpu(old.num_bytes)); + ret = btrfs_free_extent(trans, root, + old_disk_bytenr, + le64_to_cpu(old.disk_num_bytes), + leaf_start, root_owner, + root_gen, key.objectid, 0); + BUG_ON(ret); + *hint_byte = old_disk_bytenr; + } + } + + if (search_start >= end) { + ret = 0; + goto out; + } + } +out: + btrfs_free_path(path); + if (locked_end > end) { + unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, + GFP_NOFS); + } + btrfs_check_file(root, inode); + return ret; +} + +static int extent_mergeable(struct extent_buffer *leaf, int slot, + u64 objectid, u64 bytenr, u64 *start, u64 *end) +{ + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 extent_end; + + if (slot < 0 || slot >= btrfs_header_nritems(leaf)) + return 0; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) + return 0; + + fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || + btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr || + btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + return 0; + + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + if ((*start && *start != key.offset) || (*end && *end != extent_end)) + return 0; + + *start = key.offset; + *end = extent_end; + return 1; +} + +/* + * Mark extent in the range start - end as written. + * + * This changes extent type from 'pre-allocated' to 'regular'. If only + * part of extent is marked as written, the extent will be split into + * two or three. 
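+ *
+ * Editorial note (not part of the original patch): the [start, end) range
+ * must fall entirely inside one BTRFS_FILE_EXTENT_PREALLOC item; the
+ * typical caller is the ordered-IO completion path after data has been
+ * written into a preallocated region, e.g.
+ *
+ *     ret = btrfs_mark_extent_written(trans, root, inode,
+ *                                     file_offset, file_offset + len);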
+ */ +int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end) +{ + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + u64 bytenr; + u64 num_bytes; + u64 extent_end; + u64 extent_offset; + u64 other_start; + u64 other_end; + u64 split = start; + u64 locked_end = end; + u64 orig_parent; + int extent_type; + int split_end = 1; + int ret; + + btrfs_drop_extent_cache(inode, start, end - 1, 0); + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + key.objectid = inode->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + if (split == start) + key.offset = split; + else + key.offset = split - 1; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0 && path->slots[0] > 0) + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + BUG_ON(key.objectid != inode->i_ino || + key.type != BTRFS_EXTENT_DATA_KEY); + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + BUG_ON(extent_type != BTRFS_FILE_EXTENT_PREALLOC); + extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); + BUG_ON(key.offset > start || extent_end < end); + + bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); + extent_offset = btrfs_file_extent_offset(leaf, fi); + + if (key.offset == start) + split = end; + + if (key.offset == start && extent_end == end) { + int del_nr = 0; + int del_slot = 0; + u64 leaf_owner = btrfs_header_owner(leaf); + u64 leaf_gen = btrfs_header_generation(leaf); + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + extent_end = other_end; + del_slot = path->slots[0] + 1; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); + BUG_ON(ret); + } + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + key.offset = other_start; + del_slot = path->slots[0]; + del_nr++; + ret = btrfs_free_extent(trans, root, bytenr, num_bytes, + leaf->start, leaf_owner, + leaf_gen, inode->i_ino, 0); + BUG_ON(ret); + } + split_end = 0; + if (del_nr == 0) { + btrfs_set_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG); + goto done; + } + + fi = btrfs_item_ptr(leaf, del_slot - 1, + struct btrfs_file_extent_item); + btrfs_set_file_extent_type(leaf, fi, BTRFS_FILE_EXTENT_REG); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_end - key.offset); + btrfs_mark_buffer_dirty(leaf); + + ret = btrfs_del_items(trans, root, path, del_slot, del_nr); + BUG_ON(ret); + goto done; + } else if (split == start) { + if (locked_end < extent_end) { + ret = try_lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, GFP_NOFS); + if (!ret) { + btrfs_release_path(root, path); + lock_extent(&BTRFS_I(inode)->io_tree, + locked_end, extent_end - 1, GFP_NOFS); + locked_end = extent_end; + goto again; + } + locked_end = extent_end; + } + btrfs_set_file_extent_num_bytes(leaf, fi, split - key.offset); + extent_offset += split - key.offset; + } else { + BUG_ON(key.offset != start); + btrfs_set_file_extent_offset(leaf, fi, extent_offset + + split - key.offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - split); + key.offset = split; + 
btrfs_set_item_key_safe(trans, root, path, &key); + extent_end = split; + } + + if (extent_end == end) { + split_end = 0; + extent_type = BTRFS_FILE_EXTENT_REG; + } + if (extent_end == end && split == start) { + other_start = end; + other_end = 0; + if (extent_mergeable(leaf, path->slots[0] + 1, inode->i_ino, + bytenr, &other_start, &other_end)) { + path->slots[0]++; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + key.offset = split; + btrfs_set_item_key_safe(trans, root, path, &key); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, + other_end - split); + goto done; + } + } + if (extent_end == end && split == end) { + other_start = 0; + other_end = start; + if (extent_mergeable(leaf, path->slots[0] - 1 , inode->i_ino, + bytenr, &other_start, &other_end)) { + path->slots[0]--; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - + other_start); + goto done; + } + } + + btrfs_mark_buffer_dirty(leaf); + + orig_parent = leaf->start; + ret = btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, + orig_parent, root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + btrfs_release_path(root, path); + + key.offset = start; + ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*fi)); + BUG_ON(ret); + + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_offset(leaf, fi, extent_offset); + btrfs_set_file_extent_num_bytes(leaf, fi, extent_end - key.offset); + btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_compression(leaf, fi, 0); + btrfs_set_file_extent_encryption(leaf, fi, 0); + btrfs_set_file_extent_other_encoding(leaf, fi, 0); + + if (orig_parent != leaf->start) { + ret = btrfs_update_extent_ref(trans, root, bytenr, + orig_parent, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino); + BUG_ON(ret); + } +done: + btrfs_mark_buffer_dirty(leaf); + btrfs_release_path(root, path); + if (split_end && split == start) { + split = end; + goto again; + } + if (locked_end > end) { + unlock_extent(&BTRFS_I(inode)->io_tree, end, locked_end - 1, + GFP_NOFS); + } + btrfs_free_path(path); + return 0; +} + +/* + * this gets pages into the page cache and locks them down, it also properly + * waits for data=ordered extents to finish before allowing the pages to be + * modified. 
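+ *
+ * Editorial note (not part of the original patch): btrfs_file_write()
+ * below calls this once per batch of at most nrptrs pages, before copying
+ * the user data into them with btrfs_copy_from_user().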
+ */ +static noinline int prepare_pages(struct btrfs_root *root, struct file *file, + struct page **pages, size_t num_pages, + loff_t pos, unsigned long first_index, + unsigned long last_index, size_t write_bytes) +{ + int i; + unsigned long index = pos >> PAGE_CACHE_SHIFT; + struct inode *inode = fdentry(file)->d_inode; + int err = 0; + u64 start_pos; + u64 last_pos; + + start_pos = pos & ~((u64)root->sectorsize - 1); + last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT; + + if (start_pos > inode->i_size) { + err = btrfs_cont_expand(inode, start_pos); + if (err) + return err; + } + + memset(pages, 0, num_pages * sizeof(struct page *)); +again: + for (i = 0; i < num_pages; i++) { + pages[i] = grab_cache_page(inode->i_mapping, index + i); + if (!pages[i]) { + err = -ENOMEM; + BUG_ON(1); + } + wait_on_page_writeback(pages[i]); + } + if (start_pos < inode->i_size) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + last_pos - 1); + if (ordered && + ordered->file_offset + ordered->len > start_pos && + ordered->file_offset < last_pos) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + for (i = 0; i < num_pages; i++) { + unlock_page(pages[i]); + page_cache_release(pages[i]); + } + btrfs_wait_ordered_range(inode, start_pos, + last_pos - start_pos); + goto again; + } + if (ordered) + btrfs_put_ordered_extent(ordered); + + clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos, + last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC, + GFP_NOFS); + unlock_extent(&BTRFS_I(inode)->io_tree, + start_pos, last_pos - 1, GFP_NOFS); + } + for (i = 0; i < num_pages; i++) { + clear_page_dirty_for_io(pages[i]); + set_page_extent_mapped(pages[i]); + WARN_ON(!PageLocked(pages[i])); + } + return 0; +} + +static ssize_t btrfs_file_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) +{ + loff_t pos; + loff_t start_pos; + ssize_t num_written = 0; + ssize_t err = 0; + int ret = 0; + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct page **pages = NULL; + int nrptrs; + struct page *pinned[2]; + unsigned long first_index; + unsigned long last_index; + int will_write; + + will_write = ((file->f_flags & O_SYNC) || IS_SYNC(inode) || + (file->f_flags & O_DIRECT)); + + nrptrs = min((count + PAGE_CACHE_SIZE - 1) / PAGE_CACHE_SIZE, + PAGE_CACHE_SIZE / (sizeof(struct page *))); + pinned[0] = NULL; + pinned[1] = NULL; + + pos = *ppos; + start_pos = pos; + + vfs_check_frozen(inode->i_sb, SB_FREEZE_WRITE); + current->backing_dev_info = inode->i_mapping->backing_dev_info; + err = generic_write_checks(file, &pos, &count, S_ISBLK(inode->i_mode)); + if (err) + goto out_nolock; + if (count == 0) + goto out_nolock; + + err = file_remove_suid(file); + if (err) + goto out_nolock; + file_update_time(file); + + pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); + + mutex_lock(&inode->i_mutex); + BTRFS_I(inode)->sequence++; + first_index = pos >> PAGE_CACHE_SHIFT; + last_index = (pos + count) >> PAGE_CACHE_SHIFT; + + /* + * there are lots of better ways to do this, but this code + * makes sure the first and last page in the file range are + * up to date and ready for cow + */ + if ((pos & (PAGE_CACHE_SIZE - 1))) { + pinned[0] = grab_cache_page(inode->i_mapping, first_index); + if (!PageUptodate(pinned[0])) { + ret = btrfs_readpage(NULL, pinned[0]); + BUG_ON(ret); + 
wait_on_page_locked(pinned[0]); + } else { + unlock_page(pinned[0]); + } + } + if ((pos + count) & (PAGE_CACHE_SIZE - 1)) { + pinned[1] = grab_cache_page(inode->i_mapping, last_index); + if (!PageUptodate(pinned[1])) { + ret = btrfs_readpage(NULL, pinned[1]); + BUG_ON(ret); + wait_on_page_locked(pinned[1]); + } else { + unlock_page(pinned[1]); + } + } + + while (count > 0) { + size_t offset = pos & (PAGE_CACHE_SIZE - 1); + size_t write_bytes = min(count, nrptrs * + (size_t)PAGE_CACHE_SIZE - + offset); + size_t num_pages = (write_bytes + PAGE_CACHE_SIZE - 1) >> + PAGE_CACHE_SHIFT; + + WARN_ON(num_pages > nrptrs); + memset(pages, 0, sizeof(struct page *) * nrptrs); + + ret = btrfs_check_free_space(root, write_bytes, 0); + if (ret) + goto out; + + ret = prepare_pages(root, file, pages, num_pages, + pos, first_index, last_index, + write_bytes); + if (ret) + goto out; + + ret = btrfs_copy_from_user(pos, num_pages, + write_bytes, pages, buf); + if (ret) { + btrfs_drop_pages(pages, num_pages); + goto out; + } + + ret = dirty_and_release_pages(NULL, root, file, pages, + num_pages, pos, write_bytes); + btrfs_drop_pages(pages, num_pages); + if (ret) + goto out; + + if (will_write) { + btrfs_fdatawrite_range(inode->i_mapping, pos, + pos + write_bytes - 1, + WB_SYNC_NONE); + } else { + balance_dirty_pages_ratelimited_nr(inode->i_mapping, + num_pages); + if (num_pages < + (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + btrfs_btree_balance_dirty(root, 1); + btrfs_throttle(root); + } + + buf += write_bytes; + count -= write_bytes; + pos += write_bytes; + num_written += write_bytes; + + cond_resched(); + } +out: + mutex_unlock(&inode->i_mutex); + +out_nolock: + kfree(pages); + if (pinned[0]) + page_cache_release(pinned[0]); + if (pinned[1]) + page_cache_release(pinned[1]); + *ppos = pos; + + if (num_written > 0 && will_write) { + struct btrfs_trans_handle *trans; + + err = btrfs_wait_ordered_range(inode, start_pos, num_written); + if (err) + num_written = err; + + if ((file->f_flags & O_SYNC) || IS_SYNC(inode)) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_log_dentry_safe(trans, root, + file->f_dentry); + if (ret == 0) { + btrfs_sync_log(trans, root); + btrfs_end_transaction(trans, root); + } else { + btrfs_commit_transaction(trans, root); + } + } + if (file->f_flags & O_DIRECT) { + invalidate_mapping_pages(inode->i_mapping, + start_pos >> PAGE_CACHE_SHIFT, + (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT); + } + } + current->backing_dev_info = NULL; + return num_written ? num_written : err; +} + +int btrfs_release_file(struct inode *inode, struct file *filp) +{ + if (filp->private_data) + btrfs_ioctl_trans_end(filp); + return 0; +} + +/* + * fsync call for both files and directories. This logs the inode into + * the tree log instead of forcing full commits whenever possible. + * + * It needs to call filemap_fdatawait so that all ordered extent updates are + * in the metadata btree are up to date for copying to the log. + * + * It drops the inode mutex before doing the tree log commit. This is an + * important optimization for directories because holding the mutex prevents + * new operations on the dir while we write to disk. 
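For context, the path documented above is what an ordinary synchronous update hits from user space. A minimal caller sketch (plain POSIX; the file name is illustrative and nothing btrfs-specific is assumed):

#include <fcntl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/mnt/btrfs/somefile", O_WRONLY | O_CREAT, 0644);

	if (fd < 0)
		return 1;
	write(fd, "hello", 5);	/* dirties the inode and creates an ordered extent */
	fsync(fd);		/* reaches btrfs_sync_file() below, which prefers
				 * the tree log over a full transaction commit */
	close(fd);
	return 0;
}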
+ */ +int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) +{ + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + struct btrfs_trans_handle *trans; + + /* + * check the transaction that last modified this inode + * and see if its already been committed + */ + if (!BTRFS_I(inode)->last_trans) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + if (BTRFS_I(inode)->last_trans <= + root->fs_info->last_trans_committed) { + BTRFS_I(inode)->last_trans = 0; + mutex_unlock(&root->fs_info->trans_mutex); + goto out; + } + mutex_unlock(&root->fs_info->trans_mutex); + + root->fs_info->tree_log_batch++; + filemap_fdatawrite(inode->i_mapping); + btrfs_wait_ordered_range(inode, 0, (u64)-1); + root->fs_info->tree_log_batch++; + + /* + * ok we haven't committed the transaction yet, lets do a commit + */ + if (file->private_data) + btrfs_ioctl_trans_end(file); + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto out; + } + + ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + if (ret < 0) + goto out; + + /* we've logged all the items and now have a consistent + * version of the file in the log. It is possible that + * someone will come in and modify the file, but that's + * fine because the log is consistent on disk, and we + * have references to all of the file's extents + * + * It is possible that someone will come in and log the + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ + mutex_unlock(&file->f_dentry->d_inode->i_mutex); + + if (ret > 0) { + ret = btrfs_commit_transaction(trans, root); + } else { + btrfs_sync_log(trans, root); + ret = btrfs_end_transaction(trans, root); + } + mutex_lock(&file->f_dentry->d_inode->i_mutex); +out: + return ret > 0 ? EIO : ret; +} + +static struct vm_operations_struct btrfs_file_vm_ops = { + .fault = filemap_fault, + .page_mkwrite = btrfs_page_mkwrite, +}; + +static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) +{ + vma->vm_ops = &btrfs_file_vm_ops; + file_accessed(filp); + return 0; +} + +struct file_operations btrfs_file_operations = { + .llseek = generic_file_llseek, + .read = do_sync_read, + .aio_read = generic_file_aio_read, + .splice_read = generic_file_splice_read, + .write = btrfs_file_write, + .mmap = btrfs_file_mmap, + .open = generic_file_open, + .release = btrfs_release_file, + .fsync = btrfs_sync_file, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif +}; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c new file mode 100644 index 00000000000..d1e5f0e84c5 --- /dev/null +++ b/fs/btrfs/free-space-cache.c @@ -0,0 +1,495 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/sched.h> +#include "ctree.h" + +static int tree_insert_offset(struct rb_root *root, u64 offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, offset_index); + + if (offset < info->offset) + p = &(*p)->rb_left; + else if (offset > info->offset) + p = &(*p)->rb_right; + else + return -EEXIST; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +static int tree_insert_bytes(struct rb_root *root, u64 bytes, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_free_space *info; + + while (*p) { + parent = *p; + info = rb_entry(parent, struct btrfs_free_space, bytes_index); + + if (bytes < info->bytes) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + + return 0; +} + +/* + * searches the tree for the given offset. If contains is set we will return + * the free space that contains the given offset. If contains is not set we + * will return the free space that starts at or after the given offset and is + * at least bytes long. + */ +static struct btrfs_free_space *tree_search_offset(struct rb_root *root, + u64 offset, u64 bytes, + int contains) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, offset_index); + + if (offset < entry->offset) { + if (!contains && + (!ret || entry->offset < ret->offset) && + (bytes <= entry->bytes)) + ret = entry; + n = n->rb_left; + } else if (offset > entry->offset) { + if ((entry->offset + entry->bytes - 1) >= offset && + bytes <= entry->bytes) { + ret = entry; + break; + } + n = n->rb_right; + } else { + if (bytes > entry->bytes) { + n = n->rb_right; + continue; + } + ret = entry; + break; + } + } + + return ret; +} + +/* + * return a chunk at least bytes size, as close to offset that we can get. + */ +static struct btrfs_free_space *tree_search_bytes(struct rb_root *root, + u64 offset, u64 bytes) +{ + struct rb_node *n = root->rb_node; + struct btrfs_free_space *entry, *ret = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_free_space, bytes_index); + + if (bytes < entry->bytes) { + /* + * We prefer to get a hole size as close to the size we + * are asking for so we don't take small slivers out of + * huge holes, but we also want to get as close to the + * offset as possible so we don't have a whole lot of + * fragmentation. + */ + if (offset <= entry->offset) { + if (!ret) + ret = entry; + else if (entry->bytes < ret->bytes) + ret = entry; + else if (entry->offset < ret->offset) + ret = entry; + } + n = n->rb_left; + } else if (bytes > entry->bytes) { + n = n->rb_right; + } else { + /* + * Ok we may have multiple chunks of the wanted size, + * so we don't want to take the first one we find, we + * want to take the one closest to our given offset, so + * keep searching just in case theres a better match. 
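(Reader's note on why there are two trees: the same btrfs_free_space entry is linked into both indexes by link_free_space() below. The offset-indexed tree is what lets __btrfs_add_free_space() find and merge the neighbours of a newly freed range, while this bytes-indexed tree serves allocation, answering "a hole of at least N bytes, preferably near this offset" as described above.)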
+ */ + n = n->rb_right; + if (offset > entry->offset) + continue; + else if (!ret || entry->offset < ret->offset) + ret = entry; + } + } + + return ret; +} + +static void unlink_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + rb_erase(&info->offset_index, &block_group->free_space_offset); + rb_erase(&info->bytes_index, &block_group->free_space_bytes); +} + +static int link_free_space(struct btrfs_block_group_cache *block_group, + struct btrfs_free_space *info) +{ + int ret = 0; + + + ret = tree_insert_offset(&block_group->free_space_offset, info->offset, + &info->offset_index); + if (ret) + return ret; + + ret = tree_insert_bytes(&block_group->free_space_bytes, info->bytes, + &info->bytes_index); + if (ret) + return ret; + + return ret; +} + +static int __btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *right_info; + struct btrfs_free_space *left_info; + struct btrfs_free_space *info = NULL; + struct btrfs_free_space *alloc_info; + int ret = 0; + + alloc_info = kzalloc(sizeof(struct btrfs_free_space), GFP_NOFS); + if (!alloc_info) + return -ENOMEM; + + /* + * first we want to see if there is free space adjacent to the range we + * are adding, if there is remove that struct and add a new one to + * cover the entire range + */ + right_info = tree_search_offset(&block_group->free_space_offset, + offset+bytes, 0, 1); + left_info = tree_search_offset(&block_group->free_space_offset, + offset-1, 0, 1); + + if (right_info && right_info->offset == offset+bytes) { + unlink_free_space(block_group, right_info); + info = right_info; + info->offset = offset; + info->bytes += bytes; + } else if (right_info && right_info->offset != offset+bytes) { + printk(KERN_ERR "btrfs adding space in the middle of an " + "existing free space area. existing: " + "offset=%llu, bytes=%llu. new: offset=%llu, " + "bytes=%llu\n", (unsigned long long)right_info->offset, + (unsigned long long)right_info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + BUG(); + } + + if (left_info) { + unlink_free_space(block_group, left_info); + + if (unlikely((left_info->offset + left_info->bytes) != + offset)) { + printk(KERN_ERR "btrfs free space to the left " + "of new free space isn't " + "quite right. existing: offset=%llu, " + "bytes=%llu. 
new: offset=%llu, bytes=%llu\n", + (unsigned long long)left_info->offset, + (unsigned long long)left_info->bytes, + (unsigned long long)offset, + (unsigned long long)bytes); + BUG(); + } + + if (info) { + info->offset = left_info->offset; + info->bytes += left_info->bytes; + kfree(left_info); + } else { + info = left_info; + info->bytes += bytes; + } + } + + if (info) { + ret = link_free_space(block_group, info); + if (!ret) + info = NULL; + goto out; + } + + info = alloc_info; + alloc_info = NULL; + info->offset = offset; + info->bytes = bytes; + + ret = link_free_space(block_group, info); + if (ret) + kfree(info); +out: + if (ret) { + printk(KERN_ERR "btrfs: unable to add free space :%d\n", ret); + if (ret == -EEXIST) + BUG(); + } + + kfree(alloc_info); + + return ret; +} + +static int +__btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + struct btrfs_free_space *info; + int ret = 0; + + info = tree_search_offset(&block_group->free_space_offset, offset, 0, + 1); + + if (info && info->offset == offset) { + if (info->bytes < bytes) { + printk(KERN_ERR "Found free space at %llu, size %llu," + "trying to use %llu\n", + (unsigned long long)info->offset, + (unsigned long long)info->bytes, + (unsigned long long)bytes); + WARN_ON(1); + ret = -EINVAL; + goto out; + } + unlink_free_space(block_group, info); + + if (info->bytes == bytes) { + kfree(info); + goto out; + } + + info->offset += bytes; + info->bytes -= bytes; + + ret = link_free_space(block_group, info); + BUG_ON(ret); + } else if (info && info->offset < offset && + info->offset + info->bytes >= offset + bytes) { + u64 old_start = info->offset; + /* + * we're freeing space in the middle of the info, + * this can happen during tree log replay + * + * first unlink the old info and then + * insert it again after the hole we're creating + */ + unlink_free_space(block_group, info); + if (offset + bytes < info->offset + info->bytes) { + u64 old_end = info->offset + info->bytes; + + info->offset = offset + bytes; + info->bytes = old_end - info->offset; + ret = link_free_space(block_group, info); + BUG_ON(ret); + } else { + /* the hole we're creating ends at the end + * of the info struct, just free the info + */ + kfree(info); + } + + /* step two, insert a new info struct to cover anything + * before the hole + */ + ret = __btrfs_add_free_space(block_group, old_start, + offset - old_start); + BUG_ON(ret); + } else { + WARN_ON(1); + } +out: + return ret; +} + +int btrfs_add_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + struct btrfs_free_space *sp; + + mutex_lock(&block_group->alloc_mutex); + ret = __btrfs_add_free_space(block_group, offset, bytes); + sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); + BUG_ON(!sp); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +int btrfs_add_free_space_lock(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + struct btrfs_free_space *sp; + + ret = __btrfs_add_free_space(block_group, offset, bytes); + sp = tree_search_offset(&block_group->free_space_offset, offset, 0, 1); + BUG_ON(!sp); + + return ret; +} + +int btrfs_remove_free_space(struct btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret = 0; + + mutex_lock(&block_group->alloc_mutex); + ret = __btrfs_remove_free_space(block_group, offset, bytes); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +int btrfs_remove_free_space_lock(struct 
btrfs_block_group_cache *block_group, + u64 offset, u64 bytes) +{ + int ret; + + ret = __btrfs_remove_free_space(block_group, offset, bytes); + + return ret; +} + +void btrfs_dump_free_space(struct btrfs_block_group_cache *block_group, + u64 bytes) +{ + struct btrfs_free_space *info; + struct rb_node *n; + int count = 0; + + for (n = rb_first(&block_group->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes) + count++; + } + printk(KERN_INFO "%d blocks of free space at or bigger than bytes is" + "\n", count); +} + +u64 btrfs_block_group_free_space(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *n; + u64 ret = 0; + + for (n = rb_first(&block_group->free_space_offset); n; + n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + ret += info->bytes; + } + + return ret; +} + +void btrfs_remove_free_space_cache(struct btrfs_block_group_cache *block_group) +{ + struct btrfs_free_space *info; + struct rb_node *node; + + mutex_lock(&block_group->alloc_mutex); + while ((node = rb_last(&block_group->free_space_bytes)) != NULL) { + info = rb_entry(node, struct btrfs_free_space, bytes_index); + unlink_free_space(block_group, info); + kfree(info); + if (need_resched()) { + mutex_unlock(&block_group->alloc_mutex); + cond_resched(); + mutex_lock(&block_group->alloc_mutex); + } + } + mutex_unlock(&block_group->alloc_mutex); +} + +#if 0 +static struct btrfs_free_space *btrfs_find_free_space_offset(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + mutex_lock(&block_group->alloc_mutex); + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} + +static struct btrfs_free_space *btrfs_find_free_space_bytes(struct + btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret; + + mutex_lock(&block_group->alloc_mutex); + + ret = tree_search_bytes(&block_group->free_space_bytes, offset, bytes); + mutex_unlock(&block_group->alloc_mutex); + + return ret; +} +#endif + +struct btrfs_free_space *btrfs_find_free_space(struct btrfs_block_group_cache + *block_group, u64 offset, + u64 bytes) +{ + struct btrfs_free_space *ret = NULL; + + ret = tree_search_offset(&block_group->free_space_offset, offset, + bytes, 0); + if (!ret) + ret = tree_search_bytes(&block_group->free_space_bytes, + offset, bytes); + + return ret; +} diff --git a/fs/btrfs/hash.h b/fs/btrfs/hash.h new file mode 100644 index 00000000000..2a020b27676 --- /dev/null +++ b/fs/btrfs/hash.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
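Two things worth noting about the helpers above: the *_lock variants (btrfs_add_free_space_lock(), btrfs_remove_free_space_lock()) skip taking block_group->alloc_mutex and so assume the caller already holds it, while the plain variants take it themselves. And the overall shape of the cache, merging adjacent ranges on free and doing a best-effort "big enough" lookup on allocate, can be illustrated with a deliberately simplified, self-contained model over a flat array; this is not the kernel code and ignores its rb-trees and exact tie-breaking:

#include <stdio.h>

struct hole { unsigned long long offset, bytes; };

static int add_free(struct hole *h, int n, unsigned long long off,
		    unsigned long long bytes)
{
	int i;

	for (i = 0; i < n; i++) {
		if (h[i].offset + h[i].bytes == off) {
			/* new range starts right after an existing hole */
			h[i].bytes += bytes;
			return n;
		}
		if (off + bytes == h[i].offset) {
			/* new range ends right where an existing hole starts */
			h[i].offset = off;
			h[i].bytes += bytes;
			return n;
		}
	}
	h[n].offset = off;	/* no adjacent hole: add a new entry */
	h[n].bytes = bytes;
	return n + 1;
}

static struct hole *find_free(struct hole *h, int n, unsigned long long bytes)
{
	struct hole *best = NULL;
	int i;

	for (i = 0; i < n; i++) {
		if (h[i].bytes < bytes)
			continue;
		if (!best || h[i].bytes < best->bytes ||
		    (h[i].bytes == best->bytes && h[i].offset < best->offset))
			best = &h[i];
	}
	return best;
}

int main(void)
{
	struct hole holes[8];
	struct hole *h;
	int n = 0;

	n = add_free(holes, n, 0, 4096);
	n = add_free(holes, n, 4096, 4096);		/* merges into [0, 8192) */
	n = add_free(holes, n, 1024 * 1024, 65536);

	h = find_free(holes, n, 8192);
	if (h)
		printf("hole at %llu, %llu bytes\n", h->offset, h->bytes);
	return 0;
}

In the real code the bytes-indexed rb-tree replaces the linear scan in find_free(), and a freed range can be merged with both neighbours at once.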
+ */ + +#ifndef __HASH__ +#define __HASH__ + +#include "crc32c.h" +static inline u64 btrfs_name_hash(const char *name, int len) +{ + return btrfs_crc32c((u32)~1, name, len); +} +#endif diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c new file mode 100644 index 00000000000..3d46fa1f29a --- /dev/null +++ b/fs/btrfs/inode-item.c @@ -0,0 +1,206 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static int find_name_in_backref(struct btrfs_path *path, const char *name, + int name_len, struct btrfs_inode_ref **ref_ret) +{ + struct extent_buffer *leaf; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long name_ptr; + u32 item_size; + u32 cur_offset = 0; + int len; + + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + while (cur_offset < item_size) { + ref = (struct btrfs_inode_ref *)(ptr + cur_offset); + len = btrfs_inode_ref_name_len(leaf, ref); + name_ptr = (unsigned long)(ref + 1); + cur_offset += len + sizeof(*ref); + if (len != name_len) + continue; + if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0) { + *ref_ret = ref; + return 1; + } + } + return 0; +} + +int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, u64 *index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + unsigned long item_start; + u32 item_size; + u32 sub_item_len; + int ret; + int del_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = -ENOENT; + goto out; + } else if (ret < 0) { + goto out; + } + if (!find_name_in_backref(path, name, name_len, &ref)) { + ret = -ENOENT; + goto out; + } + leaf = path->nodes[0]; + item_size = btrfs_item_size_nr(leaf, path->slots[0]); + + if (index) + *index = btrfs_inode_ref_index(leaf, ref); + + if (del_len == item_size) { + ret = btrfs_del_item(trans, root, path); + goto out; + } + ptr = (unsigned long)ref; + sub_item_len = name_len + sizeof(*ref); + item_start = btrfs_item_ptr_offset(leaf, path->slots[0]); + memmove_extent_buffer(leaf, ptr, ptr + sub_item_len, + item_size - (ptr + sub_item_len - item_start)); + ret = btrfs_truncate_item(trans, root, path, + item_size - sub_item_len, 1); + BUG_ON(ret); +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + u64 inode_objectid, u64 ref_objectid, 
u64 index) +{ + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_inode_ref *ref; + unsigned long ptr; + int ret; + int ins_len = name_len + sizeof(*ref); + + key.objectid = inode_objectid; + key.offset = ref_objectid; + btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + ins_len); + if (ret == -EEXIST) { + u32 old_size; + + if (find_name_in_backref(path, name, name_len, &ref)) + goto out; + + old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ret = btrfs_extend_item(trans, root, path, ins_len); + BUG_ON(ret); + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + ret = 0; + } else if (ret < 0) { + goto out; + } else { + ref = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, index); + ptr = (unsigned long)(ref + 1); + } + write_extent_buffer(path->nodes[0], name, ptr, name_len); + btrfs_mark_buffer_dirty(path->nodes[0]); + +out: + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid) +{ + struct btrfs_key key; + int ret; + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(struct btrfs_inode_item)); + if (ret == 0 && objectid > root->highest_inode) + root->highest_inode = objectid; + return ret; +} + +int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_path *path, + struct btrfs_key *location, int mod) +{ + int ins_len = mod < 0 ? -1 : 0; + int cow = mod != 0; + int ret; + int slot; + struct extent_buffer *leaf; + struct btrfs_key found_key; + + ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); + if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + location->offset == (u64)-1 && path->slots[0] != 0) { + slot = path->slots[0] - 1; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (found_key.objectid == location->objectid && + btrfs_key_type(&found_key) == btrfs_key_type(location)) { + path->slots[0]--; + return 0; + } + } + return ret; +} diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c new file mode 100644 index 00000000000..2aa79873eb4 --- /dev/null +++ b/fs/btrfs/inode-map.c @@ -0,0 +1,144 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
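A note on the item layout the functions above manipulate: all back references for an inode from one parent directory live in a single INODE_REF item, stored as consecutive variable-length records, each a struct btrfs_inode_ref header (holding the directory index and name length) immediately followed by the name bytes. find_name_in_backref() walks those records by advancing cur_offset by sizeof(*ref) plus the name length; btrfs_insert_inode_ref() grows an existing item with btrfs_extend_item() and appends a record when the key already exists, and btrfs_del_inode_ref() memmove()s the tail of the item down and shrinks it with btrfs_truncate_item(). For example (sizes illustrative), an inode linked as "a" and then "bbb" from the same directory ends up with one item holding two records, and deleting "a" slides the "bbb" record to the front of the item before truncating it.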
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +int btrfs_find_highest_inode(struct btrfs_root *root, u64 *objectid) +{ + struct btrfs_path *path; + int ret; + struct extent_buffer *l; + struct btrfs_key search_key; + struct btrfs_key found_key; + int slot; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + search_key.objectid = BTRFS_LAST_FREE_OBJECTID; + search_key.type = -1; + search_key.offset = (u64)-1; + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + if (path->slots[0] > 0) { + slot = path->slots[0] - 1; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &found_key, slot); + *objectid = found_key.objectid; + } else { + *objectid = BTRFS_FIRST_FREE_OBJECTID; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * walks the btree of allocated inodes and find a hole. + */ +int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 dirid, u64 *objectid) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret; + int slot = 0; + u64 last_ino = 0; + int start_found; + struct extent_buffer *l; + struct btrfs_key search_key; + u64 search_start = dirid; + + mutex_lock(&root->objectid_mutex); + if (root->last_inode_alloc >= BTRFS_FIRST_FREE_OBJECTID && + root->last_inode_alloc < BTRFS_LAST_FREE_OBJECTID) { + *objectid = ++root->last_inode_alloc; + mutex_unlock(&root->objectid_mutex); + return 0; + } + path = btrfs_alloc_path(); + BUG_ON(!path); + search_start = max(search_start, BTRFS_FIRST_FREE_OBJECTID); + search_key.objectid = search_start; + search_key.type = 0; + search_key.offset = 0; + + btrfs_init_path(path); + start_found = 0; + ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); + if (ret < 0) + goto error; + + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + if (!start_found) { + *objectid = search_start; + start_found = 1; + goto found; + } + *objectid = last_ino > search_start ? + last_ino : search_start; + goto found; + } + btrfs_item_key_to_cpu(l, &key, slot); + if (key.objectid >= search_start) { + if (start_found) { + if (last_ino < search_start) + last_ino = search_start; + if (key.objectid > last_ino) { + *objectid = last_ino; + goto found; + } + } else if (key.objectid > search_start) { + *objectid = search_start; + goto found; + } + } + if (key.objectid >= BTRFS_LAST_FREE_OBJECTID) + break; + + start_found = 1; + last_ino = key.objectid + 1; + path->slots[0]++; + } + BUG_ON(1); +found: + btrfs_release_path(root, path); + btrfs_free_path(path); + BUG_ON(*objectid < search_start); + mutex_unlock(&root->objectid_mutex); + return 0; +error: + btrfs_release_path(root, path); + btrfs_free_path(path); + mutex_unlock(&root->objectid_mutex); + return ret; +} diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c new file mode 100644 index 00000000000..8adfe059ab4 --- /dev/null +++ b/fs/btrfs/inode.c @@ -0,0 +1,5035 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
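btrfs_find_free_objectid() above is essentially "find the first gap in a sorted sequence of used ids at or after a starting point", with the sequence coming from btree leaves. A self-contained model of that loop over a plain sorted array (names and numbers here are illustrative, not from the patch):

#include <stdio.h>

static unsigned long long find_free_id(const unsigned long long *ids, int n,
				       unsigned long long start)
{
	unsigned long long candidate = start;
	int i;

	for (i = 0; i < n; i++) {
		if (ids[i] < candidate)
			continue;		/* below the range we care about */
		if (ids[i] > candidate)
			break;			/* gap found at 'candidate' */
		candidate = ids[i] + 1;		/* id in use, try the next one */
	}
	return candidate;
}

int main(void)
{
	unsigned long long ids[] = { 256, 257, 258, 260, 261 };

	/* prints 259, the first unused id at or above 257 */
	printf("%llu\n", find_free_id(ids, 5, 257));
	return 0;
}

The kernel version additionally caches its position in root->last_inode_alloc, so the common case hands out the next id without touching the tree at all.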
See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/version.h> +#include <linux/xattr.h> +#include <linux/posix_acl.h> +#include <linux/falloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "ordered-data.h" +#include "xattr.h" +#include "tree-log.h" +#include "ref-cache.h" +#include "compression.h" + +struct btrfs_iget_args { + u64 ino; + struct btrfs_root *root; +}; + +static struct inode_operations btrfs_dir_inode_operations; +static struct inode_operations btrfs_symlink_inode_operations; +static struct inode_operations btrfs_dir_ro_inode_operations; +static struct inode_operations btrfs_special_inode_operations; +static struct inode_operations btrfs_file_inode_operations; +static struct address_space_operations btrfs_aops; +static struct address_space_operations btrfs_symlink_aops; +static struct file_operations btrfs_dir_file_operations; +static struct extent_io_ops btrfs_extent_io_ops; + +static struct kmem_cache *btrfs_inode_cachep; +struct kmem_cache *btrfs_trans_handle_cachep; +struct kmem_cache *btrfs_transaction_cachep; +struct kmem_cache *btrfs_bit_radix_cachep; +struct kmem_cache *btrfs_path_cachep; + +#define S_SHIFT 12 +static unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { + [S_IFREG >> S_SHIFT] = BTRFS_FT_REG_FILE, + [S_IFDIR >> S_SHIFT] = BTRFS_FT_DIR, + [S_IFCHR >> S_SHIFT] = BTRFS_FT_CHRDEV, + [S_IFBLK >> S_SHIFT] = BTRFS_FT_BLKDEV, + [S_IFIFO >> S_SHIFT] = BTRFS_FT_FIFO, + [S_IFSOCK >> S_SHIFT] = BTRFS_FT_SOCK, + [S_IFLNK >> S_SHIFT] = BTRFS_FT_SYMLINK, +}; + +static void btrfs_truncate(struct inode *inode); +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end); +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, int unlock); + +/* + * a very lame attempt at stopping writes when the FS is 85% full. There + * are countless ways this is incorrect, but it is better than nothing. + */ +int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, + int for_del) +{ + u64 total; + u64 used; + u64 thresh; + int ret = 0; + + spin_lock(&root->fs_info->delalloc_lock); + total = btrfs_super_total_bytes(&root->fs_info->super_copy); + used = btrfs_super_bytes_used(&root->fs_info->super_copy); + if (for_del) + thresh = total * 90; + else + thresh = total * 85; + + do_div(thresh, 100); + + if (used + root->fs_info->delalloc_bytes + num_required > thresh) + ret = -ENOSPC; + spin_unlock(&root->fs_info->delalloc_lock); + return ret; +} + +/* + * this does all the hard work for inserting an inline extent into + * the btree. 
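A quick worked instance of the 85% check in btrfs_check_free_space() above, with made-up round numbers (the kernel works in bytes taken from the superblock and uses do_div() for the division):

#include <stdio.h>

int main(void)
{
	/* pretend a 100 GiB filesystem, counted in GiB for readability */
	unsigned long long total = 100, used = 80, delalloc = 2, needed = 5;
	unsigned long long thresh = total * 85 / 100;	/* 85 */

	if (used + delalloc + needed > thresh)
		printf("ENOSPC: %llu > %llu\n", used + delalloc + needed, thresh);
	else
		printf("ok\n");
	return 0;
}

For deletions (for_del) the threshold is relaxed to 90%, presumably so that unlinks can still make progress on a nearly full filesystem.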
The caller should have done a btrfs_drop_extents so that + * no overlapping inline items exist in the btree + */ +static noinline int insert_inline_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + u64 start, size_t size, size_t compressed_size, + struct page **compressed_pages) +{ + struct btrfs_key key; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct page *page = NULL; + char *kaddr; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + int err = 0; + int ret; + size_t cur_size = size; + size_t datasize; + unsigned long offset; + int use_compress = 0; + + if (compressed_size && compressed_pages) { + use_compress = 1; + cur_size = compressed_size; + } + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + btrfs_set_trans_block_group(trans, inode); + + key.objectid = inode->i_ino; + key.offset = start; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(cur_size); + + inode_add_bytes(inode, size); + ret = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + BUG_ON(ret); + if (ret) { + err = ret; + goto fail; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, trans->transid); + btrfs_set_file_extent_type(leaf, ei, BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, size); + ptr = btrfs_file_extent_inline_start(ei); + + if (use_compress) { + struct page *cpage; + int i = 0; + while (compressed_size > 0) { + cpage = compressed_pages[i]; + cur_size = min_t(unsigned long, compressed_size, + PAGE_CACHE_SIZE); + + kaddr = kmap(cpage); + write_extent_buffer(leaf, kaddr, ptr, cur_size); + kunmap(cpage); + + i++; + ptr += cur_size; + compressed_size -= cur_size; + } + btrfs_set_file_extent_compression(leaf, ei, + BTRFS_COMPRESS_ZLIB); + } else { + page = find_get_page(inode->i_mapping, + start >> PAGE_CACHE_SHIFT); + btrfs_set_file_extent_compression(leaf, ei, 0); + kaddr = kmap_atomic(page, KM_USER0); + offset = start & (PAGE_CACHE_SIZE - 1); + write_extent_buffer(leaf, kaddr + offset, ptr, size); + kunmap_atomic(kaddr, KM_USER0); + page_cache_release(page); + } + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + BTRFS_I(inode)->disk_i_size = inode->i_size; + btrfs_update_inode(trans, root, inode); + return 0; +fail: + btrfs_free_path(path); + return err; +} + + +/* + * conditionally insert an inline extent into the file. This + * does the checks required to make sure the data is small enough + * to fit as an inline extent. 
+ */ +static int cow_file_range_inline(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, u64 start, u64 end, + size_t compressed_size, + struct page **compressed_pages) +{ + u64 isize = i_size_read(inode); + u64 actual_end = min(end + 1, isize); + u64 inline_len = actual_end - start; + u64 aligned_end = (end + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + u64 hint_byte; + u64 data_len = inline_len; + int ret; + + if (compressed_size) + data_len = compressed_size; + + if (start > 0 || + actual_end >= PAGE_CACHE_SIZE || + data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + (!compressed_size && + (actual_end & (root->sectorsize - 1)) == 0) || + end + 1 < isize || + data_len > root->fs_info->max_inline) { + return 1; + } + + ret = btrfs_drop_extents(trans, root, inode, start, + aligned_end, start, &hint_byte); + BUG_ON(ret); + + if (isize > actual_end) + inline_len = min_t(u64, isize, actual_end); + ret = insert_inline_extent(trans, root, inode, start, + inline_len, compressed_size, + compressed_pages); + BUG_ON(ret); + btrfs_drop_extent_cache(inode, start, aligned_end, 0); + return 0; +} + +struct async_extent { + u64 start; + u64 ram_size; + u64 compressed_size; + struct page **pages; + unsigned long nr_pages; + struct list_head list; +}; + +struct async_cow { + struct inode *inode; + struct btrfs_root *root; + struct page *locked_page; + u64 start; + u64 end; + struct list_head extents; + struct btrfs_work work; +}; + +static noinline int add_async_extent(struct async_cow *cow, + u64 start, u64 ram_size, + u64 compressed_size, + struct page **pages, + unsigned long nr_pages) +{ + struct async_extent *async_extent; + + async_extent = kmalloc(sizeof(*async_extent), GFP_NOFS); + async_extent->start = start; + async_extent->ram_size = ram_size; + async_extent->compressed_size = compressed_size; + async_extent->pages = pages; + async_extent->nr_pages = nr_pages; + list_add_tail(&async_extent->list, &cow->extents); + return 0; +} + +/* + * we create compressed extents in two phases. The first + * phase compresses a range of pages that have already been + * locked (both pages and state bits are locked). + * + * This is done inside an ordered work queue, and the compression + * is spread across many cpus. The actual IO submission is step + * two, and the ordered work queue takes care of making sure that + * happens in the same order things were put onto the queue by + * writepages and friends. + * + * If this code finds it can't get good compression, it puts an + * entry onto the work queue to write the uncompressed bytes. This + * makes sure that both compressed inodes and uncompressed inodes + * are written in the same order that pdflush sent them down. 
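(Reader's note on the ordered work queue mentioned above: each unit of work is a struct btrfs_work with three hooks, as wired up later in cow_file_range_async(). work.func, here the compression step, may run concurrently across worker threads; work.ordered_func, here the submission step, is invoked strictly in the order the items were queued; and work.ordered_free releases the item afterwards. That split is what lets the CPU-heavy compression run in parallel while the resulting IO is still submitted in the order writepages produced it.)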
+ */ +static noinline int compress_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, + struct async_cow *async_cow, + int *num_added) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 num_bytes; + u64 orig_start; + u64 disk_num_bytes; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + int ret = 0; + struct page **pages = NULL; + unsigned long nr_pages; + unsigned long nr_pages_ret = 0; + unsigned long total_compressed = 0; + unsigned long total_in = 0; + unsigned long max_compressed = 128 * 1024; + unsigned long max_uncompressed = 128 * 1024; + int i; + int will_compress; + + orig_start = start; + + actual_end = min_t(u64, isize, end + 1); +again: + will_compress = 0; + nr_pages = (end >> PAGE_CACHE_SHIFT) - (start >> PAGE_CACHE_SHIFT) + 1; + nr_pages = min(nr_pages, (128 * 1024UL) / PAGE_CACHE_SIZE); + + total_compressed = actual_end - start; + + /* we want to make sure that amount of ram required to uncompress + * an extent is reasonable, so we limit the total size in ram + * of a compressed extent to 128k. This is a crucial number + * because it also controls how easily we can spread reads across + * cpus for decompression. + * + * We also want to make sure the amount of IO required to do + * a random read is reasonably small, so we limit the size of + * a compressed extent to 128k. + */ + total_compressed = min(total_compressed, max_uncompressed); + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + total_in = 0; + ret = 0; + + /* + * we do compression for mount -o compress and when the + * inode has not been flagged as nocompress. This flag can + * change at any time if we discover bad compression ratios. + */ + if (!btrfs_test_flag(inode, NOCOMPRESS) && + btrfs_test_opt(root, COMPRESS)) { + WARN_ON(pages); + pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); + + ret = btrfs_zlib_compress_pages(inode->i_mapping, start, + total_compressed, pages, + nr_pages, &nr_pages_ret, + &total_in, + &total_compressed, + max_compressed); + + if (!ret) { + unsigned long offset = total_compressed & + (PAGE_CACHE_SIZE - 1); + struct page *page = pages[nr_pages_ret - 1]; + char *kaddr; + + /* zero the tail end of the last page, we might be + * sending it down to disk + */ + if (offset) { + kaddr = kmap_atomic(page, KM_USER0); + memset(kaddr + offset, 0, + PAGE_CACHE_SIZE - offset); + kunmap_atomic(kaddr, KM_USER0); + } + will_compress = 1; + } + } + if (start == 0) { + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + /* lets try to make an inline extent */ + if (ret || total_in < (actual_end - start)) { + /* we didn't compress the entire range, try + * to make an uncompressed inline extent. + */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + } else { + /* try making a compressed inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, + total_compressed, pages); + } + btrfs_end_transaction(trans, root); + if (ret == 0) { + /* + * inline extent creation worked, we don't need + * to create any more async work items. Unlock + * and free up our temp pages. 
+ */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, 1, 0, + 0, 1, 1, 1); + ret = 0; + goto free_pages_out; + } + } + + if (will_compress) { + /* + * we aren't doing an inline extent round the compressed size + * up to a block size boundary so the allocator does sane + * things + */ + total_compressed = (total_compressed + blocksize - 1) & + ~(blocksize - 1); + + /* + * one last check to make sure the compression is really a + * win, compare the page count read with the blocks on disk + */ + total_in = (total_in + PAGE_CACHE_SIZE - 1) & + ~(PAGE_CACHE_SIZE - 1); + if (total_compressed >= total_in) { + will_compress = 0; + } else { + disk_num_bytes = total_compressed; + num_bytes = total_in; + } + } + if (!will_compress && pages) { + /* + * the compression code ran but failed to make things smaller, + * free any pages it allocated and our page pointer array + */ + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + pages = NULL; + total_compressed = 0; + nr_pages_ret = 0; + + /* flag the file so we don't compress in the future */ + btrfs_set_flag(inode, NOCOMPRESS); + } + if (will_compress) { + *num_added += 1; + + /* the async work queues will take care of doing actual + * allocation on disk for these compressed pages, + * and will submit them to the elevator. + */ + add_async_extent(async_cow, start, num_bytes, + total_compressed, pages, nr_pages_ret); + + if (start + num_bytes < end && start + num_bytes < actual_end) { + start += num_bytes; + pages = NULL; + cond_resched(); + goto again; + } + } else { + /* + * No compression, but we still need to write the pages in + * the file we've been given so far. redirty the locked + * page if it corresponds to our extent and set things up + * for the async work queue to run cow_file_range to do + * the normal delalloc dance + */ + if (page_offset(locked_page) >= start && + page_offset(locked_page) <= end) { + __set_page_dirty_nobuffers(locked_page); + /* unlocked later on in the async handlers */ + } + add_async_extent(async_cow, start, end - start + 1, 0, NULL, 0); + *num_added += 1; + } + +out: + return 0; + +free_pages_out: + for (i = 0; i < nr_pages_ret; i++) { + WARN_ON(pages[i]->mapping); + page_cache_release(pages[i]); + } + kfree(pages); + + goto out; +} + +/* + * phase two of compressed writeback. This is the ordered portion + * of the code, which only gets called in the order the work was + * queued. We walk all the async extents created by compress_file_range + * and send them down to the disk. + */ +static noinline int submit_compressed_extents(struct inode *inode, + struct async_cow *async_cow) +{ + struct async_extent *async_extent; + u64 alloc_hint = 0; + struct btrfs_trans_handle *trans; + struct btrfs_key ins; + struct extent_map *em; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree; + int ret; + + if (list_empty(&async_cow->extents)) + return 0; + + trans = btrfs_join_transaction(root, 1); + + while (!list_empty(&async_cow->extents)) { + async_extent = list_entry(async_cow->extents.next, + struct async_extent, list); + list_del(&async_extent->list); + + io_tree = &BTRFS_I(inode)->io_tree; + + /* did the compression code fall back to uncompressed IO? 
*/ + if (!async_extent->pages) { + int page_started = 0; + unsigned long nr_written = 0; + + lock_extent(io_tree, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, GFP_NOFS); + + /* allocate blocks */ + cow_file_range(inode, async_cow->locked_page, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + &page_started, &nr_written, 0); + + /* + * if page_started, cow_file_range inserted an + * inline extent and took care of all the unlocking + * and IO for us. Otherwise, we need to submit + * all those pages down to the drive. + */ + if (!page_started) + extent_write_locked_range(io_tree, + inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + btrfs_get_extent, + WB_SYNC_ALL); + kfree(async_extent); + cond_resched(); + continue; + } + + lock_extent(io_tree, async_extent->start, + async_extent->start + async_extent->ram_size - 1, + GFP_NOFS); + /* + * here we're doing allocation and writeback of the + * compressed pages + */ + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + + ret = btrfs_reserve_extent(trans, root, + async_extent->compressed_size, + async_extent->compressed_size, + 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + em = alloc_extent_map(GFP_NOFS); + em->start = async_extent->start; + em->len = async_extent->ram_size; + em->orig_start = em->start; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, async_extent->start, + async_extent->start + + async_extent->ram_size - 1, 0); + } + + ret = btrfs_add_ordered_extent(inode, async_extent->start, + ins.objectid, + async_extent->ram_size, + ins.offset, + BTRFS_ORDERED_COMPRESSED); + BUG_ON(ret); + + btrfs_end_transaction(trans, root); + + /* + * clear dirty, set writeback and unlock the pages. + */ + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + async_extent->start, + async_extent->start + + async_extent->ram_size - 1, + NULL, 1, 1, 0, 1, 1, 0); + + ret = btrfs_submit_compressed_write(inode, + async_extent->start, + async_extent->ram_size, + ins.objectid, + ins.offset, async_extent->pages, + async_extent->nr_pages); + + BUG_ON(ret); + trans = btrfs_join_transaction(root, 1); + alloc_hint = ins.objectid + ins.offset; + kfree(async_extent); + cond_resched(); + } + + btrfs_end_transaction(trans, root); + return 0; +} + +/* + * when extent_io.c finds a delayed allocation range in the file, + * the call backs end up in this code. The basic idea is to + * allocate extents on disk for the range, and create ordered data structs + * in ram to track those extents. + * + * locked_page is the page that writepage had locked already. We use + * it to make sure we don't do extra locks or unlocks. + * + * *page_started is set to one if we unlock locked_page and do everything + * required to start IO on it. It may be clean and already done with + * IO when we return. 
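(A note on the size arithmetic used in both compress_file_range() above and cow_file_range() below: start and end are inclusive byte offsets, so num_bytes = (end - start + blocksize) & ~(blocksize - 1) is the range length rounded up to a block boundary. For example, with a 4096-byte sectorsize, start = 0 and end = 5000, i.e. 5001 bytes, gives (5000 + 4096) & ~4095 = 8192.)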
+ */ +static noinline int cow_file_range(struct inode *inode, + struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written, + int unlock) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + u64 alloc_hint = 0; + u64 num_bytes; + unsigned long ram_size; + u64 disk_num_bytes; + u64 cur_alloc_size; + u64 blocksize = root->sectorsize; + u64 actual_end; + u64 isize = i_size_read(inode); + struct btrfs_key ins; + struct extent_map *em; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + int ret = 0; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + actual_end = min_t(u64, isize, end + 1); + + num_bytes = (end - start + blocksize) & ~(blocksize - 1); + num_bytes = max(blocksize, num_bytes); + disk_num_bytes = num_bytes; + ret = 0; + + if (start == 0) { + /* lets try to make an inline extent */ + ret = cow_file_range_inline(trans, root, inode, + start, end, 0, NULL); + if (ret == 0) { + extent_clear_unlock_delalloc(inode, + &BTRFS_I(inode)->io_tree, + start, end, NULL, 1, 1, + 1, 1, 1, 1); + *nr_written = *nr_written + + (end - start + PAGE_CACHE_SIZE) / PAGE_CACHE_SIZE; + *page_started = 1; + ret = 0; + goto out; + } + } + + BUG_ON(disk_num_bytes > + btrfs_super_total_bytes(&root->fs_info->super_copy)); + + btrfs_drop_extent_cache(inode, start, start + num_bytes - 1, 0); + + while (disk_num_bytes > 0) { + cur_alloc_size = min(disk_num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, cur_alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + BUG_ON(ret); + + em = alloc_extent_map(GFP_NOFS); + em->start = start; + em->orig_start = em->start; + + ram_size = ins.offset; + em->len = ins.offset; + + em->block_start = ins.objectid; + em->block_len = ins.offset; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, start, + start + ram_size - 1, 0); + } + + cur_alloc_size = ins.offset; + ret = btrfs_add_ordered_extent(inode, start, ins.objectid, + ram_size, cur_alloc_size, 0); + BUG_ON(ret); + + if (root->root_key.objectid == + BTRFS_DATA_RELOC_TREE_OBJECTID) { + ret = btrfs_reloc_clone_csums(inode, start, + cur_alloc_size); + BUG_ON(ret); + } + + if (disk_num_bytes < cur_alloc_size) + break; + + /* we're not doing compressed IO, don't unlock the first + * page (which the caller expects to stay locked), don't + * clear any dirty bits and don't set any writeback bits + */ + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + start, start + ram_size - 1, + locked_page, unlock, 1, + 1, 0, 0, 0); + disk_num_bytes -= cur_alloc_size; + num_bytes -= cur_alloc_size; + alloc_hint = ins.objectid + ins.offset; + start += cur_alloc_size; + } +out: + ret = 0; + btrfs_end_transaction(trans, root); + + return ret; +} + +/* + * work queue call back to started compression on a file and pages + */ +static noinline void async_cow_start(struct btrfs_work *work) +{ + struct async_cow *async_cow; + int num_added = 0; + async_cow = container_of(work, struct async_cow, work); + + compress_file_range(async_cow->inode, async_cow->locked_page, + async_cow->start, async_cow->end, async_cow, + &num_added); + if (num_added == 0) + async_cow->inode = NULL; +} + +/* + * work queue 
call back to submit previously compressed pages + */ +static noinline void async_cow_submit(struct btrfs_work *work) +{ + struct async_cow *async_cow; + struct btrfs_root *root; + unsigned long nr_pages; + + async_cow = container_of(work, struct async_cow, work); + + root = async_cow->root; + nr_pages = (async_cow->end - async_cow->start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + + atomic_sub(nr_pages, &root->fs_info->async_delalloc_pages); + + if (atomic_read(&root->fs_info->async_delalloc_pages) < + 5 * 1042 * 1024 && + waitqueue_active(&root->fs_info->async_submit_wait)) + wake_up(&root->fs_info->async_submit_wait); + + if (async_cow->inode) + submit_compressed_extents(async_cow->inode, async_cow); +} + +static noinline void async_cow_free(struct btrfs_work *work) +{ + struct async_cow *async_cow; + async_cow = container_of(work, struct async_cow, work); + kfree(async_cow); +} + +static int cow_file_range_async(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + struct async_cow *async_cow; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr_pages; + u64 cur_end; + int limit = 10 * 1024 * 1042; + + if (!btrfs_test_opt(root, COMPRESS)) { + return cow_file_range(inode, locked_page, start, end, + page_started, nr_written, 1); + } + + clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED | + EXTENT_DELALLOC, 1, 0, GFP_NOFS); + while (start < end) { + async_cow = kmalloc(sizeof(*async_cow), GFP_NOFS); + async_cow->inode = inode; + async_cow->root = root; + async_cow->locked_page = locked_page; + async_cow->start = start; + + if (btrfs_test_flag(inode, NOCOMPRESS)) + cur_end = end; + else + cur_end = min(end, start + 512 * 1024 - 1); + + async_cow->end = cur_end; + INIT_LIST_HEAD(&async_cow->extents); + + async_cow->work.func = async_cow_start; + async_cow->work.ordered_func = async_cow_submit; + async_cow->work.ordered_free = async_cow_free; + async_cow->work.flags = 0; + + nr_pages = (cur_end - start + PAGE_CACHE_SIZE) >> + PAGE_CACHE_SHIFT; + atomic_add(nr_pages, &root->fs_info->async_delalloc_pages); + + btrfs_queue_worker(&root->fs_info->delalloc_workers, + &async_cow->work); + + if (atomic_read(&root->fs_info->async_delalloc_pages) > limit) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) < + limit)); + } + + while (atomic_read(&root->fs_info->async_submit_draining) && + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->async_delalloc_pages) == + 0)); + } + + *nr_written += nr_pages; + start = cur_end + 1; + } + *page_started = 1; + return 0; +} + +static noinline int csum_exist_in_range(struct btrfs_root *root, + u64 bytenr, u64 num_bytes) +{ + int ret; + struct btrfs_ordered_sum *sums; + LIST_HEAD(list); + + ret = btrfs_lookup_csums_range(root->fs_info->csum_root, bytenr, + bytenr + num_bytes - 1, &list); + if (ret == 0 && list_empty(&list)) + return 0; + + while (!list_empty(&list)) { + sums = list_entry(list.next, struct btrfs_ordered_sum, list); + list_del(&sums->list); + kfree(sums); + } + return 1; +} + +/* + * when nowcow writeback call back. This checks for snapshots or COW copies + * of the extents that exist in the file, and COWs the file as required. 
+ * + * If no cow copies or snapshots exist, we write directly to the existing + * blocks on disk + */ +static int run_delalloc_nocow(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, int force, + unsigned long *nr_written) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct extent_buffer *leaf; + struct btrfs_path *path; + struct btrfs_file_extent_item *fi; + struct btrfs_key found_key; + u64 cow_start; + u64 cur_offset; + u64 extent_end; + u64 disk_bytenr; + u64 num_bytes; + int extent_type; + int ret; + int type; + int nocow; + int check_prev = 1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + + cow_start = (u64)-1; + cur_offset = start; + while (1) { + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + cur_offset, 0); + BUG_ON(ret < 0); + if (ret > 0 && path->slots[0] > 0 && check_prev) { + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, + path->slots[0] - 1); + if (found_key.objectid == inode->i_ino && + found_key.type == BTRFS_EXTENT_DATA_KEY) + path->slots[0]--; + } + check_prev = 0; +next_slot: + leaf = path->nodes[0]; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + BUG_ON(1); + if (ret > 0) + break; + leaf = path->nodes[0]; + } + + nocow = 0; + disk_bytenr = 0; + num_bytes = 0; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid > inode->i_ino || + found_key.type > BTRFS_EXTENT_DATA_KEY || + found_key.offset > end) + break; + + if (found_key.offset > cur_offset) { + extent_end = found_key.offset; + goto out_check; + } + + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + + if (extent_type == BTRFS_FILE_EXTENT_REG || + extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi); + extent_end = found_key.offset + + btrfs_file_extent_num_bytes(leaf, fi); + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (disk_bytenr == 0) + goto out_check; + if (btrfs_file_extent_compression(leaf, fi) || + btrfs_file_extent_encryption(leaf, fi) || + btrfs_file_extent_other_encoding(leaf, fi)) + goto out_check; + if (extent_type == BTRFS_FILE_EXTENT_REG && !force) + goto out_check; + if (btrfs_extent_readonly(root, disk_bytenr)) + goto out_check; + if (btrfs_cross_ref_exist(trans, root, inode->i_ino, + disk_bytenr)) + goto out_check; + disk_bytenr += btrfs_file_extent_offset(leaf, fi); + disk_bytenr += cur_offset - found_key.offset; + num_bytes = min(end + 1, extent_end) - cur_offset; + /* + * force cow if csum exists in the range. + * this ensure that csum for a given extent are + * either valid or do not exist. 
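+ * (a nocow write records no csums of its own, so letting it land on
+ * top of old csums would leave stale sums behind for this range)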
+ */ + if (csum_exist_in_range(root, disk_bytenr, num_bytes)) + goto out_check; + nocow = 1; + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + extent_end = found_key.offset + + btrfs_file_extent_inline_len(leaf, fi); + extent_end = ALIGN(extent_end, root->sectorsize); + } else { + BUG_ON(1); + } +out_check: + if (extent_end <= start) { + path->slots[0]++; + goto next_slot; + } + if (!nocow) { + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = extent_end; + if (cur_offset > end) + break; + path->slots[0]++; + goto next_slot; + } + + btrfs_release_path(root, path); + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, + found_key.offset - 1, page_started, + nr_written, 1); + BUG_ON(ret); + cow_start = (u64)-1; + } + + if (extent_type == BTRFS_FILE_EXTENT_PREALLOC) { + struct extent_map *em; + struct extent_map_tree *em_tree; + em_tree = &BTRFS_I(inode)->extent_tree; + em = alloc_extent_map(GFP_NOFS); + em->start = cur_offset; + em->orig_start = em->start; + em->len = num_bytes; + em->block_len = num_bytes; + em->block_start = disk_bytenr; + em->bdev = root->fs_info->fs_devices->latest_bdev; + set_bit(EXTENT_FLAG_PINNED, &em->flags); + while (1) { + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + if (ret != -EEXIST) { + free_extent_map(em); + break; + } + btrfs_drop_extent_cache(inode, em->start, + em->start + em->len - 1, 0); + } + type = BTRFS_ORDERED_PREALLOC; + } else { + type = BTRFS_ORDERED_NOCOW; + } + + ret = btrfs_add_ordered_extent(inode, cur_offset, disk_bytenr, + num_bytes, num_bytes, type); + BUG_ON(ret); + + extent_clear_unlock_delalloc(inode, &BTRFS_I(inode)->io_tree, + cur_offset, cur_offset + num_bytes - 1, + locked_page, 1, 1, 1, 0, 0, 0); + cur_offset = extent_end; + if (cur_offset > end) + break; + } + btrfs_release_path(root, path); + + if (cur_offset <= end && cow_start == (u64)-1) + cow_start = cur_offset; + if (cow_start != (u64)-1) { + ret = cow_file_range(inode, locked_page, cow_start, end, + page_started, nr_written, 1); + BUG_ON(ret); + } + + ret = btrfs_end_transaction(trans, root); + BUG_ON(ret); + btrfs_free_path(path); + return 0; +} + +/* + * extent_io.c call back to do delayed allocation processing + */ +static int run_delalloc_range(struct inode *inode, struct page *locked_page, + u64 start, u64 end, int *page_started, + unsigned long *nr_written) +{ + int ret; + + if (btrfs_test_flag(inode, NODATACOW)) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 1, nr_written); + else if (btrfs_test_flag(inode, PREALLOC)) + ret = run_delalloc_nocow(inode, locked_page, start, end, + page_started, 0, nr_written); + else + ret = cow_file_range_async(inode, locked_page, start, end, + page_started, nr_written); + + return ret; +} + +/* + * extent_io.c set_bit_hook, used to track delayed allocation + * bytes in this file, and to maintain the list of inodes that + * have pending delalloc work to be done. 
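+ *
+ * Both the per-inode and the per-fs delalloc byte counts are updated
+ * under fs_info->delalloc_lock, and the inode is added to
+ * fs_info->delalloc_inodes the first time it gains delalloc bytes.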
+ */ +static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + spin_lock(&root->fs_info->delalloc_lock); + BTRFS_I(inode)->delalloc_bytes += end - start + 1; + root->fs_info->delalloc_bytes += end - start + 1; + if (list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_add_tail(&BTRFS_I(inode)->delalloc_inodes, + &root->fs_info->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c clear_bit_hook, see set_bit_hook for why + */ +static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, + unsigned long old, unsigned long bits) +{ + /* + * set_bit and clear bit hooks normally require _irqsave/restore + * but in this case, we are only testeing for the DELALLOC + * bit, which is only set or cleared with irqs on + */ + if ((old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { + struct btrfs_root *root = BTRFS_I(inode)->root; + + spin_lock(&root->fs_info->delalloc_lock); + if (end - start + 1 > root->fs_info->delalloc_bytes) { + printk(KERN_INFO "btrfs warning: delalloc account " + "%llu %llu\n", + (unsigned long long)end - start + 1, + (unsigned long long) + root->fs_info->delalloc_bytes); + root->fs_info->delalloc_bytes = 0; + BTRFS_I(inode)->delalloc_bytes = 0; + } else { + root->fs_info->delalloc_bytes -= end - start + 1; + BTRFS_I(inode)->delalloc_bytes -= end - start + 1; + } + if (BTRFS_I(inode)->delalloc_bytes == 0 && + !list_empty(&BTRFS_I(inode)->delalloc_inodes)) { + list_del_init(&BTRFS_I(inode)->delalloc_inodes); + } + spin_unlock(&root->fs_info->delalloc_lock); + } + return 0; +} + +/* + * extent_io.c merge_bio_hook, this must check the chunk tree to make sure + * we don't create bios that span stripes or chunks + */ +int btrfs_merge_bio_hook(struct page *page, unsigned long offset, + size_t size, struct bio *bio, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; + struct btrfs_mapping_tree *map_tree; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + int ret; + + if (bio_flags & EXTENT_BIO_COMPRESSED) + return 0; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + ret = btrfs_map_block(map_tree, READ, logical, + &map_length, NULL, 0); + + if (map_length < length + size) + return 1; + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. + * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_start(struct inode *inode, int rw, + struct bio *bio, int mirror_num, + unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + ret = btrfs_csum_one_bio(root, inode, bio, 0, 0); + BUG_ON(ret); + return 0; +} + +/* + * in order to insert checksums into the metadata in large chunks, + * we wait until bio submission time. All the pages in the bio are + * checksummed and sums are attached onto the ordered extent record. 
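+ *
+ * This is the "done" half of the pair: the checksumming was already done
+ * by __btrfs_submit_bio_start, so all that is left here is to map and
+ * submit the bio.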
+ * + * At IO completion time the cums attached on the ordered extent record + * are inserted into the btree + */ +static int __btrfs_submit_bio_done(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + return btrfs_map_bio(root, rw, bio, mirror_num, 1); +} + +/* + * extent_io.c submission hook. This does the right thing for csum calculation + * on write, or reading the csums from the tree before a read + */ +static int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, + int mirror_num, unsigned long bio_flags) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + int skip_sum; + + skip_sum = btrfs_test_flag(inode, NODATASUM); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + BUG_ON(ret); + + if (!(rw & (1 << BIO_RW))) { + if (bio_flags & EXTENT_BIO_COMPRESSED) { + return btrfs_submit_compressed_read(inode, bio, + mirror_num, bio_flags); + } else if (!skip_sum) + btrfs_lookup_bio_sums(root, inode, bio, NULL); + goto mapit; + } else if (!skip_sum) { + /* csum items have already been cloned */ + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID) + goto mapit; + /* we're doing a write, do the async checksumming */ + return btrfs_wq_submit_bio(BTRFS_I(inode)->root->fs_info, + inode, rw, bio, mirror_num, + bio_flags, __btrfs_submit_bio_start, + __btrfs_submit_bio_done); + } + +mapit: + return btrfs_map_bio(root, rw, bio, mirror_num, 0); +} + +/* + * given a list of ordered sums record them in the inode. This happens + * at IO completion time based on sums calculated at bio submission time. + */ +static noinline int add_pending_csums(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_offset, + struct list_head *list) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + btrfs_set_trans_block_group(trans, inode); + list_for_each(cur, list) { + sum = list_entry(cur, struct btrfs_ordered_sum, list); + btrfs_csum_file_blocks(trans, + BTRFS_I(inode)->root->fs_info->csum_root, sum); + } + return 0; +} + +int btrfs_set_extent_delalloc(struct inode *inode, u64 start, u64 end) +{ + if ((end & (PAGE_CACHE_SIZE - 1)) == 0) + WARN_ON(1); + return set_extent_delalloc(&BTRFS_I(inode)->io_tree, start, end, + GFP_NOFS); +} + +/* see btrfs_writepage_start_hook for details on why this is required */ +struct btrfs_writepage_fixup { + struct page *page; + struct btrfs_work work; +}; + +static void btrfs_writepage_fixup_worker(struct btrfs_work *work) +{ + struct btrfs_writepage_fixup *fixup; + struct btrfs_ordered_extent *ordered; + struct page *page; + struct inode *inode; + u64 page_start; + u64 page_end; + + fixup = container_of(work, struct btrfs_writepage_fixup, work); + page = fixup->page; +again: + lock_page(page); + if (!page->mapping || !PageDirty(page) || !PageChecked(page)) { + ClearPageChecked(page); + goto out_page; + } + + inode = page->mapping->host; + page_start = page_offset(page); + page_end = page_offset(page) + PAGE_CACHE_SIZE - 1; + + lock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); + + /* already ordered? 
We're done */ + if (test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end, + EXTENT_ORDERED, 0)) { + goto out; + } + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, + page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ClearPageChecked(page); +out: + unlock_extent(&BTRFS_I(inode)->io_tree, page_start, page_end, GFP_NOFS); +out_page: + unlock_page(page); + page_cache_release(page); +} + +/* + * There are a few paths in the higher layers of the kernel that directly + * set the page dirty bit without asking the filesystem if it is a + * good idea. This causes problems because we want to make sure COW + * properly happens and the data=ordered rules are followed. + * + * In our case any range that doesn't have the ORDERED bit set + * hasn't been properly setup for IO. We kick off an async process + * to fix it up. The async helper will wait for ordered extents, set + * the delalloc bit and make it safe to write the page. + */ +static int btrfs_writepage_start_hook(struct page *page, u64 start, u64 end) +{ + struct inode *inode = page->mapping->host; + struct btrfs_writepage_fixup *fixup; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = test_range_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_ORDERED, 0); + if (ret) + return 0; + + if (PageChecked(page)) + return -EAGAIN; + + fixup = kzalloc(sizeof(*fixup), GFP_NOFS); + if (!fixup) + return -EAGAIN; + + SetPageChecked(page); + page_cache_get(page); + fixup->work.func = btrfs_writepage_fixup_worker; + fixup->page = page; + btrfs_queue_worker(&root->fs_info->fixup_workers, &fixup->work); + return -EAGAIN; +} + +static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, + struct inode *inode, u64 file_pos, + u64 disk_bytenr, u64 disk_num_bytes, + u64 num_bytes, u64 ram_bytes, + u8 compression, u8 encryption, + u16 other_encoding, int extent_type) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *fi; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key ins; + u64 hint; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + ret = btrfs_drop_extents(trans, root, inode, file_pos, + file_pos + num_bytes, file_pos, &hint); + BUG_ON(ret); + + ins.objectid = inode->i_ino; + ins.offset = file_pos; + ins.type = BTRFS_EXTENT_DATA_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*fi)); + BUG_ON(ret); + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, fi, trans->transid); + btrfs_set_file_extent_type(leaf, fi, extent_type); + btrfs_set_file_extent_disk_bytenr(leaf, fi, disk_bytenr); + btrfs_set_file_extent_disk_num_bytes(leaf, fi, disk_num_bytes); + btrfs_set_file_extent_offset(leaf, fi, 0); + btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes); + btrfs_set_file_extent_ram_bytes(leaf, fi, ram_bytes); + btrfs_set_file_extent_compression(leaf, fi, compression); + btrfs_set_file_extent_encryption(leaf, fi, encryption); + btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding); + btrfs_mark_buffer_dirty(leaf); + + inode_add_bytes(inode, num_bytes); + btrfs_drop_extent_cache(inode, file_pos, file_pos + num_bytes - 1, 0); + + ins.objectid = disk_bytenr; + ins.offset = disk_num_bytes; + ins.type = BTRFS_EXTENT_ITEM_KEY; + ret = 
btrfs_alloc_reserved_extent(trans, root, leaf->start, + root->root_key.objectid, + trans->transid, inode->i_ino, &ins); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +/* as ordered data IO finishes, this gets called so we can finish + * an ordered extent if the range of bytes in the file it covers are + * fully written. + */ +static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_ordered_extent *ordered_extent; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int compressed = 0; + int ret; + + ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1); + if (!ret) + return 0; + + trans = btrfs_join_transaction(root, 1); + + ordered_extent = btrfs_lookup_ordered_extent(inode, start); + BUG_ON(!ordered_extent); + if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) + goto nocow; + + lock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); + + if (test_bit(BTRFS_ORDERED_COMPRESSED, &ordered_extent->flags)) + compressed = 1; + if (test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) { + BUG_ON(compressed); + ret = btrfs_mark_extent_written(trans, root, inode, + ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len); + BUG_ON(ret); + } else { + ret = insert_reserved_file_extent(trans, inode, + ordered_extent->file_offset, + ordered_extent->start, + ordered_extent->disk_len, + ordered_extent->len, + ordered_extent->len, + compressed, 0, 0, + BTRFS_FILE_EXTENT_REG); + BUG_ON(ret); + } + unlock_extent(io_tree, ordered_extent->file_offset, + ordered_extent->file_offset + ordered_extent->len - 1, + GFP_NOFS); +nocow: + add_pending_csums(trans, inode, ordered_extent->file_offset, + &ordered_extent->list); + + mutex_lock(&BTRFS_I(inode)->extent_mutex); + btrfs_ordered_update_i_size(inode, ordered_extent); + btrfs_update_inode(trans, root, inode); + btrfs_remove_ordered_extent(inode, ordered_extent); + mutex_unlock(&BTRFS_I(inode)->extent_mutex); + + /* once for us */ + btrfs_put_ordered_extent(ordered_extent); + /* once for the tree */ + btrfs_put_ordered_extent(ordered_extent); + + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state, int uptodate) +{ + return btrfs_finish_ordered_io(page->mapping->host, start, end); +} + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. 
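+ *
+ * The record keeps the logical address, length and the last mirror that
+ * was attempted, so each retry can be steered at the next copy until
+ * btrfs_num_copies() mirrors have been exhausted.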
+ */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int last_mirror; +}; + +static int btrfs_io_failed_hook(struct bio *failed_bio, + struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + struct io_failure_record *failrec = NULL; + u64 private; + struct extent_map *em; + struct inode *inode = page->mapping->host; + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct bio *bio; + int num_copies; + int ret; + int rw; + u64 logical; + + ret = get_state_private(failure_tree, start, &private); + if (ret) { + failrec = kmalloc(sizeof(*failrec), GFP_NOFS); + if (!failrec) + return -ENOMEM; + failrec->start = start; + failrec->len = end - start + 1; + failrec->last_mirror = 0; + failrec->bio_flags = 0; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, failrec->len); + if (em->start > start || em->start + em->len < start) { + free_extent_map(em); + em = NULL; + } + spin_unlock(&em_tree->lock); + + if (!em || IS_ERR(em)) { + kfree(failrec); + return -EIO; + } + logical = start - em->start; + logical = em->block_start + logical; + if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + logical = em->block_start; + failrec->bio_flags = EXTENT_BIO_COMPRESSED; + } + failrec->logical = logical; + free_extent_map(em); + set_extent_bits(failure_tree, start, end, EXTENT_LOCKED | + EXTENT_DIRTY, GFP_NOFS); + set_state_private(failure_tree, start, + (u64)(unsigned long)failrec); + } else { + failrec = (struct io_failure_record *)(unsigned long)private; + } + num_copies = btrfs_num_copies( + &BTRFS_I(inode)->root->fs_info->mapping_tree, + failrec->logical, failrec->len); + failrec->last_mirror++; + if (!state) { + spin_lock(&BTRFS_I(inode)->io_tree.lock); + state = find_first_extent_bit_state(&BTRFS_I(inode)->io_tree, + failrec->start, + EXTENT_LOCKED); + if (state && state->start != failrec->start) + state = NULL; + spin_unlock(&BTRFS_I(inode)->io_tree.lock); + } + if (!state || failrec->last_mirror > num_copies) { + set_state_private(failure_tree, failrec->start, 0); + clear_extent_bits(failure_tree, failrec->start, + failrec->start + failrec->len - 1, + EXTENT_LOCKED | EXTENT_DIRTY, GFP_NOFS); + kfree(failrec); + return -EIO; + } + bio = bio_alloc(GFP_NOFS, 1); + bio->bi_private = state; + bio->bi_end_io = failed_bio->bi_end_io; + bio->bi_sector = failrec->logical >> 9; + bio->bi_bdev = failed_bio->bi_bdev; + bio->bi_size = 0; + + bio_add_page(bio, page, failrec->len, start - page_offset(page)); + if (failed_bio->bi_rw & (1 << BIO_RW)) + rw = WRITE; + else + rw = READ; + + BTRFS_I(inode)->io_tree.ops->submit_bio_hook(inode, rw, bio, + failrec->last_mirror, + failrec->bio_flags); + return 0; +} + +/* + * each time an IO finishes, we do a fast check in the IO failure tree + * to see if we need to process or clean up an io_failure_record + */ +static int btrfs_clean_io_failures(struct inode *inode, u64 start) +{ + u64 private; + u64 private_failure; + struct io_failure_record *failure; + int ret; + + private = 0; + if (count_range_bits(&BTRFS_I(inode)->io_failure_tree, &private, + (u64)-1, 1, EXTENT_DIRTY)) { + ret = get_state_private(&BTRFS_I(inode)->io_failure_tree, + start, &private_failure); + if (ret == 0) { + failure = (struct io_failure_record *)(unsigned long) + private_failure; + set_state_private(&BTRFS_I(inode)->io_failure_tree, + failure->start, 0); + 
clear_extent_bits(&BTRFS_I(inode)->io_failure_tree, + failure->start, + failure->start + failure->len - 1, + EXTENT_DIRTY | EXTENT_LOCKED, + GFP_NOFS); + kfree(failure); + } + } + return 0; +} + +/* + * when reads are done, we need to check csums to verify the data is correct + * if there's a match, we allow the bio to finish. If not, we go through + * the io_failure_record routines to find good copies + */ +static int btrfs_readpage_end_io_hook(struct page *page, u64 start, u64 end, + struct extent_state *state) +{ + size_t offset = start - ((u64)page->index << PAGE_CACHE_SHIFT); + struct inode *inode = page->mapping->host; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + char *kaddr; + u64 private = ~(u32)0; + int ret; + struct btrfs_root *root = BTRFS_I(inode)->root; + u32 csum = ~(u32)0; + + if (PageChecked(page)) { + ClearPageChecked(page); + goto good; + } + if (btrfs_test_flag(inode, NODATASUM)) + return 0; + + if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && + test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1)) { + clear_extent_bits(io_tree, start, end, EXTENT_NODATASUM, + GFP_NOFS); + return 0; + } + + if (state && state->start == start) { + private = state->private; + ret = 0; + } else { + ret = get_state_private(io_tree, start, &private); + } + kaddr = kmap_atomic(page, KM_USER0); + if (ret) + goto zeroit; + + csum = btrfs_csum_data(root, kaddr + offset, csum, end - start + 1); + btrfs_csum_final(csum, (char *)&csum); + if (csum != private) + goto zeroit; + + kunmap_atomic(kaddr, KM_USER0); +good: + /* if the io failure tree for this inode is non-empty, + * check to see if we've recovered from a failed IO + */ + btrfs_clean_io_failures(inode, start); + return 0; + +zeroit: + printk(KERN_INFO "btrfs csum failed ino %lu off %llu csum %u " + "private %llu\n", page->mapping->host->i_ino, + (unsigned long long)start, csum, + (unsigned long long)private); + memset(kaddr + offset, 1, end - start + 1); + flush_dcache_page(page); + kunmap_atomic(kaddr, KM_USER0); + if (private == 0) + return 0; + return -EIO; +} + +/* + * This creates an orphan entry for the given inode in case something goes + * wrong in the middle of an unlink/truncate. + */ +int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + /* already on the orphan list, we're good */ + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + + spin_unlock(&root->list_lock); + + /* + * insert an orphan item to track this unlinked/truncated file + */ + ret = btrfs_insert_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * We have done the truncate/delete so we can go ahead and remove the orphan + * item for this particular inode. + */ +int btrfs_orphan_del(struct btrfs_trans_handle *trans, struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret = 0; + + spin_lock(&root->list_lock); + + if (list_empty(&BTRFS_I(inode)->i_orphan)) { + spin_unlock(&root->list_lock); + return 0; + } + + list_del_init(&BTRFS_I(inode)->i_orphan); + if (!trans) { + spin_unlock(&root->list_lock); + return 0; + } + + spin_unlock(&root->list_lock); + + ret = btrfs_del_orphan_item(trans, root, inode->i_ino); + + return ret; +} + +/* + * this cleans up any orphans that may be left on the list from the last use + * of this root. 
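+ *
+ * Orphan items are keyed as (BTRFS_ORPHAN_OBJECTID, BTRFS_ORPHAN_ITEM_KEY,
+ * inode number).  Each inode found is either truncated (it still has
+ * links) or fully deleted by the final iput.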
+ */ +void btrfs_orphan_cleanup(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_item *item; + struct btrfs_key key, found_key; + struct btrfs_trans_handle *trans; + struct inode *inode; + int ret = 0, nr_unlink = 0, nr_truncate = 0; + + path = btrfs_alloc_path(); + if (!path) + return; + path->reada = -1; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = (u64)-1; + + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) { + printk(KERN_ERR "Error searching slot for orphan: %d" + "\n", ret); + break; + } + + /* + * if ret == 0 means we found what we were searching for, which + * is weird, but possible, so only screw with path if we didnt + * find the key and see if we have stuff that matches + */ + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + /* pull out the item */ + leaf = path->nodes[0]; + item = btrfs_item_nr(leaf, path->slots[0]); + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + /* make sure the item matches what we want */ + if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) + break; + if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + break; + + /* release the path since we're done with it */ + btrfs_release_path(root, path); + + /* + * this is where we are basically btrfs_lookup, without the + * crossing root thing. we store the inode number in the + * offset of the orphan item. + */ + inode = btrfs_iget_locked(root->fs_info->sb, + found_key.offset, root); + if (!inode) + break; + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + + /* have to set the location manually */ + BTRFS_I(inode)->location.objectid = inode->i_ino; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + + /* + * add this inode to the orphan list so btrfs_orphan_del does + * the proper thing when we hit it + */ + spin_lock(&root->list_lock); + list_add(&BTRFS_I(inode)->i_orphan, &root->orphan_list); + spin_unlock(&root->list_lock); + + /* + * if this is a bad inode, means we actually succeeded in + * removing the inode, but not the orphan record, which means + * we need to manually delete the orphan since iput will just + * do a destroy_inode + */ + if (is_bad_inode(inode)) { + trans = btrfs_start_transaction(root, 1); + btrfs_orphan_del(trans, inode); + btrfs_end_transaction(trans, root); + iput(inode); + continue; + } + + /* if we have links, this was a truncate, lets do that */ + if (inode->i_nlink) { + nr_truncate++; + btrfs_truncate(inode); + } else { + nr_unlink++; + } + + /* this will do delete_inode and everything for us */ + iput(inode); + } + + if (nr_unlink) + printk(KERN_INFO "btrfs: unlinked %d orphans\n", nr_unlink); + if (nr_truncate) + printk(KERN_INFO "btrfs: truncated %d orphans\n", nr_truncate); + + btrfs_free_path(path); +} + +/* + * read an inode from the btree into the in-memory inode + */ +void btrfs_read_locked_inode(struct inode *inode) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_inode_item *inode_item; + struct btrfs_timespec *tspec; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key location; + u64 alloc_group_block; + u32 rdev; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + memcpy(&location, &BTRFS_I(inode)->location, sizeof(location)); + + ret = btrfs_lookup_inode(NULL, root, path, &location, 0); + if (ret) 
+ goto make_bad; + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + inode->i_mode = btrfs_inode_mode(leaf, inode_item); + inode->i_nlink = btrfs_inode_nlink(leaf, inode_item); + inode->i_uid = btrfs_inode_uid(leaf, inode_item); + inode->i_gid = btrfs_inode_gid(leaf, inode_item); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + + tspec = btrfs_inode_atime(inode_item); + inode->i_atime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_atime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_mtime(inode_item); + inode->i_mtime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_mtime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + tspec = btrfs_inode_ctime(inode_item); + inode->i_ctime.tv_sec = btrfs_timespec_sec(leaf, tspec); + inode->i_ctime.tv_nsec = btrfs_timespec_nsec(leaf, tspec); + + inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); + BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); + BTRFS_I(inode)->sequence = btrfs_inode_sequence(leaf, inode_item); + inode->i_generation = BTRFS_I(inode)->generation; + inode->i_rdev = 0; + rdev = btrfs_inode_rdev(leaf, inode_item); + + BTRFS_I(inode)->index_cnt = (u64)-1; + BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item); + + alloc_group_block = btrfs_inode_block_group(leaf, inode_item); + BTRFS_I(inode)->block_group = btrfs_find_block_group(root, 0, + alloc_group_block, 0); + btrfs_free_path(path); + inode_item = NULL; + + switch (inode->i_mode & S_IFMT) { + case S_IFREG: + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + break; + case S_IFDIR: + inode->i_fop = &btrfs_dir_file_operations; + if (root == root->fs_info->tree_root) + inode->i_op = &btrfs_dir_ro_inode_operations; + else + inode->i_op = &btrfs_dir_inode_operations; + break; + case S_IFLNK: + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + break; + default: + init_special_inode(inode, inode->i_mode, rdev); + break; + } + return; + +make_bad: + btrfs_free_path(path); + make_bad_inode(inode); +} + +/* + * given a leaf and an inode, copy the inode fields into the leaf + */ +static void fill_inode_item(struct btrfs_trans_handle *trans, + struct extent_buffer *leaf, + struct btrfs_inode_item *item, + struct inode *inode) +{ + btrfs_set_inode_uid(leaf, item, inode->i_uid); + btrfs_set_inode_gid(leaf, item, inode->i_gid); + btrfs_set_inode_size(leaf, item, BTRFS_I(inode)->disk_i_size); + btrfs_set_inode_mode(leaf, item, inode->i_mode); + btrfs_set_inode_nlink(leaf, item, inode->i_nlink); + + btrfs_set_timespec_sec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_atime(item), + inode->i_atime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_mtime(item), + inode->i_mtime.tv_nsec); + + btrfs_set_timespec_sec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_sec); + btrfs_set_timespec_nsec(leaf, btrfs_inode_ctime(item), + inode->i_ctime.tv_nsec); + + btrfs_set_inode_nbytes(leaf, item, inode_get_bytes(inode)); + btrfs_set_inode_generation(leaf, item, BTRFS_I(inode)->generation); + btrfs_set_inode_sequence(leaf, item, 
BTRFS_I(inode)->sequence); + btrfs_set_inode_transid(leaf, item, trans->transid); + btrfs_set_inode_rdev(leaf, item, inode->i_rdev); + btrfs_set_inode_flags(leaf, item, BTRFS_I(inode)->flags); + btrfs_set_inode_block_group(leaf, item, BTRFS_I(inode)->block_group); +} + +/* + * copy everything in the in-memory inode into the btree. + */ +noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode) +{ + struct btrfs_inode_item *inode_item; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_lookup_inode(trans, root, path, + &BTRFS_I(inode)->location, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto failed; + } + + leaf = path->nodes[0]; + inode_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_inode_item); + + fill_inode_item(trans, leaf, inode_item, inode); + btrfs_mark_buffer_dirty(leaf); + btrfs_set_inode_last_trans(trans, inode); + ret = 0; +failed: + btrfs_free_path(path); + return ret; +} + + +/* + * unlink helper that gets used here in inode.c and in the tree logging + * recovery code. It remove a link in a directory with a given name, and + * also drops the back refs in the inode to the directory + */ +int btrfs_unlink_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, struct inode *inode, + const char *name, int name_len) +{ + struct btrfs_path *path; + int ret = 0; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + struct btrfs_key key; + u64 index; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto err; + } + + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + leaf = path->nodes[0]; + btrfs_dir_item_key_to_cpu(leaf, di, &key); + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto err; + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref(trans, root, name, name_len, + inode->i_ino, + dir->i_ino, &index); + if (ret) { + printk(KERN_INFO "btrfs failed to delete reference to %.*s, " + "inode %lu parent %lu\n", name_len, name, + inode->i_ino, dir->i_ino); + goto err; + } + + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + index, name, name_len, -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto err; + } + if (!di) { + ret = -ENOENT; + goto err; + } + ret = btrfs_delete_one_dir_name(trans, root, path, di); + btrfs_release_path(root, path); + + ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len, + inode, dir->i_ino); + BUG_ON(ret != 0 && ret != -ENOENT); + if (ret != -ENOENT) + BTRFS_I(dir)->log_dirty_trans = trans->transid; + + ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len, + dir, index); + BUG_ON(ret); +err: + btrfs_free_path(path); + if (ret) + goto out; + + btrfs_i_size_write(dir, dir->i_size - name_len * 2); + inode->i_ctime = dir->i_mtime = dir->i_ctime = CURRENT_TIME; + btrfs_update_inode(trans, root, dir); + btrfs_drop_nlink(inode); + ret = btrfs_update_inode(trans, root, inode); + dir->i_sb->s_dirt = 1; +out: + return ret; +} + +static int btrfs_unlink(struct inode *dir, struct dentry *dentry) +{ + struct btrfs_root *root; + struct btrfs_trans_handle *trans; + struct inode *inode = dentry->d_inode; + int ret; + unsigned long nr = 0; + + root = BTRFS_I(dir)->root; + + ret = btrfs_check_free_space(root, 1, 1); + if (ret) + goto fail; + + trans = 
btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, dir); + ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + + if (inode->i_nlink == 0) + ret = btrfs_orphan_add(trans, inode); + + nr = trans->blocks_used; + + btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + int err = 0; + int ret; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_trans_handle *trans; + unsigned long nr = 0; + + /* + * the FIRST_FREE_OBJECTID check makes sure we don't try to rmdir + * the root of a subvolume or snapshot + */ + if (inode->i_size > BTRFS_EMPTY_DIR_SIZE || + inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) { + return -ENOTEMPTY; + } + + ret = btrfs_check_free_space(root, 1, 1); + if (ret) + goto fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_orphan_add(trans, inode); + if (err) + goto fail_trans; + + /* now the directory is empty */ + err = btrfs_unlink_inode(trans, root, dir, dentry->d_inode, + dentry->d_name.name, dentry->d_name.len); + if (!err) + btrfs_i_size_write(inode, 0); + +fail_trans: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); +fail: + btrfs_btree_balance_dirty(root, nr); + + if (ret && !err) + err = ret; + return err; +} + +#if 0 +/* + * when truncating bytes in a file, it is possible to avoid reading + * the leaves that contain only checksum items. This can be the + * majority of the IO required to delete a large file, but it must + * be done carefully. + * + * The keys in the level just above the leaves are checked to make sure + * the lowest key in a given leaf is a csum key, and starts at an offset + * after the new size. + * + * Then the key for the next leaf is checked to make sure it also has + * a checksum item for the same file. If it does, we know our target leaf + * contains only checksum items, and it can be safely freed without reading + * it. + * + * This is just an optimization targeted at large files. It may do + * nothing. It will return 0 unless things went badly. + */ +static noinline int drop_csum_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *inode, u64 new_size) +{ + struct btrfs_key key; + int ret; + int nritems; + struct btrfs_key found_key; + struct btrfs_key other_key; + struct btrfs_leaf_ref *ref; + u64 leaf_gen; + u64 leaf_start; + + path->lowest_level = 1; + key.objectid = inode->i_ino; + key.type = BTRFS_CSUM_ITEM_KEY; + key.offset = new_size; +again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (path->nodes[1] == NULL) { + ret = 0; + goto out; + } + ret = 0; + btrfs_node_key_to_cpu(path->nodes[1], &found_key, path->slots[1]); + nritems = btrfs_header_nritems(path->nodes[1]); + + if (!nritems) + goto out; + + if (path->slots[1] >= nritems) + goto next_node; + + /* did we find a key greater than anything we want to delete? */ + if (found_key.objectid > inode->i_ino || + (found_key.objectid == inode->i_ino && found_key.type > key.type)) + goto out; + + /* we check the next key in the node to make sure the leave contains + * only checksum items. 
This comparison doesn't work if our + * leaf is the last one in the node + */ + if (path->slots[1] + 1 >= nritems) { +next_node: + /* search forward from the last key in the node, this + * will bring us into the next node in the tree + */ + btrfs_node_key_to_cpu(path->nodes[1], &found_key, nritems - 1); + + /* unlikely, but we inc below, so check to be safe */ + if (found_key.offset == (u64)-1) + goto out; + + /* search_forward needs a path with locks held, do the + * search again for the original key. It is possible + * this will race with a balance and return a path that + * we could modify, but this drop is just an optimization + * and is allowed to miss some leaves. + */ + btrfs_release_path(root, path); + found_key.offset++; + + /* setup a max key for search_forward */ + other_key.offset = (u64)-1; + other_key.type = key.type; + other_key.objectid = key.objectid; + + path->keep_locks = 1; + ret = btrfs_search_forward(root, &found_key, &other_key, + path, 0, 0); + path->keep_locks = 0; + if (ret || found_key.objectid != key.objectid || + found_key.type != key.type) { + ret = 0; + goto out; + } + + key.offset = found_key.offset; + btrfs_release_path(root, path); + cond_resched(); + goto again; + } + + /* we know there's one more slot after us in the tree, + * read that key so we can verify it is also a checksum item + */ + btrfs_node_key_to_cpu(path->nodes[1], &other_key, path->slots[1] + 1); + + if (found_key.objectid < inode->i_ino) + goto next_key; + + if (found_key.type != key.type || found_key.offset < new_size) + goto next_key; + + /* + * if the key for the next leaf isn't a csum key from this objectid, + * we can't be sure there aren't good items inside this leaf. + * Bail out + */ + if (other_key.objectid != inode->i_ino || other_key.type != key.type) + goto out; + + leaf_start = btrfs_node_blockptr(path->nodes[1], path->slots[1]); + leaf_gen = btrfs_node_ptr_generation(path->nodes[1], path->slots[1]); + /* + * it is safe to delete this leaf, it contains only + * csum items from this inode at an offset >= new_size + */ + ret = btrfs_del_leaf(trans, root, path, leaf_start); + BUG_ON(ret); + + if (root->ref_cows && leaf_gen < trans->transid) { + ref = btrfs_alloc_leaf_ref(root, 0); + if (ref) { + ref->root_gen = root->root_key.offset; + ref->bytenr = leaf_start; + ref->owner = 0; + ref->generation = leaf_gen; + ref->nritems = 0; + + ret = btrfs_add_leaf_ref(root, ref, 0); + WARN_ON(ret); + btrfs_free_leaf_ref(root, ref); + } else { + WARN_ON(1); + } + } +next_key: + btrfs_release_path(root, path); + + if (other_key.objectid == inode->i_ino && + other_key.type == key.type && other_key.offset > key.offset) { + key.offset = other_key.offset; + cond_resched(); + goto again; + } + ret = 0; +out: + /* fixup any changes we've made to the path */ + path->lowest_level = 0; + path->keep_locks = 0; + btrfs_release_path(root, path); + return ret; +} + +#endif + +/* + * this can truncate away extent items, csum items and directory items. + * It starts at a high offset and removes keys until it can't find + * any higher than new_size + * + * csum items that cross the new i_size are truncated to the new size + * as well. + * + * min_type is the minimum key type to truncate down to. If set to 0, this + * will kill all the items on this inode, including the INODE_ITEM_KEY. 
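+ *
+ * For example, btrfs_delete_inode() below ends up calling
+ *   btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0);
+ * to drop every remaining item that belongs to the inode.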
+ */ +noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode, + u64 new_size, u32 min_type) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_key found_key; + u32 found_type; + struct extent_buffer *leaf; + struct btrfs_file_extent_item *fi; + u64 extent_start = 0; + u64 extent_num_bytes = 0; + u64 item_end = 0; + u64 root_gen = 0; + u64 root_owner = 0; + int found_extent; + int del_item; + int pending_del_nr = 0; + int pending_del_slot = 0; + int extent_type = -1; + int encoding; + u64 mask = root->sectorsize - 1; + + if (root->ref_cows) + btrfs_drop_extent_cache(inode, new_size & (~mask), (u64)-1, 0); + path = btrfs_alloc_path(); + path->reada = -1; + BUG_ON(!path); + + /* FIXME, add redo link to tree so we don't leak on crash */ + key.objectid = inode->i_ino; + key.offset = (u64)-1; + key.type = (u8)-1; + + btrfs_init_path(path); + +search_again: + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto error; + + if (ret > 0) { + /* there are no items in the tree for us to truncate, we're + * done + */ + if (path->slots[0] == 0) { + ret = 0; + goto error; + } + path->slots[0]--; + } + + while (1) { + fi = NULL; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + encoding = 0; + + if (found_key.objectid != inode->i_ino) + break; + + if (found_type < min_type) + break; + + item_end = found_key.offset; + if (found_type == BTRFS_EXTENT_DATA_KEY) { + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + extent_type = btrfs_file_extent_type(leaf, fi); + encoding = btrfs_file_extent_compression(leaf, fi); + encoding |= btrfs_file_extent_encryption(leaf, fi); + encoding |= btrfs_file_extent_other_encoding(leaf, fi); + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + item_end += + btrfs_file_extent_num_bytes(leaf, fi); + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + item_end += btrfs_file_extent_inline_len(leaf, + fi); + } + item_end--; + } + if (item_end < new_size) { + if (found_type == BTRFS_DIR_ITEM_KEY) + found_type = BTRFS_INODE_ITEM_KEY; + else if (found_type == BTRFS_EXTENT_ITEM_KEY) + found_type = BTRFS_EXTENT_DATA_KEY; + else if (found_type == BTRFS_EXTENT_DATA_KEY) + found_type = BTRFS_XATTR_ITEM_KEY; + else if (found_type == BTRFS_XATTR_ITEM_KEY) + found_type = BTRFS_INODE_REF_KEY; + else if (found_type) + found_type--; + else + break; + btrfs_set_key_type(&key, found_type); + goto next; + } + if (found_key.offset >= new_size) + del_item = 1; + else + del_item = 0; + found_extent = 0; + + /* FIXME, shrink the extent if the ref count is only 1 */ + if (found_type != BTRFS_EXTENT_DATA_KEY) + goto delete; + + if (extent_type != BTRFS_FILE_EXTENT_INLINE) { + u64 num_dec; + extent_start = btrfs_file_extent_disk_bytenr(leaf, fi); + if (!del_item && !encoding) { + u64 orig_num_bytes = + btrfs_file_extent_num_bytes(leaf, fi); + extent_num_bytes = new_size - + found_key.offset + root->sectorsize - 1; + extent_num_bytes = extent_num_bytes & + ~((u64)root->sectorsize - 1); + btrfs_set_file_extent_num_bytes(leaf, fi, + extent_num_bytes); + num_dec = (orig_num_bytes - + extent_num_bytes); + if (root->ref_cows && extent_start != 0) + inode_sub_bytes(inode, num_dec); + btrfs_mark_buffer_dirty(leaf); + } else { + extent_num_bytes = + btrfs_file_extent_disk_num_bytes(leaf, + fi); + /* FIXME blocksize != 4096 */ + num_dec = btrfs_file_extent_num_bytes(leaf, fi); + if 
(extent_start != 0) { + found_extent = 1; + if (root->ref_cows) + inode_sub_bytes(inode, num_dec); + } + root_gen = btrfs_header_generation(leaf); + root_owner = btrfs_header_owner(leaf); + } + } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) { + /* + * we can't truncate inline items that have had + * special encodings + */ + if (!del_item && + btrfs_file_extent_compression(leaf, fi) == 0 && + btrfs_file_extent_encryption(leaf, fi) == 0 && + btrfs_file_extent_other_encoding(leaf, fi) == 0) { + u32 size = new_size - found_key.offset; + + if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + new_size); + } + size = + btrfs_file_extent_calc_inline_size(size); + ret = btrfs_truncate_item(trans, root, path, + size, 1); + BUG_ON(ret); + } else if (root->ref_cows) { + inode_sub_bytes(inode, item_end + 1 - + found_key.offset); + } + } +delete: + if (del_item) { + if (!pending_del_nr) { + /* no pending yet, add ourselves */ + pending_del_slot = path->slots[0]; + pending_del_nr = 1; + } else if (pending_del_nr && + path->slots[0] + 1 == pending_del_slot) { + /* hop on the pending chunk */ + pending_del_nr++; + pending_del_slot = path->slots[0]; + } else { + BUG(); + } + } else { + break; + } + if (found_extent) { + ret = btrfs_free_extent(trans, root, extent_start, + extent_num_bytes, + leaf->start, root_owner, + root_gen, inode->i_ino, 0); + BUG_ON(ret); + } +next: + if (path->slots[0] == 0) { + if (pending_del_nr) + goto del_pending; + btrfs_release_path(root, path); + goto search_again; + } + + path->slots[0]--; + if (pending_del_nr && + path->slots[0] + 1 != pending_del_slot) { + struct btrfs_key debug; +del_pending: + btrfs_item_key_to_cpu(path->nodes[0], &debug, + pending_del_slot); + ret = btrfs_del_items(trans, root, path, + pending_del_slot, + pending_del_nr); + BUG_ON(ret); + pending_del_nr = 0; + btrfs_release_path(root, path); + goto search_again; + } + } + ret = 0; +error: + if (pending_del_nr) { + ret = btrfs_del_items(trans, root, path, pending_del_slot, + pending_del_nr); + } + btrfs_free_path(path); + inode->i_sb->s_dirt = 1; + return ret; +} + +/* + * taken from block_truncate_page, but does cow as it zeros out + * any bytes left in the last page in the file. 
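+ *
+ * The page is read in if it is not uptodate, any pending ordered extent
+ * covering it is waited out, the range is marked delalloc and the tail
+ * of the page from 'from' onward is zeroed before the page is redirtied.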
+ */ +static int btrfs_truncate_page(struct address_space *mapping, loff_t from) +{ + struct inode *inode = mapping->host; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + u32 blocksize = root->sectorsize; + pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_CACHE_SIZE-1); + struct page *page; + int ret = 0; + u64 page_start; + u64 page_end; + + if ((offset & (blocksize - 1)) == 0) + goto out; + + ret = -ENOMEM; +again: + page = grab_cache_page(mapping, index); + if (!page) + goto out; + + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if (!PageUptodate(page)) { + ret = btrfs_readpage(NULL, page); + lock_page(page); + if (page->mapping != mapping) { + unlock_page(page); + page_cache_release(page); + goto again; + } + if (!PageUptodate(page)) { + ret = -EIO; + goto out_unlock; + } + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = 0; + if (offset != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + unlock_page(page); + page_cache_release(page); +out: + return ret; +} + +int btrfs_cont_expand(struct inode *inode, loff_t size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_map *em; + u64 mask = root->sectorsize - 1; + u64 hole_start = (inode->i_size + mask) & ~mask; + u64 block_end = (size + mask) & ~mask; + u64 last_byte; + u64 cur_offset; + u64 hole_size; + int err; + + if (size <= hole_start) + return 0; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + return err; + + btrfs_truncate_page(inode->i_mapping, inode->i_size); + + while (1) { + struct btrfs_ordered_extent *ordered; + btrfs_wait_ordered_range(inode, hole_start, + block_end - hole_start); + lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(inode, hole_start); + if (!ordered) + break; + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + btrfs_put_ordered_extent(ordered); + } + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + cur_offset = hole_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + block_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), block_end); + last_byte = (last_byte + mask) & ~mask; + if (test_bit(EXTENT_FLAG_VACANCY, &em->flags)) { + u64 hint_byte = 0; + hole_size = last_byte - cur_offset; + err = btrfs_drop_extents(trans, root, inode, + cur_offset, + cur_offset + hole_size, + cur_offset, &hint_byte); + if (err) + break; + err = btrfs_insert_file_extent(trans, root, + inode->i_ino, cur_offset, 0, + 0, hole_size, 0, hole_size, + 0, 0, 0); + btrfs_drop_extent_cache(inode, hole_start, + last_byte - 1, 0); + } + 
free_extent_map(em); + cur_offset = last_byte; + if (err || cur_offset >= block_end) + break; + } + + btrfs_end_transaction(trans, root); + unlock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS); + return err; +} + +static int btrfs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = dentry->d_inode; + int err; + + err = inode_change_ok(inode, attr); + if (err) + return err; + + if (S_ISREG(inode->i_mode) && + attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) { + err = btrfs_cont_expand(inode, attr->ia_size); + if (err) + return err; + } + + err = inode_setattr(inode, attr); + + if (!err && ((attr->ia_valid & ATTR_MODE))) + err = btrfs_acl_chmod(inode); + return err; +} + +void btrfs_delete_inode(struct inode *inode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + unsigned long nr; + int ret; + + truncate_inode_pages(&inode->i_data, 0); + if (is_bad_inode(inode)) { + btrfs_orphan_del(NULL, inode); + goto no_delete; + } + btrfs_wait_ordered_range(inode, 0, (u64)-1); + + btrfs_i_size_write(inode, 0); + trans = btrfs_join_transaction(root, 1); + + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, 0); + if (ret) { + btrfs_orphan_del(NULL, inode); + goto no_delete_lock; + } + + btrfs_orphan_del(trans, inode); + + nr = trans->blocks_used; + clear_inode(inode); + + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); + return; + +no_delete_lock: + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(root, nr); +no_delete: + clear_inode(inode); +} + +/* + * this returns the key found in the dir entry in the location pointer. + * If no dir entries were found, location->objectid is 0. + */ +static int btrfs_inode_by_name(struct inode *dir, struct dentry *dentry, + struct btrfs_key *location) +{ + const char *name = dentry->d_name.name; + int namelen = dentry->d_name.len; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(dir)->root; + int ret = 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + di = btrfs_lookup_dir_item(NULL, root, path, dir->i_ino, name, + namelen, 0); + if (IS_ERR(di)) + ret = PTR_ERR(di); + + if (!di || IS_ERR(di)) + goto out_err; + + btrfs_dir_item_key_to_cpu(path->nodes[0], di, location); +out: + btrfs_free_path(path); + return ret; +out_err: + location->objectid = 0; + goto out; +} + +/* + * when we hit a tree root in a directory, the btrfs part of the inode + * needs to be changed to reflect the root directory of the tree root. This + * is kind of like crossing a mount point. 
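+ *
+ * Concretely, a BTRFS_ROOT_ITEM_KEY location is rewritten into an
+ * INODE_ITEM_KEY that points at the subvolume root's dirid, and *sub_root
+ * is switched to the freshly read fs root.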
+ */ +static int fixup_tree_root_location(struct btrfs_root *root, + struct btrfs_key *location, + struct btrfs_root **sub_root, + struct dentry *dentry) +{ + struct btrfs_root_item *ri; + + if (btrfs_key_type(location) != BTRFS_ROOT_ITEM_KEY) + return 0; + if (location->objectid == BTRFS_ROOT_TREE_OBJECTID) + return 0; + + *sub_root = btrfs_read_fs_root(root->fs_info, location, + dentry->d_name.name, + dentry->d_name.len); + if (IS_ERR(*sub_root)) + return PTR_ERR(*sub_root); + + ri = &(*sub_root)->root_item; + location->objectid = btrfs_root_dirid(ri); + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + location->offset = 0; + + return 0; +} + +static noinline void init_btrfs_i(struct inode *inode) +{ + struct btrfs_inode *bi = BTRFS_I(inode); + + bi->i_acl = NULL; + bi->i_default_acl = NULL; + + bi->generation = 0; + bi->sequence = 0; + bi->last_trans = 0; + bi->logged_trans = 0; + bi->delalloc_bytes = 0; + bi->disk_i_size = 0; + bi->flags = 0; + bi->index_cnt = (u64)-1; + bi->log_dirty_trans = 0; + extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_tree, + inode->i_mapping, GFP_NOFS); + extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree, + inode->i_mapping, GFP_NOFS); + INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes); + btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree); + mutex_init(&BTRFS_I(inode)->extent_mutex); + mutex_init(&BTRFS_I(inode)->log_mutex); +} + +static int btrfs_init_locked_inode(struct inode *inode, void *p) +{ + struct btrfs_iget_args *args = p; + inode->i_ino = args->ino; + init_btrfs_i(inode); + BTRFS_I(inode)->root = args->root; + return 0; +} + +static int btrfs_find_actor(struct inode *inode, void *opaque) +{ + struct btrfs_iget_args *args = opaque; + return args->ino == inode->i_ino && + args->root == BTRFS_I(inode)->root; +} + +struct inode *btrfs_ilookup(struct super_block *s, u64 objectid, + struct btrfs_root *root, int wait) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + if (wait) { + inode = ilookup5(s, objectid, btrfs_find_actor, + (void *)&args); + } else { + inode = ilookup5_nowait(s, objectid, btrfs_find_actor, + (void *)&args); + } + return inode; +} + +struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid, + struct btrfs_root *root) +{ + struct inode *inode; + struct btrfs_iget_args args; + args.ino = objectid; + args.root = root; + + inode = iget5_locked(s, objectid, btrfs_find_actor, + btrfs_init_locked_inode, + (void *)&args); + return inode; +} + +/* Get an inode object given its location and corresponding root. 
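+ * A cached inode is returned as is; a freshly allocated one is filled in
+ * by btrfs_read_locked_inode() before it is unlocked.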
+ * Returns in *is_new if the inode was read from disk + */ +struct inode *btrfs_iget(struct super_block *s, struct btrfs_key *location, + struct btrfs_root *root, int *is_new) +{ + struct inode *inode; + + inode = btrfs_iget_locked(s, location->objectid, root); + if (!inode) + return ERR_PTR(-EACCES); + + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + memcpy(&BTRFS_I(inode)->location, location, sizeof(*location)); + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + if (is_new) + *is_new = 1; + } else { + if (is_new) + *is_new = 0; + } + + return inode; +} + +struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) +{ + struct inode *inode; + struct btrfs_inode *bi = BTRFS_I(dir); + struct btrfs_root *root = bi->root; + struct btrfs_root *sub_root = root; + struct btrfs_key location; + int ret, new; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + ret = btrfs_inode_by_name(dir, dentry, &location); + + if (ret < 0) + return ERR_PTR(ret); + + inode = NULL; + if (location.objectid) { + ret = fixup_tree_root_location(root, &location, &sub_root, + dentry); + if (ret < 0) + return ERR_PTR(ret); + if (ret > 0) + return ERR_PTR(-ENOENT); + inode = btrfs_iget(dir->i_sb, &location, sub_root, &new); + if (IS_ERR(inode)) + return ERR_CAST(inode); + } + return inode; +} + +static struct dentry *btrfs_lookup(struct inode *dir, struct dentry *dentry, + struct nameidata *nd) +{ + struct inode *inode; + + if (dentry->d_name.len > BTRFS_NAME_LEN) + return ERR_PTR(-ENAMETOOLONG); + + inode = btrfs_lookup_dentry(dir, dentry); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_splice_alias(inode, dentry); +} + +static unsigned char btrfs_filetype_table[] = { + DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK, DT_FIFO, DT_SOCK, DT_LNK +}; + +static int btrfs_real_readdir(struct file *filp, void *dirent, + filldir_t filldir) +{ + struct inode *inode = filp->f_dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_item *item; + struct btrfs_dir_item *di; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + int advance; + unsigned char d_type; + int over = 0; + u32 di_cur; + u32 di_total; + u32 di_len; + int key_type = BTRFS_DIR_INDEX_KEY; + char tmp_name[32]; + char *name_ptr; + int name_len; + + /* FIXME, use a real flag for deciding about the key type */ + if (root->fs_info->tree_root == root) + key_type = BTRFS_DIR_ITEM_KEY; + + /* special case for "." 
*/ + if (filp->f_pos == 0) { + over = filldir(dirent, ".", 1, + 1, inode->i_ino, + DT_DIR); + if (over) + return 0; + filp->f_pos = 1; + } + /* special case for .., just use the back ref */ + if (filp->f_pos == 1) { + u64 pino = parent_ino(filp->f_path.dentry); + over = filldir(dirent, "..", 2, + 2, pino, DT_DIR); + if (over) + return 0; + filp->f_pos = 2; + } + path = btrfs_alloc_path(); + path->reada = 2; + + btrfs_set_key_type(&key, key_type); + key.offset = filp->f_pos; + key.objectid = inode->i_ino; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + advance = 0; + + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (advance || slot >= nritems) { + if (slot >= nritems - 1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + slot++; + path->slots[0]++; + } + } + + advance = 1; + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != key_type) + break; + if (found_key.offset < filp->f_pos) + continue; + + filp->f_pos = found_key.offset; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + di_cur = 0; + di_total = btrfs_item_size(leaf, item); + + while (di_cur < di_total) { + struct btrfs_key location; + + name_len = btrfs_dir_name_len(leaf, di); + if (name_len <= sizeof(tmp_name)) { + name_ptr = tmp_name; + } else { + name_ptr = kmalloc(name_len, GFP_NOFS); + if (!name_ptr) { + ret = -ENOMEM; + goto err; + } + } + read_extent_buffer(leaf, name_ptr, + (unsigned long)(di + 1), name_len); + + d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)]; + btrfs_dir_item_key_to_cpu(leaf, di, &location); + + /* is this a reference to our own snapshot? If so + * skip it + */ + if (location.type == BTRFS_ROOT_ITEM_KEY && + location.objectid == root->root_key.objectid) { + over = 0; + goto skip; + } + over = filldir(dirent, name_ptr, name_len, + found_key.offset, location.objectid, + d_type); + +skip: + if (name_ptr != tmp_name) + kfree(name_ptr); + + if (over) + goto nopos; + di_len = btrfs_dir_name_len(leaf, di) + + btrfs_dir_data_len(leaf, di) + sizeof(*di); + di_cur += di_len; + di = (struct btrfs_dir_item *)((char *)di + di_len); + } + } + + /* Reached end of directory/root. Bump pos past the last item. */ + if (key_type == BTRFS_DIR_INDEX_KEY) + filp->f_pos = INT_LIMIT(typeof(filp->f_pos)); + else + filp->f_pos++; +nopos: + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +int btrfs_write_inode(struct inode *inode, int wait) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (root->fs_info->btree_inode == inode) + return 0; + + if (wait) { + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + ret = btrfs_commit_transaction(trans, root); + } + return ret; +} + +/* + * This is somewhat expensive, updating the tree every time the + * inode changes. But, it is most likely to find the inode in cache. + * FIXME, needs more benchmarking...there are no reasons other than performance + * to keep or drop this code. 
+ */ +void btrfs_dirty_inode(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + + trans = btrfs_join_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + btrfs_update_inode(trans, root, inode); + btrfs_end_transaction(trans, root); +} + +/* + * find the highest existing sequence number in a directory + * and then set the in-memory index_cnt variable to reflect + * free sequence numbers + */ +static int btrfs_set_inode_index_count(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key key, found_key; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret; + + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + /* FIXME: we should be able to handle this */ + if (ret == 0) + goto out; + ret = 0; + + /* + * MAGIC NUMBER EXPLANATION: + * since we search a directory based on f_pos we have to start at 2 + * since '.' and '..' have f_pos of 0 and 1 respectively, so everybody + * else has to start at 2 + */ + if (path->slots[0] == 0) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + path->slots[0]--; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + if (found_key.objectid != inode->i_ino || + btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + BTRFS_I(inode)->index_cnt = 2; + goto out; + } + + BTRFS_I(inode)->index_cnt = found_key.offset + 1; +out: + btrfs_free_path(path); + return ret; +} + +/* + * helper to find a free sequence number in a given directory. This current + * code is very simple, later versions will do smarter things in the btree + */ +int btrfs_set_inode_index(struct inode *dir, u64 *index) +{ + int ret = 0; + + if (BTRFS_I(dir)->index_cnt == (u64)-1) { + ret = btrfs_set_inode_index_count(dir); + if (ret) + return ret; + } + + *index = BTRFS_I(dir)->index_cnt; + BTRFS_I(dir)->index_cnt++; + + return ret; +} + +static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *dir, + const char *name, int name_len, + u64 ref_objectid, u64 objectid, + u64 alloc_hint, int mode, u64 *index) +{ + struct inode *inode; + struct btrfs_inode_item *inode_item; + struct btrfs_key *location; + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + struct btrfs_key key[2]; + u32 sizes[2]; + unsigned long ptr; + int ret; + int owner; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + inode = new_inode(root->fs_info->sb); + if (!inode) + return ERR_PTR(-ENOMEM); + + if (dir) { + ret = btrfs_set_inode_index(dir, index); + if (ret) + return ERR_PTR(ret); + } + /* + * index_cnt is ignored for everything but a dir, + * btrfs_get_inode_index_count has an explanation for the magic + * number + */ + init_btrfs_i(inode); + BTRFS_I(inode)->index_cnt = 2; + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->generation = trans->transid; + + if (mode & S_IFDIR) + owner = 0; + else + owner = 1; + BTRFS_I(inode)->block_group = + btrfs_find_block_group(root, 0, alloc_hint, owner); + if ((mode & S_IFREG)) { + if (btrfs_test_opt(root, NODATASUM)) + btrfs_set_flag(inode, NODATASUM); + if (btrfs_test_opt(root, NODATACOW)) + btrfs_set_flag(inode, NODATACOW); + } + + key[0].objectid = objectid; + btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].offset = 0; + + key[1].objectid = objectid; + 
btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].offset = ref_objectid; + + sizes[0] = sizeof(struct btrfs_inode_item); + sizes[1] = name_len + sizeof(*ref); + + ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2); + if (ret != 0) + goto fail; + + if (objectid > root->highest_inode) + root->highest_inode = objectid; + + inode->i_uid = current_fsuid(); + inode->i_gid = current_fsgid(); + inode->i_mode = mode; + inode->i_ino = objectid; + inode_set_bytes(inode, 0); + inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode_item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + fill_inode_item(trans, path->nodes[0], inode_item, inode); + + ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1, + struct btrfs_inode_ref); + btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len); + btrfs_set_inode_ref_index(path->nodes[0], ref, *index); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(path->nodes[0], name, ptr, name_len); + + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_free_path(path); + + location = &BTRFS_I(inode)->location; + location->objectid = objectid; + location->offset = 0; + btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + + insert_inode_hash(inode); + return inode; +fail: + if (dir) + BTRFS_I(dir)->index_cnt--; + btrfs_free_path(path); + return ERR_PTR(ret); +} + +static inline u8 btrfs_inode_type(struct inode *inode) +{ + return btrfs_type_by_mode[(inode->i_mode & S_IFMT) >> S_SHIFT]; +} + +/* + * utility function to add 'inode' into 'parent_inode' with + * a give name and a given sequence number. + * if 'add_backref' is true, also insert a backref from the + * inode to the parent directory. + */ +int btrfs_add_link(struct btrfs_trans_handle *trans, + struct inode *parent_inode, struct inode *inode, + const char *name, int name_len, int add_backref, u64 index) +{ + int ret; + struct btrfs_key key; + struct btrfs_root *root = BTRFS_I(parent_inode)->root; + + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.offset = 0; + + ret = btrfs_insert_dir_item(trans, root, name, name_len, + parent_inode->i_ino, + &key, btrfs_inode_type(inode), + index); + if (ret == 0) { + if (add_backref) { + ret = btrfs_insert_inode_ref(trans, root, + name, name_len, + inode->i_ino, + parent_inode->i_ino, + index); + } + btrfs_i_size_write(parent_inode, parent_inode->i_size + + name_len * 2); + parent_inode->i_mtime = parent_inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, parent_inode); + } + return ret; +} + +static int btrfs_add_nondir(struct btrfs_trans_handle *trans, + struct dentry *dentry, struct inode *inode, + int backref, u64 index) +{ + int err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, backref, index); + if (!err) { + d_instantiate(dentry, inode); + return 0; + } + if (err > 0) + err = -EEXIST; + return err; +} + +static int btrfs_mknod(struct inode *dir, struct dentry *dentry, + int mode, dev_t rdev) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + unsigned long nr = 0; + u64 index = 0; + + if (!new_valid_dev(rdev)) + return -EINVAL; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + 
err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, mode, &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_op = &btrfs_special_inode_operations; + init_special_inode(inode, inode->i_mode, rdev); + btrfs_update_inode(trans, root, inode); + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_create(struct inode *dir, struct dentry *dentry, + int mode, struct nameidata *nd) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + unsigned long nr = 0; + u64 objectid; + u64 index = 0; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, + objectid, BTRFS_I(dir)->block_group, mode, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct inode *inode = old_dentry->d_inode; + u64 index; + unsigned long nr = 0; + int err; + int drop_inode = 0; + + if (inode->i_nlink == 0) + return -ENOENT; + + btrfs_inc_nlink(inode); + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto fail; + err = btrfs_set_inode_index(dir, &index); + if (err) + goto fail; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, dir); + atomic_inc(&inode->i_count); + + err = btrfs_add_nondir(trans, dentry, inode, 1, index); + + if (err) + drop_inode = 1; + + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, dir); + err = btrfs_update_inode(trans, root, inode); + + if (err) + drop_inode = 1; + + nr = 
trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) +{ + struct inode *inode = NULL; + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + int err = 0; + int drop_on_err = 0; + u64 objectid = 0; + u64 index = 0; + unsigned long nr = 1; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto out_unlock; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + if (IS_ERR(trans)) { + err = PTR_ERR(trans); + goto out_unlock; + } + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFDIR | mode, + &index); + if (IS_ERR(inode)) { + err = PTR_ERR(inode); + goto out_fail; + } + + drop_on_err = 1; + + err = btrfs_init_acl(inode, dir); + if (err) + goto out_fail; + + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + btrfs_set_trans_block_group(trans, inode); + + btrfs_i_size_write(inode, 0); + err = btrfs_update_inode(trans, root, inode); + if (err) + goto out_fail; + + err = btrfs_add_link(trans, dentry->d_parent->d_inode, + inode, dentry->d_name.name, + dentry->d_name.len, 0, index); + if (err) + goto out_fail; + + d_instantiate(dentry, inode); + drop_on_err = 0; + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + +out_fail: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); + +out_unlock: + if (drop_on_err) + iput(inode); + btrfs_btree_balance_dirty(root, nr); + return err; +} + +/* helper for btfs_get_extent. Given an existing extent in the tree, + * and an extent that you want to insert, deal with overlap and insert + * the new extent into the tree. 
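+ * The new extent is clipped to [map_start, map_start + map_len) before it is inserted.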
+ */ +static int merge_extent_mapping(struct extent_map_tree *em_tree, + struct extent_map *existing, + struct extent_map *em, + u64 map_start, u64 map_len) +{ + u64 start_diff; + + BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); + start_diff = map_start - em->start; + em->start = map_start; + em->len = map_len; + if (em->block_start < EXTENT_MAP_LAST_BYTE && + !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { + em->block_start += start_diff; + em->block_len -= start_diff; + } + return add_extent_mapping(em_tree, em); +} + +static noinline int uncompress_inline(struct btrfs_path *path, + struct inode *inode, struct page *page, + size_t pg_offset, u64 extent_offset, + struct btrfs_file_extent_item *item) +{ + int ret; + struct extent_buffer *leaf = path->nodes[0]; + char *tmp; + size_t max_size; + unsigned long inline_size; + unsigned long ptr; + + WARN_ON(pg_offset != 0); + max_size = btrfs_file_extent_ram_bytes(leaf, item); + inline_size = btrfs_file_extent_inline_item_len(leaf, + btrfs_item_nr(leaf, path->slots[0])); + tmp = kmalloc(inline_size, GFP_NOFS); + ptr = btrfs_file_extent_inline_start(item); + + read_extent_buffer(leaf, tmp, ptr, inline_size); + + max_size = min_t(unsigned long, PAGE_CACHE_SIZE, max_size); + ret = btrfs_zlib_decompress(tmp, page, extent_offset, + inline_size, max_size); + if (ret) { + char *kaddr = kmap_atomic(page, KM_USER0); + unsigned long copy_size = min_t(u64, + PAGE_CACHE_SIZE - pg_offset, + max_size - extent_offset); + memset(kaddr + pg_offset, 0, copy_size); + kunmap_atomic(kaddr, KM_USER0); + } + kfree(tmp); + return 0; +} + +/* + * a bit scary, this does extent mapping from logical file offset to the disk. + * the ugly parts come from merging extents from the disk with the in-ram + * representation. This gets more complex because of the data=ordered code, + * where the in-ram extents might be locked pending data=ordered completion. + * + * This also copies inline extents directly into the page. 
+ */ + +struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page, + size_t pg_offset, u64 start, u64 len, + int create) +{ + int ret; + int err = 0; + u64 bytenr; + u64 extent_start = 0; + u64 extent_end = 0; + u64 objectid = inode->i_ino; + u32 found_type; + struct btrfs_path *path = NULL; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_file_extent_item *item; + struct extent_buffer *leaf; + struct btrfs_key found_key; + struct extent_map *em = NULL; + struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_trans_handle *trans = NULL; + int compressed; + +again: + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, start, len); + if (em) + em->bdev = root->fs_info->fs_devices->latest_bdev; + spin_unlock(&em_tree->lock); + + if (em) { + if (em->start > start || em->start + em->len <= start) + free_extent_map(em); + else if (em->block_start == EXTENT_MAP_INLINE && page) + free_extent_map(em); + else + goto out; + } + em = alloc_extent_map(GFP_NOFS); + if (!em) { + err = -ENOMEM; + goto out; + } + em->bdev = root->fs_info->fs_devices->latest_bdev; + em->start = EXTENT_MAP_HOLE; + em->orig_start = EXTENT_MAP_HOLE; + em->len = (u64)-1; + em->block_len = (u64)-1; + + if (!path) { + path = btrfs_alloc_path(); + BUG_ON(!path); + } + + ret = btrfs_lookup_file_extent(trans, root, path, + objectid, start, trans != NULL); + if (ret < 0) { + err = ret; + goto out; + } + + if (ret != 0) { + if (path->slots[0] == 0) + goto not_found; + path->slots[0]--; + } + + leaf = path->nodes[0]; + item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + /* are we inside the extent that was found? */ + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + found_type = btrfs_key_type(&found_key); + if (found_key.objectid != objectid || + found_type != BTRFS_EXTENT_DATA_KEY) { + goto not_found; + } + + found_type = btrfs_file_extent_type(leaf, item); + extent_start = found_key.offset; + compressed = btrfs_file_extent_compression(leaf, item); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + extent_end = extent_start + + btrfs_file_extent_num_bytes(leaf, item); + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size_t size; + size = btrfs_file_extent_inline_len(leaf, item); + extent_end = (extent_start + size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + } + + if (start >= extent_end) { + path->slots[0]++; + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + err = ret; + goto out; + } + if (ret > 0) + goto not_found; + leaf = path->nodes[0]; + } + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + if (found_key.objectid != objectid || + found_key.type != BTRFS_EXTENT_DATA_KEY) + goto not_found; + if (start + len <= found_key.offset) + goto not_found; + em->start = start; + em->len = found_key.offset - start; + goto not_found_em; + } + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + em->start = extent_start; + em->len = extent_end - extent_start; + em->orig_start = extent_start - + btrfs_file_extent_offset(leaf, item); + bytenr = btrfs_file_extent_disk_bytenr(leaf, item); + if (bytenr == 0) { + em->block_start = EXTENT_MAP_HOLE; + goto insert; + } + if (compressed) { + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + em->block_start = bytenr; + em->block_len = 
btrfs_file_extent_disk_num_bytes(leaf, + item); + } else { + bytenr += btrfs_file_extent_offset(leaf, item); + em->block_start = bytenr; + em->block_len = em->len; + if (found_type == BTRFS_FILE_EXTENT_PREALLOC) + set_bit(EXTENT_FLAG_PREALLOC, &em->flags); + } + goto insert; + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + unsigned long ptr; + char *map; + size_t size; + size_t extent_offset; + size_t copy_size; + + em->block_start = EXTENT_MAP_INLINE; + if (!page || create) { + em->start = extent_start; + em->len = extent_end - extent_start; + goto out; + } + + size = btrfs_file_extent_inline_len(leaf, item); + extent_offset = page_offset(page) + pg_offset - extent_start; + copy_size = min_t(u64, PAGE_CACHE_SIZE - pg_offset, + size - extent_offset); + em->start = extent_start + extent_offset; + em->len = (copy_size + root->sectorsize - 1) & + ~((u64)root->sectorsize - 1); + em->orig_start = EXTENT_MAP_INLINE; + if (compressed) + set_bit(EXTENT_FLAG_COMPRESSED, &em->flags); + ptr = btrfs_file_extent_inline_start(item) + extent_offset; + if (create == 0 && !PageUptodate(page)) { + if (btrfs_file_extent_compression(leaf, item) == + BTRFS_COMPRESS_ZLIB) { + ret = uncompress_inline(path, inode, page, + pg_offset, + extent_offset, item); + BUG_ON(ret); + } else { + map = kmap(page); + read_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + } + flush_dcache_page(page); + } else if (create && PageUptodate(page)) { + if (!trans) { + kunmap(page); + free_extent_map(em); + em = NULL; + btrfs_release_path(root, path); + trans = btrfs_join_transaction(root, 1); + goto again; + } + map = kmap(page); + write_extent_buffer(leaf, map + pg_offset, ptr, + copy_size); + kunmap(page); + btrfs_mark_buffer_dirty(leaf); + } + set_extent_uptodate(io_tree, em->start, + extent_map_end(em) - 1, GFP_NOFS); + goto insert; + } else { + printk(KERN_ERR "btrfs unknown found_type %d\n", found_type); + WARN_ON(1); + } +not_found: + em->start = start; + em->len = len; +not_found_em: + em->block_start = EXTENT_MAP_HOLE; + set_bit(EXTENT_FLAG_VACANCY, &em->flags); +insert: + btrfs_release_path(root, path); + if (em->start > start || extent_map_end(em) <= start) { + printk(KERN_ERR "Btrfs: bad extent! em: [%llu %llu] passed " + "[%llu %llu]\n", (unsigned long long)em->start, + (unsigned long long)em->len, + (unsigned long long)start, + (unsigned long long)len); + err = -EIO; + goto out; + } + + err = 0; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + /* it is possible that someone inserted the extent into the tree + * while we had the lock dropped. 
It is also possible that + * an overlapping map exists in the tree + */ + if (ret == -EEXIST) { + struct extent_map *existing; + + ret = 0; + + existing = lookup_extent_mapping(em_tree, start, len); + if (existing && (existing->start > start || + existing->start + existing->len <= start)) { + free_extent_map(existing); + existing = NULL; + } + if (!existing) { + existing = lookup_extent_mapping(em_tree, em->start, + em->len); + if (existing) { + err = merge_extent_mapping(em_tree, existing, + em, start, + root->sectorsize); + free_extent_map(existing); + if (err) { + free_extent_map(em); + em = NULL; + } + } else { + err = -EIO; + free_extent_map(em); + em = NULL; + } + } else { + free_extent_map(em); + em = existing; + err = 0; + } + } + spin_unlock(&em_tree->lock); +out: + if (path) + btrfs_free_path(path); + if (trans) { + ret = btrfs_end_transaction(trans, root); + if (!err) + err = ret; + } + if (err) { + free_extent_map(em); + WARN_ON(1); + return ERR_PTR(err); + } + return em; +} + +static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, + const struct iovec *iov, loff_t offset, + unsigned long nr_segs) +{ + return -EINVAL; +} + +static sector_t btrfs_bmap(struct address_space *mapping, sector_t iblock) +{ + return extent_bmap(mapping, iblock, btrfs_get_extent); +} + +int btrfs_readpage(struct file *file, struct page *page) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_read_full_page(tree, page, btrfs_get_extent); +} + +static int btrfs_writepage(struct page *page, struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + + if (current->flags & PF_MEMALLOC) { + redirty_page_for_writepage(wbc, page); + unlock_page(page); + return 0; + } + tree = &BTRFS_I(page->mapping->host)->io_tree; + return extent_write_full_page(tree, page, btrfs_get_extent, wbc); +} + +int btrfs_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct extent_io_tree *tree; + + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_writepages(tree, mapping, btrfs_get_extent, wbc); +} + +static int +btrfs_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + struct extent_io_tree *tree; + tree = &BTRFS_I(mapping->host)->io_tree; + return extent_readpages(tree, mapping, pages, nr_pages, + btrfs_get_extent); +} +static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + struct extent_io_tree *tree; + struct extent_map_tree *map; + int ret; + + tree = &BTRFS_I(page->mapping->host)->io_tree; + map = &BTRFS_I(page->mapping->host)->extent_tree; + ret = try_release_extent_mapping(map, tree, page, gfp_flags); + if (ret == 1) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } + return ret; +} + +static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) +{ + if (PageWriteback(page) || PageDirty(page)) + return 0; + return __btrfs_releasepage(page, gfp_flags); +} + +static void btrfs_invalidatepage(struct page *page, unsigned long offset) +{ + struct extent_io_tree *tree; + struct btrfs_ordered_extent *ordered; + u64 page_start = page_offset(page); + u64 page_end = page_start + PAGE_CACHE_SIZE - 1; + + wait_on_page_writeback(page); + tree = &BTRFS_I(page->mapping->host)->io_tree; + if (offset) { + btrfs_releasepage(page, GFP_NOFS); + return; + } + + lock_extent(tree, page_start, page_end, GFP_NOFS); + ordered = btrfs_lookup_ordered_extent(page->mapping->host, + page_offset(page)); + if (ordered) { + /* + * IO on 
this page will never be started, so we need + * to account for any ordered extents now + */ + clear_extent_bit(tree, page_start, page_end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_LOCKED, 1, 0, GFP_NOFS); + btrfs_finish_ordered_io(page->mapping->host, + page_start, page_end); + btrfs_put_ordered_extent(ordered); + lock_extent(tree, page_start, page_end, GFP_NOFS); + } + clear_extent_bit(tree, page_start, page_end, + EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_ORDERED, + 1, 1, GFP_NOFS); + __btrfs_releasepage(page, GFP_NOFS); + + ClearPageChecked(page); + if (PagePrivate(page)) { + ClearPagePrivate(page); + set_page_private(page, 0); + page_cache_release(page); + } +} + +/* + * btrfs_page_mkwrite() is not allowed to change the file size as it gets + * called from a page fault handler when a page is first dirtied. Hence we must + * be careful to check for EOF conditions here. We set the page up correctly + * for a written page which means we get ENOSPC checking when writing into + * holes and correct delalloc and unwritten extent mapping on filesystems that + * support these features. + * + * We are not allowed to take the i_mutex here so we have to play games to + * protect against truncate races as the page could now be beyond EOF. Because + * vmtruncate() writes the inode size before removing pages, once we have the + * page lock we can determine safely if the page is beyond EOF. If it is not + * beyond EOF, then the page is guaranteed safe against truncation until we + * unlock the page. + */ +int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) +{ + struct inode *inode = fdentry(vma->vm_file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + char *kaddr; + unsigned long zero_start; + loff_t size; + int ret; + u64 page_start; + u64 page_end; + + ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); + if (ret) + goto out; + + ret = -EINVAL; +again: + lock_page(page); + size = i_size_read(inode); + page_start = page_offset(page); + page_end = page_start + PAGE_CACHE_SIZE - 1; + + if ((page->mapping != inode->i_mapping) || + (page_start >= size)) { + /* page got truncated out from underneath us */ + goto out_unlock; + } + wait_on_page_writeback(page); + + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_extent_mapped(page); + + /* + * we can't set the delalloc bits if there are pending ordered + * extents. 
Drop our locks and wait for them to finish + */ + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + + btrfs_set_extent_delalloc(inode, page_start, page_end); + ret = 0; + + /* page is wholly or partially inside EOF */ + if (page_start + PAGE_CACHE_SIZE > size) + zero_start = size & ~PAGE_CACHE_MASK; + else + zero_start = PAGE_CACHE_SIZE; + + if (zero_start != PAGE_CACHE_SIZE) { + kaddr = kmap(page); + memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start); + flush_dcache_page(page); + kunmap(page); + } + ClearPageChecked(page); + set_page_dirty(page); + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + +out_unlock: + unlock_page(page); +out: + return ret; +} + +static void btrfs_truncate(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 mask = root->sectorsize - 1; + + if (!S_ISREG(inode->i_mode)) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + btrfs_truncate_page(inode->i_mapping, inode->i_size); + btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1); + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + btrfs_i_size_write(inode, inode->i_size); + + ret = btrfs_orphan_add(trans, inode); + if (ret) + goto out; + /* FIXME, add redo link to tree so we don't leak on crash */ + ret = btrfs_truncate_inode_items(trans, root, inode, inode->i_size, + BTRFS_EXTENT_DATA_KEY); + btrfs_update_inode(trans, root, inode); + + ret = btrfs_orphan_del(trans, inode); + BUG_ON(ret); + +out: + nr = trans->blocks_used; + ret = btrfs_end_transaction_throttle(trans, root); + BUG_ON(ret); + btrfs_btree_balance_dirty(root, nr); +} + +/* + * create a new subvolume directory/inode (helper for the ioctl). + */ +int btrfs_create_subvol_root(struct btrfs_trans_handle *trans, + struct btrfs_root *new_root, struct dentry *dentry, + u64 new_dirid, u64 alloc_hint) +{ + struct inode *inode; + int error; + u64 index = 0; + + inode = btrfs_new_inode(trans, new_root, NULL, "..", 2, new_dirid, + new_dirid, alloc_hint, S_IFDIR | 0700, &index); + if (IS_ERR(inode)) + return PTR_ERR(inode); + inode->i_op = &btrfs_dir_inode_operations; + inode->i_fop = &btrfs_dir_file_operations; + + inode->i_nlink = 1; + btrfs_i_size_write(inode, 0); + + error = btrfs_update_inode(trans, new_root, inode); + if (error) + return error; + + d_instantiate(dentry, inode); + return 0; +} + +/* helper function for file defrag and space balancing. 
This + * forces readahead on a given range of bytes in an inode + */ +unsigned long btrfs_force_ra(struct address_space *mapping, + struct file_ra_state *ra, struct file *file, + pgoff_t offset, pgoff_t last_index) +{ + pgoff_t req_size = last_index - offset + 1; + + page_cache_sync_readahead(mapping, ra, file, offset, req_size); + return offset + req_size; +} + +struct inode *btrfs_alloc_inode(struct super_block *sb) +{ + struct btrfs_inode *ei; + + ei = kmem_cache_alloc(btrfs_inode_cachep, GFP_NOFS); + if (!ei) + return NULL; + ei->last_trans = 0; + ei->logged_trans = 0; + btrfs_ordered_inode_tree_init(&ei->ordered_tree); + ei->i_acl = BTRFS_ACL_NOT_CACHED; + ei->i_default_acl = BTRFS_ACL_NOT_CACHED; + INIT_LIST_HEAD(&ei->i_orphan); + return &ei->vfs_inode; +} + +void btrfs_destroy_inode(struct inode *inode) +{ + struct btrfs_ordered_extent *ordered; + WARN_ON(!list_empty(&inode->i_dentry)); + WARN_ON(inode->i_data.nrpages); + + if (BTRFS_I(inode)->i_acl && + BTRFS_I(inode)->i_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(BTRFS_I(inode)->i_acl); + if (BTRFS_I(inode)->i_default_acl && + BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED) + posix_acl_release(BTRFS_I(inode)->i_default_acl); + + spin_lock(&BTRFS_I(inode)->root->list_lock); + if (!list_empty(&BTRFS_I(inode)->i_orphan)) { + printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan" + " list\n", inode->i_ino); + dump_stack(); + } + spin_unlock(&BTRFS_I(inode)->root->list_lock); + + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + if (!ordered) + break; + else { + printk(KERN_ERR "btrfs found ordered " + "extent %llu %llu on inode cleanup\n", + (unsigned long long)ordered->file_offset, + (unsigned long long)ordered->len); + btrfs_remove_ordered_extent(inode, ordered); + btrfs_put_ordered_extent(ordered); + btrfs_put_ordered_extent(ordered); + } + } + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); +} + +static void init_once(void *foo) +{ + struct btrfs_inode *ei = (struct btrfs_inode *) foo; + + inode_init_once(&ei->vfs_inode); +} + +void btrfs_destroy_cachep(void) +{ + if (btrfs_inode_cachep) + kmem_cache_destroy(btrfs_inode_cachep); + if (btrfs_trans_handle_cachep) + kmem_cache_destroy(btrfs_trans_handle_cachep); + if (btrfs_transaction_cachep) + kmem_cache_destroy(btrfs_transaction_cachep); + if (btrfs_bit_radix_cachep) + kmem_cache_destroy(btrfs_bit_radix_cachep); + if (btrfs_path_cachep) + kmem_cache_destroy(btrfs_path_cachep); +} + +struct kmem_cache *btrfs_cache_create(const char *name, size_t size, + unsigned long extra_flags, + void (*ctor)(void *)) +{ + return kmem_cache_create(name, size, 0, (SLAB_RECLAIM_ACCOUNT | + SLAB_MEM_SPREAD | extra_flags), ctor); +} + +int btrfs_init_cachep(void) +{ + btrfs_inode_cachep = btrfs_cache_create("btrfs_inode_cache", + sizeof(struct btrfs_inode), + 0, init_once); + if (!btrfs_inode_cachep) + goto fail; + btrfs_trans_handle_cachep = + btrfs_cache_create("btrfs_trans_handle_cache", + sizeof(struct btrfs_trans_handle), + 0, NULL); + if (!btrfs_trans_handle_cachep) + goto fail; + btrfs_transaction_cachep = btrfs_cache_create("btrfs_transaction_cache", + sizeof(struct btrfs_transaction), + 0, NULL); + if (!btrfs_transaction_cachep) + goto fail; + btrfs_path_cachep = btrfs_cache_create("btrfs_path_cache", + sizeof(struct btrfs_path), + 0, NULL); + if (!btrfs_path_cachep) + goto fail; + btrfs_bit_radix_cachep = btrfs_cache_create("btrfs_radix", 256, + SLAB_DESTROY_BY_RCU, NULL); + if 
(!btrfs_bit_radix_cachep) + goto fail; + return 0; +fail: + btrfs_destroy_cachep(); + return -ENOMEM; +} + +static int btrfs_getattr(struct vfsmount *mnt, + struct dentry *dentry, struct kstat *stat) +{ + struct inode *inode = dentry->d_inode; + generic_fillattr(inode, stat); + stat->dev = BTRFS_I(inode)->root->anon_super.s_dev; + stat->blksize = PAGE_CACHE_SIZE; + stat->blocks = (inode_get_bytes(inode) + + BTRFS_I(inode)->delalloc_bytes) >> 9; + return 0; +} + +static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, + struct inode *new_dir, struct dentry *new_dentry) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(old_dir)->root; + struct inode *new_inode = new_dentry->d_inode; + struct inode *old_inode = old_dentry->d_inode; + struct timespec ctime = CURRENT_TIME; + u64 index = 0; + int ret; + + /* we're not allowed to rename between subvolumes */ + if (BTRFS_I(old_inode)->root->root_key.objectid != + BTRFS_I(new_dir)->root->root_key.objectid) + return -EXDEV; + + if (S_ISDIR(old_inode->i_mode) && new_inode && + new_inode->i_size > BTRFS_EMPTY_DIR_SIZE) { + return -ENOTEMPTY; + } + + /* to rename a snapshot or subvolume, we need to juggle the + * backrefs. This isn't coded yet + */ + if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) + return -EXDEV; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto out_unlock; + + trans = btrfs_start_transaction(root, 1); + + btrfs_set_trans_block_group(trans, new_dir); + + btrfs_inc_nlink(old_dentry->d_inode); + old_dir->i_ctime = old_dir->i_mtime = ctime; + new_dir->i_ctime = new_dir->i_mtime = ctime; + old_inode->i_ctime = ctime; + + ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode, + old_dentry->d_name.name, + old_dentry->d_name.len); + if (ret) + goto out_fail; + + if (new_inode) { + new_inode->i_ctime = CURRENT_TIME; + ret = btrfs_unlink_inode(trans, root, new_dir, + new_dentry->d_inode, + new_dentry->d_name.name, + new_dentry->d_name.len); + if (ret) + goto out_fail; + if (new_inode->i_nlink == 0) { + ret = btrfs_orphan_add(trans, new_dentry->d_inode); + if (ret) + goto out_fail; + } + + } + ret = btrfs_set_inode_index(new_dir, &index); + if (ret) + goto out_fail; + + ret = btrfs_add_link(trans, new_dentry->d_parent->d_inode, + old_inode, new_dentry->d_name.name, + new_dentry->d_name.len, 1, index); + if (ret) + goto out_fail; + +out_fail: + btrfs_end_transaction_throttle(trans, root); +out_unlock: + return ret; +} + +/* + * some fairly slow code that needs optimization. This walks the list + * of all the inodes with pending delalloc and forces them to disk. 
+ */ +int btrfs_start_delalloc_inodes(struct btrfs_root *root) +{ + struct list_head *head = &root->fs_info->delalloc_inodes; + struct btrfs_inode *binode; + struct inode *inode; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + spin_lock(&root->fs_info->delalloc_lock); + while (!list_empty(head)) { + binode = list_entry(head->next, struct btrfs_inode, + delalloc_inodes); + inode = igrab(&binode->vfs_inode); + if (!inode) + list_del_init(&binode->delalloc_inodes); + spin_unlock(&root->fs_info->delalloc_lock); + if (inode) { + filemap_flush(inode->i_mapping); + iput(inode); + } + cond_resched(); + spin_lock(&root->fs_info->delalloc_lock); + } + spin_unlock(&root->fs_info->delalloc_lock); + + /* the filemap_flush will queue IO into the worker threads, but + * we have to make sure the IO is actually started and that + * ordered extents get created before we return + */ + atomic_inc(&root->fs_info->async_submit_draining); + while (atomic_read(&root->fs_info->nr_async_submits) || + atomic_read(&root->fs_info->async_delalloc_pages)) { + wait_event(root->fs_info->async_submit_wait, + (atomic_read(&root->fs_info->nr_async_submits) == 0 && + atomic_read(&root->fs_info->async_delalloc_pages) == 0)); + } + atomic_dec(&root->fs_info->async_submit_draining); + return 0; +} + +static int btrfs_symlink(struct inode *dir, struct dentry *dentry, + const char *symname) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_path *path; + struct btrfs_key key; + struct inode *inode = NULL; + int err; + int drop_inode = 0; + u64 objectid; + u64 index = 0 ; + int name_len; + int datasize; + unsigned long ptr; + struct btrfs_file_extent_item *ei; + struct extent_buffer *leaf; + unsigned long nr = 0; + + name_len = strlen(symname) + 1; + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) + return -ENAMETOOLONG; + + err = btrfs_check_free_space(root, 1, 0); + if (err) + goto out_fail; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, dir); + + err = btrfs_find_free_objectid(trans, root, dir->i_ino, &objectid); + if (err) { + err = -ENOSPC; + goto out_unlock; + } + + inode = btrfs_new_inode(trans, root, dir, dentry->d_name.name, + dentry->d_name.len, + dentry->d_parent->d_inode->i_ino, objectid, + BTRFS_I(dir)->block_group, S_IFLNK|S_IRWXUGO, + &index); + err = PTR_ERR(inode); + if (IS_ERR(inode)) + goto out_unlock; + + err = btrfs_init_acl(inode, dir); + if (err) { + drop_inode = 1; + goto out_unlock; + } + + btrfs_set_trans_block_group(trans, inode); + err = btrfs_add_nondir(trans, dentry, inode, 0, index); + if (err) + drop_inode = 1; + else { + inode->i_mapping->a_ops = &btrfs_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode->i_fop = &btrfs_file_operations; + inode->i_op = &btrfs_file_inode_operations; + BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; + } + dir->i_sb->s_dirt = 1; + btrfs_update_inode_block_group(trans, inode); + btrfs_update_inode_block_group(trans, dir); + if (drop_inode) + goto out_unlock; + + path = btrfs_alloc_path(); + BUG_ON(!path); + key.objectid = inode->i_ino; + key.offset = 0; + btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + datasize = btrfs_file_extent_calc_inline_size(name_len); + err = btrfs_insert_empty_item(trans, root, path, &key, + datasize); + if (err) { + drop_inode = 1; + goto out_unlock; + } + leaf = path->nodes[0]; + ei = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + btrfs_set_file_extent_generation(leaf, ei, 
trans->transid); + btrfs_set_file_extent_type(leaf, ei, + BTRFS_FILE_EXTENT_INLINE); + btrfs_set_file_extent_encryption(leaf, ei, 0); + btrfs_set_file_extent_compression(leaf, ei, 0); + btrfs_set_file_extent_other_encoding(leaf, ei, 0); + btrfs_set_file_extent_ram_bytes(leaf, ei, name_len); + + ptr = btrfs_file_extent_inline_start(ei); + write_extent_buffer(leaf, symname, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + + inode->i_op = &btrfs_symlink_inode_operations; + inode->i_mapping->a_ops = &btrfs_symlink_aops; + inode->i_mapping->backing_dev_info = &root->fs_info->bdi; + inode_set_bytes(inode, name_len); + btrfs_i_size_write(inode, name_len - 1); + err = btrfs_update_inode(trans, root, inode); + if (err) + drop_inode = 1; + +out_unlock: + nr = trans->blocks_used; + btrfs_end_transaction_throttle(trans, root); +out_fail: + if (drop_inode) { + inode_dec_link_count(inode); + iput(inode); + } + btrfs_btree_balance_dirty(root, nr); + return err; +} + +static int prealloc_file_range(struct inode *inode, u64 start, u64 end, + u64 alloc_hint, int mode) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_key ins; + u64 alloc_size; + u64 cur_offset = start; + u64 num_bytes = end - start; + int ret = 0; + + trans = btrfs_join_transaction(root, 1); + BUG_ON(!trans); + btrfs_set_trans_block_group(trans, inode); + + while (num_bytes > 0) { + alloc_size = min(num_bytes, root->fs_info->max_extent); + ret = btrfs_reserve_extent(trans, root, alloc_size, + root->sectorsize, 0, alloc_hint, + (u64)-1, &ins, 1); + if (ret) { + WARN_ON(1); + goto out; + } + ret = insert_reserved_file_extent(trans, inode, + cur_offset, ins.objectid, + ins.offset, ins.offset, + ins.offset, 0, 0, 0, + BTRFS_FILE_EXTENT_PREALLOC); + BUG_ON(ret); + num_bytes -= ins.offset; + cur_offset += ins.offset; + alloc_hint = ins.objectid + ins.offset; + } +out: + if (cur_offset > start) { + inode->i_ctime = CURRENT_TIME; + btrfs_set_flag(inode, PREALLOC); + if (!(mode & FALLOC_FL_KEEP_SIZE) && + cur_offset > i_size_read(inode)) + btrfs_i_size_write(inode, cur_offset); + ret = btrfs_update_inode(trans, root, inode); + BUG_ON(ret); + } + + btrfs_end_transaction(trans, root); + return ret; +} + +static long btrfs_fallocate(struct inode *inode, int mode, + loff_t offset, loff_t len) +{ + u64 cur_offset; + u64 last_byte; + u64 alloc_start; + u64 alloc_end; + u64 alloc_hint = 0; + u64 mask = BTRFS_I(inode)->root->sectorsize - 1; + struct extent_map *em; + int ret; + + alloc_start = offset & ~mask; + alloc_end = (offset + len + mask) & ~mask; + + mutex_lock(&inode->i_mutex); + if (alloc_start > inode->i_size) { + ret = btrfs_cont_expand(inode, alloc_start); + if (ret) + goto out; + } + + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, + alloc_end - 1, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, + alloc_end - 1); + if (ordered && + ordered->file_offset + ordered->len > alloc_start && + ordered->file_offset < alloc_end) { + btrfs_put_ordered_extent(ordered); + unlock_extent(&BTRFS_I(inode)->io_tree, + alloc_start, alloc_end - 1, GFP_NOFS); + btrfs_wait_ordered_range(inode, alloc_start, + alloc_end - alloc_start); + } else { + if (ordered) + btrfs_put_ordered_extent(ordered); + break; + } + } + + cur_offset = alloc_start; + while (1) { + em = btrfs_get_extent(inode, NULL, 0, cur_offset, + alloc_end - cur_offset, 0); + BUG_ON(IS_ERR(em) || !em); + last_byte = min(extent_map_end(em), alloc_end); 
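+	/* round last_byte up to a sector boundary so holes are preallocated in whole sectors */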
+ last_byte = (last_byte + mask) & ~mask; + if (em->block_start == EXTENT_MAP_HOLE) { + ret = prealloc_file_range(inode, cur_offset, + last_byte, alloc_hint, mode); + if (ret < 0) { + free_extent_map(em); + break; + } + } + if (em->block_start <= EXTENT_MAP_LAST_BYTE) + alloc_hint = em->block_start; + free_extent_map(em); + + cur_offset = last_byte; + if (cur_offset >= alloc_end) { + ret = 0; + break; + } + } + unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, alloc_end - 1, + GFP_NOFS); +out: + mutex_unlock(&inode->i_mutex); + return ret; +} + +static int btrfs_set_page_dirty(struct page *page) +{ + return __set_page_dirty_nobuffers(page); +} + +static int btrfs_permission(struct inode *inode, int mask) +{ + if (btrfs_test_flag(inode, READONLY) && (mask & MAY_WRITE)) + return -EACCES; + return generic_permission(inode, mask, btrfs_check_acl); +} + +static struct inode_operations btrfs_dir_inode_operations = { + .getattr = btrfs_getattr, + .lookup = btrfs_lookup, + .create = btrfs_create, + .unlink = btrfs_unlink, + .link = btrfs_link, + .mkdir = btrfs_mkdir, + .rmdir = btrfs_rmdir, + .rename = btrfs_rename, + .symlink = btrfs_symlink, + .setattr = btrfs_setattr, + .mknod = btrfs_mknod, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, +}; +static struct inode_operations btrfs_dir_ro_inode_operations = { + .lookup = btrfs_lookup, + .permission = btrfs_permission, +}; +static struct file_operations btrfs_dir_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .readdir = btrfs_real_readdir, + .unlocked_ioctl = btrfs_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = btrfs_ioctl, +#endif + .release = btrfs_release_file, + .fsync = btrfs_sync_file, +}; + +static struct extent_io_ops btrfs_extent_io_ops = { + .fill_delalloc = run_delalloc_range, + .submit_bio_hook = btrfs_submit_bio_hook, + .merge_bio_hook = btrfs_merge_bio_hook, + .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .writepage_end_io_hook = btrfs_writepage_end_io_hook, + .writepage_start_hook = btrfs_writepage_start_hook, + .readpage_io_failed_hook = btrfs_io_failed_hook, + .set_bit_hook = btrfs_set_bit_hook, + .clear_bit_hook = btrfs_clear_bit_hook, +}; + +static struct address_space_operations btrfs_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .writepages = btrfs_writepages, + .readpages = btrfs_readpages, + .sync_page = block_sync_page, + .bmap = btrfs_bmap, + .direct_IO = btrfs_direct_IO, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, + .set_page_dirty = btrfs_set_page_dirty, +}; + +static struct address_space_operations btrfs_symlink_aops = { + .readpage = btrfs_readpage, + .writepage = btrfs_writepage, + .invalidatepage = btrfs_invalidatepage, + .releasepage = btrfs_releasepage, +}; + +static struct inode_operations btrfs_file_inode_operations = { + .truncate = btrfs_truncate, + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, + .permission = btrfs_permission, + .fallocate = btrfs_fallocate, +}; +static struct inode_operations btrfs_special_inode_operations = { + .getattr = btrfs_getattr, + .setattr = btrfs_setattr, + .permission = btrfs_permission, + .setxattr = btrfs_setxattr, + .getxattr = btrfs_getxattr, + .listxattr = btrfs_listxattr, + .removexattr = btrfs_removexattr, +}; +static 
struct inode_operations btrfs_symlink_inode_operations = { + .readlink = generic_readlink, + .follow_link = page_follow_link_light, + .put_link = page_put_link, + .permission = btrfs_permission, +}; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c new file mode 100644 index 00000000000..c2aa33e3feb --- /dev/null +++ b/fs/btrfs/ioctl.c @@ -0,0 +1,1132 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/kernel.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/fsnotify.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/namei.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/bit_spinlock.h> +#include <linux/security.h> +#include <linux/version.h> +#include <linux/xattr.h> +#include <linux/vmalloc.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "volumes.h" +#include "locking.h" + + + +static noinline int create_subvol(struct btrfs_root *root, + struct dentry *dentry, + char *name, int namelen) +{ + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + struct inode *dir; + int ret; + int err; + u64 objectid; + u64 new_dirid = BTRFS_FIRST_FREE_OBJECTID; + u64 index = 0; + unsigned long nr = 1; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_commit; + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root, + 0, &objectid); + if (ret) + goto fail; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + objectid, trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + goto fail; + } + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, objectid); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, 
leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 1); + btrfs_set_root_used(&root_item, 0); + btrfs_set_root_last_snapshot(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, new_dirid); + + key.objectid = objectid; + key.offset = 1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + &root_item); + if (ret) + goto fail; + + /* + * insert the directory item + */ + key.offset = (u64)-1; + dir = dentry->d_parent->d_inode; + ret = btrfs_set_inode_index(dir, &index); + BUG_ON(ret); + + ret = btrfs_insert_dir_item(trans, root, + name, namelen, dir->i_ino, &key, + BTRFS_FT_DIR, index); + if (ret) + goto fail; + + btrfs_i_size_write(dir, dir->i_size + namelen * 2); + ret = btrfs_update_inode(trans, root, dir); + BUG_ON(ret); + + /* add the backref first */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + objectid, BTRFS_ROOT_BACKREF_KEY, + root->root_key.objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + /* now add the forward ref */ + ret = btrfs_add_root_ref(trans, root->fs_info->tree_root, + root->root_key.objectid, BTRFS_ROOT_REF_KEY, + objectid, + dir->i_ino, index, name, namelen); + + BUG_ON(ret); + + ret = btrfs_commit_transaction(trans, root); + if (ret) + goto fail_commit; + + new_root = btrfs_read_fs_root_no_name(root->fs_info, &key); + BUG_ON(!new_root); + + trans = btrfs_start_transaction(new_root, 1); + BUG_ON(!trans); + + ret = btrfs_create_subvol_root(trans, new_root, dentry, new_dirid, + BTRFS_I(dir)->block_group); + if (ret) + goto fail; + +fail: + nr = trans->blocks_used; + err = btrfs_commit_transaction(trans, new_root); + if (err && !ret) + ret = err; +fail_commit: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, + char *name, int namelen) +{ + struct btrfs_pending_snapshot *pending_snapshot; + struct btrfs_trans_handle *trans; + int ret = 0; + int err; + unsigned long nr = 0; + + if (!root->ref_cows) + return -EINVAL; + + ret = btrfs_check_free_space(root, 1, 0); + if (ret) + goto fail_unlock; + + pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_NOFS); + if (!pending_snapshot) { + ret = -ENOMEM; + goto fail_unlock; + } + pending_snapshot->name = kmalloc(namelen + 1, GFP_NOFS); + if (!pending_snapshot->name) { + ret = -ENOMEM; + kfree(pending_snapshot); + goto fail_unlock; + } + memcpy(pending_snapshot->name, name, namelen); + pending_snapshot->name[namelen] = '\0'; + pending_snapshot->dentry = dentry; + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + pending_snapshot->root = root; + list_add(&pending_snapshot->list, + &trans->transaction->pending_snapshots); + err = btrfs_commit_transaction(trans, root); + +fail_unlock: + btrfs_btree_balance_dirty(root, nr); + return ret; +} + +/* copy of may_create in fs/namei.c() */ +static inline int btrfs_may_create(struct inode *dir, struct dentry *child) +{ + if (child->d_inode) + return -EEXIST; + if (IS_DEADDIR(dir)) + return -ENOENT; + return inode_permission(dir, MAY_WRITE | MAY_EXEC); +} + +/* + * Create a new subvolume below @parent. 
This is largely modeled after + * sys_mkdirat and vfs_mkdir, but we only do a single component lookup + * inside this filesystem so it's quite a bit simpler. + */ +static noinline int btrfs_mksubvol(struct path *parent, char *name, + int mode, int namelen, + struct btrfs_root *snap_src) +{ + struct dentry *dentry; + int error; + + mutex_lock_nested(&parent->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + + dentry = lookup_one_len(name, parent->dentry, namelen); + error = PTR_ERR(dentry); + if (IS_ERR(dentry)) + goto out_unlock; + + error = -EEXIST; + if (dentry->d_inode) + goto out_dput; + + if (!IS_POSIXACL(parent->dentry->d_inode)) + mode &= ~current->fs->umask; + + error = mnt_want_write(parent->mnt); + if (error) + goto out_dput; + + error = btrfs_may_create(parent->dentry->d_inode, dentry); + if (error) + goto out_drop_write; + + /* + * Actually perform the low-level subvolume creation after all + * this VFS fuzz. + * + * Eventually we want to pass in an inode under which we create this + * subvolume, but for now all are under the filesystem root. + * + * Also we should pass on the mode eventually to allow creating new + * subvolume with specific mode bits. + */ + if (snap_src) { + struct dentry *dir = dentry->d_parent; + struct dentry *test = dir->d_parent; + struct btrfs_path *path = btrfs_alloc_path(); + int ret; + u64 test_oid; + u64 parent_oid = BTRFS_I(dir->d_inode)->root->root_key.objectid; + + test_oid = snap_src->root_key.objectid; + + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, parent_oid, test_oid); + if (ret == 0) + goto create; + btrfs_release_path(snap_src->fs_info->tree_root, path); + + /* we need to make sure we aren't creating a directory loop + * by taking a snapshot of something that has our current + * subvol in its directory tree. So, this loops through + * the dentries and checks the forward refs for each subvolume + * to see if is references the subvolume where we are + * placing this new snapshot. 
+ */ + while (1) { + if (!test || + dir == snap_src->fs_info->sb->s_root || + test == snap_src->fs_info->sb->s_root || + test->d_inode->i_sb != snap_src->fs_info->sb) { + break; + } + if (S_ISLNK(test->d_inode->i_mode)) { + printk(KERN_INFO "Btrfs symlink in snapshot " + "path, failed\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + test_oid = + BTRFS_I(test->d_inode)->root->root_key.objectid; + ret = btrfs_find_root_ref(snap_src->fs_info->tree_root, + path, test_oid, parent_oid); + if (ret == 0) { + printk(KERN_INFO "Btrfs snapshot creation " + "failed, looping\n"); + error = -EMLINK; + btrfs_free_path(path); + goto out_drop_write; + } + btrfs_release_path(snap_src->fs_info->tree_root, path); + test = test->d_parent; + } +create: + btrfs_free_path(path); + error = create_snapshot(snap_src, dentry, name, namelen); + } else { + error = create_subvol(BTRFS_I(parent->dentry->d_inode)->root, + dentry, name, namelen); + } + if (error) + goto out_drop_write; + + fsnotify_mkdir(parent->dentry->d_inode, dentry); +out_drop_write: + mnt_drop_write(parent->mnt); +out_dput: + dput(dentry); +out_unlock: + mutex_unlock(&parent->dentry->d_inode->i_mutex); + return error; +} + + +static int btrfs_defrag_file(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct btrfs_ordered_extent *ordered; + struct page *page; + unsigned long last_index; + unsigned long ra_pages = root->fs_info->bdi.ra_pages; + unsigned long total_read = 0; + u64 page_start; + u64 page_end; + unsigned long i; + int ret; + + ret = btrfs_check_free_space(root, inode->i_size, 0); + if (ret) + return -ENOSPC; + + mutex_lock(&inode->i_mutex); + last_index = inode->i_size >> PAGE_CACHE_SHIFT; + for (i = 0; i <= last_index; i++) { + if (total_read % ra_pages == 0) { + btrfs_force_ra(inode->i_mapping, &file->f_ra, file, i, + min(last_index, i + ra_pages - 1)); + } + total_read++; +again: + page = grab_cache_page(inode->i_mapping, i); + if (!page) + goto out_unlock; + if (!PageUptodate(page)) { + btrfs_readpage(NULL, page); + lock_page(page); + if (!PageUptodate(page)) { + unlock_page(page); + page_cache_release(page); + goto out_unlock; + } + } + + wait_on_page_writeback(page); + + page_start = (u64)page->index << PAGE_CACHE_SHIFT; + page_end = page_start + PAGE_CACHE_SIZE - 1; + lock_extent(io_tree, page_start, page_end, GFP_NOFS); + + ordered = btrfs_lookup_ordered_extent(inode, page_start); + if (ordered) { + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + unlock_page(page); + page_cache_release(page); + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + goto again; + } + set_page_extent_mapped(page); + + /* + * this makes sure page_mkwrite is called on the + * page if it is dirtied again later + */ + clear_page_dirty_for_io(page); + + btrfs_set_extent_delalloc(inode, page_start, page_end); + + unlock_extent(io_tree, page_start, page_end, GFP_NOFS); + set_page_dirty(page); + unlock_page(page); + page_cache_release(page); + balance_dirty_pages_ratelimited_nr(inode->i_mapping, 1); + } + +out_unlock: + mutex_unlock(&inode->i_mutex); + return 0; +} + +/* + * Called inside transaction, so use GFP_NOFS + */ + +static int btrfs_ioctl_resize(struct btrfs_root *root, void __user *arg) +{ + u64 new_size; + u64 old_size; + u64 devid = 1; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_trans_handle *trans; + struct btrfs_device *device = 
NULL; + char *sizestr; + char *devstr = NULL; + int ret = 0; + int namelen; + int mod = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + + mutex_lock(&root->fs_info->volume_mutex); + sizestr = vol_args->name; + devstr = strchr(sizestr, ':'); + if (devstr) { + char *end; + sizestr = devstr + 1; + *devstr = '\0'; + devstr = vol_args->name; + devid = simple_strtoull(devstr, &end, 10); + printk(KERN_INFO "resizing devid %llu\n", devid); + } + device = btrfs_find_device(root, devid, NULL, NULL); + if (!device) { + printk(KERN_INFO "resizer unable to find device %llu\n", devid); + ret = -EINVAL; + goto out_unlock; + } + if (!strcmp(sizestr, "max")) + new_size = device->bdev->bd_inode->i_size; + else { + if (sizestr[0] == '-') { + mod = -1; + sizestr++; + } else if (sizestr[0] == '+') { + mod = 1; + sizestr++; + } + new_size = btrfs_parse_size(sizestr); + if (new_size == 0) { + ret = -EINVAL; + goto out_unlock; + } + } + + old_size = device->total_bytes; + + if (mod < 0) { + if (new_size > old_size) { + ret = -EINVAL; + goto out_unlock; + } + new_size = old_size - new_size; + } else if (mod > 0) { + new_size = old_size + new_size; + } + + if (new_size < 256 * 1024 * 1024) { + ret = -EINVAL; + goto out_unlock; + } + if (new_size > device->bdev->bd_inode->i_size) { + ret = -EFBIG; + goto out_unlock; + } + + do_div(new_size, root->sectorsize); + new_size *= root->sectorsize; + + printk(KERN_INFO "new size for %s is %llu\n", + device->name, (unsigned long long)new_size); + + if (new_size > old_size) { + trans = btrfs_start_transaction(root, 1); + ret = btrfs_grow_device(trans, device, new_size); + btrfs_commit_transaction(trans, root); + } else { + ret = btrfs_shrink_device(device, new_size); + } + +out_unlock: + mutex_unlock(&root->fs_info->volume_mutex); +out: + kfree(vol_args); + return ret; +} + +static noinline int btrfs_ioctl_snap_create(struct file *file, + void __user *arg, int subvol) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + struct btrfs_ioctl_vol_args *vol_args; + struct btrfs_dir_item *di; + struct btrfs_path *path; + struct file *src_file; + u64 root_dirid; + int namelen; + int ret = 0; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + namelen = strlen(vol_args->name); + if (strchr(vol_args->name, '/')) { + ret = -EINVAL; + goto out; + } + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + root_dirid = root->fs_info->sb->s_root->d_inode->i_ino, + di = btrfs_lookup_dir_item(NULL, root->fs_info->tree_root, + path, root_dirid, + vol_args->name, namelen, 0); + btrfs_free_path(path); + + if (di && !IS_ERR(di)) { + ret = -EEXIST; + goto out; + } + + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + if (subvol) { + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen, NULL); + } else { + struct inode *src_inode; + src_file = fget(vol_args->fd); + if (!src_file) { + ret = -EINVAL; + goto out; + } + + 
src_inode = src_file->f_path.dentry->d_inode; + if (src_inode->i_sb != file->f_path.dentry->d_inode->i_sb) { + printk(KERN_INFO "btrfs: Snapshot src from " + "another FS\n"); + ret = -EINVAL; + fput(src_file); + goto out; + } + ret = btrfs_mksubvol(&file->f_path, vol_args->name, + file->f_path.dentry->d_inode->i_mode, + namelen, BTRFS_I(src_inode)->root); + fput(src_file); + } + +out: + kfree(vol_args); + return ret; +} + +static int btrfs_ioctl_defrag(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + int ret; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + switch (inode->i_mode & S_IFMT) { + case S_IFDIR: + if (!capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; + } + btrfs_defrag_root(root, 0); + btrfs_defrag_root(root->fs_info->extent_root, 0); + break; + case S_IFREG: + if (!(file->f_mode & FMODE_WRITE)) { + ret = -EINVAL; + goto out; + } + btrfs_defrag_file(file); + break; + } +out: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_init_new_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_rm_dev(struct btrfs_root *root, void __user *arg) +{ + struct btrfs_ioctl_vol_args *vol_args; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + vol_args = kmalloc(sizeof(*vol_args), GFP_NOFS); + + if (!vol_args) + return -ENOMEM; + + if (copy_from_user(vol_args, arg, sizeof(*vol_args))) { + ret = -EFAULT; + goto out; + } + vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; + ret = btrfs_rm_device(root, vol_args->name); + +out: + kfree(vol_args); + return ret; +} + +static long btrfs_ioctl_clone(struct file *file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct file *src_file; + struct inode *src; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + struct extent_buffer *leaf; + char *buf; + struct btrfs_key key; + u32 nritems; + int slot; + int ret; + u64 len = olen; + u64 bs = root->fs_info->sb->s_blocksize; + u64 hint_byte; + + /* + * TODO: + * - split compressed inline extents. annoying: we need to + * decompress into destination's address_space (the file offset + * may change, so source mapping won't do), then recompress (or + * otherwise reinsert) a subrange. + * - allow ranges within the same file to be cloned (provided + * they don't overlap)? 
+ */ + + /* the destination must be opened for writing */ + if (!(file->f_mode & FMODE_WRITE)) + return -EINVAL; + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + return ret; + + src_file = fget(srcfd); + if (!src_file) { + ret = -EBADF; + goto out_drop_write; + } + src = src_file->f_dentry->d_inode; + + ret = -EINVAL; + if (src == inode) + goto out_fput; + + ret = -EISDIR; + if (S_ISDIR(src->i_mode) || S_ISDIR(inode->i_mode)) + goto out_fput; + + ret = -EXDEV; + if (src->i_sb != inode->i_sb || BTRFS_I(src)->root != root) + goto out_fput; + + ret = -ENOMEM; + buf = vmalloc(btrfs_level_size(root, 0)); + if (!buf) + goto out_fput; + + path = btrfs_alloc_path(); + if (!path) { + vfree(buf); + goto out_fput; + } + path->reada = 2; + + if (inode < src) { + mutex_lock(&inode->i_mutex); + mutex_lock(&src->i_mutex); + } else { + mutex_lock(&src->i_mutex); + mutex_lock(&inode->i_mutex); + } + + /* determine range to clone */ + ret = -EINVAL; + if (off >= src->i_size || off + len > src->i_size) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - off; + /* if we extend to eof, continue to block boundary */ + if (off + len == src->i_size) + len = ((src->i_size + bs-1) & ~(bs-1)) + - off; + + /* verify the end result is block aligned */ + if ((off & (bs-1)) || + ((off + len) & (bs-1))) + goto out_unlock; + + /* do any pending delalloc/csum calc on src, one way or + another, and lock file content */ + while (1) { + struct btrfs_ordered_extent *ordered; + lock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + ordered = btrfs_lookup_first_ordered_extent(inode, off+len); + if (BTRFS_I(src)->delalloc_bytes == 0 && !ordered) + break; + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ordered) + btrfs_put_ordered_extent(ordered); + btrfs_wait_ordered_range(src, off, off+len); + } + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + /* punch hole in destination first */ + btrfs_drop_extents(trans, root, inode, off, off+len, 0, &hint_byte); + + /* clone data */ + key.objectid = src->i_ino; + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; + + while (1) { + /* + * note the key will change type as we walk through the + * tree. 
+ */ + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) + goto out; + if (ret > 0) + break; + nritems = btrfs_header_nritems(path->nodes[0]); + } + leaf = path->nodes[0]; + slot = path->slots[0]; + + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + key.objectid != src->i_ino) + break; + + if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + struct btrfs_file_extent_item *extent; + int type; + u32 size; + struct btrfs_key new_key; + u64 disko = 0, diskl = 0; + u64 datao = 0, datal = 0; + u8 comp; + + size = btrfs_item_size_nr(leaf, slot); + read_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + comp = btrfs_file_extent_compression(leaf, extent); + type = btrfs_file_extent_type(leaf, extent); + if (type == BTRFS_FILE_EXTENT_REG) { + disko = btrfs_file_extent_disk_bytenr(leaf, + extent); + diskl = btrfs_file_extent_disk_num_bytes(leaf, + extent); + datao = btrfs_file_extent_offset(leaf, extent); + datal = btrfs_file_extent_num_bytes(leaf, + extent); + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + /* take upper bound, may be compressed */ + datal = btrfs_file_extent_ram_bytes(leaf, + extent); + } + btrfs_release_path(root, path); + + if (key.offset + datal < off || + key.offset >= off+len) + goto next; + + memcpy(&new_key, &key, sizeof(new_key)); + new_key.objectid = inode->i_ino; + new_key.offset = key.offset + destoff - off; + + if (type == BTRFS_FILE_EXTENT_REG) { + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + + extent = btrfs_item_ptr(leaf, slot, + struct btrfs_file_extent_item); + + if (off > key.offset) { + datao += off - key.offset; + datal -= off - key.offset; + } + if (key.offset + datao + datal + key.offset > + off + len) + datal = off + len - key.offset - datao; + /* disko == 0 means it's a hole */ + if (!disko) + datao = 0; + + btrfs_set_file_extent_offset(leaf, extent, + datao); + btrfs_set_file_extent_num_bytes(leaf, extent, + datal); + if (disko) { + inode_add_bytes(inode, datal); + ret = btrfs_inc_extent_ref(trans, root, + disko, diskl, leaf->start, + root->root_key.objectid, + trans->transid, + inode->i_ino); + BUG_ON(ret); + } + } else if (type == BTRFS_FILE_EXTENT_INLINE) { + u64 skip = 0; + u64 trim = 0; + if (off > key.offset) { + skip = off - key.offset; + new_key.offset += skip; + } + + if (key.offset + datal > off+len) + trim = key.offset + datal - (off+len); + + if (comp && (skip || trim)) { + ret = -EINVAL; + goto out; + } + size -= skip + trim; + datal -= skip + trim; + ret = btrfs_insert_empty_item(trans, root, path, + &new_key, size); + if (ret) + goto out; + + if (skip) { + u32 start = + btrfs_file_extent_calc_inline_size(0); + memmove(buf+start, buf+start+skip, + datal); + } + + leaf = path->nodes[0]; + slot = path->slots[0]; + write_extent_buffer(leaf, buf, + btrfs_item_ptr_offset(leaf, slot), + size); + inode_add_bytes(inode, datal); + } + + btrfs_mark_buffer_dirty(leaf); + } + +next: + btrfs_release_path(root, path); + key.offset++; + } + ret = 0; +out: + btrfs_release_path(root, path); + if (ret == 0) { + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + if (destoff 
+ olen > inode->i_size) + btrfs_i_size_write(inode, destoff + olen); + BTRFS_I(inode)->flags = BTRFS_I(src)->flags; + ret = btrfs_update_inode(trans, root, inode); + } + btrfs_end_transaction(trans, root); + unlock_extent(&BTRFS_I(src)->io_tree, off, off+len, GFP_NOFS); + if (ret) + vmtruncate(inode, 0); +out_unlock: + mutex_unlock(&src->i_mutex); + mutex_unlock(&inode->i_mutex); + vfree(buf); + btrfs_free_path(path); +out_fput: + fput(src_file); +out_drop_write: + mnt_drop_write(file->f_path.mnt); + return ret; +} + +static long btrfs_ioctl_clone_range(struct file *file, void __user *argp) +{ + struct btrfs_ioctl_clone_range_args args; + + if (copy_from_user(&args, argp, sizeof(args))) + return -EFAULT; + return btrfs_ioctl_clone(file, args.src_fd, args.src_offset, + args.src_length, args.dest_offset); +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. + */ +static long btrfs_ioctl_trans_start(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (file->private_data) { + ret = -EINPROGRESS; + goto out; + } + + ret = mnt_want_write(file->f_path.mnt); + if (ret) + goto out; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans++; + mutex_unlock(&root->fs_info->trans_mutex); + + trans = btrfs_start_ioctl_transaction(root, 0); + if (trans) + file->private_data = trans; + else + ret = -ENOMEM; + /*printk(KERN_INFO "btrfs_ioctl_trans_start on %p\n", file);*/ +out: + return ret; +} + +/* + * there are many ways the trans_start and trans_end ioctls can lead + * to deadlocks. They should only be used by applications that + * basically own the machine, and have a very in depth understanding + * of all the possible deadlocks and enospc problems. 
+ */ +long btrfs_ioctl_trans_end(struct file *file) +{ + struct inode *inode = fdentry(file)->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + int ret = 0; + + trans = file->private_data; + if (!trans) { + ret = -EINVAL; + goto out; + } + btrfs_end_transaction(trans, root); + file->private_data = NULL; + + mutex_lock(&root->fs_info->trans_mutex); + root->fs_info->open_ioctl_trans--; + mutex_unlock(&root->fs_info->trans_mutex); + + mnt_drop_write(file->f_path.mnt); + +out: + return ret; +} + +long btrfs_ioctl(struct file *file, unsigned int + cmd, unsigned long arg) +{ + struct btrfs_root *root = BTRFS_I(fdentry(file)->d_inode)->root; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case BTRFS_IOC_SNAP_CREATE: + return btrfs_ioctl_snap_create(file, argp, 0); + case BTRFS_IOC_SUBVOL_CREATE: + return btrfs_ioctl_snap_create(file, argp, 1); + case BTRFS_IOC_DEFRAG: + return btrfs_ioctl_defrag(file); + case BTRFS_IOC_RESIZE: + return btrfs_ioctl_resize(root, argp); + case BTRFS_IOC_ADD_DEV: + return btrfs_ioctl_add_dev(root, argp); + case BTRFS_IOC_RM_DEV: + return btrfs_ioctl_rm_dev(root, argp); + case BTRFS_IOC_BALANCE: + return btrfs_balance(root->fs_info->dev_root); + case BTRFS_IOC_CLONE: + return btrfs_ioctl_clone(file, arg, 0, 0, 0); + case BTRFS_IOC_CLONE_RANGE: + return btrfs_ioctl_clone_range(file, argp); + case BTRFS_IOC_TRANS_START: + return btrfs_ioctl_trans_start(file); + case BTRFS_IOC_TRANS_END: + return btrfs_ioctl_trans_end(file); + case BTRFS_IOC_SYNC: + btrfs_sync_fs(file->f_dentry->d_sb, 1); + return 0; + } + + return -ENOTTY; +} diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h new file mode 100644 index 00000000000..78049ea208d --- /dev/null +++ b/fs/btrfs/ioctl.h @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __IOCTL_ +#define __IOCTL_ +#include <linux/ioctl.h> + +#define BTRFS_IOCTL_MAGIC 0x94 +#define BTRFS_VOL_NAME_MAX 255 +#define BTRFS_PATH_NAME_MAX 3072 + +struct btrfs_ioctl_vol_args { + __s64 fd; + char name[BTRFS_PATH_NAME_MAX + 1]; +}; + +#define BTRFS_IOC_SNAP_CREATE _IOW(BTRFS_IOCTL_MAGIC, 1, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_DEFRAG _IOW(BTRFS_IOCTL_MAGIC, 2, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RESIZE _IOW(BTRFS_IOCTL_MAGIC, 3, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_SCAN_DEV _IOW(BTRFS_IOCTL_MAGIC, 4, \ + struct btrfs_ioctl_vol_args) +/* trans start and trans end are dangerous, and only for + * use by applications that know how to avoid the + * resulting deadlocks + */ +#define BTRFS_IOC_TRANS_START _IO(BTRFS_IOCTL_MAGIC, 6) +#define BTRFS_IOC_TRANS_END _IO(BTRFS_IOCTL_MAGIC, 7) +#define BTRFS_IOC_SYNC _IO(BTRFS_IOCTL_MAGIC, 8) + +#define BTRFS_IOC_CLONE _IOW(BTRFS_IOCTL_MAGIC, 9, int) +#define BTRFS_IOC_ADD_DEV _IOW(BTRFS_IOCTL_MAGIC, 10, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_RM_DEV _IOW(BTRFS_IOCTL_MAGIC, 11, \ + struct btrfs_ioctl_vol_args) +#define BTRFS_IOC_BALANCE _IOW(BTRFS_IOCTL_MAGIC, 12, \ + struct btrfs_ioctl_vol_args) +struct btrfs_ioctl_clone_range_args { + __s64 src_fd; + __u64 src_offset, src_length; + __u64 dest_offset; +}; + +#define BTRFS_IOC_CLONE_RANGE _IOW(BTRFS_IOCTL_MAGIC, 13, \ + struct btrfs_ioctl_clone_range_args) + +#define BTRFS_IOC_SUBVOL_CREATE _IOW(BTRFS_IOCTL_MAGIC, 14, \ + struct btrfs_ioctl_vol_args) + +#endif diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c new file mode 100644 index 00000000000..39bae7761db --- /dev/null +++ b/fs/btrfs/locking.c @@ -0,0 +1,88 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ +#include <linux/sched.h> +#include <linux/gfp.h> +#include <linux/pagemap.h> +#include <linux/spinlock.h> +#include <linux/page-flags.h> +#include <asm/bug.h> +#include "ctree.h" +#include "extent_io.h" +#include "locking.h" + +/* + * locks the per buffer mutex in an extent buffer. This uses adaptive locks + * and the spin is not tuned very extensively. The spinning does make a big + * difference in almost every workload, but spinning for the right amount of + * time needs some help. + * + * In general, we want to spin as long as the lock holder is doing btree + * searches, and we should give up if they are in more expensive code. 
+ */ + +int btrfs_tree_lock(struct extent_buffer *eb) +{ + int i; + + if (mutex_trylock(&eb->mutex)) + return 0; + for (i = 0; i < 512; i++) { + cpu_relax(); + if (mutex_trylock(&eb->mutex)) + return 0; + } + cpu_relax(); + mutex_lock_nested(&eb->mutex, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); + return 0; +} + +int btrfs_try_tree_lock(struct extent_buffer *eb) +{ + return mutex_trylock(&eb->mutex); +} + +int btrfs_tree_unlock(struct extent_buffer *eb) +{ + mutex_unlock(&eb->mutex); + return 0; +} + +int btrfs_tree_locked(struct extent_buffer *eb) +{ + return mutex_is_locked(&eb->mutex); +} + +/* + * btrfs_search_slot uses this to decide if it should drop its locks + * before doing something expensive like allocating free blocks for cow. + */ +int btrfs_path_lock_waiting(struct btrfs_path *path, int level) +{ + int i; + struct extent_buffer *eb; + for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { + eb = path->nodes[i]; + if (!eb) + break; + smp_mb(); + if (!list_empty(&eb->mutex.wait_list)) + return 1; + } + return 0; +} + diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h new file mode 100644 index 00000000000..bc1faef1251 --- /dev/null +++ b/fs/btrfs/locking.h @@ -0,0 +1,27 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_LOCKING_ +#define __BTRFS_LOCKING_ + +int btrfs_tree_lock(struct extent_buffer *eb); +int btrfs_tree_unlock(struct extent_buffer *eb); +int btrfs_tree_locked(struct extent_buffer *eb); +int btrfs_try_tree_lock(struct extent_buffer *eb); +int btrfs_path_lock_waiting(struct btrfs_path *path, int level); +#endif diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c new file mode 100644 index 00000000000..a2094017027 --- /dev/null +++ b/fs/btrfs/ordered-data.c @@ -0,0 +1,730 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/gfp.h> +#include <linux/slab.h> +#include <linux/blkdev.h> +#include <linux/writeback.h> +#include <linux/pagevec.h> +#include "ctree.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "extent_io.h" + +static u64 entry_end(struct btrfs_ordered_extent *entry) +{ + if (entry->file_offset + entry->len < entry->file_offset) + return (u64)-1; + return entry->file_offset + entry->len; +} + +/* returns NULL if the insertion worked, or it returns the node it did find + * in the tree + */ +static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_ordered_extent *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node); + + if (file_offset < entry->file_offset) + p = &(*p)->rb_left; + else if (file_offset >= entry_end(entry)) + p = &(*p)->rb_right; + else + return parent; + } + + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +/* + * look for a given offset in the tree, and if it can't be found return the + * first lesser offset + */ +static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, + struct rb_node **prev_ret) +{ + struct rb_node *n = root->rb_node; + struct rb_node *prev = NULL; + struct rb_node *test; + struct btrfs_ordered_extent *entry; + struct btrfs_ordered_extent *prev_entry = NULL; + + while (n) { + entry = rb_entry(n, struct btrfs_ordered_extent, rb_node); + prev = n; + prev_entry = entry; + + if (file_offset < entry->file_offset) + n = n->rb_left; + else if (file_offset >= entry_end(entry)) + n = n->rb_right; + else + return n; + } + if (!prev_ret) + return NULL; + + while (prev && file_offset >= entry_end(prev_entry)) { + test = rb_next(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + if (file_offset < entry_end(prev_entry)) + break; + + prev = test; + } + if (prev) + prev_entry = rb_entry(prev, struct btrfs_ordered_extent, + rb_node); + while (prev && file_offset < entry_end(prev_entry)) { + test = rb_prev(prev); + if (!test) + break; + prev_entry = rb_entry(test, struct btrfs_ordered_extent, + rb_node); + prev = test; + } + *prev_ret = prev; + return NULL; +} + +/* + * helper to check if a given offset is inside a given entry + */ +static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) +{ + if (file_offset < entry->file_offset || + entry->file_offset + entry->len <= file_offset) + return 0; + return 1; +} + +/* + * look find the first ordered struct that has this offset, otherwise + * the first one less than this offset + */ +static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, + u64 file_offset) +{ + struct rb_root *root = &tree->tree; + struct rb_node *prev; + struct rb_node *ret; + struct btrfs_ordered_extent *entry; + + if (tree->last) { + entry = rb_entry(tree->last, struct btrfs_ordered_extent, + rb_node); + if (offset_in_entry(entry, file_offset)) + return tree->last; + } + ret = __tree_search(root, file_offset, &prev); + if (!ret) + ret = prev; + if (ret) + tree->last = ret; + return ret; +} + +/* allocate and add a new ordered_extent into the per-inode tree. 
+ * file_offset is the logical offset in the file + * + * start is the disk block number of an extent already reserved in the + * extent allocation tree + * + * len is the length of the extent + * + * This also sets the EXTENT_ORDERED bit on the range in the inode. + * + * The tree is given a single reference on the ordered extent that was + * inserted. + */ +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int type) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + + tree = &BTRFS_I(inode)->ordered_tree; + entry = kzalloc(sizeof(*entry), GFP_NOFS); + if (!entry) + return -ENOMEM; + + mutex_lock(&tree->mutex); + entry->file_offset = file_offset; + entry->start = start; + entry->len = len; + entry->disk_len = disk_len; + entry->inode = inode; + if (type != BTRFS_ORDERED_IO_DONE && type != BTRFS_ORDERED_COMPLETE) + set_bit(type, &entry->flags); + + /* one ref for the tree */ + atomic_set(&entry->refs, 1); + init_waitqueue_head(&entry->wait); + INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->root_extent_list); + + node = tree_insert(&tree->tree, file_offset, + &entry->rb_node); + BUG_ON(node); + + set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset, + entry_end(entry) - 1, GFP_NOFS); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_add_tail(&entry->root_extent_list, + &BTRFS_I(inode)->root->fs_info->ordered_extents); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + mutex_unlock(&tree->mutex); + BUG_ON(node); + return 0; +} + +/* + * Add a struct btrfs_ordered_sum into the list of checksums to be inserted + * when an ordered extent is finished. If the list covers more than one + * ordered extent, it is split across multiples. + */ +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum) +{ + struct btrfs_ordered_inode_tree *tree; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + list_add_tail(&sum->list, &entry->list); + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * this is used to account for finished IO across a given range + * of the file. The IO should not span ordered extents. If + * a given ordered_extent is completely done, 1 is returned, otherwise + * 0. + * + * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used + * to make sure this function only returns 1 once for a given ordered extent. + */ +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + int ret; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1, + GFP_NOFS); + node = tree_search(tree, file_offset); + if (!node) { + ret = 1; + goto out; + } + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) { + ret = 1; + goto out; + } + + ret = test_range_bit(io_tree, entry->file_offset, + entry->file_offset + entry->len - 1, + EXTENT_ORDERED, 0); + if (ret == 0) + ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags); +out: + mutex_unlock(&tree->mutex); + return ret == 0; +} + +/* + * used to drop a reference on an ordered extent. 
This will free + * the extent if the last reference is dropped + */ +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) +{ + struct list_head *cur; + struct btrfs_ordered_sum *sum; + + if (atomic_dec_and_test(&entry->refs)) { + while (!list_empty(&entry->list)) { + cur = entry->list.next; + sum = list_entry(cur, struct btrfs_ordered_sum, list); + list_del(&sum->list); + kfree(sum); + } + kfree(entry); + } + return 0; +} + +/* + * remove an ordered extent from the tree. No references are dropped + * but, anyone waiting on this extent is woken up. + */ +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = &entry->rb_node; + rb_erase(node, &tree->tree); + tree->last = NULL; + set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + + spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + list_del_init(&entry->root_extent_list); + spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock); + + mutex_unlock(&tree->mutex); + wake_up(&entry->wait); + return 0; +} + +/* + * wait for all the ordered extents in a root. This is done when balancing + * space between drives. + */ +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only) +{ + struct list_head splice; + struct list_head *cur; + struct btrfs_ordered_extent *ordered; + struct inode *inode; + + INIT_LIST_HEAD(&splice); + + spin_lock(&root->fs_info->ordered_extent_lock); + list_splice_init(&root->fs_info->ordered_extents, &splice); + while (!list_empty(&splice)) { + cur = splice.next; + ordered = list_entry(cur, struct btrfs_ordered_extent, + root_extent_list); + if (nocow_only && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered->flags)) { + list_move(&ordered->root_extent_list, + &root->fs_info->ordered_extents); + cond_resched_lock(&root->fs_info->ordered_extent_lock); + continue; + } + + list_del_init(&ordered->root_extent_list); + atomic_inc(&ordered->refs); + + /* + * the inode may be getting freed (in sys_unlink path). + */ + inode = igrab(ordered->inode); + + spin_unlock(&root->fs_info->ordered_extent_lock); + + if (inode) { + btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_put_ordered_extent(ordered); + iput(inode); + } else { + btrfs_put_ordered_extent(ordered); + } + + spin_lock(&root->fs_info->ordered_extent_lock); + } + spin_unlock(&root->fs_info->ordered_extent_lock); + return 0; +} + +/* + * Used to start IO or wait for a given ordered extent to finish. + * + * If wait is one, this effectively waits on page writeback for all the pages + * in the extent, and it waits on the io completion code to insert + * metadata into the btree corresponding to the extent + */ +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, + int wait) +{ + u64 start = entry->file_offset; + u64 end = start + entry->len - 1; + + /* + * pages in the range can be dirty, clean or writeback. We + * start IO on any dirty ones so the wait doesn't stall waiting + * for pdflush to find them + */ + btrfs_fdatawrite_range(inode->i_mapping, start, end, WB_SYNC_ALL); + if (wait) { + wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, + &entry->flags)); + } +} + +/* + * Used to wait on ordered extents across a large range of bytes. 
+ */ +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) +{ + u64 end; + u64 orig_end; + u64 wait_end; + struct btrfs_ordered_extent *ordered; + + if (start + len < start) { + orig_end = INT_LIMIT(loff_t); + } else { + orig_end = start + len - 1; + if (orig_end > INT_LIMIT(loff_t)) + orig_end = INT_LIMIT(loff_t); + } + wait_end = orig_end; +again: + /* start IO across the range first to instantiate any delalloc + * extents + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_NONE); + + /* The compression code will leave pages locked but return from + * writepage without setting the page writeback. Starting again + * with WB_SYNC_ALL will end up waiting for the IO to actually start. + */ + btrfs_fdatawrite_range(inode->i_mapping, start, orig_end, WB_SYNC_ALL); + + btrfs_wait_on_page_writeback_range(inode->i_mapping, + start >> PAGE_CACHE_SHIFT, + orig_end >> PAGE_CACHE_SHIFT); + + end = orig_end; + while (1) { + ordered = btrfs_lookup_first_ordered_extent(inode, end); + if (!ordered) + break; + if (ordered->file_offset > orig_end) { + btrfs_put_ordered_extent(ordered); + break; + } + if (ordered->file_offset + ordered->len < start) { + btrfs_put_ordered_extent(ordered); + break; + } + btrfs_start_ordered_extent(inode, ordered, 1); + end = ordered->file_offset; + btrfs_put_ordered_extent(ordered); + if (end == 0 || end == start) + break; + end--; + } + if (test_range_bit(&BTRFS_I(inode)->io_tree, start, orig_end, + EXTENT_ORDERED | EXTENT_DELALLOC, 0)) { + schedule_timeout(1); + goto again; + } + return 0; +} + +/* + * find an ordered extent corresponding to file_offset. return NULL if + * nothing is found, otherwise take a reference on the extent and return it + */ +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (!offset_in_entry(entry, file_offset)) + entry = NULL; + if (entry) + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * lookup and return any extent before 'file_offset'. NULL is returned + * if none is found + */ +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +{ + struct btrfs_ordered_inode_tree *tree; + struct rb_node *node; + struct btrfs_ordered_extent *entry = NULL; + + tree = &BTRFS_I(inode)->ordered_tree; + mutex_lock(&tree->mutex); + node = tree_search(tree, file_offset); + if (!node) + goto out; + + entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); + atomic_inc(&entry->refs); +out: + mutex_unlock(&tree->mutex); + return entry; +} + +/* + * After an extent is done, call this to conditionally update the on disk + * i_size. i_size is updated to cover any fully written part of the file. 
+ */ +int btrfs_ordered_update_i_size(struct inode *inode, + struct btrfs_ordered_extent *ordered) +{ + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + u64 disk_i_size; + u64 new_i_size; + u64 i_size_test; + struct rb_node *node; + struct btrfs_ordered_extent *test; + + mutex_lock(&tree->mutex); + disk_i_size = BTRFS_I(inode)->disk_i_size; + + /* + * if the disk i_size is already at the inode->i_size, or + * this ordered extent is inside the disk i_size, we're done + */ + if (disk_i_size >= inode->i_size || + ordered->file_offset + ordered->len <= disk_i_size) { + goto out; + } + + /* + * we can't update the disk_isize if there are delalloc bytes + * between disk_i_size and this ordered extent + */ + if (test_range_bit(io_tree, disk_i_size, + ordered->file_offset + ordered->len - 1, + EXTENT_DELALLOC, 0)) { + goto out; + } + /* + * walk backward from this ordered extent to disk_i_size. + * if we find an ordered extent then we can't update disk i_size + * yet + */ + node = &ordered->rb_node; + while (1) { + node = rb_prev(node); + if (!node) + break; + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset + test->len <= disk_i_size) + break; + if (test->file_offset >= inode->i_size) + break; + if (test->file_offset >= disk_i_size) + goto out; + } + new_i_size = min_t(u64, entry_end(ordered), i_size_read(inode)); + + /* + * at this point, we know we can safely update i_size to at least + * the offset from this ordered extent. But, we need to + * walk forward and see if ios from higher up in the file have + * finished. + */ + node = rb_next(&ordered->rb_node); + i_size_test = 0; + if (node) { + /* + * do we have an area where IO might have finished + * between our ordered extent and the next one. + */ + test = rb_entry(node, struct btrfs_ordered_extent, rb_node); + if (test->file_offset > entry_end(ordered)) + i_size_test = test->file_offset; + } else { + i_size_test = i_size_read(inode); + } + + /* + * i_size_test is the end of a region after this ordered + * extent where there are no ordered extents. As long as there + * are no delalloc bytes in this area, it is safe to update + * disk_i_size to the end of the region. + */ + if (i_size_test > entry_end(ordered) && + !test_range_bit(io_tree, entry_end(ordered), i_size_test - 1, + EXTENT_DELALLOC, 0)) { + new_i_size = min_t(u64, i_size_test, i_size_read(inode)); + } + BTRFS_I(inode)->disk_i_size = new_i_size; +out: + mutex_unlock(&tree->mutex); + return 0; +} + +/* + * search the ordered extents for one corresponding to 'offset' and + * try to find a checksum. 
This is used because we allow pages to + * be reclaimed before their checksum is actually put into the btree + */ +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, + u32 *sum) +{ + struct btrfs_ordered_sum *ordered_sum; + struct btrfs_sector_sum *sector_sums; + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct list_head *cur; + unsigned long num_sectors; + unsigned long i; + u32 sectorsize = BTRFS_I(inode)->root->sectorsize; + int ret = 1; + + ordered = btrfs_lookup_ordered_extent(inode, offset); + if (!ordered) + return 1; + + mutex_lock(&tree->mutex); + list_for_each_prev(cur, &ordered->list) { + ordered_sum = list_entry(cur, struct btrfs_ordered_sum, list); + if (disk_bytenr >= ordered_sum->bytenr) { + num_sectors = ordered_sum->len / sectorsize; + sector_sums = ordered_sum->sums; + for (i = 0; i < num_sectors; i++) { + if (sector_sums[i].bytenr == disk_bytenr) { + *sum = sector_sums[i].sum; + ret = 0; + goto out; + } + } + } + } +out: + mutex_unlock(&tree->mutex); + btrfs_put_ordered_extent(ordered); + return ret; +} + + +/** + * taken from mm/filemap.c because it isn't exported + * + * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range + * @mapping: address space structure to write + * @start: offset in bytes where the range starts + * @end: offset in bytes where the range ends (inclusive) + * @sync_mode: enable synchronous operation + * + * Start writeback against all of a mapping's dirty pages that lie + * within the byte offsets <start, end> inclusive. + * + * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as + * opposed to a regular memory cleansing writeback. The difference between + * these two operations is that if a dirty page/buffer is encountered, it must + * be waited upon, and not just skipped over. 
+ */ +int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode) +{ + struct writeback_control wbc = { + .sync_mode = sync_mode, + .nr_to_write = mapping->nrpages * 2, + .range_start = start, + .range_end = end, + .for_writepages = 1, + }; + return btrfs_writepages(mapping, &wbc); +} + +/** + * taken from mm/filemap.c because it isn't exported + * + * wait_on_page_writeback_range - wait for writeback to complete + * @mapping: target address_space + * @start: beginning page index + * @end: ending page index + * + * Wait for writeback to complete against pages indexed by start->end + * inclusive + */ +int btrfs_wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end) +{ + struct pagevec pvec; + int nr_pages; + int ret = 0; + pgoff_t index; + + if (end < start) + return 0; + + pagevec_init(&pvec, 0); + index = start; + while ((index <= end) && + (nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, + PAGECACHE_TAG_WRITEBACK, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1)) != 0) { + unsigned i; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + /* until radix tree lookup accepts end_index */ + if (page->index > end) + continue; + + wait_on_page_writeback(page); + if (PageError(page)) + ret = -EIO; + } + pagevec_release(&pvec); + cond_resched(); + } + + /* Check for outstanding write errors */ + if (test_and_clear_bit(AS_ENOSPC, &mapping->flags)) + ret = -ENOSPC; + if (test_and_clear_bit(AS_EIO, &mapping->flags)) + ret = -EIO; + + return ret; +} diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h new file mode 100644 index 00000000000..ab66d5e8d6d --- /dev/null +++ b/fs/btrfs/ordered-data.h @@ -0,0 +1,158 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_ORDERED_DATA__ +#define __BTRFS_ORDERED_DATA__ + +/* one of these per inode */ +struct btrfs_ordered_inode_tree { + struct mutex mutex; + struct rb_root tree; + struct rb_node *last; +}; + +/* + * these are used to collect checksums done just before bios submission. + * They are attached via a list into the ordered extent, and + * checksum items are inserted into the tree after all the blocks in + * the ordered extent are on disk + */ +struct btrfs_sector_sum { + /* bytenr on disk */ + u64 bytenr; + u32 sum; +}; + +struct btrfs_ordered_sum { + /* bytenr is the start of this extent on disk */ + u64 bytenr; + + /* + * this is the length in bytes covered by the sums array below. + */ + unsigned long len; + struct list_head list; + /* last field is a variable length array of btrfs_sector_sums */ + struct btrfs_sector_sum sums[]; +}; + +/* + * bits for the flags field: + * + * BTRFS_ORDERED_IO_DONE is set when all of the blocks are written. + * It is used to make sure metadata is inserted into the tree only once + * per extent. 
+ * + * BTRFS_ORDERED_COMPLETE is set when the extent is removed from the + * rbtree, just before waking any waiters. It is used to indicate the + * IO is done and any metadata is inserted into the tree. + */ +#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */ + +#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */ + +#define BTRFS_ORDERED_NOCOW 2 /* set when we want to write in place */ + +#define BTRFS_ORDERED_COMPRESSED 3 /* writing a compressed extent */ + +#define BTRFS_ORDERED_PREALLOC 4 /* set when writing to prealloced extent */ + +struct btrfs_ordered_extent { + /* logical offset in the file */ + u64 file_offset; + + /* disk byte number */ + u64 start; + + /* ram length of the extent in bytes */ + u64 len; + + /* extent length on disk */ + u64 disk_len; + + /* flags (described above) */ + unsigned long flags; + + /* reference count */ + atomic_t refs; + + /* the inode we belong to */ + struct inode *inode; + + /* list of checksums for insertion when the extent io is done */ + struct list_head list; + + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ + wait_queue_head_t wait; + + /* our friendly rbtree entry */ + struct rb_node rb_node; + + /* a per root list of all the pending ordered extents */ + struct list_head root_extent_list; +}; + + +/* + * calculates the total size you need to allocate for an ordered sum + * structure spanning 'bytes' in the file + */ +static inline int btrfs_ordered_sum_size(struct btrfs_root *root, + unsigned long bytes) +{ + unsigned long num_sectors = (bytes + root->sectorsize - 1) / + root->sectorsize; + num_sectors++; + return sizeof(struct btrfs_ordered_sum) + + num_sectors * sizeof(struct btrfs_sector_sum); +} + +static inline void +btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) +{ + mutex_init(&t->mutex); + t->tree.rb_node = NULL; + t->last = NULL; +} + +int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); +int btrfs_remove_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry); +int btrfs_dec_test_ordered_pending(struct inode *inode, + u64 file_offset, u64 io_size); +int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset, + u64 start, u64 len, u64 disk_len, int tyep); +int btrfs_add_ordered_sum(struct inode *inode, + struct btrfs_ordered_extent *entry, + struct btrfs_ordered_sum *sum); +struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode, + u64 file_offset); +void btrfs_start_ordered_extent(struct inode *inode, + struct btrfs_ordered_extent *entry, int wait); +int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); +struct btrfs_ordered_extent * +btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +int btrfs_ordered_update_i_size(struct inode *inode, + struct btrfs_ordered_extent *ordered); +int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u32 *sum); +int btrfs_wait_on_page_writeback_range(struct address_space *mapping, + pgoff_t start, pgoff_t end); +int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start, + loff_t end, int sync_mode); +int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only); +#endif diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c new file mode 100644 index 00000000000..3c0d52af4f8 --- /dev/null +++ b/fs/btrfs/orphan.c @@ -0,0 +1,67 @@ +/* + * Copyright (C) 2008 Red Hat. All rights reserved. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "disk-io.h" + +int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_free_path(path); + return ret; +} + +int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 offset) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + key.objectid = BTRFS_ORPHAN_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = offset; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) + goto out; + + ret = btrfs_del_item(trans, root, path); + +out: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c new file mode 100644 index 00000000000..5f8f218c100 --- /dev/null +++ b/fs/btrfs/print-tree.c @@ -0,0 +1,216 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" + +static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) +{ + int num_stripes = btrfs_chunk_num_stripes(eb, chunk); + int i; + printk(KERN_INFO "\t\tchunk length %llu owner %llu type %llu " + "num_stripes %d\n", + (unsigned long long)btrfs_chunk_length(eb, chunk), + (unsigned long long)btrfs_chunk_owner(eb, chunk), + (unsigned long long)btrfs_chunk_type(eb, chunk), + num_stripes); + for (i = 0 ; i < num_stripes ; i++) { + printk(KERN_INFO "\t\t\tstripe %d devid %llu offset %llu\n", i, + (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i), + (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i)); + } +} +static void print_dev_item(struct extent_buffer *eb, + struct btrfs_dev_item *dev_item) +{ + printk(KERN_INFO "\t\tdev item devid %llu " + "total_bytes %llu bytes used %llu\n", + (unsigned long long)btrfs_device_id(eb, dev_item), + (unsigned long long)btrfs_device_total_bytes(eb, dev_item), + (unsigned long long)btrfs_device_bytes_used(eb, dev_item)); +} +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) +{ + int i; + u32 nr = btrfs_header_nritems(l); + struct btrfs_item *item; + struct btrfs_extent_item *ei; + struct btrfs_root_item *ri; + struct btrfs_dir_item *di; + struct btrfs_inode_item *ii; + struct btrfs_block_group_item *bi; + struct btrfs_file_extent_item *fi; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_extent_ref *ref; + struct btrfs_dev_extent *dev_extent; + u32 type; + + printk(KERN_INFO "leaf %llu total ptrs %d free space %d\n", + (unsigned long long)btrfs_header_bytenr(l), nr, + btrfs_leaf_free_space(root, l)); + for (i = 0 ; i < nr ; i++) { + item = btrfs_item_nr(l, i); + btrfs_item_key_to_cpu(l, &key, i); + type = btrfs_key_type(&key); + printk(KERN_INFO "\titem %d key (%llu %x %llu) itemoff %d " + "itemsize %d\n", + i, + (unsigned long long)key.objectid, type, + (unsigned long long)key.offset, + btrfs_item_offset(l, item), btrfs_item_size(l, item)); + switch (type) { + case BTRFS_INODE_ITEM_KEY: + ii = btrfs_item_ptr(l, i, struct btrfs_inode_item); + printk(KERN_INFO "\t\tinode generation %llu size %llu " + "mode %o\n", + (unsigned long long) + btrfs_inode_generation(l, ii), + (unsigned long long)btrfs_inode_size(l, ii), + btrfs_inode_mode(l, ii)); + break; + case BTRFS_DIR_ITEM_KEY: + di = btrfs_item_ptr(l, i, struct btrfs_dir_item); + btrfs_dir_item_key_to_cpu(l, di, &found_key); + printk(KERN_INFO "\t\tdir oid %llu type %u\n", + (unsigned long long)found_key.objectid, + btrfs_dir_type(l, di)); + break; + case BTRFS_ROOT_ITEM_KEY: + ri = btrfs_item_ptr(l, i, struct btrfs_root_item); + printk(KERN_INFO "\t\troot data bytenr %llu refs %u\n", + (unsigned long long) + btrfs_disk_root_bytenr(l, ri), + btrfs_disk_root_refs(l, ri)); + break; + case BTRFS_EXTENT_ITEM_KEY: + ei = btrfs_item_ptr(l, i, struct btrfs_extent_item); + printk(KERN_INFO "\t\textent data refs %u\n", + btrfs_extent_refs(l, ei)); + break; + case BTRFS_EXTENT_REF_KEY: + ref = btrfs_item_ptr(l, i, struct btrfs_extent_ref); + printk(KERN_INFO "\t\textent back ref root %llu " + "gen %llu owner %llu num_refs %lu\n", + (unsigned long long)btrfs_ref_root(l, ref), + (unsigned long long)btrfs_ref_generation(l, ref), + (unsigned long long)btrfs_ref_objectid(l, ref), + (unsigned long)btrfs_ref_num_refs(l, ref)); + break; + + case BTRFS_EXTENT_DATA_KEY: + fi = btrfs_item_ptr(l, i, + struct btrfs_file_extent_item); + if (btrfs_file_extent_type(l, fi) == + 
BTRFS_FILE_EXTENT_INLINE) { + printk(KERN_INFO "\t\tinline extent data " + "size %u\n", + btrfs_file_extent_inline_len(l, fi)); + break; + } + printk(KERN_INFO "\t\textent data disk bytenr %llu " + "nr %llu\n", + (unsigned long long) + btrfs_file_extent_disk_bytenr(l, fi), + (unsigned long long) + btrfs_file_extent_disk_num_bytes(l, fi)); + printk(KERN_INFO "\t\textent data offset %llu " + "nr %llu ram %llu\n", + (unsigned long long) + btrfs_file_extent_offset(l, fi), + (unsigned long long) + btrfs_file_extent_num_bytes(l, fi), + (unsigned long long) + btrfs_file_extent_ram_bytes(l, fi)); + break; + case BTRFS_BLOCK_GROUP_ITEM_KEY: + bi = btrfs_item_ptr(l, i, + struct btrfs_block_group_item); + printk(KERN_INFO "\t\tblock group used %llu\n", + (unsigned long long) + btrfs_disk_block_group_used(l, bi)); + break; + case BTRFS_CHUNK_ITEM_KEY: + print_chunk(l, btrfs_item_ptr(l, i, + struct btrfs_chunk)); + break; + case BTRFS_DEV_ITEM_KEY: + print_dev_item(l, btrfs_item_ptr(l, i, + struct btrfs_dev_item)); + break; + case BTRFS_DEV_EXTENT_KEY: + dev_extent = btrfs_item_ptr(l, i, + struct btrfs_dev_extent); + printk(KERN_INFO "\t\tdev extent chunk_tree %llu\n" + "\t\tchunk objectid %llu chunk offset %llu " + "length %llu\n", + (unsigned long long) + btrfs_dev_extent_chunk_tree(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_objectid(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_chunk_offset(l, dev_extent), + (unsigned long long) + btrfs_dev_extent_length(l, dev_extent)); + }; + } +} + +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) +{ + int i; u32 nr; + struct btrfs_key key; + int level; + + if (!c) + return; + nr = btrfs_header_nritems(c); + level = btrfs_header_level(c); + if (level == 0) { + btrfs_print_leaf(root, c); + return; + } + printk(KERN_INFO "node %llu level %d total ptrs %d free spc %u\n", + (unsigned long long)btrfs_header_bytenr(c), + btrfs_header_level(c), nr, + (u32)BTRFS_NODEPTRS_PER_BLOCK(root) - nr); + for (i = 0; i < nr; i++) { + btrfs_node_key_to_cpu(c, &key, i); + printk(KERN_INFO "\tkey %d (%llu %u %llu) block %llu\n", + i, + (unsigned long long)key.objectid, + key.type, + (unsigned long long)key.offset, + (unsigned long long)btrfs_node_blockptr(c, i)); + } + for (i = 0; i < nr; i++) { + struct extent_buffer *next = read_tree_block(root, + btrfs_node_blockptr(c, i), + btrfs_level_size(root, level - 1), + btrfs_node_ptr_generation(c, i)); + if (btrfs_is_leaf(next) && + btrfs_header_level(c) != 1) + BUG(); + if (btrfs_header_level(next) != + btrfs_header_level(c) - 1) + BUG(); + btrfs_print_tree(root, next); + free_extent_buffer(next); + } +} diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h new file mode 100644 index 00000000000..da75efe534d --- /dev/null +++ b/fs/btrfs/print-tree.h @@ -0,0 +1,23 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __PRINT_TREE_ +#define __PRINT_TREE_ +void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l); +void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *t); +#endif diff --git a/fs/btrfs/ref-cache.c b/fs/btrfs/ref-cache.c new file mode 100644 index 00000000000..6f0acc4c9ea --- /dev/null +++ b/fs/btrfs/ref-cache.c @@ -0,0 +1,230 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "ref-cache.h" +#include "transaction.h" + +/* + * leaf refs are used to cache the information about which extents + * a given leaf has references on. This allows us to process that leaf + * in btrfs_drop_snapshot without needing to read it back from disk. + */ + +/* + * kmalloc a leaf reference struct and update the counters for the + * total ref cache size + */ +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents) +{ + struct btrfs_leaf_ref *ref; + size_t size = btrfs_leaf_ref_size(nr_extents); + + ref = kmalloc(size, GFP_NOFS); + if (ref) { + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size += size; + spin_unlock(&root->fs_info->ref_cache_lock); + + memset(ref, 0, sizeof(*ref)); + atomic_set(&ref->usage, 1); + INIT_LIST_HEAD(&ref->list); + } + return ref; +} + +/* + * free a leaf reference struct and update the counters for the + * total ref cache size + */ +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + if (!ref) + return; + WARN_ON(atomic_read(&ref->usage) == 0); + if (atomic_dec_and_test(&ref->usage)) { + size_t size = btrfs_leaf_ref_size(ref->nritems); + + BUG_ON(ref->in_tree); + kfree(ref); + + spin_lock(&root->fs_info->ref_cache_lock); + root->fs_info->total_ref_cache_size -= size; + spin_unlock(&root->fs_info->ref_cache_lock); + } +} + +static struct rb_node *tree_insert(struct rb_root *root, u64 bytenr, + struct rb_node *node) +{ + struct rb_node **p = &root->rb_node; + struct rb_node *parent = NULL; + struct btrfs_leaf_ref *entry; + + while (*p) { + parent = *p; + entry = rb_entry(parent, struct btrfs_leaf_ref, rb_node); + + if (bytenr < entry->bytenr) + p = &(*p)->rb_left; + else if (bytenr > entry->bytenr) + p = &(*p)->rb_right; + else + return parent; + } + + entry = rb_entry(node, struct btrfs_leaf_ref, rb_node); + rb_link_node(node, parent, p); + rb_insert_color(node, root); + return NULL; +} + +static struct rb_node *tree_search(struct rb_root *root, u64 bytenr) +{ + struct rb_node *n = root->rb_node; + struct btrfs_leaf_ref *entry; + + while (n) { + entry = rb_entry(n, struct btrfs_leaf_ref, rb_node); + WARN_ON(!entry->in_tree); + + if (bytenr < entry->bytenr) + n = n->rb_left; + else if (bytenr > entry->bytenr) + n = n->rb_right; + else + return n; + } + return NULL; +} + +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 
max_root_gen, + int shared) +{ + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + if (!tree) + return 0; + + spin_lock(&tree->lock); + while (!list_empty(&tree->list)) { + ref = list_entry(tree->list.next, struct btrfs_leaf_ref, list); + BUG_ON(ref->tree != tree); + if (ref->root_gen > max_root_gen) + break; + if (!xchg(&ref->in_tree, 0)) { + cond_resched_lock(&tree->lock); + continue; + } + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + btrfs_free_leaf_ref(root, ref); + cond_resched(); + spin_lock(&tree->lock); + } + spin_unlock(&tree->lock); + return 0; +} + +/* + * find the leaf ref for a given extent. This returns the ref struct with + * a usage reference incremented + */ +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr) +{ + struct rb_node *rb; + struct btrfs_leaf_ref *ref = NULL; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; +again: + if (tree) { + spin_lock(&tree->lock); + rb = tree_search(&tree->root, bytenr); + if (rb) + ref = rb_entry(rb, struct btrfs_leaf_ref, rb_node); + if (ref) + atomic_inc(&ref->usage); + spin_unlock(&tree->lock); + if (ref) + return ref; + } + if (tree != &root->fs_info->shared_ref_tree) { + tree = &root->fs_info->shared_ref_tree; + goto again; + } + return NULL; +} + +/* + * add a fully filled in leaf ref struct + * remove all the refs older than a given root generation + */ +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared) +{ + int ret = 0; + struct rb_node *rb; + struct btrfs_leaf_ref_tree *tree = root->ref_tree; + + if (shared) + tree = &root->fs_info->shared_ref_tree; + + spin_lock(&tree->lock); + rb = tree_insert(&tree->root, ref->bytenr, &ref->rb_node); + if (rb) { + ret = -EEXIST; + } else { + atomic_inc(&ref->usage); + ref->tree = tree; + ref->in_tree = 1; + list_add_tail(&ref->list, &tree->list); + } + spin_unlock(&tree->lock); + return ret; +} + +/* + * remove a single leaf ref from the tree. This drops the ref held by the tree + * only + */ +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref) +{ + struct btrfs_leaf_ref_tree *tree; + + if (!xchg(&ref->in_tree, 0)) + return 0; + + tree = ref->tree; + spin_lock(&tree->lock); + + rb_erase(&ref->rb_node, &tree->root); + list_del_init(&ref->list); + + spin_unlock(&tree->lock); + + btrfs_free_leaf_ref(root, ref); + return 0; +} diff --git a/fs/btrfs/ref-cache.h b/fs/btrfs/ref-cache.h new file mode 100644 index 00000000000..16f3183d7c5 --- /dev/null +++ b/fs/btrfs/ref-cache.h @@ -0,0 +1,77 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#ifndef __REFCACHE__ +#define __REFCACHE__ + +struct btrfs_extent_info { + /* bytenr and num_bytes find the extent in the extent allocation tree */ + u64 bytenr; + u64 num_bytes; + + /* objectid and offset find the back reference for the file */ + u64 objectid; + u64 offset; +}; + +struct btrfs_leaf_ref { + struct rb_node rb_node; + struct btrfs_leaf_ref_tree *tree; + int in_tree; + atomic_t usage; + + u64 root_gen; + u64 bytenr; + u64 owner; + u64 generation; + int nritems; + + struct list_head list; + struct btrfs_extent_info extents[]; +}; + +static inline size_t btrfs_leaf_ref_size(int nr_extents) +{ + return sizeof(struct btrfs_leaf_ref) + + sizeof(struct btrfs_extent_info) * nr_extents; +} + +static inline void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree) +{ + tree->root.rb_node = NULL; + INIT_LIST_HEAD(&tree->list); + spin_lock_init(&tree->lock); +} + +static inline int btrfs_leaf_ref_tree_empty(struct btrfs_leaf_ref_tree *tree) +{ + return RB_EMPTY_ROOT(&tree->root); +} + +void btrfs_leaf_ref_tree_init(struct btrfs_leaf_ref_tree *tree); +struct btrfs_leaf_ref *btrfs_alloc_leaf_ref(struct btrfs_root *root, + int nr_extents); +void btrfs_free_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); +struct btrfs_leaf_ref *btrfs_lookup_leaf_ref(struct btrfs_root *root, + u64 bytenr); +int btrfs_add_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref, + int shared); +int btrfs_remove_leaf_refs(struct btrfs_root *root, u64 max_root_gen, + int shared); +int btrfs_remove_leaf_ref(struct btrfs_root *root, struct btrfs_leaf_ref *ref); + +#endif diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c new file mode 100644 index 00000000000..b48650de447 --- /dev/null +++ b/fs/btrfs/root-tree.c @@ -0,0 +1,366 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "print-tree.h" + +/* + * search forward for a root, starting with objectid 'search_start' + * if a root key is found, the objectid we find is filled into 'found_objectid' + * and 0 is returned. < 0 is returned on error, 1 if there is nothing + * left in the tree. 
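+ * + * the search is always done in the tree of tree roots, the 'root' that + * is passed in is only used to find fs_info.  Keys that are not root + * items are skipped by bumping the key offset and searching again.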
+ */ +int btrfs_search_root(struct btrfs_root *root, u64 search_start, + u64 *found_objectid) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + int ret; + + root = root->fs_info->tree_root; + search_key.objectid = search_start; + search_key.type = (u8)-1; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); +again: + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + if (ret == 0) { + ret = 1; + goto out; + } + if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } + btrfs_item_key_to_cpu(path->nodes[0], &search_key, path->slots[0]); + if (search_key.type != BTRFS_ROOT_ITEM_KEY) { + search_key.offset++; + btrfs_release_path(root, path); + goto again; + } + ret = 0; + *found_objectid = search_key.objectid; + +out: + btrfs_free_path(path); + return ret; +} + +/* + * lookup the root with the highest offset for a given objectid. The key we do + * find is copied into 'key'. If we find something return 0, otherwise 1, < 0 + * on error. + */ +int btrfs_find_last_root(struct btrfs_root *root, u64 objectid, + struct btrfs_root_item *item, struct btrfs_key *key) +{ + struct btrfs_path *path; + struct btrfs_key search_key; + struct btrfs_key found_key; + struct extent_buffer *l; + int ret; + int slot; + + search_key.objectid = objectid; + search_key.type = BTRFS_ROOT_ITEM_KEY; + search_key.offset = (u64)-1; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); + if (ret < 0) + goto out; + + BUG_ON(ret == 0); + l = path->nodes[0]; + BUG_ON(path->slots[0] == 0); + slot = path->slots[0] - 1; + btrfs_item_key_to_cpu(l, &found_key, slot); + if (found_key.objectid != objectid) { + ret = 1; + goto out; + } + read_extent_buffer(l, item, btrfs_item_ptr_offset(l, slot), + sizeof(*item)); + memcpy(key, &found_key, sizeof(found_key)); + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +/* + * copy the data in 'item' into the btree + */ +int btrfs_update_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + struct btrfs_path *path; + struct extent_buffer *l; + int ret; + int slot; + unsigned long ptr; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret != 0) { + btrfs_print_leaf(root, path->nodes[0]); + printk(KERN_CRIT "unable to update root key %llu %u %llu\n", + (unsigned long long)key->objectid, key->type, + (unsigned long long)key->offset); + BUG_ON(1); + } + + l = path->nodes[0]; + slot = path->slots[0]; + ptr = btrfs_item_ptr_offset(l, slot); + write_extent_buffer(l, item, ptr, sizeof(*item)); + btrfs_mark_buffer_dirty(path->nodes[0]); +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + return ret; +} + +int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root + *root, struct btrfs_key *key, struct btrfs_root_item + *item) +{ + int ret; + ret = btrfs_insert_item(trans, root, key, item, sizeof(*item)); + return ret; +} + +/* + * at mount time we want to find all the old transaction snapshots that were in + * the process of being deleted if we crashed. This is any root item with an + * offset lower than the latest root. They need to be queued for deletion to + * finish what was happening when we crashed. 
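+ * + * root items that still have a non-zero reference count are skipped. + * Reloc tree roots (objectid BTRFS_TREE_RELOC_OBJECTID) are queued on + * the dead reloc list, everything else goes on the regular dead root + * list.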
+ */ +int btrfs_find_dead_roots(struct btrfs_root *root, u64 objectid, + struct btrfs_root *latest) +{ + struct btrfs_root *dead_root; + struct btrfs_item *item; + struct btrfs_root_item *ri; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + int ret; + u32 nritems; + struct extent_buffer *leaf; + int slot; + + key.objectid = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.offset = 0; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + if (slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &key, slot); + if (btrfs_key_type(&key) != BTRFS_ROOT_ITEM_KEY) + goto next; + + if (key.objectid < objectid) + goto next; + + if (key.objectid > objectid) + break; + + ri = btrfs_item_ptr(leaf, slot, struct btrfs_root_item); + if (btrfs_disk_root_refs(leaf, ri) != 0) + goto next; + + memcpy(&found_key, &key, sizeof(key)); + key.offset++; + btrfs_release_path(root, path); + dead_root = + btrfs_read_fs_root_no_radix(root->fs_info->tree_root, + &found_key); + if (IS_ERR(dead_root)) { + ret = PTR_ERR(dead_root); + goto err; + } + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) + ret = btrfs_add_dead_reloc_root(dead_root); + else + ret = btrfs_add_dead_root(dead_root, latest); + if (ret) + goto err; + goto again; +next: + slot++; + path->slots[0]++; + } + ret = 0; +err: + btrfs_free_path(path); + return ret; +} + +/* drop the root item for 'key' from 'root' */ +int btrfs_del_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, + struct btrfs_key *key) +{ + struct btrfs_path *path; + int ret; + u32 refs; + struct btrfs_root_item *ri; + struct extent_buffer *leaf; + + path = btrfs_alloc_path(); + BUG_ON(!path); + ret = btrfs_search_slot(trans, root, key, path, -1, 1); + if (ret < 0) + goto out; + + BUG_ON(ret != 0); + leaf = path->nodes[0]; + ri = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_item); + + refs = btrfs_disk_root_refs(leaf, ri); + BUG_ON(refs != 0); + ret = btrfs_del_item(trans, root, path); +out: + btrfs_release_path(root, path); + btrfs_free_path(path); + return ret; +} + +#if 0 /* this will get used when snapshot deletion is implemented */ +int btrfs_del_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + + path = btrfs_alloc_path(); + + key.objectid = root_id; + key.type = type; + key.offset = ref_id; + + ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, tree_root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} +#endif + +int btrfs_find_root_ref(struct btrfs_root *tree_root, + struct btrfs_path *path, + u64 root_id, u64 ref_id) +{ + struct btrfs_key key; + int ret; + + key.objectid = root_id; + key.type = BTRFS_ROOT_REF_KEY; + key.offset = ref_id; + + ret = btrfs_search_slot(NULL, tree_root, &key, path, 0, 0); + return ret; +} + + +/* + * add a btrfs_root_ref item. type is either BTRFS_ROOT_REF_KEY + * or BTRFS_ROOT_BACKREF_KEY. + * + * The dirid, sequence, name and name_len refer to the directory entry + * that is referencing the root. 
+ * + * For a forward ref, the root_id is the id of the tree referencing + * the root and ref_id is the id of the subvol or snapshot. + * + * For a back ref the root_id is the id of the subvol or snapshot and + * ref_id is the id of the tree referencing it. + */ +int btrfs_add_root_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *tree_root, + u64 root_id, u8 type, u64 ref_id, + u64 dirid, u64 sequence, + const char *name, int name_len) +{ + struct btrfs_key key; + int ret; + struct btrfs_path *path; + struct btrfs_root_ref *ref; + struct extent_buffer *leaf; + unsigned long ptr; + + + path = btrfs_alloc_path(); + + key.objectid = root_id; + key.type = type; + key.offset = ref_id; + + ret = btrfs_insert_empty_item(trans, tree_root, path, &key, + sizeof(*ref) + name_len); + BUG_ON(ret); + + leaf = path->nodes[0]; + ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref); + btrfs_set_root_ref_dirid(leaf, ref, dirid); + btrfs_set_root_ref_sequence(leaf, ref, sequence); + btrfs_set_root_ref_name_len(leaf, ref, name_len); + ptr = (unsigned long)(ref + 1); + write_extent_buffer(leaf, name, ptr, name_len); + btrfs_mark_buffer_dirty(leaf); + + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c new file mode 100644 index 00000000000..c0f7ecaf1e7 --- /dev/null +++ b/fs/btrfs/struct-funcs.c @@ -0,0 +1,139 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/highmem.h> + +/* this is some deeply nasty code. ctree.h has a different + * definition for this BTRFS_SETGET_FUNCS macro, behind a #ifndef + * + * The end result is that anyone who #includes ctree.h gets a + * declaration for the btrfs_set_foo functions and btrfs_foo functions + * + * This file declares the macros and then #includes ctree.h, which results + * in cpp creating the function here based on the template below. + * + * These setget functions do all the extent_buffer related mapping + * required to efficiently read and write specific fields in the extent + * buffers. Every pointer to metadata items in btrfs is really just + * an unsigned long offset into the extent buffer which has been + * cast to a specific type. This gives us all the gcc type checking. + * + * The extent buffer api is used to do all the kmapping and page + * spanning work required to get extent buffers in highmem and have + * a metadata blocksize different from the page size. + * + * The macro starts with a simple function prototype declaration so that + * sparse won't complain about it being static. 
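+ * + * As one illustration (taking one of the BTRFS_SETGET_FUNCS invocations + * in ctree.h): BTRFS_SETGET_FUNCS(inode_size, struct btrfs_inode_item, + * size, 64) expands here into u64 btrfs_inode_size(struct extent_buffer + * *eb, struct btrfs_inode_item *s) and the matching + * btrfs_set_inode_size() setter; print-tree.c uses the generated getter + * when it dumps inode items.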
+ */ + +#define BTRFS_SETGET_FUNCS(name, type, member, bits) \ +u##bits btrfs_##name(struct extent_buffer *eb, type *s); \ +void btrfs_set_##name(struct extent_buffer *eb, type *s, u##bits val); \ +u##bits btrfs_##name(struct extent_buffer *eb, \ + type *s) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + return le##bits##_to_cpu(p->member); \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + u##bits res; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits leres; \ + read_eb_member(eb, s, type, member, &leres); \ + return le##bits##_to_cpu(leres); \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + res = le##bits##_to_cpu(p->member); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + return res; \ + } \ +} \ +void btrfs_set_##name(struct extent_buffer *eb, \ + type *s, u##bits val) \ +{ \ + unsigned long part_offset = (unsigned long)s; \ + unsigned long offset = part_offset + offsetof(type, member); \ + type *p; \ + /* ugly, but we want the fast path here */ \ + if (eb->map_token && offset >= eb->map_start && \ + offset + sizeof(((type *)0)->member) <= eb->map_start + \ + eb->map_len) { \ + p = (type *)(eb->kaddr + part_offset - eb->map_start); \ + p->member = cpu_to_le##bits(val); \ + return; \ + } \ + { \ + int err; \ + char *map_token; \ + char *kaddr; \ + int unmap_on_exit = (eb->map_token == NULL); \ + unsigned long map_start; \ + unsigned long map_len; \ + err = map_extent_buffer(eb, offset, \ + sizeof(((type *)0)->member), \ + &map_token, &kaddr, \ + &map_start, &map_len, KM_USER1); \ + if (err) { \ + __le##bits val2; \ + val2 = cpu_to_le##bits(val); \ + write_eb_member(eb, s, type, member, &val2); \ + return; \ + } \ + p = (type *)(kaddr + part_offset - map_start); \ + p->member = cpu_to_le##bits(val); \ + if (unmap_on_exit) \ + unmap_extent_buffer(eb, map_token, KM_USER1); \ + } \ +} + +#include "ctree.h" + +void btrfs_node_key(struct extent_buffer *eb, + struct btrfs_disk_key *disk_key, int nr) +{ + unsigned long ptr = btrfs_node_key_ptr_offset(nr); + if (eb->map_token && ptr >= eb->map_start && + ptr + sizeof(*disk_key) <= eb->map_start + eb->map_len) { + memcpy(disk_key, eb->kaddr + ptr - eb->map_start, + sizeof(*disk_key)); + return; + } else if (eb->map_token) { + unmap_extent_buffer(eb, eb->map_token, KM_USER1); + eb->map_token = NULL; + } + read_eb_member(eb, (struct btrfs_key_ptr *)ptr, + struct btrfs_key_ptr, key, disk_key); +} diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c new file mode 100644 index 00000000000..b4c101d9322 --- /dev/null +++ b/fs/btrfs/super.c @@ -0,0 +1,720 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/blkdev.h> +#include <linux/module.h> +#include <linux/buffer_head.h> +#include <linux/fs.h> +#include <linux/pagemap.h> +#include <linux/highmem.h> +#include <linux/time.h> +#include <linux/init.h> +#include <linux/string.h> +#include <linux/smp_lock.h> +#include <linux/backing-dev.h> +#include <linux/mount.h> +#include <linux/mpage.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/statfs.h> +#include <linux/compat.h> +#include <linux/parser.h> +#include <linux/ctype.h> +#include <linux/namei.h> +#include <linux/miscdevice.h> +#include <linux/version.h> +#include "compat.h" +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "btrfs_inode.h" +#include "ioctl.h" +#include "print-tree.h" +#include "xattr.h" +#include "volumes.h" +#include "version.h" +#include "export.h" +#include "compression.h" + +#define BTRFS_SUPER_MAGIC 0x9123683E + +static struct super_operations btrfs_super_ops; + +static void btrfs_put_super(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + ret = close_ctree(root); + sb->s_fs_info = NULL; +} + +enum { + Opt_degraded, Opt_subvol, Opt_device, Opt_nodatasum, Opt_nodatacow, + Opt_max_extent, Opt_max_inline, Opt_alloc_start, Opt_nobarrier, + Opt_ssd, Opt_thread_pool, Opt_noacl, Opt_compress, Opt_err, +}; + +static match_table_t tokens = { + {Opt_degraded, "degraded"}, + {Opt_subvol, "subvol=%s"}, + {Opt_device, "device=%s"}, + {Opt_nodatasum, "nodatasum"}, + {Opt_nodatacow, "nodatacow"}, + {Opt_nobarrier, "nobarrier"}, + {Opt_max_extent, "max_extent=%s"}, + {Opt_max_inline, "max_inline=%s"}, + {Opt_alloc_start, "alloc_start=%s"}, + {Opt_thread_pool, "thread_pool=%d"}, + {Opt_compress, "compress"}, + {Opt_ssd, "ssd"}, + {Opt_noacl, "noacl"}, + {Opt_err, NULL}, +}; + +u64 btrfs_parse_size(char *str) +{ + u64 res; + int mult = 1; + char *end; + char last; + + res = simple_strtoul(str, &end, 10); + + last = end[0]; + if (isalpha(last)) { + last = tolower(last); + switch (last) { + case 'g': + mult *= 1024; + case 'm': + mult *= 1024; + case 'k': + mult *= 1024; + } + res = res * mult; + } + return res; +} + +/* + * Regular mount options parser. Everything that is needed only when + * reading in a new superblock is parsed here. 
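+ * + * The size style options (max_extent, max_inline, alloc_start) are run + * through btrfs_parse_size() above, so both plain byte counts and + * k/m/g suffixed values (max_inline=8k for example) are accepted.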
+ */ +int btrfs_parse_options(struct btrfs_root *root, char *options) +{ + struct btrfs_fs_info *info = root->fs_info; + substring_t args[MAX_OPT_ARGS]; + char *p, *num; + int intarg; + + if (!options) + return 0; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + options = kstrdup(options, GFP_NOFS); + if (!options) + return -ENOMEM; + + + while ((p = strsep(&options, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_degraded: + printk(KERN_INFO "btrfs: allowing degraded mounts\n"); + btrfs_set_opt(info->mount_opt, DEGRADED); + break; + case Opt_subvol: + case Opt_device: + /* + * These are parsed by btrfs_parse_early_options + * and can be happily ignored here. + */ + break; + case Opt_nodatasum: + printk(KERN_INFO "btrfs: setting nodatacsum\n"); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_nodatacow: + printk(KERN_INFO "btrfs: setting nodatacow\n"); + btrfs_set_opt(info->mount_opt, NODATACOW); + btrfs_set_opt(info->mount_opt, NODATASUM); + break; + case Opt_compress: + printk(KERN_INFO "btrfs: use compression\n"); + btrfs_set_opt(info->mount_opt, COMPRESS); + break; + case Opt_ssd: + printk(KERN_INFO "btrfs: use ssd allocation scheme\n"); + btrfs_set_opt(info->mount_opt, SSD); + break; + case Opt_nobarrier: + printk(KERN_INFO "btrfs: turning off barriers\n"); + btrfs_set_opt(info->mount_opt, NOBARRIER); + break; + case Opt_thread_pool: + intarg = 0; + match_int(&args[0], &intarg); + if (intarg) { + info->thread_pool_size = intarg; + printk(KERN_INFO "btrfs: thread pool %d\n", + info->thread_pool_size); + } + break; + case Opt_max_extent: + num = match_strdup(&args[0]); + if (num) { + info->max_extent = btrfs_parse_size(num); + kfree(num); + + info->max_extent = max_t(u64, + info->max_extent, root->sectorsize); + printk(KERN_INFO "btrfs: max_extent at %llu\n", + info->max_extent); + } + break; + case Opt_max_inline: + num = match_strdup(&args[0]); + if (num) { + info->max_inline = btrfs_parse_size(num); + kfree(num); + + if (info->max_inline) { + info->max_inline = max_t(u64, + info->max_inline, + root->sectorsize); + } + printk(KERN_INFO "btrfs: max_inline at %llu\n", + info->max_inline); + } + break; + case Opt_alloc_start: + num = match_strdup(&args[0]); + if (num) { + info->alloc_start = btrfs_parse_size(num); + kfree(num); + printk(KERN_INFO + "btrfs: allocations start at %llu\n", + info->alloc_start); + } + break; + case Opt_noacl: + root->fs_info->sb->s_flags &= ~MS_POSIXACL; + break; + default: + break; + } + } + kfree(options); + return 0; +} + +/* + * Parse mount options that are required early in the mount process. + * + * All other options will be parsed on much later in the mount process and + * only when we need to allocate a new super block. 
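+ * + * Only 'subvol=' and 'device=' are acted on here; every other token + * falls through to the default case and is picked up later by + * btrfs_parse_options().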
+ */ +static int btrfs_parse_early_options(const char *options, fmode_t flags, + void *holder, char **subvol_name, + struct btrfs_fs_devices **fs_devices) +{ + substring_t args[MAX_OPT_ARGS]; + char *opts, *p; + int error = 0; + + if (!options) + goto out; + + /* + * strsep changes the string, duplicate it because parse_options + * gets called twice + */ + opts = kstrdup(options, GFP_KERNEL); + if (!opts) + return -ENOMEM; + + while ((p = strsep(&opts, ",")) != NULL) { + int token; + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_subvol: + *subvol_name = match_strdup(&args[0]); + break; + case Opt_device: + error = btrfs_scan_one_device(match_strdup(&args[0]), + flags, holder, fs_devices); + if (error) + goto out_free_opts; + break; + default: + break; + } + } + + out_free_opts: + kfree(opts); + out: + /* + * If no subvolume name is specified we use the default one. Allocate + * a copy of the string "." here so that code later in the + * mount path doesn't care if it's the default volume or another one. + */ + if (!*subvol_name) { + *subvol_name = kstrdup(".", GFP_KERNEL); + if (!*subvol_name) + return -ENOMEM; + } + return error; +} + +static int btrfs_fill_super(struct super_block *sb, + struct btrfs_fs_devices *fs_devices, + void *data, int silent) +{ + struct inode *inode; + struct dentry *root_dentry; + struct btrfs_super_block *disk_super; + struct btrfs_root *tree_root; + struct btrfs_inode *bi; + int err; + + sb->s_maxbytes = MAX_LFS_FILESIZE; + sb->s_magic = BTRFS_SUPER_MAGIC; + sb->s_op = &btrfs_super_ops; + sb->s_export_op = &btrfs_export_ops; + sb->s_xattr = btrfs_xattr_handlers; + sb->s_time_gran = 1; + sb->s_flags |= MS_POSIXACL; + + tree_root = open_ctree(sb, fs_devices, (char *)data); + + if (IS_ERR(tree_root)) { + printk("btrfs: open_ctree failed\n"); + return PTR_ERR(tree_root); + } + sb->s_fs_info = tree_root; + disk_super = &tree_root->fs_info->super_copy; + inode = btrfs_iget_locked(sb, BTRFS_FIRST_FREE_OBJECTID, + tree_root->fs_info->fs_root); + bi = BTRFS_I(inode); + bi->location.objectid = inode->i_ino; + bi->location.offset = 0; + bi->root = tree_root->fs_info->fs_root; + + btrfs_set_key_type(&bi->location, BTRFS_INODE_ITEM_KEY); + + if (!inode) { + err = -ENOMEM; + goto fail_close; + } + if (inode->i_state & I_NEW) { + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + } + + root_dentry = d_alloc_root(inode); + if (!root_dentry) { + iput(inode); + err = -ENOMEM; + goto fail_close; + } +#if 0 + /* this does the super kobj at the same time */ + err = btrfs_sysfs_add_super(tree_root->fs_info); + if (err) + goto fail_close; +#endif + + sb->s_root = root_dentry; + + save_mount_options(sb, data); + return 0; + +fail_close: + close_ctree(tree_root); + return err; +} + +int btrfs_sync_fs(struct super_block *sb, int wait) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root; + int ret; + root = btrfs_sb(sb); + + if (sb->s_flags & MS_RDONLY) + return 0; + + sb->s_dirt = 0; + if (!wait) { + filemap_flush(root->fs_info->btree_inode->i_mapping); + return 0; + } + + btrfs_start_delalloc_inodes(root); + btrfs_wait_ordered_extents(root, 0); + + btrfs_clean_old_snapshots(root); + trans = btrfs_start_transaction(root, 1); + ret = btrfs_commit_transaction(trans, root); + sb->s_dirt = 0; + return ret; +} + +static void btrfs_write_super(struct super_block *sb) +{ + sb->s_dirt = 0; +} + +static int btrfs_test_super(struct super_block *s, void *data) +{ + struct btrfs_fs_devices *test_fs_devices = data; + struct 
btrfs_root *root = btrfs_sb(s); + + return root->fs_info->fs_devices == test_fs_devices; +} + +/* + * Find a superblock for the given device / mount point. + * + * Note: This is based on get_sb_bdev from fs/super.c with a few additions + * for multiple device setup. Make sure to keep it in sync. + */ +static int btrfs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data, struct vfsmount *mnt) +{ + char *subvol_name = NULL; + struct block_device *bdev = NULL; + struct super_block *s; + struct dentry *root; + struct btrfs_fs_devices *fs_devices = NULL; + fmode_t mode = FMODE_READ; + int error = 0; + + if (!(flags & MS_RDONLY)) + mode |= FMODE_WRITE; + + error = btrfs_parse_early_options(data, mode, fs_type, + &subvol_name, &fs_devices); + if (error) + return error; + + error = btrfs_scan_one_device(dev_name, mode, fs_type, &fs_devices); + if (error) + goto error_free_subvol_name; + + error = btrfs_open_devices(fs_devices, mode, fs_type); + if (error) + goto error_free_subvol_name; + + if (!(flags & MS_RDONLY) && fs_devices->rw_devices == 0) { + error = -EACCES; + goto error_close_devices; + } + + bdev = fs_devices->latest_bdev; + s = sget(fs_type, btrfs_test_super, set_anon_super, fs_devices); + if (IS_ERR(s)) + goto error_s; + + if (s->s_root) { + if ((flags ^ s->s_flags) & MS_RDONLY) { + up_write(&s->s_umount); + deactivate_super(s); + error = -EBUSY; + goto error_close_devices; + } + + btrfs_close_devices(fs_devices); + } else { + char b[BDEVNAME_SIZE]; + + s->s_flags = flags; + strlcpy(s->s_id, bdevname(bdev, b), sizeof(s->s_id)); + error = btrfs_fill_super(s, fs_devices, data, + flags & MS_SILENT ? 1 : 0); + if (error) { + up_write(&s->s_umount); + deactivate_super(s); + goto error_free_subvol_name; + } + + btrfs_sb(s)->fs_info->bdev_holder = fs_type; + s->s_flags |= MS_ACTIVE; + } + + if (!strcmp(subvol_name, ".")) + root = dget(s->s_root); + else { + mutex_lock(&s->s_root->d_inode->i_mutex); + root = lookup_one_len(subvol_name, s->s_root, + strlen(subvol_name)); + mutex_unlock(&s->s_root->d_inode->i_mutex); + + if (IS_ERR(root)) { + up_write(&s->s_umount); + deactivate_super(s); + error = PTR_ERR(root); + goto error_free_subvol_name; + } + if (!root->d_inode) { + dput(root); + up_write(&s->s_umount); + deactivate_super(s); + error = -ENXIO; + goto error_free_subvol_name; + } + } + + mnt->mnt_sb = s; + mnt->mnt_root = root; + + kfree(subvol_name); + return 0; + +error_s: + error = PTR_ERR(s); +error_close_devices: + btrfs_close_devices(fs_devices); +error_free_subvol_name: + kfree(subvol_name); + return error; +} + +static int btrfs_remount(struct super_block *sb, int *flags, char *data) +{ + struct btrfs_root *root = btrfs_sb(sb); + int ret; + + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) + return 0; + + if (*flags & MS_RDONLY) { + sb->s_flags |= MS_RDONLY; + + ret = btrfs_commit_super(root); + WARN_ON(ret); + } else { + if (root->fs_info->fs_devices->rw_devices == 0) + return -EACCES; + + if (btrfs_super_log_root(&root->fs_info->super_copy) != 0) + return -EINVAL; + + ret = btrfs_cleanup_reloc_trees(root); + WARN_ON(ret); + + ret = btrfs_cleanup_fs_roots(root->fs_info); + WARN_ON(ret); + + sb->s_flags &= ~MS_RDONLY; + } + + return 0; +} + +static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct btrfs_root *root = btrfs_sb(dentry->d_sb); + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + int bits = dentry->d_sb->s_blocksize_bits; + __be32 *fsid = (__be32 *)root->fs_info->fsid; + + buf->f_namelen = 
BTRFS_NAME_LEN; + buf->f_blocks = btrfs_super_total_bytes(disk_super) >> bits; + buf->f_bfree = buf->f_blocks - + (btrfs_super_bytes_used(disk_super) >> bits); + buf->f_bavail = buf->f_bfree; + buf->f_bsize = dentry->d_sb->s_blocksize; + buf->f_type = BTRFS_SUPER_MAGIC; + + /* We treat it as constant endianness (it doesn't matter _which_) + because we want the fsid to come out the same whether mounted + on a big-endian or little-endian host */ + buf->f_fsid.val[0] = be32_to_cpu(fsid[0]) ^ be32_to_cpu(fsid[2]); + buf->f_fsid.val[1] = be32_to_cpu(fsid[1]) ^ be32_to_cpu(fsid[3]); + /* Mask in the root object ID too, to disambiguate subvols */ + buf->f_fsid.val[0] ^= BTRFS_I(dentry->d_inode)->root->objectid >> 32; + buf->f_fsid.val[1] ^= BTRFS_I(dentry->d_inode)->root->objectid; + + return 0; +} + +static struct file_system_type btrfs_fs_type = { + .owner = THIS_MODULE, + .name = "btrfs", + .get_sb = btrfs_get_sb, + .kill_sb = kill_anon_super, + .fs_flags = FS_REQUIRES_DEV, +}; + +/* + * used by btrfsctl to scan devices when no FS is mounted + */ +static long btrfs_control_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct btrfs_ioctl_vol_args *vol; + struct btrfs_fs_devices *fs_devices; + int ret = 0; + int len; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + vol = kmalloc(sizeof(*vol), GFP_KERNEL); + if (copy_from_user(vol, (void __user *)arg, sizeof(*vol))) { + ret = -EFAULT; + goto out; + } + len = strnlen(vol->name, BTRFS_PATH_NAME_MAX); + switch (cmd) { + case BTRFS_IOC_SCAN_DEV: + ret = btrfs_scan_one_device(vol->name, FMODE_READ, + &btrfs_fs_type, &fs_devices); + break; + } +out: + kfree(vol); + return ret; +} + +static void btrfs_write_super_lockfs(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_lock(&root->fs_info->transaction_kthread_mutex); + mutex_lock(&root->fs_info->cleaner_mutex); +} + +static void btrfs_unlockfs(struct super_block *sb) +{ + struct btrfs_root *root = btrfs_sb(sb); + mutex_unlock(&root->fs_info->cleaner_mutex); + mutex_unlock(&root->fs_info->transaction_kthread_mutex); +} + +static struct super_operations btrfs_super_ops = { + .delete_inode = btrfs_delete_inode, + .put_super = btrfs_put_super, + .write_super = btrfs_write_super, + .sync_fs = btrfs_sync_fs, + .show_options = generic_show_options, + .write_inode = btrfs_write_inode, + .dirty_inode = btrfs_dirty_inode, + .alloc_inode = btrfs_alloc_inode, + .destroy_inode = btrfs_destroy_inode, + .statfs = btrfs_statfs, + .remount_fs = btrfs_remount, + .write_super_lockfs = btrfs_write_super_lockfs, + .unlockfs = btrfs_unlockfs, +}; + +static const struct file_operations btrfs_ctl_fops = { + .unlocked_ioctl = btrfs_control_ioctl, + .compat_ioctl = btrfs_control_ioctl, + .owner = THIS_MODULE, +}; + +static struct miscdevice btrfs_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = "btrfs-control", + .fops = &btrfs_ctl_fops +}; + +static int btrfs_interface_init(void) +{ + return misc_register(&btrfs_misc); +} + +static void btrfs_interface_exit(void) +{ + if (misc_deregister(&btrfs_misc) < 0) + printk(KERN_INFO "misc_deregister failed for control device"); +} + +static int __init init_btrfs_fs(void) +{ + int err; + + err = btrfs_init_sysfs(); + if (err) + return err; + + err = btrfs_init_cachep(); + if (err) + goto free_sysfs; + + err = extent_io_init(); + if (err) + goto free_cachep; + + err = extent_map_init(); + if (err) + goto free_extent_io; + + err = btrfs_interface_init(); + if (err) + goto free_extent_map; + + err = register_filesystem(&btrfs_fs_type); 
+ if (err) + goto unregister_ioctl; + + printk(KERN_INFO "%s loaded\n", BTRFS_BUILD_VERSION); + return 0; + +unregister_ioctl: + btrfs_interface_exit(); +free_extent_map: + extent_map_exit(); +free_extent_io: + extent_io_exit(); +free_cachep: + btrfs_destroy_cachep(); +free_sysfs: + btrfs_exit_sysfs(); + return err; +} + +static void __exit exit_btrfs_fs(void) +{ + btrfs_destroy_cachep(); + extent_map_exit(); + extent_io_exit(); + btrfs_interface_exit(); + unregister_filesystem(&btrfs_fs_type); + btrfs_exit_sysfs(); + btrfs_cleanup_fs_uuids(); + btrfs_zlib_exit(); +} + +module_init(init_btrfs_fs) +module_exit(exit_btrfs_fs) + +MODULE_LICENSE("GPL"); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c new file mode 100644 index 00000000000..a240b6fa81d --- /dev/null +++ b/fs/btrfs/sysfs.c @@ -0,0 +1,269 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/completion.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/kobject.h> + +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" + +static ssize_t root_blocks_used_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_used(&root->root_item)); +} + +static ssize_t root_block_limit_show(struct btrfs_root *root, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_root_limit(&root->root_item)); +} + +static ssize_t super_blocks_used_show(struct btrfs_fs_info *fs, char *buf) +{ + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_bytes_used(&fs->super_copy)); +} + +static ssize_t super_total_blocks_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_total_bytes(&fs->super_copy)); +} + +static ssize_t super_blocksize_show(struct btrfs_fs_info *fs, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)btrfs_super_sectorsize(&fs->super_copy)); +} + +/* this is for root attrs (subvols/snapshots) */ +struct btrfs_root_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_root *, char *); + ssize_t (*store)(struct btrfs_root *, const char *, size_t); +}; + +#define ROOT_ATTR(name, mode, show, store) \ +static struct btrfs_root_attr btrfs_root_attr_##name = __ATTR(name, mode, \ + show, store) + +ROOT_ATTR(blocks_used, 0444, root_blocks_used_show, NULL); +ROOT_ATTR(block_limit, 0644, root_block_limit_show, NULL); + +static struct attribute *btrfs_root_attrs[] = { + &btrfs_root_attr_blocks_used.attr, + &btrfs_root_attr_block_limit.attr, + NULL, +}; + +/* this is for super attrs (actual full fs) */ +struct btrfs_super_attr { + struct attribute attr; + ssize_t (*show)(struct btrfs_fs_info *, char *); + ssize_t (*store)(struct 
btrfs_fs_info *, const char *, size_t); +}; + +#define SUPER_ATTR(name, mode, show, store) \ +static struct btrfs_super_attr btrfs_super_attr_##name = __ATTR(name, mode, \ + show, store) + +SUPER_ATTR(blocks_used, 0444, super_blocks_used_show, NULL); +SUPER_ATTR(total_blocks, 0444, super_total_blocks_show, NULL); +SUPER_ATTR(blocksize, 0444, super_blocksize_show, NULL); + +static struct attribute *btrfs_super_attrs[] = { + &btrfs_super_attr_blocks_used.attr, + &btrfs_super_attr_total_blocks.attr, + &btrfs_super_attr_blocksize.attr, + NULL, +}; + +static ssize_t btrfs_super_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->show ? a->show(fs, buf) : 0; +} + +static ssize_t btrfs_super_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + struct btrfs_super_attr *a = container_of(attr, + struct btrfs_super_attr, + attr); + + return a->store ? a->store(fs, buf, len) : 0; +} + +static ssize_t btrfs_root_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + + return a->show ? a->show(root, buf) : 0; +} + +static ssize_t btrfs_root_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + struct btrfs_root_attr *a = container_of(attr, + struct btrfs_root_attr, + attr); + return a->store ? 
a->store(root, buf, len) : 0; +} + +static void btrfs_super_release(struct kobject *kobj) +{ + struct btrfs_fs_info *fs = container_of(kobj, struct btrfs_fs_info, + super_kobj); + complete(&fs->kobj_unregister); +} + +static void btrfs_root_release(struct kobject *kobj) +{ + struct btrfs_root *root = container_of(kobj, struct btrfs_root, + root_kobj); + complete(&root->kobj_unregister); +} + +static struct sysfs_ops btrfs_super_attr_ops = { + .show = btrfs_super_attr_show, + .store = btrfs_super_attr_store, +}; + +static struct sysfs_ops btrfs_root_attr_ops = { + .show = btrfs_root_attr_show, + .store = btrfs_root_attr_store, +}; + +static struct kobj_type btrfs_root_ktype = { + .default_attrs = btrfs_root_attrs, + .sysfs_ops = &btrfs_root_attr_ops, + .release = btrfs_root_release, +}; + +static struct kobj_type btrfs_super_ktype = { + .default_attrs = btrfs_super_attrs, + .sysfs_ops = &btrfs_super_attr_ops, + .release = btrfs_super_release, +}; + +/* /sys/fs/btrfs/ entry */ +static struct kset *btrfs_kset; + +int btrfs_sysfs_add_super(struct btrfs_fs_info *fs) +{ + int error; + char *name; + char c; + int len = strlen(fs->sb->s_id) + 1; + int i; + + name = kmalloc(len, GFP_NOFS); + if (!name) { + error = -ENOMEM; + goto fail; + } + + for (i = 0; i < len; i++) { + c = fs->sb->s_id[i]; + if (c == '/' || c == '\\') + c = '!'; + name[i] = c; + } + name[len] = '\0'; + + fs->super_kobj.kset = btrfs_kset; + error = kobject_init_and_add(&fs->super_kobj, &btrfs_super_ktype, + NULL, "%s", name); + kfree(name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for super failed\n"); + return error; +} + +int btrfs_sysfs_add_root(struct btrfs_root *root) +{ + int error; + + error = kobject_init_and_add(&root->root_kobj, &btrfs_root_ktype, + &root->fs_info->super_kobj, + "%s", root->name); + if (error) + goto fail; + + return 0; + +fail: + printk(KERN_ERR "btrfs: sysfs creation for root failed\n"); + return error; +} + +void btrfs_sysfs_del_root(struct btrfs_root *root) +{ + kobject_put(&root->root_kobj); + wait_for_completion(&root->kobj_unregister); +} + +void btrfs_sysfs_del_super(struct btrfs_fs_info *fs) +{ + kobject_put(&fs->super_kobj); + wait_for_completion(&fs->kobj_unregister); +} + +int btrfs_init_sysfs(void) +{ + btrfs_kset = kset_create_and_add("btrfs", NULL, fs_kobj); + if (!btrfs_kset) + return -ENOMEM; + return 0; +} + +void btrfs_exit_sysfs(void) +{ + kset_unregister(btrfs_kset); +} + diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c new file mode 100644 index 00000000000..8a08f944334 --- /dev/null +++ b/fs/btrfs/transaction.c @@ -0,0 +1,1097 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#include <linux/fs.h> +#include <linux/sched.h> +#include <linux/writeback.h> +#include <linux/pagemap.h> +#include <linux/blkdev.h> +#include "ctree.h" +#include "disk-io.h" +#include "transaction.h" +#include "locking.h" +#include "ref-cache.h" +#include "tree-log.h" + +#define BTRFS_ROOT_TRANS_TAG 0 + +static noinline void put_transaction(struct btrfs_transaction *transaction) +{ + WARN_ON(transaction->use_count == 0); + transaction->use_count--; + if (transaction->use_count == 0) { + list_del_init(&transaction->list); + memset(transaction, 0, sizeof(*transaction)); + kmem_cache_free(btrfs_transaction_cachep, transaction); + } +} + +/* + * either allocate a new transaction or hop into the existing one + */ +static noinline int join_transaction(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + cur_trans = root->fs_info->running_transaction; + if (!cur_trans) { + cur_trans = kmem_cache_alloc(btrfs_transaction_cachep, + GFP_NOFS); + BUG_ON(!cur_trans); + root->fs_info->generation++; + root->fs_info->last_alloc = 0; + root->fs_info->last_data_alloc = 0; + cur_trans->num_writers = 1; + cur_trans->num_joined = 0; + cur_trans->transid = root->fs_info->generation; + init_waitqueue_head(&cur_trans->writer_wait); + init_waitqueue_head(&cur_trans->commit_wait); + cur_trans->in_commit = 0; + cur_trans->blocked = 0; + cur_trans->use_count = 1; + cur_trans->commit_done = 0; + cur_trans->start_time = get_seconds(); + INIT_LIST_HEAD(&cur_trans->pending_snapshots); + list_add_tail(&cur_trans->list, &root->fs_info->trans_list); + extent_io_tree_init(&cur_trans->dirty_pages, + root->fs_info->btree_inode->i_mapping, + GFP_NOFS); + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = cur_trans; + spin_unlock(&root->fs_info->new_trans_lock); + } else { + cur_trans->num_writers++; + cur_trans->num_joined++; + } + + return 0; +} + +/* + * this does all the record keeping required to make sure that a reference + * counted root is properly recorded in a given transaction. 
This is required + * to make sure the old root from before we joined the transaction is deleted + * when the transaction commits + */ +noinline int btrfs_record_root_in_trans(struct btrfs_root *root) +{ + struct btrfs_dirty_root *dirty; + u64 running_trans_id = root->fs_info->running_transaction->transid; + if (root->ref_cows && root->last_trans < running_trans_id) { + WARN_ON(root == root->fs_info->extent_root); + if (root->root_item.refs != 0) { + radix_tree_tag_set(&root->fs_info->fs_roots_radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + BUG_ON(!dirty); + dirty->root = kmalloc(sizeof(*dirty->root), GFP_NOFS); + BUG_ON(!dirty->root); + dirty->latest_root = root; + INIT_LIST_HEAD(&dirty->list); + + root->commit_root = btrfs_root_node(root); + + memcpy(dirty->root, root, sizeof(*root)); + spin_lock_init(&dirty->root->node_lock); + spin_lock_init(&dirty->root->list_lock); + mutex_init(&dirty->root->objectid_mutex); + mutex_init(&dirty->root->log_mutex); + INIT_LIST_HEAD(&dirty->root->dead_list); + dirty->root->node = root->commit_root; + dirty->root->commit_root = NULL; + + spin_lock(&root->list_lock); + list_add(&dirty->root->dead_list, &root->dead_list); + spin_unlock(&root->list_lock); + + root->dirty_root = dirty; + } else { + WARN_ON(1); + } + root->last_trans = running_trans_id; + } + return 0; +} + +/* wait for commit against the current transaction to become unblocked + * when this is done, it is safe to start a new transaction, but the current + * transaction might not be fully on disk. + */ +static void wait_current_trans(struct btrfs_root *root) +{ + struct btrfs_transaction *cur_trans; + + cur_trans = root->fs_info->running_transaction; + if (cur_trans && cur_trans->blocked) { + DEFINE_WAIT(wait); + cur_trans->use_count++; + while (1) { + prepare_to_wait(&root->fs_info->transaction_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (cur_trans->blocked) { + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&root->fs_info->transaction_wait, + &wait); + } else { + finish_wait(&root->fs_info->transaction_wait, + &wait); + break; + } + } + put_transaction(cur_trans); + } +} + +static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root, + int num_blocks, int wait) +{ + struct btrfs_trans_handle *h = + kmem_cache_alloc(btrfs_trans_handle_cachep, GFP_NOFS); + int ret; + + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->log_root_recovering && + ((wait == 1 && !root->fs_info->open_ioctl_trans) || wait == 2)) + wait_current_trans(root); + ret = join_transaction(root); + BUG_ON(ret); + + btrfs_record_root_in_trans(root); + h->transid = root->fs_info->running_transaction->transid; + h->transaction = root->fs_info->running_transaction; + h->blocks_reserved = num_blocks; + h->blocks_used = 0; + h->block_group = 0; + h->alloc_exclude_nr = 0; + h->alloc_exclude_start = 0; + root->fs_info->running_transaction->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + return h; +} + +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, 1); +} +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks) +{ + return start_transaction(root, num_blocks, 0); +} + +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks) +{ + return start_transaction(r, num_blocks, 2); +} + +/* wait 
for a transaction commit to be fully complete */ +static noinline int wait_for_commit(struct btrfs_root *root, + struct btrfs_transaction *commit) +{ + DEFINE_WAIT(wait); + mutex_lock(&root->fs_info->trans_mutex); + while (!commit->commit_done) { + prepare_to_wait(&commit->commit_wait, &wait, + TASK_UNINTERRUPTIBLE); + if (commit->commit_done) + break; + mutex_unlock(&root->fs_info->trans_mutex); + schedule(); + mutex_lock(&root->fs_info->trans_mutex); + } + mutex_unlock(&root->fs_info->trans_mutex); + finish_wait(&commit->commit_wait, &wait); + return 0; +} + +/* + * rate limit against the drop_snapshot code. This helps to slow down new + * operations if the drop_snapshot code isn't able to keep up. + */ +static void throttle_on_drops(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + int harder_count = 0; + +harder: + if (atomic_read(&info->throttles)) { + DEFINE_WAIT(wait); + int thr; + thr = atomic_read(&info->throttle_gen); + + do { + prepare_to_wait(&info->transaction_throttle, + &wait, TASK_UNINTERRUPTIBLE); + if (!atomic_read(&info->throttles)) { + finish_wait(&info->transaction_throttle, &wait); + break; + } + schedule(); + finish_wait(&info->transaction_throttle, &wait); + } while (thr == atomic_read(&info->throttle_gen)); + harder_count++; + + if (root->fs_info->total_ref_cache_size > 1 * 1024 * 1024 && + harder_count < 2) + goto harder; + + if (root->fs_info->total_ref_cache_size > 5 * 1024 * 1024 && + harder_count < 10) + goto harder; + + if (root->fs_info->total_ref_cache_size > 10 * 1024 * 1024 && + harder_count < 20) + goto harder; + } +} + +void btrfs_throttle(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->trans_mutex); + if (!root->fs_info->open_ioctl_trans) + wait_current_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); + + throttle_on_drops(root); +} + +static int __btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int throttle) +{ + struct btrfs_transaction *cur_trans; + struct btrfs_fs_info *info = root->fs_info; + + mutex_lock(&info->trans_mutex); + cur_trans = info->running_transaction; + WARN_ON(cur_trans != trans->transaction); + WARN_ON(cur_trans->num_writers < 1); + cur_trans->num_writers--; + + if (waitqueue_active(&cur_trans->writer_wait)) + wake_up(&cur_trans->writer_wait); + put_transaction(cur_trans); + mutex_unlock(&info->trans_mutex); + memset(trans, 0, sizeof(*trans)); + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (throttle) + throttle_on_drops(root); + + return 0; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 0); +} + +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + return __btrfs_end_transaction(trans, root, 1); +} + +/* + * when btree blocks are allocated, they have some corresponding bits set for + * them in one of two extent_io trees. 
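A minimal sketch of how a caller typically brackets a metadata change with the handle API above (btrfs_start_transaction/btrfs_end_transaction); the function name example_update_inode is hypothetical and btrfs_update_inode() merely stands in for any tree modification, so this is illustrative rather than part of the patch:

/*
 * typical caller pattern: take a handle, modify the trees, drop the handle
 */
static int example_update_inode(struct btrfs_root *root, struct inode *inode)
{
	struct btrfs_trans_handle *trans;
	int ret;

	/* may wait for a blocked commit, then joins or starts a transaction */
	trans = btrfs_start_transaction(root, 1);
	ret = btrfs_update_inode(trans, root, inode);	/* any tree modification */
	/* drops num_writers and wakes anyone waiting to commit */
	btrfs_end_transaction(trans, root);
	return ret;
}

A caller that needs the change durable hands the same handle to btrfs_commit_transaction(trans, root) instead of ending it; commit consumes the handle and waits for the whole transaction to reach disk. The extent_io bits mentioned just above are what the next helper, btrfs_write_and_wait_marked_extents(), operates on.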
This is used to make sure all of + * those extents are on disk for transaction or log commit + */ +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages) +{ + int ret; + int err = 0; + int werr = 0; + struct page *page; + struct inode *btree_inode = root->fs_info->btree_inode; + u64 start = 0; + u64 end; + unsigned long index; + + while (1) { + ret = find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + while (start <= end) { + cond_resched(); + + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + + btree_lock_page_hook(page); + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + continue; + } + + if (PageWriteback(page)) { + if (PageDirty(page)) + wait_on_page_writeback(page); + else { + unlock_page(page); + page_cache_release(page); + continue; + } + } + err = write_one_page(page, 0); + if (err) + werr = err; + page_cache_release(page); + } + } + while (1) { + ret = find_first_extent_bit(dirty_pages, 0, &start, &end, + EXTENT_DIRTY); + if (ret) + break; + + clear_extent_dirty(dirty_pages, start, end, GFP_NOFS); + while (start <= end) { + index = start >> PAGE_CACHE_SHIFT; + start = (u64)(index + 1) << PAGE_CACHE_SHIFT; + page = find_get_page(btree_inode->i_mapping, index); + if (!page) + continue; + if (PageDirty(page)) { + btree_lock_page_hook(page); + wait_on_page_writeback(page); + err = write_one_page(page, 0); + if (err) + werr = err; + } + wait_on_page_writeback(page); + page_cache_release(page); + cond_resched(); + } + } + if (err) + werr = err; + return werr; +} + +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + if (!trans || !trans->transaction) { + struct inode *btree_inode; + btree_inode = root->fs_info->btree_inode; + return filemap_write_and_wait(btree_inode->i_mapping); + } + return btrfs_write_and_wait_marked_extents(root, + &trans->transaction->dirty_pages); +} + +/* + * this is used to update the root pointer in the tree of tree roots. + * + * But, in the case of the extent allocation tree, updating the root + * pointer may allocate blocks which may change the root of the extent + * allocation tree. + * + * So, this loops and repeats and makes sure the cowonly root didn't + * change while the root pointer was being updated in the metadata. 
+ */ +static int update_cowonly_root(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + u64 old_root_bytenr; + struct btrfs_root *tree_root = root->fs_info->tree_root; + + btrfs_extent_post_op(trans, root); + btrfs_write_dirty_block_groups(trans, root); + btrfs_extent_post_op(trans, root); + + while (1) { + old_root_bytenr = btrfs_root_bytenr(&root->root_item); + if (old_root_bytenr == root->node->start) + break; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, trans->transid); + + btrfs_extent_post_op(trans, root); + + ret = btrfs_update_root(trans, tree_root, + &root->root_key, + &root->root_item); + BUG_ON(ret); + btrfs_write_dirty_block_groups(trans, root); + btrfs_extent_post_op(trans, root); + } + return 0; +} + +/* + * update all the cowonly tree roots on disk + */ +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct list_head *next; + struct extent_buffer *eb; + + btrfs_extent_post_op(trans, fs_info->tree_root); + + eb = btrfs_lock_root_node(fs_info->tree_root); + btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0); + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + + btrfs_extent_post_op(trans, fs_info->tree_root); + + while (!list_empty(&fs_info->dirty_cowonly_roots)) { + next = fs_info->dirty_cowonly_roots.next; + list_del_init(next); + root = list_entry(next, struct btrfs_root, dirty_list); + + update_cowonly_root(trans, root); + } + return 0; +} + +/* + * dead roots are old snapshots that need to be deleted. This allocates + * a dirty root struct and adds it into the list of dead roots that need to + * be deleted + */ +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest) +{ + struct btrfs_dirty_root *dirty; + + dirty = kmalloc(sizeof(*dirty), GFP_NOFS); + if (!dirty) + return -ENOMEM; + dirty->root = root; + dirty->latest_root = latest; + + mutex_lock(&root->fs_info->trans_mutex); + list_add(&dirty->list, &latest->fs_info->dead_roots); + mutex_unlock(&root->fs_info->trans_mutex); + return 0; +} + +/* + * at transaction commit time we need to schedule the old roots for + * deletion via btrfs_drop_snapshot. 
This runs through all the + * reference counted roots that were modified in the current + * transaction and puts them into the drop list + */ +static noinline int add_dirty_roots(struct btrfs_trans_handle *trans, + struct radix_tree_root *radix, + struct list_head *list) +{ + struct btrfs_dirty_root *dirty; + struct btrfs_root *gang[8]; + struct btrfs_root *root; + int i; + int ret; + int err = 0; + u32 refs; + + while (1) { + ret = radix_tree_gang_lookup_tag(radix, (void **)gang, 0, + ARRAY_SIZE(gang), + BTRFS_ROOT_TRANS_TAG); + if (ret == 0) + break; + for (i = 0; i < ret; i++) { + root = gang[i]; + radix_tree_tag_clear(radix, + (unsigned long)root->root_key.objectid, + BTRFS_ROOT_TRANS_TAG); + + BUG_ON(!root->ref_tree); + dirty = root->dirty_root; + + btrfs_free_log(trans, root); + btrfs_free_reloc_root(trans, root); + + if (root->commit_root == root->node) { + WARN_ON(root->node->start != + btrfs_root_bytenr(&root->root_item)); + + free_extent_buffer(root->commit_root); + root->commit_root = NULL; + root->dirty_root = NULL; + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + spin_unlock(&root->list_lock); + + kfree(dirty->root); + kfree(dirty); + + /* make sure to update the root on disk + * so we get any updates to the block used + * counts + */ + err = btrfs_update_root(trans, + root->fs_info->tree_root, + &root->root_key, + &root->root_item); + continue; + } + + memset(&root->root_item.drop_progress, 0, + sizeof(struct btrfs_disk_key)); + root->root_item.drop_level = 0; + root->commit_root = NULL; + root->dirty_root = NULL; + root->root_key.offset = root->fs_info->generation; + btrfs_set_root_bytenr(&root->root_item, + root->node->start); + btrfs_set_root_level(&root->root_item, + btrfs_header_level(root->node)); + btrfs_set_root_generation(&root->root_item, + root->root_key.offset); + + err = btrfs_insert_root(trans, root->fs_info->tree_root, + &root->root_key, + &root->root_item); + if (err) + break; + + refs = btrfs_root_refs(&dirty->root->root_item); + btrfs_set_root_refs(&dirty->root->root_item, refs - 1); + err = btrfs_update_root(trans, root->fs_info->tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + + BUG_ON(err); + if (refs == 1) { + list_add(&dirty->list, list); + } else { + WARN_ON(1); + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + } + } + } + return err; +} + +/* + * defrag a given btree. If cacheonly == 1, this won't read from the disk, + * otherwise every leaf in the btree is read and defragged. 
+ */ +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly) +{ + struct btrfs_fs_info *info = root->fs_info; + int ret; + struct btrfs_trans_handle *trans; + unsigned long nr; + + smp_mb(); + if (root->defrag_running) + return 0; + trans = btrfs_start_transaction(root, 1); + while (1) { + root->defrag_running = 1; + ret = btrfs_defrag_leaves(trans, root, cacheonly); + nr = trans->blocks_used; + btrfs_end_transaction(trans, root); + btrfs_btree_balance_dirty(info->tree_root, nr); + cond_resched(); + + trans = btrfs_start_transaction(root, 1); + if (root->fs_info->closing || ret != -EAGAIN) + break; + } + root->defrag_running = 0; + smp_mb(); + btrfs_end_transaction(trans, root); + return 0; +} + +/* + * Given a list of roots that need to be deleted, call btrfs_drop_snapshot on + * all of them + */ +static noinline int drop_dirty_roots(struct btrfs_root *tree_root, + struct list_head *list) +{ + struct btrfs_dirty_root *dirty; + struct btrfs_trans_handle *trans; + unsigned long nr; + u64 num_bytes; + u64 bytes_used; + u64 max_useless; + int ret = 0; + int err; + + while (!list_empty(list)) { + struct btrfs_root *root; + + dirty = list_entry(list->prev, struct btrfs_dirty_root, list); + list_del_init(&dirty->list); + + num_bytes = btrfs_root_used(&dirty->root->root_item); + root = dirty->latest_root; + atomic_inc(&root->fs_info->throttles); + + while (1) { + trans = btrfs_start_transaction(tree_root, 1); + mutex_lock(&root->fs_info->drop_mutex); + ret = btrfs_drop_snapshot(trans, dirty->root); + if (ret != -EAGAIN) + break; + mutex_unlock(&root->fs_info->drop_mutex); + + err = btrfs_update_root(trans, + tree_root, + &dirty->root->root_key, + &dirty->root->root_item); + if (err) + ret = err; + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); + } + BUG_ON(ret); + atomic_dec(&root->fs_info->throttles); + wake_up(&root->fs_info->transaction_throttle); + + num_bytes -= btrfs_root_used(&dirty->root->root_item); + bytes_used = btrfs_root_used(&root->root_item); + if (num_bytes) { + btrfs_record_root_in_trans(root); + btrfs_set_root_used(&root->root_item, + bytes_used - num_bytes); + } + + ret = btrfs_del_root(trans, tree_root, &dirty->root->root_key); + if (ret) { + BUG(); + break; + } + mutex_unlock(&root->fs_info->drop_mutex); + + spin_lock(&root->list_lock); + list_del_init(&dirty->root->dead_list); + if (!list_empty(&root->dead_list)) { + struct btrfs_root *oldest; + oldest = list_entry(root->dead_list.prev, + struct btrfs_root, dead_list); + max_useless = oldest->root_key.offset - 1; + } else { + max_useless = root->root_key.offset - 1; + } + spin_unlock(&root->list_lock); + + nr = trans->blocks_used; + ret = btrfs_end_transaction(trans, tree_root); + BUG_ON(ret); + + ret = btrfs_remove_leaf_refs(root, max_useless, 0); + BUG_ON(ret); + + free_extent_buffer(dirty->root->node); + kfree(dirty->root); + kfree(dirty); + + btrfs_btree_balance_dirty(tree_root, nr); + cond_resched(); + } + return ret; +} + +/* + * new snapshots need to be created at a very specific time in the + * transaction commit. 
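create_pending_snapshot() below is only the commit-time half of snapshot creation; the request is queued earlier by the snapshot ioctl, outside this file. A hedged sketch of that producer side, with the function name example_queue_snapshot and the calling convention assumed (the real ioctl path also copies the name and does its own error handling):

/*
 * queue a snapshot request so the next commit picks it up via
 * create_pending_snapshots(); 'name' must be a kmalloc'ed,
 * NUL-terminated copy, freed later by finish_pending_snapshots()
 */
static int example_queue_snapshot(struct btrfs_trans_handle *trans,
				  struct btrfs_root *root,
				  struct dentry *dentry, char *name)
{
	struct btrfs_pending_snapshot *pending;

	pending = kzalloc(sizeof(*pending), GFP_NOFS);
	if (!pending)
		return -ENOMEM;

	pending->name = name;
	pending->dentry = dentry;
	pending->root = root;		/* the subvolume being snapshotted */
	list_add(&pending->list, &trans->transaction->pending_snapshots);
	/* nothing happens until this transaction commits */
	return 0;
}

create_pending_snapshot(), below, is the commit-time consumer of that list.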
This does the actual creation + */ +static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + struct btrfs_key key; + struct btrfs_root_item *new_root_item; + struct btrfs_root *tree_root = fs_info->tree_root; + struct btrfs_root *root = pending->root; + struct extent_buffer *tmp; + struct extent_buffer *old; + int ret; + u64 objectid; + + new_root_item = kmalloc(sizeof(*new_root_item), GFP_NOFS); + if (!new_root_item) { + ret = -ENOMEM; + goto fail; + } + ret = btrfs_find_free_objectid(trans, tree_root, 0, &objectid); + if (ret) + goto fail; + + btrfs_record_root_in_trans(root); + btrfs_set_root_last_snapshot(&root->root_item, trans->transid); + memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); + + key.objectid = objectid; + key.offset = trans->transid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + old = btrfs_lock_root_node(root); + btrfs_cow_block(trans, root, old, NULL, 0, &old, 0); + + btrfs_copy_root(trans, root, old, &tmp, objectid); + btrfs_tree_unlock(old); + free_extent_buffer(old); + + btrfs_set_root_bytenr(new_root_item, tmp->start); + btrfs_set_root_level(new_root_item, btrfs_header_level(tmp)); + btrfs_set_root_generation(new_root_item, trans->transid); + ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, + new_root_item); + btrfs_tree_unlock(tmp); + free_extent_buffer(tmp); + if (ret) + goto fail; + + key.offset = (u64)-1; + memcpy(&pending->root_key, &key, sizeof(key)); +fail: + kfree(new_root_item); + return ret; +} + +static noinline int finish_pending_snapshot(struct btrfs_fs_info *fs_info, + struct btrfs_pending_snapshot *pending) +{ + int ret; + int namelen; + u64 index = 0; + struct btrfs_trans_handle *trans; + struct inode *parent_inode; + struct inode *inode; + struct btrfs_root *parent_root; + + parent_inode = pending->dentry->d_parent->d_inode; + parent_root = BTRFS_I(parent_inode)->root; + trans = btrfs_join_transaction(parent_root, 1); + + /* + * insert the directory item + */ + namelen = strlen(pending->name); + ret = btrfs_set_inode_index(parent_inode, &index); + ret = btrfs_insert_dir_item(trans, parent_root, + pending->name, namelen, + parent_inode->i_ino, + &pending->root_key, BTRFS_FT_DIR, index); + + if (ret) + goto fail; + + btrfs_i_size_write(parent_inode, parent_inode->i_size + namelen * 2); + ret = btrfs_update_inode(trans, parent_root, parent_inode); + BUG_ON(ret); + + /* add the backref first */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + pending->root_key.objectid, + BTRFS_ROOT_BACKREF_KEY, + parent_root->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + BUG_ON(ret); + + /* now add the forward ref */ + ret = btrfs_add_root_ref(trans, parent_root->fs_info->tree_root, + parent_root->root_key.objectid, + BTRFS_ROOT_REF_KEY, + pending->root_key.objectid, + parent_inode->i_ino, index, pending->name, + namelen); + + inode = btrfs_lookup_dentry(parent_inode, pending->dentry); + d_instantiate(pending->dentry, inode); +fail: + btrfs_end_transaction(trans, fs_info->fs_root); + return ret; +} + +/* + * create all the snapshots we've scheduled for creation + */ +static noinline int create_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + struct list_head *cur; + int ret; + + list_for_each(cur, head) { + pending = list_entry(cur, 
struct btrfs_pending_snapshot, list); + ret = create_pending_snapshot(trans, fs_info, pending); + BUG_ON(ret); + } + return 0; +} + +static noinline int finish_pending_snapshots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info) +{ + struct btrfs_pending_snapshot *pending; + struct list_head *head = &trans->transaction->pending_snapshots; + int ret; + + while (!list_empty(head)) { + pending = list_entry(head->next, + struct btrfs_pending_snapshot, list); + ret = finish_pending_snapshot(fs_info, pending); + BUG_ON(ret); + list_del(&pending->list); + kfree(pending->name); + kfree(pending); + } + return 0; +} + +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + unsigned long joined = 0; + unsigned long timeout = 1; + struct btrfs_transaction *cur_trans; + struct btrfs_transaction *prev_trans = NULL; + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct list_head dirty_fs_roots; + struct extent_io_tree *pinned_copy; + DEFINE_WAIT(wait); + int ret; + + INIT_LIST_HEAD(&dirty_fs_roots); + mutex_lock(&root->fs_info->trans_mutex); + if (trans->transaction->in_commit) { + cur_trans = trans->transaction; + trans->transaction->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + btrfs_end_transaction(trans, root); + + ret = wait_for_commit(root, cur_trans); + BUG_ON(ret); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(cur_trans); + mutex_unlock(&root->fs_info->trans_mutex); + + return 0; + } + + pinned_copy = kmalloc(sizeof(*pinned_copy), GFP_NOFS); + if (!pinned_copy) + return -ENOMEM; + + extent_io_tree_init(pinned_copy, + root->fs_info->btree_inode->i_mapping, GFP_NOFS); + + trans->transaction->in_commit = 1; + trans->transaction->blocked = 1; + cur_trans = trans->transaction; + if (cur_trans->list.prev != &root->fs_info->trans_list) { + prev_trans = list_entry(cur_trans->list.prev, + struct btrfs_transaction, list); + if (!prev_trans->commit_done) { + prev_trans->use_count++; + mutex_unlock(&root->fs_info->trans_mutex); + + wait_for_commit(root, prev_trans); + + mutex_lock(&root->fs_info->trans_mutex); + put_transaction(prev_trans); + } + } + + do { + int snap_pending = 0; + joined = cur_trans->num_joined; + if (!list_empty(&trans->transaction->pending_snapshots)) + snap_pending = 1; + + WARN_ON(cur_trans != trans->transaction); + prepare_to_wait(&cur_trans->writer_wait, &wait, + TASK_UNINTERRUPTIBLE); + + if (cur_trans->num_writers > 1) + timeout = MAX_SCHEDULE_TIMEOUT; + else + timeout = 1; + + mutex_unlock(&root->fs_info->trans_mutex); + + if (snap_pending) { + ret = btrfs_wait_ordered_extents(root, 1); + BUG_ON(ret); + } + + schedule_timeout(timeout); + + mutex_lock(&root->fs_info->trans_mutex); + finish_wait(&cur_trans->writer_wait, &wait); + } while (cur_trans->num_writers > 1 || + (cur_trans->num_joined != joined)); + + ret = create_pending_snapshots(trans, root->fs_info); + BUG_ON(ret); + + WARN_ON(cur_trans != trans->transaction); + + /* btrfs_commit_tree_roots is responsible for getting the + * various roots consistent with each other. Every pointer + * in the tree of tree roots has to point to the most up to date + * root for every subvolume and other tree. So, we have to keep + * the tree logging code from jumping in and changing any + * of the trees. + * + * At this point in the commit, there can't be any tree-log + * writers, but a little lower down we drop the trans mutex + * and let new people in. 
By holding the tree_log_mutex + * from now until after the super is written, we avoid races + * with the tree-log code. + */ + mutex_lock(&root->fs_info->tree_log_mutex); + /* + * keep tree reloc code from adding new reloc trees + */ + mutex_lock(&root->fs_info->tree_reloc_mutex); + + + ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix, + &dirty_fs_roots); + BUG_ON(ret); + + /* add_dirty_roots gets rid of all the tree log roots, it is now + * safe to free the root of tree log roots + */ + btrfs_free_log_root_tree(trans, root->fs_info); + + ret = btrfs_commit_tree_roots(trans, root); + BUG_ON(ret); + + cur_trans = root->fs_info->running_transaction; + spin_lock(&root->fs_info->new_trans_lock); + root->fs_info->running_transaction = NULL; + spin_unlock(&root->fs_info->new_trans_lock); + btrfs_set_super_generation(&root->fs_info->super_copy, + cur_trans->transid); + btrfs_set_super_root(&root->fs_info->super_copy, + root->fs_info->tree_root->node->start); + btrfs_set_super_root_level(&root->fs_info->super_copy, + btrfs_header_level(root->fs_info->tree_root->node)); + + btrfs_set_super_chunk_root(&root->fs_info->super_copy, + chunk_root->node->start); + btrfs_set_super_chunk_root_level(&root->fs_info->super_copy, + btrfs_header_level(chunk_root->node)); + btrfs_set_super_chunk_root_generation(&root->fs_info->super_copy, + btrfs_header_generation(chunk_root->node)); + + if (!root->fs_info->log_root_recovering) { + btrfs_set_super_log_root(&root->fs_info->super_copy, 0); + btrfs_set_super_log_root_level(&root->fs_info->super_copy, 0); + } + + memcpy(&root->fs_info->super_for_commit, &root->fs_info->super_copy, + sizeof(root->fs_info->super_copy)); + + btrfs_copy_pinned(root, pinned_copy); + + trans->transaction->blocked = 0; + wake_up(&root->fs_info->transaction_throttle); + wake_up(&root->fs_info->transaction_wait); + + mutex_unlock(&root->fs_info->trans_mutex); + ret = btrfs_write_and_wait_transaction(trans, root); + BUG_ON(ret); + write_ctree_super(trans, root, 0); + + /* + * the super is written, we can safely allow the tree-loggers + * to go about their business + */ + mutex_unlock(&root->fs_info->tree_log_mutex); + + btrfs_finish_extent_commit(trans, root, pinned_copy); + kfree(pinned_copy); + + btrfs_drop_dead_reloc_roots(root); + mutex_unlock(&root->fs_info->tree_reloc_mutex); + + /* do the directory inserts of any pending snapshot creations */ + finish_pending_snapshots(trans, root->fs_info); + + mutex_lock(&root->fs_info->trans_mutex); + + cur_trans->commit_done = 1; + root->fs_info->last_trans_committed = cur_trans->transid; + wake_up(&cur_trans->commit_wait); + + put_transaction(cur_trans); + put_transaction(cur_trans); + + list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots); + if (root->fs_info->closing) + list_splice_init(&root->fs_info->dead_roots, &dirty_fs_roots); + + mutex_unlock(&root->fs_info->trans_mutex); + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + + if (root->fs_info->closing) + drop_dirty_roots(root->fs_info->tree_root, &dirty_fs_roots); + return ret; +} + +/* + * interface function to delete all the snapshots we have scheduled for deletion + */ +int btrfs_clean_old_snapshots(struct btrfs_root *root) +{ + struct list_head dirty_roots; + INIT_LIST_HEAD(&dirty_roots); +again: + mutex_lock(&root->fs_info->trans_mutex); + list_splice_init(&root->fs_info->dead_roots, &dirty_roots); + mutex_unlock(&root->fs_info->trans_mutex); + + if (!list_empty(&dirty_roots)) { + drop_dirty_roots(root, &dirty_roots); + goto again; + } + return 0; +} diff --git 
a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h new file mode 100644 index 00000000000..ea292117f88 --- /dev/null +++ b/fs/btrfs/transaction.h @@ -0,0 +1,106 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#ifndef __BTRFS_TRANSACTION__ +#define __BTRFS_TRANSACTION__ +#include "btrfs_inode.h" + +struct btrfs_transaction { + u64 transid; + unsigned long num_writers; + unsigned long num_joined; + int in_commit; + int use_count; + int commit_done; + int blocked; + struct list_head list; + struct extent_io_tree dirty_pages; + unsigned long start_time; + wait_queue_head_t writer_wait; + wait_queue_head_t commit_wait; + struct list_head pending_snapshots; +}; + +struct btrfs_trans_handle { + u64 transid; + unsigned long blocks_reserved; + unsigned long blocks_used; + struct btrfs_transaction *transaction; + u64 block_group; + u64 alloc_exclude_start; + u64 alloc_exclude_nr; +}; + +struct btrfs_pending_snapshot { + struct dentry *dentry; + struct btrfs_root *root; + char *name; + struct btrfs_key root_key; + struct list_head list; +}; + +struct btrfs_dirty_root { + struct list_head list; + struct btrfs_root *root; + struct btrfs_root *latest_root; +}; + +static inline void btrfs_set_trans_block_group(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + trans->block_group = BTRFS_I(inode)->block_group; +} + +static inline void btrfs_update_inode_block_group( + struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->block_group = trans->block_group; +} + +static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, + struct inode *inode) +{ + BTRFS_I(inode)->last_trans = trans->transaction->transid; +} + +int btrfs_end_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +struct btrfs_trans_handle *btrfs_start_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_join_transaction(struct btrfs_root *root, + int num_blocks); +struct btrfs_trans_handle *btrfs_start_ioctl_transaction(struct btrfs_root *r, + int num_blocks); +int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans, + struct btrfs_root *root); + +int btrfs_add_dead_root(struct btrfs_root *root, struct btrfs_root *latest); +int btrfs_defrag_root(struct btrfs_root *root, int cacheonly); +int btrfs_clean_old_snapshots(struct btrfs_root *root); +int btrfs_commit_transaction(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +void btrfs_throttle(struct btrfs_root *root); +int btrfs_record_root_in_trans(struct btrfs_root *root); +int btrfs_write_and_wait_marked_extents(struct btrfs_root *root, + struct extent_io_tree *dirty_pages); +#endif diff --git 
a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c new file mode 100644 index 00000000000..3e8358c3616 --- /dev/null +++ b/fs/btrfs/tree-defrag.c @@ -0,0 +1,147 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "disk-io.h" +#include "print-tree.h" +#include "transaction.h" +#include "locking.h" + +/* defrag all the leaves in a given btree. If cache_only == 1, don't read + * things from disk, otherwise read all the leaves and try to get key order to + * better reflect disk order + */ + +int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, + struct btrfs_root *root, int cache_only) +{ + struct btrfs_path *path = NULL; + struct btrfs_key key; + int ret = 0; + int wret; + int level; + int orig_level; + int is_extent = 0; + int next_key_ret = 0; + u64 last_ret = 0; + u64 min_trans = 0; + + if (cache_only) + goto out; + + if (root->fs_info->extent_root == root) { + /* + * there's recursion here right now in the tree locking, + * we can't defrag the extent root without deadlock + */ + goto out; + } + + if (root->ref_cows == 0 && !is_extent) + goto out; + + if (btrfs_test_opt(root, SSD)) + goto out; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + level = btrfs_header_level(root->node); + orig_level = level; + + if (level == 0) + goto out; + + if (root->defrag_progress.objectid == 0) { + struct extent_buffer *root_node; + u32 nritems; + + root_node = btrfs_lock_root_node(root); + nritems = btrfs_header_nritems(root_node); + root->defrag_max.objectid = 0; + /* from above we know this is not a leaf */ + btrfs_node_key_to_cpu(root_node, &root->defrag_max, + nritems - 1); + btrfs_tree_unlock(root_node); + free_extent_buffer(root_node); + memset(&key, 0, sizeof(key)); + } else { + memcpy(&key, &root->defrag_progress, sizeof(key)); + } + + path->keep_locks = 1; + if (cache_only) + min_trans = root->defrag_trans_start; + + ret = btrfs_search_forward(root, &key, NULL, path, + cache_only, min_trans); + if (ret < 0) + goto out; + if (ret > 0) { + ret = 0; + goto out; + } + btrfs_release_path(root, path); + wret = btrfs_search_slot(trans, root, &key, path, 0, 1); + + if (wret < 0) { + ret = wret; + goto out; + } + if (!path->nodes[1]) { + ret = 0; + goto out; + } + path->slots[1] = btrfs_header_nritems(path->nodes[1]); + next_key_ret = btrfs_find_next_key(root, path, &key, 1, cache_only, + min_trans); + ret = btrfs_realloc_node(trans, root, + path->nodes[1], 0, + cache_only, &last_ret, + &root->defrag_progress); + WARN_ON(ret && ret != -EAGAIN); + if (next_key_ret == 0) { + memcpy(&root->defrag_progress, &key, sizeof(key)); + ret = -EAGAIN; + } + + btrfs_release_path(root, path); + if (is_extent) + btrfs_extent_post_op(trans, root); +out: + if (path) + btrfs_free_path(path); + if (ret == -EAGAIN) { + if (root->defrag_max.objectid > root->defrag_progress.objectid) + 
goto done; + if (root->defrag_max.type > root->defrag_progress.type) + goto done; + if (root->defrag_max.offset > root->defrag_progress.offset) + goto done; + ret = 0; + } +done: + if (ret != -EAGAIN) { + memset(&root->defrag_progress, 0, + sizeof(root->defrag_progress)); + root->defrag_trans_start = trans->transid; + } + return ret; +} diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c new file mode 100644 index 00000000000..d81cda2e077 --- /dev/null +++ b/fs/btrfs/tree-log.c @@ -0,0 +1,2898 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/sched.h> +#include "ctree.h" +#include "transaction.h" +#include "disk-io.h" +#include "locking.h" +#include "print-tree.h" +#include "compat.h" +#include "tree-log.h" + +/* magic values for the inode_only field in btrfs_log_inode: + * + * LOG_INODE_ALL means to log everything + * LOG_INODE_EXISTS means to log just enough to recreate the inode + * during log replay + */ +#define LOG_INODE_ALL 0 +#define LOG_INODE_EXISTS 1 + +/* + * stages for the tree walking. The first + * stage (0) is to only pin down the blocks we find + * the second stage (1) is to make sure that all the inodes + * we find in the log are created in the subvolume. + * + * The last stage is to deal with directories and links and extents + * and all the other fun semantics + */ +#define LOG_WALK_PIN_ONLY 0 +#define LOG_WALK_REPLAY_INODES 1 +#define LOG_WALK_REPLAY_ALL 2 + +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +static int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, u64 objectid); + +/* + * tree logging is a special write ahead log used to make sure that + * fsyncs and O_SYNCs can happen without doing full tree commits. + * + * Full tree commits are expensive because they require commonly + * modified blocks to be recowed, creating many dirty pages in the + * extent tree an 4x-6x higher write load than ext3. + * + * Instead of doing a tree commit on every fsync, we use the + * key ranges and transaction ids to find items for a given file or directory + * that have changed in this transaction. Those items are copied into + * a special tree (one per subvolume root), that tree is written to disk + * and then the fsync is considered complete. + * + * After a crash, items are copied out of the log-tree back into the + * subvolume tree. Any file data extents found are recorded in the extent + * allocation tree, and the log-tree freed. + * + * The log tree is read three times, once to pin down all the extents it is + * using in ram and once, once to create all the inodes logged in the tree + * and once to do all the other items. + */ + +/* + * btrfs_add_log_tree adds a new per-subvolume log tree into the + * tree of log tree roots. 
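Before the helper that builds the log root, a sketch of the basic bracket that the logging entry points later in this file put around copying one inode into its log tree; example_log_one_inode is hypothetical, but start_log_trans(), __btrfs_log_inode() and end_log_trans() are the helpers defined below:

/*
 * illustrative bracket around logging a single inode
 */
static int example_log_one_inode(struct btrfs_trans_handle *trans,
				 struct btrfs_root *root, struct inode *inode)
{
	int ret;

	/* creates the log root on first use and takes a tree_log_writers ref */
	start_log_trans(trans, root);
	ret = __btrfs_log_inode(trans, root, inode, LOG_INODE_ALL);
	/* drops the writer ref and wakes anyone waiting to sync the log */
	end_log_trans(root);
	return ret;
}

btrfs_add_log_tree(), next, is what start_log_trans() uses to create that per-subvolume log root.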
This must be called with a tree log transaction + * running (see start_log_trans). + */ +static int btrfs_add_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_key key; + struct btrfs_root_item root_item; + struct btrfs_inode_item *inode_item; + struct extent_buffer *leaf; + struct btrfs_root *new_root = root; + int ret; + u64 objectid = root->root_key.objectid; + + leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, 0, 0, 0); + if (IS_ERR(leaf)) { + ret = PTR_ERR(leaf); + return ret; + } + + btrfs_set_header_nritems(leaf, 0); + btrfs_set_header_level(leaf, 0); + btrfs_set_header_bytenr(leaf, leaf->start); + btrfs_set_header_generation(leaf, trans->transid); + btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID); + + write_extent_buffer(leaf, root->fs_info->fsid, + (unsigned long)btrfs_header_fsid(leaf), + BTRFS_FSID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + inode_item = &root_item.inode; + memset(inode_item, 0, sizeof(*inode_item)); + inode_item->generation = cpu_to_le64(1); + inode_item->size = cpu_to_le64(3); + inode_item->nlink = cpu_to_le32(1); + inode_item->nbytes = cpu_to_le64(root->leafsize); + inode_item->mode = cpu_to_le32(S_IFDIR | 0755); + + btrfs_set_root_bytenr(&root_item, leaf->start); + btrfs_set_root_generation(&root_item, trans->transid); + btrfs_set_root_level(&root_item, 0); + btrfs_set_root_refs(&root_item, 0); + btrfs_set_root_used(&root_item, 0); + + memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress)); + root_item.drop_level = 0; + + btrfs_tree_unlock(leaf); + free_extent_buffer(leaf); + leaf = NULL; + + btrfs_set_root_dirid(&root_item, 0); + + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = objectid; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key, + &root_item); + if (ret) + goto fail; + + new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree, + &key); + BUG_ON(!new_root); + + WARN_ON(root->log_root); + root->log_root = new_root; + + /* + * log trees do not get reference counted because they go away + * before a real commit is actually done. They do store pointers + * to file data extents, and those reference counts still get + * updated (along with back refs to the log tree). + */ + new_root->ref_cows = 0; + new_root->last_trans = trans->transid; + + /* + * we need to make sure the root block for this new tree + * is marked as dirty in the dirty_log_pages tree. This + * is how it gets flushed down to disk at tree log commit time. 
+ * + * the tree logging mutex keeps others from coming in and changing + * the new_root->node, so we can safely access it here + */ + set_extent_dirty(&new_root->dirty_log_pages, new_root->node->start, + new_root->node->start + new_root->node->len - 1, + GFP_NOFS); + +fail: + return ret; +} + +/* + * start a sub transaction and setup the log tree + * this increments the log tree writer count to make the people + * syncing the tree wait for us to finish + */ +static int start_log_trans(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + mutex_lock(&root->fs_info->tree_log_mutex); + if (!root->fs_info->log_root_tree) { + ret = btrfs_init_log_root_tree(trans, root->fs_info); + BUG_ON(ret); + } + if (!root->log_root) { + ret = btrfs_add_log_tree(trans, root); + BUG_ON(ret); + } + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + mutex_unlock(&root->fs_info->tree_log_mutex); + return 0; +} + +/* + * returns 0 if there was a log transaction running and we were able + * to join, or returns -ENOENT if there were not transactions + * in progress + */ +static int join_running_log_trans(struct btrfs_root *root) +{ + int ret = -ENOENT; + + smp_mb(); + if (!root->log_root) + return -ENOENT; + + mutex_lock(&root->fs_info->tree_log_mutex); + if (root->log_root) { + ret = 0; + atomic_inc(&root->fs_info->tree_log_writers); + root->fs_info->tree_log_batch++; + } + mutex_unlock(&root->fs_info->tree_log_mutex); + return ret; +} + +/* + * indicate we're done making changes to the log tree + * and wake up anyone waiting to do a sync + */ +static int end_log_trans(struct btrfs_root *root) +{ + atomic_dec(&root->fs_info->tree_log_writers); + smp_mb(); + if (waitqueue_active(&root->fs_info->tree_log_wait)) + wake_up(&root->fs_info->tree_log_wait); + return 0; +} + + +/* + * the walk control struct is used to pass state down the chain when + * processing the log tree. The stage field tells us which part + * of the log tree processing we are currently doing. The others + * are state fields used for that specific part + */ +struct walk_control { + /* should we free the extent on disk when done? This is used + * at transaction commit time while freeing a log tree + */ + int free; + + /* should we write out the extent buffer? This is used + * while flushing the log tree to disk during a sync + */ + int write; + + /* should we wait for the extent buffer io to finish? Also used + * while flushing the log tree to disk for a sync + */ + int wait; + + /* pin only walk, we record which extents on disk belong to the + * log trees + */ + int pin; + + /* what stage of the replay code we're currently in */ + int stage; + + /* the root we are currently replaying */ + struct btrfs_root *replay_dest; + + /* the trans handle for the current replay */ + struct btrfs_trans_handle *trans; + + /* the function that gets used to process blocks we find in the + * tree. 
Note the extent_buffer might not be up to date when it is + * passed in, and it must be checked or read if you need the data + * inside it + */ + int (*process_func)(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen); +}; + +/* + * process_func used to pin down extents, write them or wait on them + */ +static int process_one_buffer(struct btrfs_root *log, + struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + if (wc->pin) { + mutex_lock(&log->fs_info->pinned_mutex); + btrfs_update_pinned_extents(log->fs_info->extent_root, + eb->start, eb->len, 1); + mutex_unlock(&log->fs_info->pinned_mutex); + } + + if (btrfs_buffer_uptodate(eb, gen)) { + if (wc->write) + btrfs_write_tree_block(eb); + if (wc->wait) + btrfs_wait_tree_block_writeback(eb); + } + return 0; +} + +/* + * Item overwrite used by replay and tree logging. eb, slot and key all refer + * to the src data we are copying out. + * + * root is the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and + * will be released on exit). + * + * If the key is already in the destination tree the existing item is + * overwritten. If the existing item isn't big enough, it is extended. + * If it is too large, it is truncated. + * + * If the key isn't in the destination yet, a new item is inserted. + */ +static noinline int overwrite_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size; + u64 saved_i_size = 0; + int save_old_i_size = 0; + unsigned long src_ptr; + unsigned long dst_ptr; + int overwrite_root = 0; + + if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) + overwrite_root = 1; + + item_size = btrfs_item_size_nr(eb, slot); + src_ptr = btrfs_item_ptr_offset(eb, slot); + + /* look for the key in the destination tree */ + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *src_copy; + char *dst_copy; + u32 dst_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (dst_size != item_size) + goto insert; + + if (item_size == 0) { + btrfs_release_path(root, path); + return 0; + } + dst_copy = kmalloc(item_size, GFP_NOFS); + src_copy = kmalloc(item_size, GFP_NOFS); + + read_extent_buffer(eb, src_copy, src_ptr, item_size); + + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, + item_size); + ret = memcmp(dst_copy, src_copy, item_size); + + kfree(dst_copy); + kfree(src_copy); + /* + * they have the same contents, just return, this saves + * us from cowing blocks in the destination tree and doing + * extra writes that may not have been done by a previous + * sync + */ + if (ret == 0) { + btrfs_release_path(root, path); + return 0; + } + + } +insert: + btrfs_release_path(root, path); + /* try to insert the key into the destination tree */ + ret = btrfs_insert_empty_item(trans, root, path, + key, item_size); + + /* make sure any existing item is the correct size */ + if (ret == -EEXIST) { + u32 found_size; + found_size = btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + if (found_size > item_size) { + btrfs_truncate_item(trans, root, path, item_size, 1); + } else if (found_size < item_size) { + ret = btrfs_extend_item(trans, root, path, + item_size - found_size); + BUG_ON(ret); + } + } else if (ret) { + BUG(); + } + dst_ptr = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + + /* don't 
overwrite an existing inode if the generation number + * was logged as zero. This is done when the tree logging code + * is just logging an inode to make sure it exists after recovery. + * + * Also, don't overwrite i_size on directories during replay. + * log replay inserts and removes directory items based on the + * state of the tree found in the subvolume, and i_size is modified + * as it goes + */ + if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) { + struct btrfs_inode_item *src_item; + struct btrfs_inode_item *dst_item; + + src_item = (struct btrfs_inode_item *)src_ptr; + dst_item = (struct btrfs_inode_item *)dst_ptr; + + if (btrfs_inode_generation(eb, src_item) == 0) + goto no_copy; + + if (overwrite_root && + S_ISDIR(btrfs_inode_mode(eb, src_item)) && + S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + save_old_i_size = 1; + saved_i_size = btrfs_inode_size(path->nodes[0], + dst_item); + } + } + + copy_extent_buffer(path->nodes[0], eb, dst_ptr, + src_ptr, item_size); + + if (save_old_i_size) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + } + + /* make sure the generation is filled in */ + if (key->type == BTRFS_INODE_ITEM_KEY) { + struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; + if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { + btrfs_set_inode_generation(path->nodes[0], dst_item, + trans->transid); + } + } +no_copy: + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(root, path); + return 0; +} + +/* + * simple helper to read an inode off the disk from a given root + * This can only be called for subvolume roots and not for the log + */ +static noinline struct inode *read_one_inode(struct btrfs_root *root, + u64 objectid) +{ + struct inode *inode; + inode = btrfs_iget_locked(root->fs_info->sb, objectid, root); + if (inode->i_state & I_NEW) { + BTRFS_I(inode)->root = root; + BTRFS_I(inode)->location.objectid = objectid; + BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY; + BTRFS_I(inode)->location.offset = 0; + btrfs_read_locked_inode(inode); + unlock_new_inode(inode); + + } + if (is_bad_inode(inode)) { + iput(inode); + inode = NULL; + } + return inode; +} + +/* replays a single extent in 'eb' at 'slot' with 'key' into the + * subvolume 'root'. path is released on entry and should be released + * on exit. + * + * extents in the log tree have not been allocated out of the extent + * tree yet. So, this completes the allocation, taking a reference + * as required if the extent already exists or creating a new extent + * if it isn't in the extent allocation tree yet. + * + * The extent is inserted into the file, dropping any existing extents + * from the file that overlap the new one. 
+ */ +static noinline int replay_one_extent(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int found_type; + u64 mask = root->sectorsize - 1; + u64 extent_end; + u64 alloc_hint; + u64 start = key->offset; + u64 saved_nbytes; + struct btrfs_file_extent_item *item; + struct inode *inode = NULL; + unsigned long size; + int ret = 0; + + item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item); + found_type = btrfs_file_extent_type(eb, item); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) + extent_end = start + btrfs_file_extent_num_bytes(eb, item); + else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + size = btrfs_file_extent_inline_len(eb, item); + extent_end = (start + size + mask) & ~mask; + } else { + ret = 0; + goto out; + } + + inode = read_one_inode(root, key->objectid); + if (!inode) { + ret = -EIO; + goto out; + } + + /* + * first check to see if we already have this extent in the + * file. This must be done before the btrfs_drop_extents run + * so we don't try to drop this extent. + */ + ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino, + start, 0); + + if (ret == 0 && + (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC)) { + struct btrfs_file_extent_item cmp1; + struct btrfs_file_extent_item cmp2; + struct btrfs_file_extent_item *existing; + struct extent_buffer *leaf; + + leaf = path->nodes[0]; + existing = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + read_extent_buffer(eb, &cmp1, (unsigned long)item, + sizeof(cmp1)); + read_extent_buffer(leaf, &cmp2, (unsigned long)existing, + sizeof(cmp2)); + + /* + * we already have a pointer to this exact extent, + * we don't have to do anything + */ + if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + btrfs_release_path(root, path); + goto out; + } + } + btrfs_release_path(root, path); + + saved_nbytes = inode_get_bytes(inode); + /* drop any overlapping extents */ + ret = btrfs_drop_extents(trans, root, inode, + start, extent_end, start, &alloc_hint); + BUG_ON(ret); + + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + unsigned long dest_offset; + struct btrfs_key ins; + + ret = btrfs_insert_empty_item(trans, root, path, key, + sizeof(*item)); + BUG_ON(ret); + dest_offset = btrfs_item_ptr_offset(path->nodes[0], + path->slots[0]); + copy_extent_buffer(path->nodes[0], eb, dest_offset, + (unsigned long)item, sizeof(*item)); + + ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); + ins.type = BTRFS_EXTENT_ITEM_KEY; + + if (ins.objectid > 0) { + u64 csum_start; + u64 csum_end; + LIST_HEAD(ordered_sums); + /* + * is this extent already allocated in the extent + * allocation tree? 
If so, just add a reference + */ + ret = btrfs_lookup_extent(root, ins.objectid, + ins.offset); + if (ret == 0) { + ret = btrfs_inc_extent_ref(trans, root, + ins.objectid, ins.offset, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid); + } else { + /* + * insert the extent pointer in the extent + * allocation tree + */ + ret = btrfs_alloc_logged_extent(trans, root, + path->nodes[0]->start, + root->root_key.objectid, + trans->transid, key->objectid, + &ins); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + if (btrfs_file_extent_compression(eb, item)) { + csum_start = ins.objectid; + csum_end = csum_start + ins.offset; + } else { + csum_start = ins.objectid + + btrfs_file_extent_offset(eb, item); + csum_end = csum_start + + btrfs_file_extent_num_bytes(eb, item); + } + + ret = btrfs_lookup_csums_range(root->log_root, + csum_start, csum_end - 1, + &ordered_sums); + BUG_ON(ret); + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums; + sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, + root->fs_info->csum_root, + sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + } else { + btrfs_release_path(root, path); + } + } else if (found_type == BTRFS_FILE_EXTENT_INLINE) { + /* inline extents are easy, we just overwrite them */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + } + + inode_set_bytes(inode, saved_nbytes); + btrfs_update_inode(trans, root, inode); +out: + if (inode) + iput(inode); + return ret; +} + +/* + * when cleaning up conflicts between the directory names in the + * subvolume, directory names in the log and directory names in the + * inode back references, we may have to unlink inodes from directories. 
+ * + * This is a helper function to do the unlink of a specific directory + * item + */ +static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct inode *dir, + struct btrfs_dir_item *di) +{ + struct inode *inode; + char *name; + int name_len; + struct extent_buffer *leaf; + struct btrfs_key location; + int ret; + + leaf = path->nodes[0]; + + btrfs_dir_item_key_to_cpu(leaf, di, &location); + name_len = btrfs_dir_name_len(leaf, di); + name = kmalloc(name_len, GFP_NOFS); + read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len); + btrfs_release_path(root, path); + + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, path, location.objectid); + BUG_ON(ret); + ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len); + BUG_ON(ret); + kfree(name); + + iput(inode); + return ret; +} + +/* + * helper function to see if a given name and sequence number found + * in an inode back reference are already in a directory and correctly + * point to this inode + */ +static noinline int inode_in_dir(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 objectid, u64 index, + const char *name, int name_len) +{ + struct btrfs_dir_item *di; + struct btrfs_key location; + int match = 0; + + di = btrfs_lookup_dir_index_item(NULL, root, path, dirid, + index, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + btrfs_release_path(root, path); + + di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0); + if (di && !IS_ERR(di)) { + btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location); + if (location.objectid != objectid) + goto out; + } else + goto out; + match = 1; +out: + btrfs_release_path(root, path); + return match; +} + +/* + * helper function to check a log tree for a named back reference in + * an inode. This is used to decide if a back reference that is + * found in the subvolume conflicts with what we find in the log. + * + * inode backreferences may have multiple refs in a single item, + * during replay we process one reference at a time, and we don't + * want to delete valid links to a file from the subvolume if that + * link is also in the log. + */ +static noinline int backref_in_log(struct btrfs_root *log, + struct btrfs_key *key, + char *name, int namelen) +{ + struct btrfs_path *path; + struct btrfs_inode_ref *ref; + unsigned long ptr; + unsigned long ptr_end; + unsigned long name_ptr; + int found_name_len; + int item_size; + int ret; + int match = 0; + + path = btrfs_alloc_path(); + ret = btrfs_search_slot(NULL, log, key, path, 0, 0); + if (ret != 0) + goto out; + + item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + ref = (struct btrfs_inode_ref *)ptr; + found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref); + if (found_name_len == namelen) { + name_ptr = (unsigned long)(ref + 1); + ret = memcmp_extent_buffer(path->nodes[0], name, + name_ptr, namelen); + if (ret == 0) { + match = 1; + goto out; + } + } + ptr = (unsigned long)(ref + 1) + found_name_len; + } +out: + btrfs_free_path(path); + return match; +} + + +/* + * replay one inode back reference item found in the log tree. + * eb, slot and key refer to the buffer and key found in the log tree. 
+ * root is the destination we are replaying into, and path is for temp + * use by this function. (it should be released on return). + */ +static noinline int add_inode_ref(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + struct inode *dir; + int ret; + struct btrfs_key location; + struct btrfs_inode_ref *ref; + struct btrfs_dir_item *di; + struct inode *inode; + char *name; + int namelen; + unsigned long ref_ptr; + unsigned long ref_end; + + location.objectid = key->objectid; + location.type = BTRFS_INODE_ITEM_KEY; + location.offset = 0; + + /* + * it is possible that we didn't log all the parent directories + * for a given inode. If we don't find the dir, just don't + * copy the back ref in. The link count fixup code will take + * care of the rest + */ + dir = read_one_inode(root, key->offset); + if (!dir) + return -ENOENT; + + inode = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + ref_ptr = btrfs_item_ptr_offset(eb, slot); + ref_end = ref_ptr + btrfs_item_size_nr(eb, slot); + +again: + ref = (struct btrfs_inode_ref *)ref_ptr; + + namelen = btrfs_inode_ref_name_len(eb, ref); + name = kmalloc(namelen, GFP_NOFS); + BUG_ON(!name); + + read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen); + + /* if we already have a perfect match, we're done */ + if (inode_in_dir(root, path, dir->i_ino, inode->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen)) { + goto out; + } + + /* + * look for a conflicting back reference in the metadata. + * if we find one we have to unlink that name of the file + * before we add our new link. Later on, we overwrite any + * existing back reference, and we don't want to create + * dangling pointers in the directory. + */ +conflict_again: + ret = btrfs_search_slot(NULL, root, key, path, 0, 0); + if (ret == 0) { + char *victim_name; + int victim_name_len; + struct btrfs_inode_ref *victim_ref; + unsigned long ptr; + unsigned long ptr_end; + struct extent_buffer *leaf = path->nodes[0]; + + /* are we trying to overwrite a back ref for the root directory + * if so, just jump out, we're done + */ + if (key->objectid == key->offset) + goto out_nowrite; + + /* check all the names in this back reference to see + * if they are in the log. 
if so, we allow them to stay + * otherwise they must be unlinked as a conflict + */ + ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]); + while (ptr < ptr_end) { + victim_ref = (struct btrfs_inode_ref *)ptr; + victim_name_len = btrfs_inode_ref_name_len(leaf, + victim_ref); + victim_name = kmalloc(victim_name_len, GFP_NOFS); + BUG_ON(!victim_name); + + read_extent_buffer(leaf, victim_name, + (unsigned long)(victim_ref + 1), + victim_name_len); + + if (!backref_in_log(log, key, victim_name, + victim_name_len)) { + btrfs_inc_nlink(inode); + btrfs_release_path(root, path); + ret = btrfs_unlink_inode(trans, root, dir, + inode, victim_name, + victim_name_len); + kfree(victim_name); + btrfs_release_path(root, path); + goto conflict_again; + } + kfree(victim_name); + ptr = (unsigned long)(victim_ref + 1) + victim_name_len; + } + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* look for a conflicting sequence number */ + di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino, + btrfs_inode_ref_index(eb, ref), + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + + /* look for a conflicting name */ + di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino, + name, namelen, 0); + if (di && !IS_ERR(di)) { + ret = drop_one_dir_item(trans, root, path, dir, di); + BUG_ON(ret); + } + btrfs_release_path(root, path); + + /* insert our name */ + ret = btrfs_add_link(trans, dir, inode, name, namelen, 0, + btrfs_inode_ref_index(eb, ref)); + BUG_ON(ret); + + btrfs_update_inode(trans, root, inode); + +out: + ref_ptr = (unsigned long)(ref + 1) + namelen; + kfree(name); + if (ref_ptr < ref_end) + goto again; + + /* finally write the back reference in the inode */ + ret = overwrite_item(trans, root, path, eb, slot, key); + BUG_ON(ret); + +out_nowrite: + btrfs_release_path(root, path); + iput(dir); + iput(inode); + return 0; +} + +/* + * There are a few corners where the link count of the file can't + * be properly maintained during replay. So, instead of adding + * lots of complexity to the log code, we just scan the backrefs + * for any file that has been through replay. + * + * The scan will update the link count on the inode to reflect the + * number of back refs found. If it goes down to zero, the iput + * will free the inode. 
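+ *
+ * For illustration only (a rough sketch, not extra behaviour): the count
+ * is taken from the BTRFS_INODE_REF_KEY items alone, roughly
+ *
+ *	nlink = 0;
+ *	for each inode ref item of this inode
+ *		for each name packed into that item
+ *			nlink++;
+ *	if (nlink != inode->i_nlink)
+ *		write the corrected count back into the inode item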
+ */ +static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct inode *inode) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + u64 nlink = 0; + unsigned long ptr; + unsigned long ptr_end; + int name_len; + + key.objectid = inode->i_ino; + key.type = BTRFS_INODE_REF_KEY; + key.offset = (u64)-1; + + path = btrfs_alloc_path(); + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &key, + path->slots[0]); + if (key.objectid != inode->i_ino || + key.type != BTRFS_INODE_REF_KEY) + break; + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + ptr_end = ptr + btrfs_item_size_nr(path->nodes[0], + path->slots[0]); + while (ptr < ptr_end) { + struct btrfs_inode_ref *ref; + + ref = (struct btrfs_inode_ref *)ptr; + name_len = btrfs_inode_ref_name_len(path->nodes[0], + ref); + ptr = (unsigned long)(ref + 1) + name_len; + nlink++; + } + + if (key.offset == 0) + break; + key.offset--; + btrfs_release_path(root, path); + } + btrfs_free_path(path); + if (nlink != inode->i_nlink) { + inode->i_nlink = nlink; + btrfs_update_inode(trans, root, inode); + } + BTRFS_I(inode)->index_cnt = (u64)-1; + + return 0; +} + +static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path) +{ + int ret; + struct btrfs_key key; + struct inode *inode; + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + key.type = BTRFS_ORPHAN_ITEM_KEY; + key.offset = (u64)-1; + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + break; + + if (ret == 1) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID || + key.type != BTRFS_ORPHAN_ITEM_KEY) + break; + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_release_path(root, path); + inode = read_one_inode(root, key.offset); + BUG_ON(!inode); + + ret = fixup_inode_link_count(trans, root, inode); + BUG_ON(ret); + + iput(inode); + + if (key.offset == 0) + break; + key.offset--; + } + btrfs_release_path(root, path); + return 0; +} + + +/* + * record a given inode in the fixup dir so we can check its link + * count when replay is done. The link count is incremented here + * so the inode won't go away until we check it + */ +static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 objectid) +{ + struct btrfs_key key; + int ret = 0; + struct inode *inode; + + inode = read_one_inode(root, objectid); + BUG_ON(!inode); + + key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; + btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.offset = objectid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, 0); + + btrfs_release_path(root, path); + if (ret == 0) { + btrfs_inc_nlink(inode); + btrfs_update_inode(trans, root, inode); + } else if (ret == -EEXIST) { + ret = 0; + } else { + BUG(); + } + iput(inode); + + return ret; +} + +/* + * when replaying the log for a directory, we only insert names + * for inodes that actually exist. 
This means an fsync on a directory + * does not implicitly fsync all the new files in it + */ +static noinline int insert_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, u64 index, + char *name, int name_len, u8 type, + struct btrfs_key *location) +{ + struct inode *inode; + struct inode *dir; + int ret; + + inode = read_one_inode(root, location->objectid); + if (!inode) + return -ENOENT; + + dir = read_one_inode(root, dirid); + if (!dir) { + iput(inode); + return -EIO; + } + ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index); + + /* FIXME, put inode into FIXUP list */ + + iput(inode); + iput(dir); + return ret; +} + +/* + * take a single entry in a log directory item and replay it into + * the subvolume. + * + * if a conflicting item exists in the subdirectory already, + * the inode it points to is unlinked and put into the link count + * fix up tree. + * + * If a name from the log points to a file or directory that does + * not exist in the FS, it is skipped. fsyncs on directories + * do not force down inodes inside that directory, just changes to the + * names or unlinks in a directory. + */ +static noinline int replay_one_name(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, + struct btrfs_dir_item *di, + struct btrfs_key *key) +{ + char *name; + int name_len; + struct btrfs_dir_item *dst_di; + struct btrfs_key found_key; + struct btrfs_key log_key; + struct inode *dir; + u8 log_type; + int exists; + int ret; + + dir = read_one_inode(root, key->objectid); + BUG_ON(!dir); + + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + log_type = btrfs_dir_type(eb, di); + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + + btrfs_dir_item_key_to_cpu(eb, di, &log_key); + exists = btrfs_lookup_inode(trans, root, path, &log_key, 0); + if (exists == 0) + exists = 1; + else + exists = 0; + btrfs_release_path(root, path); + + if (key->type == BTRFS_DIR_ITEM_KEY) { + dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid, + name, name_len, 1); + } else if (key->type == BTRFS_DIR_INDEX_KEY) { + dst_di = btrfs_lookup_dir_index_item(trans, root, path, + key->objectid, + key->offset, name, + name_len, 1); + } else { + BUG(); + } + if (!dst_di || IS_ERR(dst_di)) { + /* we need a sequence number to insert, so we only + * do inserts for the BTRFS_DIR_INDEX_KEY types + */ + if (key->type != BTRFS_DIR_INDEX_KEY) + goto out; + goto insert; + } + + btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key); + /* the existing item matches the logged item */ + if (found_key.objectid == log_key.objectid && + found_key.type == log_key.type && + found_key.offset == log_key.offset && + btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + goto out; + } + + /* + * don't drop the conflicting directory entry if the inode + * for the new entry doesn't exist + */ + if (!exists) + goto out; + + ret = drop_one_dir_item(trans, root, path, dir, dst_di); + BUG_ON(ret); + + if (key->type == BTRFS_DIR_INDEX_KEY) + goto insert; +out: + btrfs_release_path(root, path); + kfree(name); + iput(dir); + return 0; + +insert: + btrfs_release_path(root, path); + ret = insert_one_name(trans, root, path, key->objectid, key->offset, + name, name_len, log_type, &log_key); + + if (ret && ret != -ENOENT) + BUG(); + goto out; +} + +/* + * find all the names in a directory item and reconcile them into + * the subvolume. 
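+ * (Illustrative layout note: every name in the item is a struct
+ * btrfs_dir_item header followed by the name bytes, so the loop below
+ * advances with roughly
+ *	di = (struct btrfs_dir_item *)ptr;
+ *	ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(eb, di);
+ * until it reaches the end of the item.)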
Only BTRFS_DIR_ITEM_KEY types will have more than + * one name in a directory item, but the same code gets used for + * both directory index types + */ +static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, + struct extent_buffer *eb, int slot, + struct btrfs_key *key) +{ + int ret; + u32 item_size = btrfs_item_size_nr(eb, slot); + struct btrfs_dir_item *di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + ret = replay_one_name(trans, root, path, eb, di, key); + BUG_ON(ret); + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + return 0; +} + +/* + * directory replay has two parts. There are the standard directory + * items in the log copied from the subvolume, and range items + * created in the log while the subvolume was logged. + * + * The range items tell us which parts of the key space the log + * is authoritative for. During replay, if a key in the subvolume + * directory is in a logged range item, but not actually in the log + * that means it was deleted from the directory before the fsync + * and should be removed. + */ +static noinline int find_dir_range(struct btrfs_root *root, + struct btrfs_path *path, + u64 dirid, int key_type, + u64 *start_ret, u64 *end_ret) +{ + struct btrfs_key key; + u64 found_end; + struct btrfs_dir_log_item *item; + int ret; + int nritems; + + if (*start_ret == (u64)-1) + return 1; + + key.objectid = dirid; + key.type = key_type; + key.offset = *start_ret; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto out; + if (ret > 0) { + if (path->slots[0] == 0) + goto out; + path->slots[0]--; + } + if (ret != 0) + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto next; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + + if (*start_ret >= key.offset && *start_ret <= found_end) { + ret = 0; + *start_ret = key.offset; + *end_ret = found_end; + goto out; + } + ret = 1; +next: + /* check the next slot in the tree to see if it is a valid item */ + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + goto out; + } else { + path->slots[0]++; + } + + btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); + + if (key.type != key_type || key.objectid != dirid) { + ret = 1; + goto out; + } + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + found_end = btrfs_dir_log_end(path->nodes[0], item); + *start_ret = key.offset; + *end_ret = found_end; + ret = 0; +out: + btrfs_release_path(root, path); + return ret; +} + +/* + * this looks for a given directory item in the log. 
If the directory + * item is not in the log, the item is removed and the inode it points + * to is unlinked + */ +static noinline int check_item_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + struct btrfs_path *log_path, + struct inode *dir, + struct btrfs_key *dir_key) +{ + int ret; + struct extent_buffer *eb; + int slot; + u32 item_size; + struct btrfs_dir_item *di; + struct btrfs_dir_item *log_di; + int name_len; + unsigned long ptr; + unsigned long ptr_end; + char *name; + struct inode *inode; + struct btrfs_key location; + +again: + eb = path->nodes[0]; + slot = path->slots[0]; + item_size = btrfs_item_size_nr(eb, slot); + ptr = btrfs_item_ptr_offset(eb, slot); + ptr_end = ptr + item_size; + while (ptr < ptr_end) { + di = (struct btrfs_dir_item *)ptr; + name_len = btrfs_dir_name_len(eb, di); + name = kmalloc(name_len, GFP_NOFS); + if (!name) { + ret = -ENOMEM; + goto out; + } + read_extent_buffer(eb, name, (unsigned long)(di + 1), + name_len); + log_di = NULL; + if (dir_key->type == BTRFS_DIR_ITEM_KEY) { + log_di = btrfs_lookup_dir_item(trans, log, log_path, + dir_key->objectid, + name, name_len, 0); + } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) { + log_di = btrfs_lookup_dir_index_item(trans, log, + log_path, + dir_key->objectid, + dir_key->offset, + name, name_len, 0); + } + if (!log_di || IS_ERR(log_di)) { + btrfs_dir_item_key_to_cpu(eb, di, &location); + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + inode = read_one_inode(root, location.objectid); + BUG_ON(!inode); + + ret = link_to_fixup_dir(trans, root, + path, location.objectid); + BUG_ON(ret); + btrfs_inc_nlink(inode); + ret = btrfs_unlink_inode(trans, root, dir, inode, + name, name_len); + BUG_ON(ret); + kfree(name); + iput(inode); + + /* there might still be more names under this key + * check and repeat if required + */ + ret = btrfs_search_slot(NULL, root, dir_key, path, + 0, 0); + if (ret == 0) + goto again; + ret = 0; + goto out; + } + btrfs_release_path(log, log_path); + kfree(name); + + ptr = (unsigned long)(di + 1); + ptr += name_len; + } + ret = 0; +out: + btrfs_release_path(root, path); + btrfs_release_path(log, log_path); + return ret; +} + +/* + * deletion replay happens before we copy any new directory items + * out of the log or out of backreferences from inodes. It + * scans the log to find ranges of keys that log is authoritative for, + * and then scans the directory to find items in those ranges that are + * not present in the log. + * + * Anything we don't find in the log is unlinked and removed from the + * directory. 
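+ *
+ * A rough sketch of the flow, for illustration only:
+ *
+ *	while (find_dir_range() hands back an authoritative [start, end])
+ *		for each dir item of this directory with an offset in range
+ *			check_item_in_log();	unlinks it if it is missing
+ *
+ * and the whole pass runs twice, once for the DIR_LOG_ITEM ranges and
+ * once for the DIR_LOG_INDEX ranges.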
+ */ +static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_root *log, + struct btrfs_path *path, + u64 dirid) +{ + u64 range_start; + u64 range_end; + int key_type = BTRFS_DIR_LOG_ITEM_KEY; + int ret = 0; + struct btrfs_key dir_key; + struct btrfs_key found_key; + struct btrfs_path *log_path; + struct inode *dir; + + dir_key.objectid = dirid; + dir_key.type = BTRFS_DIR_ITEM_KEY; + log_path = btrfs_alloc_path(); + if (!log_path) + return -ENOMEM; + + dir = read_one_inode(root, dirid); + /* it isn't an error if the inode isn't there, that can happen + * because we replay the deletes before we copy in the inode item + * from the log + */ + if (!dir) { + btrfs_free_path(log_path); + return 0; + } +again: + range_start = 0; + range_end = 0; + while (1) { + ret = find_dir_range(log, path, dirid, key_type, + &range_start, &range_end); + if (ret != 0) + break; + + dir_key.offset = range_start; + while (1) { + int nritems; + ret = btrfs_search_slot(NULL, root, &dir_key, path, + 0, 0); + if (ret < 0) + goto out; + + nritems = btrfs_header_nritems(path->nodes[0]); + if (path->slots[0] >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != dirid || + found_key.type != dir_key.type) + goto next_type; + + if (found_key.offset > range_end) + break; + + ret = check_item_in_log(trans, root, log, path, + log_path, dir, &found_key); + BUG_ON(ret); + if (found_key.offset == (u64)-1) + break; + dir_key.offset = found_key.offset + 1; + } + btrfs_release_path(root, path); + if (range_end == (u64)-1) + break; + range_start = range_end + 1; + } + +next_type: + ret = 0; + if (key_type == BTRFS_DIR_LOG_ITEM_KEY) { + key_type = BTRFS_DIR_LOG_INDEX_KEY; + dir_key.type = BTRFS_DIR_INDEX_KEY; + btrfs_release_path(root, path); + goto again; + } +out: + btrfs_release_path(root, path); + btrfs_free_path(log_path); + iput(dir); + return ret; +} + +/* + * the process_func used to replay items from the log tree. This + * gets called in two different stages. The first stage just looks + * for inodes and makes sure they are all copied into the subvolume. + * + * The second stage copies all the other item types from the log into + * the subvolume. The two stage approach is slower, but gets rid of + * lots of complexity around inodes referencing other inodes that exist + * only in the log (references come from either directory items or inode + * back refs). 
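+ *
+ * As a rough illustration, the per-key dispatch below amounts to:
+ *
+ *	LOG_WALK_REPLAY_INODES: inode items only (plus dir delete replay
+ *				and EOF truncation for the inodes found)
+ *	LOG_WALK_REPLAY_ALL:	xattrs, inode refs, file extents and
+ *				directory items as well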
+ */ +static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, + struct walk_control *wc, u64 gen) +{ + int nritems; + struct btrfs_path *path; + struct btrfs_root *root = wc->replay_dest; + struct btrfs_key key; + u32 item_size; + int level; + int i; + int ret; + + btrfs_read_buffer(eb, gen); + + level = btrfs_header_level(eb); + + if (level != 0) + return 0; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + nritems = btrfs_header_nritems(eb); + for (i = 0; i < nritems; i++) { + btrfs_item_key_to_cpu(eb, &key, i); + item_size = btrfs_item_size_nr(eb, i); + + /* inode keys are done during the first stage */ + if (key.type == BTRFS_INODE_ITEM_KEY && + wc->stage == LOG_WALK_REPLAY_INODES) { + struct inode *inode; + struct btrfs_inode_item *inode_item; + u32 mode; + + inode_item = btrfs_item_ptr(eb, i, + struct btrfs_inode_item); + mode = btrfs_inode_mode(eb, inode_item); + if (S_ISDIR(mode)) { + ret = replay_dir_deletes(wc->trans, + root, log, path, key.objectid); + BUG_ON(ret); + } + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + + /* for regular files, truncate away + * extents past the new EOF + */ + if (S_ISREG(mode)) { + inode = read_one_inode(root, + key.objectid); + BUG_ON(!inode); + + ret = btrfs_truncate_inode_items(wc->trans, + root, inode, inode->i_size, + BTRFS_EXTENT_DATA_KEY); + BUG_ON(ret); + iput(inode); + } + ret = link_to_fixup_dir(wc->trans, root, + path, key.objectid); + BUG_ON(ret); + } + if (wc->stage < LOG_WALK_REPLAY_ALL) + continue; + + /* these keys are simply copied */ + if (key.type == BTRFS_XATTR_ITEM_KEY) { + ret = overwrite_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_INODE_REF_KEY) { + ret = add_inode_ref(wc->trans, root, log, path, + eb, i, &key); + BUG_ON(ret && ret != -ENOENT); + } else if (key.type == BTRFS_EXTENT_DATA_KEY) { + ret = replay_one_extent(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } else if (key.type == BTRFS_DIR_ITEM_KEY || + key.type == BTRFS_DIR_INDEX_KEY) { + ret = replay_one_dir_item(wc->trans, root, path, + eb, i, &key); + BUG_ON(ret); + } + } + btrfs_free_path(path); + return 0; +} + +static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + u64 bytenr; + u64 ptr_gen; + struct extent_buffer *next; + struct extent_buffer *cur; + struct extent_buffer *parent; + u32 blocksize; + int ret = 0; + + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + while (*level > 0) { + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + cur = path->nodes[*level]; + + if (btrfs_header_level(cur) != *level) + WARN_ON(1); + + if (path->slots[*level] >= + btrfs_header_nritems(cur)) + break; + + bytenr = btrfs_node_blockptr(cur, path->slots[*level]); + ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); + blocksize = btrfs_level_size(root, *level - 1); + + parent = path->nodes[*level]; + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + next = btrfs_find_create_tree_block(root, bytenr, blocksize); + + wc->process_func(root, next, wc, ptr_gen); + + if (*level == 1) { + path->slots[*level]++; + if (wc->free) { + btrfs_read_buffer(next, ptr_gen); + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + ret = btrfs_drop_leaf_ref(trans, root, next); + 
BUG_ON(ret); + + WARN_ON(root_owner != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(next); + continue; + } + btrfs_read_buffer(next, ptr_gen); + + WARN_ON(*level <= 0); + if (path->nodes[*level-1]) + free_extent_buffer(path->nodes[*level-1]); + path->nodes[*level-1] = next; + *level = btrfs_header_level(next); + path->slots[*level] = 0; + cond_resched(); + } + WARN_ON(*level < 0); + WARN_ON(*level >= BTRFS_MAX_LEVEL); + + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + bytenr = path->nodes[*level]->start; + + blocksize = btrfs_level_size(root, *level); + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + + if (wc->free) { + next = path->nodes[*level]; + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, next); + BUG_ON(ret); + } + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, bytenr, blocksize); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level += 1; + + cond_resched(); + return 0; +} + +static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_path *path, int *level, + struct walk_control *wc) +{ + u64 root_owner; + u64 root_gen; + int i; + int slot; + int ret; + + for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) { + slot = path->slots[i]; + if (slot < btrfs_header_nritems(path->nodes[i]) - 1) { + struct extent_buffer *node; + node = path->nodes[i]; + path->slots[i]++; + *level = i; + WARN_ON(*level == 0); + return 0; + } else { + struct extent_buffer *parent; + if (path->nodes[*level] == root->node) + parent = path->nodes[*level]; + else + parent = path->nodes[*level + 1]; + + root_owner = btrfs_header_owner(parent); + root_gen = btrfs_header_generation(parent); + wc->process_func(root, path->nodes[*level], wc, + btrfs_header_generation(path->nodes[*level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[*level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, root, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (*level == 0) { + ret = btrfs_drop_leaf_ref(trans, root, + next); + BUG_ON(ret); + } + + WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(root, + path->nodes[*level]->start, + path->nodes[*level]->len); + BUG_ON(ret); + } + free_extent_buffer(path->nodes[*level]); + path->nodes[*level] = NULL; + *level = i + 1; + } + } + return 1; +} + +/* + * drop the reference count on the tree rooted at 'snap'. This traverses + * the tree freeing any blocks that have a ref count of zero after being + * decremented. 
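+ *
+ * Illustrative usage (see btrfs_free_log() below for a real caller):
+ *
+ *	struct walk_control wc = {
+ *		.free = 1,
+ *		.process_func = process_one_buffer
+ *	};
+ *	ret = walk_log_tree(trans, log, &wc);
+ *
+ * where .free asks the walk to release each log block after processing.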
+ */ +static int walk_log_tree(struct btrfs_trans_handle *trans, + struct btrfs_root *log, struct walk_control *wc) +{ + int ret = 0; + int wret; + int level; + struct btrfs_path *path; + int i; + int orig_level; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + level = btrfs_header_level(log->node); + orig_level = level; + path->nodes[level] = log->node; + extent_buffer_get(log->node); + path->slots[level] = 0; + + while (1) { + wret = walk_down_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + + wret = walk_up_log_tree(trans, log, path, &level, wc); + if (wret > 0) + break; + if (wret < 0) + ret = wret; + } + + /* was the root node processed? if not, catch it here */ + if (path->nodes[orig_level]) { + wc->process_func(log, path->nodes[orig_level], wc, + btrfs_header_generation(path->nodes[orig_level])); + if (wc->free) { + struct extent_buffer *next; + + next = path->nodes[orig_level]; + + btrfs_tree_lock(next); + clean_tree_block(trans, log, next); + btrfs_wait_tree_block_writeback(next); + btrfs_tree_unlock(next); + + if (orig_level == 0) { + ret = btrfs_drop_leaf_ref(trans, log, + next); + BUG_ON(ret); + } + WARN_ON(log->root_key.objectid != + BTRFS_TREE_LOG_OBJECTID); + ret = btrfs_free_reserved_extent(log, next->start, + next->len); + BUG_ON(ret); + } + } + + for (i = 0; i <= orig_level; i++) { + if (path->nodes[i]) { + free_extent_buffer(path->nodes[i]); + path->nodes[i] = NULL; + } + } + btrfs_free_path(path); + if (wc->free) + free_extent_buffer(log->node); + return ret; +} + +static int wait_log_commit(struct btrfs_root *log) +{ + DEFINE_WAIT(wait); + u64 transid = log->fs_info->tree_log_transid; + + do { + prepare_to_wait(&log->fs_info->tree_log_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_commit)) + schedule(); + finish_wait(&log->fs_info->tree_log_wait, &wait); + mutex_lock(&log->fs_info->tree_log_mutex); + } while (transid == log->fs_info->tree_log_transid && + atomic_read(&log->fs_info->tree_log_commit)); + return 0; +} + +/* + * btrfs_sync_log does sends a given tree log down to the disk and + * updates the super blocks to record it. 
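+ * A typical fsync path, sketched only for illustration, looks like
+ *
+ *	ret = btrfs_log_dentry_safe(trans, root, dentry);
+ *	if (ret == 0)
+ *		btrfs_sync_log(trans, root);
+ *	else
+ *		btrfs_commit_transaction(trans, root);
+ *
+ * with btrfs_sync_log() being the call that pushes the log tree out.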
When this call is done, + * you know that any inodes previously logged are safely on disk + */ +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + int ret; + unsigned long batch; + struct btrfs_root *log = root->log_root; + + mutex_lock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_commit)) { + wait_log_commit(log); + goto out; + } + atomic_set(&log->fs_info->tree_log_commit, 1); + + while (1) { + batch = log->fs_info->tree_log_batch; + mutex_unlock(&log->fs_info->tree_log_mutex); + schedule_timeout_uninterruptible(1); + mutex_lock(&log->fs_info->tree_log_mutex); + + while (atomic_read(&log->fs_info->tree_log_writers)) { + DEFINE_WAIT(wait); + prepare_to_wait(&log->fs_info->tree_log_wait, &wait, + TASK_UNINTERRUPTIBLE); + mutex_unlock(&log->fs_info->tree_log_mutex); + if (atomic_read(&log->fs_info->tree_log_writers)) + schedule(); + mutex_lock(&log->fs_info->tree_log_mutex); + finish_wait(&log->fs_info->tree_log_wait, &wait); + } + if (batch == log->fs_info->tree_log_batch) + break; + } + + ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages); + BUG_ON(ret); + ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree, + &root->fs_info->log_root_tree->dirty_log_pages); + BUG_ON(ret); + + btrfs_set_super_log_root(&root->fs_info->super_for_commit, + log->fs_info->log_root_tree->node->start); + btrfs_set_super_log_root_level(&root->fs_info->super_for_commit, + btrfs_header_level(log->fs_info->log_root_tree->node)); + + write_ctree_super(trans, log->fs_info->tree_root, 2); + log->fs_info->tree_log_transid++; + log->fs_info->tree_log_batch = 0; + atomic_set(&log->fs_info->tree_log_commit, 0); + smp_mb(); + if (waitqueue_active(&log->fs_info->tree_log_wait)) + wake_up(&log->fs_info->tree_log_wait); +out: + mutex_unlock(&log->fs_info->tree_log_mutex); + return 0; +} + +/* * free all the extents used by the tree log. 
This should be called
+ * at commit time of the full transaction
+ */
+int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
+{
+	int ret;
+	struct btrfs_root *log;
+	u64 start;
+	u64 end;
+	struct walk_control wc = {
+		.free = 1,
+		.process_func = process_one_buffer
+	};
+
+	if (!root->log_root || root->fs_info->log_root_recovering)
+		return 0;
+
+	log = root->log_root;
+	ret = walk_log_tree(trans, log, &wc);
+	BUG_ON(ret);
+
+	while (1) {
+		ret = find_first_extent_bit(&log->dirty_log_pages,
+				    0, &start, &end, EXTENT_DIRTY);
+		if (ret)
+			break;
+
+		clear_extent_dirty(&log->dirty_log_pages,
+				   start, end, GFP_NOFS);
+	}
+
+	log = root->log_root;
+	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
+			     &log->root_key);
+	BUG_ON(ret);
+	root->log_root = NULL;
+	kfree(log);
+	return 0;
+}
+
+/*
+ * helper function to update the item for a given subvolume's log root
+ * in the tree of log roots
+ */
+static int update_log_root(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *log)
+{
+	u64 bytenr = btrfs_root_bytenr(&log->root_item);
+	int ret;
+
+	if (log->node->start == bytenr)
+		return 0;
+
+	btrfs_set_root_bytenr(&log->root_item, log->node->start);
+	btrfs_set_root_generation(&log->root_item, trans->transid);
+	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
+	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
+				&log->root_key, &log->root_item);
+	BUG_ON(ret);
+	return ret;
+}
+
+/*
+ * If both a file and directory are logged, and unlinks or renames are
+ * mixed in, we have a few interesting corners:
+ *
+ * create file X in dir Y
+ * link file X to X.link in dir Y
+ * fsync file X
+ * unlink file X but leave X.link
+ * fsync dir Y
+ *
+ * After a crash we would expect only X.link to exist.  But file X
+ * didn't get fsync'd again so the log has back refs for X and X.link.
+ *
+ * We solve this by removing directory entries and inode backrefs from the
+ * log when a file that was logged in the current transaction is
+ * unlinked.  Any later fsync will include the updated log entries, and
+ * we'll be able to reconstruct the proper directory items from backrefs.
+ *
+ * This optimization allows us to avoid relogging the entire inode
+ * or the entire directory.
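+ *
+ * For illustration, the unlink path is expected to call both helpers:
+ *
+ *	btrfs_del_dir_entries_in_log(trans, root, name, name_len, dir, index);
+ *	btrfs_del_inode_ref_in_log(trans, root, name, name_len, inode, dir->i_ino);
+ *
+ * so that neither the directory entry nor the inode back reference for
+ * the removed name survives in the log.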
+ */ +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index) +{ + struct btrfs_root *log; + struct btrfs_dir_item *di; + struct btrfs_path *path; + int ret; + int bytes_del = 0; + + if (BTRFS_I(dir)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + + mutex_lock(&BTRFS_I(dir)->log_mutex); + + log = root->log_root; + path = btrfs_alloc_path(); + di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino, + name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + btrfs_release_path(log, path); + di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino, + index, name, name_len, -1); + if (di && !IS_ERR(di)) { + ret = btrfs_delete_one_dir_name(trans, log, path, di); + bytes_del += name_len; + BUG_ON(ret); + } + + /* update the directory size in the log to reflect the names + * we have removed + */ + if (bytes_del) { + struct btrfs_key key; + + key.objectid = dir->i_ino; + key.offset = 0; + key.type = BTRFS_INODE_ITEM_KEY; + btrfs_release_path(log, path); + + ret = btrfs_search_slot(trans, log, &key, path, 0, 1); + if (ret == 0) { + struct btrfs_inode_item *item; + u64 i_size; + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_inode_item); + i_size = btrfs_inode_size(path->nodes[0], item); + if (i_size > bytes_del) + i_size -= bytes_del; + else + i_size = 0; + btrfs_set_inode_size(path->nodes[0], item, i_size); + btrfs_mark_buffer_dirty(path->nodes[0]); + } else + ret = 0; + btrfs_release_path(log, path); + } + + btrfs_free_path(path); + mutex_unlock(&BTRFS_I(dir)->log_mutex); + end_log_trans(root); + + return 0; +} + +/* see comments for btrfs_del_dir_entries_in_log */ +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid) +{ + struct btrfs_root *log; + u64 index; + int ret; + + if (BTRFS_I(inode)->logged_trans < trans->transid) + return 0; + + ret = join_running_log_trans(root); + if (ret) + return 0; + log = root->log_root; + mutex_lock(&BTRFS_I(inode)->log_mutex); + + ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino, + dirid, &index); + mutex_unlock(&BTRFS_I(inode)->log_mutex); + end_log_trans(root); + + return ret; +} + +/* + * creates a range item in the log for 'dirid'. first_offset and + * last_offset tell us which parts of the key space the log should + * be considered authoritative for. + */ +static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + int key_type, u64 dirid, + u64 first_offset, u64 last_offset) +{ + int ret; + struct btrfs_key key; + struct btrfs_dir_log_item *item; + + key.objectid = dirid; + key.offset = first_offset; + if (key_type == BTRFS_DIR_ITEM_KEY) + key.type = BTRFS_DIR_LOG_ITEM_KEY; + else + key.type = BTRFS_DIR_LOG_INDEX_KEY; + ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); + BUG_ON(ret); + + item = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_dir_log_item); + btrfs_set_dir_log_end(path->nodes[0], item, last_offset); + btrfs_mark_buffer_dirty(path->nodes[0]); + btrfs_release_path(log, path); + return 0; +} + +/* + * log all the items included in the current transaction for a given + * directory. 
This also creates the range items in the log tree required + * to replay anything deleted before the fsync + */ +static noinline int log_dir_items(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path, int key_type, + u64 min_offset, u64 *last_offset_ret) +{ + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src; + int ret; + int i; + int nritems; + u64 first_offset = min_offset; + u64 last_offset = (u64)-1; + + log = root->log_root; + max_key.objectid = inode->i_ino; + max_key.offset = (u64)-1; + max_key.type = key_type; + + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = min_offset; + + path->keep_locks = 1; + + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + + /* + * we didn't find anything from this transaction, see if there + * is anything at all + */ + if (ret != 0 || min_key.objectid != inode->i_ino || + min_key.type != key_type) { + min_key.objectid = inode->i_ino; + min_key.type = key_type; + min_key.offset = (u64)-1; + btrfs_release_path(root, path); + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret < 0) { + btrfs_release_path(root, path); + return ret; + } + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + + /* if ret == 0 there are items for this type, + * create a range to tell us the last key of this type. + * otherwise, there are no items in this directory after + * *min_offset, and we create a range to indicate that. + */ + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, + path->slots[0]); + if (key_type == tmp.type) + first_offset = max(min_offset, tmp.offset) + 1; + } + goto done; + } + + /* go backward to find any previous key */ + ret = btrfs_previous_item(root, path, inode->i_ino, key_type); + if (ret == 0) { + struct btrfs_key tmp; + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (key_type == tmp.type) { + first_offset = tmp.offset; + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + } + } + btrfs_release_path(root, path); + + /* find the first key from this transaction again */ + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); + if (ret != 0) { + WARN_ON(1); + goto done; + } + + /* + * we have a block from this transaction, log every item in it + * from our directory + */ + while (1) { + struct btrfs_key tmp; + src = path->nodes[0]; + nritems = btrfs_header_nritems(src); + for (i = path->slots[0]; i < nritems; i++) { + btrfs_item_key_to_cpu(src, &min_key, i); + + if (min_key.objectid != inode->i_ino || + min_key.type != key_type) + goto done; + ret = overwrite_item(trans, log, dst_path, src, i, + &min_key); + BUG_ON(ret); + } + path->slots[0] = nritems; + + /* + * look ahead to the next item and see if it is also + * from this directory and from this transaction + */ + ret = btrfs_next_leaf(root, path); + if (ret == 1) { + last_offset = (u64)-1; + goto done; + } + btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]); + if (tmp.objectid != inode->i_ino || tmp.type != key_type) { + last_offset = (u64)-1; + goto done; + } + if (btrfs_header_generation(path->nodes[0]) != trans->transid) { + ret = overwrite_item(trans, log, dst_path, + path->nodes[0], path->slots[0], + &tmp); + + BUG_ON(ret); + last_offset = tmp.offset; + goto done; + } + } +done: + *last_offset_ret = last_offset; + 
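+	/*
+	 * note: the range item written below is what deletion replay
+	 * later uses to decide that a name missing from the log inside
+	 * [first_offset, last_offset] was really deleted
+	 */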
btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + + /* insert the log range keys to indicate where the log is valid */ + ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino, + first_offset, last_offset); + BUG_ON(ret); + return 0; +} + +/* + * logging directories is very similar to logging inodes, We find all the items + * from the current transaction and write them to the log. + * + * The recovery code scans the directory in the subvolume, and if it finds a + * key in the range logged that is not present in the log tree, then it means + * that dir entry was unlinked during the transaction. + * + * In order for that scan to work, we must include one key smaller than + * the smallest logged by this transaction and one key larger than the largest + * key logged by this transaction. + */ +static noinline int log_directory_changes(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + struct btrfs_path *path, + struct btrfs_path *dst_path) +{ + u64 min_key; + u64 max_key; + int ret; + int key_type = BTRFS_DIR_ITEM_KEY; + +again: + min_key = 0; + max_key = 0; + while (1) { + ret = log_dir_items(trans, root, inode, path, + dst_path, key_type, min_key, + &max_key); + BUG_ON(ret); + if (max_key == (u64)-1) + break; + min_key = max_key + 1; + } + + if (key_type == BTRFS_DIR_ITEM_KEY) { + key_type = BTRFS_DIR_INDEX_KEY; + goto again; + } + return 0; +} + +/* + * a helper function to drop items from the log before we relog an + * inode. max_key_type indicates the highest item type to remove. + * This cannot be run for file data extents because it does not + * free the extents they point to. + */ +static int drop_objectid_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *path, + u64 objectid, int max_key_type) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + + key.objectid = objectid; + key.type = max_key_type; + key.offset = (u64)-1; + + while (1) { + ret = btrfs_search_slot(trans, log, &key, path, -1, 1); + + if (ret != 1) + break; + + if (path->slots[0] == 0) + break; + + path->slots[0]--; + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + + if (found_key.objectid != objectid) + break; + + ret = btrfs_del_item(trans, log, path); + BUG_ON(ret); + btrfs_release_path(log, path); + } + btrfs_release_path(log, path); + return 0; +} + +static noinline int copy_items(struct btrfs_trans_handle *trans, + struct btrfs_root *log, + struct btrfs_path *dst_path, + struct extent_buffer *src, + int start_slot, int nr, int inode_only) +{ + unsigned long src_offset; + unsigned long dst_offset; + struct btrfs_file_extent_item *extent; + struct btrfs_inode_item *inode_item; + int ret; + struct btrfs_key *ins_keys; + u32 *ins_sizes; + char *ins_data; + int i; + struct list_head ordered_sums; + + INIT_LIST_HEAD(&ordered_sums); + + ins_data = kmalloc(nr * sizeof(struct btrfs_key) + + nr * sizeof(u32), GFP_NOFS); + ins_sizes = (u32 *)ins_data; + ins_keys = (struct btrfs_key *)(ins_data + nr * sizeof(u32)); + + for (i = 0; i < nr; i++) { + ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot); + btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot); + } + ret = btrfs_insert_empty_items(trans, log, dst_path, + ins_keys, ins_sizes, nr); + BUG_ON(ret); + + for (i = 0; i < nr; i++) { + dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0], + dst_path->slots[0]); + + src_offset = btrfs_item_ptr_offset(src, start_slot + i); + + copy_extent_buffer(dst_path->nodes[0], src, dst_offset, 
+ src_offset, ins_sizes[i]); + + if (inode_only == LOG_INODE_EXISTS && + ins_keys[i].type == BTRFS_INODE_ITEM_KEY) { + inode_item = btrfs_item_ptr(dst_path->nodes[0], + dst_path->slots[0], + struct btrfs_inode_item); + btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0); + + /* set the generation to zero so the recover code + * can tell the difference between an logging + * just to say 'this inode exists' and a logging + * to say 'update this inode with these values' + */ + btrfs_set_inode_generation(dst_path->nodes[0], + inode_item, 0); + } + /* take a reference on file data extents so that truncates + * or deletes of this inode don't have to relog the inode + * again + */ + if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) { + int found_type; + extent = btrfs_item_ptr(src, start_slot + i, + struct btrfs_file_extent_item); + + found_type = btrfs_file_extent_type(src, extent); + if (found_type == BTRFS_FILE_EXTENT_REG || + found_type == BTRFS_FILE_EXTENT_PREALLOC) { + u64 ds = btrfs_file_extent_disk_bytenr(src, + extent); + u64 dl = btrfs_file_extent_disk_num_bytes(src, + extent); + u64 cs = btrfs_file_extent_offset(src, extent); + u64 cl = btrfs_file_extent_num_bytes(src, + extent);; + if (btrfs_file_extent_compression(src, + extent)) { + cs = 0; + cl = dl; + } + /* ds == 0 is a hole */ + if (ds != 0) { + ret = btrfs_inc_extent_ref(trans, log, + ds, dl, + dst_path->nodes[0]->start, + BTRFS_TREE_LOG_OBJECTID, + trans->transid, + ins_keys[i].objectid); + BUG_ON(ret); + ret = btrfs_lookup_csums_range( + log->fs_info->csum_root, + ds + cs, ds + cs + cl - 1, + &ordered_sums); + BUG_ON(ret); + } + } + } + dst_path->slots[0]++; + } + + btrfs_mark_buffer_dirty(dst_path->nodes[0]); + btrfs_release_path(log, dst_path); + kfree(ins_data); + + /* + * we have to do this after the loop above to avoid changing the + * log tree while trying to change the log tree. + */ + while (!list_empty(&ordered_sums)) { + struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, + struct btrfs_ordered_sum, + list); + ret = btrfs_csum_file_blocks(trans, log, sums); + BUG_ON(ret); + list_del(&sums->list); + kfree(sums); + } + return 0; +} + +/* log a single inode in the tree log. + * At least one parent directory for this inode must exist in the tree + * or be logged already. + * + * Any items from this inode changed by the current transaction are copied + * to the log tree. An extra reference is taken on any extents in this + * file, allowing us to avoid a whole pile of corner cases around logging + * blocks that have been removed from the tree. + * + * See LOG_INODE_ALL and related defines for a description of what inode_only + * does. + * + * This handles both files and directories. 
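+ *
+ * Rough illustration of the two modes: LOG_INODE_ALL copies every item
+ * of the inode changed in this transaction (and, for directories, the
+ * directory items as well), while LOG_INODE_EXISTS copies just enough,
+ * with the inode generation forced to 0 in copy_items(), for replay to
+ * learn that the inode exists.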
+ */ +static int __btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + struct btrfs_path *path; + struct btrfs_path *dst_path; + struct btrfs_key min_key; + struct btrfs_key max_key; + struct btrfs_root *log = root->log_root; + struct extent_buffer *src = NULL; + u32 size; + int ret; + int nritems; + int ins_start_slot = 0; + int ins_nr; + + log = root->log_root; + + path = btrfs_alloc_path(); + dst_path = btrfs_alloc_path(); + + min_key.objectid = inode->i_ino; + min_key.type = BTRFS_INODE_ITEM_KEY; + min_key.offset = 0; + + max_key.objectid = inode->i_ino; + if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode)) + max_key.type = BTRFS_XATTR_ITEM_KEY; + else + max_key.type = (u8)-1; + max_key.offset = (u64)-1; + + /* + * if this inode has already been logged and we're in inode_only + * mode, we don't want to delete the things that have already + * been written to the log. + * + * But, if the inode has been through an inode_only log, + * the logged_trans field is not set. This allows us to catch + * any new names for this inode in the backrefs by logging it + * again + */ + if (inode_only == LOG_INODE_EXISTS && + BTRFS_I(inode)->logged_trans == trans->transid) { + btrfs_free_path(path); + btrfs_free_path(dst_path); + goto out; + } + mutex_lock(&BTRFS_I(inode)->log_mutex); + + /* + * a brute force approach to making sure we get the most uptodate + * copies of everything. + */ + if (S_ISDIR(inode->i_mode)) { + int max_key_type = BTRFS_DIR_LOG_INDEX_KEY; + + if (inode_only == LOG_INODE_EXISTS) + max_key_type = BTRFS_XATTR_ITEM_KEY; + ret = drop_objectid_items(trans, log, path, + inode->i_ino, max_key_type); + } else { + ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0); + } + BUG_ON(ret); + path->keep_locks = 1; + + while (1) { + ins_nr = 0; + ret = btrfs_search_forward(root, &min_key, &max_key, + path, 0, trans->transid); + if (ret != 0) + break; +again: + /* note, ins_nr might be > 0 here, cleanup outside the loop */ + if (min_key.objectid != inode->i_ino) + break; + if (min_key.type > max_key.type) + break; + + src = path->nodes[0]; + size = btrfs_item_size_nr(src, path->slots[0]); + if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) { + ins_nr++; + goto next_slot; + } else if (!ins_nr) { + ins_start_slot = path->slots[0]; + ins_nr = 1; + goto next_slot; + } + + ret = copy_items(trans, log, dst_path, src, ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 1; + ins_start_slot = path->slots[0]; +next_slot: + + nritems = btrfs_header_nritems(path->nodes[0]); + path->slots[0]++; + if (path->slots[0] < nritems) { + btrfs_item_key_to_cpu(path->nodes[0], &min_key, + path->slots[0]); + goto again; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + btrfs_release_path(root, path); + + if (min_key.offset < (u64)-1) + min_key.offset++; + else if (min_key.type < (u8)-1) + min_key.type++; + else if (min_key.objectid < (u64)-1) + min_key.objectid++; + else + break; + } + if (ins_nr) { + ret = copy_items(trans, log, dst_path, src, + ins_start_slot, + ins_nr, inode_only); + BUG_ON(ret); + ins_nr = 0; + } + WARN_ON(ins_nr); + if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) { + btrfs_release_path(root, path); + btrfs_release_path(log, dst_path); + BTRFS_I(inode)->log_dirty_trans = 0; + ret = log_directory_changes(trans, root, inode, path, dst_path); + BUG_ON(ret); + } + BTRFS_I(inode)->logged_trans = 
trans->transid; + mutex_unlock(&BTRFS_I(inode)->log_mutex); + + btrfs_free_path(path); + btrfs_free_path(dst_path); + + mutex_lock(&root->fs_info->tree_log_mutex); + ret = update_log_root(trans, log); + BUG_ON(ret); + mutex_unlock(&root->fs_info->tree_log_mutex); +out: + return 0; +} + +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only) +{ + int ret; + + start_log_trans(trans, root); + ret = __btrfs_log_inode(trans, root, inode, inode_only); + end_log_trans(root); + return ret; +} + +/* + * helper function around btrfs_log_inode to make sure newly created + * parent directories also end up in the log. A minimal inode and backref + * only logging is done of any parent directories that are older than + * the last committed transaction + */ +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + int inode_only = LOG_INODE_ALL; + struct super_block *sb; + int ret; + + start_log_trans(trans, root); + sb = dentry->d_inode->i_sb; + while (1) { + ret = __btrfs_log_inode(trans, root, dentry->d_inode, + inode_only); + BUG_ON(ret); + inode_only = LOG_INODE_EXISTS; + + dentry = dentry->d_parent; + if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb) + break; + + if (BTRFS_I(dentry->d_inode)->generation <= + root->fs_info->last_trans_committed) + break; + } + end_log_trans(root); + return 0; +} + +/* + * it is not safe to log dentry if the chunk root has added new + * chunks. This returns 0 if the dentry was logged, and 1 otherwise. + * If this returns 1, you must commit the transaction to safely get your + * data on disk. + */ +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry) +{ + u64 gen; + gen = root->fs_info->last_trans_new_blockgroup; + if (gen > root->fs_info->last_trans_committed) + return 1; + else + return btrfs_log_dentry(trans, root, dentry); +} + +/* + * should be called during mount to recover any replay any log trees + * from the FS + */ +int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) +{ + int ret; + struct btrfs_path *path; + struct btrfs_trans_handle *trans; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_key tmp_key; + struct btrfs_root *log; + struct btrfs_fs_info *fs_info = log_root_tree->fs_info; + u64 highest_inode; + struct walk_control wc = { + .process_func = process_one_buffer, + .stage = 0, + }; + + fs_info->log_root_recovering = 1; + path = btrfs_alloc_path(); + BUG_ON(!path); + + trans = btrfs_start_transaction(fs_info->tree_root, 1); + + wc.trans = trans; + wc.pin = 1; + + walk_log_tree(trans, log_root_tree, &wc); + +again: + key.objectid = BTRFS_TREE_LOG_OBJECTID; + key.offset = (u64)-1; + btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + + while (1) { + ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); + if (ret < 0) + break; + if (ret > 0) { + if (path->slots[0] == 0) + break; + path->slots[0]--; + } + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + btrfs_release_path(log_root_tree, path); + if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID) + break; + + log = btrfs_read_fs_root_no_radix(log_root_tree, + &found_key); + BUG_ON(!log); + + + tmp_key.objectid = found_key.offset; + tmp_key.type = BTRFS_ROOT_ITEM_KEY; + tmp_key.offset = (u64)-1; + + wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key); + BUG_ON(!wc.replay_dest); + + wc.replay_dest->log_root = log; + 
btrfs_record_root_in_trans(wc.replay_dest); + ret = walk_log_tree(trans, log, &wc); + BUG_ON(ret); + + if (wc.stage == LOG_WALK_REPLAY_ALL) { + ret = fixup_inode_link_counts(trans, wc.replay_dest, + path); + BUG_ON(ret); + } + ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode); + if (ret == 0) { + wc.replay_dest->highest_inode = highest_inode; + wc.replay_dest->last_inode_alloc = highest_inode; + } + + key.offset = found_key.offset - 1; + wc.replay_dest->log_root = NULL; + free_extent_buffer(log->node); + kfree(log); + + if (found_key.offset == 0) + break; + } + btrfs_release_path(log_root_tree, path); + + /* step one is to pin it all, step two is to replay just inodes */ + if (wc.pin) { + wc.pin = 0; + wc.process_func = replay_one_buffer; + wc.stage = LOG_WALK_REPLAY_INODES; + goto again; + } + /* step three is to replay everything */ + if (wc.stage < LOG_WALK_REPLAY_ALL) { + wc.stage++; + goto again; + } + + btrfs_free_path(path); + + free_extent_buffer(log_root_tree->node); + log_root_tree->log_root = NULL; + fs_info->log_root_recovering = 0; + + /* step 4: commit the transaction, which also unpins the blocks */ + btrfs_commit_transaction(trans, fs_info->tree_root); + + kfree(log_root_tree); + return 0; +} diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h new file mode 100644 index 00000000000..b9409b32ed0 --- /dev/null +++ b/fs/btrfs/tree-log.h @@ -0,0 +1,41 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __TREE_LOG_ +#define __TREE_LOG_ + +int btrfs_sync_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root); +int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root); +int btrfs_log_dentry(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_recover_log_trees(struct btrfs_root *tree_root); +int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct dentry *dentry); +int btrfs_log_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, struct inode *inode, + int inode_only); +int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *dir, u64 index); +int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + const char *name, int name_len, + struct inode *inode, u64 dirid); +#endif diff --git a/fs/btrfs/version.h b/fs/btrfs/version.h new file mode 100644 index 00000000000..9bf3946d5ef --- /dev/null +++ b/fs/btrfs/version.h @@ -0,0 +1,4 @@ +#ifndef __BTRFS_VERSION_H +#define __BTRFS_VERSION_H +#define BTRFS_BUILD_VERSION "Btrfs" +#endif diff --git a/fs/btrfs/version.sh b/fs/btrfs/version.sh new file mode 100644 index 00000000000..1ca1952fd91 --- /dev/null +++ b/fs/btrfs/version.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# +# determine-version -- report a useful version for releases +# +# Copyright 2008, Aron Griffis <agriffis@n01se.net> +# Copyright 2008, Oracle +# Released under the GNU GPLv2 + +v="v0.16" + +which git &> /dev/null +if [ $? == 0 ]; then + git branch >& /dev/null + if [ $? == 0 ]; then + if head=`git rev-parse --verify HEAD 2>/dev/null`; then + if tag=`git describe --tags 2>/dev/null`; then + v="$tag" + fi + + # Are there uncommitted changes? + git update-index --refresh --unmerged > /dev/null + if git diff-index --name-only HEAD | \ + grep -v "^scripts/package" \ + | read dummy; then + v="$v"-dirty + fi + fi + fi +fi + +echo "#ifndef __BUILD_VERSION" > .build-version.h +echo "#define __BUILD_VERSION" >> .build-version.h +echo "#define BTRFS_BUILD_VERSION \"Btrfs $v\"" >> .build-version.h +echo "#endif" >> .build-version.h + +diff -q version.h .build-version.h >& /dev/null + +if [ $? == 0 ]; then + rm .build-version.h + exit 0 +fi + +mv .build-version.h version.h diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c new file mode 100644 index 00000000000..b187b537888 --- /dev/null +++ b/fs/btrfs/volumes.c @@ -0,0 +1,3218 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ +#include <linux/sched.h> +#include <linux/bio.h> +#include <linux/buffer_head.h> +#include <linux/blkdev.h> +#include <linux/random.h> +#include <linux/version.h> +#include <asm/div64.h> +#include "compat.h" +#include "ctree.h" +#include "extent_map.h" +#include "disk-io.h" +#include "transaction.h" +#include "print-tree.h" +#include "volumes.h" +#include "async-thread.h" + +struct map_lookup { + u64 type; + int io_align; + int io_width; + int stripe_len; + int sector_size; + int num_stripes; + int sub_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +static int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +static int btrfs_relocate_sys_chunks(struct btrfs_root *root); + +#define map_lookup_size(n) (sizeof(struct map_lookup) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +static DEFINE_MUTEX(uuid_mutex); +static LIST_HEAD(fs_uuids); + +void btrfs_lock_volumes(void) +{ + mutex_lock(&uuid_mutex); +} + +void btrfs_unlock_volumes(void) +{ + mutex_unlock(&uuid_mutex); +} + +static void lock_chunks(struct btrfs_root *root) +{ + mutex_lock(&root->fs_info->chunk_mutex); +} + +static void unlock_chunks(struct btrfs_root *root) +{ + mutex_unlock(&root->fs_info->chunk_mutex); +} + +static void free_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + WARN_ON(fs_devices->opened); + while (!list_empty(&fs_devices->devices)) { + device = list_entry(fs_devices->devices.next, + struct btrfs_device, dev_list); + list_del(&device->dev_list); + kfree(device->name); + kfree(device); + } + kfree(fs_devices); +} + +int btrfs_cleanup_fs_uuids(void) +{ + struct btrfs_fs_devices *fs_devices; + + while (!list_empty(&fs_uuids)) { + fs_devices = list_entry(fs_uuids.next, + struct btrfs_fs_devices, list); + list_del(&fs_devices->list); + free_fs_devices(fs_devices); + } + return 0; +} + +static noinline struct btrfs_device *__find_device(struct list_head *head, + u64 devid, u8 *uuid) +{ + struct btrfs_device *dev; + struct list_head *cur; + + list_for_each(cur, head) { + dev = list_entry(cur, struct btrfs_device, dev_list); + if (dev->devid == devid && + (!uuid || !memcmp(dev->uuid, uuid, BTRFS_UUID_SIZE))) { + return dev; + } + } + return NULL; +} + +static noinline struct btrfs_fs_devices *find_fsid(u8 *fsid) +{ + struct list_head *cur; + struct btrfs_fs_devices *fs_devices; + + list_for_each(cur, &fs_uuids) { + fs_devices = list_entry(cur, struct btrfs_fs_devices, list); + if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0) + return fs_devices; + } + return NULL; +} + +/* + * we try to collect pending bios for a device so we don't get a large + * number of procs sending bios down to the same device. This greatly + * improves the schedulers ability to collect and merge the bios. + * + * But, it also turns into a long list of bios to process and that is sure + * to eventually make the worker thread block. The solution here is to + * make some progress and then put this work struct back at the end of + * the list if the block device is congested. This way, multiple devices + * can make progress from a single worker thread. 
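+ *
+ * Roughly, for illustration: the worker takes the whole pending list
+ * under device->io_lock, submits the bios one at a time, and when the
+ * backing device reports congestion it splices whatever is left back
+ * onto the device and btrfs_requeue_work()s itself so another pass can
+ * finish the job.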
+ */ +static noinline int run_scheduled_bios(struct btrfs_device *device) +{ + struct bio *pending; + struct backing_dev_info *bdi; + struct btrfs_fs_info *fs_info; + struct bio *tail; + struct bio *cur; + int again = 0; + unsigned long num_run = 0; + unsigned long limit; + + bdi = device->bdev->bd_inode->i_mapping->backing_dev_info; + fs_info = device->dev_root->fs_info; + limit = btrfs_async_submit_limit(fs_info); + limit = limit * 2 / 3; + +loop: + spin_lock(&device->io_lock); + + /* take all the bios off the list at once and process them + * later on (without the lock held). But, remember the + * tail and other pointers so the bios can be properly reinserted + * into the list if we hit congestion + */ + pending = device->pending_bios; + tail = device->pending_bio_tail; + WARN_ON(pending && !tail); + device->pending_bios = NULL; + device->pending_bio_tail = NULL; + + /* + * if pending was null this time around, no bios need processing + * at all and we can stop. Otherwise it'll loop back up again + * and do an additional check so no bios are missed. + * + * device->running_pending is used to synchronize with the + * schedule_bio code. + */ + if (pending) { + again = 1; + device->running_pending = 1; + } else { + again = 0; + device->running_pending = 0; + } + spin_unlock(&device->io_lock); + + while (pending) { + cur = pending; + pending = pending->bi_next; + cur->bi_next = NULL; + atomic_dec(&fs_info->nr_async_bios); + + if (atomic_read(&fs_info->nr_async_bios) < limit && + waitqueue_active(&fs_info->async_submit_wait)) + wake_up(&fs_info->async_submit_wait); + + BUG_ON(atomic_read(&cur->bi_cnt) == 0); + bio_get(cur); + submit_bio(cur->bi_rw, cur); + bio_put(cur); + num_run++; + + /* + * we made progress, there is more work to do and the bdi + * is now congested. 
Back off and let other work structs + * run instead + */ + if (pending && bdi_write_congested(bdi) && + fs_info->fs_devices->open_devices > 1) { + struct bio *old_head; + + spin_lock(&device->io_lock); + + old_head = device->pending_bios; + device->pending_bios = pending; + if (device->pending_bio_tail) + tail->bi_next = old_head; + else + device->pending_bio_tail = tail; + + spin_unlock(&device->io_lock); + btrfs_requeue_work(&device->work); + goto done; + } + } + if (again) + goto loop; +done: + return 0; +} + +static void pending_bios_fn(struct btrfs_work *work) +{ + struct btrfs_device *device; + + device = container_of(work, struct btrfs_device, work); + run_scheduled_bios(device); +} + +static noinline int device_list_add(const char *path, + struct btrfs_super_block *disk_super, + u64 devid, struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices; + u64 found_transid = btrfs_super_generation(disk_super); + + fs_devices = find_fsid(disk_super->fsid); + if (!fs_devices) { + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return -ENOMEM; + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + list_add(&fs_devices->list, &fs_uuids); + memcpy(fs_devices->fsid, disk_super->fsid, BTRFS_FSID_SIZE); + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + device = NULL; + } else { + device = __find_device(&fs_devices->devices, devid, + disk_super->dev_item.uuid); + } + if (!device) { + if (fs_devices->opened) + return -EBUSY; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + return -ENOMEM; + } + device->devid = devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE); + device->barriers = 1; + spin_lock_init(&device->io_lock); + device->name = kstrdup(path, GFP_NOFS); + if (!device->name) { + kfree(device); + return -ENOMEM; + } + INIT_LIST_HEAD(&device->dev_alloc_list); + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + fs_devices->num_devices++; + } + + if (found_transid > fs_devices->latest_trans) { + fs_devices->latest_devid = devid; + fs_devices->latest_trans = found_transid; + } + *fs_devices_ret = fs_devices; + return 0; +} + +static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) +{ + struct btrfs_fs_devices *fs_devices; + struct btrfs_device *device; + struct btrfs_device *orig_dev; + + fs_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!fs_devices) + return ERR_PTR(-ENOMEM); + + INIT_LIST_HEAD(&fs_devices->devices); + INIT_LIST_HEAD(&fs_devices->alloc_list); + INIT_LIST_HEAD(&fs_devices->list); + fs_devices->latest_devid = orig->latest_devid; + fs_devices->latest_trans = orig->latest_trans; + memcpy(fs_devices->fsid, orig->fsid, sizeof(fs_devices->fsid)); + + list_for_each_entry(orig_dev, &orig->devices, dev_list) { + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + goto error; + + device->name = kstrdup(orig_dev->name, GFP_NOFS); + if (!device->name) + goto error; + + device->devid = orig_dev->devid; + device->work.func = pending_bios_fn; + memcpy(device->uuid, orig_dev->uuid, sizeof(device->uuid)); + device->barriers = 1; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_list); + INIT_LIST_HEAD(&device->dev_alloc_list); + + list_add(&device->dev_list, &fs_devices->devices); + device->fs_devices = fs_devices; + 
fs_devices->num_devices++; + } + return fs_devices; +error: + free_fs_devices(fs_devices); + return ERR_PTR(-ENOMEM); +} + +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *tmp; + struct list_head *cur; + struct btrfs_device *device; + + mutex_lock(&uuid_mutex); +again: + list_for_each_safe(cur, tmp, &fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->in_fs_metadata) + continue; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + device->writeable = 0; + fs_devices->rw_devices--; + } + list_del_init(&device->dev_list); + fs_devices->num_devices--; + kfree(device->name); + kfree(device); + } + + if (fs_devices->seed) { + fs_devices = fs_devices->seed; + goto again; + } + + mutex_unlock(&uuid_mutex); + return 0; +} + +static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct list_head *cur; + struct btrfs_device *device; + + if (--fs_devices->opened > 0) + return 0; + + list_for_each(cur, &fs_devices->devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + fs_devices->open_devices--; + } + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + fs_devices->rw_devices--; + } + + device->bdev = NULL; + device->writeable = 0; + device->in_fs_metadata = 0; + } + WARN_ON(fs_devices->open_devices); + WARN_ON(fs_devices->rw_devices); + fs_devices->opened = 0; + fs_devices->seeding = 0; + + return 0; +} + +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_fs_devices *seed_devices = NULL; + int ret; + + mutex_lock(&uuid_mutex); + ret = __btrfs_close_devices(fs_devices); + if (!fs_devices->opened) { + seed_devices = fs_devices->seed; + fs_devices->seed = NULL; + } + mutex_unlock(&uuid_mutex); + + while (seed_devices) { + fs_devices = seed_devices; + seed_devices = fs_devices->seed; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } + return ret; +} + +static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + struct block_device *bdev; + struct list_head *head = &fs_devices->devices; + struct list_head *cur; + struct btrfs_device *device; + struct block_device *latest_bdev = NULL; + struct buffer_head *bh; + struct btrfs_super_block *disk_super; + u64 latest_devid = 0; + u64 latest_transid = 0; + u64 devid; + int seeding = 1; + int ret = 0; + + list_for_each(cur, head) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev) + continue; + if (!device->name) + continue; + + bdev = open_bdev_exclusive(device->name, flags, holder); + if (IS_ERR(bdev)) { + printk(KERN_INFO "open %s failed\n", device->name); + goto error; + } + set_blocksize(bdev, 4096); + + bh = btrfs_read_dev_super(bdev); + if (!bh) + goto error_close; + + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + if (devid != device->devid) + goto error_brelse; + + if (memcmp(device->uuid, disk_super->dev_item.uuid, + BTRFS_UUID_SIZE)) + goto error_brelse; + + device->generation = btrfs_super_generation(disk_super); + if (!latest_transid || device->generation > latest_transid) { + latest_devid = devid; + latest_transid = device->generation; + latest_bdev = bdev; + } + + if (btrfs_super_flags(disk_super) & 
BTRFS_SUPER_FLAG_SEEDING) { + device->writeable = 0; + } else { + device->writeable = !bdev_read_only(bdev); + seeding = 0; + } + + device->bdev = bdev; + device->in_fs_metadata = 0; + device->mode = flags; + + fs_devices->open_devices++; + if (device->writeable) { + fs_devices->rw_devices++; + list_add(&device->dev_alloc_list, + &fs_devices->alloc_list); + } + continue; + +error_brelse: + brelse(bh); +error_close: + close_bdev_exclusive(bdev, FMODE_READ); +error: + continue; + } + if (fs_devices->open_devices == 0) { + ret = -EIO; + goto out; + } + fs_devices->seeding = seeding; + fs_devices->opened = 1; + fs_devices->latest_bdev = latest_bdev; + fs_devices->latest_devid = latest_devid; + fs_devices->latest_trans = latest_transid; + fs_devices->total_rw_bytes = 0; +out: + return ret; +} + +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder) +{ + int ret; + + mutex_lock(&uuid_mutex); + if (fs_devices->opened) { + fs_devices->opened++; + ret = 0; + } else { + ret = __btrfs_open_devices(fs_devices, flags, holder); + } + mutex_unlock(&uuid_mutex); + return ret; +} + +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret) +{ + struct btrfs_super_block *disk_super; + struct block_device *bdev; + struct buffer_head *bh; + int ret; + u64 devid; + u64 transid; + + mutex_lock(&uuid_mutex); + + bdev = open_bdev_exclusive(path, flags, holder); + + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto error; + } + + ret = set_blocksize(bdev, 4096); + if (ret) + goto error_close; + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + transid = btrfs_super_generation(disk_super); + if (disk_super->label[0]) + printk(KERN_INFO "device label %s ", disk_super->label); + else { + /* FIXME, make a readl uuid parser */ + printk(KERN_INFO "device fsid %llx-%llx ", + *(unsigned long long *)disk_super->fsid, + *(unsigned long long *)(disk_super->fsid + 8)); + } + printk(KERN_INFO "devid %llu transid %llu %s\n", + (unsigned long long)devid, (unsigned long long)transid, path); + ret = device_list_add(path, disk_super, devid, fs_devices_ret); + + brelse(bh); +error_close: + close_bdev_exclusive(bdev, flags); +error: + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * this uses a pretty simple search, the expectation is that it is + * called very infrequently and that a given device has a small number + * of extents + */ +static noinline int find_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 num_bytes, u64 *start) +{ + struct btrfs_key key; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 hole_size = 0; + u64 last_byte = 0; + u64 search_start = 0; + u64 search_end = device->total_bytes; + int ret; + int slot = 0; + int start_found; + struct extent_buffer *l; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + start_found = 0; + + /* FIXME use last free of some kind */ + + /* we don't want to overwrite the superblock on the drive, + * so we make sure to start at an offset of at least 1MB + */ + search_start = max((u64)1024 * 1024, search_start); + + if (root->fs_info->alloc_start + num_bytes <= device->total_bytes) + search_start = max(root->fs_info->alloc_start, search_start); + + key.objectid = device->devid; + key.offset = 
search_start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_search_slot(trans, root, &key, path, 0, 0); + if (ret < 0) + goto error; + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto error; + l = path->nodes[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + while (1) { + l = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(l)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; +no_more_items: + if (!start_found) { + if (search_start >= search_end) { + ret = -ENOSPC; + goto error; + } + *start = search_start; + start_found = 1; + goto check_pending; + } + *start = last_byte > search_start ? + last_byte : search_start; + if (search_end <= *start) { + ret = -ENOSPC; + goto error; + } + goto check_pending; + } + btrfs_item_key_to_cpu(l, &key, slot); + + if (key.objectid < device->devid) + goto next; + + if (key.objectid > device->devid) + goto no_more_items; + + if (key.offset >= search_start && key.offset > last_byte && + start_found) { + if (last_byte < search_start) + last_byte = search_start; + hole_size = key.offset - last_byte; + if (key.offset > last_byte && + hole_size >= num_bytes) { + *start = last_byte; + goto check_pending; + } + } + if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + goto next; + + start_found = 1; + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent); +next: + path->slots[0]++; + cond_resched(); + } +check_pending: + /* we have to make sure we didn't find an extent that has already + * been allocated by the map tree or the original allocation + */ + BUG_ON(*start < search_start); + + if (*start + num_bytes > search_end) { + ret = -ENOSPC; + goto error; + } + /* check for pending inserts here */ + ret = 0; + +error: + btrfs_free_path(path); + return ret; +} + +static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 start) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_key key; + struct btrfs_key found_key; + struct extent_buffer *leaf = NULL; + struct btrfs_dev_extent *extent = NULL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret > 0) { + ret = btrfs_previous_item(root, path, key.objectid, + BTRFS_DEV_EXTENT_KEY); + BUG_ON(ret); + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + BUG_ON(found_key.offset > start || found_key.offset + + btrfs_dev_extent_length(leaf, extent) < start); + ret = 0; + } else if (ret == 0) { + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + } + BUG_ON(ret); + + if (device->bytes_used > 0) + device->bytes_used -= btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return ret; +} + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *extent; + struct extent_buffer *leaf; + struct btrfs_key key; + + 
WARN_ON(!device->in_fs_metadata); + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = device->devid; + key.offset = start; + key.type = BTRFS_DEV_EXTENT_KEY; + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*extent)); + BUG_ON(ret); + + leaf = path->nodes[0]; + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_extent); + btrfs_set_dev_extent_chunk_tree(leaf, extent, chunk_tree); + btrfs_set_dev_extent_chunk_objectid(leaf, extent, chunk_objectid); + btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); + + write_extent_buffer(leaf, root->fs_info->chunk_tree_uuid, + (unsigned long)btrfs_dev_extent_chunk_tree_uuid(extent), + BTRFS_UUID_SIZE); + + btrfs_set_dev_extent_length(leaf, extent, num_bytes); + btrfs_mark_buffer_dirty(leaf); + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_chunk(struct btrfs_root *root, + u64 objectid, u64 *offset) +{ + struct btrfs_path *path; + int ret; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_key found_key; + + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = objectid; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY); + if (ret) { + *offset = 0; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != objectid) + *offset = 0; + else { + chunk = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_chunk); + *offset = found_key.offset + + btrfs_chunk_length(path->nodes[0], chunk); + } + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static noinline int find_next_devid(struct btrfs_root *root, u64 *objectid) +{ + int ret; + struct btrfs_key key; + struct btrfs_key found_key; + struct btrfs_path *path; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = (u64)-1; + + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto error; + + BUG_ON(ret == 0); + + ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID, + BTRFS_DEV_ITEM_KEY); + if (ret) { + *objectid = 1; + } else { + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + *objectid = found_key.offset + 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +/* + * the device information is stored in the chunk root + * the btrfs_device struct should be fully filled in + */ +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + unsigned long ptr; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_insert_empty_item(trans, root, path, &key, + sizeof(*dev_item)); + if (ret) + goto out; + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_generation(leaf, dev_item, 0); + btrfs_set_device_type(leaf, dev_item, device->type); + 
btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_group(leaf, dev_item, 0); + btrfs_set_device_seek_speed(leaf, dev_item, 0); + btrfs_set_device_bandwidth(leaf, dev_item, 0); + btrfs_set_device_start_offset(leaf, dev_item, 0); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + ptr = (unsigned long)btrfs_device_fsid(dev_item); + write_extent_buffer(leaf, root->fs_info->fsid, ptr, BTRFS_UUID_SIZE); + btrfs_mark_buffer_dirty(leaf); + + ret = 0; +out: + btrfs_free_path(path); + return ret; +} + +static int btrfs_rm_dev_item(struct btrfs_root *root, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_trans_handle *trans; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + lock_chunks(root); + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + ret = btrfs_del_item(trans, root, path); + if (ret) + goto out; +out: + btrfs_free_path(path); + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + return ret; +} + +int btrfs_rm_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_device *device; + struct btrfs_device *next_device; + struct block_device *bdev; + struct buffer_head *bh = NULL; + struct btrfs_super_block *disk_super; + u64 all_avail; + u64 devid; + u64 num_devices; + u8 *dev_uuid; + int ret = 0; + + mutex_lock(&uuid_mutex); + mutex_lock(&root->fs_info->volume_mutex); + + all_avail = root->fs_info->avail_data_alloc_bits | + root->fs_info->avail_system_alloc_bits | + root->fs_info->avail_metadata_alloc_bits; + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID10) && + root->fs_info->fs_devices->rw_devices <= 4) { + printk(KERN_ERR "btrfs: unable to go below four devices " + "on raid10\n"); + ret = -EINVAL; + goto out; + } + + if ((all_avail & BTRFS_BLOCK_GROUP_RAID1) && + root->fs_info->fs_devices->rw_devices <= 2) { + printk(KERN_ERR "btrfs: unable to go below two " + "devices on raid1\n"); + ret = -EINVAL; + goto out; + } + + if (strcmp(device_path, "missing") == 0) { + struct list_head *cur; + struct list_head *devices; + struct btrfs_device *tmp; + + device = NULL; + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + tmp = list_entry(cur, struct btrfs_device, dev_list); + if (tmp->in_fs_metadata && !tmp->bdev) { + device = tmp; + break; + } + } + bdev = NULL; + bh = NULL; + disk_super = NULL; + if (!device) { + printk(KERN_ERR "btrfs: no missing devices found to " + "remove\n"); + goto out; + } + } else { + bdev = open_bdev_exclusive(device_path, FMODE_READ, + root->fs_info->bdev_holder); + if (IS_ERR(bdev)) { + ret = PTR_ERR(bdev); + goto out; + } + + set_blocksize(bdev, 4096); + bh = btrfs_read_dev_super(bdev); + if (!bh) { + ret = -EIO; + goto error_close; + } + disk_super = (struct btrfs_super_block *)bh->b_data; + devid = le64_to_cpu(disk_super->dev_item.devid); + dev_uuid = disk_super->dev_item.uuid; + device = 
btrfs_find_device(root, devid, dev_uuid, + disk_super->fsid); + if (!device) { + ret = -ENOENT; + goto error_brelse; + } + } + + if (device->writeable && root->fs_info->fs_devices->rw_devices == 1) { + printk(KERN_ERR "btrfs: unable to remove the only writeable " + "device\n"); + ret = -EINVAL; + goto error_brelse; + } + + if (device->writeable) { + list_del_init(&device->dev_alloc_list); + root->fs_info->fs_devices->rw_devices--; + } + + ret = btrfs_shrink_device(device, 0); + if (ret) + goto error_brelse; + + ret = btrfs_rm_dev_item(root->fs_info->chunk_root, device); + if (ret) + goto error_brelse; + + device->in_fs_metadata = 0; + list_del_init(&device->dev_list); + device->fs_devices->num_devices--; + + next_device = list_entry(root->fs_info->fs_devices->devices.next, + struct btrfs_device, dev_list); + if (device->bdev == root->fs_info->sb->s_bdev) + root->fs_info->sb->s_bdev = next_device->bdev; + if (device->bdev == root->fs_info->fs_devices->latest_bdev) + root->fs_info->fs_devices->latest_bdev = next_device->bdev; + + if (device->bdev) { + close_bdev_exclusive(device->bdev, device->mode); + device->bdev = NULL; + device->fs_devices->open_devices--; + } + + num_devices = btrfs_super_num_devices(&root->fs_info->super_copy) - 1; + btrfs_set_super_num_devices(&root->fs_info->super_copy, num_devices); + + if (device->fs_devices->open_devices == 0) { + struct btrfs_fs_devices *fs_devices; + fs_devices = root->fs_info->fs_devices; + while (fs_devices) { + if (fs_devices->seed == device->fs_devices) + break; + fs_devices = fs_devices->seed; + } + fs_devices->seed = device->fs_devices->seed; + device->fs_devices->seed = NULL; + __btrfs_close_devices(device->fs_devices); + free_fs_devices(device->fs_devices); + } + + /* + * at this point, the device is zero sized. We want to + * remove it from the devices list and zero out the old super + */ + if (device->writeable) { + /* make sure this device isn't detected as part of + * the FS anymore + */ + memset(&disk_super->magic, 0, sizeof(disk_super->magic)); + set_buffer_dirty(bh); + sync_dirty_buffer(bh); + } + + kfree(device->name); + kfree(device); + ret = 0; + +error_brelse: + brelse(bh); +error_close: + if (bdev) + close_bdev_exclusive(bdev, FMODE_READ); +out: + mutex_unlock(&root->fs_info->volume_mutex); + mutex_unlock(&uuid_mutex); + return ret; +} + +/* + * does all the dirty work required for changing file system's UUID. 
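+ * In the code below that means: clone the current device list so the
+ * original (seed) fsid keeps a record of its devices in fs_uuids, move
+ * the live devices onto a new fs_devices structure that becomes the
+ * ->seed list, generate a fresh fsid for the sprouted filesystem, and
+ * clear BTRFS_SUPER_FLAG_SEEDING in the superblock flags.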
+ */ +static int btrfs_prepare_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + struct btrfs_fs_devices *old_devices; + struct btrfs_fs_devices *seed_devices; + struct btrfs_super_block *disk_super = &root->fs_info->super_copy; + struct btrfs_device *device; + u64 super_flags; + + BUG_ON(!mutex_is_locked(&uuid_mutex)); + if (!fs_devices->seeding) + return -EINVAL; + + seed_devices = kzalloc(sizeof(*fs_devices), GFP_NOFS); + if (!seed_devices) + return -ENOMEM; + + old_devices = clone_fs_devices(fs_devices); + if (IS_ERR(old_devices)) { + kfree(seed_devices); + return PTR_ERR(old_devices); + } + + list_add(&old_devices->list, &fs_uuids); + + memcpy(seed_devices, fs_devices, sizeof(*seed_devices)); + seed_devices->opened = 1; + INIT_LIST_HEAD(&seed_devices->devices); + INIT_LIST_HEAD(&seed_devices->alloc_list); + list_splice_init(&fs_devices->devices, &seed_devices->devices); + list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); + list_for_each_entry(device, &seed_devices->devices, dev_list) { + device->fs_devices = seed_devices; + } + + fs_devices->seeding = 0; + fs_devices->num_devices = 0; + fs_devices->open_devices = 0; + fs_devices->seed = seed_devices; + + generate_random_uuid(fs_devices->fsid); + memcpy(root->fs_info->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE); + super_flags = btrfs_super_flags(disk_super) & + ~BTRFS_SUPER_FLAG_SEEDING; + btrfs_set_super_flags(disk_super, super_flags); + + return 0; +} + +/* + * strore the expected generation for seed devices in device items. + */ +static int btrfs_finish_sprout(struct btrfs_trans_handle *trans, + struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_dev_item *dev_item; + struct btrfs_device *device; + struct btrfs_key key; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + u64 devid; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + root = root->fs_info->chunk_root; + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = BTRFS_DEV_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto error; + + leaf = path->nodes[0]; +next_slot: + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret > 0) + break; + if (ret < 0) + goto error; + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + btrfs_release_path(root, path); + continue; + } + + btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); + if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID || + key.type != BTRFS_DEV_ITEM_KEY) + break; + + dev_item = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_dev_item); + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + BUG_ON(!device); + + if (device->fs_devices->seeding) { + btrfs_set_device_generation(leaf, dev_item, + device->generation); + btrfs_mark_buffer_dirty(leaf); + } + + path->slots[0]++; + goto next_slot; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +int btrfs_init_new_device(struct btrfs_root *root, char *device_path) +{ + struct btrfs_trans_handle *trans; + struct 
btrfs_device *device; + struct block_device *bdev; + struct list_head *cur; + struct list_head *devices; + struct super_block *sb = root->fs_info->sb; + u64 total_bytes; + int seeding_dev = 0; + int ret = 0; + + if ((sb->s_flags & MS_RDONLY) && !root->fs_info->fs_devices->seeding) + return -EINVAL; + + bdev = open_bdev_exclusive(device_path, 0, root->fs_info->bdev_holder); + if (!bdev) + return -EIO; + + if (root->fs_info->fs_devices->seeding) { + seeding_dev = 1; + down_write(&sb->s_umount); + mutex_lock(&uuid_mutex); + } + + filemap_write_and_wait(bdev->bd_inode->i_mapping); + mutex_lock(&root->fs_info->volume_mutex); + + devices = &root->fs_info->fs_devices->devices; + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + if (device->bdev == bdev) { + ret = -EEXIST; + goto error; + } + } + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) { + /* we can safely leave the fs_devices entry around */ + ret = -ENOMEM; + goto error; + } + + device->name = kstrdup(device_path, GFP_NOFS); + if (!device->name) { + kfree(device); + ret = -ENOMEM; + goto error; + } + + ret = find_next_devid(root, &device->devid); + if (ret) { + kfree(device); + goto error; + } + + trans = btrfs_start_transaction(root, 1); + lock_chunks(root); + + device->barriers = 1; + device->writeable = 1; + device->work.func = pending_bios_fn; + generate_random_uuid(device->uuid); + spin_lock_init(&device->io_lock); + device->generation = trans->transid; + device->io_width = root->sectorsize; + device->io_align = root->sectorsize; + device->sector_size = root->sectorsize; + device->total_bytes = i_size_read(bdev->bd_inode); + device->dev_root = root->fs_info->dev_root; + device->bdev = bdev; + device->in_fs_metadata = 1; + device->mode = 0; + set_blocksize(device->bdev, 4096); + + if (seeding_dev) { + sb->s_flags &= ~MS_RDONLY; + ret = btrfs_prepare_sprout(trans, root); + BUG_ON(ret); + } + + device->fs_devices = root->fs_info->fs_devices; + list_add(&device->dev_list, &root->fs_info->fs_devices->devices); + list_add(&device->dev_alloc_list, + &root->fs_info->fs_devices->alloc_list); + root->fs_info->fs_devices->num_devices++; + root->fs_info->fs_devices->open_devices++; + root->fs_info->fs_devices->rw_devices++; + root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; + + total_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy); + btrfs_set_super_total_bytes(&root->fs_info->super_copy, + total_bytes + device->total_bytes); + + total_bytes = btrfs_super_num_devices(&root->fs_info->super_copy); + btrfs_set_super_num_devices(&root->fs_info->super_copy, + total_bytes + 1); + + if (seeding_dev) { + ret = init_first_rw_device(trans, root, device); + BUG_ON(ret); + ret = btrfs_finish_sprout(trans, root); + BUG_ON(ret); + } else { + ret = btrfs_add_device(trans, root, device); + } + + unlock_chunks(root); + btrfs_commit_transaction(trans, root); + + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + + ret = btrfs_relocate_sys_chunks(root); + BUG_ON(ret); + } +out: + mutex_unlock(&root->fs_info->volume_mutex); + return ret; +error: + close_bdev_exclusive(bdev, 0); + if (seeding_dev) { + mutex_unlock(&uuid_mutex); + up_write(&sb->s_umount); + } + goto out; +} + +static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device) +{ + int ret; + struct btrfs_path *path; + struct btrfs_root *root; + struct btrfs_dev_item *dev_item; + struct extent_buffer *leaf; + struct btrfs_key key; + + root = 
device->dev_root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.type = BTRFS_DEV_ITEM_KEY; + key.offset = device->devid; + + ret = btrfs_search_slot(trans, root, &key, path, 0, 1); + if (ret < 0) + goto out; + + if (ret > 0) { + ret = -ENOENT; + goto out; + } + + leaf = path->nodes[0]; + dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item); + + btrfs_set_device_id(leaf, dev_item, device->devid); + btrfs_set_device_type(leaf, dev_item, device->type); + btrfs_set_device_io_align(leaf, dev_item, device->io_align); + btrfs_set_device_io_width(leaf, dev_item, device->io_width); + btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); + btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes); + btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_mark_buffer_dirty(leaf); + +out: + btrfs_free_path(path); + return ret; +} + +static int __btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + struct btrfs_super_block *super_copy = + &device->dev_root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = new_size - device->total_bytes; + + if (!device->writeable) + return -EACCES; + if (new_size <= device->total_bytes) + return -EINVAL; + + btrfs_set_super_total_bytes(super_copy, old_total + diff); + device->fs_devices->total_rw_bytes += diff; + + device->total_bytes = new_size; + return btrfs_update_device(trans, device); +} + +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size) +{ + int ret; + lock_chunks(device->dev_root); + ret = __btrfs_grow_device(trans, device, new_size); + unlock_chunks(device->dev_root); + return ret; +} + +static int btrfs_free_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + int ret; + struct btrfs_path *path; + struct btrfs_key key; + + root = root->fs_info->chunk_root; + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = chunk_objectid; + key.offset = chunk_offset; + key.type = BTRFS_CHUNK_ITEM_KEY; + + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + BUG_ON(ret); + + ret = btrfs_del_item(trans, root, path); + BUG_ON(ret); + + btrfs_free_path(path); + return 0; +} + +static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 + chunk_offset) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)(ptr + len); + num_stripes = btrfs_stack_chunk_num_stripes(chunk); + len += btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + if (key.objectid == chunk_objectid && + key.offset == chunk_offset) { + memmove(ptr, ptr + len, array_size - (cur + len)); + array_size -= len; + btrfs_set_super_sys_array_size(super_copy, array_size); + } else { + ptr += len; + cur += len; + } + } + return ret; +} + +static int btrfs_relocate_chunk(struct 
btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct extent_map_tree *em_tree; + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + struct extent_map *em; + struct map_lookup *map; + int ret; + int i; + + printk(KERN_INFO "btrfs relocating chunk %llu\n", + (unsigned long long)chunk_offset); + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + em_tree = &root->fs_info->mapping_tree.map_tree; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + BUG_ON(ret); + + trans = btrfs_start_transaction(root, 1); + BUG_ON(!trans); + + lock_chunks(root); + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_offset, 1); + spin_unlock(&em_tree->lock); + + BUG_ON(em->start > chunk_offset || + em->start + em->len < chunk_offset); + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, + map->stripes[i].physical); + BUG_ON(ret); + + if (map->stripes[i].dev) { + ret = btrfs_update_device(trans, map->stripes[i].dev); + BUG_ON(ret); + } + } + ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, + chunk_offset); + + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); + BUG_ON(ret); + } + + ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); + BUG_ON(ret); + + spin_lock(&em_tree->lock); + remove_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + + kfree(map); + em->bdev = NULL; + + /* once for the tree */ + free_extent_map(em); + /* once for us */ + free_extent_map(em); + + unlock_chunks(root); + btrfs_end_transaction(trans, root); + return 0; +} + +static int btrfs_relocate_sys_chunks(struct btrfs_root *root) +{ + struct btrfs_root *chunk_root = root->fs_info->chunk_root; + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_chunk *chunk; + struct btrfs_key key; + struct btrfs_key found_key; + u64 chunk_tree = chunk_root->root_key.objectid; + u64 chunk_type; + int ret; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + BUG_ON(ret == 0); + + ret = btrfs_previous_item(chunk_root, path, key.objectid, + key.type); + if (ret < 0) + goto error; + if (ret > 0) + break; + + leaf = path->nodes[0]; + btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); + + chunk = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_chunk); + chunk_type = btrfs_chunk_type(leaf, chunk); + btrfs_release_path(chunk_root, path); + + if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_relocate_chunk(chunk_root, chunk_tree, + found_key.objectid, + found_key.offset); + BUG_ON(ret); + } + + if (found_key.offset == 0) + break; + key.offset = found_key.offset - 1; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} + +static u64 div_factor(u64 num, int factor) +{ + if (factor == 10) + return num; + num *= factor; + do_div(num, 10); + return num; +} + +int btrfs_balance(struct btrfs_root *dev_root) +{ + int ret; + struct list_head *cur; + struct list_head *devices = &dev_root->fs_info->fs_devices->devices; + struct btrfs_device 
*device; + u64 old_size; + u64 size_to_free; + struct btrfs_path *path; + struct btrfs_key key; + struct btrfs_chunk *chunk; + struct btrfs_root *chunk_root = dev_root->fs_info->chunk_root; + struct btrfs_trans_handle *trans; + struct btrfs_key found_key; + + if (dev_root->fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + mutex_lock(&dev_root->fs_info->volume_mutex); + dev_root = dev_root->fs_info->dev_root; + + /* step one make some room on all the devices */ + list_for_each(cur, devices) { + device = list_entry(cur, struct btrfs_device, dev_list); + old_size = device->total_bytes; + size_to_free = div_factor(old_size, 1); + size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); + if (!device->writeable || + device->total_bytes - device->bytes_used > size_to_free) + continue; + + ret = btrfs_shrink_device(device, old_size - size_to_free); + BUG_ON(ret); + + trans = btrfs_start_transaction(dev_root, 1); + BUG_ON(!trans); + + ret = btrfs_grow_device(trans, device, old_size); + BUG_ON(ret); + + btrfs_end_transaction(trans, dev_root); + } + + /* step two, relocate all the chunks */ + path = btrfs_alloc_path(); + BUG_ON(!path); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.offset = (u64)-1; + key.type = BTRFS_CHUNK_ITEM_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); + if (ret < 0) + goto error; + + /* + * this shouldn't happen, it means the last relocate + * failed + */ + if (ret == 0) + break; + + ret = btrfs_previous_item(chunk_root, path, 0, + BTRFS_CHUNK_ITEM_KEY); + if (ret) + break; + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, + path->slots[0]); + if (found_key.objectid != key.objectid) + break; + + chunk = btrfs_item_ptr(path->nodes[0], + path->slots[0], + struct btrfs_chunk); + key.offset = found_key.offset; + /* chunk zero is special */ + if (key.offset == 0) + break; + + btrfs_release_path(chunk_root, path); + ret = btrfs_relocate_chunk(chunk_root, + chunk_root->root_key.objectid, + found_key.objectid, + found_key.offset); + BUG_ON(ret); + } + ret = 0; +error: + btrfs_free_path(path); + mutex_unlock(&dev_root->fs_info->volume_mutex); + return ret; +} + +/* + * shrinking a device means finding all of the device extents past + * the new size, and then following the back refs to the chunks. 
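+ * The device size is reduced (and total_rw_bytes / the super block's
+ * total_bytes adjusted) up front; each device extent found past the
+ * new size is then handed to btrfs_relocate_chunk() via the chunk it
+ * belongs to.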
+ * The chunk relocation code actually frees the device extent + */ +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) +{ + struct btrfs_trans_handle *trans; + struct btrfs_root *root = device->dev_root; + struct btrfs_dev_extent *dev_extent = NULL; + struct btrfs_path *path; + u64 length; + u64 chunk_tree; + u64 chunk_objectid; + u64 chunk_offset; + int ret; + int slot; + struct extent_buffer *l; + struct btrfs_key key; + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + u64 old_total = btrfs_super_total_bytes(super_copy); + u64 diff = device->total_bytes - new_size; + + if (new_size >= device->total_bytes) + return -EINVAL; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + if (!trans) { + ret = -ENOMEM; + goto done; + } + + path->reada = 2; + + lock_chunks(root); + + device->total_bytes = new_size; + if (device->writeable) + device->fs_devices->total_rw_bytes -= diff; + ret = btrfs_update_device(trans, device); + if (ret) { + unlock_chunks(root); + btrfs_end_transaction(trans, root); + goto done; + } + WARN_ON(diff > old_total); + btrfs_set_super_total_bytes(super_copy, old_total - diff); + unlock_chunks(root); + btrfs_end_transaction(trans, root); + + key.objectid = device->devid; + key.offset = (u64)-1; + key.type = BTRFS_DEV_EXTENT_KEY; + + while (1) { + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto done; + + ret = btrfs_previous_item(root, path, 0, key.type); + if (ret < 0) + goto done; + if (ret) { + ret = 0; + goto done; + } + + l = path->nodes[0]; + slot = path->slots[0]; + btrfs_item_key_to_cpu(l, &key, path->slots[0]); + + if (key.objectid != device->devid) + goto done; + + dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); + length = btrfs_dev_extent_length(l, dev_extent); + + if (key.offset + length <= new_size) + goto done; + + chunk_tree = btrfs_dev_extent_chunk_tree(l, dev_extent); + chunk_objectid = btrfs_dev_extent_chunk_objectid(l, dev_extent); + chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent); + btrfs_release_path(root, path); + + ret = btrfs_relocate_chunk(root, chunk_tree, chunk_objectid, + chunk_offset); + if (ret) + goto done; + } + +done: + btrfs_free_path(path); + return ret; +} + +static int btrfs_add_system_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_key *key, + struct btrfs_chunk *chunk, int item_size) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct btrfs_disk_key disk_key; + u32 array_size; + u8 *ptr; + + array_size = btrfs_super_sys_array_size(super_copy); + if (array_size + item_size > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + return -EFBIG; + + ptr = super_copy->sys_chunk_array + array_size; + btrfs_cpu_key_to_disk(&disk_key, key); + memcpy(ptr, &disk_key, sizeof(disk_key)); + ptr += sizeof(disk_key); + memcpy(ptr, chunk, item_size); + item_size += sizeof(disk_key); + btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + return 0; +} + +static noinline u64 chunk_bytes_by_type(u64 type, u64 calc_size, + int num_stripes, int sub_stripes) +{ + if (type & (BTRFS_BLOCK_GROUP_RAID1 | BTRFS_BLOCK_GROUP_DUP)) + return calc_size; + else if (type & BTRFS_BLOCK_GROUP_RAID10) + return calc_size * (num_stripes / sub_stripes); + else + return calc_size * num_stripes; +} + +static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup **map_ret, + u64 *num_bytes, u64 *stripe_size, + u64 
start, u64 type) +{ + struct btrfs_fs_info *info = extent_root->fs_info; + struct btrfs_device *device = NULL; + struct btrfs_fs_devices *fs_devices = info->fs_devices; + struct list_head *cur; + struct map_lookup *map = NULL; + struct extent_map_tree *em_tree; + struct extent_map *em; + struct list_head private_devs; + int min_stripe_size = 1 * 1024 * 1024; + u64 calc_size = 1024 * 1024 * 1024; + u64 max_chunk_size = calc_size; + u64 min_free; + u64 avail; + u64 max_avail = 0; + u64 dev_offset; + int num_stripes = 1; + int min_stripes = 1; + int sub_stripes = 0; + int looped = 0; + int ret; + int index; + int stripe_len = 64 * 1024; + + if ((type & BTRFS_BLOCK_GROUP_RAID1) && + (type & BTRFS_BLOCK_GROUP_DUP)) { + WARN_ON(1); + type &= ~BTRFS_BLOCK_GROUP_DUP; + } + if (list_empty(&fs_devices->alloc_list)) + return -ENOSPC; + + if (type & (BTRFS_BLOCK_GROUP_RAID0)) { + num_stripes = fs_devices->rw_devices; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_DUP)) { + num_stripes = 2; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID1)) { + num_stripes = min_t(u64, 2, fs_devices->rw_devices); + if (num_stripes < 2) + return -ENOSPC; + min_stripes = 2; + } + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes = fs_devices->rw_devices; + if (num_stripes < 4) + return -ENOSPC; + num_stripes &= ~(u32)1; + sub_stripes = 2; + min_stripes = 4; + } + + if (type & BTRFS_BLOCK_GROUP_DATA) { + max_chunk_size = 10 * calc_size; + min_stripe_size = 64 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_METADATA) { + max_chunk_size = 4 * calc_size; + min_stripe_size = 32 * 1024 * 1024; + } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) { + calc_size = 8 * 1024 * 1024; + max_chunk_size = calc_size * 2; + min_stripe_size = 1 * 1024 * 1024; + } + + /* we don't want a chunk larger than 10% of writeable space */ + max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1), + max_chunk_size); + +again: + if (!map || map->num_stripes != num_stripes) { + kfree(map); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) + return -ENOMEM; + map->num_stripes = num_stripes; + } + + if (calc_size * num_stripes > max_chunk_size) { + calc_size = max_chunk_size; + do_div(calc_size, num_stripes); + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + } + /* we don't want tiny stripes */ + calc_size = max_t(u64, min_stripe_size, calc_size); + + do_div(calc_size, stripe_len); + calc_size *= stripe_len; + + cur = fs_devices->alloc_list.next; + index = 0; + + if (type & BTRFS_BLOCK_GROUP_DUP) + min_free = calc_size * 2; + else + min_free = calc_size; + + /* + * we add 1MB because we never use the first 1MB of the device, unless + * we've looped, then we are likely allocating the maximum amount of + * space left already + */ + if (!looped) + min_free += 1024 * 1024; + + INIT_LIST_HEAD(&private_devs); + while (index < num_stripes) { + device = list_entry(cur, struct btrfs_device, dev_alloc_list); + BUG_ON(!device->writeable); + if (device->total_bytes > device->bytes_used) + avail = device->total_bytes - device->bytes_used; + else + avail = 0; + cur = cur->next; + + if (device->in_fs_metadata && avail >= min_free) { + ret = find_free_dev_extent(trans, device, + min_free, &dev_offset); + if (ret == 0) { + list_move_tail(&device->dev_alloc_list, + &private_devs); + map->stripes[index].dev = device; + map->stripes[index].physical = dev_offset; + index++; + if (type & BTRFS_BLOCK_GROUP_DUP) { + map->stripes[index].dev = device; + map->stripes[index].physical = + dev_offset + calc_size; + 
index++; + } + } + } else if (device->in_fs_metadata && avail > max_avail) + max_avail = avail; + if (cur == &fs_devices->alloc_list) + break; + } + list_splice(&private_devs, &fs_devices->alloc_list); + if (index < num_stripes) { + if (index >= min_stripes) { + num_stripes = index; + if (type & (BTRFS_BLOCK_GROUP_RAID10)) { + num_stripes /= sub_stripes; + num_stripes *= sub_stripes; + } + looped = 1; + goto again; + } + if (!looped && max_avail > 0) { + looped = 1; + calc_size = max_avail; + goto again; + } + kfree(map); + return -ENOSPC; + } + map->sector_size = extent_root->sectorsize; + map->stripe_len = stripe_len; + map->io_align = stripe_len; + map->io_width = stripe_len; + map->type = type; + map->num_stripes = num_stripes; + map->sub_stripes = sub_stripes; + + *map_ret = map; + *stripe_size = calc_size; + *num_bytes = chunk_bytes_by_type(type, calc_size, + num_stripes, sub_stripes); + + em = alloc_extent_map(GFP_NOFS); + if (!em) { + kfree(map); + return -ENOMEM; + } + em->bdev = (struct block_device *)map; + em->start = start; + em->len = *num_bytes; + em->block_start = 0; + em->block_len = em->len; + + em_tree = &extent_root->fs_info->mapping_tree.map_tree; + spin_lock(&em_tree->lock); + ret = add_extent_mapping(em_tree, em); + spin_unlock(&em_tree->lock); + BUG_ON(ret); + free_extent_map(em); + + ret = btrfs_make_block_group(trans, extent_root, 0, type, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, *num_bytes); + BUG_ON(ret); + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + ret = btrfs_alloc_dev_extent(trans, device, + info->chunk_root->root_key.objectid, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, + start, dev_offset, calc_size); + BUG_ON(ret); + index++; + } + + return 0; +} + +static int __finish_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, + struct map_lookup *map, u64 chunk_offset, + u64 chunk_size, u64 stripe_size) +{ + u64 dev_offset; + struct btrfs_key key; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + struct btrfs_device *device; + struct btrfs_chunk *chunk; + struct btrfs_stripe *stripe; + size_t item_size = btrfs_chunk_item_size(map->num_stripes); + int index = 0; + int ret; + + chunk = kzalloc(item_size, GFP_NOFS); + if (!chunk) + return -ENOMEM; + + index = 0; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + device->bytes_used += stripe_size; + ret = btrfs_update_device(trans, device); + BUG_ON(ret); + index++; + } + + index = 0; + stripe = &chunk->stripe; + while (index < map->num_stripes) { + device = map->stripes[index].dev; + dev_offset = map->stripes[index].physical; + + btrfs_set_stack_stripe_devid(stripe, device->devid); + btrfs_set_stack_stripe_offset(stripe, dev_offset); + memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE); + stripe++; + index++; + } + + btrfs_set_stack_chunk_length(chunk, chunk_size); + btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid); + btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len); + btrfs_set_stack_chunk_type(chunk, map->type); + btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes); + btrfs_set_stack_chunk_io_align(chunk, map->stripe_len); + btrfs_set_stack_chunk_io_width(chunk, map->stripe_len); + btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize); + btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes); + + key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = 
chunk_offset; + + ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size); + BUG_ON(ret); + + if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { + ret = btrfs_add_system_chunk(trans, chunk_root, &key, chunk, + item_size); + BUG_ON(ret); + } + kfree(chunk); + return 0; +} + +/* + * Chunk allocation falls into two parts. The first part does works + * that make the new allocated chunk useable, but not do any operation + * that modifies the chunk tree. The second part does the works that + * require modifying the chunk tree. This division is important for the + * bootstrap process of adding storage to a seed btrfs. + */ +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type) +{ + u64 chunk_offset; + u64 chunk_size; + u64 stripe_size; + struct map_lookup *map; + struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root; + int ret; + + ret = find_next_chunk(chunk_root, BTRFS_FIRST_CHUNK_TREE_OBJECTID, + &chunk_offset); + if (ret) + return ret; + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, type); + if (ret) + return ret; + + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + return 0; +} + +static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device) +{ + u64 chunk_offset; + u64 sys_chunk_offset; + u64 chunk_size; + u64 sys_chunk_size; + u64 stripe_size; + u64 sys_stripe_size; + u64 alloc_profile; + struct map_lookup *map; + struct map_lookup *sys_map; + struct btrfs_fs_info *fs_info = root->fs_info; + struct btrfs_root *extent_root = fs_info->extent_root; + int ret; + + ret = find_next_chunk(fs_info->chunk_root, + BTRFS_FIRST_CHUNK_TREE_OBJECTID, &chunk_offset); + BUG_ON(ret); + + alloc_profile = BTRFS_BLOCK_GROUP_METADATA | + (fs_info->metadata_alloc_profile & + fs_info->avail_metadata_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &map, &chunk_size, + &stripe_size, chunk_offset, alloc_profile); + BUG_ON(ret); + + sys_chunk_offset = chunk_offset + chunk_size; + + alloc_profile = BTRFS_BLOCK_GROUP_SYSTEM | + (fs_info->system_alloc_profile & + fs_info->avail_system_alloc_bits); + alloc_profile = btrfs_reduce_alloc_profile(root, alloc_profile); + + ret = __btrfs_alloc_chunk(trans, extent_root, &sys_map, + &sys_chunk_size, &sys_stripe_size, + sys_chunk_offset, alloc_profile); + BUG_ON(ret); + + ret = btrfs_add_device(trans, fs_info->chunk_root, device); + BUG_ON(ret); + + /* + * Modifying chunk tree needs allocating new blocks from both + * system block group and metadata block group. So we only can + * do operations require modifying the chunk tree after both + * block groups were created. 
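+ * That is why __btrfs_alloc_chunk() (the part that does not touch the
+ * chunk tree) runs for both the metadata and the system chunk above,
+ * before either __finish_chunk_alloc() call below is made.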
+ */ + ret = __finish_chunk_alloc(trans, extent_root, map, chunk_offset, + chunk_size, stripe_size); + BUG_ON(ret); + + ret = __finish_chunk_alloc(trans, extent_root, sys_map, + sys_chunk_offset, sys_chunk_size, + sys_stripe_size); + BUG_ON(ret); + return 0; +} + +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + int readonly = 0; + int i; + + spin_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, chunk_offset, 1); + spin_unlock(&map_tree->map_tree.lock); + if (!em) + return 1; + + map = (struct map_lookup *)em->bdev; + for (i = 0; i < map->num_stripes; i++) { + if (!map->stripes[i].dev->writeable) { + readonly = 1; + break; + } + } + free_extent_map(em); + return readonly; +} + +void btrfs_mapping_init(struct btrfs_mapping_tree *tree) +{ + extent_map_tree_init(&tree->map_tree, GFP_NOFS); +} + +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree) +{ + struct extent_map *em; + + while (1) { + spin_lock(&tree->map_tree.lock); + em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1); + if (em) + remove_extent_mapping(&tree->map_tree, em); + spin_unlock(&tree->map_tree.lock); + if (!em) + break; + kfree(em->bdev); + /* once for us */ + free_extent_map(em); + /* once for the tree */ + free_extent_map(em); + } +} + +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + int ret; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, len); + spin_unlock(&em_tree->lock); + BUG_ON(!em); + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) + ret = map->num_stripes; + else if (map->type & BTRFS_BLOCK_GROUP_RAID10) + ret = map->sub_stripes; + else + ret = 1; + free_extent_map(em); + return ret; +} + +static int find_live_mirror(struct map_lookup *map, int first, int num, + int optimal) +{ + int i; + if (map->stripes[optimal].dev->bdev) + return optimal; + for (i = first; i < first + num; i++) { + if (map->stripes[i].dev->bdev) + return i; + } + /* we couldn't find one that doesn't fail. 
Just return something + * and the io error handling code will clean up eventually + */ + return optimal; +} + +static int __btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, + int mirror_num, struct page *unplug_page) +{ + struct extent_map *em; + struct map_lookup *map; + struct extent_map_tree *em_tree = &map_tree->map_tree; + u64 offset; + u64 stripe_offset; + u64 stripe_nr; + int stripes_allocated = 8; + int stripes_required = 1; + int stripe_index; + int i; + int num_stripes; + int max_errors = 0; + struct btrfs_multi_bio *multi = NULL; + + if (multi_ret && !(rw & (1 << BIO_RW))) + stripes_allocated = 1; +again: + if (multi_ret) { + multi = kzalloc(btrfs_multi_bio_size(stripes_allocated), + GFP_NOFS); + if (!multi) + return -ENOMEM; + + atomic_set(&multi->error, 0); + } + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, logical, *length); + spin_unlock(&em_tree->lock); + + if (!em && unplug_page) + return 0; + + if (!em) { + printk(KERN_CRIT "unable to find logical %llu len %llu\n", + (unsigned long long)logical, + (unsigned long long)*length); + BUG(); + } + + BUG_ON(em->start > logical || em->start + em->len < logical); + map = (struct map_lookup *)em->bdev; + offset = logical - em->start; + + if (mirror_num > map->num_stripes) + mirror_num = 0; + + /* if our multi bio struct is too small, back off and try again */ + if (rw & (1 << BIO_RW)) { + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_DUP)) { + stripes_required = map->num_stripes; + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripes_required = map->sub_stripes; + max_errors = 1; + } + } + if (multi_ret && rw == WRITE && + stripes_allocated < stripes_required) { + stripes_allocated = map->num_stripes; + free_extent_map(em); + kfree(multi); + goto again; + } + stripe_nr = offset; + /* + * stripe_nr counts the total number of stripes we have to stride + * to get to this block + */ + do_div(stripe_nr, map->stripe_len); + + stripe_offset = stripe_nr * map->stripe_len; + BUG_ON(offset < stripe_offset); + + /* stripe_offset is the offset of this block in its stripe*/ + stripe_offset = offset - stripe_offset; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_DUP)) { + /* we limit the length of each bio to what fits in a stripe */ + *length = min_t(u64, em->len - offset, + map->stripe_len - stripe_offset); + } else { + *length = em->len - offset; + } + + if (!multi_ret && !unplug_page) + goto out; + + num_stripes = 1; + stripe_index = 0; + if (map->type & BTRFS_BLOCK_GROUP_RAID1) { + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + else { + stripe_index = find_live_mirror(map, 0, + map->num_stripes, + current->pid % map->num_stripes); + } + + } else if (map->type & BTRFS_BLOCK_GROUP_DUP) { + if (rw & (1 << BIO_RW)) + num_stripes = map->num_stripes; + else if (mirror_num) + stripe_index = mirror_num - 1; + + } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + int factor = map->num_stripes / map->sub_stripes; + + stripe_index = do_div(stripe_nr, factor); + stripe_index *= map->sub_stripes; + + if (unplug_page || (rw & (1 << BIO_RW))) + num_stripes = map->sub_stripes; + else if (mirror_num) + stripe_index += mirror_num - 1; + else { + stripe_index = find_live_mirror(map, stripe_index, + map->sub_stripes, stripe_index + + current->pid % 
map->sub_stripes); + } + } else { + /* + * after this do_div call, stripe_nr is the number of stripes + * on this device we have to walk to find the data, and + * stripe_index is the number of our device in the stripe array + */ + stripe_index = do_div(stripe_nr, map->num_stripes); + } + BUG_ON(stripe_index >= map->num_stripes); + + for (i = 0; i < num_stripes; i++) { + if (unplug_page) { + struct btrfs_device *device; + struct backing_dev_info *bdi; + + device = map->stripes[stripe_index].dev; + if (device->bdev) { + bdi = blk_get_backing_dev_info(device->bdev); + if (bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, unplug_page); + } + } else { + multi->stripes[i].physical = + map->stripes[stripe_index].physical + + stripe_offset + stripe_nr * map->stripe_len; + multi->stripes[i].dev = map->stripes[stripe_index].dev; + } + stripe_index++; + } + if (multi_ret) { + *multi_ret = multi; + multi->num_stripes = num_stripes; + multi->max_errors = max_errors; + } +out: + free_extent_map(em); + return 0; +} + +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num) +{ + return __btrfs_map_block(map_tree, rw, logical, length, multi_ret, + mirror_num, NULL); +} + +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len) +{ + struct extent_map_tree *em_tree = &map_tree->map_tree; + struct extent_map *em; + struct map_lookup *map; + u64 *buf; + u64 bytenr; + u64 length; + u64 stripe_nr; + int i, j, nr = 0; + + spin_lock(&em_tree->lock); + em = lookup_extent_mapping(em_tree, chunk_start, 1); + spin_unlock(&em_tree->lock); + + BUG_ON(!em || em->start != chunk_start); + map = (struct map_lookup *)em->bdev; + + length = em->len; + if (map->type & BTRFS_BLOCK_GROUP_RAID10) + do_div(length, map->num_stripes / map->sub_stripes); + else if (map->type & BTRFS_BLOCK_GROUP_RAID0) + do_div(length, map->num_stripes); + + buf = kzalloc(sizeof(u64) * map->num_stripes, GFP_NOFS); + BUG_ON(!buf); + + for (i = 0; i < map->num_stripes; i++) { + if (devid && map->stripes[i].dev->devid != devid) + continue; + if (map->stripes[i].physical > physical || + map->stripes[i].physical + length <= physical) + continue; + + stripe_nr = physical - map->stripes[i].physical; + do_div(stripe_nr, map->stripe_len); + + if (map->type & BTRFS_BLOCK_GROUP_RAID10) { + stripe_nr = stripe_nr * map->num_stripes + i; + do_div(stripe_nr, map->sub_stripes); + } else if (map->type & BTRFS_BLOCK_GROUP_RAID0) { + stripe_nr = stripe_nr * map->num_stripes + i; + } + bytenr = chunk_start + stripe_nr * map->stripe_len; + WARN_ON(nr >= map->num_stripes); + for (j = 0; j < nr; j++) { + if (buf[j] == bytenr) + break; + } + if (j == nr) { + WARN_ON(nr >= map->num_stripes); + buf[nr++] = bytenr; + } + } + + /* sanity check: each collected logical address must map back to a + * stripe that contains 'physical' + */ + for (i = 0; i < nr; i++) { + struct btrfs_multi_bio *multi; + struct btrfs_bio_stripe *stripe; + int ret; + + length = 1; + ret = btrfs_map_block(map_tree, WRITE, buf[i], + &length, &multi, 0); + BUG_ON(ret); + + stripe = multi->stripes; + for (j = 0; j < multi->num_stripes; j++) { + if (physical >= stripe[j].physical && + physical < stripe[j].physical + length) + break; + } + BUG_ON(j >= multi->num_stripes); + kfree(multi); + } + + *logical = buf; + *naddrs = nr; + *stripe_len = map->stripe_len; + + free_extent_map(em); + return 0; +} + +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page) +{ + u64 length = PAGE_CACHE_SIZE; + return 
__btrfs_map_block(map_tree, READ, logical, &length, + NULL, 0, page); +} + +static void end_bio_multi_stripe(struct bio *bio, int err) +{ + struct btrfs_multi_bio *multi = bio->bi_private; + int is_orig_bio = 0; + + if (err) + atomic_inc(&multi->error); + + if (bio == multi->orig_bio) + is_orig_bio = 1; + + if (atomic_dec_and_test(&multi->stripes_pending)) { + if (!is_orig_bio) { + bio_put(bio); + bio = multi->orig_bio; + } + bio->bi_private = multi->private; + bio->bi_end_io = multi->end_io; + /* only send an error to the higher layers if it is + * beyond the tolerance of the multi-bio + */ + if (atomic_read(&multi->error) > multi->max_errors) { + err = -EIO; + } else if (err) { + /* + * this bio is actually up to date, we didn't + * go over the max number of errors + */ + set_bit(BIO_UPTODATE, &bio->bi_flags); + err = 0; + } + kfree(multi); + + bio_endio(bio, err); + } else if (!is_orig_bio) { + bio_put(bio); + } +} + +struct async_sched { + struct bio *bio; + int rw; + struct btrfs_fs_info *info; + struct btrfs_work work; +}; + +/* + * see run_scheduled_bios for a description of why bios are collected for + * async submit. + * + * This will add one bio to the pending list for a device and make sure + * the work struct is scheduled. + */ +static noinline int schedule_bio(struct btrfs_root *root, + struct btrfs_device *device, + int rw, struct bio *bio) +{ + int should_queue = 1; + + /* don't bother with additional async steps for reads, right now */ + if (!(rw & (1 << BIO_RW))) { + bio_get(bio); + submit_bio(rw, bio); + bio_put(bio); + return 0; + } + + /* + * nr_async_bios allows us to reliably return congestion to the + * higher layers. Otherwise, the async bio makes it appear we have + * made progress against dirty pages when we've really just put it + * on a queue for later + */ + atomic_inc(&root->fs_info->nr_async_bios); + WARN_ON(bio->bi_next); + bio->bi_next = NULL; + bio->bi_rw |= rw; + + spin_lock(&device->io_lock); + + if (device->pending_bio_tail) + device->pending_bio_tail->bi_next = bio; + + device->pending_bio_tail = bio; + if (!device->pending_bios) + device->pending_bios = bio; + if (device->running_pending) + should_queue = 0; + + spin_unlock(&device->io_lock); + + if (should_queue) + btrfs_queue_worker(&root->fs_info->submit_workers, + &device->work); + return 0; +} + +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit) +{ + struct btrfs_mapping_tree *map_tree; + struct btrfs_device *dev; + struct bio *first_bio = bio; + u64 logical = (u64)bio->bi_sector << 9; + u64 length = 0; + u64 map_length; + struct btrfs_multi_bio *multi = NULL; + int ret; + int dev_nr = 0; + int total_devs = 1; + + length = bio->bi_size; + map_tree = &root->fs_info->mapping_tree; + map_length = length; + + ret = btrfs_map_block(map_tree, rw, logical, &map_length, &multi, + mirror_num); + BUG_ON(ret); + + total_devs = multi->num_stripes; + if (map_length < length) { + printk(KERN_CRIT "mapping failed logical %llu bio len %llu " + "len %llu\n", (unsigned long long)logical, + (unsigned long long)length, + (unsigned long long)map_length); + BUG(); + } + multi->end_io = first_bio->bi_end_io; + multi->private = first_bio->bi_private; + multi->orig_bio = first_bio; + atomic_set(&multi->stripes_pending, multi->num_stripes); + + while (dev_nr < total_devs) { + if (total_devs > 1) { + if (dev_nr < total_devs - 1) { + bio = bio_clone(first_bio, GFP_NOFS); + BUG_ON(!bio); + } else { + bio = first_bio; + } + bio->bi_private = multi; + bio->bi_end_io = 
end_bio_multi_stripe; + } + bio->bi_sector = multi->stripes[dev_nr].physical >> 9; + dev = multi->stripes[dev_nr].dev; + BUG_ON(rw == WRITE && !dev->writeable); + if (dev && dev->bdev) { + bio->bi_bdev = dev->bdev; + if (async_submit) + schedule_bio(root, dev, rw, bio); + else + submit_bio(rw, bio); + } else { + bio->bi_bdev = root->fs_info->fs_devices->latest_bdev; + bio->bi_sector = logical >> 9; + bio_endio(bio, -EIO); + } + dev_nr++; + } + if (total_devs == 1) + kfree(multi); + return 0; +} + +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *cur_devices; + + cur_devices = root->fs_info->fs_devices; + while (cur_devices) { + if (!fsid || + !memcmp(cur_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + device = __find_device(&cur_devices->devices, + devid, uuid); + if (device) + return device; + } + cur_devices = cur_devices->seed; + } + return NULL; +} + +static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + u64 devid, u8 *dev_uuid) +{ + struct btrfs_device *device; + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; + + device = kzalloc(sizeof(*device), GFP_NOFS); + if (!device) + return NULL; + list_add(&device->dev_list, + &fs_devices->devices); + device->barriers = 1; + device->dev_root = root->fs_info->dev_root; + device->devid = devid; + device->work.func = pending_bios_fn; + device->fs_devices = fs_devices; + fs_devices->num_devices++; + spin_lock_init(&device->io_lock); + INIT_LIST_HEAD(&device->dev_alloc_list); + memcpy(device->uuid, dev_uuid, BTRFS_UUID_SIZE); + return device; +} + +static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, + struct extent_buffer *leaf, + struct btrfs_chunk *chunk) +{ + struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; + struct map_lookup *map; + struct extent_map *em; + u64 logical; + u64 length; + u64 devid; + u8 uuid[BTRFS_UUID_SIZE]; + int num_stripes; + int ret; + int i; + + logical = key->offset; + length = btrfs_chunk_length(leaf, chunk); + + spin_lock(&map_tree->map_tree.lock); + em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); + spin_unlock(&map_tree->map_tree.lock); + + /* already mapped? 
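+ (i.e. lookup_extent_mapping() above found an extent_map that already covers this chunk's logical start, so the chunk item can be skipped) 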
*/ + if (em && em->start <= logical && em->start + em->len > logical) { + free_extent_map(em); + return 0; + } else if (em) { + free_extent_map(em); + } + + map = kzalloc(sizeof(*map), GFP_NOFS); + if (!map) + return -ENOMEM; + + em = alloc_extent_map(GFP_NOFS); + if (!em) + return -ENOMEM; + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); + if (!map) { + free_extent_map(em); + return -ENOMEM; + } + + em->bdev = (struct block_device *)map; + em->start = logical; + em->len = length; + em->block_start = 0; + em->block_len = em->len; + + map->num_stripes = num_stripes; + map->io_width = btrfs_chunk_io_width(leaf, chunk); + map->io_align = btrfs_chunk_io_align(leaf, chunk); + map->sector_size = btrfs_chunk_sector_size(leaf, chunk); + map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + map->type = btrfs_chunk_type(leaf, chunk); + map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); + for (i = 0; i < num_stripes; i++) { + map->stripes[i].physical = + btrfs_stripe_offset_nr(leaf, chunk, i); + devid = btrfs_stripe_devid_nr(leaf, chunk, i); + read_extent_buffer(leaf, uuid, (unsigned long) + btrfs_stripe_dev_uuid_nr(chunk, i), + BTRFS_UUID_SIZE); + map->stripes[i].dev = btrfs_find_device(root, devid, uuid, + NULL); + if (!map->stripes[i].dev && !btrfs_test_opt(root, DEGRADED)) { + kfree(map); + free_extent_map(em); + return -EIO; + } + if (!map->stripes[i].dev) { + map->stripes[i].dev = + add_missing_dev(root, devid, uuid); + if (!map->stripes[i].dev) { + kfree(map); + free_extent_map(em); + return -EIO; + } + } + map->stripes[i].dev->in_fs_metadata = 1; + } + + spin_lock(&map_tree->map_tree.lock); + ret = add_extent_mapping(&map_tree->map_tree, em); + spin_unlock(&map_tree->map_tree.lock); + BUG_ON(ret); + free_extent_map(em); + + return 0; +} + +static int fill_device_from_item(struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item, + struct btrfs_device *device) +{ + unsigned long ptr; + + device->devid = btrfs_device_id(leaf, dev_item); + device->total_bytes = btrfs_device_total_bytes(leaf, dev_item); + device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->type = btrfs_device_type(leaf, dev_item); + device->io_align = btrfs_device_io_align(leaf, dev_item); + device->io_width = btrfs_device_io_width(leaf, dev_item); + device->sector_size = btrfs_device_sector_size(leaf, dev_item); + + ptr = (unsigned long)btrfs_device_uuid(dev_item); + read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); + + return 0; +} + +static int open_seed_devices(struct btrfs_root *root, u8 *fsid) +{ + struct btrfs_fs_devices *fs_devices; + int ret; + + mutex_lock(&uuid_mutex); + + fs_devices = root->fs_info->fs_devices->seed; + while (fs_devices) { + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { + ret = 0; + goto out; + } + fs_devices = fs_devices->seed; + } + + fs_devices = find_fsid(fsid); + if (!fs_devices) { + ret = -ENOENT; + goto out; + } + + fs_devices = clone_fs_devices(fs_devices); + if (IS_ERR(fs_devices)) { + ret = PTR_ERR(fs_devices); + goto out; + } + + ret = __btrfs_open_devices(fs_devices, FMODE_READ, + root->fs_info->bdev_holder); + if (ret) + goto out; + + if (!fs_devices->seeding) { + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + ret = -EINVAL; + goto out; + } + + fs_devices->seed = root->fs_info->fs_devices->seed; + root->fs_info->fs_devices->seed = fs_devices; +out: + mutex_unlock(&uuid_mutex); + return ret; +} + +static int read_one_dev(struct btrfs_root *root, + 
struct extent_buffer *leaf, + struct btrfs_dev_item *dev_item) +{ + struct btrfs_device *device; + u64 devid; + int ret; + u8 fs_uuid[BTRFS_UUID_SIZE]; + u8 dev_uuid[BTRFS_UUID_SIZE]; + + devid = btrfs_device_id(leaf, dev_item); + read_extent_buffer(leaf, dev_uuid, + (unsigned long)btrfs_device_uuid(dev_item), + BTRFS_UUID_SIZE); + read_extent_buffer(leaf, fs_uuid, + (unsigned long)btrfs_device_fsid(dev_item), + BTRFS_UUID_SIZE); + + if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { + ret = open_seed_devices(root, fs_uuid); + if (ret && !btrfs_test_opt(root, DEGRADED)) + return ret; + } + + device = btrfs_find_device(root, devid, dev_uuid, fs_uuid); + if (!device || !device->bdev) { + if (!btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if (!device) { + printk(KERN_WARNING "warning devid %llu missing\n", + (unsigned long long)devid); + device = add_missing_dev(root, devid, dev_uuid); + if (!device) + return -ENOMEM; + } + } + + if (device->fs_devices != root->fs_info->fs_devices) { + BUG_ON(device->writeable); + if (device->generation != + btrfs_device_generation(leaf, dev_item)) + return -EINVAL; + } + + fill_device_from_item(leaf, dev_item, device); + device->dev_root = root->fs_info->dev_root; + device->in_fs_metadata = 1; + if (device->writeable) + device->fs_devices->total_rw_bytes += device->total_bytes; + ret = 0; + return ret; +} + +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf) +{ + struct btrfs_dev_item *dev_item; + + dev_item = (struct btrfs_dev_item *)offsetof(struct btrfs_super_block, + dev_item); + return read_one_dev(root, buf, dev_item); +} + +int btrfs_read_sys_array(struct btrfs_root *root) +{ + struct btrfs_super_block *super_copy = &root->fs_info->super_copy; + struct extent_buffer *sb; + struct btrfs_disk_key *disk_key; + struct btrfs_chunk *chunk; + u8 *ptr; + unsigned long sb_ptr; + int ret = 0; + u32 num_stripes; + u32 array_size; + u32 len = 0; + u32 cur; + struct btrfs_key key; + + sb = btrfs_find_create_tree_block(root, BTRFS_SUPER_INFO_OFFSET, + BTRFS_SUPER_INFO_SIZE); + if (!sb) + return -ENOMEM; + btrfs_set_buffer_uptodate(sb); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); + array_size = btrfs_super_sys_array_size(super_copy); + + ptr = super_copy->sys_chunk_array; + sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array); + cur = 0; + + while (cur < array_size) { + disk_key = (struct btrfs_disk_key *)ptr; + btrfs_disk_key_to_cpu(&key, disk_key); + + len = sizeof(*disk_key); ptr += len; + sb_ptr += len; + cur += len; + + if (key.type == BTRFS_CHUNK_ITEM_KEY) { + chunk = (struct btrfs_chunk *)sb_ptr; + ret = read_one_chunk(root, &key, sb, chunk); + if (ret) + break; + num_stripes = btrfs_chunk_num_stripes(sb, chunk); + len = btrfs_chunk_item_size(num_stripes); + } else { + ret = -EIO; + break; + } + ptr += len; + sb_ptr += len; + cur += len; + } + free_extent_buffer(sb); + return ret; +} + +int btrfs_read_chunk_tree(struct btrfs_root *root) +{ + struct btrfs_path *path; + struct extent_buffer *leaf; + struct btrfs_key key; + struct btrfs_key found_key; + int ret; + int slot; + + root = root->fs_info->chunk_root; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* first we search for all of the device items, and then we + * read in all of the chunk items. 
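+ * (the first pass walks the keys with objectid BTRFS_DEV_ITEMS_OBJECTID; + * the second pass restarts the search at objectid 0 to pick up the + * chunk items.) 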
This way we can create chunk + * mappings that reference all of the devices that are afound + */ + key.objectid = BTRFS_DEV_ITEMS_OBJECTID; + key.offset = 0; + key.type = 0; +again: + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + while (1) { + leaf = path->nodes[0]; + slot = path->slots[0]; + if (slot >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); + if (ret == 0) + continue; + if (ret < 0) + goto error; + break; + } + btrfs_item_key_to_cpu(leaf, &found_key, slot); + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID) + break; + if (found_key.type == BTRFS_DEV_ITEM_KEY) { + struct btrfs_dev_item *dev_item; + dev_item = btrfs_item_ptr(leaf, slot, + struct btrfs_dev_item); + ret = read_one_dev(root, leaf, dev_item); + if (ret) + goto error; + } + } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) { + struct btrfs_chunk *chunk; + chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk); + ret = read_one_chunk(root, &found_key, leaf, chunk); + if (ret) + goto error; + } + path->slots[0]++; + } + if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) { + key.objectid = 0; + btrfs_release_path(root, path); + goto again; + } + ret = 0; +error: + btrfs_free_path(path); + return ret; +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h new file mode 100644 index 00000000000..86c44e9ae11 --- /dev/null +++ b/fs/btrfs/volumes.h @@ -0,0 +1,162 @@ +/* + * Copyright (C) 2007 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __BTRFS_VOLUMES_ +#define __BTRFS_VOLUMES_ + +#include <linux/bio.h> +#include "async-thread.h" + +struct buffer_head; +struct btrfs_device { + struct list_head dev_list; + struct list_head dev_alloc_list; + struct btrfs_fs_devices *fs_devices; + struct btrfs_root *dev_root; + struct bio *pending_bios; + struct bio *pending_bio_tail; + int running_pending; + u64 generation; + + int barriers; + int writeable; + int in_fs_metadata; + + spinlock_t io_lock; + + struct block_device *bdev; + + /* the mode sent to open_bdev_exclusive */ + fmode_t mode; + + char *name; + + /* the internal btrfs device id */ + u64 devid; + + /* size of the device */ + u64 total_bytes; + + /* bytes used */ + u64 bytes_used; + + /* optimal io alignment for this device */ + u32 io_align; + + /* optimal io width for this device */ + u32 io_width; + + /* minimal io size for this device */ + u32 sector_size; + + /* type and info about this device */ + u64 type; + + /* physical drive uuid (or lvm uuid) */ + u8 uuid[BTRFS_UUID_SIZE]; + + struct btrfs_work work; +}; + +struct btrfs_fs_devices { + u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ + + /* the device with this id has the most recent coyp of the super */ + u64 latest_devid; + u64 latest_trans; + u64 num_devices; + u64 open_devices; + u64 rw_devices; + u64 total_rw_bytes; + struct block_device *latest_bdev; + /* all of the devices in the FS */ + struct list_head devices; + + /* devices not currently being allocated */ + struct list_head alloc_list; + struct list_head list; + + struct btrfs_fs_devices *seed; + int seeding; + + int opened; +}; + +struct btrfs_bio_stripe { + struct btrfs_device *dev; + u64 physical; +}; + +struct btrfs_multi_bio { + atomic_t stripes_pending; + bio_end_io_t *end_io; + struct bio *orig_bio; + void *private; + atomic_t error; + int max_errors; + int num_stripes; + struct btrfs_bio_stripe stripes[]; +}; + +#define btrfs_multi_bio_size(n) (sizeof(struct btrfs_multi_bio) + \ + (sizeof(struct btrfs_bio_stripe) * (n))) + +int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans, + struct btrfs_device *device, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset, u64 start, u64 num_bytes); +int btrfs_map_block(struct btrfs_mapping_tree *map_tree, int rw, + u64 logical, u64 *length, + struct btrfs_multi_bio **multi_ret, int mirror_num); +int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, + u64 chunk_start, u64 physical, u64 devid, + u64 **logical, int *naddrs, int *stripe_len); +int btrfs_read_sys_array(struct btrfs_root *root); +int btrfs_read_chunk_tree(struct btrfs_root *root); +int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 type); +void btrfs_mapping_init(struct btrfs_mapping_tree *tree); +void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree); +int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, + int mirror_num, int async_submit); +int btrfs_read_super_device(struct btrfs_root *root, struct extent_buffer *buf); +int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, + fmode_t flags, void *holder); +int btrfs_scan_one_device(const char *path, fmode_t flags, void *holder, + struct btrfs_fs_devices **fs_devices_ret); +int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_close_extra_devices(struct btrfs_fs_devices *fs_devices); +int btrfs_add_device(struct btrfs_trans_handle *trans, + struct btrfs_root *root, + struct btrfs_device *device); +int btrfs_rm_device(struct btrfs_root *root, char *device_path); +int 
btrfs_cleanup_fs_uuids(void); +int btrfs_num_copies(struct btrfs_mapping_tree *map_tree, u64 logical, u64 len); +int btrfs_unplug_page(struct btrfs_mapping_tree *map_tree, + u64 logical, struct page *page); +int btrfs_grow_device(struct btrfs_trans_handle *trans, + struct btrfs_device *device, u64 new_size); +struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid, + u8 *uuid, u8 *fsid); +int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); +int btrfs_init_new_device(struct btrfs_root *root, char *path); +int btrfs_balance(struct btrfs_root *dev_root); +void btrfs_unlock_volumes(void); +void btrfs_lock_volumes(void); +int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset); +#endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c new file mode 100644 index 00000000000..7f332e27089 --- /dev/null +++ b/fs/btrfs/xattr.c @@ -0,0 +1,322 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + */ + +#include <linux/init.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/rwsem.h> +#include <linux/xattr.h> +#include "ctree.h" +#include "btrfs_inode.h" +#include "transaction.h" +#include "xattr.h" +#include "disk-io.h" + + +ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct extent_buffer *leaf; + int ret = 0; + unsigned long data_ptr; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + /* lookup the xattr by name */ + di = btrfs_lookup_xattr(NULL, root, path, inode->i_ino, name, + strlen(name), 0); + if (!di || IS_ERR(di)) { + ret = -ENODATA; + goto out; + } + + leaf = path->nodes[0]; + /* if size is 0, that means we want the size of the attr */ + if (!size) { + ret = btrfs_dir_data_len(leaf, di); + goto out; + } + + /* now get the data out of our dir_item */ + if (btrfs_dir_data_len(leaf, di) > size) { + ret = -ERANGE; + goto out; + } + data_ptr = (unsigned long)((char *)(di + 1) + + btrfs_dir_name_len(leaf, di)); + read_extent_buffer(leaf, buffer, data_ptr, + btrfs_dir_data_len(leaf, di)); + ret = btrfs_dir_data_len(leaf, di); + +out: + btrfs_free_path(path); + return ret; +} + +int __btrfs_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags) +{ + struct btrfs_dir_item *di; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_trans_handle *trans; + struct btrfs_path *path; + int ret = 0, mod = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + + trans = btrfs_start_transaction(root, 1); + btrfs_set_trans_block_group(trans, inode); + + /* first lets see if we already have this xattr */ + di = btrfs_lookup_xattr(trans, root, path, inode->i_ino, name, + strlen(name), -1); + if (IS_ERR(di)) { + ret = PTR_ERR(di); + goto out; + } + + /* ok we 
already have this xattr, lets remove it */ + if (di) { + /* if we want create only exit */ + if (flags & XATTR_CREATE) { + ret = -EEXIST; + goto out; + } + + ret = btrfs_delete_one_dir_name(trans, root, path, di); + if (ret) + goto out; + btrfs_release_path(root, path); + + /* if we don't have a value then we are removing the xattr */ + if (!value) { + mod = 1; + goto out; + } + } else { + btrfs_release_path(root, path); + + if (flags & XATTR_REPLACE) { + /* we couldn't find the attr to replace */ + ret = -ENODATA; + goto out; + } + } + + /* ok we have to create a completely new xattr */ + ret = btrfs_insert_xattr_item(trans, root, name, strlen(name), + value, size, inode->i_ino); + if (ret) + goto out; + mod = 1; + +out: + if (mod) { + inode->i_ctime = CURRENT_TIME; + ret = btrfs_update_inode(trans, root, inode); + } + + btrfs_end_transaction(trans, root); + btrfs_free_path(path); + return ret; +} + +ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) +{ + struct btrfs_key key, found_key; + struct inode *inode = dentry->d_inode; + struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_path *path; + struct btrfs_item *item; + struct extent_buffer *leaf; + struct btrfs_dir_item *di; + int ret = 0, slot, advance; + size_t total_size = 0, size_left = size; + unsigned long name_ptr; + size_t name_len; + u32 nritems; + + /* + * ok we want all objects associated with this id. + * NOTE: we set key.offset = 0; because we want to start with the + * first xattr that we find and walk forward + */ + key.objectid = inode->i_ino; + btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.offset = 0; + + path = btrfs_alloc_path(); + if (!path) + return -ENOMEM; + path->reada = 2; + + /* search for our xattrs */ + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); + if (ret < 0) + goto err; + ret = 0; + advance = 0; + while (1) { + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + + /* this is where we start walking through the path */ + if (advance || slot >= nritems) { + /* + * if we've reached the last slot in this leaf we need + * to go to the next leaf and reset everything + */ + if (slot >= nritems-1) { + ret = btrfs_next_leaf(root, path); + if (ret) + break; + leaf = path->nodes[0]; + nritems = btrfs_header_nritems(leaf); + slot = path->slots[0]; + } else { + /* + * just walking through the slots on this leaf + */ + slot++; + path->slots[0]++; + } + } + advance = 1; + + item = btrfs_item_nr(leaf, slot); + btrfs_item_key_to_cpu(leaf, &found_key, slot); + + /* check to make sure this item is what we want */ + if (found_key.objectid != key.objectid) + break; + if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + break; + + di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); + + name_len = btrfs_dir_name_len(leaf, di); + total_size += name_len + 1; + + /* we are just looking for how big our buffer needs to be */ + if (!size) + continue; + + if (!buffer || (name_len + 1) > size_left) { + ret = -ERANGE; + goto err; + } + + name_ptr = (unsigned long)(di + 1); + read_extent_buffer(leaf, buffer, name_ptr, name_len); + buffer[name_len] = '\0'; + + size_left -= name_len + 1; + buffer += name_len + 1; + } + ret = total_size; + +err: + btrfs_free_path(path); + + return ret; +} + +/* + * List of handlers for synthetic system.* attributes. All real ondisk + * attributes are handled directly. 
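+ * The array below is NULL-terminated; only the POSIX ACL handlers are + * listed, and only when CONFIG_FS_POSIX_ACL is enabled.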
+ */ +struct xattr_handler *btrfs_xattr_handlers[] = { +#ifdef CONFIG_FS_POSIX_ACL + &btrfs_xattr_acl_access_handler, + &btrfs_xattr_acl_default_handler, +#endif + NULL, +}; + +/* + * Check if the attribute is in a supported namespace. + * + * This applied after the check for the synthetic attributes in the system + * namespace. + */ +static bool btrfs_is_valid_xattr(const char *name) +{ + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) || + !strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN) || + !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || + !strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN); +} + +ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_getxattr(dentry, name, buffer, size); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_getxattr(dentry->d_inode, name, buffer, size); +} + +int btrfs_setxattr(struct dentry *dentry, const char *name, const void *value, + size_t size, int flags) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_setxattr(dentry, name, value, size, flags); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + + if (size == 0) + value = ""; /* empty EA, do not remove */ + return __btrfs_setxattr(dentry->d_inode, name, value, size, flags); +} + +int btrfs_removexattr(struct dentry *dentry, const char *name) +{ + /* + * If this is a request for a synthetic attribute in the system.* + * namespace use the generic infrastructure to resolve a handler + * for it via sb->s_xattr. + */ + if (!strncmp(name, XATTR_SYSTEM_PREFIX, XATTR_SYSTEM_PREFIX_LEN)) + return generic_removexattr(dentry, name); + + if (!btrfs_is_valid_xattr(name)) + return -EOPNOTSUPP; + return __btrfs_setxattr(dentry->d_inode, name, NULL, 0, XATTR_REPLACE); +} diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h new file mode 100644 index 00000000000..5b1d08f8e68 --- /dev/null +++ b/fs/btrfs/xattr.h @@ -0,0 +1,39 @@ +/* + * Copyright (C) 2007 Red Hat. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. 
+ */ + +#ifndef __XATTR__ +#define __XATTR__ + +#include <linux/xattr.h> + +extern struct xattr_handler btrfs_xattr_acl_access_handler; +extern struct xattr_handler btrfs_xattr_acl_default_handler; +extern struct xattr_handler *btrfs_xattr_handlers[]; + +extern ssize_t __btrfs_getxattr(struct inode *inode, const char *name, + void *buffer, size_t size); +extern int __btrfs_setxattr(struct inode *inode, const char *name, + const void *value, size_t size, int flags); + +extern ssize_t btrfs_getxattr(struct dentry *dentry, const char *name, + void *buffer, size_t size); +extern int btrfs_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +extern int btrfs_removexattr(struct dentry *dentry, const char *name); + +#endif /* __XATTR__ */ diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c new file mode 100644 index 00000000000..ecfbce836d3 --- /dev/null +++ b/fs/btrfs/zlib.c @@ -0,0 +1,632 @@ +/* + * Copyright (C) 2008 Oracle. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public + * License v2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public + * License along with this program; if not, write to the + * Free Software Foundation, Inc., 59 Temple Place - Suite 330, + * Boston, MA 021110-1307, USA. + * + * Based on jffs2 zlib code: + * Copyright © 2001-2007 Red Hat, Inc. + * Created by David Woodhouse <dwmw2@infradead.org> + */ + +#include <linux/kernel.h> +#include <linux/slab.h> +#include <linux/zlib.h> +#include <linux/zutil.h> +#include <linux/vmalloc.h> +#include <linux/init.h> +#include <linux/err.h> +#include <linux/sched.h> +#include <linux/pagemap.h> +#include <linux/bio.h> +#include "compression.h" + +/* Plan: call deflate() with avail_in == *sourcelen, + avail_out = *dstlen - 12 and flush == Z_FINISH. + If it doesn't manage to finish, call it again with + avail_in == 0 and avail_out set to the remaining 12 + bytes for it to clean up. + Q: Is 12 bytes sufficient? +*/ +#define STREAM_END_SPACE 12 + +struct workspace { + z_stream inf_strm; + z_stream def_strm; + char *buf; + struct list_head list; +}; + +static LIST_HEAD(idle_workspace); +static DEFINE_SPINLOCK(workspace_lock); +static unsigned long num_workspace; +static atomic_t alloc_workspace = ATOMIC_INIT(0); +static DECLARE_WAIT_QUEUE_HEAD(workspace_wait); + +/* + * this finds an available zlib workspace or allocates a new one + * NULL or an ERR_PTR is returned if things go bad. 
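+ * (in practice only ERR_PTR(-ENOMEM) is returned on failure, never NULL; + * at most roughly one workspace per online CPU is kept allocated, and + * callers that race past that limit sleep on workspace_wait until a + * workspace is freed)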
+ */ +static struct workspace *find_zlib_workspace(void) +{ + struct workspace *workspace; + int ret; + int cpus = num_online_cpus(); + +again: + spin_lock(&workspace_lock); + if (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + num_workspace--; + spin_unlock(&workspace_lock); + return workspace; + + } + spin_unlock(&workspace_lock); + if (atomic_read(&alloc_workspace) > cpus) { + DEFINE_WAIT(wait); + prepare_to_wait(&workspace_wait, &wait, TASK_UNINTERRUPTIBLE); + if (atomic_read(&alloc_workspace) > cpus) + schedule(); + finish_wait(&workspace_wait, &wait); + goto again; + } + atomic_inc(&alloc_workspace); + workspace = kzalloc(sizeof(*workspace), GFP_NOFS); + if (!workspace) { + ret = -ENOMEM; + goto fail; + } + + workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize()); + if (!workspace->def_strm.workspace) { + ret = -ENOMEM; + goto fail; + } + workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + if (!workspace->inf_strm.workspace) { + ret = -ENOMEM; + goto fail_inflate; + } + workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); + if (!workspace->buf) { + ret = -ENOMEM; + goto fail_kmalloc; + } + return workspace; + +fail_kmalloc: + vfree(workspace->inf_strm.workspace); +fail_inflate: + vfree(workspace->def_strm.workspace); +fail: + kfree(workspace); + atomic_dec(&alloc_workspace); + wake_up(&workspace_wait); + return ERR_PTR(ret); +} + +/* + * put a workspace struct back on the list or free it if we have enough + * idle ones sitting around + */ +static int free_workspace(struct workspace *workspace) +{ + spin_lock(&workspace_lock); + if (num_workspace < num_online_cpus()) { + list_add_tail(&workspace->list, &idle_workspace); + num_workspace++; + spin_unlock(&workspace_lock); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; + } + spin_unlock(&workspace_lock); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + + atomic_dec(&alloc_workspace); + if (waitqueue_active(&workspace_wait)) + wake_up(&workspace_wait); + return 0; +} + +/* + * cleanup function for module exit + */ +static void free_workspaces(void) +{ + struct workspace *workspace; + while (!list_empty(&idle_workspace)) { + workspace = list_entry(idle_workspace.next, struct workspace, + list); + list_del(&workspace->list); + vfree(workspace->def_strm.workspace); + vfree(workspace->inf_strm.workspace); + kfree(workspace->buf); + kfree(workspace); + atomic_dec(&alloc_workspace); + } +} + +/* + * given an address space and start/len, compress the bytes. + * + * pages are allocated to hold the compressed result and stored + * in 'pages' + * + * out_pages is used to return the number of pages allocated. There + * may be pages allocated even if we return an error + * + * total_in is used to return the number of bytes actually read. It + * may be smaller then len if we had to exit early because we + * ran out of room in the pages array or because we cross the + * max_out threshold. 
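+ * + * For example (hypothetical numbers): a 64K input that compresses 2:1 + * would come back with *total_in == 65536, *total_out == 32768 and + * *out_pages == 8 (with 4K pages), assuming nr_dest_pages and max_out + * allow it.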
+ * + * total_out is used to return the total number of compressed bytes + * + * max_out tells us the max number of bytes that we're allowed to + * stuff into pages + */ +int btrfs_zlib_compress_pages(struct address_space *mapping, + u64 start, unsigned long len, + struct page **pages, + unsigned long nr_dest_pages, + unsigned long *out_pages, + unsigned long *total_in, + unsigned long *total_out, + unsigned long max_out) +{ + int ret; + struct workspace *workspace; + char *data_in; + char *cpage_out; + int nr_pages = 0; + struct page *in_page = NULL; + struct page *out_page = NULL; + int out_written = 0; + int in_read = 0; + unsigned long bytes_left; + + *out_pages = 0; + *total_out = 0; + *total_in = 0; + + workspace = find_zlib_workspace(); + if (!workspace) + return -1; + + if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + printk(KERN_WARNING "deflateInit failed\n"); + ret = -1; + goto out; + } + + workspace->def_strm.total_in = 0; + workspace->def_strm.total_out = 0; + + in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[0] = out_page; + nr_pages = 1; + + workspace->def_strm.next_in = data_in; + workspace->def_strm.next_out = cpage_out; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + + out_written = 0; + in_read = 0; + + while (workspace->def_strm.total_in < len) { + ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + if (ret != Z_OK) { + printk(KERN_DEBUG "btrfs deflate in loop returned %d\n", + ret); + zlib_deflateEnd(&workspace->def_strm); + ret = -1; + goto out; + } + + /* we're making it bigger, give up */ + if (workspace->def_strm.total_in > 8192 && + workspace->def_strm.total_in < + workspace->def_strm.total_out) { + ret = -1; + goto out; + } + /* we need another page for writing out. 
Test this + * before the total_in so we will pull in a new page for + * the stream end if required + */ + if (workspace->def_strm.avail_out == 0) { + kunmap(out_page); + if (nr_pages == nr_dest_pages) { + out_page = NULL; + ret = -1; + goto out; + } + out_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); + cpage_out = kmap(out_page); + pages[nr_pages] = out_page; + nr_pages++; + workspace->def_strm.avail_out = PAGE_CACHE_SIZE; + workspace->def_strm.next_out = cpage_out; + } + /* we're all done */ + if (workspace->def_strm.total_in >= len) + break; + + /* we've read in a full page, get a new one */ + if (workspace->def_strm.avail_in == 0) { + if (workspace->def_strm.total_out > max_out) + break; + + bytes_left = len - workspace->def_strm.total_in; + kunmap(in_page); + page_cache_release(in_page); + + start += PAGE_CACHE_SIZE; + in_page = find_get_page(mapping, + start >> PAGE_CACHE_SHIFT); + data_in = kmap(in_page); + workspace->def_strm.avail_in = min(bytes_left, + PAGE_CACHE_SIZE); + workspace->def_strm.next_in = data_in; + } + } + workspace->def_strm.avail_in = 0; + ret = zlib_deflate(&workspace->def_strm, Z_FINISH); + zlib_deflateEnd(&workspace->def_strm); + + if (ret != Z_STREAM_END) { + ret = -1; + goto out; + } + + if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + ret = -1; + goto out; + } + + ret = 0; + *total_out = workspace->def_strm.total_out; + *total_in = workspace->def_strm.total_in; +out: + *out_pages = nr_pages; + if (out_page) + kunmap(out_page); + + if (in_page) { + kunmap(in_page); + page_cache_release(in_page); + } + free_workspace(workspace); + return ret; +} + +/* + * pages_in is an array of pages with compressed data. + * + * disk_start is the starting logical offset of this array in the file + * + * bvec is a bio_vec of pages from the file that we want to decompress into + * + * vcnt is the count of pages in the biovec + * + * srclen is the number of bytes in pages_in + * + * The basic idea is that we have a bio that was created by readpages. + * The pages in the bio are for the uncompressed data, and they may not + * be contiguous. They all correspond to the range of bytes covered by + * the compressed extent. + */ +int btrfs_zlib_decompress_biovec(struct page **pages_in, + u64 disk_start, + struct bio_vec *bvec, + int vcnt, + size_t srclen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + char *data_in; + size_t total_out = 0; + unsigned long page_bytes_left; + unsigned long page_in_index = 0; + unsigned long page_out_index = 0; + struct page *page_out; + unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / + PAGE_CACHE_SIZE; + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long working_bytes; + unsigned long pg_offset; + unsigned long start_byte; + unsigned long current_buf_start; + char *kaddr; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.total_out = 0; + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + page_out = bvec[page_out_index].bv_page; + page_bytes_left = PAGE_CACHE_SIZE; + pg_offset = 0; + + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
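+ (The negative window-bits value passed to zlib_inflateInit2() below puts + zlib into raw deflate mode, with the window size taken from the CINFO + nibble of the header; the two zlib header bytes are then skipped by hand + by advancing next_in past them.) 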
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + while (workspace->inf_strm.total_in < srclen) { + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + /* + * buf start is the byte offset we're of the start of + * our workspace buffer + */ + buf_start = total_out; + + /* total_out is the last byte of the workspace buffer */ + total_out = workspace->inf_strm.total_out; + + working_bytes = total_out - buf_start; + + /* + * start byte is the first byte of the page we're currently + * copying into relative to the start of the compressed data. + */ + start_byte = page_offset(page_out) - disk_start; + + if (working_bytes == 0) { + /* we didn't make progress in this inflate + * call, we're done + */ + if (ret != Z_STREAM_END) + ret = -1; + break; + } + + /* we haven't yet hit data corresponding to this page */ + if (total_out <= start_byte) + goto next; + + /* + * the start of the data we care about is offset into + * the middle of our working buffer + */ + if (total_out > start_byte && buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes -= buf_offset; + } else { + buf_offset = 0; + } + current_buf_start = buf_start; + + /* copy bytes from the working buffer into the pages */ + while (working_bytes > 0) { + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, working_bytes); + kaddr = kmap_atomic(page_out, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, + bytes); + kunmap_atomic(kaddr, KM_USER0); + flush_dcache_page(page_out); + + pg_offset += bytes; + page_bytes_left -= bytes; + buf_offset += bytes; + working_bytes -= bytes; + current_buf_start += bytes; + + /* check if we need to pick another page */ + if (page_bytes_left == 0) { + page_out_index++; + if (page_out_index >= vcnt) { + ret = 0; + goto done; + } + + page_out = bvec[page_out_index].bv_page; + pg_offset = 0; + page_bytes_left = PAGE_CACHE_SIZE; + start_byte = page_offset(page_out) - disk_start; + + /* + * make sure our new page is covered by this + * working buffer + */ + if (total_out <= start_byte) + goto next; + + /* the next page in the biovec might not + * be adjacent to the last page, but it + * might still be found inside this working + * buffer. 
bump our offset pointer + */ + if (total_out > start_byte && + current_buf_start < start_byte) { + buf_offset = start_byte - buf_start; + working_bytes = total_out - start_byte; + current_buf_start = buf_start + + buf_offset; + } + } + } +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + + if (workspace->inf_strm.avail_in == 0) { + unsigned long tmp; + kunmap(pages_in[page_in_index]); + page_in_index++; + if (page_in_index >= total_pages_in) { + data_in = NULL; + break; + } + data_in = kmap(pages_in[page_in_index]); + workspace->inf_strm.next_in = data_in; + tmp = srclen - workspace->inf_strm.total_in; + workspace->inf_strm.avail_in = min(tmp, + PAGE_CACHE_SIZE); + } + } + if (ret != Z_STREAM_END) + ret = -1; + else + ret = 0; +done: + zlib_inflateEnd(&workspace->inf_strm); + if (data_in) + kunmap(pages_in[page_in_index]); +out: + free_workspace(workspace); + return ret; +} + +/* + * a less complex decompression routine. Our compressed data fits in a + * single page, and we want to read a single page out of it. + * start_byte tells us the offset into the compressed data we're interested in + */ +int btrfs_zlib_decompress(unsigned char *data_in, + struct page *dest_page, + unsigned long start_byte, + size_t srclen, size_t destlen) +{ + int ret = 0; + int wbits = MAX_WBITS; + struct workspace *workspace; + unsigned long bytes_left = destlen; + unsigned long total_out = 0; + char *kaddr; + + if (destlen > PAGE_CACHE_SIZE) + return -ENOMEM; + + workspace = find_zlib_workspace(); + if (!workspace) + return -ENOMEM; + + workspace->inf_strm.next_in = data_in; + workspace->inf_strm.avail_in = srclen; + workspace->inf_strm.total_in = 0; + + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->inf_strm.total_out = 0; + /* If it's deflate, and it's got no preset dictionary, then + we can tell zlib to skip the adler32 check. 
*/ + if (srclen > 2 && !(data_in[1] & PRESET_DICT) && + ((data_in[0] & 0x0f) == Z_DEFLATED) && + !(((data_in[0]<<8) + data_in[1]) % 31)) { + + wbits = -((data_in[0] >> 4) + 8); + workspace->inf_strm.next_in += 2; + workspace->inf_strm.avail_in -= 2; + } + + if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + printk(KERN_WARNING "inflateInit failed\n"); + ret = -1; + goto out; + } + + while (bytes_left > 0) { + unsigned long buf_start; + unsigned long buf_offset; + unsigned long bytes; + unsigned long pg_offset = 0; + + ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + if (ret != Z_OK && ret != Z_STREAM_END) + break; + + buf_start = total_out; + total_out = workspace->inf_strm.total_out; + + if (total_out == buf_start) { + ret = -1; + break; + } + + if (total_out <= start_byte) + goto next; + + if (total_out > start_byte && buf_start < start_byte) + buf_offset = start_byte - buf_start; + else + buf_offset = 0; + + bytes = min(PAGE_CACHE_SIZE - pg_offset, + PAGE_CACHE_SIZE - buf_offset); + bytes = min(bytes, bytes_left); + + kaddr = kmap_atomic(dest_page, KM_USER0); + memcpy(kaddr + pg_offset, workspace->buf + buf_offset, bytes); + kunmap_atomic(kaddr, KM_USER0); + + pg_offset += bytes; + bytes_left -= bytes; +next: + workspace->inf_strm.next_out = workspace->buf; + workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + } + + if (ret != Z_STREAM_END && bytes_left != 0) + ret = -1; + else + ret = 0; + + zlib_inflateEnd(&workspace->inf_strm); +out: + free_workspace(workspace); + return ret; +} + +void btrfs_zlib_exit(void) +{ + free_workspaces(); +} diff --git a/fs/jffs2/compr_rubin.c b/fs/jffs2/compr_rubin.c index c73fa89b5f8..170d289ac78 100644 --- a/fs/jffs2/compr_rubin.c +++ b/fs/jffs2/compr_rubin.c @@ -22,9 +22,7 @@ #define BIT_DIVIDER_MIPS 1043 -static int bits_mips[8] = { 277,249,290,267,229,341,212,241}; /* mips32 */ - -#include <linux/errno.h> +static int bits_mips[8] = { 277, 249, 290, 267, 229, 341, 212, 241}; struct pushpull { unsigned char *buf; @@ -43,7 +41,9 @@ struct rubin_state { int bits[8]; }; -static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen, unsigned ofs, unsigned reserve) +static inline void init_pushpull(struct pushpull *pp, char *buf, + unsigned buflen, unsigned ofs, + unsigned reserve) { pp->buf = buf; pp->buflen = buflen; @@ -53,16 +53,14 @@ static inline void init_pushpull(struct pushpull *pp, char *buf, unsigned buflen static inline int pushbit(struct pushpull *pp, int bit, int use_reserved) { - if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) { + if (pp->ofs >= pp->buflen - (use_reserved?0:pp->reserve)) return -ENOSPC; - } - if (bit) { - pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs &7))); - } - else { - pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs &7))); - } + if (bit) + pp->buf[pp->ofs >> 3] |= (1<<(7-(pp->ofs & 7))); + else + pp->buf[pp->ofs >> 3] &= ~(1<<(7-(pp->ofs & 7))); + pp->ofs++; return 0; @@ -97,6 +95,7 @@ static void init_rubin(struct rubin_state *rs, int div, int *bits) rs->p = (long) (2 * UPPER_BIT_RUBIN); rs->bit_number = (long) 0; rs->bit_divider = div; + for (c=0; c<8; c++) rs->bits[c] = bits[c]; } @@ -108,7 +107,8 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) long i0, i1; int ret; - while ((rs->q >= UPPER_BIT_RUBIN) || ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { + while ((rs->q >= UPPER_BIT_RUBIN) || + ((rs->p + rs->q) <= UPPER_BIT_RUBIN)) { rs->bit_number++; ret = pushbit(&rs->pp, (rs->q & UPPER_BIT_RUBIN) ? 
1 : 0, 0); @@ -119,12 +119,12 @@ static int encode(struct rubin_state *rs, long A, long B, int symbol) rs->p <<= 1; } i0 = A * rs->p / (A + B); - if (i0 <= 0) { + if (i0 <= 0) i0 = 1; - } - if (i0 >= rs->p) { + + if (i0 >= rs->p) i0 = rs->p - 1; - } + i1 = rs->p - i0; if (symbol == 0) @@ -157,11 +157,13 @@ static void init_decode(struct rubin_state *rs, int div, int *bits) /* behalve lower */ rs->rec_q = 0; - for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) + for (rs->bit_number = 0; rs->bit_number++ < RUBIN_REG_SIZE; + rs->rec_q = rs->rec_q * 2 + (long) (pullbit(&rs->pp))) ; } -static void __do_decode(struct rubin_state *rs, unsigned long p, unsigned long q) +static void __do_decode(struct rubin_state *rs, unsigned long p, + unsigned long q) { register unsigned long lower_bits_rubin = LOWER_BITS_RUBIN; unsigned long rec_q; @@ -207,12 +209,11 @@ static int decode(struct rubin_state *rs, long A, long B) __do_decode(rs, p, q); i0 = A * rs->p / (A + B); - if (i0 <= 0) { + if (i0 <= 0) i0 = 1; - } - if (i0 >= rs->p) { + + if (i0 >= rs->p) i0 = rs->p - 1; - } threshold = rs->q + i0; symbol = rs->rec_q >= threshold; @@ -234,14 +235,15 @@ static int out_byte(struct rubin_state *rs, unsigned char byte) struct rubin_state rs_copy; rs_copy = *rs; - for (i=0;i<8;i++) { - ret = encode(rs, rs->bit_divider-rs->bits[i],rs->bits[i],byte&1); + for (i=0; i<8; i++) { + ret = encode(rs, rs->bit_divider-rs->bits[i], + rs->bits[i], byte & 1); if (ret) { /* Failed. Restore old state */ *rs = rs_copy; return ret; } - byte=byte>>1; + byte >>= 1 ; } return 0; } @@ -251,7 +253,8 @@ static int in_byte(struct rubin_state *rs) int i, result = 0, bit_divider = rs->bit_divider; for (i = 0; i < 8; i++) - result |= decode(rs, bit_divider - rs->bits[i], rs->bits[i]) << i; + result |= decode(rs, bit_divider - rs->bits[i], + rs->bits[i]) << i; return result; } @@ -259,7 +262,8 @@ static int in_byte(struct rubin_state *rs) static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, - unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen) + unsigned char *cpage_out, uint32_t *sourcelen, + uint32_t *dstlen) { int outpos = 0; int pos=0; @@ -295,7 +299,8 @@ static int rubin_do_compress(int bit_divider, int *bits, unsigned char *data_in, int jffs2_rubinmips_compress(unsigned char *data_in, unsigned char *cpage_out, uint32_t *sourcelen, uint32_t *dstlen, void *model) { - return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); + return rubin_do_compress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); } #endif static int jffs2_dynrubin_compress(unsigned char *data_in, @@ -316,9 +321,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, return -1; memset(histo, 0, 256); - for (i=0; i<mysrclen; i++) { + for (i=0; i<mysrclen; i++) histo[data_in[i]]++; - } memset(bits, 0, sizeof(int)*8); for (i=0; i<256; i++) { if (i&128) @@ -346,7 +350,8 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, cpage_out[i] = bits[i]; } - ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, &mydstlen); + ret = rubin_do_compress(256, bits, data_in, cpage_out+8, &mysrclen, + &mydstlen); if (ret) return ret; @@ -363,8 +368,10 @@ static int jffs2_dynrubin_compress(unsigned char *data_in, return 0; } -static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata_in, - unsigned char *page_out, uint32_t srclen, uint32_t destlen) +static void 
rubin_do_decompress(int bit_divider, int *bits, + unsigned char *cdata_in, + unsigned char *page_out, uint32_t srclen, + uint32_t destlen) { int outpos = 0; struct rubin_state rs; @@ -372,9 +379,8 @@ static void rubin_do_decompress(int bit_divider, int *bits, unsigned char *cdata init_pushpull(&rs.pp, cdata_in, srclen, 0, 0); init_decode(&rs, bit_divider, bits); - while (outpos < destlen) { + while (outpos < destlen) page_out[outpos++] = in_byte(&rs); - } } @@ -383,7 +389,8 @@ static int jffs2_rubinmips_decompress(unsigned char *data_in, uint32_t sourcelen, uint32_t dstlen, void *model) { - rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, cpage_out, sourcelen, dstlen); + rubin_do_decompress(BIT_DIVIDER_MIPS, bits_mips, data_in, + cpage_out, sourcelen, dstlen); return 0; } @@ -398,52 +405,53 @@ static int jffs2_dynrubin_decompress(unsigned char *data_in, for (c=0; c<8; c++) bits[c] = data_in[c]; - rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, dstlen); + rubin_do_decompress(256, bits, data_in+8, cpage_out, sourcelen-8, + dstlen); return 0; } static struct jffs2_compressor jffs2_rubinmips_comp = { - .priority = JFFS2_RUBINMIPS_PRIORITY, - .name = "rubinmips", - .compr = JFFS2_COMPR_DYNRUBIN, - .compress = NULL, /*&jffs2_rubinmips_compress,*/ - .decompress = &jffs2_rubinmips_decompress, + .priority = JFFS2_RUBINMIPS_PRIORITY, + .name = "rubinmips", + .compr = JFFS2_COMPR_DYNRUBIN, + .compress = NULL, /*&jffs2_rubinmips_compress,*/ + .decompress = &jffs2_rubinmips_decompress, #ifdef JFFS2_RUBINMIPS_DISABLED - .disabled = 1, + .disabled = 1, #else - .disabled = 0, + .disabled = 0, #endif }; int jffs2_rubinmips_init(void) { - return jffs2_register_compressor(&jffs2_rubinmips_comp); + return jffs2_register_compressor(&jffs2_rubinmips_comp); } void jffs2_rubinmips_exit(void) { - jffs2_unregister_compressor(&jffs2_rubinmips_comp); + jffs2_unregister_compressor(&jffs2_rubinmips_comp); } static struct jffs2_compressor jffs2_dynrubin_comp = { - .priority = JFFS2_DYNRUBIN_PRIORITY, - .name = "dynrubin", - .compr = JFFS2_COMPR_RUBINMIPS, - .compress = jffs2_dynrubin_compress, - .decompress = &jffs2_dynrubin_decompress, + .priority = JFFS2_DYNRUBIN_PRIORITY, + .name = "dynrubin", + .compr = JFFS2_COMPR_RUBINMIPS, + .compress = jffs2_dynrubin_compress, + .decompress = &jffs2_dynrubin_decompress, #ifdef JFFS2_DYNRUBIN_DISABLED - .disabled = 1, + .disabled = 1, #else - .disabled = 0, + .disabled = 0, #endif }; int jffs2_dynrubin_init(void) { - return jffs2_register_compressor(&jffs2_dynrubin_comp); + return jffs2_register_compressor(&jffs2_dynrubin_comp); } void jffs2_dynrubin_exit(void) { - jffs2_unregister_compressor(&jffs2_dynrubin_comp); + jffs2_unregister_compressor(&jffs2_dynrubin_comp); } diff --git a/fs/jffs2/erase.c b/fs/jffs2/erase.c index 259461b910a..c32b4a1ad6c 100644 --- a/fs/jffs2/erase.c +++ b/fs/jffs2/erase.c @@ -175,7 +175,7 @@ static void jffs2_erase_failed(struct jffs2_sb_info *c, struct jffs2_eraseblock { /* For NAND, if the failure did not occur at the device level for a specific physical page, don't bother updating the bad block table. */ - if (jffs2_cleanmarker_oob(c) && (bad_offset != MTD_FAIL_ADDR_UNKNOWN)) { + if (jffs2_cleanmarker_oob(c) && (bad_offset != (uint32_t)MTD_FAIL_ADDR_UNKNOWN)) { /* We had a device-level failure to erase. Let's see if we've failed too many times. 
*/ if (!jffs2_write_nand_badblock(c, jeb, bad_offset)) { @@ -209,7 +209,8 @@ static void jffs2_erase_callback(struct erase_info *instr) struct erase_priv_struct *priv = (void *)instr->priv; if(instr->state != MTD_ERASE_DONE) { - printk(KERN_WARNING "Erase at 0x%08x finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", instr->addr, instr->state); + printk(KERN_WARNING "Erase at 0x%08llx finished, but state != MTD_ERASE_DONE. State is 0x%x instead.\n", + (unsigned long long)instr->addr, instr->state); jffs2_erase_failed(priv->c, priv->jeb, instr->fail_addr); } else { jffs2_erase_succeeded(priv->c, priv->jeb); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 5198ada6739..6d720243f5f 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -334,6 +334,7 @@ void delete_partition(struct gendisk *disk, int partno) blk_free_devt(part_devt(part)); rcu_assign_pointer(ptbl->part[partno], NULL); + rcu_assign_pointer(ptbl->last_lookup, NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); diff --git a/include/acpi/acdisasm.h b/include/acpi/acdisasm.h deleted file mode 100644 index 0c1ed387073..00000000000 --- a/include/acpi/acdisasm.h +++ /dev/null @@ -1,445 +0,0 @@ -/****************************************************************************** - * - * Name: acdisasm.h - AML disassembler - * - *****************************************************************************/ - -/* - * Copyright (C) 2000 - 2008, Intel Corp. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions - * are met: - * 1. Redistributions of source code must retain the above copyright - * notice, this list of conditions, and the following disclaimer, - * without modification. - * 2. Redistributions in binary form must reproduce at minimum a disclaimer - * substantially similar to the "NO WARRANTY" disclaimer below - * ("Disclaimer") and any redistribution must be conditioned upon - * including a substantially similar Disclaimer requirement for further - * binary redistribution. - * 3. Neither the names of the above-listed copyright holders nor the names - * of any contributors may be used to endorse or promote products derived - * from this software without specific prior written permission. - * - * Alternatively, this software may be distributed under the terms of the - * GNU General Public License ("GPL") version 2 as published by the Free - * Software Foundation. - * - * NO WARRANTY - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL - * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS - * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) - * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, - * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING - * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE - * POSSIBILITY OF SUCH DAMAGES. 
- */ - -#ifndef __ACDISASM_H__ -#define __ACDISASM_H__ - -#include "amlresrc.h" - -#define BLOCK_NONE 0 -#define BLOCK_PAREN 1 -#define BLOCK_BRACE 2 -#define BLOCK_COMMA_LIST 4 -#define ACPI_DEFAULT_RESNAME *(u32 *) "__RD" - -struct acpi_external_list { - char *path; - char *internal_path; - struct acpi_external_list *next; - u32 value; - u16 length; - u8 type; -}; - -extern struct acpi_external_list *acpi_gbl_external_list; - -typedef const struct acpi_dmtable_info { - u8 opcode; - u8 offset; - char *name; - -} acpi_dmtable_info; - -/* - * Values for Opcode above. - * Note: 0-7 must not change, used as a flag shift value - */ -#define ACPI_DMT_FLAG0 0 -#define ACPI_DMT_FLAG1 1 -#define ACPI_DMT_FLAG2 2 -#define ACPI_DMT_FLAG3 3 -#define ACPI_DMT_FLAG4 4 -#define ACPI_DMT_FLAG5 5 -#define ACPI_DMT_FLAG6 6 -#define ACPI_DMT_FLAG7 7 -#define ACPI_DMT_FLAGS0 8 -#define ACPI_DMT_FLAGS2 9 -#define ACPI_DMT_UINT8 10 -#define ACPI_DMT_UINT16 11 -#define ACPI_DMT_UINT24 12 -#define ACPI_DMT_UINT32 13 -#define ACPI_DMT_UINT56 14 -#define ACPI_DMT_UINT64 15 -#define ACPI_DMT_STRING 16 -#define ACPI_DMT_NAME4 17 -#define ACPI_DMT_NAME6 18 -#define ACPI_DMT_NAME8 19 -#define ACPI_DMT_CHKSUM 20 -#define ACPI_DMT_SPACEID 21 -#define ACPI_DMT_GAS 22 -#define ACPI_DMT_ASF 23 -#define ACPI_DMT_DMAR 24 -#define ACPI_DMT_HEST 25 -#define ACPI_DMT_HESTNTFY 26 -#define ACPI_DMT_HESTNTYP 27 -#define ACPI_DMT_MADT 28 -#define ACPI_DMT_SRAT 29 -#define ACPI_DMT_EXIT 30 -#define ACPI_DMT_SIG 31 - -typedef -void (*acpi_dmtable_handler) (struct acpi_table_header * table); - -struct acpi_dmtable_data { - char *signature; - struct acpi_dmtable_info *table_info; - acpi_dmtable_handler table_handler; - char *name; -}; - -struct acpi_op_walk_info { - u32 level; - u32 last_level; - u32 count; - u32 bit_offset; - u32 flags; - struct acpi_walk_state *walk_state; -}; - -typedef -acpi_status(*asl_walk_callback) (union acpi_parse_object * op, - u32 level, void *context); - -struct acpi_resource_tag { - u32 bit_index; - char *tag; -}; - -/* Strings used for decoding flags to ASL keywords */ - -extern const char *acpi_gbl_word_decode[]; -extern const char *acpi_gbl_irq_decode[]; -extern const char *acpi_gbl_lock_rule[]; -extern const char *acpi_gbl_access_types[]; -extern const char *acpi_gbl_update_rules[]; -extern const char *acpi_gbl_match_ops[]; - -extern struct acpi_dmtable_info acpi_dm_table_info_asf0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf1a[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf2a[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf3[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf4[]; -extern struct acpi_dmtable_info acpi_dm_table_info_asf_hdr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_boot[]; -extern struct acpi_dmtable_info acpi_dm_table_info_bert[]; -extern struct acpi_dmtable_info acpi_dm_table_info_cpep[]; -extern struct acpi_dmtable_info acpi_dm_table_info_cpep0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dbgp[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar_hdr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar_scope[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_dmar2[]; -extern struct acpi_dmtable_info 
acpi_dm_table_info_ecdt[]; -extern struct acpi_dmtable_info acpi_dm_table_info_einj[]; -extern struct acpi_dmtable_info acpi_dm_table_info_einj0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_erst[]; -extern struct acpi_dmtable_info acpi_dm_table_info_facs[]; -extern struct acpi_dmtable_info acpi_dm_table_info_fadt1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_fadt2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_gas[]; -extern struct acpi_dmtable_info acpi_dm_table_info_header[]; -extern struct acpi_dmtable_info acpi_dm_table_info_hest[]; -extern struct acpi_dmtable_info acpi_dm_table_info_hest9[]; -extern struct acpi_dmtable_info acpi_dm_table_info_hest_notify[]; -extern struct acpi_dmtable_info acpi_dm_table_info_hpet[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt3[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt4[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt5[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt6[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt7[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt8[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt9[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt10[]; -extern struct acpi_dmtable_info acpi_dm_table_info_madt_hdr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_mcfg[]; -extern struct acpi_dmtable_info acpi_dm_table_info_mcfg0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_rsdp1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_rsdp2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_sbst[]; -extern struct acpi_dmtable_info acpi_dm_table_info_slic[]; -extern struct acpi_dmtable_info acpi_dm_table_info_slit[]; -extern struct acpi_dmtable_info acpi_dm_table_info_spcr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_spmi[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat_hdr[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat0[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat1[]; -extern struct acpi_dmtable_info acpi_dm_table_info_srat2[]; -extern struct acpi_dmtable_info acpi_dm_table_info_tcpa[]; -extern struct acpi_dmtable_info acpi_dm_table_info_wdrt[]; - -/* - * dmtable - */ -void acpi_dm_dump_data_table(struct acpi_table_header *table); - -acpi_status -acpi_dm_dump_table(u32 table_length, - u32 table_offset, - void *table, - u32 sub_table_length, struct acpi_dmtable_info *info); - -void acpi_dm_line_header(u32 offset, u32 byte_length, char *name); - -void acpi_dm_line_header2(u32 offset, u32 byte_length, char *name, u32 value); - -/* - * dmtbdump - */ -void acpi_dm_dump_asf(struct acpi_table_header *table); - -void acpi_dm_dump_cpep(struct acpi_table_header *table); - -void acpi_dm_dump_dmar(struct acpi_table_header *table); - -void acpi_dm_dump_einj(struct acpi_table_header *table); - -void acpi_dm_dump_erst(struct acpi_table_header *table); - -void acpi_dm_dump_fadt(struct acpi_table_header *table); - -void acpi_dm_dump_hest(struct acpi_table_header *table); - -void acpi_dm_dump_mcfg(struct acpi_table_header *table); - -void acpi_dm_dump_madt(struct acpi_table_header *table); - -u32 acpi_dm_dump_rsdp(struct acpi_table_header *table); - -void 
acpi_dm_dump_rsdt(struct acpi_table_header *table); - -void acpi_dm_dump_slit(struct acpi_table_header *table); - -void acpi_dm_dump_srat(struct acpi_table_header *table); - -void acpi_dm_dump_xsdt(struct acpi_table_header *table); - -/* - * dmwalk - */ -void -acpi_dm_disassemble(struct acpi_walk_state *walk_state, - union acpi_parse_object *origin, u32 num_opcodes); - -void -acpi_dm_walk_parse_tree(union acpi_parse_object *op, - asl_walk_callback descending_callback, - asl_walk_callback ascending_callback, void *context); - -/* - * dmopcode - */ -void -acpi_dm_disassemble_one_op(struct acpi_walk_state *walk_state, - struct acpi_op_walk_info *info, - union acpi_parse_object *op); - -void acpi_dm_decode_internal_object(union acpi_operand_object *obj_desc); - -u32 acpi_dm_list_type(union acpi_parse_object *op); - -void acpi_dm_method_flags(union acpi_parse_object *op); - -void acpi_dm_field_flags(union acpi_parse_object *op); - -void acpi_dm_address_space(u8 space_id); - -void acpi_dm_region_flags(union acpi_parse_object *op); - -void acpi_dm_match_op(union acpi_parse_object *op); - -u8 acpi_dm_comma_if_list_member(union acpi_parse_object *op); - -void acpi_dm_comma_if_field_member(union acpi_parse_object *op); - -/* - * dmnames - */ -u32 acpi_dm_dump_name(char *name); - -acpi_status -acpi_ps_display_object_pathname(struct acpi_walk_state *walk_state, - union acpi_parse_object *op); - -void acpi_dm_namestring(char *name); - -/* - * dmobject - */ -void -acpi_dm_display_internal_object(union acpi_operand_object *obj_desc, - struct acpi_walk_state *walk_state); - -void acpi_dm_display_arguments(struct acpi_walk_state *walk_state); - -void acpi_dm_display_locals(struct acpi_walk_state *walk_state); - -void -acpi_dm_dump_method_info(acpi_status status, - struct acpi_walk_state *walk_state, - union acpi_parse_object *op); - -/* - * dmbuffer - */ -void acpi_dm_disasm_byte_list(u32 level, u8 * byte_data, u32 byte_count); - -void -acpi_dm_byte_list(struct acpi_op_walk_info *info, union acpi_parse_object *op); - -void acpi_dm_is_eisa_id(union acpi_parse_object *op); - -void acpi_dm_eisa_id(u32 encoded_id); - -u8 acpi_dm_is_unicode_buffer(union acpi_parse_object *op); - -u8 acpi_dm_is_string_buffer(union acpi_parse_object *op); - -/* - * dmresrc - */ -void acpi_dm_dump_integer8(u8 value, char *name); - -void acpi_dm_dump_integer16(u16 value, char *name); - -void acpi_dm_dump_integer32(u32 value, char *name); - -void acpi_dm_dump_integer64(u64 value, char *name); - -void -acpi_dm_resource_template(struct acpi_op_walk_info *info, - union acpi_parse_object *op, - u8 * byte_data, u32 byte_count); - -acpi_status acpi_dm_is_resource_template(union acpi_parse_object *op); - -void acpi_dm_indent(u32 level); - -void acpi_dm_bit_list(u16 mask); - -void acpi_dm_decode_attribute(u8 attribute); - -void acpi_dm_descriptor_name(void); - -/* - * dmresrcl - */ -void -acpi_dm_word_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_dword_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_extended_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_qword_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_memory24_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_memory32_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_fixed_memory32_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void 
-acpi_dm_generic_register_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_interrupt_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_vendor_large_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void acpi_dm_vendor_common(char *name, u8 * byte_data, u32 length, u32 level); - -/* - * dmresrcs - */ -void -acpi_dm_irq_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_dma_descriptor(union aml_resource *resource, u32 length, u32 level); - -void acpi_dm_io_descriptor(union aml_resource *resource, u32 length, u32 level); - -void -acpi_dm_fixed_io_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_start_dependent_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_end_dependent_descriptor(union aml_resource *resource, - u32 length, u32 level); - -void -acpi_dm_vendor_small_descriptor(union aml_resource *resource, - u32 length, u32 level); - -/* - * dmutils - */ -void acpi_dm_add_to_external_list(char *path, u8 type, u32 value); - -/* - * dmrestag - */ -void acpi_dm_find_resources(union acpi_parse_object *root); - -void -acpi_dm_check_resource_reference(union acpi_parse_object *op, - struct acpi_walk_state *walk_state); - -#endif /* __ACDISASM_H__ */ diff --git a/include/acpi/acexcep.h b/include/acpi/acexcep.h index 84f5cb24286..eda04546cdf 100644 --- a/include/acpi/acexcep.h +++ b/include/acpi/acexcep.h @@ -153,8 +153,9 @@ #define AE_AML_CIRCULAR_REFERENCE (acpi_status) (0x001E | AE_CODE_AML) #define AE_AML_BAD_RESOURCE_LENGTH (acpi_status) (0x001F | AE_CODE_AML) #define AE_AML_ILLEGAL_ADDRESS (acpi_status) (0x0020 | AE_CODE_AML) +#define AE_AML_INFINITE_LOOP (acpi_status) (0x0021 | AE_CODE_AML) -#define AE_CODE_AML_MAX 0x0020 +#define AE_CODE_AML_MAX 0x0021 /* * Internal exceptions used for control @@ -175,6 +176,8 @@ #define AE_CODE_CTRL_MAX 0x000D +/* Exception strings for acpi_format_exception */ + #ifdef DEFINE_ACPI_GLOBALS /* @@ -267,6 +270,7 @@ char const *acpi_gbl_exception_names_aml[] = { "AE_AML_CIRCULAR_REFERENCE", "AE_AML_BAD_RESOURCE_LENGTH", "AE_AML_ILLEGAL_ADDRESS", + "AE_AML_INFINITE_LOOP" }; char const *acpi_gbl_exception_names_ctrl[] = { diff --git a/include/acpi/acoutput.h b/include/acpi/acoutput.h index db8852d8bcf..5c823d5ab78 100644 --- a/include/acpi/acoutput.h +++ b/include/acpi/acoutput.h @@ -45,9 +45,9 @@ #define __ACOUTPUT_H__ /* - * Debug levels and component IDs. These are used to control the - * granularity of the output of the DEBUG_PRINT macro -- on a per- - * component basis and a per-exception-type basis. + * Debug levels and component IDs. These are used to control the + * granularity of the output of the ACPI_DEBUG_PRINT macro -- on a + * per-component basis and a per-exception-type basis. 
*/ /* Component IDs are used in the global "DebugLayer" */ @@ -69,8 +69,10 @@ #define ACPI_COMPILER 0x00001000 #define ACPI_TOOLS 0x00002000 +#define ACPI_EXAMPLE 0x00004000 +#define ACPI_DRIVER 0x00008000 -#define ACPI_ALL_COMPONENTS 0x00003FFF +#define ACPI_ALL_COMPONENTS 0x0000FFFF #define ACPI_COMPONENT_DEFAULT (ACPI_ALL_COMPONENTS) /* Component IDs reserved for ACPI drivers */ @@ -78,7 +80,7 @@ #define ACPI_ALL_DRIVERS 0xFFFF0000 /* - * Raw debug output levels, do not use these in the DEBUG_PRINT macros + * Raw debug output levels, do not use these in the ACPI_DEBUG_PRINT macros */ #define ACPI_LV_INIT 0x00000001 #define ACPI_LV_DEBUG_OBJECT 0x00000002 @@ -176,4 +178,95 @@ #define ACPI_NORMAL_DEFAULT (ACPI_LV_INIT | ACPI_LV_DEBUG_OBJECT) #define ACPI_DEBUG_ALL (ACPI_LV_AML_DISASSEMBLE | ACPI_LV_ALL_EXCEPTIONS | ACPI_LV_ALL) +#if defined (ACPI_DEBUG_OUTPUT) || !defined (ACPI_NO_ERROR_MESSAGES) +/* + * Module name is included in both debug and non-debug versions primarily for + * error messages. The __FILE__ macro is not very useful for this, because it + * often includes the entire pathname to the module + */ +#define ACPI_MODULE_NAME(name) static const char ACPI_UNUSED_VAR _acpi_module_name[] = name; +#else +#define ACPI_MODULE_NAME(name) +#endif + +/* + * Ascii error messages can be configured out + */ +#ifndef ACPI_NO_ERROR_MESSAGES +#define AE_INFO _acpi_module_name, __LINE__ + +/* + * Error reporting. Callers module and line number are inserted by AE_INFO, + * the plist contains a set of parens to allow variable-length lists. + * These macros are used for both the debug and non-debug versions of the code. + */ +#define ACPI_INFO(plist) acpi_info plist +#define ACPI_WARNING(plist) acpi_warning plist +#define ACPI_EXCEPTION(plist) acpi_exception plist +#define ACPI_ERROR(plist) acpi_error plist + +#else + +/* No error messages */ + +#define ACPI_INFO(plist) +#define ACPI_WARNING(plist) +#define ACPI_EXCEPTION(plist) +#define ACPI_ERROR(plist) + +#endif /* ACPI_NO_ERROR_MESSAGES */ + +/* + * Debug macros that are conditionally compiled + */ +#ifdef ACPI_DEBUG_OUTPUT + +/* + * If ACPI_GET_FUNCTION_NAME was not defined in the compiler-dependent header, + * define it now. This is the case where there the compiler does not support + * a __FUNCTION__ macro or equivalent. + */ +#ifndef ACPI_GET_FUNCTION_NAME +#define ACPI_GET_FUNCTION_NAME _acpi_function_name + +/* + * The Name parameter should be the procedure name as a quoted string. + * The function name is also used by the function exit macros below. + * Note: (const char) is used to be compatible with the debug interfaces + * and macros such as __FUNCTION__. 
+ */ +#define ACPI_FUNCTION_NAME(name) static const char _acpi_function_name[] = #name; + +#else +/* Compiler supports __FUNCTION__ (or equivalent) -- Ignore this macro */ + +#define ACPI_FUNCTION_NAME(name) +#endif /* ACPI_GET_FUNCTION_NAME */ + +/* + * Common parameters used for debug output functions: + * line number, function name, module(file) name, component ID + */ +#define ACPI_DEBUG_PARAMETERS __LINE__, ACPI_GET_FUNCTION_NAME, _acpi_module_name, _COMPONENT + +/* + * Master debug print macros + * Print message if and only if: + * 1) Debug print for the current component is enabled + * 2) Debug error level or trace level for the print statement is enabled + */ +#define ACPI_DEBUG_PRINT(plist) acpi_debug_print plist +#define ACPI_DEBUG_PRINT_RAW(plist) acpi_debug_print_raw plist + +#else +/* + * This is the non-debug case -- make everything go away, + * leaving no executable debug code! + */ +#define ACPI_FUNCTION_NAME(a) +#define ACPI_DEBUG_PRINT(pl) +#define ACPI_DEBUG_PRINT_RAW(pl) + +#endif /* ACPI_DEBUG_OUTPUT */ + #endif /* __ACOUTPUT_H__ */ diff --git a/include/acpi/acpi.h b/include/acpi/acpi.h index c515ef6cc89..472b7bf0c5d 100644 --- a/include/acpi/acpi.h +++ b/include/acpi/acpi.h @@ -1,6 +1,6 @@ /****************************************************************************** * - * Name: acpi.h - Master include file, Publics and external data. + * Name: acpi.h - Master public include file used to interface to ACPICA * *****************************************************************************/ @@ -45,25 +45,22 @@ #define __ACPI_H__ /* - * Common includes for all ACPI driver files - * We put them here because we don't want to duplicate them - * in the rest of the source code again and again. + * Public include files for use by code that will interface to ACPICA. + * + * Information includes the ACPICA data types, names, exceptions, and + * external interface prototypes. Also included are the definitions for + * all ACPI tables (FADT, MADT, etc.) + * + * Note: The order of these include files is important. 
*/ -#include "acnames.h" /* Global ACPI names and strings */ -#include "acconfig.h" /* Configuration constants */ -#include "platform/acenv.h" /* Target environment specific items */ -#include "actypes.h" /* Fundamental common data types */ -#include "acexcep.h" /* ACPI exception codes */ -#include "acmacros.h" /* C macros */ +#include "platform/acenv.h" /* Environment-specific items */ +#include "acnames.h" /* Common ACPI names and strings */ +#include "actypes.h" /* ACPICA data types and structures */ +#include "acexcep.h" /* ACPICA exceptions */ #include "actbl.h" /* ACPI table definitions */ -#include "aclocal.h" /* Internal data types */ #include "acoutput.h" /* Error output and Debug macros */ -#include "acpiosxf.h" /* Interfaces to the ACPI-to-OS layer */ +#include "acrestyp.h" /* Resource Descriptor structs */ +#include "acpiosxf.h" /* OSL interfaces (ACPICA-to-OS) */ #include "acpixf.h" /* ACPI core subsystem external interfaces */ -#include "acobject.h" /* ACPI internal object */ -#include "acstruct.h" /* Common structures */ -#include "acglobal.h" /* All global variables */ -#include "achware.h" /* Hardware defines and interfaces */ -#include "acutils.h" /* Utility interfaces */ #endif /* __ACPI_H__ */ diff --git a/include/acpi/acpiosxf.h b/include/acpi/acpiosxf.h index b91440ac0d1..a62720a7edc 100644 --- a/include/acpi/acpiosxf.h +++ b/include/acpi/acpiosxf.h @@ -121,8 +121,11 @@ acpi_os_wait_semaphore(acpi_semaphore handle, u32 units, u16 timeout); acpi_status acpi_os_signal_semaphore(acpi_semaphore handle, u32 units); /* - * Mutex primitives + * Mutex primitives. May be configured to use semaphores instead via + * ACPI_MUTEX_TYPE (see platform/acenv.h) */ +#if (ACPI_MUTEX_TYPE != ACPI_BINARY_SEMAPHORE) + acpi_status acpi_os_create_mutex(acpi_mutex * out_handle); void acpi_os_delete_mutex(acpi_mutex handle); @@ -130,13 +133,7 @@ void acpi_os_delete_mutex(acpi_mutex handle); acpi_status acpi_os_acquire_mutex(acpi_mutex handle, u16 timeout); void acpi_os_release_mutex(acpi_mutex handle); - -/* Temporary macros for Mutex* interfaces, map to existing semaphore xfaces */ - -#define acpi_os_create_mutex(out_handle) acpi_os_create_semaphore (1, 1, out_handle) -#define acpi_os_delete_mutex(handle) (void) acpi_os_delete_semaphore (handle) -#define acpi_os_acquire_mutex(handle,time) acpi_os_wait_semaphore (handle, 1, time) -#define acpi_os_release_mutex(handle) (void) acpi_os_signal_semaphore (handle, 1) +#endif /* * Memory allocation and mapping diff --git a/include/acpi/acpixf.h b/include/acpi/acpixf.h index 33bc0e3b195..c8e8cf45830 100644 --- a/include/acpi/acpixf.h +++ b/include/acpi/acpixf.h @@ -45,9 +45,32 @@ #ifndef __ACXFACE_H__ #define __ACXFACE_H__ +/* Current ACPICA subsystem version in YYYYMMDD format */ + +#define ACPI_CA_VERSION 0x20081204 + #include "actypes.h" #include "actbl.h" +extern u8 acpi_gbl_permanent_mmap; + +/* + * Globals that are publically available, allowing for + * run time configuration + */ +extern u32 acpi_dbg_level; +extern u32 acpi_dbg_layer; +extern u8 acpi_gbl_enable_interpreter_slack; +extern u8 acpi_gbl_all_methods_serialized; +extern u8 acpi_gbl_create_osi_method; +extern u8 acpi_gbl_leave_wake_gpes_disabled; +extern acpi_name acpi_gbl_trace_method_name; +extern u32 acpi_gbl_trace_flags; + +extern u32 acpi_current_gpe_count; +extern struct acpi_table_fadt acpi_gbl_FADT; + +extern u32 acpi_rsdt_forced; /* * Global interfaces */ @@ -79,11 +102,6 @@ const char *acpi_format_exception(acpi_status exception); acpi_status acpi_purge_cached_objects(void); 
-#ifdef ACPI_FUTURE_USAGE -acpi_status -acpi_install_initialization_handler(acpi_init_handler handler, u32 function); -#endif - /* * ACPI Memory management */ @@ -193,9 +211,12 @@ acpi_status acpi_get_id(acpi_handle object, acpi_owner_id * out_type); acpi_status acpi_get_parent(acpi_handle object, acpi_handle * out_handle); /* - * Event handler interfaces + * Handler interfaces */ acpi_status +acpi_install_initialization_handler(acpi_init_handler handler, u32 function); + +acpi_status acpi_install_fixed_event_handler(u32 acpi_event, acpi_event_handler handler, void *context); @@ -227,6 +248,10 @@ acpi_install_gpe_handler(acpi_handle gpe_device, u32 gpe_number, u32 type, acpi_event_handler address, void *context); +acpi_status +acpi_remove_gpe_handler(acpi_handle gpe_device, + u32 gpe_number, acpi_event_handler address); + #ifdef ACPI_FUTURE_USAGE acpi_status acpi_install_exception_handler(acpi_exception_handler handler); #endif @@ -238,10 +263,6 @@ acpi_status acpi_acquire_global_lock(u16 timeout, u32 * handle); acpi_status acpi_release_global_lock(u32 handle); -acpi_status -acpi_remove_gpe_handler(acpi_handle gpe_device, - u32 gpe_number, acpi_event_handler address); - acpi_status acpi_enable_event(u32 event, u32 flags); acpi_status acpi_disable_event(u32 event, u32 flags); @@ -250,6 +271,9 @@ acpi_status acpi_clear_event(u32 event); acpi_status acpi_get_event_status(u32 event, acpi_event_status * event_status); +/* + * GPE Interfaces + */ acpi_status acpi_set_gpe_type(acpi_handle gpe_device, u32 gpe_number, u8 type); acpi_status acpi_enable_gpe(acpi_handle gpe_device, u32 gpe_number); @@ -263,6 +287,12 @@ acpi_get_gpe_status(acpi_handle gpe_device, u32 gpe_number, u32 flags, acpi_event_status * event_status); +acpi_status acpi_disable_all_gpes(void); + +acpi_status acpi_enable_all_runtime_gpes(void); + +acpi_status acpi_get_gpe_device(u32 gpe_index, acpi_handle *gpe_device); + acpi_status acpi_install_gpe_block(acpi_handle gpe_device, struct acpi_generic_address *gpe_block_address, @@ -313,6 +343,8 @@ acpi_resource_to_address64(struct acpi_resource *resource, /* * Hardware (ACPI device) interfaces */ +acpi_status acpi_reset(void); + acpi_status acpi_get_register(u32 register_id, u32 * return_value); acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value); @@ -320,12 +352,14 @@ acpi_status acpi_get_register_unlocked(u32 register_id, u32 *return_value); acpi_status acpi_set_register(u32 register_id, u32 value); acpi_status -acpi_set_firmware_waking_vector(acpi_physical_address physical_address); +acpi_set_firmware_waking_vector(u32 physical_address); -#ifdef ACPI_FUTURE_USAGE acpi_status -acpi_get_firmware_waking_vector(acpi_physical_address * physical_address); -#endif +acpi_set_firmware_waking_vector64(u64 physical_address); + +acpi_status acpi_read(u32 *value, struct acpi_generic_address *reg); + +acpi_status acpi_write(u32 value, struct acpi_generic_address *reg); acpi_status acpi_get_sleep_type_data(u8 sleep_state, u8 * slp_typ_a, u8 * slp_typ_b); @@ -340,4 +374,42 @@ acpi_status acpi_leave_sleep_state_prep(u8 sleep_state); acpi_status acpi_leave_sleep_state(u8 sleep_state); +/* + * Debug output + */ +void ACPI_INTERNAL_VAR_XFACE +acpi_error(const char *module_name, + u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); + +void ACPI_INTERNAL_VAR_XFACE +acpi_exception(const char *module_name, + u32 line_number, + acpi_status status, const char *format, ...) 
ACPI_PRINTF_LIKE(4); + +void ACPI_INTERNAL_VAR_XFACE +acpi_warning(const char *module_name, + u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); + +void ACPI_INTERNAL_VAR_XFACE +acpi_info(const char *module_name, + u32 line_number, const char *format, ...) ACPI_PRINTF_LIKE(3); + +#ifdef ACPI_DEBUG_OUTPUT + +void ACPI_INTERNAL_VAR_XFACE +acpi_debug_print(u32 requested_debug_level, + u32 line_number, + const char *function_name, + const char *module_name, + u32 component_id, const char *format, ...) ACPI_PRINTF_LIKE(6); + +void ACPI_INTERNAL_VAR_XFACE +acpi_debug_print_raw(u32 requested_debug_level, + u32 line_number, + const char *function_name, + const char *module_name, + u32 component_id, + const char *format, ...) ACPI_PRINTF_LIKE(6); +#endif + #endif /* __ACXFACE_H__ */ diff --git a/include/acpi/acrestyp.h b/include/acpi/acrestyp.h new file mode 100644 index 00000000000..9ffe00feada --- /dev/null +++ b/include/acpi/acrestyp.h @@ -0,0 +1,405 @@ +/****************************************************************************** + * + * Name: acrestyp.h - Defines, types, and structures for resource descriptors + * + *****************************************************************************/ + +/* + * Copyright (C) 2000 - 2008, Intel Corp. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions, and the following disclaimer, + * without modification. + * 2. Redistributions in binary form must reproduce at minimum a disclaimer + * substantially similar to the "NO WARRANTY" disclaimer below + * ("Disclaimer") and any redistribution must be conditioned upon + * including a substantially similar Disclaimer requirement for further + * binary redistribution. + * 3. Neither the names of the above-listed copyright holders nor the names + * of any contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * Alternatively, this software may be distributed under the terms of the + * GNU General Public License ("GPL") version 2 as published by the Free + * Software Foundation. + * + * NO WARRANTY + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, + * STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + * POSSIBILITY OF SUCH DAMAGES. 
+ */ + +#ifndef __ACRESTYP_H__ +#define __ACRESTYP_H__ + +/* + * Definitions for Resource Attributes + */ +typedef u16 acpi_rs_length; /* Resource Length field is fixed at 16 bits */ +typedef u32 acpi_rsdesc_size; /* Max Resource Descriptor size is (Length+3) = (64_k-1)+3 */ + +/* + * Memory Attributes + */ +#define ACPI_READ_ONLY_MEMORY (u8) 0x00 +#define ACPI_READ_WRITE_MEMORY (u8) 0x01 + +#define ACPI_NON_CACHEABLE_MEMORY (u8) 0x00 +#define ACPI_CACHABLE_MEMORY (u8) 0x01 +#define ACPI_WRITE_COMBINING_MEMORY (u8) 0x02 +#define ACPI_PREFETCHABLE_MEMORY (u8) 0x03 + +/* + * IO Attributes + * The ISA IO ranges are: n000-n0_fFh, n400-n4_fFh, n800-n8_fFh, n_c00-n_cFFh. + * The non-ISA IO ranges are: n100-n3_fFh, n500-n7_fFh, n900-n_bFFh, n_cd0-n_fFFh. + */ +#define ACPI_NON_ISA_ONLY_RANGES (u8) 0x01 +#define ACPI_ISA_ONLY_RANGES (u8) 0x02 +#define ACPI_ENTIRE_RANGE (ACPI_NON_ISA_ONLY_RANGES | ACPI_ISA_ONLY_RANGES) + +/* Type of translation - 1=Sparse, 0=Dense */ + +#define ACPI_SPARSE_TRANSLATION (u8) 0x01 + +/* + * IO Port Descriptor Decode + */ +#define ACPI_DECODE_10 (u8) 0x00 /* 10-bit IO address decode */ +#define ACPI_DECODE_16 (u8) 0x01 /* 16-bit IO address decode */ + +/* + * IRQ Attributes + */ +#define ACPI_LEVEL_SENSITIVE (u8) 0x00 +#define ACPI_EDGE_SENSITIVE (u8) 0x01 + +#define ACPI_ACTIVE_HIGH (u8) 0x00 +#define ACPI_ACTIVE_LOW (u8) 0x01 + +#define ACPI_EXCLUSIVE (u8) 0x00 +#define ACPI_SHARED (u8) 0x01 + +/* + * DMA Attributes + */ +#define ACPI_COMPATIBILITY (u8) 0x00 +#define ACPI_TYPE_A (u8) 0x01 +#define ACPI_TYPE_B (u8) 0x02 +#define ACPI_TYPE_F (u8) 0x03 + +#define ACPI_NOT_BUS_MASTER (u8) 0x00 +#define ACPI_BUS_MASTER (u8) 0x01 + +#define ACPI_TRANSFER_8 (u8) 0x00 +#define ACPI_TRANSFER_8_16 (u8) 0x01 +#define ACPI_TRANSFER_16 (u8) 0x02 + +/* + * Start Dependent Functions Priority definitions + */ +#define ACPI_GOOD_CONFIGURATION (u8) 0x00 +#define ACPI_ACCEPTABLE_CONFIGURATION (u8) 0x01 +#define ACPI_SUB_OPTIMAL_CONFIGURATION (u8) 0x02 + +/* + * 16, 32 and 64-bit Address Descriptor resource types + */ +#define ACPI_MEMORY_RANGE (u8) 0x00 +#define ACPI_IO_RANGE (u8) 0x01 +#define ACPI_BUS_NUMBER_RANGE (u8) 0x02 + +#define ACPI_ADDRESS_NOT_FIXED (u8) 0x00 +#define ACPI_ADDRESS_FIXED (u8) 0x01 + +#define ACPI_POS_DECODE (u8) 0x00 +#define ACPI_SUB_DECODE (u8) 0x01 + +#define ACPI_PRODUCER (u8) 0x00 +#define ACPI_CONSUMER (u8) 0x01 + +/* + * If possible, pack the following structures to byte alignment + */ +#ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED +#pragma pack(1) +#endif + +/* UUID data structures for use in vendor-defined resource descriptors */ + +struct acpi_uuid { + u8 data[ACPI_UUID_LENGTH]; +}; + +struct acpi_vendor_uuid { + u8 subtype; + u8 data[ACPI_UUID_LENGTH]; +}; + +/* + * Structures used to describe device resources + */ +struct acpi_resource_irq { + u8 descriptor_length; + u8 triggering; + u8 polarity; + u8 sharable; + u8 interrupt_count; + u8 interrupts[1]; +}; + +struct acpi_resource_dma { + u8 type; + u8 bus_master; + u8 transfer; + u8 channel_count; + u8 channels[1]; +}; + +struct acpi_resource_start_dependent { + u8 descriptor_length; + u8 compatibility_priority; + u8 performance_robustness; +}; + +/* + * The END_DEPENDENT_FUNCTIONS_RESOURCE struct is not + * needed because it has no fields + */ + +struct acpi_resource_io { + u8 io_decode; + u8 alignment; + u8 address_length; + u16 minimum; + u16 maximum; +}; + +struct acpi_resource_fixed_io { + u16 address; + u8 address_length; +}; + +struct acpi_resource_vendor { + u16 byte_length; + u8 byte_data[1]; 
+}; + +/* Vendor resource with UUID info (introduced in ACPI 3.0) */ + +struct acpi_resource_vendor_typed { + u16 byte_length; + u8 uuid_subtype; + u8 uuid[ACPI_UUID_LENGTH]; + u8 byte_data[1]; +}; + +struct acpi_resource_end_tag { + u8 checksum; +}; + +struct acpi_resource_memory24 { + u8 write_protect; + u16 minimum; + u16 maximum; + u16 alignment; + u16 address_length; +}; + +struct acpi_resource_memory32 { + u8 write_protect; + u32 minimum; + u32 maximum; + u32 alignment; + u32 address_length; +}; + +struct acpi_resource_fixed_memory32 { + u8 write_protect; + u32 address; + u32 address_length; +}; + +struct acpi_memory_attribute { + u8 write_protect; + u8 caching; + u8 range_type; + u8 translation; +}; + +struct acpi_io_attribute { + u8 range_type; + u8 translation; + u8 translation_type; + u8 reserved1; +}; + +union acpi_resource_attribute { + struct acpi_memory_attribute mem; + struct acpi_io_attribute io; + + /* Used for the *word_space macros */ + + u8 type_specific; +}; + +struct acpi_resource_source { + u8 index; + u16 string_length; + char *string_ptr; +}; + +/* Fields common to all address descriptors, 16/32/64 bit */ + +#define ACPI_RESOURCE_ADDRESS_COMMON \ + u8 resource_type; \ + u8 producer_consumer; \ + u8 decode; \ + u8 min_address_fixed; \ + u8 max_address_fixed; \ + union acpi_resource_attribute info; + +struct acpi_resource_address { +ACPI_RESOURCE_ADDRESS_COMMON}; + +struct acpi_resource_address16 { + ACPI_RESOURCE_ADDRESS_COMMON u16 granularity; + u16 minimum; + u16 maximum; + u16 translation_offset; + u16 address_length; + struct acpi_resource_source resource_source; +}; + +struct acpi_resource_address32 { + ACPI_RESOURCE_ADDRESS_COMMON u32 granularity; + u32 minimum; + u32 maximum; + u32 translation_offset; + u32 address_length; + struct acpi_resource_source resource_source; +}; + +struct acpi_resource_address64 { + ACPI_RESOURCE_ADDRESS_COMMON u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; + struct acpi_resource_source resource_source; +}; + +struct acpi_resource_extended_address64 { + ACPI_RESOURCE_ADDRESS_COMMON u8 revision_iD; + u64 granularity; + u64 minimum; + u64 maximum; + u64 translation_offset; + u64 address_length; + u64 type_specific; +}; + +struct acpi_resource_extended_irq { + u8 producer_consumer; + u8 triggering; + u8 polarity; + u8 sharable; + u8 interrupt_count; + struct acpi_resource_source resource_source; + u32 interrupts[1]; +}; + +struct acpi_resource_generic_register { + u8 space_id; + u8 bit_width; + u8 bit_offset; + u8 access_size; + u64 address; +}; + +/* ACPI_RESOURCE_TYPEs */ + +#define ACPI_RESOURCE_TYPE_IRQ 0 +#define ACPI_RESOURCE_TYPE_DMA 1 +#define ACPI_RESOURCE_TYPE_START_DEPENDENT 2 +#define ACPI_RESOURCE_TYPE_END_DEPENDENT 3 +#define ACPI_RESOURCE_TYPE_IO 4 +#define ACPI_RESOURCE_TYPE_FIXED_IO 5 +#define ACPI_RESOURCE_TYPE_VENDOR 6 +#define ACPI_RESOURCE_TYPE_END_TAG 7 +#define ACPI_RESOURCE_TYPE_MEMORY24 8 +#define ACPI_RESOURCE_TYPE_MEMORY32 9 +#define ACPI_RESOURCE_TYPE_FIXED_MEMORY32 10 +#define ACPI_RESOURCE_TYPE_ADDRESS16 11 +#define ACPI_RESOURCE_TYPE_ADDRESS32 12 +#define ACPI_RESOURCE_TYPE_ADDRESS64 13 +#define ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64 14 /* ACPI 3.0 */ +#define ACPI_RESOURCE_TYPE_EXTENDED_IRQ 15 +#define ACPI_RESOURCE_TYPE_GENERIC_REGISTER 16 +#define ACPI_RESOURCE_TYPE_MAX 16 + +/* Master union for resource descriptors */ + +union acpi_resource_data { + struct acpi_resource_irq irq; + struct acpi_resource_dma dma; + struct acpi_resource_start_dependent 
start_dpf; + struct acpi_resource_io io; + struct acpi_resource_fixed_io fixed_io; + struct acpi_resource_vendor vendor; + struct acpi_resource_vendor_typed vendor_typed; + struct acpi_resource_end_tag end_tag; + struct acpi_resource_memory24 memory24; + struct acpi_resource_memory32 memory32; + struct acpi_resource_fixed_memory32 fixed_memory32; + struct acpi_resource_address16 address16; + struct acpi_resource_address32 address32; + struct acpi_resource_address64 address64; + struct acpi_resource_extended_address64 ext_address64; + struct acpi_resource_extended_irq extended_irq; + struct acpi_resource_generic_register generic_reg; + + /* Common fields */ + + struct acpi_resource_address address; /* Common 16/32/64 address fields */ +}; + +/* Common resource header */ + +struct acpi_resource { + u32 type; + u32 length; + union acpi_resource_data data; +}; + +/* restore default alignment */ + +#pragma pack() + +#define ACPI_RS_SIZE_NO_DATA 8 /* Id + Length fields */ +#define ACPI_RS_SIZE_MIN (u32) ACPI_ROUND_UP_TO_NATIVE_WORD (12) +#define ACPI_RS_SIZE(type) (u32) (ACPI_RS_SIZE_NO_DATA + sizeof (type)) + +#define ACPI_NEXT_RESOURCE(res) (struct acpi_resource *)((u8 *) res + res->length) + +struct acpi_pci_routing_table { + u32 length; + u32 pin; + acpi_integer address; /* here for 64-bit alignment */ + u32 source_index; + char source[4]; /* pad to 64 bits so sizeof() works in all cases */ +}; + +#endif /* __ACRESTYP_H__ */ diff --git a/include/acpi/actbl.h b/include/acpi/actbl.h index 13a3d9ad92d..813e4b6c2c0 100644 --- a/include/acpi/actbl.h +++ b/include/acpi/actbl.h @@ -288,6 +288,31 @@ enum acpi_prefered_pm_profiles { #define ACPI_FADT_OFFSET(f) (u8) ACPI_OFFSET (struct acpi_table_fadt, f) +union acpi_name_union { + u32 integer; + char ascii[4]; +}; + +/* + * Internal ACPI Table Descriptor. One per ACPI table + */ +struct acpi_table_desc { + acpi_physical_address address; + struct acpi_table_header *pointer; + u32 length; /* Length fixed at 32 bits */ + union acpi_name_union signature; + acpi_owner_id owner_id; + u8 flags; +}; + +/* Flags for above */ + +#define ACPI_TABLE_ORIGIN_UNKNOWN (0) +#define ACPI_TABLE_ORIGIN_MAPPED (1) +#define ACPI_TABLE_ORIGIN_ALLOCATED (2) +#define ACPI_TABLE_ORIGIN_MASK (3) +#define ACPI_TABLE_IS_LOADED (4) + /* * Get the remaining ACPI tables */ diff --git a/include/acpi/actbl1.h b/include/acpi/actbl1.h index 63f5b4cf4de..18963b96811 100644 --- a/include/acpi/actbl1.h +++ b/include/acpi/actbl1.h @@ -627,7 +627,7 @@ struct acpi_hest_aer_common { u32 uncorrectable_error_mask; u32 uncorrectable_error_severity; u32 correctable_error_mask; - u32 advanced_error_cababilities; + u32 advanced_error_capabilities; }; /* Hardware Error Notification */ diff --git a/include/acpi/actypes.h b/include/acpi/actypes.h index 8222e8de0d1..a20aab51017 100644 --- a/include/acpi/actypes.h +++ b/include/acpi/actypes.h @@ -204,11 +204,10 @@ typedef u32 acpi_physical_address; /******************************************************************************* * - * OS-dependent and compiler-dependent types + * OS-dependent types * * If the defaults below are not appropriate for the host system, they can - * be defined in the compiler-specific or OS-specific header, and this will - * take precedence. + * be defined in the OS-specific header, and this will take precedence. 
* ******************************************************************************/ @@ -218,12 +217,6 @@ typedef u32 acpi_physical_address; #define acpi_thread_id acpi_size #endif -/* Object returned from acpi_os_create_lock */ - -#ifndef acpi_spinlock -#define acpi_spinlock void * -#endif - /* Flags for acpi_os_acquire_lock/acpi_os_release_lock */ #ifndef acpi_cpu_flags @@ -233,9 +226,51 @@ typedef u32 acpi_physical_address; /* Object returned from acpi_os_create_cache */ #ifndef acpi_cache_t +#ifdef ACPI_USE_LOCAL_CACHE #define acpi_cache_t struct acpi_memory_list +#else +#define acpi_cache_t void * +#endif +#endif + +/* + * Synchronization objects - Mutexes, Semaphores, and spin_locks + */ +#if (ACPI_MUTEX_TYPE == ACPI_BINARY_SEMAPHORE) +/* + * These macros are used if the host OS does not support a mutex object. + * Map the OSL Mutex interfaces to binary semaphores. + */ +#define acpi_mutex acpi_semaphore +#define acpi_os_create_mutex(out_handle) acpi_os_create_semaphore (1, 1, out_handle) +#define acpi_os_delete_mutex(handle) (void) acpi_os_delete_semaphore (handle) +#define acpi_os_acquire_mutex(handle,time) acpi_os_wait_semaphore (handle, 1, time) +#define acpi_os_release_mutex(handle) (void) acpi_os_signal_semaphore (handle, 1) +#endif + +/* Configurable types for synchronization objects */ + +#ifndef acpi_spinlock +#define acpi_spinlock void * +#endif + +#ifndef acpi_semaphore +#define acpi_semaphore void * +#endif + +#ifndef acpi_mutex +#define acpi_mutex void * #endif +/******************************************************************************* + * + * Compiler-dependent types + * + * If the defaults below are not appropriate for the host compiler, they can + * be defined in the compiler-specific header, and this will take precedence. + * + ******************************************************************************/ + /* Use C99 uintptr_t for pointer casting if available, "void *" otherwise */ #ifndef acpi_uintptr_t @@ -268,6 +303,43 @@ typedef u32 acpi_physical_address; #define ACPI_EXPORT_SYMBOL(symbol) #endif +/****************************************************************************** + * + * ACPI Specification constants (Do not change unless the specification changes) + * + *****************************************************************************/ + +/* Number of distinct FADT-based GPE register blocks (GPE0 and GPE1) */ + +#define ACPI_MAX_GPE_BLOCKS 2 + +/* Default ACPI register widths */ + +#define ACPI_GPE_REGISTER_WIDTH 8 +#define ACPI_PM1_REGISTER_WIDTH 16 +#define ACPI_PM2_REGISTER_WIDTH 8 +#define ACPI_PM_TIMER_WIDTH 32 + +/* Names within the namespace are 4 bytes long */ + +#define ACPI_NAME_SIZE 4 +#define ACPI_PATH_SEGMENT_LENGTH 5 /* 4 chars for name + 1 char for separator */ +#define ACPI_PATH_SEPARATOR '.' 
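/*
 * Hedged sketch, not taken from this patch: the synchronization-object
 * mapping earlier in this hunk means callers use the mutex-style OSL
 * interface whether ACPI_MUTEX_TYPE is ACPI_OSL_MUTEX (the host supplies
 * acpi_os_*_mutex) or the default ACPI_BINARY_SEMAPHORE (the macros route
 * the same calls to the semaphore OSL). The function and lock below are
 * hypothetical.
 */
static acpi_status example_locked_update(acpi_mutex lock)
{
	acpi_status status;

	status = acpi_os_acquire_mutex(lock, ACPI_WAIT_FOREVER);
	if (ACPI_FAILURE(status))
		return status;

	/* ... touch state shared with other contexts ... */

	acpi_os_release_mutex(lock);
	return AE_OK;
}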
+ +/* Sizes for ACPI table headers */ + +#define ACPI_OEM_ID_SIZE 6 +#define ACPI_OEM_TABLE_ID_SIZE 8 + +/* ACPI/PNP hardware IDs */ + +#define PCI_ROOT_HID_STRING "PNP0A03" +#define PCI_EXPRESS_ROOT_HID_STRING "PNP0A08" + +/* PM Timer ticks per second (HZ) */ + +#define PM_TIMER_FREQUENCY 3579545 + /******************************************************************************* * * Independent types @@ -291,13 +363,18 @@ typedef u32 acpi_physical_address; #endif /* - * Mescellaneous types + * Miscellaneous types */ typedef u32 acpi_status; /* All ACPI Exceptions */ typedef u32 acpi_name; /* 4-byte ACPI name */ typedef char *acpi_string; /* Null terminated ASCII string */ typedef void *acpi_handle; /* Actually a ptr to a NS Node */ +/* Owner IDs are used to track namespace nodes for selective deletion */ + +typedef u8 acpi_owner_id; +#define ACPI_OWNER_ID_MAX 0xFF + struct uint64_struct { u32 lo; u32 hi; @@ -313,13 +390,8 @@ struct uint32_struct { u32 hi; }; -/* Synchronization objects */ - -#define acpi_mutex void * -#define acpi_semaphore void * - /* - * Acpi integer width. In ACPI version 1, integers are 32 bits. In ACPI + * Acpi integer width. In ACPI version 1, integers are 32 bits. In ACPI * version 2, integers are 64 bits. Note that this pertains to the ACPI integer * type only, not other integers used in the implementation of the ACPI CA * subsystem. @@ -338,10 +410,75 @@ typedef unsigned long long acpi_integer; #define ACPI_MAX16_DECIMAL_DIGITS 5 #define ACPI_MAX8_DECIMAL_DIGITS 3 +/* PM Timer ticks per second (HZ) */ + +#define PM_TIMER_FREQUENCY 3579545 + /* * Constants with special meanings */ #define ACPI_ROOT_OBJECT ACPI_ADD_PTR (acpi_handle, NULL, ACPI_MAX_PTR) +#define ACPI_WAIT_FOREVER 0xFFFF /* u16, as per ACPI spec */ +#define ACPI_DO_NOT_WAIT 0 + +/******************************************************************************* + * + * Commonly used macros + * + ******************************************************************************/ + +/* Data manipulation */ + +#define ACPI_LOWORD(l) ((u16)(u32)(l)) +#define ACPI_HIWORD(l) ((u16)((((u32)(l)) >> 16) & 0xFFFF)) +#define ACPI_LOBYTE(l) ((u8)(u16)(l)) +#define ACPI_HIBYTE(l) ((u8)((((u16)(l)) >> 8) & 0xFF)) + +/* Full 64-bit integer must be available on both 32-bit and 64-bit platforms */ + +struct acpi_integer_overlay { + u32 lo_dword; + u32 hi_dword; +}; + +#define ACPI_LODWORD(integer) (ACPI_CAST_PTR (struct acpi_integer_overlay, &integer)->lo_dword) +#define ACPI_HIDWORD(integer) (ACPI_CAST_PTR (struct acpi_integer_overlay, &integer)->hi_dword) + +#define ACPI_SET_BIT(target,bit) ((target) |= (bit)) +#define ACPI_CLEAR_BIT(target,bit) ((target) &= ~(bit)) +#define ACPI_MIN(a,b) (((a)<(b))?(a):(b)) +#define ACPI_MAX(a,b) (((a)>(b))?(a):(b)) + +/* Size calculation */ + +#define ACPI_ARRAY_LENGTH(x) (sizeof(x) / sizeof((x)[0])) + +/* Pointer manipulation */ + +#define ACPI_CAST_PTR(t, p) ((t *) (acpi_uintptr_t) (p)) +#define ACPI_CAST_INDIRECT_PTR(t, p) ((t **) (acpi_uintptr_t) (p)) +#define ACPI_ADD_PTR(t, a, b) ACPI_CAST_PTR (t, (ACPI_CAST_PTR (u8, (a)) + (acpi_size)(b))) +#define ACPI_PTR_DIFF(a, b) (acpi_size) (ACPI_CAST_PTR (u8, (a)) - ACPI_CAST_PTR (u8, (b))) + +/* Pointer/Integer type conversions */ + +#define ACPI_TO_POINTER(i) ACPI_ADD_PTR (void, (void *) NULL,(acpi_size) i) +#define ACPI_TO_INTEGER(p) ACPI_PTR_DIFF (p, (void *) NULL) +#define ACPI_OFFSET(d, f) (acpi_size) ACPI_PTR_DIFF (&(((d *)0)->f), (void *) NULL) +#define ACPI_PHYSADDR_TO_PTR(i) ACPI_TO_POINTER(i) +#define ACPI_PTR_TO_PHYSADDR(i) 
ACPI_TO_INTEGER(i) + +#ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED +#define ACPI_COMPARE_NAME(a,b) (*ACPI_CAST_PTR (u32, (a)) == *ACPI_CAST_PTR (u32, (b))) +#else +#define ACPI_COMPARE_NAME(a,b) (!ACPI_STRNCMP (ACPI_CAST_PTR (char, (a)), ACPI_CAST_PTR (char, (b)), ACPI_NAME_SIZE)) +#endif + +/******************************************************************************* + * + * Miscellaneous constants + * + ******************************************************************************/ /* * Initialization sequence @@ -414,7 +551,7 @@ typedef unsigned long long acpi_integer; #define ACPI_NOTIFY_MAX 0x0B /* - * Types associated with ACPI names and objects. The first group of + * Types associated with ACPI names and objects. The first group of * values (up to ACPI_TYPE_EXTERNAL_MAX) correspond to the definition * of the ACPI object_type() operator (See the ACPI Spec). Therefore, * only add to the first group if the spec changes. @@ -732,6 +869,15 @@ struct acpi_buffer { #define ACPI_NAME_TYPE_MAX 1 /* + * Predefined Namespace items + */ +struct acpi_predefined_names { + char *name; + u8 type; + char *val; +}; + +/* * Structure and flags for acpi_get_system_info */ #define ACPI_SYS_MODE_UNKNOWN 0x0000 @@ -787,7 +933,7 @@ acpi_status(*acpi_exception_handler) (acpi_status aml_status, u16 opcode, u32 aml_offset, void *context); -/* Table Event handler (Load, load_table etc) and types */ +/* Table Event handler (Load, load_table, etc.) and types */ typedef acpi_status(*acpi_tbl_handler) (u32 event, void *table, void *context); @@ -823,6 +969,12 @@ acpi_status(*acpi_walk_callback) (acpi_handle obj_handle, #define ACPI_INTERRUPT_NOT_HANDLED 0x00 #define ACPI_INTERRUPT_HANDLED 0x01 +/* Length of _HID, _UID, _CID, and UUID values */ + +#define ACPI_DEVICE_ID_LENGTH 0x09 +#define ACPI_MAX_CID_LENGTH 48 +#define ACPI_UUID_LENGTH 16 + /* Common string version of device HIDs and UIDs */ struct acpica_device_id { @@ -900,357 +1052,28 @@ struct acpi_mem_space_context { }; /* - * Definitions for Resource Attributes - */ -typedef u16 acpi_rs_length; /* Resource Length field is fixed at 16 bits */ -typedef u32 acpi_rsdesc_size; /* Max Resource Descriptor size is (Length+3) = (64_k-1)+3 */ - -/* - * Memory Attributes - */ -#define ACPI_READ_ONLY_MEMORY (u8) 0x00 -#define ACPI_READ_WRITE_MEMORY (u8) 0x01 - -#define ACPI_NON_CACHEABLE_MEMORY (u8) 0x00 -#define ACPI_CACHABLE_MEMORY (u8) 0x01 -#define ACPI_WRITE_COMBINING_MEMORY (u8) 0x02 -#define ACPI_PREFETCHABLE_MEMORY (u8) 0x03 - -/* - * IO Attributes - * The ISA IO ranges are: n000-n0_fFh, n400-n4_fFh, n800-n8_fFh, n_c00-n_cFFh. - * The non-ISA IO ranges are: n100-n3_fFh, n500-n7_fFh, n900-n_bFFh, n_cd0-n_fFFh. 
+ * struct acpi_memory_list is used only if the ACPICA local cache is enabled */ -#define ACPI_NON_ISA_ONLY_RANGES (u8) 0x01 -#define ACPI_ISA_ONLY_RANGES (u8) 0x02 -#define ACPI_ENTIRE_RANGE (ACPI_NON_ISA_ONLY_RANGES | ACPI_ISA_ONLY_RANGES) - -/* Type of translation - 1=Sparse, 0=Dense */ - -#define ACPI_SPARSE_TRANSLATION (u8) 0x01 - -/* - * IO Port Descriptor Decode - */ -#define ACPI_DECODE_10 (u8) 0x00 /* 10-bit IO address decode */ -#define ACPI_DECODE_16 (u8) 0x01 /* 16-bit IO address decode */ - -/* - * IRQ Attributes - */ -#define ACPI_LEVEL_SENSITIVE (u8) 0x00 -#define ACPI_EDGE_SENSITIVE (u8) 0x01 - -#define ACPI_ACTIVE_HIGH (u8) 0x00 -#define ACPI_ACTIVE_LOW (u8) 0x01 - -#define ACPI_EXCLUSIVE (u8) 0x00 -#define ACPI_SHARED (u8) 0x01 - -/* - * DMA Attributes - */ -#define ACPI_COMPATIBILITY (u8) 0x00 -#define ACPI_TYPE_A (u8) 0x01 -#define ACPI_TYPE_B (u8) 0x02 -#define ACPI_TYPE_F (u8) 0x03 - -#define ACPI_NOT_BUS_MASTER (u8) 0x00 -#define ACPI_BUS_MASTER (u8) 0x01 - -#define ACPI_TRANSFER_8 (u8) 0x00 -#define ACPI_TRANSFER_8_16 (u8) 0x01 -#define ACPI_TRANSFER_16 (u8) 0x02 - -/* - * Start Dependent Functions Priority definitions - */ -#define ACPI_GOOD_CONFIGURATION (u8) 0x00 -#define ACPI_ACCEPTABLE_CONFIGURATION (u8) 0x01 -#define ACPI_SUB_OPTIMAL_CONFIGURATION (u8) 0x02 - -/* - * 16, 32 and 64-bit Address Descriptor resource types - */ -#define ACPI_MEMORY_RANGE (u8) 0x00 -#define ACPI_IO_RANGE (u8) 0x01 -#define ACPI_BUS_NUMBER_RANGE (u8) 0x02 - -#define ACPI_ADDRESS_NOT_FIXED (u8) 0x00 -#define ACPI_ADDRESS_FIXED (u8) 0x01 - -#define ACPI_POS_DECODE (u8) 0x00 -#define ACPI_SUB_DECODE (u8) 0x01 - -#define ACPI_PRODUCER (u8) 0x00 -#define ACPI_CONSUMER (u8) 0x01 - -/* - * If possible, pack the following structures to byte alignment - */ -#ifndef ACPI_MISALIGNMENT_NOT_SUPPORTED -#pragma pack(1) +struct acpi_memory_list { + char *list_name; + void *list_head; + u16 object_size; + u16 max_depth; + u16 current_depth; + u16 link_offset; + +#ifdef ACPI_DBG_TRACK_ALLOCATIONS + + /* Statistics for debug memory tracking only */ + + u32 total_allocated; + u32 total_freed; + u32 max_occupied; + u32 total_size; + u32 current_total_size; + u32 requests; + u32 hits; #endif - -/* UUID data structures for use in vendor-defined resource descriptors */ - -struct acpi_uuid { - u8 data[ACPI_UUID_LENGTH]; -}; - -struct acpi_vendor_uuid { - u8 subtype; - u8 data[ACPI_UUID_LENGTH]; -}; - -/* - * Structures used to describe device resources - */ -struct acpi_resource_irq { - u8 descriptor_length; - u8 triggering; - u8 polarity; - u8 sharable; - u8 interrupt_count; - u8 interrupts[1]; -}; - -struct acpi_resource_dma { - u8 type; - u8 bus_master; - u8 transfer; - u8 channel_count; - u8 channels[1]; -}; - -struct acpi_resource_start_dependent { - u8 descriptor_length; - u8 compatibility_priority; - u8 performance_robustness; -}; - -/* - * END_DEPENDENT_FUNCTIONS_RESOURCE struct is not - * needed because it has no fields - */ - -struct acpi_resource_io { - u8 io_decode; - u8 alignment; - u8 address_length; - u16 minimum; - u16 maximum; -}; - -struct acpi_resource_fixed_io { - u16 address; - u8 address_length; -}; - -struct acpi_resource_vendor { - u16 byte_length; - u8 byte_data[1]; -}; - -/* Vendor resource with UUID info (introduced in ACPI 3.0) */ - -struct acpi_resource_vendor_typed { - u16 byte_length; - u8 uuid_subtype; - u8 uuid[ACPI_UUID_LENGTH]; - u8 byte_data[1]; -}; - -struct acpi_resource_end_tag { - u8 checksum; -}; - -struct acpi_resource_memory24 { - u8 write_protect; - u16 minimum; - 
u16 maximum; - u16 alignment; - u16 address_length; -}; - -struct acpi_resource_memory32 { - u8 write_protect; - u32 minimum; - u32 maximum; - u32 alignment; - u32 address_length; -}; - -struct acpi_resource_fixed_memory32 { - u8 write_protect; - u32 address; - u32 address_length; -}; - -struct acpi_memory_attribute { - u8 write_protect; - u8 caching; - u8 range_type; - u8 translation; -}; - -struct acpi_io_attribute { - u8 range_type; - u8 translation; - u8 translation_type; - u8 reserved1; -}; - -union acpi_resource_attribute { - struct acpi_memory_attribute mem; - struct acpi_io_attribute io; - - /* Used for the *word_space macros */ - - u8 type_specific; -}; - -struct acpi_resource_source { - u8 index; - u16 string_length; - char *string_ptr; -}; - -/* Fields common to all address descriptors, 16/32/64 bit */ - -#define ACPI_RESOURCE_ADDRESS_COMMON \ - u8 resource_type; \ - u8 producer_consumer; \ - u8 decode; \ - u8 min_address_fixed; \ - u8 max_address_fixed; \ - union acpi_resource_attribute info; - -struct acpi_resource_address { -ACPI_RESOURCE_ADDRESS_COMMON}; - -struct acpi_resource_address16 { - ACPI_RESOURCE_ADDRESS_COMMON u16 granularity; - u16 minimum; - u16 maximum; - u16 translation_offset; - u16 address_length; - struct acpi_resource_source resource_source; -}; - -struct acpi_resource_address32 { - ACPI_RESOURCE_ADDRESS_COMMON u32 granularity; - u32 minimum; - u32 maximum; - u32 translation_offset; - u32 address_length; - struct acpi_resource_source resource_source; -}; - -struct acpi_resource_address64 { - ACPI_RESOURCE_ADDRESS_COMMON u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; - struct acpi_resource_source resource_source; -}; - -struct acpi_resource_extended_address64 { - ACPI_RESOURCE_ADDRESS_COMMON u8 revision_iD; - u64 granularity; - u64 minimum; - u64 maximum; - u64 translation_offset; - u64 address_length; - u64 type_specific; -}; - -struct acpi_resource_extended_irq { - u8 producer_consumer; - u8 triggering; - u8 polarity; - u8 sharable; - u8 interrupt_count; - struct acpi_resource_source resource_source; - u32 interrupts[1]; -}; - -struct acpi_resource_generic_register { - u8 space_id; - u8 bit_width; - u8 bit_offset; - u8 access_size; - u64 address; -}; - -/* ACPI_RESOURCE_TYPEs */ - -#define ACPI_RESOURCE_TYPE_IRQ 0 -#define ACPI_RESOURCE_TYPE_DMA 1 -#define ACPI_RESOURCE_TYPE_START_DEPENDENT 2 -#define ACPI_RESOURCE_TYPE_END_DEPENDENT 3 -#define ACPI_RESOURCE_TYPE_IO 4 -#define ACPI_RESOURCE_TYPE_FIXED_IO 5 -#define ACPI_RESOURCE_TYPE_VENDOR 6 -#define ACPI_RESOURCE_TYPE_END_TAG 7 -#define ACPI_RESOURCE_TYPE_MEMORY24 8 -#define ACPI_RESOURCE_TYPE_MEMORY32 9 -#define ACPI_RESOURCE_TYPE_FIXED_MEMORY32 10 -#define ACPI_RESOURCE_TYPE_ADDRESS16 11 -#define ACPI_RESOURCE_TYPE_ADDRESS32 12 -#define ACPI_RESOURCE_TYPE_ADDRESS64 13 -#define ACPI_RESOURCE_TYPE_EXTENDED_ADDRESS64 14 /* ACPI 3.0 */ -#define ACPI_RESOURCE_TYPE_EXTENDED_IRQ 15 -#define ACPI_RESOURCE_TYPE_GENERIC_REGISTER 16 -#define ACPI_RESOURCE_TYPE_MAX 16 - -union acpi_resource_data { - struct acpi_resource_irq irq; - struct acpi_resource_dma dma; - struct acpi_resource_start_dependent start_dpf; - struct acpi_resource_io io; - struct acpi_resource_fixed_io fixed_io; - struct acpi_resource_vendor vendor; - struct acpi_resource_vendor_typed vendor_typed; - struct acpi_resource_end_tag end_tag; - struct acpi_resource_memory24 memory24; - struct acpi_resource_memory32 memory32; - struct acpi_resource_fixed_memory32 fixed_memory32; - struct 
acpi_resource_address16 address16; - struct acpi_resource_address32 address32; - struct acpi_resource_address64 address64; - struct acpi_resource_extended_address64 ext_address64; - struct acpi_resource_extended_irq extended_irq; - struct acpi_resource_generic_register generic_reg; - - /* Common fields */ - - struct acpi_resource_address address; /* Common 16/32/64 address fields */ -}; - -struct acpi_resource { - u32 type; - u32 length; - union acpi_resource_data data; -}; - -/* restore default alignment */ - -#pragma pack() - -#define ACPI_RS_SIZE_NO_DATA 8 /* Id + Length fields */ -#define ACPI_RS_SIZE_MIN (u32) ACPI_ROUND_UP_TO_NATIVE_WORD (12) -#define ACPI_RS_SIZE(type) (u32) (ACPI_RS_SIZE_NO_DATA + sizeof (type)) - -#define ACPI_NEXT_RESOURCE(res) (struct acpi_resource *)((u8 *) res + res->length) - -struct acpi_pci_routing_table { - u32 length; - u32 pin; - acpi_integer address; /* here for 64-bit alignment */ - u32 source_index; - char source[4]; /* pad to 64 bits so sizeof() works in all cases */ }; #endif /* __ACTYPES_H__ */ diff --git a/include/acpi/platform/acenv.h b/include/acpi/platform/acenv.h index fcd2572e428..e62f10d9a7d 100644 --- a/include/acpi/platform/acenv.h +++ b/include/acpi/platform/acenv.h @@ -44,14 +44,26 @@ #ifndef __ACENV_H__ #define __ACENV_H__ -/* +/* Types for ACPI_MUTEX_TYPE */ + +#define ACPI_BINARY_SEMAPHORE 0 +#define ACPI_OSL_MUTEX 1 + +/* Types for DEBUGGER_THREADING */ + +#define DEBUGGER_SINGLE_THREADED 0 +#define DEBUGGER_MULTI_THREADED 1 + +/****************************************************************************** + * * Configuration for ACPI tools and utilities - */ + * + *****************************************************************************/ #ifdef ACPI_LIBRARY /* * Note: The non-debug version of the acpi_library does not contain any - * debug support, for minimimal size. The debug version uses ACPI_FULL_DEBUG + * debug support, for minimal size. The debug version uses ACPI_FULL_DEBUG */ #define ACPI_USE_LOCAL_CACHE #endif @@ -75,17 +87,6 @@ #define ACPI_DBG_TRACK_ALLOCATIONS #endif -#ifdef ACPI_DASM_APP -#ifndef MSDOS -#define ACPI_DEBUG_OUTPUT -#endif -#define ACPI_APPLICATION -#define ACPI_DISASSEMBLER -#define ACPI_NO_METHOD_EXECUTION -#define ACPI_LARGE_NAMESPACE_NODE -#define ACPI_DATA_TABLE_DISASSEMBLY -#endif - #ifdef ACPI_APPLICATION #define ACPI_USE_SYSTEM_CLIBRARY #define ACPI_USE_LOCAL_CACHE @@ -179,6 +180,19 @@ /*! [End] no source code translation !*/ +/****************************************************************************** + * + * Miscellaneous configuration + * + *****************************************************************************/ + +/* + * Are mutexes supported by the host? default is no, use binary semaphores. + */ +#ifndef ACPI_MUTEX_TYPE +#define ACPI_MUTEX_TYPE ACPI_BINARY_SEMAPHORE +#endif + /* * Debugger threading model * Use single threaded if the entire subsystem is contained in an application @@ -187,9 +201,6 @@ * By default the model is single threaded if ACPI_APPLICATION is set, * multi-threaded if ACPI_APPLICATION is not set. 
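For the ACPI_MUTEX_TYPE configuration above, a minimal sketch of how a host platform header could opt into native OSL mutexes instead of the default binary-semaphore emulation. The header name is hypothetical and not part of this patch; it assumes the host actually implements the acpi_os_*_mutex() OSL entry points.

/* Hypothetical acmyos.h -- assumes acpi_os_*_mutex() is implemented by
 * the host OSL; otherwise leave the ACPI_BINARY_SEMAPHORE default alone. */
#ifndef __ACMYOS_H__
#define __ACMYOS_H__

#define ACPI_MUTEX_TYPE		ACPI_OSL_MUTEX

#endif /* __ACMYOS_H__ */

aclinux.h (below) does the opposite and explicitly selects ACPI_BINARY_SEMAPHORE for the kernel build.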
*/ -#define DEBUGGER_SINGLE_THREADED 0 -#define DEBUGGER_MULTI_THREADED 1 - #ifndef DEBUGGER_THREADING #ifdef ACPI_APPLICATION #define DEBUGGER_THREADING DEBUGGER_SINGLE_THREADED diff --git a/include/acpi/platform/aclinux.h b/include/acpi/platform/aclinux.h index 0515e754449..6d49b2a498c 100644 --- a/include/acpi/platform/aclinux.h +++ b/include/acpi/platform/aclinux.h @@ -46,6 +46,7 @@ #define ACPI_USE_SYSTEM_CLIBRARY #define ACPI_USE_DO_WHILE_0 +#define ACPI_MUTEX_TYPE ACPI_BINARY_SEMAPHORE #ifdef __KERNEL__ @@ -70,9 +71,6 @@ #define ACPI_EXPORT_SYMBOL(symbol) EXPORT_SYMBOL(symbol); #define strtoul simple_strtoul -/* Full namespace pathname length limit - arbitrary */ -#define ACPI_PATHNAME_MAX 256 - #else /* !__KERNEL__ */ #include <stdarg.h> diff --git a/include/linux/acpi.h b/include/linux/acpi.h index fba8051fb29..6fce2fc2d12 100644 --- a/include/linux/acpi.h +++ b/include/linux/acpi.h @@ -131,22 +131,6 @@ extern int acpi_get_override_irq(int bus_irq, int *trigger, int *polarity); */ void acpi_unregister_gsi (u32 gsi); -struct acpi_prt_entry { - struct list_head node; - struct acpi_pci_id id; - u8 pin; - struct { - acpi_handle handle; - u32 index; - } link; - u32 irq; -}; - -struct acpi_prt_list { - int count; - struct list_head entries; -}; - struct pci_dev; int acpi_pci_irq_enable (struct pci_dev *dev); @@ -270,6 +254,7 @@ int acpi_check_mem_region(resource_size_t start, resource_size_t n, #ifdef CONFIG_PM_SLEEP void __init acpi_no_s4_hw_signature(void); void __init acpi_old_suspend_ordering(void); +void __init acpi_s4_no_nvs(void); #endif /* CONFIG_PM_SLEEP */ #else /* CONFIG_ACPI */ diff --git a/include/linux/async_tx.h b/include/linux/async_tx.h index 0f50d4cc436..45f6297821b 100644 --- a/include/linux/async_tx.h +++ b/include/linux/async_tx.h @@ -59,9 +59,7 @@ enum async_tx_flags { }; #ifdef CONFIG_DMA_ENGINE -void async_tx_issue_pending_all(void); -enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); -void async_tx_run_dependencies(struct dma_async_tx_descriptor *tx); +#define async_tx_issue_pending_all dma_issue_pending_all #ifdef CONFIG_ARCH_HAS_ASYNC_TX_FIND_CHANNEL #include <asm/async_tx.h> #else @@ -77,19 +75,6 @@ static inline void async_tx_issue_pending_all(void) do { } while (0); } -static inline enum dma_status -dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) -{ - return DMA_SUCCESS; -} - -static inline void -async_tx_run_dependencies(struct dma_async_tx_descriptor *tx, - struct dma_chan *host_chan) -{ - do { } while (0); -} - static inline struct dma_chan * async_tx_find_channel(struct dma_async_tx_descriptor *depend_tx, enum dma_transaction_type tx_type, struct page **dst, int dst_count, diff --git a/include/linux/atmel-mci.h b/include/linux/atmel-mci.h index 2a2213eefd8..2f1f95737ac 100644 --- a/include/linux/atmel-mci.h +++ b/include/linux/atmel-mci.h @@ -3,7 +3,7 @@ #define ATMEL_MCI_MAX_NR_SLOTS 2 -struct dma_slave; +#include <linux/dw_dmac.h> /** * struct mci_slot_pdata - board-specific per-slot configuration @@ -28,11 +28,11 @@ struct mci_slot_pdata { /** * struct mci_platform_data - board-specific MMC/SDcard configuration - * @dma_slave: DMA slave interface to use in data transfers, or NULL. + * @dma_slave: DMA slave interface to use in data transfers. * @slot: Per-slot configuration data. 
*/ struct mci_platform_data { - struct dma_slave *dma_slave; + struct dw_dma_slave dma_slave; struct mci_slot_pdata slot[ATMEL_MCI_MAX_NR_SLOTS]; }; diff --git a/include/linux/backlight.h b/include/linux/backlight.h index 1ee9488ca2e..79ca2da81c8 100644 --- a/include/linux/backlight.h +++ b/include/linux/backlight.h @@ -31,6 +31,10 @@ struct backlight_device; struct fb_info; struct backlight_ops { + unsigned int options; + +#define BL_CORE_SUSPENDRESUME (1 << 0) + /* Notify the backlight driver some property has changed */ int (*update_status)(struct backlight_device *); /* Return the current backlight brightness (accounting for power, @@ -51,7 +55,19 @@ struct backlight_properties { modes; 4: full off), see FB_BLANK_XXX */ int power; /* FB Blanking active? (values as for power) */ + /* Due to be removed, please use (state & BL_CORE_FBBLANK) */ int fb_blank; + /* Flags used to signal drivers of state changes */ + /* Upper 4 bits are reserved for driver internal use */ + unsigned int state; + +#define BL_CORE_SUSPENDED (1 << 0) /* backlight is suspended */ +#define BL_CORE_FBBLANK (1 << 1) /* backlight is under an fb blank event */ +#define BL_CORE_DRIVER4 (1 << 28) /* reserved for driver specific use */ +#define BL_CORE_DRIVER3 (1 << 29) /* reserved for driver specific use */ +#define BL_CORE_DRIVER2 (1 << 30) /* reserved for driver specific use */ +#define BL_CORE_DRIVER1 (1 << 31) /* reserved for driver specific use */ + }; struct backlight_device { diff --git a/include/linux/dmaengine.h b/include/linux/dmaengine.h index adb0b084eb5..64dea2ab326 100644 --- a/include/linux/dmaengine.h +++ b/include/linux/dmaengine.h @@ -29,32 +29,6 @@ #include <linux/dma-mapping.h> /** - * enum dma_state - resource PNP/power management state - * @DMA_RESOURCE_SUSPEND: DMA device going into low power state - * @DMA_RESOURCE_RESUME: DMA device returning to full power - * @DMA_RESOURCE_AVAILABLE: DMA device available to the system - * @DMA_RESOURCE_REMOVED: DMA device removed from the system - */ -enum dma_state { - DMA_RESOURCE_SUSPEND, - DMA_RESOURCE_RESUME, - DMA_RESOURCE_AVAILABLE, - DMA_RESOURCE_REMOVED, -}; - -/** - * enum dma_state_client - state of the channel in the client - * @DMA_ACK: client would like to use, or was using this channel - * @DMA_DUP: client has already seen this channel, or is not using this channel - * @DMA_NAK: client does not want to see any more channels - */ -enum dma_state_client { - DMA_ACK, - DMA_DUP, - DMA_NAK, -}; - -/** * typedef dma_cookie_t - an opaque DMA cookie * * if dma_cookie_t is >0 it's a DMA request cookie, <0 it's an error code @@ -89,23 +63,13 @@ enum dma_transaction_type { DMA_MEMSET, DMA_MEMCPY_CRC32C, DMA_INTERRUPT, + DMA_PRIVATE, DMA_SLAVE, }; /* last transaction type for creation of the capabilities mask */ #define DMA_TX_TYPE_END (DMA_SLAVE + 1) -/** - * enum dma_slave_width - DMA slave register access width. - * @DMA_SLAVE_WIDTH_8BIT: Do 8-bit slave register accesses - * @DMA_SLAVE_WIDTH_16BIT: Do 16-bit slave register accesses - * @DMA_SLAVE_WIDTH_32BIT: Do 32-bit slave register accesses - */ -enum dma_slave_width { - DMA_SLAVE_WIDTH_8BIT, - DMA_SLAVE_WIDTH_16BIT, - DMA_SLAVE_WIDTH_32BIT, -}; /** * enum dma_ctrl_flags - DMA flags to augment operation preparation, @@ -132,32 +96,6 @@ enum dma_ctrl_flags { typedef struct { DECLARE_BITMAP(bits, DMA_TX_TYPE_END); } dma_cap_mask_t; /** - * struct dma_slave - Information about a DMA slave - * @dev: device acting as DMA slave - * @dma_dev: required DMA master device. 
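For the backlight.h hunk above, a minimal sketch of a driver that lets the core manage suspend/resume and fb-blank state via the new options/state bits. The driver name and the hardware write are hypothetical placeholders, not part of this patch.

#include <linux/backlight.h>
#include <linux/fb.h>

static int mybl_write_hw(int level)
{
	/* hypothetical hardware hook */
	return 0;
}

static int mybl_update_status(struct backlight_device *bd)
{
	int level = bd->props.brightness;

	/* the core sets BL_CORE_SUSPENDED / BL_CORE_FBBLANK in props.state
	 * when BL_CORE_SUSPENDRESUME is requested in the ops' options field */
	if (bd->props.power != FB_BLANK_UNBLANK ||
	    bd->props.state & (BL_CORE_SUSPENDED | BL_CORE_FBBLANK))
		level = 0;

	return mybl_write_hw(level);
}

static struct backlight_ops mybl_ops = {
	.options	= BL_CORE_SUSPENDRESUME,
	.update_status	= mybl_update_status,
};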
If non-NULL, the client can not be - * bound to other masters than this. - * @tx_reg: physical address of data register used for - * memory-to-peripheral transfers - * @rx_reg: physical address of data register used for - * peripheral-to-memory transfers - * @reg_width: peripheral register width - * - * If dma_dev is non-NULL, the client can not be bound to other DMA - * masters than the one corresponding to this device. The DMA master - * driver may use this to determine if there is controller-specific - * data wrapped around this struct. Drivers of platform code that sets - * the dma_dev field must therefore make sure to use an appropriate - * controller-specific dma slave structure wrapping this struct. - */ -struct dma_slave { - struct device *dev; - struct device *dma_dev; - dma_addr_t tx_reg; - dma_addr_t rx_reg; - enum dma_slave_width reg_width; -}; - -/** * struct dma_chan_percpu - the per-CPU part of struct dma_chan * @refcount: local_t used for open-coded "bigref" counting * @memcpy_count: transaction counter @@ -165,7 +103,6 @@ struct dma_slave { */ struct dma_chan_percpu { - local_t refcount; /* stats */ unsigned long memcpy_count; unsigned long bytes_transferred; @@ -176,13 +113,14 @@ struct dma_chan_percpu { * @device: ptr to the dma device who supplies this channel, always !%NULL * @cookie: last cookie value returned to client * @chan_id: channel ID for sysfs - * @class_dev: class device for sysfs + * @dev: class device for sysfs * @refcount: kref, used in "bigref" slow-mode * @slow_ref: indicates that the DMA channel is free * @rcu: the DMA channel's RCU head * @device_node: used to add this to the device chan list * @local: per-cpu pointer to a struct dma_chan_percpu * @client-count: how many clients are using this channel + * @table_count: number of appearances in the mem-to-mem allocation table */ struct dma_chan { struct dma_device *device; @@ -190,73 +128,47 @@ struct dma_chan { /* sysfs */ int chan_id; - struct device dev; - - struct kref refcount; - int slow_ref; - struct rcu_head rcu; + struct dma_chan_dev *dev; struct list_head device_node; struct dma_chan_percpu *local; int client_count; + int table_count; }; -#define to_dma_chan(p) container_of(p, struct dma_chan, dev) - -void dma_chan_cleanup(struct kref *kref); - -static inline void dma_chan_get(struct dma_chan *chan) -{ - if (unlikely(chan->slow_ref)) - kref_get(&chan->refcount); - else { - local_inc(&(per_cpu_ptr(chan->local, get_cpu())->refcount)); - put_cpu(); - } -} +/** + * struct dma_chan_dev - relate sysfs device node to backing channel device + * @chan - driver channel device + * @device - sysfs device + * @dev_id - parent dma_device dev_id + * @idr_ref - reference count to gate release of dma_device dev_id + */ +struct dma_chan_dev { + struct dma_chan *chan; + struct device device; + int dev_id; + atomic_t *idr_ref; +}; -static inline void dma_chan_put(struct dma_chan *chan) +static inline const char *dma_chan_name(struct dma_chan *chan) { - if (unlikely(chan->slow_ref)) - kref_put(&chan->refcount, dma_chan_cleanup); - else { - local_dec(&(per_cpu_ptr(chan->local, get_cpu())->refcount)); - put_cpu(); - } + return dev_name(&chan->dev->device); } -/* - * typedef dma_event_callback - function pointer to a DMA event callback - * For each channel added to the system this routine is called for each client. - * If the client would like to use the channel it returns '1' to signal (ack) - * the dmaengine core to take out a reference on the channel and its - * corresponding device. 
A client must not 'ack' an available channel more - * than once. When a channel is removed all clients are notified. If a client - * is using the channel it must 'ack' the removal. A client must not 'ack' a - * removed channel more than once. - * @client - 'this' pointer for the client context - * @chan - channel to be acted upon - * @state - available or removed - */ -struct dma_client; -typedef enum dma_state_client (*dma_event_callback) (struct dma_client *client, - struct dma_chan *chan, enum dma_state state); +void dma_chan_cleanup(struct kref *kref); /** - * struct dma_client - info on the entity making use of DMA services - * @event_callback: func ptr to call when something happens - * @cap_mask: only return channels that satisfy the requested capabilities - * a value of zero corresponds to any capability - * @slave: data for preparing slave transfer. Must be non-NULL iff the - * DMA_SLAVE capability is requested. - * @global_node: list_head for global dma_client_list + * typedef dma_filter_fn - callback filter for dma_request_channel + * @chan: channel to be reviewed + * @filter_param: opaque parameter passed through dma_request_channel + * + * When this optional parameter is specified in a call to dma_request_channel a + * suitable channel is passed to this routine for further dispositioning before + * being returned. Where 'suitable' indicates a non-busy channel that + * satisfies the given capability mask. It returns 'true' to indicate that the + * channel is suitable. */ -struct dma_client { - dma_event_callback event_callback; - dma_cap_mask_t cap_mask; - struct dma_slave *slave; - struct list_head global_node; -}; +typedef bool (*dma_filter_fn)(struct dma_chan *chan, void *filter_param); typedef void (*dma_async_tx_callback)(void *dma_async_param); /** @@ -323,14 +235,10 @@ struct dma_device { dma_cap_mask_t cap_mask; int max_xor; - struct kref refcount; - struct completion done; - int dev_id; struct device *dev; - int (*device_alloc_chan_resources)(struct dma_chan *chan, - struct dma_client *client); + int (*device_alloc_chan_resources)(struct dma_chan *chan); void (*device_free_chan_resources)(struct dma_chan *chan); struct dma_async_tx_descriptor *(*device_prep_dma_memcpy)( @@ -362,9 +270,8 @@ struct dma_device { /* --- public DMA engine API --- */ -void dma_async_client_register(struct dma_client *client); -void dma_async_client_unregister(struct dma_client *client); -void dma_async_client_chan_request(struct dma_client *client); +void dmaengine_get(void); +void dmaengine_put(void); dma_cookie_t dma_async_memcpy_buf_to_buf(struct dma_chan *chan, void *dest, void *src, size_t len); dma_cookie_t dma_async_memcpy_buf_to_pg(struct dma_chan *chan, @@ -406,6 +313,12 @@ __dma_cap_set(enum dma_transaction_type tx_type, dma_cap_mask_t *dstp) set_bit(tx_type, dstp->bits); } +#define dma_cap_zero(mask) __dma_cap_zero(&(mask)) +static inline void __dma_cap_zero(dma_cap_mask_t *dstp) +{ + bitmap_zero(dstp->bits, DMA_TX_TYPE_END); +} + #define dma_has_cap(tx, mask) __dma_has_cap((tx), &(mask)) static inline int __dma_has_cap(enum dma_transaction_type tx_type, dma_cap_mask_t *srcp) @@ -475,11 +388,25 @@ static inline enum dma_status dma_async_is_complete(dma_cookie_t cookie, } enum dma_status dma_sync_wait(struct dma_chan *chan, dma_cookie_t cookie); +#ifdef CONFIG_DMA_ENGINE +enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx); +#else +static inline enum dma_status dma_wait_for_async_tx(struct dma_async_tx_descriptor *tx) +{ + return DMA_SUCCESS; +} +#endif /* --- 
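For the dma_filter_fn interface above, a minimal sketch of a filter callback. The policy shown (matching the channel's parent device against a pointer passed through filter_param) is purely illustrative and not part of this patch.

static bool my_filter(struct dma_chan *chan, void *filter_param)
{
	/* filter_param is passed through unchanged from dma_request_channel();
	 * here it is assumed to identify the DMA controller we want */
	struct device *wanted_dma_dev = filter_param;

	return chan->device->dev == wanted_dma_dev;
}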
DMA device --- */ int dma_async_device_register(struct dma_device *device); void dma_async_device_unregister(struct dma_device *device); +void dma_run_dependencies(struct dma_async_tx_descriptor *tx); +struct dma_chan *dma_find_channel(enum dma_transaction_type tx_type); +void dma_issue_pending_all(void); +#define dma_request_channel(mask, x, y) __dma_request_channel(&(mask), x, y) +struct dma_chan *__dma_request_channel(dma_cap_mask_t *mask, dma_filter_fn fn, void *fn_param); +void dma_release_channel(struct dma_chan *chan); /* --- Helper iov-locking functions --- */ diff --git a/include/linux/dw_dmac.h b/include/linux/dw_dmac.h index 04d217b442b..d797dde247f 100644 --- a/include/linux/dw_dmac.h +++ b/include/linux/dw_dmac.h @@ -22,14 +22,34 @@ struct dw_dma_platform_data { }; /** + * enum dw_dma_slave_width - DMA slave register access width. + * @DMA_SLAVE_WIDTH_8BIT: Do 8-bit slave register accesses + * @DMA_SLAVE_WIDTH_16BIT: Do 16-bit slave register accesses + * @DMA_SLAVE_WIDTH_32BIT: Do 32-bit slave register accesses + */ +enum dw_dma_slave_width { + DW_DMA_SLAVE_WIDTH_8BIT, + DW_DMA_SLAVE_WIDTH_16BIT, + DW_DMA_SLAVE_WIDTH_32BIT, +}; + +/** * struct dw_dma_slave - Controller-specific information about a slave - * @slave: Generic information about the slave - * @ctl_lo: Platform-specific initializer for the CTL_LO register + * + * @dma_dev: required DMA master device + * @tx_reg: physical address of data register used for + * memory-to-peripheral transfers + * @rx_reg: physical address of data register used for + * peripheral-to-memory transfers + * @reg_width: peripheral register width * @cfg_hi: Platform-specific initializer for the CFG_HI register * @cfg_lo: Platform-specific initializer for the CFG_LO register */ struct dw_dma_slave { - struct dma_slave slave; + struct device *dma_dev; + dma_addr_t tx_reg; + dma_addr_t rx_reg; + enum dw_dma_slave_width reg_width; u32 cfg_hi; u32 cfg_lo; }; @@ -54,9 +74,4 @@ struct dw_dma_slave { #define DWC_CFGL_HS_DST_POL (1 << 18) /* dst handshake active low */ #define DWC_CFGL_HS_SRC_POL (1 << 19) /* src handshake active low */ -static inline struct dw_dma_slave *to_dw_dma_slave(struct dma_slave *slave) -{ - return container_of(slave, struct dw_dma_slave, slave); -} - #endif /* DW_DMAC_H */ diff --git a/include/linux/leds-pca9532.h b/include/linux/leds-pca9532.h index 81b4207deb9..96eea90f01a 100644 --- a/include/linux/leds-pca9532.h +++ b/include/linux/leds-pca9532.h @@ -15,6 +15,7 @@ #define __LINUX_PCA9532_H #include <linux/leds.h> +#include <linux/workqueue.h> enum pca9532_state { PCA9532_OFF = 0x0, @@ -31,6 +32,7 @@ struct pca9532_led { struct i2c_client *client; char *name; struct led_classdev ldev; + struct work_struct work; enum pca9532_type type; enum pca9532_state state; }; diff --git a/include/linux/leds.h b/include/linux/leds.h index d3a73f5a48c..24489da701e 100644 --- a/include/linux/leds.h +++ b/include/linux/leds.h @@ -32,7 +32,10 @@ struct led_classdev { int brightness; int flags; + /* Lower 16 bits reflect status */ #define LED_SUSPENDED (1 << 0) + /* Upper 16 bits reflect control information */ +#define LED_CORE_SUSPENDRESUME (1 << 16) /* Set LED brightness level */ /* Must not sleep, use a workqueue if needed */ @@ -62,7 +65,7 @@ struct led_classdev { extern int led_classdev_register(struct device *parent, struct led_classdev *led_cdev); -extern void led_classdev_unregister(struct led_classdev *lcd); +extern void led_classdev_unregister(struct led_classdev *led_cdev); extern void led_classdev_suspend(struct led_classdev 
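For the dmaengine.h channel-allocation interfaces above, a sketch of how a consumer might allocate and later release a private channel; my_filter is the illustrative filter shown earlier, and the wrapper function is an assumption, not part of this patch.

static int my_grab_channel(struct device *wanted_dma_dev)
{
	dma_cap_mask_t mask;
	struct dma_chan *chan;

	dma_cap_zero(mask);
	dma_cap_set(DMA_SLAVE, mask);

	chan = dma_request_channel(mask, my_filter, wanted_dma_dev);
	if (!chan)
		return -ENODEV;		/* no free channel satisfied mask + filter */

	/* ... use 'chan' exclusively, then hand it back ... */
	dma_release_channel(chan);
	return 0;
}

Opportunistic mem-to-mem users, by contrast, bracket their usage with dmaengine_get()/dmaengine_put() and pick a channel per operation via dma_find_channel().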
*led_cdev); extern void led_classdev_resume(struct led_classdev *led_cdev); diff --git a/include/linux/mfd/wm8350/pmic.h b/include/linux/mfd/wm8350/pmic.h index 96acbfc8aa1..be3264e286e 100644 --- a/include/linux/mfd/wm8350/pmic.h +++ b/include/linux/mfd/wm8350/pmic.h @@ -13,6 +13,10 @@ #ifndef __LINUX_MFD_WM8350_PMIC_H #define __LINUX_MFD_WM8350_PMIC_H +#include <linux/platform_device.h> +#include <linux/leds.h> +#include <linux/regulator/machine.h> + /* * Register values. */ @@ -700,6 +704,33 @@ struct wm8350; struct platform_device; struct regulator_init_data; +/* + * WM8350 LED platform data + */ +struct wm8350_led_platform_data { + const char *name; + const char *default_trigger; + int max_uA; +}; + +struct wm8350_led { + struct platform_device *pdev; + struct mutex mutex; + struct work_struct work; + spinlock_t value_lock; + enum led_brightness value; + struct led_classdev cdev; + int max_uA_index; + int enabled; + + struct regulator *isink; + struct regulator_consumer_supply isink_consumer; + struct regulator_init_data isink_init; + struct regulator *dcdc; + struct regulator_consumer_supply dcdc_consumer; + struct regulator_init_data dcdc_init; +}; + struct wm8350_pmic { /* Number of regulators of each type on this device */ int max_dcdc; @@ -717,10 +748,15 @@ struct wm8350_pmic { /* regulator devices */ struct platform_device *pdev[NUM_WM8350_REGULATORS]; + + /* LED devices */ + struct wm8350_led led[2]; }; int wm8350_register_regulator(struct wm8350 *wm8350, int reg, struct regulator_init_data *initdata); +int wm8350_register_led(struct wm8350 *wm8350, int lednum, int dcdc, int isink, + struct wm8350_led_platform_data *pdata); /* * Additional DCDC control not supported via regulator API diff --git a/include/linux/mtd/cfi.h b/include/linux/mtd/cfi.h index 00e2b575021..88d3d8fbf9f 100644 --- a/include/linux/mtd/cfi.h +++ b/include/linux/mtd/cfi.h @@ -520,6 +520,7 @@ struct cfi_fixup { #define CFI_MFR_AMD 0x0001 #define CFI_MFR_ATMEL 0x001F +#define CFI_MFR_SAMSUNG 0x00EC #define CFI_MFR_ST 0x0020 /* STMicroelectronics */ void cfi_fixup(struct mtd_info *mtd, struct cfi_fixup* fixups); diff --git a/include/linux/mtd/ftl.h b/include/linux/mtd/ftl.h index 0be442f881d..0555f7a0b9e 100644 --- a/include/linux/mtd/ftl.h +++ b/include/linux/mtd/ftl.h @@ -32,25 +32,25 @@ #define _LINUX_FTL_H typedef struct erase_unit_header_t { - u_int8_t LinkTargetTuple[5]; - u_int8_t DataOrgTuple[10]; - u_int8_t NumTransferUnits; - u_int32_t EraseCount; - u_int16_t LogicalEUN; - u_int8_t BlockSize; - u_int8_t EraseUnitSize; - u_int16_t FirstPhysicalEUN; - u_int16_t NumEraseUnits; - u_int32_t FormattedSize; - u_int32_t FirstVMAddress; - u_int16_t NumVMPages; - u_int8_t Flags; - u_int8_t Code; - u_int32_t SerialNumber; - u_int32_t AltEUHOffset; - u_int32_t BAMOffset; - u_int8_t Reserved[12]; - u_int8_t EndTuple[2]; + uint8_t LinkTargetTuple[5]; + uint8_t DataOrgTuple[10]; + uint8_t NumTransferUnits; + uint32_t EraseCount; + uint16_t LogicalEUN; + uint8_t BlockSize; + uint8_t EraseUnitSize; + uint16_t FirstPhysicalEUN; + uint16_t NumEraseUnits; + uint32_t FormattedSize; + uint32_t FirstVMAddress; + uint16_t NumVMPages; + uint8_t Flags; + uint8_t Code; + uint32_t SerialNumber; + uint32_t AltEUHOffset; + uint32_t BAMOffset; + uint8_t Reserved[12]; + uint8_t EndTuple[2]; } erase_unit_header_t; /* Flags in erase_unit_header_t */ diff --git a/include/linux/mtd/map.h b/include/linux/mtd/map.h index aa30244492c..b981b877221 100644 --- a/include/linux/mtd/map.h +++ b/include/linux/mtd/map.h @@ -223,6 +223,7 @@ struct 
map_info { must leave it enabled. */ void (*set_vpp)(struct map_info *, int); + unsigned long pfow_base; unsigned long map_priv_1; unsigned long map_priv_2; void *fldrv_priv; diff --git a/include/linux/mtd/mtd.h b/include/linux/mtd/mtd.h index 64433eb411d..3aa5d77c2cd 100644 --- a/include/linux/mtd/mtd.h +++ b/include/linux/mtd/mtd.h @@ -15,6 +15,8 @@ #include <linux/mtd/compatmac.h> #include <mtd/mtd-abi.h> +#include <asm/div64.h> + #define MTD_CHAR_MAJOR 90 #define MTD_BLOCK_MAJOR 31 #define MAX_MTD_DEVICES 32 @@ -25,20 +27,20 @@ #define MTD_ERASE_DONE 0x08 #define MTD_ERASE_FAILED 0x10 -#define MTD_FAIL_ADDR_UNKNOWN 0xffffffff +#define MTD_FAIL_ADDR_UNKNOWN -1LL /* If the erase fails, fail_addr might indicate exactly which block failed. If fail_addr = MTD_FAIL_ADDR_UNKNOWN, the failure was not at the device level or was not specific to any particular block. */ struct erase_info { struct mtd_info *mtd; - u_int32_t addr; - u_int32_t len; - u_int32_t fail_addr; + uint64_t addr; + uint64_t len; + uint64_t fail_addr; u_long time; u_long retries; - u_int dev; - u_int cell; + unsigned dev; + unsigned cell; void (*callback) (struct erase_info *self); u_long priv; u_char state; @@ -46,9 +48,9 @@ struct erase_info { }; struct mtd_erase_region_info { - u_int32_t offset; /* At which this region starts, from the beginning of the MTD */ - u_int32_t erasesize; /* For this region */ - u_int32_t numblocks; /* Number of blocks of erasesize in this region */ + uint64_t offset; /* At which this region starts, from the beginning of the MTD */ + uint32_t erasesize; /* For this region */ + uint32_t numblocks; /* Number of blocks of erasesize in this region */ unsigned long *lockmap; /* If keeping bitmap of locks */ }; @@ -100,14 +102,14 @@ struct mtd_oob_ops { struct mtd_info { u_char type; - u_int32_t flags; - u_int32_t size; // Total size of the MTD + uint32_t flags; + uint64_t size; // Total size of the MTD /* "Major" erase size for the device. Naïve users may take this * to be the only erase size available, or may use the more detailed * information below if they desire */ - u_int32_t erasesize; + uint32_t erasesize; /* Minimal writable flash unit size. In case of NOR flash it is 1 (even * though individual bits can be cleared), in case of NAND flash it is * one NAND page (or half, or one-fourths of it), in case of ECC-ed NOR @@ -115,10 +117,20 @@ struct mtd_info { * Any driver registering a struct mtd_info must ensure a writesize of * 1 or larger. */ - u_int32_t writesize; + uint32_t writesize; + + uint32_t oobsize; // Amount of OOB data per block (e.g. 16) + uint32_t oobavail; // Available OOB bytes per block - u_int32_t oobsize; // Amount of OOB data per block (e.g. 16) - u_int32_t oobavail; // Available OOB bytes per block + /* + * If erasesize is a power of 2 then the shift is stored in + * erasesize_shift otherwise erasesize_shift is zero. Ditto writesize. + */ + unsigned int erasesize_shift; + unsigned int writesize_shift; + /* Masks based on erasesize_shift and writesize_shift */ + unsigned int erasesize_mask; + unsigned int writesize_mask; // Kernel-only stuff starts here. 
const char *name; @@ -190,8 +202,8 @@ struct mtd_info { void (*sync) (struct mtd_info *mtd); /* Chip-supported device locking */ - int (*lock) (struct mtd_info *mtd, loff_t ofs, size_t len); - int (*unlock) (struct mtd_info *mtd, loff_t ofs, size_t len); + int (*lock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); + int (*unlock) (struct mtd_info *mtd, loff_t ofs, uint64_t len); /* Power Management functions */ int (*suspend) (struct mtd_info *mtd); @@ -221,6 +233,35 @@ struct mtd_info { void (*put_device) (struct mtd_info *mtd); }; +static inline uint32_t mtd_div_by_eb(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->erasesize_shift) + return sz >> mtd->erasesize_shift; + do_div(sz, mtd->erasesize); + return sz; +} + +static inline uint32_t mtd_mod_by_eb(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->erasesize_shift) + return sz & mtd->erasesize_mask; + return do_div(sz, mtd->erasesize); +} + +static inline uint32_t mtd_div_by_ws(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->writesize_shift) + return sz >> mtd->writesize_shift; + do_div(sz, mtd->writesize); + return sz; +} + +static inline uint32_t mtd_mod_by_ws(uint64_t sz, struct mtd_info *mtd) +{ + if (mtd->writesize_shift) + return sz & mtd->writesize_mask; + return do_div(sz, mtd->writesize); +} /* Kernel-side ioctl definitions */ diff --git a/include/linux/mtd/nand.h b/include/linux/mtd/nand.h index 733d3f3b4eb..db5b63da2a7 100644 --- a/include/linux/mtd/nand.h +++ b/include/linux/mtd/nand.h @@ -335,17 +335,12 @@ struct nand_buffers { * @erase_cmd: [INTERN] erase command write function, selectable due to AND support * @scan_bbt: [REPLACEABLE] function to scan bad block table * @chip_delay: [BOARDSPECIFIC] chip dependent delay for transfering data from array to read regs (tR) - * @wq: [INTERN] wait queue to sleep on if a NAND operation is in progress * @state: [INTERN] the current state of the NAND device * @oob_poi: poison value buffer * @page_shift: [INTERN] number of address bits in a page (column address bits) * @phys_erase_shift: [INTERN] number of address bits in a physical eraseblock * @bbt_erase_shift: [INTERN] number of address bits in a bbt entry * @chip_shift: [INTERN] number of address bits in one chip - * @datbuf: [INTERN] internal buffer for one page + oob - * @oobbuf: [INTERN] oob buffer for one eraseblock - * @oobdirty: [INTERN] indicates that oob_buf must be reinitialized - * @data_poi: [INTERN] pointer to a data buffer * @options: [BOARDSPECIFIC] various chip options. They can partly be set to inform nand_scan about * special functionality. 
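For the mtd.h helpers above, a brief sketch of how a caller might use them to validate an erase request against the now 64-bit mtd->size; the function name is illustrative only.

static int my_erase_sanity_check(struct mtd_info *mtd, loff_t ofs, uint64_t len)
{
	if (ofs < 0 || ofs > mtd->size || len > mtd->size - ofs)
		return -EINVAL;

	/* offset and length must both be erase-block aligned */
	if (mtd_mod_by_eb(ofs, mtd) || mtd_mod_by_eb(len, mtd))
		return -EINVAL;

	/* number of whole eraseblocks covered by the request */
	return mtd_div_by_eb(len, mtd);
}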
See the defines for further explanation * @badblockpos: [INTERN] position of the bad block marker in the oob area @@ -399,7 +394,7 @@ struct nand_chip { int bbt_erase_shift; int chip_shift; int numchips; - unsigned long chipsize; + uint64_t chipsize; int pagemask; int pagebuf; int subpagesize; diff --git a/include/linux/mtd/partitions.h b/include/linux/mtd/partitions.h index c92b4d43960..a45dd831b3f 100644 --- a/include/linux/mtd/partitions.h +++ b/include/linux/mtd/partitions.h @@ -36,9 +36,9 @@ struct mtd_partition { char *name; /* identifier string */ - u_int32_t size; /* partition size */ - u_int32_t offset; /* offset within the master MTD space */ - u_int32_t mask_flags; /* master MTD flags to mask out for this partition */ + uint64_t size; /* partition size */ + uint64_t offset; /* offset within the master MTD space */ + uint32_t mask_flags; /* master MTD flags to mask out for this partition */ struct nand_ecclayout *ecclayout; /* out of band layout for this partition (NAND only)*/ struct mtd_info **mtdp; /* pointer to store the MTD object */ }; diff --git a/include/linux/mtd/pfow.h b/include/linux/mtd/pfow.h new file mode 100644 index 00000000000..b730d4f8465 --- /dev/null +++ b/include/linux/mtd/pfow.h @@ -0,0 +1,159 @@ +/* Primary function overlay window definitions + * and service functions used by LPDDR chips + */ +#ifndef __LINUX_MTD_PFOW_H +#define __LINUX_MTD_PFOW_H + +#include <linux/mtd/qinfo.h> + +/* PFOW registers addressing */ +/* Address of symbol "P" */ +#define PFOW_QUERY_STRING_P 0x0000 +/* Address of symbol "F" */ +#define PFOW_QUERY_STRING_F 0x0002 +/* Address of symbol "O" */ +#define PFOW_QUERY_STRING_O 0x0004 +/* Address of symbol "W" */ +#define PFOW_QUERY_STRING_W 0x0006 +/* Identification info for LPDDR chip */ +#define PFOW_MANUFACTURER_ID 0x0020 +#define PFOW_DEVICE_ID 0x0022 +/* Address in PFOW where prog buffer can be found */ +#define PFOW_PROGRAM_BUFFER_OFFSET 0x0040 +/* Size of program buffer in words */ +#define PFOW_PROGRAM_BUFFER_SIZE 0x0042 +/* Address command code register */ +#define PFOW_COMMAND_CODE 0x0080 +/* command data register */ +#define PFOW_COMMAND_DATA 0x0084 +/* command address register lower address bits */ +#define PFOW_COMMAND_ADDRESS_L 0x0088 +/* command address register upper address bits */ +#define PFOW_COMMAND_ADDRESS_H 0x008a +/* number of bytes to be programmed lower address bits */ +#define PFOW_DATA_COUNT_L 0x0090 +/* number of bytes to be programmed higher address bits */ +#define PFOW_DATA_COUNT_H 0x0092 +/* command execution register, the only possible value is 0x01 */ +#define PFOW_COMMAND_EXECUTE 0x00c0 +/* 0x01 should be written at this address to clear buffer */ +#define PFOW_CLEAR_PROGRAM_BUFFER 0x00c4 +/* device program/erase suspend register */ +#define PFOW_PROGRAM_ERASE_SUSPEND 0x00c8 +/* device status register */ +#define PFOW_DSR 0x00cc + +/* LPDDR memory device command codes */ +/* They are possible values of PFOW command code register */ +#define LPDDR_WORD_PROGRAM 0x0041 +#define LPDDR_BUFF_PROGRAM 0x00E9 +#define LPDDR_BLOCK_ERASE 0x0020 +#define LPDDR_LOCK_BLOCK 0x0061 +#define LPDDR_UNLOCK_BLOCK 0x0062 +#define LPDDR_READ_BLOCK_LOCK_STATUS 0x0065 +#define LPDDR_INFO_QUERY 0x0098 +#define LPDDR_READ_OTP 0x0097 +#define LPDDR_PROG_OTP 0x00C0 +#define LPDDR_RESUME 0x00D0 + +/* Defines possible value of PFOW command execution register */ +#define LPDDR_START_EXECUTION 0x0001 + +/* Defines possible value of PFOW program/erase suspend register */ +#define LPDDR_SUSPEND 0x0001 + +/* Possible values of
PFOW device status register */ +/* access R - read; RC read & clearable */ +#define DSR_DPS (1<<1) /* RC; device protect status + * 0 - not protected 1 - locked */ +#define DSR_PSS (1<<2) /* R; program suspend status; + * 0-prog in progress/completed, + * 1- prog suspended */ +#define DSR_VPPS (1<<3) /* RC; 0-Vpp OK, * 1-Vpp low */ +#define DSR_PROGRAM_STATUS (1<<4) /* RC; 0-successful, 1-error */ +#define DSR_ERASE_STATUS (1<<5) /* RC; erase or blank check status; + * 0-success erase/blank check, + * 1 blank check error */ +#define DSR_ESS (1<<6) /* R; erase suspend status; + * 0-erase in progress/complete, + * 1 erase suspended */ +#define DSR_READY_STATUS (1<<7) /* R; Device status + * 0-busy, + * 1-ready */ +#define DSR_RPS (0x3<<8) /* RC; region program status + * 00 - Success, + * 01-re-program attempt in region with + * object mode data, + * 10-object mode program w attempt in + * region with control mode data + * 11-attempt to program invalid half + * with 0x41 command */ +#define DSR_AOS (1<<12) /* RC; 1- AO related failure */ +#define DSR_AVAILABLE (1<<15) /* R; Device availbility + * 1 - Device available + * 0 - not available */ + +/* The superset of all possible error bits in DSR */ +#define DSR_ERR 0x133A + +static inline void send_pfow_command(struct map_info *map, + unsigned long cmd_code, unsigned long adr, + unsigned long len, map_word *datum) +{ + int bits_per_chip = map_bankwidth(map) * 8; + int chipnum; + struct lpddr_private *lpddr = map->fldrv_priv; + chipnum = adr >> lpddr->chipshift; + + map_write(map, CMD(cmd_code), map->pfow_base + PFOW_COMMAND_CODE); + map_write(map, CMD(adr & ((1<<bits_per_chip) - 1)), + map->pfow_base + PFOW_COMMAND_ADDRESS_L); + map_write(map, CMD(adr>>bits_per_chip), + map->pfow_base + PFOW_COMMAND_ADDRESS_H); + if (len) { + map_write(map, CMD(len & ((1<<bits_per_chip) - 1)), + map->pfow_base + PFOW_DATA_COUNT_L); + map_write(map, CMD(len>>bits_per_chip), + map->pfow_base + PFOW_DATA_COUNT_H); + } + if (datum) + map_write(map, *datum, map->pfow_base + PFOW_COMMAND_DATA); + + /* Command execution start */ + map_write(map, CMD(LPDDR_START_EXECUTION), + map->pfow_base + PFOW_COMMAND_EXECUTE); +} + +static inline void print_drs_error(unsigned dsr) +{ + int prog_status = (dsr & DSR_RPS) >> 8; + + if (!(dsr & DSR_AVAILABLE)) + printk(KERN_NOTICE"DSR.15: (0) Device not Available\n"); + if (prog_status & 0x03) + printk(KERN_NOTICE"DSR.9,8: (11) Attempt to program invalid " + "half with 41h command\n"); + else if (prog_status & 0x02) + printk(KERN_NOTICE"DSR.9,8: (10) Object Mode Program attempt " + "in region with Control Mode data\n"); + else if (prog_status & 0x01) + printk(KERN_NOTICE"DSR.9,8: (01) Program attempt in region " + "with Object Mode data\n"); + if (!(dsr & DSR_READY_STATUS)) + printk(KERN_NOTICE"DSR.7: (0) Device is Busy\n"); + if (dsr & DSR_ESS) + printk(KERN_NOTICE"DSR.6: (1) Erase Suspended\n"); + if (dsr & DSR_ERASE_STATUS) + printk(KERN_NOTICE"DSR.5: (1) Erase/Blank check error\n"); + if (dsr & DSR_PROGRAM_STATUS) + printk(KERN_NOTICE"DSR.4: (1) Program Error\n"); + if (dsr & DSR_VPPS) + printk(KERN_NOTICE"DSR.3: (1) Vpp low detect, operation " + "aborted\n"); + if (dsr & DSR_PSS) + printk(KERN_NOTICE"DSR.2: (1) Program suspended\n"); + if (dsr & DSR_DPS) + printk(KERN_NOTICE"DSR.1: (1) Aborted Erase/Program attempt " + "on locked block\n"); +} +#endif /* __LINUX_MTD_PFOW_H */ diff --git a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h index c8e63a5ee72..76f7cabf07d 100644 --- a/include/linux/mtd/physmap.h +++ 
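For the pfow.h definitions above, a loose sketch of how they combine to issue an LPDDR block erase and check the result. This is illustration only: the function name is made up, and a real driver would sleep with a timeout rather than spin on DSR.

static void lpddr_erase_block_sketch(struct map_info *map, unsigned long adr)
{
	map_word dsr;

	/* queue the erase and start execution via the PFOW window */
	send_pfow_command(map, LPDDR_BLOCK_ERASE, adr, 0, NULL);

	/* poll DSR.7 until the device reports ready */
	do {
		dsr = map_read(map, map->pfow_base + PFOW_DSR);
	} while (!(CMDVAL(dsr) & DSR_READY_STATUS));

	if (CMDVAL(dsr) & DSR_ERR)
		print_drs_error(CMDVAL(dsr));
}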
b/include/linux/mtd/physmap.h @@ -24,6 +24,7 @@ struct physmap_flash_data { unsigned int width; void (*set_vpp)(struct map_info *, int); unsigned int nr_parts; + unsigned int pfow_base; struct mtd_partition *parts; }; diff --git a/include/linux/mtd/qinfo.h b/include/linux/mtd/qinfo.h new file mode 100644 index 00000000000..7b3d487d8b3 --- /dev/null +++ b/include/linux/mtd/qinfo.h @@ -0,0 +1,91 @@ +#ifndef __LINUX_MTD_QINFO_H +#define __LINUX_MTD_QINFO_H + +#include <linux/mtd/map.h> +#include <linux/wait.h> +#include <linux/spinlock.h> +#include <linux/delay.h> +#include <linux/mtd/mtd.h> +#include <linux/mtd/flashchip.h> +#include <linux/mtd/partitions.h> + +/* lpddr_private describes lpddr flash chip in memory map * @ManufactId - Chip Manufacture ID * @DevId - Chip Device ID * @qinfo - pointer to qinfo records describing the chip * @numchips - number of chips including virtual RWW partitions * @chipshift - Chip/partition size 2^chipshift * @chips - per-chip data structure */ +struct lpddr_private { + uint16_t ManufactId; + uint16_t DevId; + struct qinfo_chip *qinfo; + int numchips; + unsigned long chipshift; + struct flchip chips[0]; +}; + +/* qinfo_query_info structure contains request information for + * each qinfo record + * @major - major number of qinfo record + * @minor - minor number of qinfo record + * @id_str - descriptive string to access the record + * @desc - detailed description for the qinfo record + */ +struct qinfo_query_info { + uint8_t major; + uint8_t minor; + char *id_str; + char *desc; +}; + +/* + * qinfo_chip structure contains necessary qinfo records data + * @DevSizeShift - Device size 2^n bytes + * @BufSizeShift - Program buffer size 2^n bytes + * @TotalBlocksNum - Total number of blocks + * @UniformBlockSizeShift - Uniform block size 2^UniformBlockSizeShift bytes + * @HWPartsNum - Number of hardware partitions + * @SuspEraseSupp - Suspend erase supported + * @SingleWordProgTime - Single word program 2^SingleWordProgTime u-sec + * @ProgBufferTime - Program buffer write 2^ProgBufferTime u-sec + * @BlockEraseTime - Block erase 2^BlockEraseTime m-sec + */ +struct qinfo_chip { + /* General device info */ + uint16_t DevSizeShift; + uint16_t BufSizeShift; + /* Erase block information */ + uint16_t TotalBlocksNum; + uint16_t UniformBlockSizeShift; + /* Partition information */ + uint16_t HWPartsNum; + /* Optional features */ + uint16_t SuspEraseSupp; + /* Operation typical time */ + uint16_t SingleWordProgTime; + uint16_t ProgBufferTime; + uint16_t BlockEraseTime; +}; + +/* defines for fixup usage */ +#define LPDDR_MFR_ANY 0xffff +#define LPDDR_ID_ANY 0xffff +#define NUMONYX_MFGR_ID 0x0089 +#define R18_DEVICE_ID_1G 0x893c + +static inline map_word lpddr_build_cmd(u_long cmd, struct map_info *map) +{ + map_word val = { {0} }; + val.x[0] = cmd; + return val; +} + +#define CMD(x) lpddr_build_cmd(x, map) +#define CMDVAL(cmd) cmd.x[0] + +struct mtd_info *lpddr_cmdset(struct map_info *); + +#endif + diff --git a/include/linux/mtd/sharpsl.h b/include/linux/mtd/sharpsl.h new file mode 100644 index 00000000000..25f4d2a845c --- /dev/null +++ b/include/linux/mtd/sharpsl.h @@ -0,0 +1,20 @@ +/* + * SharpSL NAND support + * + * Copyright (C) 2008 Dmitry Baryshkov + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation.
+ */ + +#include <linux/mtd/nand.h> +#include <linux/mtd/nand_ecc.h> +#include <linux/mtd/partitions.h> + +struct sharpsl_nand_platform_data { + struct nand_bbt_descr *badblock_pattern; + struct nand_ecclayout *ecc_layout; + struct mtd_partition *partitions; + unsigned int nr_partitions; +}; diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 114091be887..f2455681337 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1125,9 +1125,6 @@ struct softnet_data struct sk_buff *completion_queue; struct napi_struct backlog; -#ifdef CONFIG_NET_DMA - struct dma_chan *net_dma; -#endif }; DECLARE_PER_CPU(struct softnet_data,softnet_data); diff --git a/include/linux/oprofile.h b/include/linux/oprofile.h index 1ce9fe572e5..1d9518bc4c5 100644 --- a/include/linux/oprofile.h +++ b/include/linux/oprofile.h @@ -164,4 +164,22 @@ void oprofile_put_buff(unsigned long *buf, unsigned int start, unsigned long oprofile_get_cpu_buffer_size(void); void oprofile_cpu_buffer_inc_smpl_lost(void); +/* cpu buffer functions */ + +struct op_sample; + +struct op_entry { + struct ring_buffer_event *event; + struct op_sample *sample; + unsigned long irq_flags; + unsigned long size; + unsigned long *data; +}; + +void oprofile_write_reserve(struct op_entry *entry, + struct pt_regs * const regs, + unsigned long pc, int code, int size); +int oprofile_add_data(struct op_entry *entry, unsigned long val); +int oprofile_write_commit(struct op_entry *entry); + #endif /* OPROFILE_H */ diff --git a/include/linux/pci_hotplug.h b/include/linux/pci_hotplug.h index f7cc204fab0..20998746518 100644 --- a/include/linux/pci_hotplug.h +++ b/include/linux/pci_hotplug.h @@ -223,7 +223,6 @@ struct hotplug_params { #ifdef CONFIG_ACPI #include <acpi/acpi.h> #include <acpi/acpi_bus.h> -#include <acpi/actypes.h> extern acpi_status acpi_get_hp_params_from_firmware(struct pci_bus *bus, struct hotplug_params *hpp); int acpi_get_hp_hw_control_from_firmware(struct pci_dev *dev, u32 flags); diff --git a/include/linux/spi/tdo24m.h b/include/linux/spi/tdo24m.h new file mode 100644 index 00000000000..7572d4e1fe7 --- /dev/null +++ b/include/linux/spi/tdo24m.h @@ -0,0 +1,13 @@ +#ifndef __TDO24M_H__ +#define __TDO24M_H__ + +enum tdo24m_model { + TDO24M, + TDO35S, +}; + +struct tdo24m_platform_data { + enum tdo24m_model model; +}; + +#endif /* __TDO24M_H__ */ diff --git a/include/linux/suspend.h b/include/linux/suspend.h index 2ce8207686e..2b409c44db8 100644 --- a/include/linux/suspend.h +++ b/include/linux/suspend.h @@ -232,6 +232,11 @@ extern unsigned long get_safe_page(gfp_t gfp_mask); extern void hibernation_set_ops(struct platform_hibernation_ops *ops); extern int hibernate(void); +extern int hibernate_nvs_register(unsigned long start, unsigned long size); +extern int hibernate_nvs_alloc(void); +extern void hibernate_nvs_free(void); +extern void hibernate_nvs_save(void); +extern void hibernate_nvs_restore(void); #else /* CONFIG_HIBERNATION */ static inline int swsusp_page_is_forbidden(struct page *p) { return 0; } static inline void swsusp_set_page_free(struct page *p) {} @@ -239,6 +244,14 @@ static inline void swsusp_unset_page_free(struct page *p) {} static inline void hibernation_set_ops(struct platform_hibernation_ops *ops) {} static inline int hibernate(void) { return -ENOSYS; } +static inline int hibernate_nvs_register(unsigned long a, unsigned long b) +{ + return 0; +} +static inline int hibernate_nvs_alloc(void) { return 0; } +static inline void hibernate_nvs_free(void) {} +static inline void 
hibernate_nvs_save(void) {} +static inline void hibernate_nvs_restore(void) {} #endif /* CONFIG_HIBERNATION */ #ifdef CONFIG_PM_SLEEP diff --git a/include/net/netdma.h b/include/net/netdma.h index f28c6e064e8..8ba8ce284ee 100644 --- a/include/net/netdma.h +++ b/include/net/netdma.h @@ -24,17 +24,6 @@ #include <linux/dmaengine.h> #include <linux/skbuff.h> -static inline struct dma_chan *get_softnet_dma(void) -{ - struct dma_chan *chan; - rcu_read_lock(); - chan = rcu_dereference(__get_cpu_var(softnet_data).net_dma); - if (chan) - dma_chan_get(chan); - rcu_read_unlock(); - return chan; -} - int dma_skb_copy_datagram_iovec(struct dma_chan* chan, struct sk_buff *skb, int offset, struct iovec *to, size_t len, struct dma_pinned_list *pinned_list); diff --git a/kernel/cred.c b/kernel/cred.c index ff7bc071991..043f78c133c 100644 --- a/kernel/cred.c +++ b/kernel/cred.c @@ -506,6 +506,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) else old = get_cred(&init_cred); + *new = *old; get_uid(new->user); get_group_info(new->group_info); @@ -529,6 +530,7 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) error: put_cred(new); + put_cred(old); return NULL; } EXPORT_SYMBOL(prepare_kernel_cred); diff --git a/kernel/power/disk.c b/kernel/power/disk.c index f77d3819ef5..45e8541ab7e 100644 --- a/kernel/power/disk.c +++ b/kernel/power/disk.c @@ -258,12 +258,12 @@ int hibernation_snapshot(int platform_mode) { int error; - /* Free memory before shutting down devices. */ - error = swsusp_shrink_memory(); + error = platform_begin(platform_mode); if (error) return error; - error = platform_begin(platform_mode); + /* Free memory before shutting down devices. */ + error = swsusp_shrink_memory(); if (error) goto Close; diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c index 5d2ab836e99..f5fc2d7680f 100644 --- a/kernel/power/snapshot.c +++ b/kernel/power/snapshot.c @@ -25,6 +25,7 @@ #include <linux/syscalls.h> #include <linux/console.h> #include <linux/highmem.h> +#include <linux/list.h> #include <asm/uaccess.h> #include <asm/mmu_context.h> @@ -192,12 +193,6 @@ static void *chain_alloc(struct chain_allocator *ca, unsigned int size) return ret; } -static void chain_free(struct chain_allocator *ca, int clear_page_nosave) -{ - free_list_of_pages(ca->chain, clear_page_nosave); - memset(ca, 0, sizeof(struct chain_allocator)); -} - /** * Data types related to memory bitmaps. 
* @@ -233,7 +228,7 @@ static void chain_free(struct chain_allocator *ca, int clear_page_nosave) #define BM_BITS_PER_BLOCK (PAGE_SIZE << 3) struct bm_block { - struct bm_block *next; /* next element of the list */ + struct list_head hook; /* hook into a list of bitmap blocks */ unsigned long start_pfn; /* pfn represented by the first bit */ unsigned long end_pfn; /* pfn represented by the last bit plus 1 */ unsigned long *data; /* bitmap representing pages */ @@ -244,24 +239,15 @@ static inline unsigned long bm_block_bits(struct bm_block *bb) return bb->end_pfn - bb->start_pfn; } -struct zone_bitmap { - struct zone_bitmap *next; /* next element of the list */ - unsigned long start_pfn; /* minimal pfn in this zone */ - unsigned long end_pfn; /* maximal pfn in this zone plus 1 */ - struct bm_block *bm_blocks; /* list of bitmap blocks */ - struct bm_block *cur_block; /* recently used bitmap block */ -}; - /* strcut bm_position is used for browsing memory bitmaps */ struct bm_position { - struct zone_bitmap *zone_bm; struct bm_block *block; int bit; }; struct memory_bitmap { - struct zone_bitmap *zone_bm_list; /* list of zone bitmaps */ + struct list_head blocks; /* list of bitmap blocks */ struct linked_page *p_list; /* list of pages used to store zone * bitmap objects and bitmap block * objects @@ -273,11 +259,7 @@ struct memory_bitmap { static void memory_bm_position_reset(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; - - zone_bm = bm->zone_bm_list; - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; + bm->cur.block = list_entry(bm->blocks.next, struct bm_block, hook); bm->cur.bit = 0; } @@ -285,151 +267,184 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free); /** * create_bm_block_list - create a list of block bitmap objects + * @nr_blocks - number of blocks to allocate + * @list - list to put the allocated blocks into + * @ca - chain allocator to be used for allocating memory */ - -static inline struct bm_block * -create_bm_block_list(unsigned int nr_blocks, struct chain_allocator *ca) +static int create_bm_block_list(unsigned long pages, + struct list_head *list, + struct chain_allocator *ca) { - struct bm_block *bblist = NULL; + unsigned int nr_blocks = DIV_ROUND_UP(pages, BM_BITS_PER_BLOCK); while (nr_blocks-- > 0) { struct bm_block *bb; bb = chain_alloc(ca, sizeof(struct bm_block)); if (!bb) - return NULL; - - bb->next = bblist; - bblist = bb; + return -ENOMEM; + list_add(&bb->hook, list); } - return bblist; + + return 0; } +struct mem_extent { + struct list_head hook; + unsigned long start; + unsigned long end; +}; + /** - * create_zone_bm_list - create a list of zone bitmap objects + * free_mem_extents - free a list of memory extents + * @list - list of extents to empty */ +static void free_mem_extents(struct list_head *list) +{ + struct mem_extent *ext, *aux; -static inline struct zone_bitmap * -create_zone_bm_list(unsigned int nr_zones, struct chain_allocator *ca) + list_for_each_entry_safe(ext, aux, list, hook) { + list_del(&ext->hook); + kfree(ext); + } +} + +/** + * create_mem_extents - create a list of memory extents representing + * contiguous ranges of PFNs + * @list - list to put the extents into + * @gfp_mask - mask to use for memory allocations + */ +static int create_mem_extents(struct list_head *list, gfp_t gfp_mask) { - struct zone_bitmap *zbmlist = NULL; + struct zone *zone; - while (nr_zones-- > 0) { - struct zone_bitmap *zbm; + INIT_LIST_HEAD(list); - zbm = chain_alloc(ca, sizeof(struct zone_bitmap)); - if (!zbm) 
- return NULL; + for_each_zone(zone) { + unsigned long zone_start, zone_end; + struct mem_extent *ext, *cur, *aux; + + if (!populated_zone(zone)) + continue; - zbm->next = zbmlist; - zbmlist = zbm; + zone_start = zone->zone_start_pfn; + zone_end = zone->zone_start_pfn + zone->spanned_pages; + + list_for_each_entry(ext, list, hook) + if (zone_start <= ext->end) + break; + + if (&ext->hook == list || zone_end < ext->start) { + /* New extent is necessary */ + struct mem_extent *new_ext; + + new_ext = kzalloc(sizeof(struct mem_extent), gfp_mask); + if (!new_ext) { + free_mem_extents(list); + return -ENOMEM; + } + new_ext->start = zone_start; + new_ext->end = zone_end; + list_add_tail(&new_ext->hook, &ext->hook); + continue; + } + + /* Merge this zone's range of PFNs with the existing one */ + if (zone_start < ext->start) + ext->start = zone_start; + if (zone_end > ext->end) + ext->end = zone_end; + + /* More merging may be possible */ + cur = ext; + list_for_each_entry_safe_continue(cur, aux, list, hook) { + if (zone_end < cur->start) + break; + if (zone_end < cur->end) + ext->end = cur->end; + list_del(&cur->hook); + kfree(cur); + } } - return zbmlist; + + return 0; } /** * memory_bm_create - allocate memory for a memory bitmap */ - static int memory_bm_create(struct memory_bitmap *bm, gfp_t gfp_mask, int safe_needed) { struct chain_allocator ca; - struct zone *zone; - struct zone_bitmap *zone_bm; - struct bm_block *bb; - unsigned int nr; + struct list_head mem_extents; + struct mem_extent *ext; + int error; chain_init(&ca, gfp_mask, safe_needed); + INIT_LIST_HEAD(&bm->blocks); - /* Compute the number of zones */ - nr = 0; - for_each_zone(zone) - if (populated_zone(zone)) - nr++; - - /* Allocate the list of zones bitmap objects */ - zone_bm = create_zone_bm_list(nr, &ca); - bm->zone_bm_list = zone_bm; - if (!zone_bm) { - chain_free(&ca, PG_UNSAFE_CLEAR); - return -ENOMEM; - } - - /* Initialize the zone bitmap objects */ - for_each_zone(zone) { - unsigned long pfn; + error = create_mem_extents(&mem_extents, gfp_mask); + if (error) + return error; - if (!populated_zone(zone)) - continue; + list_for_each_entry(ext, &mem_extents, hook) { + struct bm_block *bb; + unsigned long pfn = ext->start; + unsigned long pages = ext->end - ext->start; - zone_bm->start_pfn = zone->zone_start_pfn; - zone_bm->end_pfn = zone->zone_start_pfn + zone->spanned_pages; - /* Allocate the list of bitmap block objects */ - nr = DIV_ROUND_UP(zone->spanned_pages, BM_BITS_PER_BLOCK); - bb = create_bm_block_list(nr, &ca); - zone_bm->bm_blocks = bb; - zone_bm->cur_block = bb; - if (!bb) - goto Free; + bb = list_entry(bm->blocks.prev, struct bm_block, hook); - nr = zone->spanned_pages; - pfn = zone->zone_start_pfn; - /* Initialize the bitmap block objects */ - while (bb) { - unsigned long *ptr; + error = create_bm_block_list(pages, bm->blocks.prev, &ca); + if (error) + goto Error; - ptr = get_image_page(gfp_mask, safe_needed); - bb->data = ptr; - if (!ptr) - goto Free; + list_for_each_entry_continue(bb, &bm->blocks, hook) { + bb->data = get_image_page(gfp_mask, safe_needed); + if (!bb->data) { + error = -ENOMEM; + goto Error; + } bb->start_pfn = pfn; - if (nr >= BM_BITS_PER_BLOCK) { + if (pages >= BM_BITS_PER_BLOCK) { pfn += BM_BITS_PER_BLOCK; - nr -= BM_BITS_PER_BLOCK; + pages -= BM_BITS_PER_BLOCK; } else { /* This is executed only once in the loop */ - pfn += nr; + pfn += pages; } bb->end_pfn = pfn; - bb = bb->next; } - zone_bm = zone_bm->next; } + bm->p_list = ca.chain; memory_bm_position_reset(bm); - return 0; + Exit: + 
free_mem_extents(&mem_extents); + return error; - Free: + Error: bm->p_list = ca.chain; memory_bm_free(bm, PG_UNSAFE_CLEAR); - return -ENOMEM; + goto Exit; } /** * memory_bm_free - free memory occupied by the memory bitmap @bm */ - static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) { - struct zone_bitmap *zone_bm; + struct bm_block *bb; - /* Free the list of bit blocks for each zone_bitmap object */ - zone_bm = bm->zone_bm_list; - while (zone_bm) { - struct bm_block *bb; + list_for_each_entry(bb, &bm->blocks, hook) + if (bb->data) + free_image_page(bb->data, clear_nosave_free); - bb = zone_bm->bm_blocks; - while (bb) { - if (bb->data) - free_image_page(bb->data, clear_nosave_free); - bb = bb->next; - } - zone_bm = zone_bm->next; - } free_list_of_pages(bm->p_list, clear_nosave_free); - bm->zone_bm_list = NULL; + + INIT_LIST_HEAD(&bm->blocks); } /** @@ -437,38 +452,33 @@ static void memory_bm_free(struct memory_bitmap *bm, int clear_nosave_free) * to given pfn. The cur_zone_bm member of @bm and the cur_block member * of @bm->cur_zone_bm are updated. */ - static int memory_bm_find_bit(struct memory_bitmap *bm, unsigned long pfn, void **addr, unsigned int *bit_nr) { - struct zone_bitmap *zone_bm; struct bm_block *bb; - /* Check if the pfn is from the current zone */ - zone_bm = bm->cur.zone_bm; - if (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = bm->zone_bm_list; - /* We don't assume that the zones are sorted by pfns */ - while (pfn < zone_bm->start_pfn || pfn >= zone_bm->end_pfn) { - zone_bm = zone_bm->next; - - if (!zone_bm) - return -EFAULT; - } - bm->cur.zone_bm = zone_bm; - } - /* Check if the pfn corresponds to the current bitmap block */ - bb = zone_bm->cur_block; + /* + * Check if the pfn corresponds to the current bitmap block and find + * the block where it fits if this is not the case. + */ + bb = bm->cur.block; if (pfn < bb->start_pfn) - bb = zone_bm->bm_blocks; + list_for_each_entry_continue_reverse(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn) + break; - while (pfn >= bb->end_pfn) { - bb = bb->next; + if (pfn >= bb->end_pfn) + list_for_each_entry_continue(bb, &bm->blocks, hook) + if (pfn >= bb->start_pfn && pfn < bb->end_pfn) + break; - BUG_ON(!bb); - } - zone_bm->cur_block = bb; + if (&bb->hook == &bm->blocks) + return -EFAULT; + + /* The block has been found */ + bm->cur.block = bb; pfn -= bb->start_pfn; + bm->cur.bit = pfn + 1; *bit_nr = pfn; *addr = bb->data; return 0; @@ -519,6 +529,14 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) return test_bit(bit, addr); } +static bool memory_bm_pfn_present(struct memory_bitmap *bm, unsigned long pfn) +{ + void *addr; + unsigned int bit; + + return !memory_bm_find_bit(bm, pfn, &addr, &bit); +} + /** * memory_bm_next_pfn - find the pfn that corresponds to the next set bit * in the bitmap @bm. 
If the pfn cannot be found, BM_END_OF_MAP is @@ -530,29 +548,21 @@ static int memory_bm_test_bit(struct memory_bitmap *bm, unsigned long pfn) static unsigned long memory_bm_next_pfn(struct memory_bitmap *bm) { - struct zone_bitmap *zone_bm; struct bm_block *bb; int bit; + bb = bm->cur.block; do { - bb = bm->cur.block; - do { - bit = bm->cur.bit; - bit = find_next_bit(bb->data, bm_block_bits(bb), bit); - if (bit < bm_block_bits(bb)) - goto Return_pfn; - - bb = bb->next; - bm->cur.block = bb; - bm->cur.bit = 0; - } while (bb); - zone_bm = bm->cur.zone_bm->next; - if (zone_bm) { - bm->cur.zone_bm = zone_bm; - bm->cur.block = zone_bm->bm_blocks; - bm->cur.bit = 0; - } - } while (zone_bm); + bit = bm->cur.bit; + bit = find_next_bit(bb->data, bm_block_bits(bb), bit); + if (bit < bm_block_bits(bb)) + goto Return_pfn; + + bb = list_entry(bb->hook.next, struct bm_block, hook); + bm->cur.block = bb; + bm->cur.bit = 0; + } while (&bb->hook != &bm->blocks); + memory_bm_position_reset(bm); return BM_END_OF_MAP; @@ -808,8 +818,7 @@ static unsigned int count_free_highmem_pages(void) * We should save the page if it isn't Nosave or NosaveFree, or Reserved, * and it isn't a part of a free chunk of pages. */ - -static struct page *saveable_highmem_page(unsigned long pfn) +static struct page *saveable_highmem_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -817,6 +826,8 @@ static struct page *saveable_highmem_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(!PageHighMem(page)); @@ -846,13 +857,16 @@ unsigned int count_highmem_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if (saveable_highmem_page(pfn)) + if (saveable_highmem_page(zone, pfn)) n++; } return n; } #else -static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } +static inline void *saveable_highmem_page(struct zone *z, unsigned long p) +{ + return NULL; +} #endif /* CONFIG_HIGHMEM */ /** @@ -863,8 +877,7 @@ static inline void *saveable_highmem_page(unsigned long pfn) { return NULL; } * of pages statically defined as 'unsaveable', and it isn't a part of * a free chunk of pages. */ - -static struct page *saveable_page(unsigned long pfn) +static struct page *saveable_page(struct zone *zone, unsigned long pfn) { struct page *page; @@ -872,6 +885,8 @@ static struct page *saveable_page(unsigned long pfn) return NULL; page = pfn_to_page(pfn); + if (page_zone(page) != zone) + return NULL; BUG_ON(PageHighMem(page)); @@ -903,7 +918,7 @@ unsigned int count_data_pages(void) mark_free_pages(zone); max_zone_pfn = zone->zone_start_pfn + zone->spanned_pages; for (pfn = zone->zone_start_pfn; pfn < max_zone_pfn; pfn++) - if(saveable_page(pfn)) + if (saveable_page(zone, pfn)) n++; } return n; @@ -944,7 +959,7 @@ static inline struct page * page_is_saveable(struct zone *zone, unsigned long pfn) { return is_highmem(zone) ? 
- saveable_highmem_page(pfn) : saveable_page(pfn); + saveable_highmem_page(zone, pfn) : saveable_page(zone, pfn); } static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) @@ -966,7 +981,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) * data modified by kmap_atomic() */ safe_copy_page(buffer, s_page); - dst = kmap_atomic(pfn_to_page(dst_pfn), KM_USER0); + dst = kmap_atomic(d_page, KM_USER0); memcpy(dst, buffer, PAGE_SIZE); kunmap_atomic(dst, KM_USER0); } else { @@ -975,7 +990,7 @@ static void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) } } #else -#define page_is_saveable(zone, pfn) saveable_page(pfn) +#define page_is_saveable(zone, pfn) saveable_page(zone, pfn) static inline void copy_data_page(unsigned long dst_pfn, unsigned long src_pfn) { @@ -1459,9 +1474,7 @@ load_header(struct swsusp_info *info) * unpack_orig_pfns - for each element of @buf[] (1 page at a time) set * the corresponding bit in the memory bitmap @bm */ - -static inline void -unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) +static int unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) { int j; @@ -1469,8 +1482,13 @@ unpack_orig_pfns(unsigned long *buf, struct memory_bitmap *bm) if (unlikely(buf[j] == BM_END_OF_MAP)) break; - memory_bm_set_bit(bm, buf[j]); + if (memory_bm_pfn_present(bm, buf[j])) + memory_bm_set_bit(bm, buf[j]); + else + return -EFAULT; } + + return 0; } /* List of "safe" pages that may be used to store data loaded from the suspend @@ -1608,7 +1626,7 @@ get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct highmem_pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_page = page; if (safe_highmem_pages > 0) { @@ -1677,7 +1695,7 @@ prepare_highmem_image(struct memory_bitmap *bm, unsigned int *nr_highmem_p) static inline void * get_highmem_page_buffer(struct page *page, struct chain_allocator *ca) { - return NULL; + return ERR_PTR(-EINVAL); } static inline void copy_last_highmem_page(void) {} @@ -1788,8 +1806,13 @@ prepare_image(struct memory_bitmap *new_bm, struct memory_bitmap *bm) static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) { struct pbe *pbe; - struct page *page = pfn_to_page(memory_bm_next_pfn(bm)); + struct page *page; + unsigned long pfn = memory_bm_next_pfn(bm); + if (pfn == BM_END_OF_MAP) + return ERR_PTR(-EFAULT); + + page = pfn_to_page(pfn); if (PageHighMem(page)) return get_highmem_page_buffer(page, ca); @@ -1805,7 +1828,7 @@ static void *get_buffer(struct memory_bitmap *bm, struct chain_allocator *ca) pbe = chain_alloc(ca, sizeof(struct pbe)); if (!pbe) { swsusp_free(); - return NULL; + return ERR_PTR(-ENOMEM); } pbe->orig_address = page_address(page); pbe->address = safe_pages_list; @@ -1868,7 +1891,10 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) return error; } else if (handle->prev <= nr_meta_pages) { - unpack_orig_pfns(buffer, ©_bm); + error = unpack_orig_pfns(buffer, ©_bm); + if (error) + return error; + if (handle->prev == nr_meta_pages) { error = prepare_image(&orig_bm, ©_bm); if (error) @@ -1879,12 +1905,14 @@ int snapshot_write_next(struct snapshot_handle *handle, size_t count) restore_pblist = NULL; handle->buffer = get_buffer(&orig_bm, &ca); handle->sync_read = 0; - if (!handle->buffer) - return -ENOMEM; + if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); } } else { copy_last_highmem_page(); handle->buffer = get_buffer(&orig_bm, &ca); + 
if (IS_ERR(handle->buffer)) + return PTR_ERR(handle->buffer); if (handle->buffer != buffer) handle->sync_read = 0; } diff --git a/kernel/power/swsusp.c b/kernel/power/swsusp.c index 023ff2a31d8..a92c9145155 100644 --- a/kernel/power/swsusp.c +++ b/kernel/power/swsusp.c @@ -262,3 +262,125 @@ int swsusp_shrink_memory(void) return 0; } + +/* + * Platforms, like ACPI, may want us to save some memory used by them during + * hibernation and to restore the contents of this memory during the subsequent + * resume. The code below implements a mechanism allowing us to do that. + */ + +struct nvs_page { + unsigned long phys_start; + unsigned int size; + void *kaddr; + void *data; + struct list_head node; +}; + +static LIST_HEAD(nvs_list); + +/** + * hibernate_nvs_register - register platform NVS memory region to save + * @start - physical address of the region + * @size - size of the region + * + * The NVS region need not be page-aligned (both ends) and we arrange + * things so that the data from page-aligned addresses in this region will + * be copied into separate RAM pages. + */ +int hibernate_nvs_register(unsigned long start, unsigned long size) +{ + struct nvs_page *entry, *next; + + while (size > 0) { + unsigned int nr_bytes; + + entry = kzalloc(sizeof(struct nvs_page), GFP_KERNEL); + if (!entry) + goto Error; + + list_add_tail(&entry->node, &nvs_list); + entry->phys_start = start; + nr_bytes = PAGE_SIZE - (start & ~PAGE_MASK); + entry->size = (size < nr_bytes) ? size : nr_bytes; + + start += entry->size; + size -= entry->size; + } + return 0; + + Error: + list_for_each_entry_safe(entry, next, &nvs_list, node) { + list_del(&entry->node); + kfree(entry); + } + return -ENOMEM; +} + +/** + * hibernate_nvs_free - free data pages allocated for saving NVS regions + */ +void hibernate_nvs_free(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + free_page((unsigned long)entry->data); + entry->data = NULL; + if (entry->kaddr) { + iounmap(entry->kaddr); + entry->kaddr = NULL; + } + } +} + +/** + * hibernate_nvs_alloc - allocate memory necessary for saving NVS regions + */ +int hibernate_nvs_alloc(void) +{ + struct nvs_page *entry; + + list_for_each_entry(entry, &nvs_list, node) { + entry->data = (void *)__get_free_page(GFP_KERNEL); + if (!entry->data) { + hibernate_nvs_free(); + return -ENOMEM; + } + } + return 0; +} + +/** + * hibernate_nvs_save - save NVS memory regions + */ +void hibernate_nvs_save(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Saving platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) { + entry->kaddr = ioremap(entry->phys_start, entry->size); + memcpy(entry->data, entry->kaddr, entry->size); + } +} + +/** + * hibernate_nvs_restore - restore NVS memory regions + * + * This function is going to be called with interrupts disabled, so it + * cannot iounmap the virtual addresses used to access the NVS region. 
+ */ +void hibernate_nvs_restore(void) +{ + struct nvs_page *entry; + + printk(KERN_INFO "PM: Restoring platform NVS memory\n"); + + list_for_each_entry(entry, &nvs_list, node) + if (entry->data) + memcpy(entry->kaddr, entry->data, entry->size); +} diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index a9d9760dc7b..8b0daf0662e 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -168,7 +168,13 @@ rb_event_length(struct ring_buffer_event *event) */ unsigned ring_buffer_event_length(struct ring_buffer_event *event) { - return rb_event_length(event); + unsigned length = rb_event_length(event); + if (event->type != RINGBUF_TYPE_DATA) + return length; + length -= RB_EVNT_HDR_SIZE; + if (length > RB_MAX_SMALL_DATA + sizeof(event->array[0])) + length -= sizeof(event->array[0]); + return length; } EXPORT_SYMBOL_GPL(ring_buffer_event_length); diff --git a/net/core/dev.c b/net/core/dev.c index bab8bcedd62..5f736f1ceea 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -170,25 +170,6 @@ static DEFINE_SPINLOCK(ptype_lock); static struct list_head ptype_base[PTYPE_HASH_SIZE] __read_mostly; static struct list_head ptype_all __read_mostly; /* Taps */ -#ifdef CONFIG_NET_DMA -struct net_dma { - struct dma_client client; - spinlock_t lock; - cpumask_t channel_mask; - struct dma_chan **channels; -}; - -static enum dma_state_client -netdev_dma_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state); - -static struct net_dma net_dma = { - .client = { - .event_callback = netdev_dma_event, - }, -}; -#endif - /* * The @dev_base_head list is protected by @dev_base_lock and the rtnl * semaphore. @@ -2754,14 +2735,7 @@ out: * There may not be any more sk_buffs coming right now, so push * any pending DMA copies to hardware */ - if (!cpus_empty(net_dma.channel_mask)) { - int chan_idx; - for_each_cpu_mask_nr(chan_idx, net_dma.channel_mask) { - struct dma_chan *chan = net_dma.channels[chan_idx]; - if (chan) - dma_async_memcpy_issue_pending(chan); - } - } + dma_issue_pending_all(); #endif return; @@ -4952,122 +4926,6 @@ static int dev_cpu_callback(struct notifier_block *nfb, return NOTIFY_OK; } -#ifdef CONFIG_NET_DMA -/** - * net_dma_rebalance - try to maintain one DMA channel per CPU - * @net_dma: DMA client and associated data (lock, channels, channel_mask) - * - * This is called when the number of channels allocated to the net_dma client - * changes. The net_dma client tries to have one DMA channel per CPU. - */ - -static void net_dma_rebalance(struct net_dma *net_dma) -{ - unsigned int cpu, i, n, chan_idx; - struct dma_chan *chan; - - if (cpus_empty(net_dma->channel_mask)) { - for_each_online_cpu(cpu) - rcu_assign_pointer(per_cpu(softnet_data, cpu).net_dma, NULL); - return; - } - - i = 0; - cpu = first_cpu(cpu_online_map); - - for_each_cpu_mask_nr(chan_idx, net_dma->channel_mask) { - chan = net_dma->channels[chan_idx]; - - n = ((num_online_cpus() / cpus_weight(net_dma->channel_mask)) - + (i < (num_online_cpus() % - cpus_weight(net_dma->channel_mask)) ? 
1 : 0)); - - while(n) { - per_cpu(softnet_data, cpu).net_dma = chan; - cpu = next_cpu(cpu, cpu_online_map); - n--; - } - i++; - } -} - -/** - * netdev_dma_event - event callback for the net_dma_client - * @client: should always be net_dma_client - * @chan: DMA channel for the event - * @state: DMA state to be handled - */ -static enum dma_state_client -netdev_dma_event(struct dma_client *client, struct dma_chan *chan, - enum dma_state state) -{ - int i, found = 0, pos = -1; - struct net_dma *net_dma = - container_of(client, struct net_dma, client); - enum dma_state_client ack = DMA_DUP; /* default: take no action */ - - spin_lock(&net_dma->lock); - switch (state) { - case DMA_RESOURCE_AVAILABLE: - for (i = 0; i < nr_cpu_ids; i++) - if (net_dma->channels[i] == chan) { - found = 1; - break; - } else if (net_dma->channels[i] == NULL && pos < 0) - pos = i; - - if (!found && pos >= 0) { - ack = DMA_ACK; - net_dma->channels[pos] = chan; - cpu_set(pos, net_dma->channel_mask); - net_dma_rebalance(net_dma); - } - break; - case DMA_RESOURCE_REMOVED: - for (i = 0; i < nr_cpu_ids; i++) - if (net_dma->channels[i] == chan) { - found = 1; - pos = i; - break; - } - - if (found) { - ack = DMA_ACK; - cpu_clear(pos, net_dma->channel_mask); - net_dma->channels[i] = NULL; - net_dma_rebalance(net_dma); - } - break; - default: - break; - } - spin_unlock(&net_dma->lock); - - return ack; -} - -/** - * netdev_dma_register - register the networking subsystem as a DMA client - */ -static int __init netdev_dma_register(void) -{ - net_dma.channels = kzalloc(nr_cpu_ids * sizeof(struct net_dma), - GFP_KERNEL); - if (unlikely(!net_dma.channels)) { - printk(KERN_NOTICE - "netdev_dma: no memory for net_dma.channels\n"); - return -ENOMEM; - } - spin_lock_init(&net_dma.lock); - dma_cap_set(DMA_MEMCPY, net_dma.client.cap_mask); - dma_async_client_register(&net_dma.client); - dma_async_client_chan_request(&net_dma.client); - return 0; -} - -#else -static int __init netdev_dma_register(void) { return -ENODEV; } -#endif /* CONFIG_NET_DMA */ /** * netdev_increment_features - increment feature set by one @@ -5287,14 +5145,15 @@ static int __init net_dev_init(void) if (register_pernet_device(&default_device_ops)) goto out; - netdev_dma_register(); - open_softirq(NET_TX_SOFTIRQ, net_tx_action); open_softirq(NET_RX_SOFTIRQ, net_rx_action); hotcpu_notifier(dev_cpu_callback, 0); dst_init(); dev_mcast_init(); + #ifdef CONFIG_NET_DMA + dmaengine_get(); + #endif rc = 0; out: return rc; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index bd6ff907d9e..ce572f9dff0 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -1313,7 +1313,7 @@ int tcp_recvmsg(struct kiocb *iocb, struct sock *sk, struct msghdr *msg, if ((available < target) && (len > sysctl_tcp_dma_copybreak) && !(flags & MSG_PEEK) && !sysctl_tcp_low_latency && - __get_cpu_var(softnet_data).net_dma) { + dma_find_channel(DMA_MEMCPY)) { preempt_enable_no_resched(); tp->ucopy.pinned_list = dma_pin_iovec_pages(msg->msg_iov, len); @@ -1523,7 +1523,7 @@ do_prequeue: if (!(flags & MSG_TRUNC)) { #ifdef CONFIG_NET_DMA if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list) - tp->ucopy.dma_chan = get_softnet_dma(); + tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY); if (tp->ucopy.dma_chan) { tp->ucopy.dma_cookie = dma_skb_copy_datagram_iovec( @@ -1628,7 +1628,6 @@ skip_copy: /* Safe to free early-copied skbs now */ __skb_queue_purge(&sk->sk_async_wait_queue); - dma_chan_put(tp->ucopy.dma_chan); tp->ucopy.dma_chan = NULL; } if (tp->ucopy.pinned_list) { diff --git a/net/ipv4/tcp_input.c 
b/net/ipv4/tcp_input.c
index 99b7ecbe889..a6961d75c7e 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -5005,7 +5005,7 @@ static int tcp_dma_try_early_copy(struct sock *sk, struct sk_buff *skb,
 		return 0;
 	if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-		tp->ucopy.dma_chan = get_softnet_dma();
+		tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
 	if (tp->ucopy.dma_chan && skb_csum_unnecessary(skb)) {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 9d839fa9331..19d7b429a26 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1594,7 +1594,7 @@ process:
 #ifdef CONFIG_NET_DMA
 		struct tcp_sock *tp = tcp_sk(sk);
 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-			tp->ucopy.dma_chan = get_softnet_dma();
+			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
 		if (tp->ucopy.dma_chan)
 			ret = tcp_v4_do_rcv(sk, skb);
 		else
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 1297306d729..e5b85d45bee 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1675,7 +1675,7 @@ process:
 #ifdef CONFIG_NET_DMA
 		struct tcp_sock *tp = tcp_sk(sk);
 		if (!tp->ucopy.dma_chan && tp->ucopy.pinned_list)
-			tp->ucopy.dma_chan = get_softnet_dma();
+			tp->ucopy.dma_chan = dma_find_channel(DMA_MEMCPY);
 		if (tp->ucopy.dma_chan)
 			ret = tcp_v6_do_rcv(sk, skb);
 		else
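
The networking hunks above all follow the same conversion: the removed net_dma
client, its per-cpu softnet_data.net_dma pointer, and get_softnet_dma() are
replaced by on-demand channel lookup. A minimal, hypothetical consumer sketch
of that pattern follows; only the dmaengine calls (dmaengine_get(),
dma_find_channel(), dma_issue_pending_all()) come from this series, while the
example_* names are invented for illustration.

#include <linux/dmaengine.h>

static void example_subsystem_init(void)
{
	/*
	 * Pin the dmaengine core so opportunistically shared channels stay
	 * available; the net_dev_init() hunk above does the same under
	 * CONFIG_NET_DMA instead of registering a dma_client.
	 */
	dmaengine_get();
}

static void example_receive_path(void)
{
	/*
	 * Ask for any memcpy-capable channel on demand; this replaces the
	 * removed per-cpu softnet_data.net_dma pointer.  A NULL return
	 * means fall back to an ordinary CPU copy.
	 */
	struct dma_chan *chan = dma_find_channel(DMA_MEMCPY);

	if (chan) {
		/* ... queue asynchronous copies against 'chan' here ... */
	}

	/*
	 * Kick every channel with pending descriptors; this replaces the
	 * per-channel dma_async_memcpy_issue_pending() loop removed above.
	 */
	dma_issue_pending_all();
}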
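
The kernel/power/swsusp.c hunk introduces the hibernate_nvs_* helpers without
showing a caller. The sketch below is a hypothetical platform-side sequence,
not part of the patch: the physical address, size, and hook names are made up,
and the declarations are assumed to live in linux/suspend.h.

#include <linux/suspend.h>	/* assumed home of the hibernate_nvs_* declarations */

static int example_platform_setup(void)
{
	/*
	 * Describe the firmware NVS range that must survive hibernation.
	 * The range need not be page aligned; hibernate_nvs_register()
	 * splits it into per-page nvs_page entries internally.
	 * (Address and size here are invented.)
	 */
	return hibernate_nvs_register(0x7ff00000, 0x2000);
}

static int example_prepare_hibernation(void)
{
	int error;

	/* Back each registered page with a RAM page ... */
	error = hibernate_nvs_alloc();
	if (error)
		return error;

	/* ... and copy the NVS contents into those pages. */
	hibernate_nvs_save();
	return 0;
}

static void example_finish_restore(void)
{
	/*
	 * The real resume path calls this with interrupts disabled, which
	 * is why hibernate_nvs_restore() leaves the ioremap()ed mappings
	 * in place; hibernate_nvs_free() drops them afterwards.
	 */
	hibernate_nvs_restore();
	hibernate_nvs_free();
}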
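
For the reworked memory bitmap in kernel/power/snapshot.c, the per-zone
zone_bitmap chains are replaced by one flat list of bm_block entries built from
merged zone extents. The following sketch illustrates how snapshot.c itself
drives the new iterator; the functions are static to that file, so the
example_walk_bitmap() wrapper is purely illustrative.

static void example_walk_bitmap(struct memory_bitmap *bm)
{
	unsigned long pfn;

	/* Rewind to the first block on bm->blocks. */
	memory_bm_position_reset(bm);

	/*
	 * memory_bm_next_pfn() now walks the single block list and returns
	 * BM_END_OF_MAP once the last block is exhausted, instead of
	 * chaining through per-zone block lists.
	 */
	for (pfn = memory_bm_next_pfn(bm);
	     pfn != BM_END_OF_MAP;
	     pfn = memory_bm_next_pfn(bm)) {
		/* pfn has its bit set in one of the bm_block pfn ranges */
	}
}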