aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/ABI/testing/sysfs-fs-ext481
-rw-r--r--Documentation/filesystems/ext4.txt30
-rw-r--r--Documentation/filesystems/proc.txt21
-rw-r--r--arch/ia64/include/asm/intrinsics.h6
-rw-r--r--arch/ia64/include/asm/mmu_context.h6
-rw-r--r--arch/ia64/include/asm/module.h6
-rw-r--r--arch/ia64/include/asm/native/inst.h13
-rw-r--r--arch/ia64/include/asm/native/patchlist.h38
-rw-r--r--arch/ia64/include/asm/native/pvchk_inst.h8
-rw-r--r--arch/ia64/include/asm/paravirt.h65
-rw-r--r--arch/ia64/include/asm/paravirt_patch.h143
-rw-r--r--arch/ia64/include/asm/paravirt_privop.h365
-rw-r--r--arch/ia64/include/asm/smp.h3
-rw-r--r--arch/ia64/include/asm/timex.h1
-rw-r--r--arch/ia64/include/asm/topology.h5
-rw-r--r--arch/ia64/include/asm/xen/hypervisor.h39
-rw-r--r--arch/ia64/include/asm/xen/inst.h28
-rw-r--r--arch/ia64/include/asm/xen/interface.h9
-rw-r--r--arch/ia64/include/asm/xen/minstate.h11
-rw-r--r--arch/ia64/include/asm/xen/patchlist.h38
-rw-r--r--arch/ia64/include/asm/xen/privop.h8
-rw-r--r--arch/ia64/kernel/Makefile39
-rw-r--r--arch/ia64/kernel/Makefile.gate27
-rw-r--r--arch/ia64/kernel/acpi.c8
-rw-r--r--arch/ia64/kernel/asm-offsets.c2
-rw-r--r--arch/ia64/kernel/efi.c1
-rw-r--r--arch/ia64/kernel/entry.S4
-rw-r--r--arch/ia64/kernel/fsys.S35
-rw-r--r--arch/ia64/kernel/gate.S171
-rw-r--r--arch/ia64/kernel/gate.lds.S17
-rw-r--r--arch/ia64/kernel/head.S10
-rw-r--r--arch/ia64/kernel/ivt.S2
-rw-r--r--arch/ia64/kernel/mca.c6
-rw-r--r--arch/ia64/kernel/module.c35
-rw-r--r--arch/ia64/kernel/paravirt.c539
-rw-r--r--arch/ia64/kernel/paravirt_patch.c514
-rw-r--r--arch/ia64/kernel/paravirt_patchlist.c79
-rw-r--r--arch/ia64/kernel/paravirt_patchlist.h28
-rw-r--r--arch/ia64/kernel/paravirtentry.S99
-rw-r--r--arch/ia64/kernel/patch.c40
-rw-r--r--arch/ia64/kernel/perfmon.c4
-rw-r--r--arch/ia64/kernel/salinfo.c6
-rw-r--r--arch/ia64/kernel/setup.c9
-rw-r--r--arch/ia64/kernel/smp.c6
-rw-r--r--arch/ia64/kernel/smpboot.c17
-rw-r--r--arch/ia64/kernel/time.c9
-rw-r--r--arch/ia64/kernel/vmlinux.lds.S30
-rw-r--r--arch/ia64/kvm/kvm-ia64.c2
-rw-r--r--arch/ia64/kvm/vcpu.c2
-rw-r--r--arch/ia64/kvm/vtlb.c2
-rw-r--r--arch/ia64/mm/init.c12
-rw-r--r--arch/ia64/mm/tlb.c2
-rw-r--r--arch/ia64/scripts/pvcheck.sed1
-rw-r--r--arch/ia64/sn/kernel/io_common.c15
-rw-r--r--arch/ia64/sn/kernel/io_init.c12
-rw-r--r--arch/ia64/sn/kernel/setup.c5
-rw-r--r--arch/ia64/sn/kernel/sn2/sn2_smp.c12
-rw-r--r--arch/ia64/sn/kernel/sn2/sn_hwperf.c8
-rw-r--r--arch/ia64/sn/pci/pcibr/pcibr_dma.c4
-rw-r--r--arch/ia64/xen/Makefile19
-rw-r--r--arch/ia64/xen/gate-data.S3
-rw-r--r--arch/ia64/xen/hypercall.S2
-rw-r--r--arch/ia64/xen/time.c48
-rw-r--r--arch/ia64/xen/xen_pv_ops.c800
-rw-r--r--arch/x86/boot/memory.c39
-rw-r--r--drivers/gpu/drm/drm_crtc_helper.c31
-rw-r--r--drivers/gpu/drm/drm_edid.c8
-rw-r--r--drivers/gpu/drm/i915/i915_dma.c2
-rw-r--r--drivers/s390/net/qeth_core_offl.c0
-rw-r--r--drivers/s390/net/qeth_core_offl.h0
-rw-r--r--drivers/serial/serial_core.c2
-rw-r--r--fs/btrfs/Makefile2
-rw-r--r--fs/btrfs/btrfs_inode.h31
-rw-r--r--fs/btrfs/ctree.c588
-rw-r--r--fs/btrfs/ctree.h69
-rw-r--r--fs/btrfs/delayed-ref.c669
-rw-r--r--fs/btrfs/delayed-ref.h193
-rw-r--r--fs/btrfs/dir-item.c3
-rw-r--r--fs/btrfs/disk-io.c81
-rw-r--r--fs/btrfs/disk-io.h1
-rw-r--r--fs/btrfs/extent-tree.c1674
-rw-r--r--fs/btrfs/extent_io.c51
-rw-r--r--fs/btrfs/extent_io.h3
-rw-r--r--fs/btrfs/file-item.c7
-rw-r--r--fs/btrfs/file.c50
-rw-r--r--fs/btrfs/inode-item.c3
-rw-r--r--fs/btrfs/inode.c194
-rw-r--r--fs/btrfs/locking.c21
-rw-r--r--fs/btrfs/ordered-data.c118
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/transaction.c151
-rw-r--r--fs/btrfs/transaction.h8
-rw-r--r--fs/btrfs/tree-defrag.c2
-rw-r--r--fs/btrfs/tree-log.c444
-rw-r--r--fs/btrfs/tree-log.h17
-rw-r--r--fs/ext4/balloc.c14
-rw-r--r--fs/ext4/dir.c16
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_extents.h1
-rw-r--r--fs/ext4/ext4_i.h6
-rw-r--r--fs/ext4/ext4_sb.h14
-rw-r--r--fs/ext4/extents.c127
-rw-r--r--fs/ext4/file.c7
-rw-r--r--fs/ext4/ialloc.c273
-rw-r--r--fs/ext4/inode.c424
-rw-r--r--fs/ext4/ioctl.c17
-rw-r--r--fs/ext4/mballoc.c158
-rw-r--r--fs/ext4/mballoc.h8
-rw-r--r--fs/ext4/namei.c164
-rw-r--r--fs/ext4/resize.c8
-rw-r--r--fs/ext4/super.c327
-rw-r--r--fs/jbd2/commit.c5
-rw-r--r--fs/jbd2/revoke.c24
-rw-r--r--fs/jbd2/transaction.c2
-rw-r--r--fs/lockd/clntlock.c51
-rw-r--r--fs/lockd/mon.c8
-rw-r--r--fs/lockd/svc.c42
-rw-r--r--fs/nfs/callback.c31
-rw-r--r--fs/nfs/callback.h1
-rw-r--r--fs/nfs/client.c116
-rw-r--r--fs/nfs/dir.c9
-rw-r--r--fs/nfs/file.c32
-rw-r--r--fs/nfs/getroot.c4
-rw-r--r--fs/nfs/inode.c309
-rw-r--r--fs/nfs/internal.h4
-rw-r--r--fs/nfs/nfs2xdr.c9
-rw-r--r--fs/nfs/nfs3proc.c1
-rw-r--r--fs/nfs/nfs3xdr.c37
-rw-r--r--fs/nfs/nfs4proc.c47
-rw-r--r--fs/nfs/nfs4state.c10
-rw-r--r--fs/nfs/nfs4xdr.c213
-rw-r--r--fs/nfs/pagelist.c11
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/super.c4
-rw-r--r--fs/nfs/write.c53
-rw-r--r--fs/nfsd/nfsctl.c6
-rw-r--r--fs/nfsd/nfssvc.c5
-rw-r--r--include/drm/drm_crtc_helper.h3
-rw-r--r--include/drm/drm_os_linux.h4
-rw-r--r--include/linux/jbd2.h6
-rw-r--r--include/linux/nfs_fs.h4
-rw-r--r--include/linux/nfs_fs_sb.h5
-rw-r--r--include/linux/nfs_xdr.h59
-rw-r--r--include/linux/sunrpc/svc.h9
-rw-r--r--include/linux/sunrpc/svc_xprt.h52
-rw-r--r--include/linux/sunrpc/xprt.h2
-rw-r--r--net/sunrpc/Kconfig22
-rw-r--r--net/sunrpc/clnt.c48
-rw-r--r--net/sunrpc/rpcb_clnt.c103
-rw-r--r--net/sunrpc/svc.c158
-rw-r--r--net/sunrpc/svc_xprt.c31
-rw-r--r--net/sunrpc/svcsock.c40
-rw-r--r--net/sunrpc/xprt.c89
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c26
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c8
-rw-r--r--net/sunrpc/xprtsock.c363
156 files changed, 8419 insertions, 3379 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-ext4 b/Documentation/ABI/testing/sysfs-fs-ext4
new file mode 100644
index 00000000000..4e79074de28
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-fs-ext4
@@ -0,0 +1,81 @@
+What: /sys/fs/ext4/<disk>/mb_stats
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Controls whether the multiblock allocator should
+ collect statistics, which are shown during the unmount.
+ 1 means to collect statistics, 0 means not to collect
+ statistics
+
+What: /sys/fs/ext4/<disk>/mb_group_prealloc
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The multiblock allocator will round up allocation
+ requests to a multiple of this tuning parameter if the
+ stripe size is not set in the ext4 superblock
+
+What: /sys/fs/ext4/<disk>/mb_max_to_scan
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The maximum number of extents the multiblock allocator
+ will search to find the best extent
+
+What: /sys/fs/ext4/<disk>/mb_min_to_scan
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ The minimum number of extents the multiblock allocator
+ will search to find the best extent
+
+What: /sys/fs/ext4/<disk>/mb_order2_req
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Tuning parameter which controls the minimum size for
+ requests (as a power of 2) where the buddy cache is
+ used
+
+What: /sys/fs/ext4/<disk>/mb_stream_req
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Files which have fewer blocks than this tunable
+ parameter will have their blocks allocated out of a
+ block group specific preallocation pool, so that small
+ files are packed closely together. Each large file
+ will have its blocks allocated out of its own unique
+ preallocation pool.
+
+What: /sys/fs/ext4/<disk>/inode_readahead
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ Tuning parameter which controls the maximum number of
+ inode table blocks that ext4's inode table readahead
+ algorithm will pre-read into the buffer cache
+
+What: /sys/fs/ext4/<disk>/delayed_allocation_blocks
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ This file is read-only and shows the number of blocks
+ that are dirty in the page cache, but which do not
+ have their location in the filesystem allocated yet.
+
+What: /sys/fs/ext4/<disk>/lifetime_write_kbytes
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ This file is read-only and shows the number of kilobytes
+ of data that have been written to this filesystem since it was
+ created.
+
+What: /sys/fs/ext4/<disk>/session_write_kbytes
+Date: March 2008
+Contact: "Theodore Ts'o" <tytso@mit.edu>
+Description:
+ This file is read-only and shows the number of
+ kilobytes of data that have been written to this
+ filesystem since it was mounted.
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index cec829bc729..97882df0486 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -85,7 +85,7 @@ Note: More extensive information for getting started with ext4 can be
* extent format more robust in face of on-disk corruption due to magics,
* internal redundancy in tree
* improved file allocation (multi-block alloc)
-* fix 32000 subdirectory limit
+* lift 32000 subdirectory limit imposed by i_links_count[1]
* nsec timestamps for mtime, atime, ctime, create time
* inode version field on disk (NFSv4, Lustre)
* reduced e2fsck time via uninit_bg feature
@@ -100,6 +100,9 @@ Note: More extensive information for getting started with ext4 can be
* efficent new ordered mode in JBD2 and ext4(avoid using buffer head to force
the ordering)
+[1] Filesystems with a block size of 1k may see a limit imposed by the
+directory hash tree having a maximum depth of two.
+
2.2 Candidate features for future inclusion
* Online defrag (patches available but not well tested)
@@ -180,8 +183,8 @@ commit=nrsec (*) Ext4 can be told to sync all its data and metadata
performance.
barrier=<0|1(*)> This enables/disables the use of write barriers in
- the jbd code. barrier=0 disables, barrier=1 enables.
- This also requires an IO stack which can support
+barrier(*) the jbd code. barrier=0 disables, barrier=1 enables.
+nobarrier This also requires an IO stack which can support
barriers, and if jbd gets an error on a barrier
write, it will disable again with a warning.
Write barriers enforce proper on-disk ordering
@@ -189,6 +192,9 @@ barrier=<0|1(*)> This enables/disables the use of write barriers in
safe to use, at some performance penalty. If
your disks are battery-backed in one way or another,
disabling barriers may safely improve performance.
+ The mount options "barrier" and "nobarrier" can
+ also be used to enable or disable barriers, for
+ consistency with other ext4 mount options.
inode_readahead=n This tuning parameter controls the maximum
number of inode table blocks that ext4's inode
@@ -310,6 +316,24 @@ journal_ioprio=prio The I/O priority (from 0 to 7, where 0 is the
a slightly higher priority than the default I/O
priority.
+auto_da_alloc(*) Many broken applications don't use fsync() when
+noauto_da_alloc replacing existing files via patterns such as
+ fd = open("foo.new")/write(fd,..)/close(fd)/
+ rename("foo.new", "foo"), or worse yet,
+ fd = open("foo", O_TRUNC)/write(fd,..)/close(fd).
+ If auto_da_alloc is enabled, ext4 will detect
+ the replace-via-rename and replace-via-truncate
+ patterns and force that any delayed allocation
+ blocks are allocated such that at the next
+ journal commit, in the default data=ordered
+ mode, the data blocks of the new file are forced
+ to disk before the rename() operation is
+ commited. This provides roughly the same level
+ of guarantees as ext3, and avoids the
+ "zero-length" problem that can happen when a
+ system crashes before the delayed allocation
+ blocks are forced to disk.
+
Data Mode
=========
There are 3 different data modes:
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 830bad7cce0..efc4fd9f40c 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -940,27 +940,6 @@ Table 1-10: Files in /proc/fs/ext4/<devname>
File Content
mb_groups details of multiblock allocator buddy cache of free blocks
mb_history multiblock allocation history
- stats controls whether the multiblock allocator should start
- collecting statistics, which are shown during the unmount
- group_prealloc the multiblock allocator will round up allocation
- requests to a multiple of this tuning parameter if the
- stripe size is not set in the ext4 superblock
- max_to_scan The maximum number of extents the multiblock allocator
- will search to find the best extent
- min_to_scan The minimum number of extents the multiblock allocator
- will search to find the best extent
- order2_req Tuning parameter which controls the minimum size for
- requests (as a power of 2) where the buddy cache is
- used
- stream_req Files which have fewer blocks than this tunable
- parameter will have their blocks allocated out of a
- block group specific preallocation pool, so that small
- files are packed closely together. Each large file
- will have its blocks allocated out of its own unique
- preallocation pool.
-inode_readahead Tuning parameter which controls the maximum number of
- inode table blocks that ext4's inode table readahead
- algorithm will pre-read into the buffer cache
..............................................................................
diff --git a/arch/ia64/include/asm/intrinsics.h b/arch/ia64/include/asm/intrinsics.h
index c47830e26cb..111ed522289 100644
--- a/arch/ia64/include/asm/intrinsics.h
+++ b/arch/ia64/include/asm/intrinsics.h
@@ -202,7 +202,11 @@ extern long ia64_cmpxchg_called_with_bad_pointer (void);
#ifndef __ASSEMBLY__
#if defined(CONFIG_PARAVIRT) && defined(__KERNEL__)
-#define IA64_INTRINSIC_API(name) pv_cpu_ops.name
+#ifdef ASM_SUPPORTED
+# define IA64_INTRINSIC_API(name) paravirt_ ## name
+#else
+# define IA64_INTRINSIC_API(name) pv_cpu_ops.name
+#endif
#define IA64_INTRINSIC_MACRO(name) paravirt_ ## name
#else
#define IA64_INTRINSIC_API(name) ia64_native_ ## name
diff --git a/arch/ia64/include/asm/mmu_context.h b/arch/ia64/include/asm/mmu_context.h
index 040bc87db93..7f2a456603c 100644
--- a/arch/ia64/include/asm/mmu_context.h
+++ b/arch/ia64/include/asm/mmu_context.h
@@ -87,7 +87,7 @@ get_mmu_context (struct mm_struct *mm)
/* re-check, now that we've got the lock: */
context = mm->context;
if (context == 0) {
- cpus_clear(mm->cpu_vm_mask);
+ cpumask_clear(mm_cpumask(mm));
if (ia64_ctx.next >= ia64_ctx.limit) {
ia64_ctx.next = find_next_zero_bit(ia64_ctx.bitmap,
ia64_ctx.max_ctx, ia64_ctx.next);
@@ -166,8 +166,8 @@ activate_context (struct mm_struct *mm)
do {
context = get_mmu_context(mm);
- if (!cpu_isset(smp_processor_id(), mm->cpu_vm_mask))
- cpu_set(smp_processor_id(), mm->cpu_vm_mask);
+ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(mm)))
+ cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm));
reload_context(context);
/*
* in the unlikely event of a TLB-flush by another thread,
diff --git a/arch/ia64/include/asm/module.h b/arch/ia64/include/asm/module.h
index d2da61e4c49..908eaef42a0 100644
--- a/arch/ia64/include/asm/module.h
+++ b/arch/ia64/include/asm/module.h
@@ -16,6 +16,12 @@ struct mod_arch_specific {
struct elf64_shdr *got; /* global offset table */
struct elf64_shdr *opd; /* official procedure descriptors */
struct elf64_shdr *unwind; /* unwind-table section */
+#ifdef CONFIG_PARAVIRT
+ struct elf64_shdr *paravirt_bundles;
+ /* paravirt_alt_bundle_patch table */
+ struct elf64_shdr *paravirt_insts;
+ /* paravirt_alt_inst_patch table */
+#endif
unsigned long gp; /* global-pointer for module */
void *core_unw_table; /* core unwind-table cookie returned by unwinder */
diff --git a/arch/ia64/include/asm/native/inst.h b/arch/ia64/include/asm/native/inst.h
index 0a1026cca4f..d2d46efb3e6 100644
--- a/arch/ia64/include/asm/native/inst.h
+++ b/arch/ia64/include/asm/native/inst.h
@@ -30,6 +30,9 @@
#define __paravirt_work_processed_syscall_target \
ia64_work_processed_syscall
+#define paravirt_fsyscall_table ia64_native_fsyscall_table
+#define paravirt_fsys_bubble_down ia64_native_fsys_bubble_down
+
#ifdef CONFIG_PARAVIRT_GUEST_ASM_CLOBBER_CHECK
# define PARAVIRT_POISON 0xdeadbeefbaadf00d
# define CLOBBER(clob) \
@@ -74,6 +77,11 @@
(pred) mov reg = psr \
CLOBBER(clob)
+#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \
+(pred) mov reg = ar.itc \
+ CLOBBER(clob) \
+ CLOBBER_PRED(pred_clob)
+
#define MOV_TO_IFA(reg, clob) \
mov cr.ifa = reg \
CLOBBER(clob)
@@ -158,6 +166,11 @@
#define RSM_PSR_DT \
rsm psr.dt
+#define RSM_PSR_BE_I(clob0, clob1) \
+ rsm psr.be | psr.i \
+ CLOBBER(clob0) \
+ CLOBBER(clob1)
+
#define SSM_PSR_DT_AND_SRLZ_I \
ssm psr.dt \
;; \
diff --git a/arch/ia64/include/asm/native/patchlist.h b/arch/ia64/include/asm/native/patchlist.h
new file mode 100644
index 00000000000..be16ca9311b
--- /dev/null
+++ b/arch/ia64/include/asm/native/patchlist.h
@@ -0,0 +1,38 @@
+/******************************************************************************
+ * arch/ia64/include/asm/native/inst.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#define __paravirt_start_gate_fsyscall_patchlist \
+ __ia64_native_start_gate_fsyscall_patchlist
+#define __paravirt_end_gate_fsyscall_patchlist \
+ __ia64_native_end_gate_fsyscall_patchlist
+#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist \
+ __ia64_native_start_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist \
+ __ia64_native_end_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_start_gate_vtop_patchlist \
+ __ia64_native_start_gate_vtop_patchlist
+#define __paravirt_end_gate_vtop_patchlist \
+ __ia64_native_end_gate_vtop_patchlist
+#define __paravirt_start_gate_mckinley_e9_patchlist \
+ __ia64_native_start_gate_mckinley_e9_patchlist
+#define __paravirt_end_gate_mckinley_e9_patchlist \
+ __ia64_native_end_gate_mckinley_e9_patchlist
diff --git a/arch/ia64/include/asm/native/pvchk_inst.h b/arch/ia64/include/asm/native/pvchk_inst.h
index b8e6eb1090d..8d72962ec83 100644
--- a/arch/ia64/include/asm/native/pvchk_inst.h
+++ b/arch/ia64/include/asm/native/pvchk_inst.h
@@ -180,6 +180,11 @@
IS_PRED_IN(pred) \
IS_RREG_OUT(reg) \
IS_RREG_CLOB(clob)
+#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \
+ IS_PRED_IN(pred) \
+ IS_PRED_CLOB(pred_clob) \
+ IS_RREG_OUT(reg) \
+ IS_RREG_CLOB(clob)
#define MOV_TO_IFA(reg, clob) \
IS_RREG_IN(reg) \
IS_RREG_CLOB(clob)
@@ -246,6 +251,9 @@
IS_RREG_CLOB(clob2)
#define RSM_PSR_DT \
nop 0
+#define RSM_PSR_BE_I(clob0, clob1) \
+ IS_RREG_CLOB(clob0) \
+ IS_RREG_CLOB(clob1)
#define SSM_PSR_DT_AND_SRLZ_I \
nop 0
#define BSW_0(clob0, clob1, clob2) \
diff --git a/arch/ia64/include/asm/paravirt.h b/arch/ia64/include/asm/paravirt.h
index 2bf3636473f..2eb0a981a09 100644
--- a/arch/ia64/include/asm/paravirt.h
+++ b/arch/ia64/include/asm/paravirt.h
@@ -22,6 +22,56 @@
#ifndef __ASM_PARAVIRT_H
#define __ASM_PARAVIRT_H
+#ifndef __ASSEMBLY__
+/******************************************************************************
+ * fsys related addresses
+ */
+struct pv_fsys_data {
+ unsigned long *fsyscall_table;
+ void *fsys_bubble_down;
+};
+
+extern struct pv_fsys_data pv_fsys_data;
+
+unsigned long *paravirt_get_fsyscall_table(void);
+char *paravirt_get_fsys_bubble_down(void);
+
+/******************************************************************************
+ * patchlist addresses for gate page
+ */
+enum pv_gate_patchlist {
+ PV_GATE_START_FSYSCALL,
+ PV_GATE_END_FSYSCALL,
+
+ PV_GATE_START_BRL_FSYS_BUBBLE_DOWN,
+ PV_GATE_END_BRL_FSYS_BUBBLE_DOWN,
+
+ PV_GATE_START_VTOP,
+ PV_GATE_END_VTOP,
+
+ PV_GATE_START_MCKINLEY_E9,
+ PV_GATE_END_MCKINLEY_E9,
+};
+
+struct pv_patchdata {
+ unsigned long start_fsyscall_patchlist;
+ unsigned long end_fsyscall_patchlist;
+ unsigned long start_brl_fsys_bubble_down_patchlist;
+ unsigned long end_brl_fsys_bubble_down_patchlist;
+ unsigned long start_vtop_patchlist;
+ unsigned long end_vtop_patchlist;
+ unsigned long start_mckinley_e9_patchlist;
+ unsigned long end_mckinley_e9_patchlist;
+
+ void *gate_section;
+};
+
+extern struct pv_patchdata pv_patchdata;
+
+unsigned long paravirt_get_gate_patchlist(enum pv_gate_patchlist type);
+void *paravirt_get_gate_section(void);
+#endif
+
#ifdef CONFIG_PARAVIRT_GUEST
#define PARAVIRT_HYPERVISOR_TYPE_DEFAULT 0
@@ -68,6 +118,14 @@ struct pv_init_ops {
int (*arch_setup_nomca)(void);
void (*post_smp_prepare_boot_cpu)(void);
+
+#ifdef ASM_SUPPORTED
+ unsigned long (*patch_bundle)(void *sbundle, void *ebundle,
+ unsigned long type);
+ unsigned long (*patch_inst)(unsigned long stag, unsigned long etag,
+ unsigned long type);
+#endif
+ void (*patch_branch)(unsigned long tag, unsigned long type);
};
extern struct pv_init_ops pv_init_ops;
@@ -210,6 +268,8 @@ struct pv_time_ops {
int (*do_steal_accounting)(unsigned long *new_itm);
void (*clocksource_resume)(void);
+
+ unsigned long long (*sched_clock)(void);
};
extern struct pv_time_ops pv_time_ops;
@@ -227,6 +287,11 @@ paravirt_do_steal_accounting(unsigned long *new_itm)
return pv_time_ops.do_steal_accounting(new_itm);
}
+static inline unsigned long long paravirt_sched_clock(void)
+{
+ return pv_time_ops.sched_clock();
+}
+
#endif /* !__ASSEMBLY__ */
#else
diff --git a/arch/ia64/include/asm/paravirt_patch.h b/arch/ia64/include/asm/paravirt_patch.h
new file mode 100644
index 00000000000..128ff5db6e6
--- /dev/null
+++ b/arch/ia64/include/asm/paravirt_patch.h
@@ -0,0 +1,143 @@
+/******************************************************************************
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#ifndef __ASM_PARAVIRT_PATCH_H
+#define __ASM_PARAVIRT_PATCH_H
+
+#ifdef __ASSEMBLY__
+
+ .section .paravirt_branches, "a"
+ .previous
+#define PARAVIRT_PATCH_SITE_BR(type) \
+ { \
+ [1:] ; \
+ br.cond.sptk.many 2f ; \
+ nop.b 0 ; \
+ nop.b 0;; ; \
+ } ; \
+ 2: \
+ .xdata8 ".paravirt_branches", 1b, type
+
+#else
+
+#include <linux/stringify.h>
+#include <asm/intrinsics.h>
+
+/* for binary patch */
+struct paravirt_patch_site_bundle {
+ void *sbundle;
+ void *ebundle;
+ unsigned long type;
+};
+
+/* label means the beginning of new bundle */
+#define paravirt_alt_bundle(instr, privop) \
+ "\t998:\n" \
+ "\t" instr "\n" \
+ "\t999:\n" \
+ "\t.pushsection .paravirt_bundles, \"a\"\n" \
+ "\t.popsection\n" \
+ "\t.xdata8 \".paravirt_bundles\", 998b, 999b, " \
+ __stringify(privop) "\n"
+
+
+struct paravirt_patch_bundle_elem {
+ const void *sbundle;
+ const void *ebundle;
+ unsigned long type;
+};
+
+
+struct paravirt_patch_site_inst {
+ unsigned long stag;
+ unsigned long etag;
+ unsigned long type;
+};
+
+#define paravirt_alt_inst(instr, privop) \
+ "\t[998:]\n" \
+ "\t" instr "\n" \
+ "\t[999:]\n" \
+ "\t.pushsection .paravirt_insts, \"a\"\n" \
+ "\t.popsection\n" \
+ "\t.xdata8 \".paravirt_insts\", 998b, 999b, " \
+ __stringify(privop) "\n"
+
+struct paravirt_patch_site_branch {
+ unsigned long tag;
+ unsigned long type;
+};
+
+struct paravirt_patch_branch_target {
+ const void *entry;
+ unsigned long type;
+};
+
+void
+__paravirt_patch_apply_branch(
+ unsigned long tag, unsigned long type,
+ const struct paravirt_patch_branch_target *entries,
+ unsigned int nr_entries);
+
+void
+paravirt_patch_reloc_br(unsigned long tag, const void *target);
+
+void
+paravirt_patch_reloc_brl(unsigned long tag, const void *target);
+
+
+#if defined(ASM_SUPPORTED) && defined(CONFIG_PARAVIRT)
+unsigned long
+ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type);
+
+unsigned long
+__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type,
+ const struct paravirt_patch_bundle_elem *elems,
+ unsigned long nelems,
+ const struct paravirt_patch_bundle_elem **found);
+
+void
+paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start,
+ const struct paravirt_patch_site_bundle *end);
+
+void
+paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start,
+ const struct paravirt_patch_site_inst *end);
+
+void paravirt_patch_apply(void);
+#else
+#define paravirt_patch_apply_bundle(start, end) do { } while (0)
+#define paravirt_patch_apply_inst(start, end) do { } while (0)
+#define paravirt_patch_apply() do { } while (0)
+#endif
+
+#endif /* !__ASSEMBLEY__ */
+
+#endif /* __ASM_PARAVIRT_PATCH_H */
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "linux"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * indent-tabs-mode: t
+ * End:
+ */
diff --git a/arch/ia64/include/asm/paravirt_privop.h b/arch/ia64/include/asm/paravirt_privop.h
index 33c8e55f577..3d2951130b5 100644
--- a/arch/ia64/include/asm/paravirt_privop.h
+++ b/arch/ia64/include/asm/paravirt_privop.h
@@ -33,7 +33,7 @@
*/
struct pv_cpu_ops {
- void (*fc)(unsigned long addr);
+ void (*fc)(void *addr);
unsigned long (*thash)(unsigned long addr);
unsigned long (*get_cpuid)(int index);
unsigned long (*get_pmd)(int index);
@@ -60,12 +60,18 @@ extern unsigned long ia64_native_getreg_func(int regnum);
/* Instructions paravirtualized for performance */
/************************************************/
+#ifndef ASM_SUPPORTED
+#define paravirt_ssm_i() pv_cpu_ops.ssm_i()
+#define paravirt_rsm_i() pv_cpu_ops.rsm_i()
+#define __paravirt_getreg() pv_cpu_ops.getreg()
+#endif
+
/* mask for ia64_native_ssm/rsm() must be constant.("i" constraing).
* static inline function doesn't satisfy it. */
#define paravirt_ssm(mask) \
do { \
if ((mask) == IA64_PSR_I) \
- pv_cpu_ops.ssm_i(); \
+ paravirt_ssm_i(); \
else \
ia64_native_ssm(mask); \
} while (0)
@@ -73,7 +79,7 @@ extern unsigned long ia64_native_getreg_func(int regnum);
#define paravirt_rsm(mask) \
do { \
if ((mask) == IA64_PSR_I) \
- pv_cpu_ops.rsm_i(); \
+ paravirt_rsm_i(); \
else \
ia64_native_rsm(mask); \
} while (0)
@@ -86,7 +92,7 @@ extern unsigned long ia64_native_getreg_func(int regnum);
if ((reg) == _IA64_REG_IP) \
res = ia64_native_getreg(_IA64_REG_IP); \
else \
- res = pv_cpu_ops.getreg(reg); \
+ res = __paravirt_getreg(reg); \
res; \
})
@@ -112,6 +118,12 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
#endif /* CONFIG_PARAVIRT */
+#if defined(CONFIG_PARAVIRT) && defined(ASM_SUPPORTED)
+#define paravirt_dv_serialize_data() ia64_dv_serialize_data()
+#else
+#define paravirt_dv_serialize_data() /* nothing */
+#endif
+
/* these routines utilize privilege-sensitive or performance-sensitive
* privileged instructions so the code must be replaced with
* paravirtualized versions */
@@ -121,4 +133,349 @@ void paravirt_cpu_asm_init(const struct pv_cpu_asm_switch *cpu_asm_switch);
IA64_PARAVIRT_ASM_FUNC(work_processed_syscall)
#define ia64_leave_kernel IA64_PARAVIRT_ASM_FUNC(leave_kernel)
+
+#if defined(CONFIG_PARAVIRT)
+/******************************************************************************
+ * binary patching infrastructure
+ */
+#define PARAVIRT_PATCH_TYPE_FC 1
+#define PARAVIRT_PATCH_TYPE_THASH 2
+#define PARAVIRT_PATCH_TYPE_GET_CPUID 3
+#define PARAVIRT_PATCH_TYPE_GET_PMD 4
+#define PARAVIRT_PATCH_TYPE_PTCGA 5
+#define PARAVIRT_PATCH_TYPE_GET_RR 6
+#define PARAVIRT_PATCH_TYPE_SET_RR 7
+#define PARAVIRT_PATCH_TYPE_SET_RR0_TO_RR4 8
+#define PARAVIRT_PATCH_TYPE_SSM_I 9
+#define PARAVIRT_PATCH_TYPE_RSM_I 10
+#define PARAVIRT_PATCH_TYPE_GET_PSR_I 11
+#define PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE 12
+
+/* PARAVIRT_PATY_TYPE_[GS]ETREG + _IA64_REG_xxx */
+#define PARAVIRT_PATCH_TYPE_GETREG 0x10000000
+#define PARAVIRT_PATCH_TYPE_SETREG 0x20000000
+
+/*
+ * struct task_struct* (*ia64_switch_to)(void* next_task);
+ * void *ia64_leave_syscall;
+ * void *ia64_work_processed_syscall
+ * void *ia64_leave_kernel;
+ */
+
+#define PARAVIRT_PATCH_TYPE_BR_START 0x30000000
+#define PARAVIRT_PATCH_TYPE_BR_SWITCH_TO \
+ (PARAVIRT_PATCH_TYPE_BR_START + 0)
+#define PARAVIRT_PATCH_TYPE_BR_LEAVE_SYSCALL \
+ (PARAVIRT_PATCH_TYPE_BR_START + 1)
+#define PARAVIRT_PATCH_TYPE_BR_WORK_PROCESSED_SYSCALL \
+ (PARAVIRT_PATCH_TYPE_BR_START + 2)
+#define PARAVIRT_PATCH_TYPE_BR_LEAVE_KERNEL \
+ (PARAVIRT_PATCH_TYPE_BR_START + 3)
+
+#ifdef ASM_SUPPORTED
+#include <asm/paravirt_patch.h>
+
+/*
+ * pv_cpu_ops calling stub.
+ * normal function call convension can't be written by gcc
+ * inline assembly.
+ *
+ * from the caller's point of view,
+ * the following registers will be clobbered.
+ * r2, r3
+ * r8-r15
+ * r16, r17
+ * b6, b7
+ * p6-p15
+ * ar.ccv
+ *
+ * from the callee's point of view ,
+ * the following registers can be used.
+ * r2, r3: scratch
+ * r8: scratch, input argument0 and return value
+ * r0-r15: scratch, input argument1-5
+ * b6: return pointer
+ * b7: scratch
+ * p6-p15: scratch
+ * ar.ccv: scratch
+ *
+ * other registers must not be changed. especially
+ * b0: rp: preserved. gcc ignores b0 in clobbered register.
+ * r16: saved gp
+ */
+/* 5 bundles */
+#define __PARAVIRT_BR \
+ ";;\n" \
+ "{ .mlx\n" \
+ "nop 0\n" \
+ "movl r2 = %[op_addr]\n"/* get function pointer address */ \
+ ";;\n" \
+ "}\n" \
+ "1:\n" \
+ "{ .mii\n" \
+ "ld8 r2 = [r2]\n" /* load function descriptor address */ \
+ "mov r17 = ip\n" /* get ip to calc return address */ \
+ "mov r16 = gp\n" /* save gp */ \
+ ";;\n" \
+ "}\n" \
+ "{ .mii\n" \
+ "ld8 r3 = [r2], 8\n" /* load entry address */ \
+ "adds r17 = 1f - 1b, r17\n" /* calculate return address */ \
+ ";;\n" \
+ "mov b7 = r3\n" /* set entry address */ \
+ "}\n" \
+ "{ .mib\n" \
+ "ld8 gp = [r2]\n" /* load gp value */ \
+ "mov b6 = r17\n" /* set return address */ \
+ "br.cond.sptk.few b7\n" /* intrinsics are very short isns */ \
+ "}\n" \
+ "1:\n" \
+ "{ .mii\n" \
+ "mov gp = r16\n" /* restore gp value */ \
+ "nop 0\n" \
+ "nop 0\n" \
+ ";;\n" \
+ "}\n"
+
+#define PARAVIRT_OP(op) \
+ [op_addr] "i"(&pv_cpu_ops.op)
+
+#define PARAVIRT_TYPE(type) \
+ PARAVIRT_PATCH_TYPE_ ## type
+
+#define PARAVIRT_REG_CLOBBERS0 \
+ "r2", "r3", /*"r8",*/ "r9", "r10", "r11", "r14", \
+ "r15", "r16", "r17"
+
+#define PARAVIRT_REG_CLOBBERS1 \
+ "r2","r3", /*"r8",*/ "r9", "r10", "r11", "r14", \
+ "r15", "r16", "r17"
+
+#define PARAVIRT_REG_CLOBBERS2 \
+ "r2", "r3", /*"r8", "r9",*/ "r10", "r11", "r14", \
+ "r15", "r16", "r17"
+
+#define PARAVIRT_REG_CLOBBERS5 \
+ "r2", "r3", /*"r8", "r9", "r10", "r11", "r14",*/ \
+ "r15", "r16", "r17"
+
+#define PARAVIRT_BR_CLOBBERS \
+ "b6", "b7"
+
+#define PARAVIRT_PR_CLOBBERS \
+ "p6", "p7", "p8", "p9", "p10", "p11", "p12", "p13", "p14", "p15"
+
+#define PARAVIRT_AR_CLOBBERS \
+ "ar.ccv"
+
+#define PARAVIRT_CLOBBERS0 \
+ PARAVIRT_REG_CLOBBERS0, \
+ PARAVIRT_BR_CLOBBERS, \
+ PARAVIRT_PR_CLOBBERS, \
+ PARAVIRT_AR_CLOBBERS, \
+ "memory"
+
+#define PARAVIRT_CLOBBERS1 \
+ PARAVIRT_REG_CLOBBERS1, \
+ PARAVIRT_BR_CLOBBERS, \
+ PARAVIRT_PR_CLOBBERS, \
+ PARAVIRT_AR_CLOBBERS, \
+ "memory"
+
+#define PARAVIRT_CLOBBERS2 \
+ PARAVIRT_REG_CLOBBERS2, \
+ PARAVIRT_BR_CLOBBERS, \
+ PARAVIRT_PR_CLOBBERS, \
+ PARAVIRT_AR_CLOBBERS, \
+ "memory"
+
+#define PARAVIRT_CLOBBERS5 \
+ PARAVIRT_REG_CLOBBERS5, \
+ PARAVIRT_BR_CLOBBERS, \
+ PARAVIRT_PR_CLOBBERS, \
+ PARAVIRT_AR_CLOBBERS, \
+ "memory"
+
+#define PARAVIRT_BR0(op, type) \
+ register unsigned long ia64_clobber asm ("r8"); \
+ asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \
+ PARAVIRT_TYPE(type)) \
+ : "=r"(ia64_clobber) \
+ : PARAVIRT_OP(op) \
+ : PARAVIRT_CLOBBERS0)
+
+#define PARAVIRT_BR0_RET(op, type) \
+ register unsigned long ia64_intri_res asm ("r8"); \
+ asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \
+ PARAVIRT_TYPE(type)) \
+ : "=r"(ia64_intri_res) \
+ : PARAVIRT_OP(op) \
+ : PARAVIRT_CLOBBERS0)
+
+#define PARAVIRT_BR1(op, type, arg1) \
+ register unsigned long __##arg1 asm ("r8") = arg1; \
+ register unsigned long ia64_clobber asm ("r8"); \
+ asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \
+ PARAVIRT_TYPE(type)) \
+ : "=r"(ia64_clobber) \
+ : PARAVIRT_OP(op), "0"(__##arg1) \
+ : PARAVIRT_CLOBBERS1)
+
+#define PARAVIRT_BR1_RET(op, type, arg1) \
+ register unsigned long ia64_intri_res asm ("r8"); \
+ register unsigned long __##arg1 asm ("r8") = arg1; \
+ asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \
+ PARAVIRT_TYPE(type)) \
+ : "=r"(ia64_intri_res) \
+ : PARAVIRT_OP(op), "0"(__##arg1) \
+ : PARAVIRT_CLOBBERS1)
+
+#define PARAVIRT_BR1_VOID(op, type, arg1) \
+ register void *__##arg1 asm ("r8") = arg1; \
+ register unsigned long ia64_clobber asm ("r8"); \
+ asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \
+ PARAVIRT_TYPE(type)) \
+ : "=r"(ia64_clobber) \
+ : PARAVIRT_OP(op), "0"(__##arg1) \
+ : PARAVIRT_CLOBBERS1)
+
+#define PARAVIRT_BR2(op, type, arg1, arg2) \
+ register unsigned long __##arg1 asm ("r8") = arg1; \
+ register unsigned long __##arg2 asm ("r9") = arg2; \
+ register unsigned long ia64_clobber1 asm ("r8"); \
+ register unsigned long ia64_clobber2 asm ("r9"); \
+ asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \
+ PARAVIRT_TYPE(type)) \
+ : "=r"(ia64_clobber1), "=r"(ia64_clobber2) \
+ : PARAVIRT_OP(op), "0"(__##arg1), "1"(__##arg2) \
+ : PARAVIRT_CLOBBERS2)
+
+
+#define PARAVIRT_DEFINE_CPU_OP0(op, type) \
+ static inline void \
+ paravirt_ ## op (void) \
+ { \
+ PARAVIRT_BR0(op, type); \
+ }
+
+#define PARAVIRT_DEFINE_CPU_OP0_RET(op, type) \
+ static inline unsigned long \
+ paravirt_ ## op (void) \
+ { \
+ PARAVIRT_BR0_RET(op, type); \
+ return ia64_intri_res; \
+ }
+
+#define PARAVIRT_DEFINE_CPU_OP1_VOID(op, type) \
+ static inline void \
+ paravirt_ ## op (void *arg1) \
+ { \
+ PARAVIRT_BR1_VOID(op, type, arg1); \
+ }
+
+#define PARAVIRT_DEFINE_CPU_OP1(op, type) \
+ static inline void \
+ paravirt_ ## op (unsigned long arg1) \
+ { \
+ PARAVIRT_BR1(op, type, arg1); \
+ }
+
+#define PARAVIRT_DEFINE_CPU_OP1_RET(op, type) \
+ static inline unsigned long \
+ paravirt_ ## op (unsigned long arg1) \
+ { \
+ PARAVIRT_BR1_RET(op, type, arg1); \
+ return ia64_intri_res; \
+ }
+
+#define PARAVIRT_DEFINE_CPU_OP2(op, type) \
+ static inline void \
+ paravirt_ ## op (unsigned long arg1, \
+ unsigned long arg2) \
+ { \
+ PARAVIRT_BR2(op, type, arg1, arg2); \
+ }
+
+
+PARAVIRT_DEFINE_CPU_OP1_VOID(fc, FC);
+PARAVIRT_DEFINE_CPU_OP1_RET(thash, THASH)
+PARAVIRT_DEFINE_CPU_OP1_RET(get_cpuid, GET_CPUID)
+PARAVIRT_DEFINE_CPU_OP1_RET(get_pmd, GET_PMD)
+PARAVIRT_DEFINE_CPU_OP2(ptcga, PTCGA)
+PARAVIRT_DEFINE_CPU_OP1_RET(get_rr, GET_RR)
+PARAVIRT_DEFINE_CPU_OP2(set_rr, SET_RR)
+PARAVIRT_DEFINE_CPU_OP0(ssm_i, SSM_I)
+PARAVIRT_DEFINE_CPU_OP0(rsm_i, RSM_I)
+PARAVIRT_DEFINE_CPU_OP0_RET(get_psr_i, GET_PSR_I)
+PARAVIRT_DEFINE_CPU_OP1(intrin_local_irq_restore, INTRIN_LOCAL_IRQ_RESTORE)
+
+static inline void
+paravirt_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
+ unsigned long val2, unsigned long val3,
+ unsigned long val4)
+{
+ register unsigned long __val0 asm ("r8") = val0;
+ register unsigned long __val1 asm ("r9") = val1;
+ register unsigned long __val2 asm ("r10") = val2;
+ register unsigned long __val3 asm ("r11") = val3;
+ register unsigned long __val4 asm ("r14") = val4;
+
+ register unsigned long ia64_clobber0 asm ("r8");
+ register unsigned long ia64_clobber1 asm ("r9");
+ register unsigned long ia64_clobber2 asm ("r10");
+ register unsigned long ia64_clobber3 asm ("r11");
+ register unsigned long ia64_clobber4 asm ("r14");
+
+ asm volatile (paravirt_alt_bundle(__PARAVIRT_BR,
+ PARAVIRT_TYPE(SET_RR0_TO_RR4))
+ : "=r"(ia64_clobber0),
+ "=r"(ia64_clobber1),
+ "=r"(ia64_clobber2),
+ "=r"(ia64_clobber3),
+ "=r"(ia64_clobber4)
+ : PARAVIRT_OP(set_rr0_to_rr4),
+ "0"(__val0), "1"(__val1), "2"(__val2),
+ "3"(__val3), "4"(__val4)
+ : PARAVIRT_CLOBBERS5);
+}
+
+/* unsigned long paravirt_getreg(int reg) */
+#define __paravirt_getreg(reg) \
+ ({ \
+ register unsigned long ia64_intri_res asm ("r8"); \
+ register unsigned long __reg asm ("r8") = (reg); \
+ \
+ BUILD_BUG_ON(!__builtin_constant_p(reg)); \
+ asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \
+ PARAVIRT_TYPE(GETREG) \
+ + (reg)) \
+ : "=r"(ia64_intri_res) \
+ : PARAVIRT_OP(getreg), "0"(__reg) \
+ : PARAVIRT_CLOBBERS1); \
+ \
+ ia64_intri_res; \
+ })
+
+/* void paravirt_setreg(int reg, unsigned long val) */
+#define paravirt_setreg(reg, val) \
+ do { \
+ register unsigned long __val asm ("r8") = val; \
+ register unsigned long __reg asm ("r9") = reg; \
+ register unsigned long ia64_clobber1 asm ("r8"); \
+ register unsigned long ia64_clobber2 asm ("r9"); \
+ \
+ BUILD_BUG_ON(!__builtin_constant_p(reg)); \
+ asm volatile (paravirt_alt_bundle(__PARAVIRT_BR, \
+ PARAVIRT_TYPE(SETREG) \
+ + (reg)) \
+ : "=r"(ia64_clobber1), \
+ "=r"(ia64_clobber2) \
+ : PARAVIRT_OP(setreg), \
+ "1"(__reg), "0"(__val) \
+ : PARAVIRT_CLOBBERS2); \
+ } while (0)
+
+#endif /* ASM_SUPPORTED */
+#endif /* CONFIG_PARAVIRT && ASM_SUPPOTED */
+
#endif /* _ASM_IA64_PARAVIRT_PRIVOP_H */
diff --git a/arch/ia64/include/asm/smp.h b/arch/ia64/include/asm/smp.h
index 21c402365d0..59840833625 100644
--- a/arch/ia64/include/asm/smp.h
+++ b/arch/ia64/include/asm/smp.h
@@ -126,7 +126,8 @@ extern void identify_siblings (struct cpuinfo_ia64 *);
extern int is_multithreading_enabled(void);
extern void arch_send_call_function_single_ipi(int cpu);
-extern void arch_send_call_function_ipi(cpumask_t mask);
+extern void arch_send_call_function_ipi_mask(const struct cpumask *mask);
+#define arch_send_call_function_ipi_mask arch_send_call_function_ipi_mask
#else /* CONFIG_SMP */
diff --git a/arch/ia64/include/asm/timex.h b/arch/ia64/include/asm/timex.h
index 4e03cfe74a0..86c7db86118 100644
--- a/arch/ia64/include/asm/timex.h
+++ b/arch/ia64/include/asm/timex.h
@@ -40,5 +40,6 @@ get_cycles (void)
}
extern void ia64_cpu_local_tick (void);
+extern unsigned long long ia64_native_sched_clock (void);
#endif /* _ASM_IA64_TIMEX_H */
diff --git a/arch/ia64/include/asm/topology.h b/arch/ia64/include/asm/topology.h
index f260dcf2151..7b4c8c70b2d 100644
--- a/arch/ia64/include/asm/topology.h
+++ b/arch/ia64/include/asm/topology.h
@@ -112,11 +112,6 @@ void build_cpu_to_node_map(void);
extern void arch_fix_phys_package_id(int num, u32 slot);
-#define pcibus_to_cpumask(bus) (pcibus_to_node(bus) == -1 ? \
- CPU_MASK_ALL : \
- node_to_cpumask(pcibus_to_node(bus)) \
- )
-
#define cpumask_of_pcibus(bus) (pcibus_to_node(bus) == -1 ? \
cpu_all_mask : \
cpumask_of_node(pcibus_to_node(bus)))
diff --git a/arch/ia64/include/asm/xen/hypervisor.h b/arch/ia64/include/asm/xen/hypervisor.h
index 7a804e80fc6..e425227a418 100644
--- a/arch/ia64/include/asm/xen/hypervisor.h
+++ b/arch/ia64/include/asm/xen/hypervisor.h
@@ -33,9 +33,6 @@
#ifndef _ASM_IA64_XEN_HYPERVISOR_H
#define _ASM_IA64_XEN_HYPERVISOR_H
-#ifdef CONFIG_XEN
-
-#include <linux/init.h>
#include <xen/interface/xen.h>
#include <xen/interface/version.h> /* to compile feature.c */
#include <xen/features.h> /* to comiple xen-netfront.c */
@@ -43,22 +40,32 @@
/* xen_domain_type is set before executing any C code by early_xen_setup */
enum xen_domain_type {
- XEN_NATIVE,
- XEN_PV_DOMAIN,
- XEN_HVM_DOMAIN,
+ XEN_NATIVE, /* running on bare hardware */
+ XEN_PV_DOMAIN, /* running in a PV domain */
+ XEN_HVM_DOMAIN, /* running in a Xen hvm domain*/
};
+#ifdef CONFIG_XEN
extern enum xen_domain_type xen_domain_type;
+#else
+#define xen_domain_type XEN_NATIVE
+#endif
#define xen_domain() (xen_domain_type != XEN_NATIVE)
-#define xen_pv_domain() (xen_domain_type == XEN_PV_DOMAIN)
-#define xen_initial_domain() (xen_pv_domain() && \
+#define xen_pv_domain() (xen_domain() && \
+ xen_domain_type == XEN_PV_DOMAIN)
+#define xen_hvm_domain() (xen_domain() && \
+ xen_domain_type == XEN_HVM_DOMAIN)
+
+#ifdef CONFIG_XEN_DOM0
+#define xen_initial_domain() (xen_pv_domain() && \
(xen_start_info->flags & SIF_INITDOMAIN))
-#define xen_hvm_domain() (xen_domain_type == XEN_HVM_DOMAIN)
+#else
+#define xen_initial_domain() (0)
+#endif
-/* deprecated. remove this */
-#define is_running_on_xen() (xen_domain_type == XEN_PV_DOMAIN)
+#ifdef CONFIG_XEN
extern struct shared_info *HYPERVISOR_shared_info;
extern struct start_info *xen_start_info;
@@ -74,16 +81,6 @@ void force_evtchn_callback(void);
/* For setup_arch() in arch/ia64/kernel/setup.c */
void xen_ia64_enable_opt_feature(void);
-
-#else /* CONFIG_XEN */
-
-#define xen_domain() (0)
-#define xen_pv_domain() (0)
-#define xen_initial_domain() (0)
-#define xen_hvm_domain() (0)
-#define is_running_on_xen() (0) /* deprecated. remove this */
#endif
-#define is_initial_xendomain() (0) /* deprecated. remove this */
-
#endif /* _ASM_IA64_XEN_HYPERVISOR_H */
diff --git a/arch/ia64/include/asm/xen/inst.h b/arch/ia64/include/asm/xen/inst.h
index 19c2ae1d878..c53a4761120 100644
--- a/arch/ia64/include/asm/xen/inst.h
+++ b/arch/ia64/include/asm/xen/inst.h
@@ -33,6 +33,9 @@
#define __paravirt_work_processed_syscall_target \
xen_work_processed_syscall
+#define paravirt_fsyscall_table xen_fsyscall_table
+#define paravirt_fsys_bubble_down xen_fsys_bubble_down
+
#define MOV_FROM_IFA(reg) \
movl reg = XSI_IFA; \
;; \
@@ -110,6 +113,27 @@
.endm
#define MOV_FROM_PSR(pred, reg, clob) __MOV_FROM_PSR pred, reg, clob
+/* assuming ar.itc is read with interrupt disabled. */
+#define MOV_FROM_ITC(pred, pred_clob, reg, clob) \
+(pred) movl clob = XSI_ITC_OFFSET; \
+ ;; \
+(pred) ld8 clob = [clob]; \
+(pred) mov reg = ar.itc; \
+ ;; \
+(pred) add reg = reg, clob; \
+ ;; \
+(pred) movl clob = XSI_ITC_LAST; \
+ ;; \
+(pred) ld8 clob = [clob]; \
+ ;; \
+(pred) cmp.geu.unc pred_clob, p0 = clob, reg; \
+ ;; \
+(pred_clob) add reg = 1, clob; \
+ ;; \
+(pred) movl clob = XSI_ITC_LAST; \
+ ;; \
+(pred) st8 [clob] = reg
+
#define MOV_TO_IFA(reg, clob) \
movl clob = XSI_IFA; \
@@ -362,6 +386,10 @@
#define RSM_PSR_DT \
XEN_HYPER_RSM_PSR_DT
+#define RSM_PSR_BE_I(clob0, clob1) \
+ RSM_PSR_I(p0, clob0, clob1); \
+ rum psr.be
+
#define SSM_PSR_DT_AND_SRLZ_I \
XEN_HYPER_SSM_PSR_DT
diff --git a/arch/ia64/include/asm/xen/interface.h b/arch/ia64/include/asm/xen/interface.h
index f00fab40854..e951e740bdf 100644
--- a/arch/ia64/include/asm/xen/interface.h
+++ b/arch/ia64/include/asm/xen/interface.h
@@ -209,6 +209,15 @@ struct mapped_regs {
unsigned long krs[8]; /* kernel registers */
unsigned long tmp[16]; /* temp registers
(e.g. for hyperprivops) */
+
+ /* itc paravirtualization
+ * vAR.ITC = mAR.ITC + itc_offset
+ * itc_last is one which was lastly passed to
+ * the guest OS in order to prevent it from
+ * going backwords.
+ */
+ unsigned long itc_offset;
+ unsigned long itc_last;
};
};
};
diff --git a/arch/ia64/include/asm/xen/minstate.h b/arch/ia64/include/asm/xen/minstate.h
index 4d92d9bbda7..c57fa910f2c 100644
--- a/arch/ia64/include/asm/xen/minstate.h
+++ b/arch/ia64/include/asm/xen/minstate.h
@@ -1,3 +1,12 @@
+
+#ifdef CONFIG_VIRT_CPU_ACCOUNTING
+/* read ar.itc in advance, and use it before leaving bank 0 */
+#define XEN_ACCOUNT_GET_STAMP \
+ MOV_FROM_ITC(pUStk, p6, r20, r2);
+#else
+#define XEN_ACCOUNT_GET_STAMP
+#endif
+
/*
* DO_SAVE_MIN switches to the kernel stacks (if necessary) and saves
* the minimum state necessary that allows us to turn psr.ic back
@@ -123,7 +132,7 @@
;; \
.mem.offset 0,0; st8.spill [r16]=r2,16; \
.mem.offset 8,0; st8.spill [r17]=r3,16; \
- ACCOUNT_GET_STAMP \
+ XEN_ACCOUNT_GET_STAMP \
adds r2=IA64_PT_REGS_R16_OFFSET,r1; \
;; \
EXTRA; \
diff --git a/arch/ia64/include/asm/xen/patchlist.h b/arch/ia64/include/asm/xen/patchlist.h
new file mode 100644
index 00000000000..eae944e8884
--- /dev/null
+++ b/arch/ia64/include/asm/xen/patchlist.h
@@ -0,0 +1,38 @@
+/******************************************************************************
+ * arch/ia64/include/asm/xen/patchlist.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#define __paravirt_start_gate_fsyscall_patchlist \
+ __xen_start_gate_fsyscall_patchlist
+#define __paravirt_end_gate_fsyscall_patchlist \
+ __xen_end_gate_fsyscall_patchlist
+#define __paravirt_start_gate_brl_fsys_bubble_down_patchlist \
+ __xen_start_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_end_gate_brl_fsys_bubble_down_patchlist \
+ __xen_end_gate_brl_fsys_bubble_down_patchlist
+#define __paravirt_start_gate_vtop_patchlist \
+ __xen_start_gate_vtop_patchlist
+#define __paravirt_end_gate_vtop_patchlist \
+ __xen_end_gate_vtop_patchlist
+#define __paravirt_start_gate_mckinley_e9_patchlist \
+ __xen_start_gate_mckinley_e9_patchlist
+#define __paravirt_end_gate_mckinley_e9_patchlist \
+ __xen_end_gate_mckinley_e9_patchlist
diff --git a/arch/ia64/include/asm/xen/privop.h b/arch/ia64/include/asm/xen/privop.h
index 71ec7546e10..fb4ec5e0b06 100644
--- a/arch/ia64/include/asm/xen/privop.h
+++ b/arch/ia64/include/asm/xen/privop.h
@@ -55,6 +55,8 @@
#define XSI_BANK1_R16 (XSI_BASE + XSI_BANK1_R16_OFS)
#define XSI_BANKNUM (XSI_BASE + XSI_BANKNUM_OFS)
#define XSI_IHA (XSI_BASE + XSI_IHA_OFS)
+#define XSI_ITC_OFFSET (XSI_BASE + XSI_ITC_OFFSET_OFS)
+#define XSI_ITC_LAST (XSI_BASE + XSI_ITC_LAST_OFS)
#endif
#ifndef __ASSEMBLY__
@@ -67,7 +69,7 @@
* may have different semantics depending on whether they are executed
* at PL0 vs PL!=0. When paravirtualized, these instructions mustn't
* be allowed to execute directly, lest incorrect semantics result. */
-extern void xen_fc(unsigned long addr);
+extern void xen_fc(void *addr);
extern unsigned long xen_thash(unsigned long addr);
/* Note that "ttag" and "cover" are also privilege-sensitive; "ttag"
@@ -80,8 +82,10 @@ extern unsigned long xen_thash(unsigned long addr);
extern unsigned long xen_get_cpuid(int index);
extern unsigned long xen_get_pmd(int index);
+#ifndef ASM_SUPPORTED
extern unsigned long xen_get_eflag(void); /* see xen_ia64_getreg */
extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */
+#endif
/************************************************/
/* Instructions paravirtualized for performance */
@@ -106,6 +110,7 @@ extern void xen_set_eflag(unsigned long); /* see xen_ia64_setreg */
#define xen_get_virtual_pend() \
(*(((uint8_t *)XEN_MAPPEDREGS->interrupt_mask_addr) - 1))
+#ifndef ASM_SUPPORTED
/* Although all privileged operations can be left to trap and will
* be properly handled by Xen, some are frequent enough that we use
* hyperprivops for performance. */
@@ -123,6 +128,7 @@ extern void xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
unsigned long val4);
extern void xen_set_kr(unsigned long index, unsigned long val);
extern void xen_ptcga(unsigned long addr, unsigned long size);
+#endif /* !ASM_SUPPORTED */
#endif /* !__ASSEMBLY__ */
diff --git a/arch/ia64/kernel/Makefile b/arch/ia64/kernel/Makefile
index f2778f2c4fd..5628e9a990a 100644
--- a/arch/ia64/kernel/Makefile
+++ b/arch/ia64/kernel/Makefile
@@ -5,7 +5,7 @@
extra-y := head.o init_task.o vmlinux.lds
obj-y := acpi.o entry.o efi.o efi_stub.o gate-data.o fsys.o ia64_ksyms.o irq.o irq_ia64.o \
- irq_lsapic.o ivt.o machvec.o pal.o patch.o process.o perfmon.o ptrace.o sal.o \
+ irq_lsapic.o ivt.o machvec.o pal.o paravirt_patchlist.o patch.o process.o perfmon.o ptrace.o sal.o \
salinfo.o setup.o signal.o sys_ia64.o time.o traps.o unaligned.o \
unwind.o mca.o mca_asm.o topology.o dma-mapping.o
@@ -36,7 +36,8 @@ obj-$(CONFIG_PCI_MSI) += msi_ia64.o
mca_recovery-y += mca_drv.o mca_drv_asm.o
obj-$(CONFIG_IA64_MC_ERR_INJECT)+= err_inject.o
-obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o
+obj-$(CONFIG_PARAVIRT) += paravirt.o paravirtentry.o \
+ paravirt_patch.o
obj-$(CONFIG_IA64_ESI) += esi.o
ifneq ($(CONFIG_IA64_ESI),)
@@ -45,35 +46,13 @@ endif
obj-$(CONFIG_DMAR) += pci-dma.o
obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o
-# The gate DSO image is built using a special linker script.
-targets += gate.so gate-syms.o
-
-extra-y += gate.so gate-syms.o gate.lds gate.o
-
# fp_emulate() expects f2-f5,f16-f31 to contain the user-level state.
CFLAGS_traps.o += -mfixed-range=f2-f5,f16-f31
-CPPFLAGS_gate.lds := -P -C -U$(ARCH)
-
-quiet_cmd_gate = GATE $@
- cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
-
-GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
- $(call ld-option, -Wl$(comma)--hash-style=sysv)
-$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
- $(call if_changed,gate)
-
-$(obj)/built-in.o: $(obj)/gate-syms.o
-$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
-
-GATECFLAGS_gate-syms.o = -r
-$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
- $(call if_changed,gate)
-
-# gate-data.o contains the gate DSO image as data in section .data.gate.
-# We must build gate.so before we can assemble it.
-# Note: kbuild does not track this dependency due to usage of .incbin
-$(obj)/gate-data.o: $(obj)/gate.so
+# The gate DSO image is built using a special linker script.
+include $(srctree)/arch/ia64/kernel/Makefile.gate
+# tell compiled for native
+CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_NATIVE
# Calculate NR_IRQ = max(IA64_NATIVE_NR_IRQS, XEN_NR_IRQS, ...) based on config
define sed-y
@@ -109,9 +88,9 @@ include/asm-ia64/nr-irqs.h: arch/$(SRCARCH)/kernel/nr-irqs.s
clean-files += $(objtree)/include/asm-ia64/nr-irqs.h
#
-# native ivt.S and entry.S
+# native ivt.S, entry.S and fsys.S
#
-ASM_PARAVIRT_OBJS = ivt.o entry.o
+ASM_PARAVIRT_OBJS = ivt.o entry.o fsys.o
define paravirtualized_native
AFLAGS_$(1) += -D__IA64_ASM_PARAVIRTUALIZED_NATIVE
AFLAGS_pvchk-sed-$(1) += -D__IA64_ASM_PARAVIRTUALIZED_PVCHECK
diff --git a/arch/ia64/kernel/Makefile.gate b/arch/ia64/kernel/Makefile.gate
new file mode 100644
index 00000000000..1d87f84069b
--- /dev/null
+++ b/arch/ia64/kernel/Makefile.gate
@@ -0,0 +1,27 @@
+# The gate DSO image is built using a special linker script.
+
+targets += gate.so gate-syms.o
+
+extra-y += gate.so gate-syms.o gate.lds gate.o
+
+CPPFLAGS_gate.lds := -P -C -U$(ARCH)
+
+quiet_cmd_gate = GATE $@
+ cmd_gate = $(CC) -nostdlib $(GATECFLAGS_$(@F)) -Wl,-T,$(filter-out FORCE,$^) -o $@
+
+GATECFLAGS_gate.so = -shared -s -Wl,-soname=linux-gate.so.1 \
+ $(call ld-option, -Wl$(comma)--hash-style=sysv)
+$(obj)/gate.so: $(obj)/gate.lds $(obj)/gate.o FORCE
+ $(call if_changed,gate)
+
+$(obj)/built-in.o: $(obj)/gate-syms.o
+$(obj)/built-in.o: ld_flags += -R $(obj)/gate-syms.o
+
+GATECFLAGS_gate-syms.o = -r
+$(obj)/gate-syms.o: $(obj)/gate.lds $(obj)/gate.o FORCE
+ $(call if_changed,gate)
+
+# gate-data.o contains the gate DSO image as data in section .data.gate.
+# We must build gate.so before we can assemble it.
+# Note: kbuild does not track this dependency due to usage of .incbin
+$(obj)/gate-data.o: $(obj)/gate.so
diff --git a/arch/ia64/kernel/acpi.c b/arch/ia64/kernel/acpi.c
index bdef2ce38c8..5510317db37 100644
--- a/arch/ia64/kernel/acpi.c
+++ b/arch/ia64/kernel/acpi.c
@@ -890,7 +890,7 @@ __init void prefill_possible_map(void)
possible, max((possible - available_cpus), 0));
for (i = 0; i < possible; i++)
- cpu_set(i, cpu_possible_map);
+ set_cpu_possible(i, true);
}
int acpi_map_lsapic(acpi_handle handle, int *pcpu)
@@ -928,9 +928,9 @@ int acpi_map_lsapic(acpi_handle handle, int *pcpu)
buffer.length = ACPI_ALLOCATE_BUFFER;
buffer.pointer = NULL;
- cpus_complement(tmp_map, cpu_present_map);
- cpu = first_cpu(tmp_map);
- if (cpu >= NR_CPUS)
+ cpumask_complement(&tmp_map, cpu_present_mask);
+ cpu = cpumask_first(&tmp_map);
+ if (cpu >= nr_cpu_ids)
return -EINVAL;
acpi_map_cpu2node(handle, cpu, physid);
diff --git a/arch/ia64/kernel/asm-offsets.c b/arch/ia64/kernel/asm-offsets.c
index 742dbb1d5a4..af565016904 100644
--- a/arch/ia64/kernel/asm-offsets.c
+++ b/arch/ia64/kernel/asm-offsets.c
@@ -316,5 +316,7 @@ void foo(void)
DEFINE_MAPPED_REG_OFS(XSI_BANK1_R16_OFS, bank1_regs[0]);
DEFINE_MAPPED_REG_OFS(XSI_B0NATS_OFS, vbnat);
DEFINE_MAPPED_REG_OFS(XSI_B1NATS_OFS, vnat);
+ DEFINE_MAPPED_REG_OFS(XSI_ITC_OFFSET_OFS, itc_offset);
+ DEFINE_MAPPED_REG_OFS(XSI_ITC_LAST_OFS, itc_last);
#endif /* CONFIG_XEN */
}
diff --git a/arch/ia64/kernel/efi.c b/arch/ia64/kernel/efi.c
index efaff15d8cf..7ef80e8161c 100644
--- a/arch/ia64/kernel/efi.c
+++ b/arch/ia64/kernel/efi.c
@@ -456,6 +456,7 @@ efi_map_pal_code (void)
GRANULEROUNDDOWN((unsigned long) pal_vaddr),
pte_val(pfn_pte(__pa(pal_vaddr) >> PAGE_SHIFT, PAGE_KERNEL)),
IA64_GRANULE_SHIFT);
+ paravirt_dv_serialize_data();
ia64_set_psr(psr); /* restore psr */
}
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index e5341e2c117..ccfdeee9d89 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -735,7 +735,7 @@ GLOBAL_ENTRY(__paravirt_leave_syscall)
__paravirt_work_processed_syscall:
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
adds r2=PT(LOADRS)+16,r12
-(pUStk) mov.m r22=ar.itc // fetch time at leave
+ MOV_FROM_ITC(pUStk, p9, r22, r19) // fetch time at leave
adds r18=TI_FLAGS+IA64_TASK_SIZE,r13
;;
(p6) ld4 r31=[r18] // load current_thread_info()->flags
@@ -984,7 +984,7 @@ GLOBAL_ENTRY(__paravirt_leave_kernel)
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
.pred.rel.mutex pUStk,pKStk
MOV_FROM_PSR(pKStk, r22, r29) // M2 read PSR now that interrupts are disabled
-(pUStk) mov.m r22=ar.itc // M fetch time at leave
+ MOV_FROM_ITC(pUStk, p9, r22, r29) // M fetch time at leave
nop.i 0
;;
#else
diff --git a/arch/ia64/kernel/fsys.S b/arch/ia64/kernel/fsys.S
index c1625c7e177..3567d54f8ce 100644
--- a/arch/ia64/kernel/fsys.S
+++ b/arch/ia64/kernel/fsys.S
@@ -25,6 +25,7 @@
#include <asm/unistd.h>
#include "entry.h"
+#include "paravirt_inst.h"
/*
* See Documentation/ia64/fsys.txt for details on fsyscalls.
@@ -279,7 +280,7 @@ ENTRY(fsys_gettimeofday)
(p9) cmp.eq p13,p0 = 0,r30 // if mmio_ptr, clear p13 jitter control
;;
.pred.rel.mutex p8,p9
-(p8) mov r2 = ar.itc // CPU_TIMER. 36 clocks latency!!!
+ MOV_FROM_ITC(p8, p6, r2, r10) // CPU_TIMER. 36 clocks latency!!!
(p9) ld8 r2 = [r30] // MMIO_TIMER. Could also have latency issues..
(p13) ld8 r25 = [r19] // get itc_lastcycle value
ld8 r9 = [r22],IA64_TIMESPEC_TV_NSEC_OFFSET // tv_sec
@@ -418,7 +419,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set
mov r17=(1 << (SIGKILL - 1)) | (1 << (SIGSTOP - 1))
;;
- rsm psr.i // mask interrupt delivery
+ RSM_PSR_I(p0, r18, r19) // mask interrupt delivery
mov ar.ccv=0
andcm r14=r14,r17 // filter out SIGKILL & SIGSTOP
@@ -491,7 +492,7 @@ EX(.fail_efault, ld8 r14=[r33]) // r14 <- *set
#ifdef CONFIG_SMP
st4.rel [r31]=r0 // release the lock
#endif
- ssm psr.i
+ SSM_PSR_I(p0, p9, r31)
;;
srlz.d // ensure psr.i is set again
@@ -513,7 +514,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3)
#ifdef CONFIG_SMP
st4.rel [r31]=r0 // release the lock
#endif
- ssm psr.i
+ SSM_PSR_I(p0, p9, r17)
;;
srlz.d
br.sptk.many fsys_fallback_syscall // with signal pending, do the heavy-weight syscall
@@ -521,7 +522,7 @@ EX(.fail_efault, (p15) st8 [r34]=r3)
#ifdef CONFIG_SMP
.lock_contention:
/* Rather than spinning here, fall back on doing a heavy-weight syscall. */
- ssm psr.i
+ SSM_PSR_I(p0, p9, r17)
;;
srlz.d
br.sptk.many fsys_fallback_syscall
@@ -592,17 +593,17 @@ ENTRY(fsys_fallback_syscall)
adds r17=-1024,r15
movl r14=sys_call_table
;;
- rsm psr.i
+ RSM_PSR_I(p0, r26, r27)
shladd r18=r17,3,r14
;;
ld8 r18=[r18] // load normal (heavy-weight) syscall entry-point
- mov r29=psr // read psr (12 cyc load latency)
+ MOV_FROM_PSR(p0, r29, r26) // read psr (12 cyc load latency)
mov r27=ar.rsc
mov r21=ar.fpsr
mov r26=ar.pfs
END(fsys_fallback_syscall)
/* FALL THROUGH */
-GLOBAL_ENTRY(fsys_bubble_down)
+GLOBAL_ENTRY(paravirt_fsys_bubble_down)
.prologue
.altrp b6
.body
@@ -640,7 +641,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
*
* PSR.BE : already is turned off in __kernel_syscall_via_epc()
* PSR.AC : don't care (kernel normally turns PSR.AC on)
- * PSR.I : already turned off by the time fsys_bubble_down gets
+ * PSR.I : already turned off by the time paravirt_fsys_bubble_down gets
* invoked
* PSR.DFL: always 0 (kernel never turns it on)
* PSR.DFH: don't care --- kernel never touches f32-f127 on its own
@@ -650,7 +651,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
* PSR.DB : don't care --- kernel never enables kernel-level
* breakpoints
* PSR.TB : must be 0 already; if it wasn't zero on entry to
- * __kernel_syscall_via_epc, the branch to fsys_bubble_down
+ * __kernel_syscall_via_epc, the branch to paravirt_fsys_bubble_down
* will trigger a taken branch; the taken-trap-handler then
* converts the syscall into a break-based system-call.
*/
@@ -683,7 +684,7 @@ GLOBAL_ENTRY(fsys_bubble_down)
;;
mov ar.rsc=0 // M2 set enforced lazy mode, pl 0, LE, loadrs=0
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
- mov.m r30=ar.itc // M get cycle for accounting
+ MOV_FROM_ITC(p0, p6, r30, r23) // M get cycle for accounting
#else
nop.m 0
#endif
@@ -734,21 +735,21 @@ GLOBAL_ENTRY(fsys_bubble_down)
mov rp=r14 // I0 set the real return addr
and r3=_TIF_SYSCALL_TRACEAUDIT,r3 // A
;;
- ssm psr.i // M2 we're on kernel stacks now, reenable irqs
+ SSM_PSR_I(p0, p6, r22) // M2 we're on kernel stacks now, reenable irqs
cmp.eq p8,p0=r3,r0 // A
(p10) br.cond.spnt.many ia64_ret_from_syscall // B return if bad call-frame or r15 is a NaT
nop.m 0
(p8) br.call.sptk.many b6=b6 // B (ignore return address)
br.cond.spnt ia64_trace_syscall // B
-END(fsys_bubble_down)
+END(paravirt_fsys_bubble_down)
.rodata
.align 8
- .globl fsyscall_table
+ .globl paravirt_fsyscall_table
- data8 fsys_bubble_down
-fsyscall_table:
+ data8 paravirt_fsys_bubble_down
+paravirt_fsyscall_table:
data8 fsys_ni_syscall
data8 0 // exit // 1025
data8 0 // read
@@ -1033,4 +1034,4 @@ fsyscall_table:
// fill in zeros for the remaining entries
.zero:
- .space fsyscall_table + 8*NR_syscalls - .zero, 0
+ .space paravirt_fsyscall_table + 8*NR_syscalls - .zero, 0
diff --git a/arch/ia64/kernel/gate.S b/arch/ia64/kernel/gate.S
index 74b1ccce4e8..cf5e0a105e1 100644
--- a/arch/ia64/kernel/gate.S
+++ b/arch/ia64/kernel/gate.S
@@ -13,6 +13,7 @@
#include <asm/sigcontext.h>
#include <asm/system.h>
#include <asm/unistd.h>
+#include "paravirt_inst.h"
/*
* We can't easily refer to symbols inside the kernel. To avoid full runtime relocation,
@@ -48,87 +49,6 @@ GLOBAL_ENTRY(__kernel_syscall_via_break)
}
END(__kernel_syscall_via_break)
-/*
- * On entry:
- * r11 = saved ar.pfs
- * r15 = system call #
- * b0 = saved return address
- * b6 = return address
- * On exit:
- * r11 = saved ar.pfs
- * r15 = system call #
- * b0 = saved return address
- * all other "scratch" registers: undefined
- * all "preserved" registers: same as on entry
- */
-
-GLOBAL_ENTRY(__kernel_syscall_via_epc)
- .prologue
- .altrp b6
- .body
-{
- /*
- * Note: the kernel cannot assume that the first two instructions in this
- * bundle get executed. The remaining code must be safe even if
- * they do not get executed.
- */
- adds r17=-1024,r15 // A
- mov r10=0 // A default to successful syscall execution
- epc // B causes split-issue
-}
- ;;
- rsm psr.be | psr.i // M2 (5 cyc to srlz.d)
- LOAD_FSYSCALL_TABLE(r14) // X
- ;;
- mov r16=IA64_KR(CURRENT) // M2 (12 cyc)
- shladd r18=r17,3,r14 // A
- mov r19=NR_syscalls-1 // A
- ;;
- lfetch [r18] // M0|1
- mov r29=psr // M2 (12 cyc)
- // If r17 is a NaT, p6 will be zero
- cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)?
- ;;
- mov r21=ar.fpsr // M2 (12 cyc)
- tnat.nz p10,p9=r15 // I0
- mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...)
- ;;
- srlz.d // M0 (forces split-issue) ensure PSR.BE==0
-(p6) ld8 r18=[r18] // M0|1
- nop.i 0
- ;;
- nop.m 0
-(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!)
- nop.i 0
- ;;
-(p8) ssm psr.i
-(p6) mov b7=r18 // I0
-(p8) br.dptk.many b7 // B
-
- mov r27=ar.rsc // M2 (12 cyc)
-/*
- * brl.cond doesn't work as intended because the linker would convert this branch
- * into a branch to a PLT. Perhaps there will be a way to avoid this with some
- * future version of the linker. In the meantime, we just use an indirect branch
- * instead.
- */
-#ifdef CONFIG_ITANIUM
-(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry
- ;;
-(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down
- ;;
-(p6) mov b7=r14
-(p6) br.sptk.many b7
-#else
- BRL_COND_FSYS_BUBBLE_DOWN(p6)
-#endif
- ssm psr.i
- mov r10=-1
-(p10) mov r8=EINVAL
-(p9) mov r8=ENOSYS
- FSYS_RETURN
-END(__kernel_syscall_via_epc)
-
# define ARG0_OFF (16 + IA64_SIGFRAME_ARG0_OFFSET)
# define ARG1_OFF (16 + IA64_SIGFRAME_ARG1_OFFSET)
# define ARG2_OFF (16 + IA64_SIGFRAME_ARG2_OFFSET)
@@ -374,3 +294,92 @@ restore_rbs:
// invala not necessary as that will happen when returning to user-mode
br.cond.sptk back_from_restore_rbs
END(__kernel_sigtramp)
+
+/*
+ * On entry:
+ * r11 = saved ar.pfs
+ * r15 = system call #
+ * b0 = saved return address
+ * b6 = return address
+ * On exit:
+ * r11 = saved ar.pfs
+ * r15 = system call #
+ * b0 = saved return address
+ * all other "scratch" registers: undefined
+ * all "preserved" registers: same as on entry
+ */
+
+GLOBAL_ENTRY(__kernel_syscall_via_epc)
+ .prologue
+ .altrp b6
+ .body
+{
+ /*
+ * Note: the kernel cannot assume that the first two instructions in this
+ * bundle get executed. The remaining code must be safe even if
+ * they do not get executed.
+ */
+ adds r17=-1024,r15 // A
+ mov r10=0 // A default to successful syscall execution
+ epc // B causes split-issue
+}
+ ;;
+ RSM_PSR_BE_I(r20, r22) // M2 (5 cyc to srlz.d)
+ LOAD_FSYSCALL_TABLE(r14) // X
+ ;;
+ mov r16=IA64_KR(CURRENT) // M2 (12 cyc)
+ shladd r18=r17,3,r14 // A
+ mov r19=NR_syscalls-1 // A
+ ;;
+ lfetch [r18] // M0|1
+ MOV_FROM_PSR(p0, r29, r8) // M2 (12 cyc)
+ // If r17 is a NaT, p6 will be zero
+ cmp.geu p6,p7=r19,r17 // A (sysnr > 0 && sysnr < 1024+NR_syscalls)?
+ ;;
+ mov r21=ar.fpsr // M2 (12 cyc)
+ tnat.nz p10,p9=r15 // I0
+ mov.i r26=ar.pfs // I0 (would stall anyhow due to srlz.d...)
+ ;;
+ srlz.d // M0 (forces split-issue) ensure PSR.BE==0
+(p6) ld8 r18=[r18] // M0|1
+ nop.i 0
+ ;;
+ nop.m 0
+(p6) tbit.z.unc p8,p0=r18,0 // I0 (dual-issues with "mov b7=r18"!)
+ nop.i 0
+ ;;
+ SSM_PSR_I(p8, p14, r25)
+(p6) mov b7=r18 // I0
+(p8) br.dptk.many b7 // B
+
+ mov r27=ar.rsc // M2 (12 cyc)
+/*
+ * brl.cond doesn't work as intended because the linker would convert this branch
+ * into a branch to a PLT. Perhaps there will be a way to avoid this with some
+ * future version of the linker. In the meantime, we just use an indirect branch
+ * instead.
+ */
+#ifdef CONFIG_ITANIUM
+(p6) add r14=-8,r14 // r14 <- addr of fsys_bubble_down entry
+ ;;
+(p6) ld8 r14=[r14] // r14 <- fsys_bubble_down
+ ;;
+(p6) mov b7=r14
+(p6) br.sptk.many b7
+#else
+ BRL_COND_FSYS_BUBBLE_DOWN(p6)
+#endif
+ SSM_PSR_I(p0, p14, r10)
+ mov r10=-1
+(p10) mov r8=EINVAL
+(p9) mov r8=ENOSYS
+ FSYS_RETURN
+
+#ifdef CONFIG_PARAVIRT
+ /*
+ * padd to make the size of this symbol constant
+ * independent of paravirtualization.
+ */
+ .align PAGE_SIZE / 8
+#endif
+END(__kernel_syscall_via_epc)
diff --git a/arch/ia64/kernel/gate.lds.S b/arch/ia64/kernel/gate.lds.S
index 3cb1abc00e2..88c64ed47c3 100644
--- a/arch/ia64/kernel/gate.lds.S
+++ b/arch/ia64/kernel/gate.lds.S
@@ -7,6 +7,7 @@
#include <asm/system.h>
+#include "paravirt_patchlist.h"
SECTIONS
{
@@ -33,21 +34,21 @@ SECTIONS
. = GATE_ADDR + 0x600;
.data.patch : {
- __start_gate_mckinley_e9_patchlist = .;
+ __paravirt_start_gate_mckinley_e9_patchlist = .;
*(.data.patch.mckinley_e9)
- __end_gate_mckinley_e9_patchlist = .;
+ __paravirt_end_gate_mckinley_e9_patchlist = .;
- __start_gate_vtop_patchlist = .;
+ __paravirt_start_gate_vtop_patchlist = .;
*(.data.patch.vtop)
- __end_gate_vtop_patchlist = .;
+ __paravirt_end_gate_vtop_patchlist = .;
- __start_gate_fsyscall_patchlist = .;
+ __paravirt_start_gate_fsyscall_patchlist = .;
*(.data.patch.fsyscall_table)
- __end_gate_fsyscall_patchlist = .;
+ __paravirt_end_gate_fsyscall_patchlist = .;
- __start_gate_brl_fsys_bubble_down_patchlist = .;
+ __paravirt_start_gate_brl_fsys_bubble_down_patchlist = .;
*(.data.patch.brl_fsys_bubble_down)
- __end_gate_brl_fsys_bubble_down_patchlist = .;
+ __paravirt_end_gate_brl_fsys_bubble_down_patchlist = .;
} :readable
.IA_64.unwind_info : { *(.IA_64.unwind_info*) }
diff --git a/arch/ia64/kernel/head.S b/arch/ia64/kernel/head.S
index 59301c47280..23f846de62d 100644
--- a/arch/ia64/kernel/head.S
+++ b/arch/ia64/kernel/head.S
@@ -1050,7 +1050,7 @@ END(ia64_delay_loop)
* except that the multiplication and the shift are done with 128-bit
* intermediate precision so that we can produce a full 64-bit result.
*/
-GLOBAL_ENTRY(sched_clock)
+GLOBAL_ENTRY(ia64_native_sched_clock)
addl r8=THIS_CPU(cpu_info) + IA64_CPUINFO_NSEC_PER_CYC_OFFSET,r0
mov.m r9=ar.itc // fetch cycle-counter (35 cyc)
;;
@@ -1066,7 +1066,13 @@ GLOBAL_ENTRY(sched_clock)
;;
shrp r8=r9,r8,IA64_NSEC_PER_CYC_SHIFT
br.ret.sptk.many rp
-END(sched_clock)
+END(ia64_native_sched_clock)
+#ifndef CONFIG_PARAVIRT
+ //unsigned long long
+ //sched_clock(void) __attribute__((alias("ia64_native_sched_clock")));
+ .global sched_clock
+sched_clock = ia64_native_sched_clock
+#endif
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
GLOBAL_ENTRY(cycle_to_cputime)
diff --git a/arch/ia64/kernel/ivt.S b/arch/ia64/kernel/ivt.S
index f675d8e3385..ec9a5fdfa1b 100644
--- a/arch/ia64/kernel/ivt.S
+++ b/arch/ia64/kernel/ivt.S
@@ -804,7 +804,7 @@ ENTRY(break_fault)
///////////////////////////////////////////////////////////////////////
st1 [r16]=r0 // M2|3 clear current->thread.on_ustack flag
#ifdef CONFIG_VIRT_CPU_ACCOUNTING
- mov.m r30=ar.itc // M get cycle for accounting
+ MOV_FROM_ITC(p0, p14, r30, r18) // M get cycle for accounting
#else
mov b6=r30 // I0 setup syscall handler branch reg early
#endif
diff --git a/arch/ia64/kernel/mca.c b/arch/ia64/kernel/mca.c
index bab1de2d2f6..8f33a884042 100644
--- a/arch/ia64/kernel/mca.c
+++ b/arch/ia64/kernel/mca.c
@@ -1456,9 +1456,9 @@ ia64_mca_cmc_int_caller(int cmc_irq, void *arg)
ia64_mca_cmc_int_handler(cmc_irq, arg);
- for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
+ cpuid = cpumask_next(cpuid+1, cpu_online_mask);
- if (cpuid < NR_CPUS) {
+ if (cpuid < nr_cpu_ids) {
platform_send_ipi(cpuid, IA64_CMCP_VECTOR, IA64_IPI_DM_INT, 0);
} else {
/* If no log record, switch out of polling mode */
@@ -1525,7 +1525,7 @@ ia64_mca_cpe_int_caller(int cpe_irq, void *arg)
ia64_mca_cpe_int_handler(cpe_irq, arg);
- for (++cpuid ; cpuid < NR_CPUS && !cpu_online(cpuid) ; cpuid++);
+ cpuid = cpumask_next(cpuid+1, cpu_online_mask);
if (cpuid < NR_CPUS) {
platform_send_ipi(cpuid, IA64_CPEP_VECTOR, IA64_IPI_DM_INT, 0);
diff --git a/arch/ia64/kernel/module.c b/arch/ia64/kernel/module.c
index aaa7d901521..da3b0cf495a 100644
--- a/arch/ia64/kernel/module.c
+++ b/arch/ia64/kernel/module.c
@@ -446,6 +446,14 @@ module_frob_arch_sections (Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings,
mod->arch.opd = s;
else if (strcmp(".IA_64.unwind", secstrings + s->sh_name) == 0)
mod->arch.unwind = s;
+#ifdef CONFIG_PARAVIRT
+ else if (strcmp(".paravirt_bundles",
+ secstrings + s->sh_name) == 0)
+ mod->arch.paravirt_bundles = s;
+ else if (strcmp(".paravirt_insts",
+ secstrings + s->sh_name) == 0)
+ mod->arch.paravirt_insts = s;
+#endif
if (!mod->arch.core_plt || !mod->arch.init_plt || !mod->arch.got || !mod->arch.opd) {
printk(KERN_ERR "%s: sections missing\n", mod->name);
@@ -525,8 +533,7 @@ get_ltoff (struct module *mod, uint64_t value, int *okp)
goto found;
/* Not enough GOT entries? */
- if (e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size))
- BUG();
+ BUG_ON(e >= (struct got_entry *) (mod->arch.got->sh_addr + mod->arch.got->sh_size));
e->val = value;
++mod->arch.next_got_entry;
@@ -921,6 +928,30 @@ module_finalize (const Elf_Ehdr *hdr, const Elf_Shdr *sechdrs, struct module *mo
DEBUGP("%s: init: entry=%p\n", __func__, mod->init);
if (mod->arch.unwind)
register_unwind_table(mod);
+#ifdef CONFIG_PARAVIRT
+ if (mod->arch.paravirt_bundles) {
+ struct paravirt_patch_site_bundle *start =
+ (struct paravirt_patch_site_bundle *)
+ mod->arch.paravirt_bundles->sh_addr;
+ struct paravirt_patch_site_bundle *end =
+ (struct paravirt_patch_site_bundle *)
+ (mod->arch.paravirt_bundles->sh_addr +
+ mod->arch.paravirt_bundles->sh_size);
+
+ paravirt_patch_apply_bundle(start, end);
+ }
+ if (mod->arch.paravirt_insts) {
+ struct paravirt_patch_site_inst *start =
+ (struct paravirt_patch_site_inst *)
+ mod->arch.paravirt_insts->sh_addr;
+ struct paravirt_patch_site_inst *end =
+ (struct paravirt_patch_site_inst *)
+ (mod->arch.paravirt_insts->sh_addr +
+ mod->arch.paravirt_insts->sh_size);
+
+ paravirt_patch_apply_inst(start, end);
+ }
+#endif
return 0;
}
diff --git a/arch/ia64/kernel/paravirt.c b/arch/ia64/kernel/paravirt.c
index 9f14c16f636..a21d7bb9c69 100644
--- a/arch/ia64/kernel/paravirt.c
+++ b/arch/ia64/kernel/paravirt.c
@@ -46,13 +46,23 @@ struct pv_info pv_info = {
* initialization hooks.
*/
-struct pv_init_ops pv_init_ops;
+static void __init
+ia64_native_patch_branch(unsigned long tag, unsigned long type);
+
+struct pv_init_ops pv_init_ops =
+{
+#ifdef ASM_SUPPORTED
+ .patch_bundle = ia64_native_patch_bundle,
+#endif
+ .patch_branch = ia64_native_patch_branch,
+};
/***************************************************************************
* pv_cpu_ops
* intrinsics hooks.
*/
+#ifndef ASM_SUPPORTED
/* ia64_native_xxx are macros so that we have to make them real functions */
#define DEFINE_VOID_FUNC1(name) \
@@ -60,7 +70,14 @@ struct pv_init_ops pv_init_ops;
ia64_native_ ## name ## _func(unsigned long arg) \
{ \
ia64_native_ ## name(arg); \
- } \
+ }
+
+#define DEFINE_VOID_FUNC1_VOID(name) \
+ static void \
+ ia64_native_ ## name ## _func(void *arg) \
+ { \
+ ia64_native_ ## name(arg); \
+ }
#define DEFINE_VOID_FUNC2(name) \
static void \
@@ -68,7 +85,7 @@ struct pv_init_ops pv_init_ops;
unsigned long arg1) \
{ \
ia64_native_ ## name(arg0, arg1); \
- } \
+ }
#define DEFINE_FUNC0(name) \
static unsigned long \
@@ -84,7 +101,7 @@ struct pv_init_ops pv_init_ops;
return ia64_native_ ## name(arg); \
} \
-DEFINE_VOID_FUNC1(fc);
+DEFINE_VOID_FUNC1_VOID(fc);
DEFINE_VOID_FUNC1(intrin_local_irq_restore);
DEFINE_VOID_FUNC2(ptcga);
@@ -274,6 +291,266 @@ ia64_native_setreg_func(int regnum, unsigned long val)
break;
}
}
+#else
+
+#define __DEFINE_FUNC(name, code) \
+ extern const char ia64_native_ ## name ## _direct_start[]; \
+ extern const char ia64_native_ ## name ## _direct_end[]; \
+ asm (".align 32\n" \
+ ".proc ia64_native_" #name "_func\n" \
+ "ia64_native_" #name "_func:\n" \
+ "ia64_native_" #name "_direct_start:\n" \
+ code \
+ "ia64_native_" #name "_direct_end:\n" \
+ "br.cond.sptk.many b6\n" \
+ ".endp ia64_native_" #name "_func\n")
+
+#define DEFINE_VOID_FUNC0(name, code) \
+ extern void \
+ ia64_native_ ## name ## _func(void); \
+ __DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1(name, code) \
+ extern void \
+ ia64_native_ ## name ## _func(unsigned long arg); \
+ __DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1_VOID(name, code) \
+ extern void \
+ ia64_native_ ## name ## _func(void *arg); \
+ __DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC2(name, code) \
+ extern void \
+ ia64_native_ ## name ## _func(unsigned long arg0, \
+ unsigned long arg1); \
+ __DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC0(name, code) \
+ extern unsigned long \
+ ia64_native_ ## name ## _func(void); \
+ __DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC1(name, type, code) \
+ extern unsigned long \
+ ia64_native_ ## name ## _func(type arg); \
+ __DEFINE_FUNC(name, code)
+
+DEFINE_VOID_FUNC1_VOID(fc,
+ "fc r8\n");
+DEFINE_VOID_FUNC1(intrin_local_irq_restore,
+ ";;\n"
+ " cmp.ne p6, p7 = r8, r0\n"
+ ";;\n"
+ "(p6) ssm psr.i\n"
+ "(p7) rsm psr.i\n"
+ ";;\n"
+ "(p6) srlz.d\n");
+
+DEFINE_VOID_FUNC2(ptcga,
+ "ptc.ga r8, r9\n");
+DEFINE_VOID_FUNC2(set_rr,
+ "mov rr[r8] = r9\n");
+
+/* ia64_native_getreg(_IA64_REG_PSR) & IA64_PSR_I */
+DEFINE_FUNC0(get_psr_i,
+ "mov r2 = " __stringify(1 << IA64_PSR_I_BIT) "\n"
+ "mov r8 = psr\n"
+ ";;\n"
+ "and r8 = r2, r8\n");
+
+DEFINE_FUNC1(thash, unsigned long,
+ "thash r8 = r8\n");
+DEFINE_FUNC1(get_cpuid, int,
+ "mov r8 = cpuid[r8]\n");
+DEFINE_FUNC1(get_pmd, int,
+ "mov r8 = pmd[r8]\n");
+DEFINE_FUNC1(get_rr, unsigned long,
+ "mov r8 = rr[r8]\n");
+
+DEFINE_VOID_FUNC0(ssm_i,
+ "ssm psr.i\n");
+DEFINE_VOID_FUNC0(rsm_i,
+ "rsm psr.i\n");
+
+extern void
+ia64_native_set_rr0_to_rr4_func(unsigned long val0, unsigned long val1,
+ unsigned long val2, unsigned long val3,
+ unsigned long val4);
+__DEFINE_FUNC(set_rr0_to_rr4,
+ "mov rr[r0] = r8\n"
+ "movl r2 = 0x2000000000000000\n"
+ ";;\n"
+ "mov rr[r2] = r9\n"
+ "shl r3 = r2, 1\n" /* movl r3 = 0x4000000000000000 */
+ ";;\n"
+ "add r2 = r2, r3\n" /* movl r2 = 0x6000000000000000 */
+ "mov rr[r3] = r10\n"
+ ";;\n"
+ "mov rr[r2] = r11\n"
+ "shl r3 = r3, 1\n" /* movl r3 = 0x8000000000000000 */
+ ";;\n"
+ "mov rr[r3] = r14\n");
+
+extern unsigned long ia64_native_getreg_func(int regnum);
+asm(".global ia64_native_getreg_func\n");
+#define __DEFINE_GET_REG(id, reg) \
+ "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \
+ ";;\n" \
+ "cmp.eq p6, p0 = r2, r8\n" \
+ ";;\n" \
+ "(p6) mov r8 = " #reg "\n" \
+ "(p6) br.cond.sptk.many b6\n" \
+ ";;\n"
+#define __DEFINE_GET_AR(id, reg) __DEFINE_GET_REG(AR_ ## id, ar.reg)
+#define __DEFINE_GET_CR(id, reg) __DEFINE_GET_REG(CR_ ## id, cr.reg)
+
+__DEFINE_FUNC(getreg,
+ __DEFINE_GET_REG(GP, gp)
+ /*__DEFINE_GET_REG(IP, ip)*/ /* returned ip value shouldn't be constant */
+ __DEFINE_GET_REG(PSR, psr)
+ __DEFINE_GET_REG(TP, tp)
+ __DEFINE_GET_REG(SP, sp)
+
+ __DEFINE_GET_REG(AR_KR0, ar0)
+ __DEFINE_GET_REG(AR_KR1, ar1)
+ __DEFINE_GET_REG(AR_KR2, ar2)
+ __DEFINE_GET_REG(AR_KR3, ar3)
+ __DEFINE_GET_REG(AR_KR4, ar4)
+ __DEFINE_GET_REG(AR_KR5, ar5)
+ __DEFINE_GET_REG(AR_KR6, ar6)
+ __DEFINE_GET_REG(AR_KR7, ar7)
+ __DEFINE_GET_AR(RSC, rsc)
+ __DEFINE_GET_AR(BSP, bsp)
+ __DEFINE_GET_AR(BSPSTORE, bspstore)
+ __DEFINE_GET_AR(RNAT, rnat)
+ __DEFINE_GET_AR(FCR, fcr)
+ __DEFINE_GET_AR(EFLAG, eflag)
+ __DEFINE_GET_AR(CSD, csd)
+ __DEFINE_GET_AR(SSD, ssd)
+ __DEFINE_GET_REG(AR_CFLAG, ar27)
+ __DEFINE_GET_AR(FSR, fsr)
+ __DEFINE_GET_AR(FIR, fir)
+ __DEFINE_GET_AR(FDR, fdr)
+ __DEFINE_GET_AR(CCV, ccv)
+ __DEFINE_GET_AR(UNAT, unat)
+ __DEFINE_GET_AR(FPSR, fpsr)
+ __DEFINE_GET_AR(ITC, itc)
+ __DEFINE_GET_AR(PFS, pfs)
+ __DEFINE_GET_AR(LC, lc)
+ __DEFINE_GET_AR(EC, ec)
+
+ __DEFINE_GET_CR(DCR, dcr)
+ __DEFINE_GET_CR(ITM, itm)
+ __DEFINE_GET_CR(IVA, iva)
+ __DEFINE_GET_CR(PTA, pta)
+ __DEFINE_GET_CR(IPSR, ipsr)
+ __DEFINE_GET_CR(ISR, isr)
+ __DEFINE_GET_CR(IIP, iip)
+ __DEFINE_GET_CR(IFA, ifa)
+ __DEFINE_GET_CR(ITIR, itir)
+ __DEFINE_GET_CR(IIPA, iipa)
+ __DEFINE_GET_CR(IFS, ifs)
+ __DEFINE_GET_CR(IIM, iim)
+ __DEFINE_GET_CR(IHA, iha)
+ __DEFINE_GET_CR(LID, lid)
+ __DEFINE_GET_CR(IVR, ivr)
+ __DEFINE_GET_CR(TPR, tpr)
+ __DEFINE_GET_CR(EOI, eoi)
+ __DEFINE_GET_CR(IRR0, irr0)
+ __DEFINE_GET_CR(IRR1, irr1)
+ __DEFINE_GET_CR(IRR2, irr2)
+ __DEFINE_GET_CR(IRR3, irr3)
+ __DEFINE_GET_CR(ITV, itv)
+ __DEFINE_GET_CR(PMV, pmv)
+ __DEFINE_GET_CR(CMCV, cmcv)
+ __DEFINE_GET_CR(LRR0, lrr0)
+ __DEFINE_GET_CR(LRR1, lrr1)
+
+ "mov r8 = -1\n" /* unsupported case */
+ );
+
+extern void ia64_native_setreg_func(int regnum, unsigned long val);
+asm(".global ia64_native_setreg_func\n");
+#define __DEFINE_SET_REG(id, reg) \
+ "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \
+ ";;\n" \
+ "cmp.eq p6, p0 = r2, r9\n" \
+ ";;\n" \
+ "(p6) mov " #reg " = r8\n" \
+ "(p6) br.cond.sptk.many b6\n" \
+ ";;\n"
+#define __DEFINE_SET_AR(id, reg) __DEFINE_SET_REG(AR_ ## id, ar.reg)
+#define __DEFINE_SET_CR(id, reg) __DEFINE_SET_REG(CR_ ## id, cr.reg)
+__DEFINE_FUNC(setreg,
+ "mov r2 = " __stringify(_IA64_REG_PSR_L) "\n"
+ ";;\n"
+ "cmp.eq p6, p0 = r2, r9\n"
+ ";;\n"
+ "(p6) mov psr.l = r8\n"
+#ifdef HAVE_SERIALIZE_DIRECTIVE
+ ".serialize.data\n"
+#endif
+ "(p6) br.cond.sptk.many b6\n"
+ __DEFINE_SET_REG(GP, gp)
+ __DEFINE_SET_REG(SP, sp)
+
+ __DEFINE_SET_REG(AR_KR0, ar0)
+ __DEFINE_SET_REG(AR_KR1, ar1)
+ __DEFINE_SET_REG(AR_KR2, ar2)
+ __DEFINE_SET_REG(AR_KR3, ar3)
+ __DEFINE_SET_REG(AR_KR4, ar4)
+ __DEFINE_SET_REG(AR_KR5, ar5)
+ __DEFINE_SET_REG(AR_KR6, ar6)
+ __DEFINE_SET_REG(AR_KR7, ar7)
+ __DEFINE_SET_AR(RSC, rsc)
+ __DEFINE_SET_AR(BSP, bsp)
+ __DEFINE_SET_AR(BSPSTORE, bspstore)
+ __DEFINE_SET_AR(RNAT, rnat)
+ __DEFINE_SET_AR(FCR, fcr)
+ __DEFINE_SET_AR(EFLAG, eflag)
+ __DEFINE_SET_AR(CSD, csd)
+ __DEFINE_SET_AR(SSD, ssd)
+ __DEFINE_SET_REG(AR_CFLAG, ar27)
+ __DEFINE_SET_AR(FSR, fsr)
+ __DEFINE_SET_AR(FIR, fir)
+ __DEFINE_SET_AR(FDR, fdr)
+ __DEFINE_SET_AR(CCV, ccv)
+ __DEFINE_SET_AR(UNAT, unat)
+ __DEFINE_SET_AR(FPSR, fpsr)
+ __DEFINE_SET_AR(ITC, itc)
+ __DEFINE_SET_AR(PFS, pfs)
+ __DEFINE_SET_AR(LC, lc)
+ __DEFINE_SET_AR(EC, ec)
+
+ __DEFINE_SET_CR(DCR, dcr)
+ __DEFINE_SET_CR(ITM, itm)
+ __DEFINE_SET_CR(IVA, iva)
+ __DEFINE_SET_CR(PTA, pta)
+ __DEFINE_SET_CR(IPSR, ipsr)
+ __DEFINE_SET_CR(ISR, isr)
+ __DEFINE_SET_CR(IIP, iip)
+ __DEFINE_SET_CR(IFA, ifa)
+ __DEFINE_SET_CR(ITIR, itir)
+ __DEFINE_SET_CR(IIPA, iipa)
+ __DEFINE_SET_CR(IFS, ifs)
+ __DEFINE_SET_CR(IIM, iim)
+ __DEFINE_SET_CR(IHA, iha)
+ __DEFINE_SET_CR(LID, lid)
+ __DEFINE_SET_CR(IVR, ivr)
+ __DEFINE_SET_CR(TPR, tpr)
+ __DEFINE_SET_CR(EOI, eoi)
+ __DEFINE_SET_CR(IRR0, irr0)
+ __DEFINE_SET_CR(IRR1, irr1)
+ __DEFINE_SET_CR(IRR2, irr2)
+ __DEFINE_SET_CR(IRR3, irr3)
+ __DEFINE_SET_CR(ITV, itv)
+ __DEFINE_SET_CR(PMV, pmv)
+ __DEFINE_SET_CR(CMCV, cmcv)
+ __DEFINE_SET_CR(LRR0, lrr0)
+ __DEFINE_SET_CR(LRR1, lrr1)
+ );
+#endif
struct pv_cpu_ops pv_cpu_ops = {
.fc = ia64_native_fc_func,
@@ -366,4 +643,258 @@ ia64_native_do_steal_accounting(unsigned long *new_itm)
struct pv_time_ops pv_time_ops = {
.do_steal_accounting = ia64_native_do_steal_accounting,
+ .sched_clock = ia64_native_sched_clock,
+};
+
+/***************************************************************************
+ * binary pacthing
+ * pv_init_ops.patch_bundle
+ */
+
+#ifdef ASM_SUPPORTED
+#define IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg) \
+ __DEFINE_FUNC(get_ ## name, \
+ ";;\n" \
+ "mov r8 = " #reg "\n" \
+ ";;\n")
+
+#define IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \
+ __DEFINE_FUNC(set_ ## name, \
+ ";;\n" \
+ "mov " #reg " = r8\n" \
+ ";;\n")
+
+#define IA64_NATIVE_PATCH_DEFINE_REG(name, reg) \
+ IA64_NATIVE_PATCH_DEFINE_GET_REG(name, reg); \
+ IA64_NATIVE_PATCH_DEFINE_SET_REG(name, reg) \
+
+#define IA64_NATIVE_PATCH_DEFINE_AR(name, reg) \
+ IA64_NATIVE_PATCH_DEFINE_REG(ar_ ## name, ar.reg)
+
+#define IA64_NATIVE_PATCH_DEFINE_CR(name, reg) \
+ IA64_NATIVE_PATCH_DEFINE_REG(cr_ ## name, cr.reg)
+
+
+IA64_NATIVE_PATCH_DEFINE_GET_REG(psr, psr);
+IA64_NATIVE_PATCH_DEFINE_GET_REG(tp, tp);
+
+/* IA64_NATIVE_PATCH_DEFINE_SET_REG(psr_l, psr.l); */
+__DEFINE_FUNC(set_psr_l,
+ ";;\n"
+ "mov psr.l = r8\n"
+#ifdef HAVE_SERIALIZE_DIRECTIVE
+ ".serialize.data\n"
+#endif
+ ";;\n");
+
+IA64_NATIVE_PATCH_DEFINE_REG(gp, gp);
+IA64_NATIVE_PATCH_DEFINE_REG(sp, sp);
+
+IA64_NATIVE_PATCH_DEFINE_REG(kr0, ar0);
+IA64_NATIVE_PATCH_DEFINE_REG(kr1, ar1);
+IA64_NATIVE_PATCH_DEFINE_REG(kr2, ar2);
+IA64_NATIVE_PATCH_DEFINE_REG(kr3, ar3);
+IA64_NATIVE_PATCH_DEFINE_REG(kr4, ar4);
+IA64_NATIVE_PATCH_DEFINE_REG(kr5, ar5);
+IA64_NATIVE_PATCH_DEFINE_REG(kr6, ar6);
+IA64_NATIVE_PATCH_DEFINE_REG(kr7, ar7);
+
+IA64_NATIVE_PATCH_DEFINE_AR(rsc, rsc);
+IA64_NATIVE_PATCH_DEFINE_AR(bsp, bsp);
+IA64_NATIVE_PATCH_DEFINE_AR(bspstore, bspstore);
+IA64_NATIVE_PATCH_DEFINE_AR(rnat, rnat);
+IA64_NATIVE_PATCH_DEFINE_AR(fcr, fcr);
+IA64_NATIVE_PATCH_DEFINE_AR(eflag, eflag);
+IA64_NATIVE_PATCH_DEFINE_AR(csd, csd);
+IA64_NATIVE_PATCH_DEFINE_AR(ssd, ssd);
+IA64_NATIVE_PATCH_DEFINE_REG(ar27, ar27);
+IA64_NATIVE_PATCH_DEFINE_AR(fsr, fsr);
+IA64_NATIVE_PATCH_DEFINE_AR(fir, fir);
+IA64_NATIVE_PATCH_DEFINE_AR(fdr, fdr);
+IA64_NATIVE_PATCH_DEFINE_AR(ccv, ccv);
+IA64_NATIVE_PATCH_DEFINE_AR(unat, unat);
+IA64_NATIVE_PATCH_DEFINE_AR(fpsr, fpsr);
+IA64_NATIVE_PATCH_DEFINE_AR(itc, itc);
+IA64_NATIVE_PATCH_DEFINE_AR(pfs, pfs);
+IA64_NATIVE_PATCH_DEFINE_AR(lc, lc);
+IA64_NATIVE_PATCH_DEFINE_AR(ec, ec);
+
+IA64_NATIVE_PATCH_DEFINE_CR(dcr, dcr);
+IA64_NATIVE_PATCH_DEFINE_CR(itm, itm);
+IA64_NATIVE_PATCH_DEFINE_CR(iva, iva);
+IA64_NATIVE_PATCH_DEFINE_CR(pta, pta);
+IA64_NATIVE_PATCH_DEFINE_CR(ipsr, ipsr);
+IA64_NATIVE_PATCH_DEFINE_CR(isr, isr);
+IA64_NATIVE_PATCH_DEFINE_CR(iip, iip);
+IA64_NATIVE_PATCH_DEFINE_CR(ifa, ifa);
+IA64_NATIVE_PATCH_DEFINE_CR(itir, itir);
+IA64_NATIVE_PATCH_DEFINE_CR(iipa, iipa);
+IA64_NATIVE_PATCH_DEFINE_CR(ifs, ifs);
+IA64_NATIVE_PATCH_DEFINE_CR(iim, iim);
+IA64_NATIVE_PATCH_DEFINE_CR(iha, iha);
+IA64_NATIVE_PATCH_DEFINE_CR(lid, lid);
+IA64_NATIVE_PATCH_DEFINE_CR(ivr, ivr);
+IA64_NATIVE_PATCH_DEFINE_CR(tpr, tpr);
+IA64_NATIVE_PATCH_DEFINE_CR(eoi, eoi);
+IA64_NATIVE_PATCH_DEFINE_CR(irr0, irr0);
+IA64_NATIVE_PATCH_DEFINE_CR(irr1, irr1);
+IA64_NATIVE_PATCH_DEFINE_CR(irr2, irr2);
+IA64_NATIVE_PATCH_DEFINE_CR(irr3, irr3);
+IA64_NATIVE_PATCH_DEFINE_CR(itv, itv);
+IA64_NATIVE_PATCH_DEFINE_CR(pmv, pmv);
+IA64_NATIVE_PATCH_DEFINE_CR(cmcv, cmcv);
+IA64_NATIVE_PATCH_DEFINE_CR(lrr0, lrr0);
+IA64_NATIVE_PATCH_DEFINE_CR(lrr1, lrr1);
+
+static const struct paravirt_patch_bundle_elem ia64_native_patch_bundle_elems[]
+__initdata_or_module =
+{
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM(name, type) \
+ { \
+ (void*)ia64_native_ ## name ## _direct_start, \
+ (void*)ia64_native_ ## name ## _direct_end, \
+ PARAVIRT_PATCH_TYPE_ ## type, \
+ }
+
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(fc, FC),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(thash, THASH),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(ptcga, PTCGA),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(get_rr, GET_RR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr, SET_RR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(ssm_i, SSM_I),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(rsm_i, RSM_I),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM(intrin_local_irq_restore,
+ INTRIN_LOCAL_IRQ_RESTORE),
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg) \
+ { \
+ (void*)ia64_native_get_ ## name ## _direct_start, \
+ (void*)ia64_native_get_ ## name ## _direct_end, \
+ PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \
+ }
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \
+ { \
+ (void*)ia64_native_set_ ## name ## _direct_start, \
+ (void*)ia64_native_set_ ## name ## _direct_end, \
+ PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \
+ }
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(name, reg) \
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(name, reg), \
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(name, reg) \
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(name, reg) \
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar_ ## name, AR_ ## reg)
+
+#define IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(name, reg) \
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(cr_ ## name, CR_ ## reg)
+
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(psr, PSR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_GETREG(tp, TP),
+
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_SETREG(psr_l, PSR_L),
+
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(gp, GP),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(sp, SP),
+
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr0, AR_KR0),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr1, AR_KR1),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr2, AR_KR2),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr3, AR_KR3),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr4, AR_KR4),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr5, AR_KR5),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr6, AR_KR6),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(kr7, AR_KR7),
+
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rsc, RSC),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bsp, BSP),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(bspstore, BSPSTORE),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(rnat, RNAT),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fcr, FCR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(eflag, EFLAG),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(csd, CSD),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ssd, SSD),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_REG(ar27, AR_CFLAG),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fsr, FSR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fir, FIR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fdr, FDR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ccv, CCV),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(unat, UNAT),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(fpsr, FPSR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(itc, ITC),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(pfs, PFS),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(lc, LC),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_AR(ec, EC),
+
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(dcr, DCR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itm, ITM),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iva, IVA),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pta, PTA),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ipsr, IPSR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(isr, ISR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iip, IIP),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifa, IFA),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itir, ITIR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iipa, IIPA),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ifs, IFS),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iim, IIM),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(iha, IHA),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lid, LID),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(ivr, IVR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(tpr, TPR),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(eoi, EOI),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr0, IRR0),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr1, IRR1),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr2, IRR2),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(irr3, IRR3),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(itv, ITV),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(pmv, PMV),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(cmcv, CMCV),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr0, LRR0),
+ IA64_NATIVE_PATCH_BUNDLE_ELEM_CR(lrr1, LRR1),
};
+
+unsigned long __init_or_module
+ia64_native_patch_bundle(void *sbundle, void *ebundle, unsigned long type)
+{
+ const unsigned long nelems = sizeof(ia64_native_patch_bundle_elems) /
+ sizeof(ia64_native_patch_bundle_elems[0]);
+
+ return __paravirt_patch_apply_bundle(sbundle, ebundle, type,
+ ia64_native_patch_bundle_elems,
+ nelems, NULL);
+}
+#endif /* ASM_SUPPOTED */
+
+extern const char ia64_native_switch_to[];
+extern const char ia64_native_leave_syscall[];
+extern const char ia64_native_work_processed_syscall[];
+extern const char ia64_native_leave_kernel[];
+
+const struct paravirt_patch_branch_target ia64_native_branch_target[]
+__initconst = {
+#define PARAVIRT_BR_TARGET(name, type) \
+ { \
+ ia64_native_ ## name, \
+ PARAVIRT_PATCH_TYPE_BR_ ## type, \
+ }
+ PARAVIRT_BR_TARGET(switch_to, SWITCH_TO),
+ PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL),
+ PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL),
+ PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL),
+};
+
+static void __init
+ia64_native_patch_branch(unsigned long tag, unsigned long type)
+{
+ const unsigned long nelem =
+ sizeof(ia64_native_branch_target) /
+ sizeof(ia64_native_branch_target[0]);
+ __paravirt_patch_apply_branch(tag, type,
+ ia64_native_branch_target, nelem);
+}
diff --git a/arch/ia64/kernel/paravirt_patch.c b/arch/ia64/kernel/paravirt_patch.c
new file mode 100644
index 00000000000..bfdfef1b1ff
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patch.c
@@ -0,0 +1,514 @@
+/******************************************************************************
+ * linux/arch/ia64/xen/paravirt_patch.c
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/init.h>
+#include <asm/intrinsics.h>
+#include <asm/kprobes.h>
+#include <asm/paravirt.h>
+#include <asm/paravirt_patch.h>
+
+typedef union ia64_inst {
+ struct {
+ unsigned long long qp : 6;
+ unsigned long long : 31;
+ unsigned long long opcode : 4;
+ unsigned long long reserved : 23;
+ } generic;
+ unsigned long long l;
+} ia64_inst_t;
+
+/*
+ * flush_icache_range() can't be used here.
+ * we are here before cpu_init() which initializes
+ * ia64_i_cache_stride_shift. flush_icache_range() uses it.
+ */
+void __init_or_module
+paravirt_flush_i_cache_range(const void *instr, unsigned long size)
+{
+ extern void paravirt_fc_i(const void *addr);
+ unsigned long i;
+
+ for (i = 0; i < size; i += sizeof(bundle_t))
+ paravirt_fc_i(instr + i);
+}
+
+bundle_t* __init_or_module
+paravirt_get_bundle(unsigned long tag)
+{
+ return (bundle_t *)(tag & ~3UL);
+}
+
+unsigned long __init_or_module
+paravirt_get_slot(unsigned long tag)
+{
+ return tag & 3UL;
+}
+
+unsigned long __init_or_module
+paravirt_get_num_inst(unsigned long stag, unsigned long etag)
+{
+ bundle_t *sbundle = paravirt_get_bundle(stag);
+ unsigned long sslot = paravirt_get_slot(stag);
+ bundle_t *ebundle = paravirt_get_bundle(etag);
+ unsigned long eslot = paravirt_get_slot(etag);
+
+ return (ebundle - sbundle) * 3 + eslot - sslot + 1;
+}
+
+unsigned long __init_or_module
+paravirt_get_next_tag(unsigned long tag)
+{
+ unsigned long slot = paravirt_get_slot(tag);
+
+ switch (slot) {
+ case 0:
+ case 1:
+ return tag + 1;
+ case 2: {
+ bundle_t *bundle = paravirt_get_bundle(tag);
+ return (unsigned long)(bundle + 1);
+ }
+ default:
+ BUG();
+ }
+ /* NOTREACHED */
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot0(const bundle_t *bundle)
+{
+ ia64_inst_t inst;
+ inst.l = bundle->quad0.slot0;
+ return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot1(const bundle_t *bundle)
+{
+ ia64_inst_t inst;
+ inst.l = bundle->quad0.slot1_p0 |
+ ((unsigned long long)bundle->quad1.slot1_p1 << 18UL);
+ return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_slot2(const bundle_t *bundle)
+{
+ ia64_inst_t inst;
+ inst.l = bundle->quad1.slot2;
+ return inst;
+}
+
+ia64_inst_t __init_or_module
+paravirt_read_inst(unsigned long tag)
+{
+ bundle_t *bundle = paravirt_get_bundle(tag);
+ unsigned long slot = paravirt_get_slot(tag);
+
+ switch (slot) {
+ case 0:
+ return paravirt_read_slot0(bundle);
+ case 1:
+ return paravirt_read_slot1(bundle);
+ case 2:
+ return paravirt_read_slot2(bundle);
+ default:
+ BUG();
+ }
+ /* NOTREACHED */
+}
+
+void __init_or_module
+paravirt_write_slot0(bundle_t *bundle, ia64_inst_t inst)
+{
+ bundle->quad0.slot0 = inst.l;
+}
+
+void __init_or_module
+paravirt_write_slot1(bundle_t *bundle, ia64_inst_t inst)
+{
+ bundle->quad0.slot1_p0 = inst.l;
+ bundle->quad1.slot1_p1 = inst.l >> 18UL;
+}
+
+void __init_or_module
+paravirt_write_slot2(bundle_t *bundle, ia64_inst_t inst)
+{
+ bundle->quad1.slot2 = inst.l;
+}
+
+void __init_or_module
+paravirt_write_inst(unsigned long tag, ia64_inst_t inst)
+{
+ bundle_t *bundle = paravirt_get_bundle(tag);
+ unsigned long slot = paravirt_get_slot(tag);
+
+ switch (slot) {
+ case 0:
+ paravirt_write_slot0(bundle, inst);
+ break;
+ case 1:
+ paravirt_write_slot1(bundle, inst);
+ break;
+ case 2:
+ paravirt_write_slot2(bundle, inst);
+ break;
+ default:
+ BUG();
+ break;
+ }
+ paravirt_flush_i_cache_range(bundle, sizeof(*bundle));
+}
+
+/* for debug */
+void
+paravirt_print_bundle(const bundle_t *bundle)
+{
+ const unsigned long *quad = (const unsigned long *)bundle;
+ ia64_inst_t slot0 = paravirt_read_slot0(bundle);
+ ia64_inst_t slot1 = paravirt_read_slot1(bundle);
+ ia64_inst_t slot2 = paravirt_read_slot2(bundle);
+
+ printk(KERN_DEBUG
+ "bundle 0x%p 0x%016lx 0x%016lx\n", bundle, quad[0], quad[1]);
+ printk(KERN_DEBUG
+ "bundle template 0x%x\n",
+ bundle->quad0.template);
+ printk(KERN_DEBUG
+ "slot0 0x%lx slot1_p0 0x%lx slot1_p1 0x%lx slot2 0x%lx\n",
+ (unsigned long)bundle->quad0.slot0,
+ (unsigned long)bundle->quad0.slot1_p0,
+ (unsigned long)bundle->quad1.slot1_p1,
+ (unsigned long)bundle->quad1.slot2);
+ printk(KERN_DEBUG
+ "slot0 0x%016llx slot1 0x%016llx slot2 0x%016llx\n",
+ slot0.l, slot1.l, slot2.l);
+}
+
+static int noreplace_paravirt __init_or_module = 0;
+
+static int __init setup_noreplace_paravirt(char *str)
+{
+ noreplace_paravirt = 1;
+ return 1;
+}
+__setup("noreplace-paravirt", setup_noreplace_paravirt);
+
+#ifdef ASM_SUPPORTED
+static void __init_or_module
+fill_nop_bundle(void *sbundle, void *ebundle)
+{
+ extern const char paravirt_nop_bundle[];
+ extern const unsigned long paravirt_nop_bundle_size;
+
+ void *bundle = sbundle;
+
+ BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
+ BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
+
+ while (bundle < ebundle) {
+ memcpy(bundle, paravirt_nop_bundle, paravirt_nop_bundle_size);
+
+ bundle += paravirt_nop_bundle_size;
+ }
+}
+
+/* helper function */
+unsigned long __init_or_module
+__paravirt_patch_apply_bundle(void *sbundle, void *ebundle, unsigned long type,
+ const struct paravirt_patch_bundle_elem *elems,
+ unsigned long nelems,
+ const struct paravirt_patch_bundle_elem **found)
+{
+ unsigned long used = 0;
+ unsigned long i;
+
+ BUG_ON((((unsigned long)sbundle) % sizeof(bundle_t)) != 0);
+ BUG_ON((((unsigned long)ebundle) % sizeof(bundle_t)) != 0);
+
+ found = NULL;
+ for (i = 0; i < nelems; i++) {
+ const struct paravirt_patch_bundle_elem *p = &elems[i];
+ if (p->type == type) {
+ unsigned long need = p->ebundle - p->sbundle;
+ unsigned long room = ebundle - sbundle;
+
+ if (found != NULL)
+ *found = p;
+
+ if (room < need) {
+ /* no room to replace. skip it */
+ printk(KERN_DEBUG
+ "the space is too small to put "
+ "bundles. type %ld need %ld room %ld\n",
+ type, need, room);
+ break;
+ }
+
+ used = need;
+ memcpy(sbundle, p->sbundle, used);
+ break;
+ }
+ }
+
+ return used;
+}
+
+void __init_or_module
+paravirt_patch_apply_bundle(const struct paravirt_patch_site_bundle *start,
+ const struct paravirt_patch_site_bundle *end)
+{
+ const struct paravirt_patch_site_bundle *p;
+
+ if (noreplace_paravirt)
+ return;
+ if (pv_init_ops.patch_bundle == NULL)
+ return;
+
+ for (p = start; p < end; p++) {
+ unsigned long used;
+
+ used = (*pv_init_ops.patch_bundle)(p->sbundle, p->ebundle,
+ p->type);
+ if (used == 0)
+ continue;
+
+ fill_nop_bundle(p->sbundle + used, p->ebundle);
+ paravirt_flush_i_cache_range(p->sbundle,
+ p->ebundle - p->sbundle);
+ }
+ ia64_sync_i();
+ ia64_srlz_i();
+}
+
+/*
+ * nop.i, nop.m, nop.f instruction are same format.
+ * but nop.b has differennt format.
+ * This doesn't support nop.b for now.
+ */
+static void __init_or_module
+fill_nop_inst(unsigned long stag, unsigned long etag)
+{
+ extern const bundle_t paravirt_nop_mfi_inst_bundle[];
+ unsigned long tag;
+ const ia64_inst_t nop_inst =
+ paravirt_read_slot0(paravirt_nop_mfi_inst_bundle);
+
+ for (tag = stag; tag < etag; tag = paravirt_get_next_tag(tag))
+ paravirt_write_inst(tag, nop_inst);
+}
+
+void __init_or_module
+paravirt_patch_apply_inst(const struct paravirt_patch_site_inst *start,
+ const struct paravirt_patch_site_inst *end)
+{
+ const struct paravirt_patch_site_inst *p;
+
+ if (noreplace_paravirt)
+ return;
+ if (pv_init_ops.patch_inst == NULL)
+ return;
+
+ for (p = start; p < end; p++) {
+ unsigned long tag;
+ bundle_t *sbundle;
+ bundle_t *ebundle;
+
+ tag = (*pv_init_ops.patch_inst)(p->stag, p->etag, p->type);
+ if (tag == p->stag)
+ continue;
+
+ fill_nop_inst(tag, p->etag);
+ sbundle = paravirt_get_bundle(p->stag);
+ ebundle = paravirt_get_bundle(p->etag) + 1;
+ paravirt_flush_i_cache_range(sbundle, (ebundle - sbundle) *
+ sizeof(bundle_t));
+ }
+ ia64_sync_i();
+ ia64_srlz_i();
+}
+#endif /* ASM_SUPPOTED */
+
+/* brl.cond.sptk.many <target64> X3 */
+typedef union inst_x3_op {
+ ia64_inst_t inst;
+ struct {
+ unsigned long qp: 6;
+ unsigned long btyp: 3;
+ unsigned long unused: 3;
+ unsigned long p: 1;
+ unsigned long imm20b: 20;
+ unsigned long wh: 2;
+ unsigned long d: 1;
+ unsigned long i: 1;
+ unsigned long opcode: 4;
+ };
+ unsigned long l;
+} inst_x3_op_t;
+
+typedef union inst_x3_imm {
+ ia64_inst_t inst;
+ struct {
+ unsigned long unused: 2;
+ unsigned long imm39: 39;
+ };
+ unsigned long l;
+} inst_x3_imm_t;
+
+void __init_or_module
+paravirt_patch_reloc_brl(unsigned long tag, const void *target)
+{
+ unsigned long tag_op = paravirt_get_next_tag(tag);
+ unsigned long tag_imm = tag;
+ bundle_t *bundle = paravirt_get_bundle(tag);
+
+ ia64_inst_t inst_op = paravirt_read_inst(tag_op);
+ ia64_inst_t inst_imm = paravirt_read_inst(tag_imm);
+
+ inst_x3_op_t inst_x3_op = { .l = inst_op.l };
+ inst_x3_imm_t inst_x3_imm = { .l = inst_imm.l };
+
+ unsigned long imm60 =
+ ((unsigned long)target - (unsigned long)bundle) >> 4;
+
+ BUG_ON(paravirt_get_slot(tag) != 1); /* MLX */
+ BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
+
+ /* imm60[59] 1bit */
+ inst_x3_op.i = (imm60 >> 59) & 1;
+ /* imm60[19:0] 20bit */
+ inst_x3_op.imm20b = imm60 & ((1UL << 20) - 1);
+ /* imm60[58:20] 39bit */
+ inst_x3_imm.imm39 = (imm60 >> 20) & ((1UL << 39) - 1);
+
+ inst_op.l = inst_x3_op.l;
+ inst_imm.l = inst_x3_imm.l;
+
+ paravirt_write_inst(tag_op, inst_op);
+ paravirt_write_inst(tag_imm, inst_imm);
+}
+
+/* br.cond.sptk.many <target25> B1 */
+typedef union inst_b1 {
+ ia64_inst_t inst;
+ struct {
+ unsigned long qp: 6;
+ unsigned long btype: 3;
+ unsigned long unused: 3;
+ unsigned long p: 1;
+ unsigned long imm20b: 20;
+ unsigned long wh: 2;
+ unsigned long d: 1;
+ unsigned long s: 1;
+ unsigned long opcode: 4;
+ };
+ unsigned long l;
+} inst_b1_t;
+
+void __init
+paravirt_patch_reloc_br(unsigned long tag, const void *target)
+{
+ bundle_t *bundle = paravirt_get_bundle(tag);
+ ia64_inst_t inst = paravirt_read_inst(tag);
+ unsigned long target25 = (unsigned long)target - (unsigned long)bundle;
+ inst_b1_t inst_b1;
+
+ BUG_ON(((unsigned long)target & (sizeof(bundle_t) - 1)) != 0);
+
+ inst_b1.l = inst.l;
+ if (target25 & (1UL << 63))
+ inst_b1.s = 1;
+ else
+ inst_b1.s = 0;
+
+ inst_b1.imm20b = target25 >> 4;
+ inst.l = inst_b1.l;
+
+ paravirt_write_inst(tag, inst);
+}
+
+void __init
+__paravirt_patch_apply_branch(
+ unsigned long tag, unsigned long type,
+ const struct paravirt_patch_branch_target *entries,
+ unsigned int nr_entries)
+{
+ unsigned int i;
+ for (i = 0; i < nr_entries; i++) {
+ if (entries[i].type == type) {
+ paravirt_patch_reloc_br(tag, entries[i].entry);
+ break;
+ }
+ }
+}
+
+static void __init
+paravirt_patch_apply_branch(const struct paravirt_patch_site_branch *start,
+ const struct paravirt_patch_site_branch *end)
+{
+ const struct paravirt_patch_site_branch *p;
+
+ if (noreplace_paravirt)
+ return;
+ if (pv_init_ops.patch_branch == NULL)
+ return;
+
+ for (p = start; p < end; p++)
+ (*pv_init_ops.patch_branch)(p->tag, p->type);
+
+ ia64_sync_i();
+ ia64_srlz_i();
+}
+
+void __init
+paravirt_patch_apply(void)
+{
+ extern const char __start_paravirt_bundles[];
+ extern const char __stop_paravirt_bundles[];
+ extern const char __start_paravirt_insts[];
+ extern const char __stop_paravirt_insts[];
+ extern const char __start_paravirt_branches[];
+ extern const char __stop_paravirt_branches[];
+
+ paravirt_patch_apply_bundle((const struct paravirt_patch_site_bundle *)
+ __start_paravirt_bundles,
+ (const struct paravirt_patch_site_bundle *)
+ __stop_paravirt_bundles);
+ paravirt_patch_apply_inst((const struct paravirt_patch_site_inst *)
+ __start_paravirt_insts,
+ (const struct paravirt_patch_site_inst *)
+ __stop_paravirt_insts);
+ paravirt_patch_apply_branch((const struct paravirt_patch_site_branch *)
+ __start_paravirt_branches,
+ (const struct paravirt_patch_site_branch *)
+ __stop_paravirt_branches);
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "linux"
+ * c-basic-offset: 8
+ * tab-width: 8
+ * indent-tabs-mode: t
+ * End:
+ */
diff --git a/arch/ia64/kernel/paravirt_patchlist.c b/arch/ia64/kernel/paravirt_patchlist.c
new file mode 100644
index 00000000000..b28082a95d4
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patchlist.c
@@ -0,0 +1,79 @@
+/******************************************************************************
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#include <linux/bug.h>
+#include <asm/paravirt.h>
+
+#define DECLARE(name) \
+ extern unsigned long \
+ __ia64_native_start_gate_##name##_patchlist[]; \
+ extern unsigned long \
+ __ia64_native_end_gate_##name##_patchlist[]
+
+DECLARE(fsyscall);
+DECLARE(brl_fsys_bubble_down);
+DECLARE(vtop);
+DECLARE(mckinley_e9);
+
+extern unsigned long __start_gate_section[];
+
+#define ASSIGN(name) \
+ .start_##name##_patchlist = \
+ (unsigned long)__ia64_native_start_gate_##name##_patchlist, \
+ .end_##name##_patchlist = \
+ (unsigned long)__ia64_native_end_gate_##name##_patchlist
+
+struct pv_patchdata pv_patchdata __initdata = {
+ ASSIGN(fsyscall),
+ ASSIGN(brl_fsys_bubble_down),
+ ASSIGN(vtop),
+ ASSIGN(mckinley_e9),
+
+ .gate_section = (void*)__start_gate_section,
+};
+
+
+unsigned long __init
+paravirt_get_gate_patchlist(enum pv_gate_patchlist type)
+{
+
+#define CASE(NAME, name) \
+ case PV_GATE_START_##NAME: \
+ return pv_patchdata.start_##name##_patchlist; \
+ case PV_GATE_END_##NAME: \
+ return pv_patchdata.end_##name##_patchlist; \
+
+ switch (type) {
+ CASE(FSYSCALL, fsyscall);
+ CASE(BRL_FSYS_BUBBLE_DOWN, brl_fsys_bubble_down);
+ CASE(VTOP, vtop);
+ CASE(MCKINLEY_E9, mckinley_e9);
+ default:
+ BUG();
+ break;
+ }
+ return 0;
+}
+
+void * __init
+paravirt_get_gate_section(void)
+{
+ return pv_patchdata.gate_section;
+}
diff --git a/arch/ia64/kernel/paravirt_patchlist.h b/arch/ia64/kernel/paravirt_patchlist.h
new file mode 100644
index 00000000000..0684aa6c650
--- /dev/null
+++ b/arch/ia64/kernel/paravirt_patchlist.h
@@ -0,0 +1,28 @@
+/******************************************************************************
+ * linux/arch/ia64/xen/paravirt_patchlist.h
+ *
+ * Copyright (c) 2008 Isaku Yamahata <yamahata at valinux co jp>
+ * VA Linux Systems Japan K.K.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
+ *
+ */
+
+#if defined(__IA64_GATE_PARAVIRTUALIZED_XEN)
+#include <asm/xen/patchlist.h>
+#else
+#include <asm/native/patchlist.h>
+#endif
+
diff --git a/arch/ia64/kernel/paravirtentry.S b/arch/ia64/kernel/paravirtentry.S
index 2f42fcb9776..6158560d7f1 100644
--- a/arch/ia64/kernel/paravirtentry.S
+++ b/arch/ia64/kernel/paravirtentry.S
@@ -20,8 +20,11 @@
*
*/
+#include <linux/init.h>
#include <asm/asmmacro.h>
#include <asm/asm-offsets.h>
+#include <asm/paravirt_privop.h>
+#include <asm/paravirt_patch.h>
#include "entry.h"
#define DATA8(sym, init_value) \
@@ -32,29 +35,87 @@
data8 init_value ; \
.popsection
-#define BRANCH(targ, reg, breg) \
- movl reg=targ ; \
- ;; \
- ld8 reg=[reg] ; \
- ;; \
- mov breg=reg ; \
+#define BRANCH(targ, reg, breg, type) \
+ PARAVIRT_PATCH_SITE_BR(PARAVIRT_PATCH_TYPE_BR_ ## type) ; \
+ ;; \
+ movl reg=targ ; \
+ ;; \
+ ld8 reg=[reg] ; \
+ ;; \
+ mov breg=reg ; \
br.cond.sptk.many breg
-#define BRANCH_PROC(sym, reg, breg) \
- DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \
- GLOBAL_ENTRY(paravirt_ ## sym) ; \
- BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \
+#define BRANCH_PROC(sym, reg, breg, type) \
+ DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \
+ GLOBAL_ENTRY(paravirt_ ## sym) ; \
+ BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \
END(paravirt_ ## sym)
-#define BRANCH_PROC_UNWINFO(sym, reg, breg) \
- DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \
- GLOBAL_ENTRY(paravirt_ ## sym) ; \
- PT_REGS_UNWIND_INFO(0) ; \
- BRANCH(paravirt_ ## sym ## _targ, reg, breg) ; \
+#define BRANCH_PROC_UNWINFO(sym, reg, breg, type) \
+ DATA8(paravirt_ ## sym ## _targ, ia64_native_ ## sym) ; \
+ GLOBAL_ENTRY(paravirt_ ## sym) ; \
+ PT_REGS_UNWIND_INFO(0) ; \
+ BRANCH(paravirt_ ## sym ## _targ, reg, breg, type) ; \
END(paravirt_ ## sym)
-BRANCH_PROC(switch_to, r22, b7)
-BRANCH_PROC_UNWINFO(leave_syscall, r22, b7)
-BRANCH_PROC(work_processed_syscall, r2, b7)
-BRANCH_PROC_UNWINFO(leave_kernel, r22, b7)
+BRANCH_PROC(switch_to, r22, b7, SWITCH_TO)
+BRANCH_PROC_UNWINFO(leave_syscall, r22, b7, LEAVE_SYSCALL)
+BRANCH_PROC(work_processed_syscall, r2, b7, WORK_PROCESSED_SYSCALL)
+BRANCH_PROC_UNWINFO(leave_kernel, r22, b7, LEAVE_KERNEL)
+
+
+#ifdef CONFIG_MODULES
+#define __INIT_OR_MODULE .text
+#define __INITDATA_OR_MODULE .data
+#else
+#define __INIT_OR_MODULE __INIT
+#define __INITDATA_OR_MODULE __INITDATA
+#endif /* CONFIG_MODULES */
+
+ __INIT_OR_MODULE
+ GLOBAL_ENTRY(paravirt_fc_i)
+ fc.i r32
+ br.ret.sptk.many rp
+ END(paravirt_fc_i)
+ __FINIT
+
+ __INIT_OR_MODULE
+ .align 32
+ GLOBAL_ENTRY(paravirt_nop_b_inst_bundle)
+ {
+ nop.b 0
+ nop.b 0
+ nop.b 0
+ }
+ END(paravirt_nop_b_inst_bundle)
+ __FINIT
+
+ /* NOTE: nop.[mfi] has same format */
+ __INIT_OR_MODULE
+ GLOBAL_ENTRY(paravirt_nop_mfi_inst_bundle)
+ {
+ nop.m 0
+ nop.f 0
+ nop.i 0
+ }
+ END(paravirt_nop_mfi_inst_bundle)
+ __FINIT
+
+ __INIT_OR_MODULE
+ GLOBAL_ENTRY(paravirt_nop_bundle)
+paravirt_nop_bundle_start:
+ {
+ nop 0
+ nop 0
+ nop 0
+ }
+paravirt_nop_bundle_end:
+ END(paravirt_nop_bundle)
+ __FINIT
+
+ __INITDATA_OR_MODULE
+ .align 8
+ .global paravirt_nop_bundle_size
+paravirt_nop_bundle_size:
+ data8 paravirt_nop_bundle_end - paravirt_nop_bundle_start
diff --git a/arch/ia64/kernel/patch.c b/arch/ia64/kernel/patch.c
index b83b2c51600..68a1311db80 100644
--- a/arch/ia64/kernel/patch.c
+++ b/arch/ia64/kernel/patch.c
@@ -7,6 +7,7 @@
#include <linux/init.h>
#include <linux/string.h>
+#include <asm/paravirt.h>
#include <asm/patch.h>
#include <asm/processor.h>
#include <asm/sections.h>
@@ -169,16 +170,35 @@ ia64_patch_mckinley_e9 (unsigned long start, unsigned long end)
ia64_srlz_i();
}
+extern unsigned long ia64_native_fsyscall_table[NR_syscalls];
+extern char ia64_native_fsys_bubble_down[];
+struct pv_fsys_data pv_fsys_data __initdata = {
+ .fsyscall_table = (unsigned long *)ia64_native_fsyscall_table,
+ .fsys_bubble_down = (void *)ia64_native_fsys_bubble_down,
+};
+
+unsigned long * __init
+paravirt_get_fsyscall_table(void)
+{
+ return pv_fsys_data.fsyscall_table;
+}
+
+char * __init
+paravirt_get_fsys_bubble_down(void)
+{
+ return pv_fsys_data.fsys_bubble_down;
+}
+
static void __init
patch_fsyscall_table (unsigned long start, unsigned long end)
{
- extern unsigned long fsyscall_table[NR_syscalls];
+ u64 fsyscall_table = (u64)paravirt_get_fsyscall_table();
s32 *offp = (s32 *) start;
u64 ip;
while (offp < (s32 *) end) {
ip = (u64) ia64_imva((char *) offp + *offp);
- ia64_patch_imm64(ip, (u64) fsyscall_table);
+ ia64_patch_imm64(ip, fsyscall_table);
ia64_fc((void *) ip);
++offp;
}
@@ -189,7 +209,7 @@ patch_fsyscall_table (unsigned long start, unsigned long end)
static void __init
patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
{
- extern char fsys_bubble_down[];
+ u64 fsys_bubble_down = (u64)paravirt_get_fsys_bubble_down();
s32 *offp = (s32 *) start;
u64 ip;
@@ -207,13 +227,13 @@ patch_brl_fsys_bubble_down (unsigned long start, unsigned long end)
void __init
ia64_patch_gate (void)
{
-# define START(name) ((unsigned long) __start_gate_##name##_patchlist)
-# define END(name) ((unsigned long)__end_gate_##name##_patchlist)
+# define START(name) paravirt_get_gate_patchlist(PV_GATE_START_##name)
+# define END(name) paravirt_get_gate_patchlist(PV_GATE_END_##name)
- patch_fsyscall_table(START(fsyscall), END(fsyscall));
- patch_brl_fsys_bubble_down(START(brl_fsys_bubble_down), END(brl_fsys_bubble_down));
- ia64_patch_vtop(START(vtop), END(vtop));
- ia64_patch_mckinley_e9(START(mckinley_e9), END(mckinley_e9));
+ patch_fsyscall_table(START(FSYSCALL), END(FSYSCALL));
+ patch_brl_fsys_bubble_down(START(BRL_FSYS_BUBBLE_DOWN), END(BRL_FSYS_BUBBLE_DOWN));
+ ia64_patch_vtop(START(VTOP), END(VTOP));
+ ia64_patch_mckinley_e9(START(MCKINLEY_E9), END(MCKINLEY_E9));
}
void ia64_patch_phys_stack_reg(unsigned long val)
@@ -229,7 +249,7 @@ void ia64_patch_phys_stack_reg(unsigned long val)
while (offp < end) {
ip = (u64) offp + *offp;
ia64_patch(ip, mask, imm);
- ia64_fc(ip);
+ ia64_fc((void *)ip);
++offp;
}
ia64_sync_i();
diff --git a/arch/ia64/kernel/perfmon.c b/arch/ia64/kernel/perfmon.c
index 5c0f408cfd7..8a06dc48059 100644
--- a/arch/ia64/kernel/perfmon.c
+++ b/arch/ia64/kernel/perfmon.c
@@ -5603,7 +5603,7 @@ pfm_interrupt_handler(int irq, void *arg)
* /proc/perfmon interface, for debug only
*/
-#define PFM_PROC_SHOW_HEADER ((void *)NR_CPUS+1)
+#define PFM_PROC_SHOW_HEADER ((void *)nr_cpu_ids+1)
static void *
pfm_proc_start(struct seq_file *m, loff_t *pos)
@@ -5612,7 +5612,7 @@ pfm_proc_start(struct seq_file *m, loff_t *pos)
return PFM_PROC_SHOW_HEADER;
}
- while (*pos <= NR_CPUS) {
+ while (*pos <= nr_cpu_ids) {
if (cpu_online(*pos - 1)) {
return (void *)*pos;
}
diff --git a/arch/ia64/kernel/salinfo.c b/arch/ia64/kernel/salinfo.c
index ecb9eb78d68..7053c55b764 100644
--- a/arch/ia64/kernel/salinfo.c
+++ b/arch/ia64/kernel/salinfo.c
@@ -317,7 +317,7 @@ retry:
}
n = data->cpu_check;
- for (i = 0; i < NR_CPUS; i++) {
+ for (i = 0; i < nr_cpu_ids; i++) {
if (cpu_isset(n, data->cpu_event)) {
if (!cpu_online(n)) {
cpu_clear(n, data->cpu_event);
@@ -326,7 +326,7 @@ retry:
cpu = n;
break;
}
- if (++n == NR_CPUS)
+ if (++n == nr_cpu_ids)
n = 0;
}
@@ -337,7 +337,7 @@ retry:
/* for next read, start checking at next CPU */
data->cpu_check = cpu;
- if (++data->cpu_check == NR_CPUS)
+ if (++data->cpu_check == nr_cpu_ids)
data->cpu_check = 0;
snprintf(cmd, sizeof(cmd), "read %d\n", cpu);
diff --git a/arch/ia64/kernel/setup.c b/arch/ia64/kernel/setup.c
index 865af27c773..714066aeda7 100644
--- a/arch/ia64/kernel/setup.c
+++ b/arch/ia64/kernel/setup.c
@@ -52,6 +52,7 @@
#include <asm/meminit.h>
#include <asm/page.h>
#include <asm/paravirt.h>
+#include <asm/paravirt_patch.h>
#include <asm/patch.h>
#include <asm/pgtable.h>
#include <asm/processor.h>
@@ -537,6 +538,7 @@ setup_arch (char **cmdline_p)
paravirt_arch_setup_early();
ia64_patch_vtop((u64) __start___vtop_patchlist, (u64) __end___vtop_patchlist);
+ paravirt_patch_apply();
*cmdline_p = __va(ia64_boot_param->command_line);
strlcpy(boot_command_line, *cmdline_p, COMMAND_LINE_SIZE);
@@ -730,10 +732,10 @@ static void *
c_start (struct seq_file *m, loff_t *pos)
{
#ifdef CONFIG_SMP
- while (*pos < NR_CPUS && !cpu_isset(*pos, cpu_online_map))
+ while (*pos < nr_cpu_ids && !cpu_online(*pos))
++*pos;
#endif
- return *pos < NR_CPUS ? cpu_data(*pos) : NULL;
+ return *pos < nr_cpu_ids ? cpu_data(*pos) : NULL;
}
static void *
@@ -1016,8 +1018,7 @@ cpu_init (void)
| IA64_DCR_DA | IA64_DCR_DD | IA64_DCR_LC));
atomic_inc(&init_mm.mm_count);
current->active_mm = &init_mm;
- if (current->mm)
- BUG();
+ BUG_ON(current->mm);
ia64_mmu_init(ia64_imva(cpu_data));
ia64_mca_cpu_init(ia64_imva(cpu_data));
diff --git a/arch/ia64/kernel/smp.c b/arch/ia64/kernel/smp.c
index da8f020d82c..2ea4199d9c5 100644
--- a/arch/ia64/kernel/smp.c
+++ b/arch/ia64/kernel/smp.c
@@ -166,11 +166,11 @@ send_IPI_allbutself (int op)
* Called with preemption disabled.
*/
static inline void
-send_IPI_mask(cpumask_t mask, int op)
+send_IPI_mask(const struct cpumask *mask, int op)
{
unsigned int cpu;
- for_each_cpu_mask(cpu, mask) {
+ for_each_cpu(cpu, mask) {
send_IPI_single(cpu, op);
}
}
@@ -316,7 +316,7 @@ void arch_send_call_function_single_ipi(int cpu)
send_IPI_single(cpu, IPI_CALL_FUNC_SINGLE);
}
-void arch_send_call_function_ipi(cpumask_t mask)
+void arch_send_call_function_ipi_mask(const struct cpumask *mask)
{
send_IPI_mask(mask, IPI_CALL_FUNC);
}
diff --git a/arch/ia64/kernel/smpboot.c b/arch/ia64/kernel/smpboot.c
index 52290547c85..7700e23034b 100644
--- a/arch/ia64/kernel/smpboot.c
+++ b/arch/ia64/kernel/smpboot.c
@@ -581,14 +581,14 @@ smp_build_cpu_map (void)
ia64_cpu_to_sapicid[0] = boot_cpu_id;
cpus_clear(cpu_present_map);
- cpu_set(0, cpu_present_map);
- cpu_set(0, cpu_possible_map);
+ set_cpu_present(0, true);
+ set_cpu_possible(0, true);
for (cpu = 1, i = 0; i < smp_boot_data.cpu_count; i++) {
sapicid = smp_boot_data.cpu_phys_id[i];
if (sapicid == boot_cpu_id)
continue;
- cpu_set(cpu, cpu_present_map);
- cpu_set(cpu, cpu_possible_map);
+ set_cpu_present(cpu, true);
+ set_cpu_possible(cpu, true);
ia64_cpu_to_sapicid[cpu] = sapicid;
cpu++;
}
@@ -626,12 +626,9 @@ smp_prepare_cpus (unsigned int max_cpus)
*/
if (!max_cpus) {
printk(KERN_INFO "SMP mode deactivated.\n");
- cpus_clear(cpu_online_map);
- cpus_clear(cpu_present_map);
- cpus_clear(cpu_possible_map);
- cpu_set(0, cpu_online_map);
- cpu_set(0, cpu_present_map);
- cpu_set(0, cpu_possible_map);
+ init_cpu_online(cpumask_of(0));
+ init_cpu_present(cpumask_of(0));
+ init_cpu_possible(cpumask_of(0));
return;
}
}
diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c
index d6747bae52d..641c8b61c4f 100644
--- a/arch/ia64/kernel/time.c
+++ b/arch/ia64/kernel/time.c
@@ -51,6 +51,15 @@ EXPORT_SYMBOL(last_cli_ip);
#endif
#ifdef CONFIG_PARAVIRT
+/* We need to define a real function for sched_clock, to override the
+ weak default version */
+unsigned long long sched_clock(void)
+{
+ return paravirt_sched_clock();
+}
+#endif
+
+#ifdef CONFIG_PARAVIRT
static void
paravirt_clocksource_resume(void)
{
diff --git a/arch/ia64/kernel/vmlinux.lds.S b/arch/ia64/kernel/vmlinux.lds.S
index 3765efc5f96..4a95e86b9ac 100644
--- a/arch/ia64/kernel/vmlinux.lds.S
+++ b/arch/ia64/kernel/vmlinux.lds.S
@@ -169,6 +169,30 @@ SECTIONS
__end___mckinley_e9_bundles = .;
}
+#if defined(CONFIG_PARAVIRT)
+ . = ALIGN(16);
+ .paravirt_bundles : AT(ADDR(.paravirt_bundles) - LOAD_OFFSET)
+ {
+ __start_paravirt_bundles = .;
+ *(.paravirt_bundles)
+ __stop_paravirt_bundles = .;
+ }
+ . = ALIGN(16);
+ .paravirt_insts : AT(ADDR(.paravirt_insts) - LOAD_OFFSET)
+ {
+ __start_paravirt_insts = .;
+ *(.paravirt_insts)
+ __stop_paravirt_insts = .;
+ }
+ . = ALIGN(16);
+ .paravirt_branches : AT(ADDR(.paravirt_branches) - LOAD_OFFSET)
+ {
+ __start_paravirt_branches = .;
+ *(.paravirt_branches)
+ __stop_paravirt_branches = .;
+ }
+#endif
+
#if defined(CONFIG_IA64_GENERIC)
/* Machine Vector */
. = ALIGN(16);
@@ -201,6 +225,12 @@ SECTIONS
__start_gate_section = .;
*(.data.gate)
__stop_gate_section = .;
+#ifdef CONFIG_XEN
+ . = ALIGN(PAGE_SIZE);
+ __xen_start_gate_section = .;
+ *(.data.gate.xen)
+ __xen_stop_gate_section = .;
+#endif
}
. = ALIGN(PAGE_SIZE); /* make sure the gate page doesn't expose
* kernel data
diff --git a/arch/ia64/kvm/kvm-ia64.c b/arch/ia64/kvm/kvm-ia64.c
index 076b00d1dbf..28af6a731bb 100644
--- a/arch/ia64/kvm/kvm-ia64.c
+++ b/arch/ia64/kvm/kvm-ia64.c
@@ -70,7 +70,7 @@ static void kvm_flush_icache(unsigned long start, unsigned long len)
int l;
for (l = 0; l < (len + 32); l += 32)
- ia64_fc(start + l);
+ ia64_fc((void *)(start + l));
ia64_sync_i();
ia64_srlz_i();
diff --git a/arch/ia64/kvm/vcpu.c b/arch/ia64/kvm/vcpu.c
index d4d28050587..a18ee17b919 100644
--- a/arch/ia64/kvm/vcpu.c
+++ b/arch/ia64/kvm/vcpu.c
@@ -386,7 +386,7 @@ void set_rse_reg(struct kvm_pt_regs *regs, unsigned long r1,
else
*rnat_addr = (*rnat_addr) & (~nat_mask);
- ia64_setreg(_IA64_REG_AR_BSPSTORE, bspstore);
+ ia64_setreg(_IA64_REG_AR_BSPSTORE, (unsigned long)bspstore);
ia64_setreg(_IA64_REG_AR_RNAT, rnat);
}
local_irq_restore(psr);
diff --git a/arch/ia64/kvm/vtlb.c b/arch/ia64/kvm/vtlb.c
index 38232b37668..2c2501f1315 100644
--- a/arch/ia64/kvm/vtlb.c
+++ b/arch/ia64/kvm/vtlb.c
@@ -210,6 +210,7 @@ void thash_vhpt_insert(struct kvm_vcpu *v, u64 pte, u64 itir, u64 va, int type)
phy_pte &= ~PAGE_FLAGS_RV_MASK;
psr = ia64_clear_ic();
ia64_itc(type, va, phy_pte, itir_ps(itir));
+ paravirt_dv_serialize_data();
ia64_set_psr(psr);
}
@@ -456,6 +457,7 @@ void thash_purge_and_insert(struct kvm_vcpu *v, u64 pte, u64 itir,
phy_pte &= ~PAGE_FLAGS_RV_MASK;
psr = ia64_clear_ic();
ia64_itc(type, ifa, phy_pte, ps);
+ paravirt_dv_serialize_data();
ia64_set_psr(psr);
}
if (!(pte&VTLB_PTE_IO))
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 56e12903973..c0f3bee6904 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -35,6 +35,7 @@
#include <asm/uaccess.h>
#include <asm/unistd.h>
#include <asm/mca.h>
+#include <asm/paravirt.h>
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
@@ -259,6 +260,7 @@ put_kernel_page (struct page *page, unsigned long address, pgprot_t pgprot)
static void __init
setup_gate (void)
{
+ void *gate_section;
struct page *page;
/*
@@ -266,10 +268,11 @@ setup_gate (void)
* headers etc. and once execute-only page to enable
* privilege-promotion via "epc":
*/
- page = virt_to_page(ia64_imva(__start_gate_section));
+ gate_section = paravirt_get_gate_section();
+ page = virt_to_page(ia64_imva(gate_section));
put_kernel_page(page, GATE_ADDR, PAGE_READONLY);
#ifdef HAVE_BUGGY_SEGREL
- page = virt_to_page(ia64_imva(__start_gate_section + PAGE_SIZE));
+ page = virt_to_page(ia64_imva(gate_section + PAGE_SIZE));
put_kernel_page(page, GATE_ADDR + PAGE_SIZE, PAGE_GATE);
#else
put_kernel_page(page, GATE_ADDR + PERCPU_PAGE_SIZE, PAGE_GATE);
@@ -633,8 +636,7 @@ mem_init (void)
#endif
#ifdef CONFIG_FLATMEM
- if (!mem_map)
- BUG();
+ BUG_ON(!mem_map);
max_mapnr = max_low_pfn;
#endif
@@ -667,8 +669,8 @@ mem_init (void)
* code can tell them apart.
*/
for (i = 0; i < NR_syscalls; ++i) {
- extern unsigned long fsyscall_table[NR_syscalls];
extern unsigned long sys_call_table[NR_syscalls];
+ unsigned long *fsyscall_table = paravirt_get_fsyscall_table();
if (!fsyscall_table[i] || nolwsys)
fsyscall_table[i] = sys_call_table[i] | 1;
diff --git a/arch/ia64/mm/tlb.c b/arch/ia64/mm/tlb.c
index bd9818a36b4..b9f3d7bbb33 100644
--- a/arch/ia64/mm/tlb.c
+++ b/arch/ia64/mm/tlb.c
@@ -309,7 +309,7 @@ flush_tlb_range (struct vm_area_struct *vma, unsigned long start,
preempt_disable();
#ifdef CONFIG_SMP
- if (mm != current->active_mm || cpus_weight(mm->cpu_vm_mask) != 1) {
+ if (mm != current->active_mm || cpumask_weight(mm_cpumask(mm)) != 1) {
platform_global_tlb_purge(mm, start, end, nbits);
preempt_enable();
return;
diff --git a/arch/ia64/scripts/pvcheck.sed b/arch/ia64/scripts/pvcheck.sed
index ba66ac2e4c6..e59809a3fc0 100644
--- a/arch/ia64/scripts/pvcheck.sed
+++ b/arch/ia64/scripts/pvcheck.sed
@@ -17,6 +17,7 @@ s/mov.*=.*cr\.iip/.warning \"cr.iip should not used directly\"/g
s/mov.*=.*cr\.ivr/.warning \"cr.ivr should not used directly\"/g
s/mov.*=[^\.]*psr/.warning \"psr should not used directly\"/g # avoid ar.fpsr
s/mov.*=.*ar\.eflags/.warning \"ar.eflags should not used directly\"/g
+s/mov.*=.*ar\.itc.*/.warning \"ar.itc should not used directly\"/g
s/mov.*cr\.ifa.*=.*/.warning \"cr.ifa should not used directly\"/g
s/mov.*cr\.itir.*=.*/.warning \"cr.itir should not used directly\"/g
s/mov.*cr\.iha.*=.*/.warning \"cr.iha should not used directly\"/g
diff --git a/arch/ia64/sn/kernel/io_common.c b/arch/ia64/sn/kernel/io_common.c
index 0d4ffa4da1d..57f280dd9de 100644
--- a/arch/ia64/sn/kernel/io_common.c
+++ b/arch/ia64/sn/kernel/io_common.c
@@ -135,8 +135,7 @@ static s64 sn_device_fixup_war(u64 nasid, u64 widget, int device,
}
war_list = kzalloc(DEV_PER_WIDGET * sizeof(*war_list), GFP_KERNEL);
- if (!war_list)
- BUG();
+ BUG_ON(!war_list);
SAL_CALL_NOLOCK(isrv, SN_SAL_IOIF_GET_WIDGET_DMAFLUSH_LIST,
nasid, widget, __pa(war_list), 0, 0, 0 ,0);
@@ -180,23 +179,20 @@ sn_common_hubdev_init(struct hubdev_info *hubdev)
sizeof(struct sn_flush_device_kernel *);
hubdev->hdi_flush_nasid_list.widget_p =
kzalloc(size, GFP_KERNEL);
- if (!hubdev->hdi_flush_nasid_list.widget_p)
- BUG();
+ BUG_ON(!hubdev->hdi_flush_nasid_list.widget_p);
for (widget = 0; widget <= HUB_WIDGET_ID_MAX; widget++) {
size = DEV_PER_WIDGET *
sizeof(struct sn_flush_device_kernel);
sn_flush_device_kernel = kzalloc(size, GFP_KERNEL);
- if (!sn_flush_device_kernel)
- BUG();
+ BUG_ON(!sn_flush_device_kernel);
dev_entry = sn_flush_device_kernel;
for (device = 0; device < DEV_PER_WIDGET;
device++, dev_entry++) {
size = sizeof(struct sn_flush_device_common);
dev_entry->common = kzalloc(size, GFP_KERNEL);
- if (!dev_entry->common)
- BUG();
+ BUG_ON(!dev_entry->common);
if (sn_prom_feature_available(PRF_DEVICE_FLUSH_LIST))
status = sal_get_device_dmaflush_list(
hubdev->hdi_nasid, widget, device,
@@ -326,8 +322,7 @@ sn_common_bus_fixup(struct pci_bus *bus,
*/
controller->platform_data = kzalloc(sizeof(struct sn_platform_data),
GFP_KERNEL);
- if (controller->platform_data == NULL)
- BUG();
+ BUG_ON(controller->platform_data == NULL);
sn_platform_data =
(struct sn_platform_data *) controller->platform_data;
sn_platform_data->provider_soft = provider_soft;
diff --git a/arch/ia64/sn/kernel/io_init.c b/arch/ia64/sn/kernel/io_init.c
index e2eb2da60f9..ee774c366a0 100644
--- a/arch/ia64/sn/kernel/io_init.c
+++ b/arch/ia64/sn/kernel/io_init.c
@@ -128,8 +128,7 @@ sn_legacy_pci_window_fixup(struct pci_controller *controller,
{
controller->window = kcalloc(2, sizeof(struct pci_window),
GFP_KERNEL);
- if (controller->window == NULL)
- BUG();
+ BUG_ON(controller->window == NULL);
controller->window[0].offset = legacy_io;
controller->window[0].resource.name = "legacy_io";
controller->window[0].resource.flags = IORESOURCE_IO;
@@ -168,8 +167,7 @@ sn_pci_window_fixup(struct pci_dev *dev, unsigned int count,
idx = controller->windows;
new_count = controller->windows + count;
new_window = kcalloc(new_count, sizeof(struct pci_window), GFP_KERNEL);
- if (new_window == NULL)
- BUG();
+ BUG_ON(new_window == NULL);
if (controller->window) {
memcpy(new_window, controller->window,
sizeof(struct pci_window) * controller->windows);
@@ -222,8 +220,7 @@ sn_io_slot_fixup(struct pci_dev *dev)
(u64) __pa(pcidev_info),
(u64) __pa(sn_irq_info));
- if (status)
- BUG(); /* Cannot get platform pci device information */
+ BUG_ON(status); /* Cannot get platform pci device information */
/* Copy over PIO Mapped Addresses */
@@ -307,8 +304,7 @@ sn_pci_controller_fixup(int segment, int busnum, struct pci_bus *bus)
prom_bussoft_ptr = __va(prom_bussoft_ptr);
controller = kzalloc(sizeof(*controller), GFP_KERNEL);
- if (!controller)
- BUG();
+ BUG_ON(!controller);
controller->segment = segment;
/*
diff --git a/arch/ia64/sn/kernel/setup.c b/arch/ia64/sn/kernel/setup.c
index 02c5b8a9fb6..e456f062f24 100644
--- a/arch/ia64/sn/kernel/setup.c
+++ b/arch/ia64/sn/kernel/setup.c
@@ -732,8 +732,7 @@ void __init build_cnode_tables(void)
kl_config_hdr_t *klgraph_header;
nasid = cnodeid_to_nasid(node);
klgraph_header = ia64_sn_get_klconfig_addr(nasid);
- if (klgraph_header == NULL)
- BUG();
+ BUG_ON(klgraph_header == NULL);
brd = NODE_OFFSET_TO_LBOARD(nasid, klgraph_header->ch_board_info);
while (brd) {
if (board_needs_cnode(brd->brd_type) && physical_node_map[brd->brd_nasid] < 0) {
@@ -750,7 +749,7 @@ nasid_slice_to_cpuid(int nasid, int slice)
{
long cpu;
- for (cpu = 0; cpu < NR_CPUS; cpu++)
+ for (cpu = 0; cpu < nr_cpu_ids; cpu++)
if (cpuid_to_nasid(cpu) == nasid &&
cpuid_to_slice(cpu) == slice)
return cpu;
diff --git a/arch/ia64/sn/kernel/sn2/sn2_smp.c b/arch/ia64/sn/kernel/sn2/sn2_smp.c
index e585f9a2afb..1176506b2ba 100644
--- a/arch/ia64/sn/kernel/sn2/sn2_smp.c
+++ b/arch/ia64/sn/kernel/sn2/sn2_smp.c
@@ -133,7 +133,7 @@ sn2_ipi_flush_all_tlb(struct mm_struct *mm)
unsigned long itc;
itc = ia64_get_itc();
- smp_flush_tlb_cpumask(mm->cpu_vm_mask);
+ smp_flush_tlb_cpumask(*mm_cpumask(mm));
itc = ia64_get_itc() - itc;
__get_cpu_var(ptcstats).shub_ipi_flushes_itc_clocks += itc;
__get_cpu_var(ptcstats).shub_ipi_flushes++;
@@ -182,7 +182,7 @@ sn2_global_tlb_purge(struct mm_struct *mm, unsigned long start,
nodes_clear(nodes_flushed);
i = 0;
- for_each_cpu_mask(cpu, mm->cpu_vm_mask) {
+ for_each_cpu(cpu, mm_cpumask(mm)) {
cnode = cpu_to_node(cpu);
node_set(cnode, nodes_flushed);
lcpu = cpu;
@@ -461,7 +461,7 @@ bool sn_cpu_disable_allowed(int cpu)
static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset)
{
- if (*offset < NR_CPUS)
+ if (*offset < nr_cpu_ids)
return offset;
return NULL;
}
@@ -469,7 +469,7 @@ static void *sn2_ptc_seq_start(struct seq_file *file, loff_t * offset)
static void *sn2_ptc_seq_next(struct seq_file *file, void *data, loff_t * offset)
{
(*offset)++;
- if (*offset < NR_CPUS)
+ if (*offset < nr_cpu_ids)
return offset;
return NULL;
}
@@ -491,7 +491,7 @@ static int sn2_ptc_seq_show(struct seq_file *file, void *data)
seq_printf(file, "# ptctest %d, flushopt %d\n", sn2_ptctest, sn2_flush_opt);
}
- if (cpu < NR_CPUS && cpu_online(cpu)) {
+ if (cpu < nr_cpu_ids && cpu_online(cpu)) {
stat = &per_cpu(ptcstats, cpu);
seq_printf(file, "cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld\n", cpu, stat->ptc_l,
stat->change_rid, stat->shub_ptc_flushes, stat->nodes_flushed,
@@ -554,7 +554,7 @@ static int __init sn2_ptc_init(void)
proc_sn2_ptc = proc_create(PTC_BASENAME, 0444,
NULL, &proc_sn2_ptc_operations);
- if (!&proc_sn2_ptc_operations) {
+ if (!proc_sn2_ptc) {
printk(KERN_ERR "unable to create %s proc entry", PTC_BASENAME);
return -EINVAL;
}
diff --git a/arch/ia64/sn/kernel/sn2/sn_hwperf.c b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
index be339477f90..9e6491cf72b 100644
--- a/arch/ia64/sn/kernel/sn2/sn_hwperf.c
+++ b/arch/ia64/sn/kernel/sn2/sn_hwperf.c
@@ -275,8 +275,7 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb
/* get it's interconnect topology */
sz = op->ports * sizeof(struct sn_hwperf_port_info);
- if (sz > sizeof(ptdata))
- BUG();
+ BUG_ON(sz > sizeof(ptdata));
e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
SN_HWPERF_ENUM_PORTS, nodeobj->id, sz,
(u64)&ptdata, 0, 0, NULL);
@@ -310,8 +309,7 @@ static int sn_hwperf_get_nearest_node_objdata(struct sn_hwperf_object_info *objb
if (router && (!found_cpu || !found_mem)) {
/* search for a node connected to the same router */
sz = router->ports * sizeof(struct sn_hwperf_port_info);
- if (sz > sizeof(ptdata))
- BUG();
+ BUG_ON(sz > sizeof(ptdata));
e = ia64_sn_hwperf_op(sn_hwperf_master_nasid,
SN_HWPERF_ENUM_PORTS, router->id, sz,
(u64)&ptdata, 0, 0, NULL);
@@ -612,7 +610,7 @@ static int sn_hwperf_op_cpu(struct sn_hwperf_op_info *op_info)
op_info->a->arg &= SN_HWPERF_ARG_OBJID_MASK;
if (cpu != SN_HWPERF_ARG_ANY_CPU) {
- if (cpu >= NR_CPUS || !cpu_online(cpu)) {
+ if (cpu >= nr_cpu_ids || !cpu_online(cpu)) {
r = -EINVAL;
goto out;
}
diff --git a/arch/ia64/sn/pci/pcibr/pcibr_dma.c b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
index 060df4aa991..c659ad5613a 100644
--- a/arch/ia64/sn/pci/pcibr/pcibr_dma.c
+++ b/arch/ia64/sn/pci/pcibr/pcibr_dma.c
@@ -256,9 +256,7 @@ void sn_dma_flush(u64 addr)
hubinfo = (NODEPDA(nasid_to_cnodeid(nasid)))->pdinfo;
- if (!hubinfo) {
- BUG();
- }
+ BUG_ON(!hubinfo);
flush_nasid_list = &hubinfo->hdi_flush_nasid_list;
if (flush_nasid_list->widget_p == NULL)
diff --git a/arch/ia64/xen/Makefile b/arch/ia64/xen/Makefile
index 0ad0224693d..e6f4a0a7422 100644
--- a/arch/ia64/xen/Makefile
+++ b/arch/ia64/xen/Makefile
@@ -3,14 +3,29 @@
#
obj-y := hypercall.o xenivt.o xensetup.o xen_pv_ops.o irq_xen.o \
- hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o
+ hypervisor.o xencomm.o xcom_hcall.o grant-table.o time.o suspend.o \
+ gate-data.o
obj-$(CONFIG_IA64_GENERIC) += machvec.o
+# The gate DSO image is built using a special linker script.
+include $(srctree)/arch/ia64/kernel/Makefile.gate
+
+# tell compiled for xen
+CPPFLAGS_gate.lds += -D__IA64_GATE_PARAVIRTUALIZED_XEN
+AFLAGS_gate.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN -D__IA64_GATE_PARAVIRTUALIZED_XEN
+
+# use same file of native.
+$(obj)/gate.o: $(src)/../kernel/gate.S FORCE
+ $(call if_changed_dep,as_o_S)
+$(obj)/gate.lds: $(src)/../kernel/gate.lds.S FORCE
+ $(call if_changed_dep,cpp_lds_S)
+
+
AFLAGS_xenivt.o += -D__IA64_ASM_PARAVIRTUALIZED_XEN
# xen multi compile
-ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S
+ASM_PARAVIRT_MULTI_COMPILE_SRCS = ivt.S entry.S fsys.S
ASM_PARAVIRT_OBJS = $(addprefix xen-,$(ASM_PARAVIRT_MULTI_COMPILE_SRCS:.S=.o))
obj-y += $(ASM_PARAVIRT_OBJS)
define paravirtualized_xen
diff --git a/arch/ia64/xen/gate-data.S b/arch/ia64/xen/gate-data.S
new file mode 100644
index 00000000000..7d4830afc91
--- /dev/null
+++ b/arch/ia64/xen/gate-data.S
@@ -0,0 +1,3 @@
+ .section .data.gate.xen, "aw"
+
+ .incbin "arch/ia64/xen/gate.so"
diff --git a/arch/ia64/xen/hypercall.S b/arch/ia64/xen/hypercall.S
index 45e02bb64a9..e32dae444dd 100644
--- a/arch/ia64/xen/hypercall.S
+++ b/arch/ia64/xen/hypercall.S
@@ -9,6 +9,7 @@
#include <asm/intrinsics.h>
#include <asm/xen/privop.h>
+#ifdef __INTEL_COMPILER
/*
* Hypercalls without parameter.
*/
@@ -72,6 +73,7 @@ GLOBAL_ENTRY(xen_set_rr0_to_rr4)
br.ret.sptk.many rp
;;
END(xen_set_rr0_to_rr4)
+#endif
GLOBAL_ENTRY(xen_send_ipi)
mov r14=r32
diff --git a/arch/ia64/xen/time.c b/arch/ia64/xen/time.c
index 68d6204c3f1..fb833269017 100644
--- a/arch/ia64/xen/time.c
+++ b/arch/ia64/xen/time.c
@@ -175,10 +175,58 @@ static void xen_itc_jitter_data_reset(void)
} while (unlikely(ret != lcycle));
}
+/* based on xen_sched_clock() in arch/x86/xen/time.c. */
+/*
+ * This relies on HAVE_UNSTABLE_SCHED_CLOCK. If it can't be defined,
+ * something similar logic should be implemented here.
+ */
+/*
+ * Xen sched_clock implementation. Returns the number of unstolen
+ * nanoseconds, which is nanoseconds the VCPU spent in RUNNING+BLOCKED
+ * states.
+ */
+static unsigned long long xen_sched_clock(void)
+{
+ struct vcpu_runstate_info runstate;
+
+ unsigned long long now;
+ unsigned long long offset;
+ unsigned long long ret;
+
+ /*
+ * Ideally sched_clock should be called on a per-cpu basis
+ * anyway, so preempt should already be disabled, but that's
+ * not current practice at the moment.
+ */
+ preempt_disable();
+
+ /*
+ * both ia64_native_sched_clock() and xen's runstate are
+ * based on mAR.ITC. So difference of them makes sense.
+ */
+ now = ia64_native_sched_clock();
+
+ get_runstate_snapshot(&runstate);
+
+ WARN_ON(runstate.state != RUNSTATE_running);
+
+ offset = 0;
+ if (now > runstate.state_entry_time)
+ offset = now - runstate.state_entry_time;
+ ret = runstate.time[RUNSTATE_blocked] +
+ runstate.time[RUNSTATE_running] +
+ offset;
+
+ preempt_enable();
+
+ return ret;
+}
+
struct pv_time_ops xen_time_ops __initdata = {
.init_missing_ticks_accounting = xen_init_missing_ticks_accounting,
.do_steal_accounting = xen_do_steal_accounting,
.clocksource_resume = xen_itc_jitter_data_reset,
+ .sched_clock = xen_sched_clock,
};
/* Called after suspend, to resume time. */
diff --git a/arch/ia64/xen/xen_pv_ops.c b/arch/ia64/xen/xen_pv_ops.c
index 936cff3c96e..5e2270a999f 100644
--- a/arch/ia64/xen/xen_pv_ops.c
+++ b/arch/ia64/xen/xen_pv_ops.c
@@ -24,6 +24,7 @@
#include <linux/irq.h>
#include <linux/kernel.h>
#include <linux/pm.h>
+#include <linux/unistd.h>
#include <asm/xen/hypervisor.h>
#include <asm/xen/xencomm.h>
@@ -153,6 +154,13 @@ xen_post_smp_prepare_boot_cpu(void)
xen_setup_vcpu_info_placement();
}
+#ifdef ASM_SUPPORTED
+static unsigned long __init_or_module
+xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type);
+#endif
+static void __init
+xen_patch_branch(unsigned long tag, unsigned long type);
+
static const struct pv_init_ops xen_init_ops __initconst = {
.banner = xen_banner,
@@ -163,6 +171,53 @@ static const struct pv_init_ops xen_init_ops __initconst = {
.arch_setup_nomca = xen_arch_setup_nomca,
.post_smp_prepare_boot_cpu = xen_post_smp_prepare_boot_cpu,
+#ifdef ASM_SUPPORTED
+ .patch_bundle = xen_patch_bundle,
+#endif
+ .patch_branch = xen_patch_branch,
+};
+
+/***************************************************************************
+ * pv_fsys_data
+ * addresses for fsys
+ */
+
+extern unsigned long xen_fsyscall_table[NR_syscalls];
+extern char xen_fsys_bubble_down[];
+struct pv_fsys_data xen_fsys_data __initdata = {
+ .fsyscall_table = (unsigned long *)xen_fsyscall_table,
+ .fsys_bubble_down = (void *)xen_fsys_bubble_down,
+};
+
+/***************************************************************************
+ * pv_patchdata
+ * patchdata addresses
+ */
+
+#define DECLARE(name) \
+ extern unsigned long __xen_start_gate_##name##_patchlist[]; \
+ extern unsigned long __xen_end_gate_##name##_patchlist[]
+
+DECLARE(fsyscall);
+DECLARE(brl_fsys_bubble_down);
+DECLARE(vtop);
+DECLARE(mckinley_e9);
+
+extern unsigned long __xen_start_gate_section[];
+
+#define ASSIGN(name) \
+ .start_##name##_patchlist = \
+ (unsigned long)__xen_start_gate_##name##_patchlist, \
+ .end_##name##_patchlist = \
+ (unsigned long)__xen_end_gate_##name##_patchlist
+
+static struct pv_patchdata xen_patchdata __initdata = {
+ ASSIGN(fsyscall),
+ ASSIGN(brl_fsys_bubble_down),
+ ASSIGN(vtop),
+ ASSIGN(mckinley_e9),
+
+ .gate_section = (void*)__xen_start_gate_section,
};
/***************************************************************************
@@ -170,6 +225,76 @@ static const struct pv_init_ops xen_init_ops __initconst = {
* intrinsics hooks.
*/
+#ifndef ASM_SUPPORTED
+static void
+xen_set_itm_with_offset(unsigned long val)
+{
+ /* ia64_cpu_local_tick() calls this with interrupt enabled. */
+ /* WARN_ON(!irqs_disabled()); */
+ xen_set_itm(val - XEN_MAPPEDREGS->itc_offset);
+}
+
+static unsigned long
+xen_get_itm_with_offset(void)
+{
+ /* unused at this moment */
+ printk(KERN_DEBUG "%s is called.\n", __func__);
+
+ WARN_ON(!irqs_disabled());
+ return ia64_native_getreg(_IA64_REG_CR_ITM) +
+ XEN_MAPPEDREGS->itc_offset;
+}
+
+/* ia64_set_itc() is only called by
+ * cpu_init() with ia64_set_itc(0) and ia64_sync_itc().
+ * So XEN_MAPPEDRESG->itc_offset cal be considered as almost constant.
+ */
+static void
+xen_set_itc(unsigned long val)
+{
+ unsigned long mitc;
+
+ WARN_ON(!irqs_disabled());
+ mitc = ia64_native_getreg(_IA64_REG_AR_ITC);
+ XEN_MAPPEDREGS->itc_offset = val - mitc;
+ XEN_MAPPEDREGS->itc_last = val;
+}
+
+static unsigned long
+xen_get_itc(void)
+{
+ unsigned long res;
+ unsigned long itc_offset;
+ unsigned long itc_last;
+ unsigned long ret_itc_last;
+
+ itc_offset = XEN_MAPPEDREGS->itc_offset;
+ do {
+ itc_last = XEN_MAPPEDREGS->itc_last;
+ res = ia64_native_getreg(_IA64_REG_AR_ITC);
+ res += itc_offset;
+ if (itc_last >= res)
+ res = itc_last + 1;
+ ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last,
+ itc_last, res);
+ } while (unlikely(ret_itc_last != itc_last));
+ return res;
+
+#if 0
+ /* ia64_itc_udelay() calls ia64_get_itc() with interrupt enabled.
+ Should it be paravirtualized instead? */
+ WARN_ON(!irqs_disabled());
+ itc_offset = XEN_MAPPEDREGS->itc_offset;
+ itc_last = XEN_MAPPEDREGS->itc_last;
+ res = ia64_native_getreg(_IA64_REG_AR_ITC);
+ res += itc_offset;
+ if (itc_last >= res)
+ res = itc_last + 1;
+ XEN_MAPPEDREGS->itc_last = res;
+ return res;
+#endif
+}
+
static void xen_setreg(int regnum, unsigned long val)
{
switch (regnum) {
@@ -181,11 +306,14 @@ static void xen_setreg(int regnum, unsigned long val)
xen_set_eflag(val);
break;
#endif
+ case _IA64_REG_AR_ITC:
+ xen_set_itc(val);
+ break;
case _IA64_REG_CR_TPR:
xen_set_tpr(val);
break;
case _IA64_REG_CR_ITM:
- xen_set_itm(val);
+ xen_set_itm_with_offset(val);
break;
case _IA64_REG_CR_EOI:
xen_eoi(val);
@@ -209,6 +337,12 @@ static unsigned long xen_getreg(int regnum)
res = xen_get_eflag();
break;
#endif
+ case _IA64_REG_AR_ITC:
+ res = xen_get_itc();
+ break;
+ case _IA64_REG_CR_ITM:
+ res = xen_get_itm_with_offset();
+ break;
case _IA64_REG_CR_IVR:
res = xen_get_ivr();
break;
@@ -259,8 +393,417 @@ xen_intrin_local_irq_restore(unsigned long mask)
else
xen_rsm_i();
}
+#else
+#define __DEFINE_FUNC(name, code) \
+ extern const char xen_ ## name ## _direct_start[]; \
+ extern const char xen_ ## name ## _direct_end[]; \
+ asm (".align 32\n" \
+ ".proc xen_" #name "\n" \
+ "xen_" #name ":\n" \
+ "xen_" #name "_direct_start:\n" \
+ code \
+ "xen_" #name "_direct_end:\n" \
+ "br.cond.sptk.many b6\n" \
+ ".endp xen_" #name "\n")
+
+#define DEFINE_VOID_FUNC0(name, code) \
+ extern void \
+ xen_ ## name (void); \
+ __DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1(name, code) \
+ extern void \
+ xen_ ## name (unsigned long arg); \
+ __DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC1_VOID(name, code) \
+ extern void \
+ xen_ ## name (void *arg); \
+ __DEFINE_FUNC(name, code)
+
+#define DEFINE_VOID_FUNC2(name, code) \
+ extern void \
+ xen_ ## name (unsigned long arg0, \
+ unsigned long arg1); \
+ __DEFINE_FUNC(name, code)
-static const struct pv_cpu_ops xen_cpu_ops __initdata = {
+#define DEFINE_FUNC0(name, code) \
+ extern unsigned long \
+ xen_ ## name (void); \
+ __DEFINE_FUNC(name, code)
+
+#define DEFINE_FUNC1(name, type, code) \
+ extern unsigned long \
+ xen_ ## name (type arg); \
+ __DEFINE_FUNC(name, code)
+
+#define XEN_PSR_I_ADDR_ADDR (XSI_BASE + XSI_PSR_I_ADDR_OFS)
+
+/*
+ * static void xen_set_itm_with_offset(unsigned long val)
+ * xen_set_itm(val - XEN_MAPPEDREGS->itc_offset);
+ */
+/* 2 bundles */
+DEFINE_VOID_FUNC1(set_itm_with_offset,
+ "mov r2 = " __stringify(XSI_BASE) " + "
+ __stringify(XSI_ITC_OFFSET_OFS) "\n"
+ ";;\n"
+ "ld8 r3 = [r2]\n"
+ ";;\n"
+ "sub r8 = r8, r3\n"
+ "break " __stringify(HYPERPRIVOP_SET_ITM) "\n");
+
+/*
+ * static unsigned long xen_get_itm_with_offset(void)
+ * return ia64_native_getreg(_IA64_REG_CR_ITM) + XEN_MAPPEDREGS->itc_offset;
+ */
+/* 2 bundles */
+DEFINE_FUNC0(get_itm_with_offset,
+ "mov r2 = " __stringify(XSI_BASE) " + "
+ __stringify(XSI_ITC_OFFSET_OFS) "\n"
+ ";;\n"
+ "ld8 r3 = [r2]\n"
+ "mov r8 = cr.itm\n"
+ ";;\n"
+ "add r8 = r8, r2\n");
+
+/*
+ * static void xen_set_itc(unsigned long val)
+ * unsigned long mitc;
+ *
+ * WARN_ON(!irqs_disabled());
+ * mitc = ia64_native_getreg(_IA64_REG_AR_ITC);
+ * XEN_MAPPEDREGS->itc_offset = val - mitc;
+ * XEN_MAPPEDREGS->itc_last = val;
+ */
+/* 2 bundles */
+DEFINE_VOID_FUNC1(set_itc,
+ "mov r2 = " __stringify(XSI_BASE) " + "
+ __stringify(XSI_ITC_LAST_OFS) "\n"
+ "mov r3 = ar.itc\n"
+ ";;\n"
+ "sub r3 = r8, r3\n"
+ "st8 [r2] = r8, "
+ __stringify(XSI_ITC_LAST_OFS) " - "
+ __stringify(XSI_ITC_OFFSET_OFS) "\n"
+ ";;\n"
+ "st8 [r2] = r3\n");
+
+/*
+ * static unsigned long xen_get_itc(void)
+ * unsigned long res;
+ * unsigned long itc_offset;
+ * unsigned long itc_last;
+ * unsigned long ret_itc_last;
+ *
+ * itc_offset = XEN_MAPPEDREGS->itc_offset;
+ * do {
+ * itc_last = XEN_MAPPEDREGS->itc_last;
+ * res = ia64_native_getreg(_IA64_REG_AR_ITC);
+ * res += itc_offset;
+ * if (itc_last >= res)
+ * res = itc_last + 1;
+ * ret_itc_last = cmpxchg(&XEN_MAPPEDREGS->itc_last,
+ * itc_last, res);
+ * } while (unlikely(ret_itc_last != itc_last));
+ * return res;
+ */
+/* 5 bundles */
+DEFINE_FUNC0(get_itc,
+ "mov r2 = " __stringify(XSI_BASE) " + "
+ __stringify(XSI_ITC_OFFSET_OFS) "\n"
+ ";;\n"
+ "ld8 r9 = [r2], " __stringify(XSI_ITC_LAST_OFS) " - "
+ __stringify(XSI_ITC_OFFSET_OFS) "\n"
+ /* r9 = itc_offset */
+ /* r2 = XSI_ITC_OFFSET */
+ "888:\n"
+ "mov r8 = ar.itc\n" /* res = ar.itc */
+ ";;\n"
+ "ld8 r3 = [r2]\n" /* r3 = itc_last */
+ "add r8 = r8, r9\n" /* res = ar.itc + itc_offset */
+ ";;\n"
+ "cmp.gtu p6, p0 = r3, r8\n"
+ ";;\n"
+ "(p6) add r8 = 1, r3\n" /* if (itc_last > res) itc_last + 1 */
+ ";;\n"
+ "mov ar.ccv = r8\n"
+ ";;\n"
+ "cmpxchg8.acq r10 = [r2], r8, ar.ccv\n"
+ ";;\n"
+ "cmp.ne p6, p0 = r10, r3\n"
+ "(p6) hint @pause\n"
+ "(p6) br.cond.spnt 888b\n");
+
+DEFINE_VOID_FUNC1_VOID(fc,
+ "break " __stringify(HYPERPRIVOP_FC) "\n");
+
+/*
+ * psr_i_addr_addr = XEN_PSR_I_ADDR_ADDR
+ * masked_addr = *psr_i_addr_addr
+ * pending_intr_addr = masked_addr - 1
+ * if (val & IA64_PSR_I) {
+ * masked = *masked_addr
+ * *masked_addr = 0:xen_set_virtual_psr_i(1)
+ * compiler barrier
+ * if (masked) {
+ * uint8_t pending = *pending_intr_addr;
+ * if (pending)
+ * XEN_HYPER_SSM_I
+ * }
+ * } else {
+ * *masked_addr = 1:xen_set_virtual_psr_i(0)
+ * }
+ */
+/* 6 bundles */
+DEFINE_VOID_FUNC1(intrin_local_irq_restore,
+ /* r8 = input value: 0 or IA64_PSR_I
+ * p6 = (flags & IA64_PSR_I)
+ * = if clause
+ * p7 = !(flags & IA64_PSR_I)
+ * = else clause
+ */
+ "cmp.ne p6, p7 = r8, r0\n"
+ "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+ ";;\n"
+ /* r9 = XEN_PSR_I_ADDR */
+ "ld8 r9 = [r9]\n"
+ ";;\n"
+
+ /* r10 = masked previous value */
+ "(p6) ld1.acq r10 = [r9]\n"
+ ";;\n"
+
+ /* p8 = !masked interrupt masked previously? */
+ "(p6) cmp.ne.unc p8, p0 = r10, r0\n"
+
+ /* p7 = else clause */
+ "(p7) mov r11 = 1\n"
+ ";;\n"
+ /* masked = 1 */
+ "(p7) st1.rel [r9] = r11\n"
+
+ /* p6 = if clause */
+ /* masked = 0
+ * r9 = masked_addr - 1
+ * = pending_intr_addr
+ */
+ "(p8) st1.rel [r9] = r0, -1\n"
+ ";;\n"
+ /* r8 = pending_intr */
+ "(p8) ld1.acq r11 = [r9]\n"
+ ";;\n"
+ /* p9 = interrupt pending? */
+ "(p8) cmp.ne.unc p9, p10 = r11, r0\n"
+ ";;\n"
+ "(p10) mf\n"
+ /* issue hypercall to trigger interrupt */
+ "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n");
+
+DEFINE_VOID_FUNC2(ptcga,
+ "break " __stringify(HYPERPRIVOP_PTC_GA) "\n");
+DEFINE_VOID_FUNC2(set_rr,
+ "break " __stringify(HYPERPRIVOP_SET_RR) "\n");
+
+/*
+ * tmp = XEN_MAPPEDREGS->interrupt_mask_addr = XEN_PSR_I_ADDR_ADDR;
+ * tmp = *tmp
+ * tmp = *tmp;
+ * psr_i = tmp? 0: IA64_PSR_I;
+ */
+/* 4 bundles */
+DEFINE_FUNC0(get_psr_i,
+ "mov r9 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+ ";;\n"
+ "ld8 r9 = [r9]\n" /* r9 = XEN_PSR_I_ADDR */
+ "mov r8 = 0\n" /* psr_i = 0 */
+ ";;\n"
+ "ld1.acq r9 = [r9]\n" /* r9 = XEN_PSR_I */
+ ";;\n"
+ "cmp.eq.unc p6, p0 = r9, r0\n" /* p6 = (XEN_PSR_I != 0) */
+ ";;\n"
+ "(p6) mov r8 = " __stringify(1 << IA64_PSR_I_BIT) "\n");
+
+DEFINE_FUNC1(thash, unsigned long,
+ "break " __stringify(HYPERPRIVOP_THASH) "\n");
+DEFINE_FUNC1(get_cpuid, int,
+ "break " __stringify(HYPERPRIVOP_GET_CPUID) "\n");
+DEFINE_FUNC1(get_pmd, int,
+ "break " __stringify(HYPERPRIVOP_GET_PMD) "\n");
+DEFINE_FUNC1(get_rr, unsigned long,
+ "break " __stringify(HYPERPRIVOP_GET_RR) "\n");
+
+/*
+ * void xen_privop_ssm_i(void)
+ *
+ * int masked = !xen_get_virtual_psr_i();
+ * // masked = *(*XEN_MAPPEDREGS->interrupt_mask_addr)
+ * xen_set_virtual_psr_i(1)
+ * // *(*XEN_MAPPEDREGS->interrupt_mask_addr) = 0
+ * // compiler barrier
+ * if (masked) {
+ * uint8_t* pend_int_addr =
+ * (uint8_t*)(*XEN_MAPPEDREGS->interrupt_mask_addr) - 1;
+ * uint8_t pending = *pend_int_addr;
+ * if (pending)
+ * XEN_HYPER_SSM_I
+ * }
+ */
+/* 4 bundles */
+DEFINE_VOID_FUNC0(ssm_i,
+ "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+ ";;\n"
+ "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I_ADDR */
+ ";;\n"
+ "ld1.acq r9 = [r8]\n" /* r9 = XEN_PSR_I */
+ ";;\n"
+ "st1.rel [r8] = r0, -1\n" /* psr_i = 0. enable interrupt
+ * r8 = XEN_PSR_I_ADDR - 1
+ * = pend_int_addr
+ */
+ "cmp.eq.unc p0, p6 = r9, r0\n"/* p6 = !XEN_PSR_I
+ * previously interrupt
+ * masked?
+ */
+ ";;\n"
+ "(p6) ld1.acq r8 = [r8]\n" /* r8 = xen_pend_int */
+ ";;\n"
+ "(p6) cmp.eq.unc p6, p7 = r8, r0\n" /*interrupt pending?*/
+ ";;\n"
+ /* issue hypercall to get interrupt */
+ "(p7) break " __stringify(HYPERPRIVOP_SSM_I) "\n"
+ ";;\n");
+
+/*
+ * psr_i_addr_addr = XEN_MAPPEDREGS->interrupt_mask_addr
+ * = XEN_PSR_I_ADDR_ADDR;
+ * psr_i_addr = *psr_i_addr_addr;
+ * *psr_i_addr = 1;
+ */
+/* 2 bundles */
+DEFINE_VOID_FUNC0(rsm_i,
+ "mov r8 = " __stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+ /* r8 = XEN_PSR_I_ADDR */
+ "mov r9 = 1\n"
+ ";;\n"
+ "ld8 r8 = [r8]\n" /* r8 = XEN_PSR_I */
+ ";;\n"
+ "st1.rel [r8] = r9\n"); /* XEN_PSR_I = 1 */
+
+extern void
+xen_set_rr0_to_rr4(unsigned long val0, unsigned long val1,
+ unsigned long val2, unsigned long val3,
+ unsigned long val4);
+__DEFINE_FUNC(set_rr0_to_rr4,
+ "break " __stringify(HYPERPRIVOP_SET_RR0_TO_RR4) "\n");
+
+
+extern unsigned long xen_getreg(int regnum);
+#define __DEFINE_GET_REG(id, privop) \
+ "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \
+ ";;\n" \
+ "cmp.eq p6, p0 = r2, r8\n" \
+ ";;\n" \
+ "(p6) break " __stringify(HYPERPRIVOP_GET_ ## privop) "\n" \
+ "(p6) br.cond.sptk.many b6\n" \
+ ";;\n"
+
+__DEFINE_FUNC(getreg,
+ __DEFINE_GET_REG(PSR, PSR)
+#ifdef CONFIG_IA32_SUPPORT
+ __DEFINE_GET_REG(AR_EFLAG, EFLAG)
+#endif
+
+ /* get_itc */
+ "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n"
+ ";;\n"
+ "cmp.eq p6, p0 = r2, r8\n"
+ ";;\n"
+ "(p6) br.cond.spnt xen_get_itc\n"
+ ";;\n"
+
+ /* get itm */
+ "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n"
+ ";;\n"
+ "cmp.eq p6, p0 = r2, r8\n"
+ ";;\n"
+ "(p6) br.cond.spnt xen_get_itm_with_offset\n"
+ ";;\n"
+
+ __DEFINE_GET_REG(CR_IVR, IVR)
+ __DEFINE_GET_REG(CR_TPR, TPR)
+
+ /* fall back */
+ "movl r2 = ia64_native_getreg_func\n"
+ ";;\n"
+ "mov b7 = r2\n"
+ ";;\n"
+ "br.cond.sptk.many b7\n");
+
+extern void xen_setreg(int regnum, unsigned long val);
+#define __DEFINE_SET_REG(id, privop) \
+ "mov r2 = " __stringify(_IA64_REG_ ## id) "\n" \
+ ";;\n" \
+ "cmp.eq p6, p0 = r2, r9\n" \
+ ";;\n" \
+ "(p6) break " __stringify(HYPERPRIVOP_ ## privop) "\n" \
+ "(p6) br.cond.sptk.many b6\n" \
+ ";;\n"
+
+__DEFINE_FUNC(setreg,
+ /* kr0 .. kr 7*/
+ /*
+ * if (_IA64_REG_AR_KR0 <= regnum &&
+ * regnum <= _IA64_REG_AR_KR7) {
+ * register __index asm ("r8") = regnum - _IA64_REG_AR_KR0
+ * register __val asm ("r9") = val
+ * "break HYPERPRIVOP_SET_KR"
+ * }
+ */
+ "mov r17 = r9\n"
+ "mov r2 = " __stringify(_IA64_REG_AR_KR0) "\n"
+ ";;\n"
+ "cmp.ge p6, p0 = r9, r2\n"
+ "sub r17 = r17, r2\n"
+ ";;\n"
+ "(p6) cmp.ge.unc p7, p0 = "
+ __stringify(_IA64_REG_AR_KR7) " - " __stringify(_IA64_REG_AR_KR0)
+ ", r17\n"
+ ";;\n"
+ "(p7) mov r9 = r8\n"
+ ";;\n"
+ "(p7) mov r8 = r17\n"
+ "(p7) break " __stringify(HYPERPRIVOP_SET_KR) "\n"
+
+ /* set itm */
+ "mov r2 = " __stringify(_IA64_REG_CR_ITM) "\n"
+ ";;\n"
+ "cmp.eq p6, p0 = r2, r8\n"
+ ";;\n"
+ "(p6) br.cond.spnt xen_set_itm_with_offset\n"
+
+ /* set itc */
+ "mov r2 = " __stringify(_IA64_REG_AR_ITC) "\n"
+ ";;\n"
+ "cmp.eq p6, p0 = r2, r8\n"
+ ";;\n"
+ "(p6) br.cond.spnt xen_set_itc\n"
+
+#ifdef CONFIG_IA32_SUPPORT
+ __DEFINE_SET_REG(AR_EFLAG, SET_EFLAG)
+#endif
+ __DEFINE_SET_REG(CR_TPR, SET_TPR)
+ __DEFINE_SET_REG(CR_EOI, EOI)
+
+ /* fall back */
+ "movl r2 = ia64_native_setreg_func\n"
+ ";;\n"
+ "mov b7 = r2\n"
+ ";;\n"
+ "br.cond.sptk.many b7\n");
+#endif
+
+static const struct pv_cpu_ops xen_cpu_ops __initconst = {
.fc = xen_fc,
.thash = xen_thash,
.get_cpuid = xen_get_cpuid,
@@ -337,7 +880,7 @@ xen_iosapic_write(char __iomem *iosapic, unsigned int reg, u32 val)
HYPERVISOR_physdev_op(PHYSDEVOP_apic_write, &apic_op);
}
-static const struct pv_iosapic_ops xen_iosapic_ops __initconst = {
+static struct pv_iosapic_ops xen_iosapic_ops __initdata = {
.pcat_compat_init = xen_pcat_compat_init,
.__get_irq_chip = xen_iosapic_get_irq_chip,
@@ -355,6 +898,8 @@ xen_setup_pv_ops(void)
xen_info_init();
pv_info = xen_info;
pv_init_ops = xen_init_ops;
+ pv_fsys_data = xen_fsys_data;
+ pv_patchdata = xen_patchdata;
pv_cpu_ops = xen_cpu_ops;
pv_iosapic_ops = xen_iosapic_ops;
pv_irq_ops = xen_irq_ops;
@@ -362,3 +907,252 @@ xen_setup_pv_ops(void)
paravirt_cpu_asm_init(&xen_cpu_asm_switch);
}
+
+#ifdef ASM_SUPPORTED
+/***************************************************************************
+ * binary pacthing
+ * pv_init_ops.patch_bundle
+ */
+
+#define DEFINE_FUNC_GETREG(name, privop) \
+ DEFINE_FUNC0(get_ ## name, \
+ "break "__stringify(HYPERPRIVOP_GET_ ## privop) "\n")
+
+DEFINE_FUNC_GETREG(psr, PSR);
+DEFINE_FUNC_GETREG(eflag, EFLAG);
+DEFINE_FUNC_GETREG(ivr, IVR);
+DEFINE_FUNC_GETREG(tpr, TPR);
+
+#define DEFINE_FUNC_SET_KR(n) \
+ DEFINE_VOID_FUNC0(set_kr ## n, \
+ ";;\n" \
+ "mov r9 = r8\n" \
+ "mov r8 = " #n "\n" \
+ "break " __stringify(HYPERPRIVOP_SET_KR) "\n")
+
+DEFINE_FUNC_SET_KR(0);
+DEFINE_FUNC_SET_KR(1);
+DEFINE_FUNC_SET_KR(2);
+DEFINE_FUNC_SET_KR(3);
+DEFINE_FUNC_SET_KR(4);
+DEFINE_FUNC_SET_KR(5);
+DEFINE_FUNC_SET_KR(6);
+DEFINE_FUNC_SET_KR(7);
+
+#define __DEFINE_FUNC_SETREG(name, privop) \
+ DEFINE_VOID_FUNC0(name, \
+ "break "__stringify(HYPERPRIVOP_ ## privop) "\n")
+
+#define DEFINE_FUNC_SETREG(name, privop) \
+ __DEFINE_FUNC_SETREG(set_ ## name, SET_ ## privop)
+
+DEFINE_FUNC_SETREG(eflag, EFLAG);
+DEFINE_FUNC_SETREG(tpr, TPR);
+__DEFINE_FUNC_SETREG(eoi, EOI);
+
+extern const char xen_check_events[];
+extern const char __xen_intrin_local_irq_restore_direct_start[];
+extern const char __xen_intrin_local_irq_restore_direct_end[];
+extern const unsigned long __xen_intrin_local_irq_restore_direct_reloc;
+
+asm (
+ ".align 32\n"
+ ".proc xen_check_events\n"
+ "xen_check_events:\n"
+ /* masked = 0
+ * r9 = masked_addr - 1
+ * = pending_intr_addr
+ */
+ "st1.rel [r9] = r0, -1\n"
+ ";;\n"
+ /* r8 = pending_intr */
+ "ld1.acq r11 = [r9]\n"
+ ";;\n"
+ /* p9 = interrupt pending? */
+ "cmp.ne p9, p10 = r11, r0\n"
+ ";;\n"
+ "(p10) mf\n"
+ /* issue hypercall to trigger interrupt */
+ "(p9) break " __stringify(HYPERPRIVOP_SSM_I) "\n"
+ "br.cond.sptk.many b6\n"
+ ".endp xen_check_events\n"
+ "\n"
+ ".align 32\n"
+ ".proc __xen_intrin_local_irq_restore_direct\n"
+ "__xen_intrin_local_irq_restore_direct:\n"
+ "__xen_intrin_local_irq_restore_direct_start:\n"
+ "1:\n"
+ "{\n"
+ "cmp.ne p6, p7 = r8, r0\n"
+ "mov r17 = ip\n" /* get ip to calc return address */
+ "mov r9 = "__stringify(XEN_PSR_I_ADDR_ADDR) "\n"
+ ";;\n"
+ "}\n"
+ "{\n"
+ /* r9 = XEN_PSR_I_ADDR */
+ "ld8 r9 = [r9]\n"
+ ";;\n"
+ /* r10 = masked previous value */
+ "(p6) ld1.acq r10 = [r9]\n"
+ "adds r17 = 1f - 1b, r17\n" /* calculate return address */
+ ";;\n"
+ "}\n"
+ "{\n"
+ /* p8 = !masked interrupt masked previously? */
+ "(p6) cmp.ne.unc p8, p0 = r10, r0\n"
+ "\n"
+ /* p7 = else clause */
+ "(p7) mov r11 = 1\n"
+ ";;\n"
+ "(p8) mov b6 = r17\n" /* set return address */
+ "}\n"
+ "{\n"
+ /* masked = 1 */
+ "(p7) st1.rel [r9] = r11\n"
+ "\n"
+ "[99:]\n"
+ "(p8) brl.cond.dptk.few xen_check_events\n"
+ "}\n"
+ /* pv calling stub is 5 bundles. fill nop to adjust return address */
+ "{\n"
+ "nop 0\n"
+ "nop 0\n"
+ "nop 0\n"
+ "}\n"
+ "1:\n"
+ "__xen_intrin_local_irq_restore_direct_end:\n"
+ ".endp __xen_intrin_local_irq_restore_direct\n"
+ "\n"
+ ".align 8\n"
+ "__xen_intrin_local_irq_restore_direct_reloc:\n"
+ "data8 99b\n"
+);
+
+static struct paravirt_patch_bundle_elem xen_patch_bundle_elems[]
+__initdata_or_module =
+{
+#define XEN_PATCH_BUNDLE_ELEM(name, type) \
+ { \
+ (void*)xen_ ## name ## _direct_start, \
+ (void*)xen_ ## name ## _direct_end, \
+ PARAVIRT_PATCH_TYPE_ ## type, \
+ }
+
+ XEN_PATCH_BUNDLE_ELEM(fc, FC),
+ XEN_PATCH_BUNDLE_ELEM(thash, THASH),
+ XEN_PATCH_BUNDLE_ELEM(get_cpuid, GET_CPUID),
+ XEN_PATCH_BUNDLE_ELEM(get_pmd, GET_PMD),
+ XEN_PATCH_BUNDLE_ELEM(ptcga, PTCGA),
+ XEN_PATCH_BUNDLE_ELEM(get_rr, GET_RR),
+ XEN_PATCH_BUNDLE_ELEM(set_rr, SET_RR),
+ XEN_PATCH_BUNDLE_ELEM(set_rr0_to_rr4, SET_RR0_TO_RR4),
+ XEN_PATCH_BUNDLE_ELEM(ssm_i, SSM_I),
+ XEN_PATCH_BUNDLE_ELEM(rsm_i, RSM_I),
+ XEN_PATCH_BUNDLE_ELEM(get_psr_i, GET_PSR_I),
+ {
+ (void*)__xen_intrin_local_irq_restore_direct_start,
+ (void*)__xen_intrin_local_irq_restore_direct_end,
+ PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE,
+ },
+
+#define XEN_PATCH_BUNDLE_ELEM_GETREG(name, reg) \
+ { \
+ xen_get_ ## name ## _direct_start, \
+ xen_get_ ## name ## _direct_end, \
+ PARAVIRT_PATCH_TYPE_GETREG + _IA64_REG_ ## reg, \
+ }
+
+ XEN_PATCH_BUNDLE_ELEM_GETREG(psr, PSR),
+ XEN_PATCH_BUNDLE_ELEM_GETREG(eflag, AR_EFLAG),
+
+ XEN_PATCH_BUNDLE_ELEM_GETREG(ivr, CR_IVR),
+ XEN_PATCH_BUNDLE_ELEM_GETREG(tpr, CR_TPR),
+
+ XEN_PATCH_BUNDLE_ELEM_GETREG(itc, AR_ITC),
+ XEN_PATCH_BUNDLE_ELEM_GETREG(itm_with_offset, CR_ITM),
+
+
+#define __XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \
+ { \
+ xen_ ## name ## _direct_start, \
+ xen_ ## name ## _direct_end, \
+ PARAVIRT_PATCH_TYPE_SETREG + _IA64_REG_ ## reg, \
+ }
+
+#define XEN_PATCH_BUNDLE_ELEM_SETREG(name, reg) \
+ __XEN_PATCH_BUNDLE_ELEM_SETREG(set_ ## name, reg)
+
+ XEN_PATCH_BUNDLE_ELEM_SETREG(kr0, AR_KR0),
+ XEN_PATCH_BUNDLE_ELEM_SETREG(kr1, AR_KR1),
+ XEN_PATCH_BUNDLE_ELEM_SETREG(kr2, AR_KR2),
+ XEN_PATCH_BUNDLE_ELEM_SETREG(kr3, AR_KR3),
+ XEN_PATCH_BUNDLE_ELEM_SETREG(kr4, AR_KR4),
+ XEN_PATCH_BUNDLE_ELEM_SETREG(kr5, AR_KR5),
+ XEN_PATCH_BUNDLE_ELEM_SETREG(kr6, AR_KR6),
+ XEN_PATCH_BUNDLE_ELEM_SETREG(kr7, AR_KR7),
+
+ XEN_PATCH_BUNDLE_ELEM_SETREG(eflag, AR_EFLAG),
+ XEN_PATCH_BUNDLE_ELEM_SETREG(tpr, CR_TPR),
+ __XEN_PATCH_BUNDLE_ELEM_SETREG(eoi, CR_EOI),
+
+ XEN_PATCH_BUNDLE_ELEM_SETREG(itc, AR_ITC),
+ XEN_PATCH_BUNDLE_ELEM_SETREG(itm_with_offset, CR_ITM),
+};
+
+static unsigned long __init_or_module
+xen_patch_bundle(void *sbundle, void *ebundle, unsigned long type)
+{
+ const unsigned long nelems = sizeof(xen_patch_bundle_elems) /
+ sizeof(xen_patch_bundle_elems[0]);
+ unsigned long used;
+ const struct paravirt_patch_bundle_elem *found;
+
+ used = __paravirt_patch_apply_bundle(sbundle, ebundle, type,
+ xen_patch_bundle_elems, nelems,
+ &found);
+
+ if (found == NULL)
+ /* fallback */
+ return ia64_native_patch_bundle(sbundle, ebundle, type);
+ if (used == 0)
+ return used;
+
+ /* relocation */
+ switch (type) {
+ case PARAVIRT_PATCH_TYPE_INTRIN_LOCAL_IRQ_RESTORE: {
+ unsigned long reloc =
+ __xen_intrin_local_irq_restore_direct_reloc;
+ unsigned long reloc_offset = reloc - (unsigned long)
+ __xen_intrin_local_irq_restore_direct_start;
+ unsigned long tag = (unsigned long)sbundle + reloc_offset;
+ paravirt_patch_reloc_brl(tag, xen_check_events);
+ break;
+ }
+ default:
+ /* nothing */
+ break;
+ }
+ return used;
+}
+#endif /* ASM_SUPPOTED */
+
+const struct paravirt_patch_branch_target xen_branch_target[]
+__initconst = {
+#define PARAVIRT_BR_TARGET(name, type) \
+ { \
+ &xen_ ## name, \
+ PARAVIRT_PATCH_TYPE_BR_ ## type, \
+ }
+ PARAVIRT_BR_TARGET(switch_to, SWITCH_TO),
+ PARAVIRT_BR_TARGET(leave_syscall, LEAVE_SYSCALL),
+ PARAVIRT_BR_TARGET(work_processed_syscall, WORK_PROCESSED_SYSCALL),
+ PARAVIRT_BR_TARGET(leave_kernel, LEAVE_KERNEL),
+};
+
+static void __init
+xen_patch_branch(unsigned long tag, unsigned long type)
+{
+ const unsigned long nelem =
+ sizeof(xen_branch_target) / sizeof(xen_branch_target[0]);
+ __paravirt_patch_apply_branch(tag, type, xen_branch_target, nelem);
+}
diff --git a/arch/x86/boot/memory.c b/arch/x86/boot/memory.c
index 8c3c25f3557..5054c2ddd1a 100644
--- a/arch/x86/boot/memory.c
+++ b/arch/x86/boot/memory.c
@@ -2,6 +2,7 @@
*
* Copyright (C) 1991, 1992 Linus Torvalds
* Copyright 2007 rPath, Inc. - All Rights Reserved
+ * Copyright 2009 Intel Corporation; author H. Peter Anvin
*
* This file is part of the Linux kernel, and is made available under
* the terms of the GNU General Public License version 2.
@@ -16,24 +17,38 @@
#define SMAP 0x534d4150 /* ASCII "SMAP" */
+struct e820_ext_entry {
+ struct e820entry std;
+ u32 ext_flags;
+} __attribute__((packed));
+
static int detect_memory_e820(void)
{
int count = 0;
u32 next = 0;
- u32 size, id;
+ u32 size, id, edi;
u8 err;
struct e820entry *desc = boot_params.e820_map;
+ static struct e820_ext_entry buf; /* static so it is zeroed */
+
+ /*
+ * Set this here so that if the BIOS doesn't change this field
+ * but still doesn't change %ecx, we're still okay...
+ */
+ buf.ext_flags = 1;
do {
- size = sizeof(struct e820entry);
+ size = sizeof buf;
- /* Important: %edx is clobbered by some BIOSes,
- so it must be either used for the error output
- or explicitly marked clobbered. */
- asm("int $0x15; setc %0"
+ /* Important: %edx and %esi are clobbered by some BIOSes,
+ so they must be either used for the error output
+ or explicitly marked clobbered. Given that, assume there
+ is something out there clobbering %ebp and %edi, too. */
+ asm("pushl %%ebp; int $0x15; popl %%ebp; setc %0"
: "=d" (err), "+b" (next), "=a" (id), "+c" (size),
- "=m" (*desc)
- : "D" (desc), "d" (SMAP), "a" (0xe820));
+ "=D" (edi), "+m" (buf)
+ : "D" (&buf), "d" (SMAP), "a" (0xe820)
+ : "esi");
/* BIOSes which terminate the chain with CF = 1 as opposed
to %ebx = 0 don't always report the SMAP signature on
@@ -51,8 +66,14 @@ static int detect_memory_e820(void)
break;
}
+ /* ACPI 3.0 added the extended flags support. If bit 0
+ in the extended flags is zero, we're supposed to simply
+ ignore the entry -- a backwards incompatible change! */
+ if (size > 20 && !(buf.ext_flags & 1))
+ continue;
+
+ *desc++ = buf.std;
count++;
- desc++;
} while (next && count < ARRAY_SIZE(boot_params.e820_map));
return boot_params.e820_entries = count;
diff --git a/drivers/gpu/drm/drm_crtc_helper.c b/drivers/gpu/drm/drm_crtc_helper.c
index 1c3a8c55714..a04639dc633 100644
--- a/drivers/gpu/drm/drm_crtc_helper.c
+++ b/drivers/gpu/drm/drm_crtc_helper.c
@@ -42,6 +42,26 @@ static struct drm_display_mode std_modes[] = {
DRM_MODE_FLAG_PHSYNC | DRM_MODE_FLAG_PVSYNC) },
};
+static void drm_mode_validate_flag(struct drm_connector *connector,
+ int flags)
+{
+ struct drm_display_mode *mode, *t;
+
+ if (flags == (DRM_MODE_FLAG_DBLSCAN | DRM_MODE_FLAG_INTERLACE))
+ return;
+
+ list_for_each_entry_safe(mode, t, &connector->modes, head) {
+ if ((mode->flags & DRM_MODE_FLAG_INTERLACE) &&
+ !(flags & DRM_MODE_FLAG_INTERLACE))
+ mode->status = MODE_NO_INTERLACE;
+ if ((mode->flags & DRM_MODE_FLAG_DBLSCAN) &&
+ !(flags & DRM_MODE_FLAG_DBLSCAN))
+ mode->status = MODE_NO_DBLESCAN;
+ }
+
+ return;
+}
+
/**
* drm_helper_probe_connector_modes - get complete set of display modes
* @dev: DRM device
@@ -72,6 +92,7 @@ int drm_helper_probe_single_connector_modes(struct drm_connector *connector,
struct drm_connector_helper_funcs *connector_funcs =
connector->helper_private;
int count = 0;
+ int mode_flags = 0;
DRM_DEBUG("%s\n", drm_get_connector_name(connector));
/* set all modes to the unverified state */
@@ -96,6 +117,13 @@ int drm_helper_probe_single_connector_modes(struct drm_connector *connector,
if (maxX && maxY)
drm_mode_validate_size(dev, &connector->modes, maxX,
maxY, 0);
+
+ if (connector->interlace_allowed)
+ mode_flags |= DRM_MODE_FLAG_INTERLACE;
+ if (connector->doublescan_allowed)
+ mode_flags |= DRM_MODE_FLAG_DBLSCAN;
+ drm_mode_validate_flag(connector, mode_flags);
+
list_for_each_entry_safe(mode, t, &connector->modes, head) {
if (mode->status == MODE_OK)
mode->status = connector_funcs->mode_valid(connector,
@@ -885,7 +913,6 @@ bool drm_helper_plugged_event(struct drm_device *dev)
/**
* drm_initial_config - setup a sane initial connector configuration
* @dev: DRM device
- * @can_grow: this configuration is growable
*
* LOCKING:
* Called at init time, must take mode config lock.
@@ -897,7 +924,7 @@ bool drm_helper_plugged_event(struct drm_device *dev)
* RETURNS:
* Zero if everything went ok, nonzero otherwise.
*/
-bool drm_helper_initial_config(struct drm_device *dev, bool can_grow)
+bool drm_helper_initial_config(struct drm_device *dev)
{
struct drm_connector *connector;
int count = 0;
diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c
index c67400067b8..ca9c6165671 100644
--- a/drivers/gpu/drm/drm_edid.c
+++ b/drivers/gpu/drm/drm_edid.c
@@ -125,10 +125,8 @@ static bool edid_is_valid(struct edid *edid)
DRM_ERROR("EDID has major version %d, instead of 1\n", edid->version);
goto bad;
}
- if (edid->revision > 3) {
- DRM_ERROR("EDID has minor version %d, which is not between 0-3\n", edid->revision);
- goto bad;
- }
+ if (edid->revision > 4)
+ DRM_DEBUG("EDID minor > 4, assuming backward compatibility\n");
for (i = 0; i < EDID_LENGTH; i++)
csum += raw_edid[i];
@@ -162,7 +160,7 @@ static bool edid_vendor(struct edid *edid, char *vendor)
edid_vendor[0] = ((edid->mfg_id[0] & 0x7c) >> 2) + '@';
edid_vendor[1] = (((edid->mfg_id[0] & 0x3) << 3) |
((edid->mfg_id[1] & 0xe0) >> 5)) + '@';
- edid_vendor[2] = (edid->mfg_id[2] & 0x1f) + '@';
+ edid_vendor[2] = (edid->mfg_id[1] & 0x1f) + '@';
return !strncmp(edid_vendor, vendor, 3);
}
diff --git a/drivers/gpu/drm/i915/i915_dma.c b/drivers/gpu/drm/i915/i915_dma.c
index 0b9984ffed1..c23b3a95b7c 100644
--- a/drivers/gpu/drm/i915/i915_dma.c
+++ b/drivers/gpu/drm/i915/i915_dma.c
@@ -1042,7 +1042,7 @@ static int i915_load_modeset_init(struct drm_device *dev)
intel_modeset_init(dev);
- drm_helper_initial_config(dev, false);
+ drm_helper_initial_config(dev);
return 0;
diff --git a/drivers/s390/net/qeth_core_offl.c b/drivers/s390/net/qeth_core_offl.c
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/drivers/s390/net/qeth_core_offl.c
+++ /dev/null
diff --git a/drivers/s390/net/qeth_core_offl.h b/drivers/s390/net/qeth_core_offl.h
deleted file mode 100644
index e69de29bb2d..00000000000
--- a/drivers/s390/net/qeth_core_offl.h
+++ /dev/null
diff --git a/drivers/serial/serial_core.c b/drivers/serial/serial_core.c
index bf3c0e32a33..b0bb29d804a 100644
--- a/drivers/serial/serial_core.c
+++ b/drivers/serial/serial_core.c
@@ -1765,7 +1765,7 @@ static void uart_line_info(struct seq_file *m, struct uart_driver *drv, int i)
static int uart_proc_show(struct seq_file *m, void *v)
{
- struct tty_driver *ttydrv = v;
+ struct tty_driver *ttydrv = m->private;
struct uart_driver *drv = ttydrv->driver_state;
int i;
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d2cf5a54a4b..9adf5e4f7e9 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -8,7 +8,7 @@ btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
ref-cache.o export.o tree-log.o acl.o free-space-cache.o zlib.o \
- compression.o
+ compression.o delayed-ref.o
else
# Normal Makefile
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 72677ce2b74..b30986f00b9 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -66,6 +66,12 @@ struct btrfs_inode {
*/
struct list_head delalloc_inodes;
+ /*
+ * list for tracking inodes that must be sent to disk before a
+ * rename or truncate commit
+ */
+ struct list_head ordered_operations;
+
/* the space_info for where this inode's data allocations are done */
struct btrfs_space_info *space_info;
@@ -86,12 +92,6 @@ struct btrfs_inode {
*/
u64 logged_trans;
- /*
- * trans that last made a change that should be fully fsync'd. This
- * gets reset to zero each time the inode is logged
- */
- u64 log_dirty_trans;
-
/* total number of bytes pending delalloc, used by stat to calc the
* real block usage of the file
*/
@@ -121,6 +121,25 @@ struct btrfs_inode {
/* the start of block group preferred for allocations. */
u64 block_group;
+ /* the fsync log has some corner cases that mean we have to check
+ * directories to see if any unlinks have been done before
+ * the directory was logged. See tree-log.c for all the
+ * details
+ */
+ u64 last_unlink_trans;
+
+ /*
+ * ordered_data_close is set by truncate when a file that used
+ * to have good data has been truncated to zero. When it is set
+ * the btrfs file release call will add this inode to the
+ * ordered operations list so that we make sure to flush out any
+ * new data the application may have written before commit.
+ *
+ * yes, its silly to have a single bitflag, but we might grow more
+ * of these.
+ */
+ unsigned ordered_data_close:1;
+
struct inode vfs_inode;
};
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 37f31b5529a..dbb72412463 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -254,18 +254,13 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
* empty_size -- a hint that you plan on doing more cow. This is the size in
* bytes the allocator should try to find free next to the block it returns.
* This is just a hint and may be ignored by the allocator.
- *
- * prealloc_dest -- if you have already reserved a destination for the cow,
- * this uses that block instead of allocating a new one.
- * btrfs_alloc_reserved_extent is used to finish the allocation.
*/
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
struct extent_buffer **cow_ret,
- u64 search_start, u64 empty_size,
- u64 prealloc_dest)
+ u64 search_start, u64 empty_size)
{
u64 parent_start;
struct extent_buffer *cow;
@@ -291,26 +286,10 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
level = btrfs_header_level(buf);
nritems = btrfs_header_nritems(buf);
- if (prealloc_dest) {
- struct btrfs_key ins;
-
- ins.objectid = prealloc_dest;
- ins.offset = buf->len;
- ins.type = BTRFS_EXTENT_ITEM_KEY;
-
- ret = btrfs_alloc_reserved_extent(trans, root, parent_start,
- root->root_key.objectid,
- trans->transid, level, &ins);
- BUG_ON(ret);
- cow = btrfs_init_new_buffer(trans, root, prealloc_dest,
- buf->len, level);
- } else {
- cow = btrfs_alloc_free_block(trans, root, buf->len,
- parent_start,
- root->root_key.objectid,
- trans->transid, level,
- search_start, empty_size);
- }
+ cow = btrfs_alloc_free_block(trans, root, buf->len,
+ parent_start, root->root_key.objectid,
+ trans->transid, level,
+ search_start, empty_size);
if (IS_ERR(cow))
return PTR_ERR(cow);
@@ -413,7 +392,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret, u64 prealloc_dest)
+ struct extent_buffer **cow_ret)
{
u64 search_start;
int ret;
@@ -436,7 +415,6 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_header_owner(buf) == root->root_key.objectid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
*cow_ret = buf;
- WARN_ON(prealloc_dest);
return 0;
}
@@ -447,8 +425,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
btrfs_set_lock_blocking(buf);
ret = __btrfs_cow_block(trans, root, buf, parent,
- parent_slot, cow_ret, search_start, 0,
- prealloc_dest);
+ parent_slot, cow_ret, search_start, 0);
return ret;
}
@@ -617,7 +594,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans,
err = __btrfs_cow_block(trans, root, cur, parent, i,
&cur, search_start,
min(16 * blocksize,
- (end_slot - i) * blocksize), 0);
+ (end_slot - i) * blocksize));
if (err) {
btrfs_tree_unlock(cur);
free_extent_buffer(cur);
@@ -937,7 +914,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BUG_ON(!child);
btrfs_tree_lock(child);
btrfs_set_lock_blocking(child);
- ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0);
+ ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
BUG_ON(ret);
spin_lock(&root->node_lock);
@@ -945,6 +922,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
spin_unlock(&root->node_lock);
ret = btrfs_update_extent_ref(trans, root, child->start,
+ child->len,
mid->start, child->start,
root->root_key.objectid,
trans->transid, level - 1);
@@ -971,6 +949,10 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
BTRFS_NODEPTRS_PER_BLOCK(root) / 4)
return 0;
+ if (trans->transaction->delayed_refs.flushing &&
+ btrfs_header_nritems(mid) > 2)
+ return 0;
+
if (btrfs_header_nritems(mid) < 2)
err_on_enospc = 1;
@@ -979,7 +961,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(left);
btrfs_set_lock_blocking(left);
wret = btrfs_cow_block(trans, root, left,
- parent, pslot - 1, &left, 0);
+ parent, pslot - 1, &left);
if (wret) {
ret = wret;
goto enospc;
@@ -990,7 +972,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
btrfs_tree_lock(right);
btrfs_set_lock_blocking(right);
wret = btrfs_cow_block(trans, root, right,
- parent, pslot + 1, &right, 0);
+ parent, pslot + 1, &right);
if (wret) {
ret = wret;
goto enospc;
@@ -1171,7 +1153,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
wret = 1;
} else {
ret = btrfs_cow_block(trans, root, left, parent,
- pslot - 1, &left, 0);
+ pslot - 1, &left);
if (ret)
wret = 1;
else {
@@ -1222,7 +1204,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
} else {
ret = btrfs_cow_block(trans, root, right,
parent, pslot + 1,
- &right, 0);
+ &right);
if (ret)
wret = 1;
else {
@@ -1492,7 +1474,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
u8 lowest_level = 0;
u64 blocknr;
u64 gen;
- struct btrfs_key prealloc_block;
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
@@ -1501,8 +1482,6 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root
if (ins_len < 0)
lowest_unlock = 2;
- prealloc_block.objectid = 0;
-
again:
if (p->skip_locking)
b = btrfs_root_node(root);
@@ -1529,44 +1508,11 @@ again:
!btrfs_header_flag(b, BTRFS_HEADER_FLAG_WRITTEN)) {
goto cow_done;
}
-
- /* ok, we have to cow, is our old prealloc the right
- * size?
- */
- if (prealloc_block.objectid &&
- prealloc_block.offset != b->len) {
- btrfs_release_path(root, p);
- btrfs_free_reserved_extent(root,
- prealloc_block.objectid,
- prealloc_block.offset);
- prealloc_block.objectid = 0;
- goto again;
- }
-
- /*
- * for higher level blocks, try not to allocate blocks
- * with the block and the parent locks held.
- */
- if (level > 0 && !prealloc_block.objectid) {
- u32 size = b->len;
- u64 hint = b->start;
-
- btrfs_release_path(root, p);
- ret = btrfs_reserve_extent(trans, root,
- size, size, 0,
- hint, (u64)-1,
- &prealloc_block, 0);
- BUG_ON(ret);
- goto again;
- }
-
btrfs_set_path_blocking(p);
wret = btrfs_cow_block(trans, root, b,
p->nodes[level + 1],
- p->slots[level + 1],
- &b, prealloc_block.objectid);
- prealloc_block.objectid = 0;
+ p->slots[level + 1], &b);
if (wret) {
free_extent_buffer(b);
ret = wret;
@@ -1742,12 +1688,8 @@ done:
* we don't really know what they plan on doing with the path
* from here on, so for now just mark it as blocking
*/
- btrfs_set_path_blocking(p);
- if (prealloc_block.objectid) {
- btrfs_free_reserved_extent(root,
- prealloc_block.objectid,
- prealloc_block.offset);
- }
+ if (!p->leave_spinning)
+ btrfs_set_path_blocking(p);
return ret;
}
@@ -1768,7 +1710,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
int ret;
eb = btrfs_lock_root_node(root);
- ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb, 0);
+ ret = btrfs_cow_block(trans, root, eb, NULL, 0, &eb);
BUG_ON(ret);
btrfs_set_lock_blocking(eb);
@@ -1826,7 +1768,7 @@ int btrfs_merge_path(struct btrfs_trans_handle *trans,
}
ret = btrfs_cow_block(trans, root, eb, parent, slot,
- &eb, 0);
+ &eb);
BUG_ON(ret);
if (root->root_key.objectid ==
@@ -2139,7 +2081,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
spin_unlock(&root->node_lock);
ret = btrfs_update_extent_ref(trans, root, lower->start,
- lower->start, c->start,
+ lower->len, lower->start, c->start,
root->root_key.objectid,
trans->transid, level - 1);
BUG_ON(ret);
@@ -2221,7 +2163,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
return ret;
- } else {
+ } else if (!trans->transaction->delayed_refs.flushing) {
ret = push_nodes_for_insert(trans, root, path, level);
c = path->nodes[level];
if (!ret && btrfs_header_nritems(c) <
@@ -2329,66 +2271,27 @@ noinline int btrfs_leaf_free_space(struct btrfs_root *root,
return ret;
}
-/*
- * push some data in the path leaf to the right, trying to free up at
- * least data_size bytes. returns zero if the push worked, nonzero otherwise
- *
- * returns 1 if the push failed because the other node didn't have enough
- * room, 0 if everything worked out and < 0 if there were major errors.
- */
-static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ int data_size, int empty,
+ struct extent_buffer *right,
+ int free_space, u32 left_nritems)
{
struct extent_buffer *left = path->nodes[0];
- struct extent_buffer *right;
- struct extent_buffer *upper;
+ struct extent_buffer *upper = path->nodes[1];
struct btrfs_disk_key disk_key;
int slot;
u32 i;
- int free_space;
int push_space = 0;
int push_items = 0;
struct btrfs_item *item;
- u32 left_nritems;
u32 nr;
u32 right_nritems;
u32 data_end;
u32 this_item_size;
int ret;
- slot = path->slots[1];
- if (!path->nodes[1])
- return 1;
-
- upper = path->nodes[1];
- if (slot >= btrfs_header_nritems(upper) - 1)
- return 1;
-
- btrfs_assert_tree_locked(path->nodes[1]);
-
- right = read_node_slot(root, upper, slot + 1);
- btrfs_tree_lock(right);
- btrfs_set_lock_blocking(right);
-
- free_space = btrfs_leaf_free_space(root, right);
- if (free_space < data_size)
- goto out_unlock;
-
- /* cow and double check */
- ret = btrfs_cow_block(trans, root, right, upper,
- slot + 1, &right, 0);
- if (ret)
- goto out_unlock;
-
- free_space = btrfs_leaf_free_space(root, right);
- if (free_space < data_size)
- goto out_unlock;
-
- left_nritems = btrfs_header_nritems(left);
- if (left_nritems == 0)
- goto out_unlock;
-
if (empty)
nr = 0;
else
@@ -2397,6 +2300,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
if (path->slots[0] >= left_nritems)
push_space += data_size;
+ slot = path->slots[1];
i = left_nritems - 1;
while (i >= nr) {
item = btrfs_item_nr(left, i);
@@ -2528,24 +2432,82 @@ out_unlock:
}
/*
+ * push some data in the path leaf to the right, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ *
+ * returns 1 if the push failed because the other node didn't have enough
+ * room, 0 if everything worked out and < 0 if there were major errors.
+ */
+static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *left = path->nodes[0];
+ struct extent_buffer *right;
+ struct extent_buffer *upper;
+ int slot;
+ int free_space;
+ u32 left_nritems;
+ int ret;
+
+ if (!path->nodes[1])
+ return 1;
+
+ slot = path->slots[1];
+ upper = path->nodes[1];
+ if (slot >= btrfs_header_nritems(upper) - 1)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ right = read_node_slot(root, upper, slot + 1);
+ btrfs_tree_lock(right);
+ btrfs_set_lock_blocking(right);
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, right, upper,
+ slot + 1, &right);
+ if (ret)
+ goto out_unlock;
+
+ free_space = btrfs_leaf_free_space(root, right);
+ if (free_space < data_size)
+ goto out_unlock;
+
+ left_nritems = btrfs_header_nritems(left);
+ if (left_nritems == 0)
+ goto out_unlock;
+
+ return __push_leaf_right(trans, root, path, data_size, empty,
+ right, free_space, left_nritems);
+out_unlock:
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ return 1;
+}
+
+/*
* push some data in the path leaf to the left, trying to free up at
* least data_size bytes. returns zero if the push worked, nonzero otherwise
*/
-static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
- *root, struct btrfs_path *path, int data_size,
- int empty)
+static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path, int data_size,
+ int empty, struct extent_buffer *left,
+ int free_space, int right_nritems)
{
struct btrfs_disk_key disk_key;
struct extent_buffer *right = path->nodes[0];
- struct extent_buffer *left;
int slot;
int i;
- int free_space;
int push_space = 0;
int push_items = 0;
struct btrfs_item *item;
u32 old_left_nritems;
- u32 right_nritems;
u32 nr;
int ret = 0;
int wret;
@@ -2553,41 +2515,6 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
u32 old_left_item_size;
slot = path->slots[1];
- if (slot == 0)
- return 1;
- if (!path->nodes[1])
- return 1;
-
- right_nritems = btrfs_header_nritems(right);
- if (right_nritems == 0)
- return 1;
-
- btrfs_assert_tree_locked(path->nodes[1]);
-
- left = read_node_slot(root, path->nodes[1], slot - 1);
- btrfs_tree_lock(left);
- btrfs_set_lock_blocking(left);
-
- free_space = btrfs_leaf_free_space(root, left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
-
- /* cow and double check */
- ret = btrfs_cow_block(trans, root, left,
- path->nodes[1], slot - 1, &left, 0);
- if (ret) {
- /* we hit -ENOSPC, but it isn't fatal here */
- ret = 1;
- goto out;
- }
-
- free_space = btrfs_leaf_free_space(root, left);
- if (free_space < data_size) {
- ret = 1;
- goto out;
- }
if (empty)
nr = right_nritems;
@@ -2755,6 +2682,154 @@ out:
}
/*
+ * push some data in the path leaf to the left, trying to free up at
+ * least data_size bytes. returns zero if the push worked, nonzero otherwise
+ */
+static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
+ *root, struct btrfs_path *path, int data_size,
+ int empty)
+{
+ struct extent_buffer *right = path->nodes[0];
+ struct extent_buffer *left;
+ int slot;
+ int free_space;
+ u32 right_nritems;
+ int ret = 0;
+
+ slot = path->slots[1];
+ if (slot == 0)
+ return 1;
+ if (!path->nodes[1])
+ return 1;
+
+ right_nritems = btrfs_header_nritems(right);
+ if (right_nritems == 0)
+ return 1;
+
+ btrfs_assert_tree_locked(path->nodes[1]);
+
+ left = read_node_slot(root, path->nodes[1], slot - 1);
+ btrfs_tree_lock(left);
+ btrfs_set_lock_blocking(left);
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ /* cow and double check */
+ ret = btrfs_cow_block(trans, root, left,
+ path->nodes[1], slot - 1, &left);
+ if (ret) {
+ /* we hit -ENOSPC, but it isn't fatal here */
+ ret = 1;
+ goto out;
+ }
+
+ free_space = btrfs_leaf_free_space(root, left);
+ if (free_space < data_size) {
+ ret = 1;
+ goto out;
+ }
+
+ return __push_leaf_left(trans, root, path, data_size,
+ empty, left, free_space, right_nritems);
+out:
+ btrfs_tree_unlock(left);
+ free_extent_buffer(left);
+ return ret;
+}
+
+/*
+ * split the path's leaf in two, making sure there is at least data_size
+ * available for the resulting leaf level of the path.
+ *
+ * returns 0 if all went well and < 0 on failure.
+ */
+static noinline int copy_for_split(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct extent_buffer *l,
+ struct extent_buffer *right,
+ int slot, int mid, int nritems)
+{
+ int data_copy_size;
+ int rt_data_off;
+ int i;
+ int ret = 0;
+ int wret;
+ struct btrfs_disk_key disk_key;
+
+ nritems = nritems - mid;
+ btrfs_set_header_nritems(right, nritems);
+ data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
+
+ copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
+ btrfs_item_nr_offset(mid),
+ nritems * sizeof(struct btrfs_item));
+
+ copy_extent_buffer(right, l,
+ btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
+ data_copy_size, btrfs_leaf_data(l) +
+ leaf_data_end(root, l), data_copy_size);
+
+ rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
+ btrfs_item_end_nr(l, mid);
+
+ for (i = 0; i < nritems; i++) {
+ struct btrfs_item *item = btrfs_item_nr(right, i);
+ u32 ioff;
+
+ if (!right->map_token) {
+ map_extent_buffer(right, (unsigned long)item,
+ sizeof(struct btrfs_item),
+ &right->map_token, &right->kaddr,
+ &right->map_start, &right->map_len,
+ KM_USER1);
+ }
+
+ ioff = btrfs_item_offset(right, item);
+ btrfs_set_item_offset(right, item, ioff + rt_data_off);
+ }
+
+ if (right->map_token) {
+ unmap_extent_buffer(right, right->map_token, KM_USER1);
+ right->map_token = NULL;
+ }
+
+ btrfs_set_header_nritems(l, mid);
+ ret = 0;
+ btrfs_item_key(right, &disk_key, 0);
+ wret = insert_ptr(trans, root, path, &disk_key, right->start,
+ path->slots[1] + 1, 1);
+ if (wret)
+ ret = wret;
+
+ btrfs_mark_buffer_dirty(right);
+ btrfs_mark_buffer_dirty(l);
+ BUG_ON(path->slots[0] != slot);
+
+ ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+ BUG_ON(ret);
+
+ if (mid <= slot) {
+ btrfs_tree_unlock(path->nodes[0]);
+ free_extent_buffer(path->nodes[0]);
+ path->nodes[0] = right;
+ path->slots[0] -= mid;
+ path->slots[1] += 1;
+ } else {
+ btrfs_tree_unlock(right);
+ free_extent_buffer(right);
+ }
+
+ BUG_ON(path->slots[0] < 0);
+
+ return ret;
+}
+
+/*
* split the path's leaf in two, making sure there is at least data_size
* available for the resulting leaf level of the path.
*
@@ -2771,17 +2846,14 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans,
int mid;
int slot;
struct extent_buffer *right;
- int data_copy_size;
- int rt_data_off;
- int i;
int ret = 0;
int wret;
int double_split;
int num_doubles = 0;
- struct btrfs_disk_key disk_key;
/* first try to make some room by pushing left and right */
- if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY) {
+ if (data_size && ins_key->type != BTRFS_DIR_ITEM_KEY &&
+ !trans->transaction->delayed_refs.flushing) {
wret = push_leaf_right(trans, root, path, data_size, 0);
if (wret < 0)
return wret;
@@ -2830,11 +2902,14 @@ again:
write_extent_buffer(right, root->fs_info->chunk_tree_uuid,
(unsigned long)btrfs_header_chunk_tree_uuid(right),
BTRFS_UUID_SIZE);
+
if (mid <= slot) {
if (nritems == 1 ||
leaf_space_used(l, mid, nritems - mid) + data_size >
BTRFS_LEAF_DATA_SIZE(root)) {
if (slot >= nritems) {
+ struct btrfs_disk_key disk_key;
+
btrfs_cpu_key_to_disk(&disk_key, ins_key);
btrfs_set_header_nritems(right, 0);
wret = insert_ptr(trans, root, path,
@@ -2862,6 +2937,8 @@ again:
if (leaf_space_used(l, 0, mid) + data_size >
BTRFS_LEAF_DATA_SIZE(root)) {
if (!extend && data_size && slot == 0) {
+ struct btrfs_disk_key disk_key;
+
btrfs_cpu_key_to_disk(&disk_key, ins_key);
btrfs_set_header_nritems(right, 0);
wret = insert_ptr(trans, root, path,
@@ -2894,76 +2971,16 @@ again:
}
}
}
- nritems = nritems - mid;
- btrfs_set_header_nritems(right, nritems);
- data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(root, l);
-
- copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
- btrfs_item_nr_offset(mid),
- nritems * sizeof(struct btrfs_item));
-
- copy_extent_buffer(right, l,
- btrfs_leaf_data(right) + BTRFS_LEAF_DATA_SIZE(root) -
- data_copy_size, btrfs_leaf_data(l) +
- leaf_data_end(root, l), data_copy_size);
-
- rt_data_off = BTRFS_LEAF_DATA_SIZE(root) -
- btrfs_item_end_nr(l, mid);
-
- for (i = 0; i < nritems; i++) {
- struct btrfs_item *item = btrfs_item_nr(right, i);
- u32 ioff;
-
- if (!right->map_token) {
- map_extent_buffer(right, (unsigned long)item,
- sizeof(struct btrfs_item),
- &right->map_token, &right->kaddr,
- &right->map_start, &right->map_len,
- KM_USER1);
- }
-
- ioff = btrfs_item_offset(right, item);
- btrfs_set_item_offset(right, item, ioff + rt_data_off);
- }
-
- if (right->map_token) {
- unmap_extent_buffer(right, right->map_token, KM_USER1);
- right->map_token = NULL;
- }
-
- btrfs_set_header_nritems(l, mid);
- ret = 0;
- btrfs_item_key(right, &disk_key, 0);
- wret = insert_ptr(trans, root, path, &disk_key, right->start,
- path->slots[1] + 1, 1);
- if (wret)
- ret = wret;
-
- btrfs_mark_buffer_dirty(right);
- btrfs_mark_buffer_dirty(l);
- BUG_ON(path->slots[0] != slot);
- ret = btrfs_update_ref(trans, root, l, right, 0, nritems);
+ ret = copy_for_split(trans, root, path, l, right, slot, mid, nritems);
BUG_ON(ret);
- if (mid <= slot) {
- btrfs_tree_unlock(path->nodes[0]);
- free_extent_buffer(path->nodes[0]);
- path->nodes[0] = right;
- path->slots[0] -= mid;
- path->slots[1] += 1;
- } else {
- btrfs_tree_unlock(right);
- free_extent_buffer(right);
- }
-
- BUG_ON(path->slots[0] < 0);
-
if (double_split) {
BUG_ON(num_doubles != 0);
num_doubles++;
goto again;
}
+
return ret;
}
@@ -3021,26 +3038,27 @@ int btrfs_split_item(struct btrfs_trans_handle *trans,
return -EAGAIN;
}
+ btrfs_set_path_blocking(path);
ret = split_leaf(trans, root, &orig_key, path,
sizeof(struct btrfs_item), 1);
path->keep_locks = 0;
BUG_ON(ret);
+ btrfs_unlock_up_safe(path, 1);
+ leaf = path->nodes[0];
+ BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
+
+split:
/*
* make sure any changes to the path from split_leaf leave it
* in a blocking state
*/
btrfs_set_path_blocking(path);
- leaf = path->nodes[0];
- BUG_ON(btrfs_leaf_free_space(root, leaf) < sizeof(struct btrfs_item));
-
-split:
item = btrfs_item_nr(leaf, path->slots[0]);
orig_offset = btrfs_item_offset(leaf, item);
item_size = btrfs_item_size(leaf, item);
-
buf = kmalloc(item_size, GFP_NOFS);
read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
path->slots[0]), item_size);
@@ -3445,39 +3463,27 @@ out:
}
/*
- * Given a key and some data, insert items into the tree.
- * This does all the path init required, making room in the tree if needed.
+ * this is a helper for btrfs_insert_empty_items, the main goal here is
+ * to save stack depth by doing the bulk of the work in a function
+ * that doesn't call btrfs_search_slot
*/
-int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- struct btrfs_path *path,
- struct btrfs_key *cpu_key, u32 *data_size,
- int nr)
+static noinline_for_stack int
+setup_items_for_insert(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ u32 total_data, u32 total_size, int nr)
{
- struct extent_buffer *leaf;
struct btrfs_item *item;
- int ret = 0;
- int slot;
- int slot_orig;
int i;
u32 nritems;
- u32 total_size = 0;
- u32 total_data = 0;
unsigned int data_end;
struct btrfs_disk_key disk_key;
+ int ret;
+ struct extent_buffer *leaf;
+ int slot;
- for (i = 0; i < nr; i++)
- total_data += data_size[i];
-
- total_size = total_data + (nr * sizeof(struct btrfs_item));
- ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
- if (ret == 0)
- return -EEXIST;
- if (ret < 0)
- goto out;
-
- slot_orig = path->slots[0];
leaf = path->nodes[0];
+ slot = path->slots[0];
nritems = btrfs_header_nritems(leaf);
data_end = leaf_data_end(root, leaf);
@@ -3489,9 +3495,6 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
BUG();
}
- slot = path->slots[0];
- BUG_ON(slot < 0);
-
if (slot != nritems) {
unsigned int old_data = btrfs_item_end_nr(leaf, slot);
@@ -3547,21 +3550,60 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
data_end -= data_size[i];
btrfs_set_item_size(leaf, item, data_size[i]);
}
+
btrfs_set_header_nritems(leaf, nritems + nr);
- btrfs_mark_buffer_dirty(leaf);
ret = 0;
if (slot == 0) {
+ struct btrfs_disk_key disk_key;
btrfs_cpu_key_to_disk(&disk_key, cpu_key);
ret = fixup_low_keys(trans, root, path, &disk_key, 1);
}
+ btrfs_unlock_up_safe(path, 1);
+ btrfs_mark_buffer_dirty(leaf);
if (btrfs_leaf_free_space(root, leaf) < 0) {
btrfs_print_leaf(root, leaf);
BUG();
}
+ return ret;
+}
+
+/*
+ * Given a key and some data, insert items into the tree.
+ * This does all the path init required, making room in the tree if needed.
+ */
+int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_key *cpu_key, u32 *data_size,
+ int nr)
+{
+ struct extent_buffer *leaf;
+ int ret = 0;
+ int slot;
+ int i;
+ u32 total_size = 0;
+ u32 total_data = 0;
+
+ for (i = 0; i < nr; i++)
+ total_data += data_size[i];
+
+ total_size = total_data + (nr * sizeof(struct btrfs_item));
+ ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
+ if (ret == 0)
+ return -EEXIST;
+ if (ret < 0)
+ goto out;
+
+ leaf = path->nodes[0];
+ slot = path->slots[0];
+ BUG_ON(slot < 0);
+
+ ret = setup_items_for_insert(trans, root, path, cpu_key, data_size,
+ total_data, total_size, nr);
+
out:
- btrfs_unlock_up_safe(path, 1);
return ret;
}
@@ -3749,7 +3791,8 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
}
/* delete the leaf if it is mostly empty */
- if (used < BTRFS_LEAF_DATA_SIZE(root) / 4) {
+ if (used < BTRFS_LEAF_DATA_SIZE(root) / 4 &&
+ !trans->transaction->delayed_refs.flushing) {
/* push_leaf_left fixes the path.
* make sure the path still points to our leaf
* for possible call to del_ptr below
@@ -3757,6 +3800,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
slot = path->slots[1];
extent_buffer_get(leaf);
+ btrfs_set_path_blocking(path);
wret = push_leaf_left(trans, root, path, 1, 1);
if (wret < 0 && wret != -ENOSPC)
ret = wret;
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 7dd1b6d0bf3..9417713542a 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -45,6 +45,13 @@ struct btrfs_ordered_sum;
#define BTRFS_MAX_LEVEL 8
+/*
+ * files bigger than this get some pre-flushing when they are added
+ * to the ordered operations list. That way we limit the total
+ * work done by the commit
+ */
+#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024)
+
/* holds pointers to all of the tree roots */
#define BTRFS_ROOT_TREE_OBJECTID 1ULL
@@ -401,15 +408,16 @@ struct btrfs_path {
int locks[BTRFS_MAX_LEVEL];
int reada;
/* keep some upper locks as we walk down */
- int keep_locks;
- int skip_locking;
int lowest_level;
/*
* set by btrfs_split_item, tells search_slot to keep all locks
* and to force calls to keep space in the nodes
*/
- int search_for_split;
+ unsigned int search_for_split:1;
+ unsigned int keep_locks:1;
+ unsigned int skip_locking:1;
+ unsigned int leave_spinning:1;
};
/*
@@ -688,15 +696,18 @@ struct btrfs_fs_info {
struct rb_root block_group_cache_tree;
struct extent_io_tree pinned_extents;
- struct extent_io_tree pending_del;
- struct extent_io_tree extent_ins;
/* logical->physical extent mapping */
struct btrfs_mapping_tree mapping_tree;
u64 generation;
u64 last_trans_committed;
- u64 last_trans_new_blockgroup;
+
+ /*
+ * this is updated to the current trans every time a full commit
+ * is required instead of the faster short fsync log commits
+ */
+ u64 last_trans_log_full_commit;
u64 open_ioctl_trans;
unsigned long mount_opt;
u64 max_extent;
@@ -717,12 +728,21 @@ struct btrfs_fs_info {
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
- struct mutex extent_ins_mutex;
struct mutex pinned_mutex;
struct mutex chunk_mutex;
struct mutex drop_mutex;
struct mutex volume_mutex;
struct mutex tree_reloc_mutex;
+
+ /*
+ * this protects the ordered operations list only while we are
+ * processing all of the entries on it. This way we make
+ * sure the commit code doesn't find the list temporarily empty
+ * because another function happens to be doing non-waiting preflush
+ * before jumping into the main commit.
+ */
+ struct mutex ordered_operations_mutex;
+
struct list_head trans_list;
struct list_head hashers;
struct list_head dead_roots;
@@ -737,10 +757,29 @@ struct btrfs_fs_info {
* ordered extents
*/
spinlock_t ordered_extent_lock;
+
+ /*
+ * all of the data=ordered extents pending writeback
+ * these can span multiple transactions and basically include
+ * every dirty data page that isn't from nodatacow
+ */
struct list_head ordered_extents;
+
+ /*
+ * all of the inodes that have delalloc bytes. It is possible for
+ * this list to be empty even when there is still dirty data=ordered
+ * extents waiting to finish IO.
+ */
struct list_head delalloc_inodes;
/*
+ * special rename and truncate targets that must be on disk before
+ * we're allowed to commit. This is basically the ext3 style
+ * data=ordered list.
+ */
+ struct list_head ordered_operations;
+
+ /*
* there is a pool of worker threads for checksumming during writes
* and a pool for checksumming after reads. This is because readers
* can run with FS locks held, and the writers may be waiting for
@@ -781,6 +820,11 @@ struct btrfs_fs_info {
atomic_t throttle_gen;
u64 total_pinned;
+
+ /* protected by the delalloc lock, used to keep from writing
+ * metadata until there is a nice batch
+ */
+ u64 dirty_metadata_bytes;
struct list_head dirty_cowonly_roots;
struct btrfs_fs_devices *fs_devices;
@@ -1704,18 +1748,15 @@ static inline struct dentry *fdentry(struct file *file)
}
/* extent-tree.c */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count);
int btrfs_lookup_extent(struct btrfs_root *root, u64 start, u64 len);
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u32 *refs);
int btrfs_update_pinned_extents(struct btrfs_root *root,
u64 bytenr, u64 num, int pin);
int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *leaf);
int btrfs_cross_ref_exist(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid, u64 bytenr);
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
- struct btrfs_root *root);
int btrfs_copy_pinned(struct btrfs_root *root, struct extent_io_tree *copy);
struct btrfs_block_group_cache *btrfs_lookup_block_group(
struct btrfs_fs_info *info,
@@ -1777,7 +1818,7 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 ref_generation,
u64 owner_objectid);
int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
+ struct btrfs_root *root, u64 bytenr, u64 num_bytes,
u64 orig_parent, u64 parent,
u64 root_objectid, u64 ref_generation,
u64 owner_objectid);
@@ -1838,7 +1879,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
int btrfs_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *parent, int parent_slot,
- struct extent_buffer **cow_ret, u64 prealloc_dest);
+ struct extent_buffer **cow_ret);
int btrfs_copy_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf,
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
new file mode 100644
index 00000000000..cbf7dc8ae3e
--- /dev/null
+++ b/fs/btrfs/delayed-ref.c
@@ -0,0 +1,669 @@
+/*
+ * Copyright (C) 2009 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/sched.h>
+#include <linux/sort.h>
+#include <linux/ftrace.h>
+#include "ctree.h"
+#include "delayed-ref.h"
+#include "transaction.h"
+
+/*
+ * delayed back reference update tracking. For subvolume trees
+ * we queue up extent allocations and backref maintenance for
+ * delayed processing. This avoids deep call chains where we
+ * add extents in the middle of btrfs_search_slot, and it allows
+ * us to buffer up frequently modified backrefs in an rb tree instead
+ * of hammering updates on the extent allocation tree.
+ *
+ * Right now this code is only used for reference counted trees, but
+ * the long term goal is to get rid of the similar code for delayed
+ * extent tree modifications.
+ */
+
+/*
+ * entries in the rb tree are ordered by the byte number of the extent
+ * and by the byte number of the parent block.
+ */
+static int comp_entry(struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 parent)
+{
+ if (bytenr < ref->bytenr)
+ return -1;
+ if (bytenr > ref->bytenr)
+ return 1;
+ if (parent < ref->parent)
+ return -1;
+ if (parent > ref->parent)
+ return 1;
+ return 0;
+}
+
+/*
+ * insert a new ref into the rbtree. This returns any existing refs
+ * for the same (bytenr,parent) tuple, or NULL if the new node was properly
+ * inserted.
+ */
+static struct btrfs_delayed_ref_node *tree_insert(struct rb_root *root,
+ u64 bytenr, u64 parent,
+ struct rb_node *node)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent_node = NULL;
+ struct btrfs_delayed_ref_node *entry;
+ int cmp;
+
+ while (*p) {
+ parent_node = *p;
+ entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
+ rb_node);
+
+ cmp = comp_entry(entry, bytenr, parent);
+ if (cmp < 0)
+ p = &(*p)->rb_left;
+ else if (cmp > 0)
+ p = &(*p)->rb_right;
+ else
+ return entry;
+ }
+
+ entry = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ rb_link_node(node, parent_node, p);
+ rb_insert_color(node, root);
+ return NULL;
+}
+
+/*
+ * find an entry based on (bytenr,parent). This returns the delayed
+ * ref if it was able to find one, or NULL if nothing was in that spot
+ */
+static struct btrfs_delayed_ref_node *tree_search(struct rb_root *root,
+ u64 bytenr, u64 parent,
+ struct btrfs_delayed_ref_node **last)
+{
+ struct rb_node *n = root->rb_node;
+ struct btrfs_delayed_ref_node *entry;
+ int cmp;
+
+ while (n) {
+ entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
+ WARN_ON(!entry->in_tree);
+ if (last)
+ *last = entry;
+
+ cmp = comp_entry(entry, bytenr, parent);
+ if (cmp < 0)
+ n = n->rb_left;
+ else if (cmp > 0)
+ n = n->rb_right;
+ else
+ return entry;
+ }
+ return NULL;
+}
+
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ assert_spin_locked(&delayed_refs->lock);
+ if (mutex_trylock(&head->mutex))
+ return 0;
+
+ atomic_inc(&head->node.refs);
+ spin_unlock(&delayed_refs->lock);
+
+ mutex_lock(&head->mutex);
+ spin_lock(&delayed_refs->lock);
+ if (!head->node.in_tree) {
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(&head->node);
+ return -EAGAIN;
+ }
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+}
+
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 start)
+{
+ int count = 0;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *head;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ if (start == 0) {
+ node = rb_first(&delayed_refs->root);
+ } else {
+ ref = NULL;
+ tree_search(&delayed_refs->root, start, (u64)-1, &ref);
+ if (ref) {
+ struct btrfs_delayed_ref_node *tmp;
+
+ node = rb_prev(&ref->rb_node);
+ while (node) {
+ tmp = rb_entry(node,
+ struct btrfs_delayed_ref_node,
+ rb_node);
+ if (tmp->bytenr < start)
+ break;
+ ref = tmp;
+ node = rb_prev(&ref->rb_node);
+ }
+ node = &ref->rb_node;
+ } else
+ node = rb_first(&delayed_refs->root);
+ }
+again:
+ while (node && count < 32) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head = btrfs_delayed_node_to_head(ref);
+ if (list_empty(&head->cluster)) {
+ list_add_tail(&head->cluster, cluster);
+ delayed_refs->run_delayed_start =
+ head->node.bytenr;
+ count++;
+
+ WARN_ON(delayed_refs->num_heads_ready == 0);
+ delayed_refs->num_heads_ready--;
+ } else if (count) {
+ /* the goal of the clustering is to find extents
+ * that are likely to end up in the same extent
+ * leaf on disk. So, we don't want them spread
+ * all over the tree. Stop now if we've hit
+ * a head that was already in use
+ */
+ break;
+ }
+ }
+ node = rb_next(node);
+ }
+ if (count) {
+ return 0;
+ } else if (start) {
+ /*
+ * we've gone to the end of the rbtree without finding any
+ * clusters. start from the beginning and try again
+ */
+ start = 0;
+ node = rb_first(&delayed_refs->root);
+ goto again;
+ }
+ return 1;
+}
+
+/*
+ * This checks to see if there are any delayed refs in the
+ * btree for a given bytenr. It returns one if it finds any
+ * and zero otherwise.
+ *
+ * If it only finds a head node, it returns 0.
+ *
+ * The idea is to use this when deciding if you can safely delete an
+ * extent from the extent allocation tree. There may be a pending
+ * ref in the rbtree that adds or removes references, so as long as this
+ * returns one you need to leave the BTRFS_EXTENT_ITEM in the extent
+ * allocation tree.
+ */
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct rb_node *prev_node;
+ int ret = 0;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+ if (ref) {
+ prev_node = rb_prev(&ref->rb_node);
+ if (!prev_node)
+ goto out;
+ ref = rb_entry(prev_node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr == bytenr)
+ ret = 1;
+ }
+out:
+ spin_unlock(&delayed_refs->lock);
+ return ret;
+}
+
+/*
+ * helper function to lookup reference count
+ *
+ * the head node for delayed ref is used to store the sum of all the
+ * reference count modifications queued up in the rbtree. This way you
+ * can check to see what the reference count would be if all of the
+ * delayed refs are processed.
+ */
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u32 *refs)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_path *path;
+ struct extent_buffer *leaf;
+ struct btrfs_extent_item *ei;
+ struct btrfs_key key;
+ u32 num_refs;
+ int ret;
+
+ path = btrfs_alloc_path();
+ if (!path)
+ return -ENOMEM;
+
+ key.objectid = bytenr;
+ key.type = BTRFS_EXTENT_ITEM_KEY;
+ key.offset = num_bytes;
+ delayed_refs = &trans->transaction->delayed_refs;
+again:
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root,
+ &key, path, 0, 0);
+ if (ret < 0)
+ goto out;
+
+ if (ret == 0) {
+ leaf = path->nodes[0];
+ ei = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_extent_item);
+ num_refs = btrfs_extent_refs(leaf, ei);
+ } else {
+ num_refs = 0;
+ ret = 0;
+ }
+
+ spin_lock(&delayed_refs->lock);
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+ if (ref) {
+ head = btrfs_delayed_node_to_head(ref);
+ if (mutex_trylock(&head->mutex)) {
+ num_refs += ref->ref_mod;
+ mutex_unlock(&head->mutex);
+ *refs = num_refs;
+ goto out;
+ }
+
+ atomic_inc(&ref->refs);
+ spin_unlock(&delayed_refs->lock);
+
+ btrfs_release_path(root->fs_info->extent_root, path);
+
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+ btrfs_put_delayed_ref(ref);
+ goto again;
+ } else {
+ *refs = num_refs;
+ }
+out:
+ spin_unlock(&delayed_refs->lock);
+ btrfs_free_path(path);
+ return ret;
+}
+
+/*
+ * helper function to update an extent delayed ref in the
+ * rbtree. existing and update must both have the same
+ * bytenr and parent
+ *
+ * This may free existing if the update cancels out whatever
+ * operation it was doing.
+ */
+static noinline void
+update_existing_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_root *delayed_refs,
+ struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ struct btrfs_delayed_ref *existing_ref;
+ struct btrfs_delayed_ref *ref;
+
+ existing_ref = btrfs_delayed_node_to_ref(existing);
+ ref = btrfs_delayed_node_to_ref(update);
+
+ if (ref->pin)
+ existing_ref->pin = 1;
+
+ if (ref->action != existing_ref->action) {
+ /*
+ * this is effectively undoing either an add or a
+ * drop. We decrement the ref_mod, and if it goes
+ * down to zero we just delete the entry without
+ * every changing the extent allocation tree.
+ */
+ existing->ref_mod--;
+ if (existing->ref_mod == 0) {
+ rb_erase(&existing->rb_node,
+ &delayed_refs->root);
+ existing->in_tree = 0;
+ btrfs_put_delayed_ref(existing);
+ delayed_refs->num_entries--;
+ if (trans->delayed_ref_updates)
+ trans->delayed_ref_updates--;
+ }
+ } else {
+ if (existing_ref->action == BTRFS_ADD_DELAYED_REF) {
+ /* if we're adding refs, make sure all the
+ * details match up. The extent could
+ * have been totally freed and reallocated
+ * by a different owner before the delayed
+ * ref entries were removed.
+ */
+ existing_ref->owner_objectid = ref->owner_objectid;
+ existing_ref->generation = ref->generation;
+ existing_ref->root = ref->root;
+ existing->num_bytes = update->num_bytes;
+ }
+ /*
+ * the action on the existing ref matches
+ * the action on the ref we're trying to add.
+ * Bump the ref_mod by one so the backref that
+ * is eventually added/removed has the correct
+ * reference count
+ */
+ existing->ref_mod += update->ref_mod;
+ }
+}
+
+/*
+ * helper function to update the accounting in the head ref
+ * existing and update must have the same bytenr
+ */
+static noinline void
+update_existing_head_ref(struct btrfs_delayed_ref_node *existing,
+ struct btrfs_delayed_ref_node *update)
+{
+ struct btrfs_delayed_ref_head *existing_ref;
+ struct btrfs_delayed_ref_head *ref;
+
+ existing_ref = btrfs_delayed_node_to_head(existing);
+ ref = btrfs_delayed_node_to_head(update);
+
+ if (ref->must_insert_reserved) {
+ /* if the extent was freed and then
+ * reallocated before the delayed ref
+ * entries were processed, we can end up
+ * with an existing head ref without
+ * the must_insert_reserved flag set.
+ * Set it again here
+ */
+ existing_ref->must_insert_reserved = ref->must_insert_reserved;
+
+ /*
+ * update the num_bytes so we make sure the accounting
+ * is done correctly
+ */
+ existing->num_bytes = update->num_bytes;
+
+ }
+
+ /*
+ * update the reference mod on the head to reflect this new operation
+ */
+ existing->ref_mod += update->ref_mod;
+}
+
+/*
+ * helper function to actually insert a delayed ref into the rbtree.
+ * this does all the dirty work in terms of maintaining the correct
+ * overall modification count in the head node and properly dealing
+ * with updating existing nodes as new modifications are queued.
+ */
+static noinline int __btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_node *ref,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid, int action,
+ int pin)
+{
+ struct btrfs_delayed_ref_node *existing;
+ struct btrfs_delayed_ref *full_ref;
+ struct btrfs_delayed_ref_head *head_ref = NULL;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int count_mod = 1;
+ int must_insert_reserved = 0;
+
+ /*
+ * the head node stores the sum of all the mods, so dropping a ref
+ * should drop the sum in the head node by one.
+ */
+ if (parent == (u64)-1) {
+ if (action == BTRFS_DROP_DELAYED_REF)
+ count_mod = -1;
+ else if (action == BTRFS_UPDATE_DELAYED_HEAD)
+ count_mod = 0;
+ }
+
+ /*
+ * BTRFS_ADD_DELAYED_EXTENT means that we need to update
+ * the reserved accounting when the extent is finally added, or
+ * if a later modification deletes the delayed ref without ever
+ * inserting the extent into the extent allocation tree.
+ * ref->must_insert_reserved is the flag used to record
+ * that accounting mods are required.
+ *
+ * Once we record must_insert_reserved, switch the action to
+ * BTRFS_ADD_DELAYED_REF because other special casing is not required.
+ */
+ if (action == BTRFS_ADD_DELAYED_EXTENT) {
+ must_insert_reserved = 1;
+ action = BTRFS_ADD_DELAYED_REF;
+ } else {
+ must_insert_reserved = 0;
+ }
+
+
+ delayed_refs = &trans->transaction->delayed_refs;
+
+ /* first set the basic ref node struct up */
+ atomic_set(&ref->refs, 1);
+ ref->bytenr = bytenr;
+ ref->parent = parent;
+ ref->ref_mod = count_mod;
+ ref->in_tree = 1;
+ ref->num_bytes = num_bytes;
+
+ if (btrfs_delayed_ref_is_head(ref)) {
+ head_ref = btrfs_delayed_node_to_head(ref);
+ head_ref->must_insert_reserved = must_insert_reserved;
+ INIT_LIST_HEAD(&head_ref->cluster);
+ mutex_init(&head_ref->mutex);
+ } else {
+ full_ref = btrfs_delayed_node_to_ref(ref);
+ full_ref->root = ref_root;
+ full_ref->generation = ref_generation;
+ full_ref->owner_objectid = owner_objectid;
+ full_ref->pin = pin;
+ full_ref->action = action;
+ }
+
+ existing = tree_insert(&delayed_refs->root, bytenr,
+ parent, &ref->rb_node);
+
+ if (existing) {
+ if (btrfs_delayed_ref_is_head(ref))
+ update_existing_head_ref(existing, ref);
+ else
+ update_existing_ref(trans, delayed_refs, existing, ref);
+
+ /*
+ * we've updated the existing ref, free the newly
+ * allocated ref
+ */
+ kfree(ref);
+ } else {
+ if (btrfs_delayed_ref_is_head(ref)) {
+ delayed_refs->num_heads++;
+ delayed_refs->num_heads_ready++;
+ }
+ delayed_refs->num_entries++;
+ trans->delayed_ref_updates++;
+ }
+ return 0;
+}
+
+/*
+ * add a delayed ref to the tree. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ */
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid, int action,
+ int pin)
+{
+ struct btrfs_delayed_ref *ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ /*
+ * the parent = 0 case comes from cases where we don't actually
+ * know the parent yet. It will get updated later via a add/drop
+ * pair.
+ */
+ if (parent == 0)
+ parent = bytenr;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref) {
+ kfree(ref);
+ return -ENOMEM;
+ }
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+ (u64)-1, 0, 0, 0, action, pin);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+ parent, ref_root, ref_generation,
+ owner_objectid, action, pin);
+ BUG_ON(ret);
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
+
+/*
+ * this does a simple search for the head node for a given extent.
+ * It must be called with the delayed ref spinlock held, and it returns
+ * the head node if any where found, or NULL if not.
+ */
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr)
+{
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ ref = tree_search(&delayed_refs->root, bytenr, (u64)-1, NULL);
+ if (ref)
+ return btrfs_delayed_node_to_head(ref);
+ return NULL;
+}
+
+/*
+ * add a delayed ref to the tree. This does all of the accounting required
+ * to make sure the delayed ref is eventually processed before this
+ * transaction commits.
+ *
+ * The main point of this call is to add and remove a backreference in a single
+ * shot, taking the lock only once, and only searching for the head node once.
+ *
+ * It is the same as doing a ref add and delete in two separate calls.
+ */
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 orig_parent,
+ u64 parent, u64 orig_ref_root, u64 ref_root,
+ u64 orig_ref_generation, u64 ref_generation,
+ u64 owner_objectid, int pin)
+{
+ struct btrfs_delayed_ref *ref;
+ struct btrfs_delayed_ref *old_ref;
+ struct btrfs_delayed_ref_head *head_ref;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ int ret;
+
+ ref = kmalloc(sizeof(*ref), GFP_NOFS);
+ if (!ref)
+ return -ENOMEM;
+
+ old_ref = kmalloc(sizeof(*old_ref), GFP_NOFS);
+ if (!old_ref) {
+ kfree(ref);
+ return -ENOMEM;
+ }
+
+ /*
+ * the parent = 0 case comes from cases where we don't actually
+ * know the parent yet. It will get updated later via a add/drop
+ * pair.
+ */
+ if (parent == 0)
+ parent = bytenr;
+ if (orig_parent == 0)
+ orig_parent = bytenr;
+
+ head_ref = kmalloc(sizeof(*head_ref), GFP_NOFS);
+ if (!head_ref) {
+ kfree(ref);
+ kfree(old_ref);
+ return -ENOMEM;
+ }
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+
+ /*
+ * insert both the head node and the new ref without dropping
+ * the spin lock
+ */
+ ret = __btrfs_add_delayed_ref(trans, &head_ref->node, bytenr, num_bytes,
+ (u64)-1, 0, 0, 0,
+ BTRFS_UPDATE_DELAYED_HEAD, 0);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &ref->node, bytenr, num_bytes,
+ parent, ref_root, ref_generation,
+ owner_objectid, BTRFS_ADD_DELAYED_REF, 0);
+ BUG_ON(ret);
+
+ ret = __btrfs_add_delayed_ref(trans, &old_ref->node, bytenr, num_bytes,
+ orig_parent, orig_ref_root,
+ orig_ref_generation, owner_objectid,
+ BTRFS_DROP_DELAYED_REF, pin);
+ BUG_ON(ret);
+ spin_unlock(&delayed_refs->lock);
+ return 0;
+}
diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h
new file mode 100644
index 00000000000..3bec2ff0b15
--- /dev/null
+++ b/fs/btrfs/delayed-ref.h
@@ -0,0 +1,193 @@
+/*
+ * Copyright (C) 2008 Oracle. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#ifndef __DELAYED_REF__
+#define __DELAYED_REF__
+
+/* these are the possible values of struct btrfs_delayed_ref->action */
+#define BTRFS_ADD_DELAYED_REF 1 /* add one backref to the tree */
+#define BTRFS_DROP_DELAYED_REF 2 /* delete one backref from the tree */
+#define BTRFS_ADD_DELAYED_EXTENT 3 /* record a full extent allocation */
+#define BTRFS_UPDATE_DELAYED_HEAD 4 /* not changing ref count on head ref */
+
+struct btrfs_delayed_ref_node {
+ struct rb_node rb_node;
+
+ /* the starting bytenr of the extent */
+ u64 bytenr;
+
+ /* the parent our backref will point to */
+ u64 parent;
+
+ /* the size of the extent */
+ u64 num_bytes;
+
+ /* ref count on this data structure */
+ atomic_t refs;
+
+ /*
+ * how many refs is this entry adding or deleting. For
+ * head refs, this may be a negative number because it is keeping
+ * track of the total mods done to the reference count.
+ * For individual refs, this will always be a positive number
+ *
+ * It may be more than one, since it is possible for a single
+ * parent to have more than one ref on an extent
+ */
+ int ref_mod;
+
+ /* is this node still in the rbtree? */
+ unsigned int in_tree:1;
+};
+
+/*
+ * the head refs are used to hold a lock on a given extent, which allows us
+ * to make sure that only one process is running the delayed refs
+ * at a time for a single extent. They also store the sum of all the
+ * reference count modifications we've queued up.
+ */
+struct btrfs_delayed_ref_head {
+ struct btrfs_delayed_ref_node node;
+
+ /*
+ * the mutex is held while running the refs, and it is also
+ * held when checking the sum of reference modifications.
+ */
+ struct mutex mutex;
+
+ struct list_head cluster;
+
+ /*
+ * when a new extent is allocated, it is just reserved in memory
+ * The actual extent isn't inserted into the extent allocation tree
+ * until the delayed ref is processed. must_insert_reserved is
+ * used to flag a delayed ref so the accounting can be updated
+ * when a full insert is done.
+ *
+ * It is possible the extent will be freed before it is ever
+ * inserted into the extent allocation tree. In this case
+ * we need to update the in ram accounting to properly reflect
+ * the free has happened.
+ */
+ unsigned int must_insert_reserved:1;
+};
+
+struct btrfs_delayed_ref {
+ struct btrfs_delayed_ref_node node;
+
+ /* the root objectid our ref will point to */
+ u64 root;
+
+ /* the generation for the backref */
+ u64 generation;
+
+ /* owner_objectid of the backref */
+ u64 owner_objectid;
+
+ /* operation done by this entry in the rbtree */
+ u8 action;
+
+ /* if pin == 1, when the extent is freed it will be pinned until
+ * transaction commit
+ */
+ unsigned int pin:1;
+};
+
+struct btrfs_delayed_ref_root {
+ struct rb_root root;
+
+ /* this spin lock protects the rbtree and the entries inside */
+ spinlock_t lock;
+
+ /* how many delayed ref updates we've queued, used by the
+ * throttling code
+ */
+ unsigned long num_entries;
+
+ /* total number of head nodes in tree */
+ unsigned long num_heads;
+
+ /* total number of head nodes ready for processing */
+ unsigned long num_heads_ready;
+
+ /*
+ * set when the tree is flushing before a transaction commit,
+ * used by the throttling code to decide if new updates need
+ * to be run right away
+ */
+ int flushing;
+
+ u64 run_delayed_start;
+};
+
+static inline void btrfs_put_delayed_ref(struct btrfs_delayed_ref_node *ref)
+{
+ WARN_ON(atomic_read(&ref->refs) == 0);
+ if (atomic_dec_and_test(&ref->refs)) {
+ WARN_ON(ref->in_tree);
+ kfree(ref);
+ }
+}
+
+int btrfs_add_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid, int action,
+ int pin);
+
+struct btrfs_delayed_ref_head *
+btrfs_find_delayed_ref_head(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_delayed_ref_pending(struct btrfs_trans_handle *trans, u64 bytenr);
+int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u32 *refs);
+int btrfs_update_delayed_ref(struct btrfs_trans_handle *trans,
+ u64 bytenr, u64 num_bytes, u64 orig_parent,
+ u64 parent, u64 orig_ref_root, u64 ref_root,
+ u64 orig_ref_generation, u64 ref_generation,
+ u64 owner_objectid, int pin);
+int btrfs_delayed_ref_lock(struct btrfs_trans_handle *trans,
+ struct btrfs_delayed_ref_head *head);
+int btrfs_find_ref_cluster(struct btrfs_trans_handle *trans,
+ struct list_head *cluster, u64 search_start);
+/*
+ * a node might live in a head or a regular ref, this lets you
+ * test for the proper type to use.
+ */
+static int btrfs_delayed_ref_is_head(struct btrfs_delayed_ref_node *node)
+{
+ return node->parent == (u64)-1;
+}
+
+/*
+ * helper functions to cast a node into its container
+ */
+static inline struct btrfs_delayed_ref *
+btrfs_delayed_node_to_ref(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_ref, node);
+
+}
+
+static inline struct btrfs_delayed_ref_head *
+btrfs_delayed_node_to_head(struct btrfs_delayed_ref_node *node)
+{
+ WARN_ON(!btrfs_delayed_ref_is_head(node));
+ return container_of(node, struct btrfs_delayed_ref_head, node);
+
+}
+#endif
diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c
index 926a0b287a7..1d70236ba00 100644
--- a/fs/btrfs/dir-item.c
+++ b/fs/btrfs/dir-item.c
@@ -145,7 +145,10 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root
key.objectid = dir;
btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY);
key.offset = btrfs_name_hash(name, name_len);
+
path = btrfs_alloc_path();
+ path->leave_spinning = 1;
+
data_size = sizeof(*dir_item) + name_len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 6ec80c0fc86..92d73929d38 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -668,14 +668,31 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio,
static int btree_writepage(struct page *page, struct writeback_control *wbc)
{
struct extent_io_tree *tree;
+ struct btrfs_root *root = BTRFS_I(page->mapping->host)->root;
+ struct extent_buffer *eb;
+ int was_dirty;
+
tree = &BTRFS_I(page->mapping->host)->io_tree;
+ if (!(current->flags & PF_MEMALLOC)) {
+ return extent_write_full_page(tree, page,
+ btree_get_extent, wbc);
+ }
- if (current->flags & PF_MEMALLOC) {
- redirty_page_for_writepage(wbc, page);
- unlock_page(page);
- return 0;
+ redirty_page_for_writepage(wbc, page);
+ eb = btrfs_find_tree_block(root, page_offset(page),
+ PAGE_CACHE_SIZE);
+ WARN_ON(!eb);
+
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
+ if (!was_dirty) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ root->fs_info->dirty_metadata_bytes += PAGE_CACHE_SIZE;
+ spin_unlock(&root->fs_info->delalloc_lock);
}
- return extent_write_full_page(tree, page, btree_get_extent, wbc);
+ free_extent_buffer(eb);
+
+ unlock_page(page);
+ return 0;
}
static int btree_writepages(struct address_space *mapping,
@@ -684,15 +701,15 @@ static int btree_writepages(struct address_space *mapping,
struct extent_io_tree *tree;
tree = &BTRFS_I(mapping->host)->io_tree;
if (wbc->sync_mode == WB_SYNC_NONE) {
+ struct btrfs_root *root = BTRFS_I(mapping->host)->root;
u64 num_dirty;
- u64 start = 0;
unsigned long thresh = 32 * 1024 * 1024;
if (wbc->for_kupdate)
return 0;
- num_dirty = count_range_bits(tree, &start, (u64)-1,
- thresh, EXTENT_DIRTY);
+ /* this is a bit racy, but that's ok */
+ num_dirty = root->fs_info->dirty_metadata_bytes;
if (num_dirty < thresh)
return 0;
}
@@ -859,9 +876,17 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root,
root->fs_info->running_transaction->transid) {
btrfs_assert_tree_locked(buf);
- /* ugh, clear_extent_buffer_dirty can be expensive */
- btrfs_set_lock_blocking(buf);
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (root->fs_info->dirty_metadata_bytes >= buf->len)
+ root->fs_info->dirty_metadata_bytes -= buf->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+ /* ugh, clear_extent_buffer_dirty needs to lock the page */
+ btrfs_set_lock_blocking(buf);
clear_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
buf);
}
@@ -1471,12 +1496,6 @@ static int transaction_kthread(void *arg)
vfs_check_frozen(root->fs_info->sb, SB_FREEZE_WRITE);
mutex_lock(&root->fs_info->transaction_kthread_mutex);
- if (root->fs_info->total_ref_cache_size > 20 * 1024 * 1024) {
- printk(KERN_INFO "btrfs: total reference cache "
- "size %llu\n",
- root->fs_info->total_ref_cache_size);
- }
-
mutex_lock(&root->fs_info->trans_mutex);
cur = root->fs_info->running_transaction;
if (!cur) {
@@ -1493,6 +1512,7 @@ static int transaction_kthread(void *arg)
mutex_unlock(&root->fs_info->trans_mutex);
trans = btrfs_start_transaction(root, 1);
ret = btrfs_commit_transaction(trans, root);
+
sleep:
wake_up_process(root->fs_info->cleaner_kthread);
mutex_unlock(&root->fs_info->transaction_kthread_mutex);
@@ -1552,6 +1572,7 @@ struct btrfs_root *open_ctree(struct super_block *sb,
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->hashers);
INIT_LIST_HEAD(&fs_info->delalloc_inodes);
+ INIT_LIST_HEAD(&fs_info->ordered_operations);
spin_lock_init(&fs_info->delalloc_lock);
spin_lock_init(&fs_info->new_trans_lock);
spin_lock_init(&fs_info->ref_cache_lock);
@@ -1611,10 +1632,6 @@ struct btrfs_root *open_ctree(struct super_block *sb,
extent_io_tree_init(&fs_info->pinned_extents,
fs_info->btree_inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(&fs_info->pending_del,
- fs_info->btree_inode->i_mapping, GFP_NOFS);
- extent_io_tree_init(&fs_info->extent_ins,
- fs_info->btree_inode->i_mapping, GFP_NOFS);
fs_info->do_barriers = 1;
INIT_LIST_HEAD(&fs_info->dead_reloc_roots);
@@ -1627,9 +1644,9 @@ struct btrfs_root *open_ctree(struct super_block *sb,
insert_inode_hash(fs_info->btree_inode);
mutex_init(&fs_info->trans_mutex);
+ mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
mutex_init(&fs_info->drop_mutex);
- mutex_init(&fs_info->extent_ins_mutex);
mutex_init(&fs_info->pinned_mutex);
mutex_init(&fs_info->chunk_mutex);
mutex_init(&fs_info->transaction_kthread_mutex);
@@ -2358,8 +2375,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
struct btrfs_root *root = BTRFS_I(buf->first_page->mapping->host)->root;
u64 transid = btrfs_header_generation(buf);
struct inode *btree_inode = root->fs_info->btree_inode;
-
- btrfs_set_lock_blocking(buf);
+ int was_dirty;
btrfs_assert_tree_locked(buf);
if (transid != root->fs_info->generation) {
@@ -2370,7 +2386,13 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
(unsigned long long)root->fs_info->generation);
WARN_ON(1);
}
- set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree, buf);
+ was_dirty = set_extent_buffer_dirty(&BTRFS_I(btree_inode)->io_tree,
+ buf);
+ if (!was_dirty) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ root->fs_info->dirty_metadata_bytes += buf->len;
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
}
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr)
@@ -2410,6 +2432,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid)
int btree_lock_page_hook(struct page *page)
{
struct inode *inode = page->mapping->host;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
struct extent_buffer *eb;
unsigned long len;
@@ -2425,6 +2448,16 @@ int btree_lock_page_hook(struct page *page)
btrfs_tree_lock(eb);
btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
+
+ if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ spin_lock(&root->fs_info->delalloc_lock);
+ if (root->fs_info->dirty_metadata_bytes >= eb->len)
+ root->fs_info->dirty_metadata_bytes -= eb->len;
+ else
+ WARN_ON(1);
+ spin_unlock(&root->fs_info->delalloc_lock);
+ }
+
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
out:
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 95029db227b..c958ecbc191 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -72,6 +72,7 @@ int btrfs_insert_dev_radix(struct btrfs_root *root,
void btrfs_btree_balance_dirty(struct btrfs_root *root, unsigned long nr);
int btrfs_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
+void btrfs_mark_buffer_dirty_nonblocking(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid);
int btrfs_set_buffer_uptodate(struct extent_buffer *buf);
int wait_on_tree_block_writeback(struct btrfs_root *root,
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index fefe83ad205..f5e7cae63d8 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -49,17 +49,23 @@ struct pending_extent_op {
int del;
};
-static int finish_current_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all);
-static int del_pending_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all);
-static int pin_down_bytes(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int is_data);
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner, struct btrfs_key *ins,
+ int ref_mod);
+static int update_reserved_extents(struct btrfs_root *root,
+ u64 bytenr, u64 num, int reserve);
static int update_block_group(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, int alloc,
int mark_free);
+static noinline int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin,
+ int ref_to_drop);
static int do_chunk_alloc(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root, u64 alloc_bytes,
@@ -554,262 +560,13 @@ out:
return ret;
}
-/*
- * updates all the backrefs that are pending on update_list for the
- * extent_root
- */
-static noinline int update_backrefs(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct list_head *update_list)
-{
- struct btrfs_key key;
- struct btrfs_extent_ref *ref;
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct pending_extent_op *op;
- struct extent_buffer *leaf;
- int ret = 0;
- struct list_head *cur = update_list->next;
- u64 ref_objectid;
- u64 ref_root = extent_root->root_key.objectid;
-
- op = list_entry(cur, struct pending_extent_op, list);
-
-search:
- key.objectid = op->bytenr;
- key.type = BTRFS_EXTENT_REF_KEY;
- key.offset = op->orig_parent;
-
- ret = btrfs_search_slot(trans, extent_root, &key, path, 0, 1);
- BUG_ON(ret);
-
- leaf = path->nodes[0];
-
-loop:
- ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
-
- ref_objectid = btrfs_ref_objectid(leaf, ref);
-
- if (btrfs_ref_root(leaf, ref) != ref_root ||
- btrfs_ref_generation(leaf, ref) != op->orig_generation ||
- (ref_objectid != op->level &&
- ref_objectid != BTRFS_MULTIPLE_OBJECTIDS)) {
- printk(KERN_ERR "btrfs couldn't find %llu, parent %llu, "
- "root %llu, owner %u\n",
- (unsigned long long)op->bytenr,
- (unsigned long long)op->orig_parent,
- (unsigned long long)ref_root, op->level);
- btrfs_print_leaf(extent_root, leaf);
- BUG();
- }
-
- key.objectid = op->bytenr;
- key.offset = op->parent;
- key.type = BTRFS_EXTENT_REF_KEY;
- ret = btrfs_set_item_key_safe(trans, extent_root, path, &key);
- BUG_ON(ret);
- ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
- btrfs_set_ref_generation(leaf, ref, op->generation);
-
- cur = cur->next;
-
- list_del_init(&op->list);
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1, GFP_NOFS);
- kfree(op);
-
- if (cur == update_list) {
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(extent_root, path);
- goto out;
- }
-
- op = list_entry(cur, struct pending_extent_op, list);
-
- path->slots[0]++;
- while (path->slots[0] < btrfs_header_nritems(leaf)) {
- btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
- if (key.objectid == op->bytenr &&
- key.type == BTRFS_EXTENT_REF_KEY)
- goto loop;
- path->slots[0]++;
- }
-
- btrfs_mark_buffer_dirty(path->nodes[0]);
- btrfs_release_path(extent_root, path);
- goto search;
-
-out:
- return 0;
-}
-
-static noinline int insert_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct btrfs_path *path,
- struct list_head *insert_list, int nr)
-{
- struct btrfs_key *keys;
- u32 *data_size;
- struct pending_extent_op *op;
- struct extent_buffer *leaf;
- struct list_head *cur = insert_list->next;
- struct btrfs_fs_info *info = extent_root->fs_info;
- u64 ref_root = extent_root->root_key.objectid;
- int i = 0, last = 0, ret;
- int total = nr * 2;
-
- if (!nr)
- return 0;
-
- keys = kzalloc(total * sizeof(struct btrfs_key), GFP_NOFS);
- if (!keys)
- return -ENOMEM;
-
- data_size = kzalloc(total * sizeof(u32), GFP_NOFS);
- if (!data_size) {
- kfree(keys);
- return -ENOMEM;
- }
-
- list_for_each_entry(op, insert_list, list) {
- keys[i].objectid = op->bytenr;
- keys[i].offset = op->num_bytes;
- keys[i].type = BTRFS_EXTENT_ITEM_KEY;
- data_size[i] = sizeof(struct btrfs_extent_item);
- i++;
-
- keys[i].objectid = op->bytenr;
- keys[i].offset = op->parent;
- keys[i].type = BTRFS_EXTENT_REF_KEY;
- data_size[i] = sizeof(struct btrfs_extent_ref);
- i++;
- }
-
- op = list_entry(cur, struct pending_extent_op, list);
- i = 0;
- while (i < total) {
- int c;
- ret = btrfs_insert_some_items(trans, extent_root, path,
- keys+i, data_size+i, total-i);
- BUG_ON(ret < 0);
-
- if (last && ret > 1)
- BUG();
-
- leaf = path->nodes[0];
- for (c = 0; c < ret; c++) {
- int ref_first = keys[i].type == BTRFS_EXTENT_REF_KEY;
-
- /*
- * if the first item we inserted was a backref, then
- * the EXTENT_ITEM will be the odd c's, else it will
- * be the even c's
- */
- if ((ref_first && (c % 2)) ||
- (!ref_first && !(c % 2))) {
- struct btrfs_extent_item *itm;
-
- itm = btrfs_item_ptr(leaf, path->slots[0] + c,
- struct btrfs_extent_item);
- btrfs_set_extent_refs(path->nodes[0], itm, 1);
- op->del++;
- } else {
- struct btrfs_extent_ref *ref;
-
- ref = btrfs_item_ptr(leaf, path->slots[0] + c,
- struct btrfs_extent_ref);
- btrfs_set_ref_root(leaf, ref, ref_root);
- btrfs_set_ref_generation(leaf, ref,
- op->generation);
- btrfs_set_ref_objectid(leaf, ref, op->level);
- btrfs_set_ref_num_refs(leaf, ref, 1);
- op->del++;
- }
-
- /*
- * using del to see when its ok to free up the
- * pending_extent_op. In the case where we insert the
- * last item on the list in order to help do batching
- * we need to not free the extent op until we actually
- * insert the extent_item
- */
- if (op->del == 2) {
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1,
- GFP_NOFS);
- cur = cur->next;
- list_del_init(&op->list);
- kfree(op);
- if (cur != insert_list)
- op = list_entry(cur,
- struct pending_extent_op,
- list);
- }
- }
- btrfs_mark_buffer_dirty(leaf);
- btrfs_release_path(extent_root, path);
-
- /*
- * Ok backref's and items usually go right next to eachother,
- * but if we could only insert 1 item that means that we
- * inserted on the end of a leaf, and we have no idea what may
- * be on the next leaf so we just play it safe. In order to
- * try and help this case we insert the last thing on our
- * insert list so hopefully it will end up being the last
- * thing on the leaf and everything else will be before it,
- * which will let us insert a whole bunch of items at the same
- * time.
- */
- if (ret == 1 && !last && (i + ret < total)) {
- /*
- * last: where we will pick up the next time around
- * i: our current key to insert, will be total - 1
- * cur: the current op we are screwing with
- * op: duh
- */
- last = i + ret;
- i = total - 1;
- cur = insert_list->prev;
- op = list_entry(cur, struct pending_extent_op, list);
- } else if (last) {
- /*
- * ok we successfully inserted the last item on the
- * list, lets reset everything
- *
- * i: our current key to insert, so where we left off
- * last time
- * last: done with this
- * cur: the op we are messing with
- * op: duh
- * total: since we inserted the last key, we need to
- * decrement total so we dont overflow
- */
- i = last;
- last = 0;
- total--;
- if (i < total) {
- cur = insert_list->next;
- op = list_entry(cur, struct pending_extent_op,
- list);
- }
- } else {
- i += ret;
- }
-
- cond_resched();
- }
- ret = 0;
- kfree(keys);
- kfree(data_size);
- return ret;
-}
-
static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
u64 bytenr, u64 parent,
u64 ref_root, u64 ref_generation,
- u64 owner_objectid)
+ u64 owner_objectid,
+ int refs_to_add)
{
struct btrfs_key key;
struct extent_buffer *leaf;
@@ -829,9 +586,10 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
btrfs_set_ref_root(leaf, ref, ref_root);
btrfs_set_ref_generation(leaf, ref, ref_generation);
btrfs_set_ref_objectid(leaf, ref, owner_objectid);
- btrfs_set_ref_num_refs(leaf, ref, 1);
+ btrfs_set_ref_num_refs(leaf, ref, refs_to_add);
} else if (ret == -EEXIST) {
u64 existing_owner;
+
BUG_ON(owner_objectid < BTRFS_FIRST_FREE_OBJECTID);
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
@@ -845,7 +603,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
num_refs = btrfs_ref_num_refs(leaf, ref);
BUG_ON(num_refs == 0);
- btrfs_set_ref_num_refs(leaf, ref, num_refs + 1);
+ btrfs_set_ref_num_refs(leaf, ref, num_refs + refs_to_add);
existing_owner = btrfs_ref_objectid(leaf, ref);
if (existing_owner != owner_objectid &&
@@ -857,6 +615,7 @@ static noinline int insert_extent_backref(struct btrfs_trans_handle *trans,
} else {
goto out;
}
+ btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
btrfs_release_path(root, path);
@@ -865,7 +624,8 @@ out:
static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_path *path)
+ struct btrfs_path *path,
+ int refs_to_drop)
{
struct extent_buffer *leaf;
struct btrfs_extent_ref *ref;
@@ -875,8 +635,8 @@ static noinline int remove_extent_backref(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_ref);
num_refs = btrfs_ref_num_refs(leaf, ref);
- BUG_ON(num_refs == 0);
- num_refs -= 1;
+ BUG_ON(num_refs < refs_to_drop);
+ num_refs -= refs_to_drop;
if (num_refs == 0) {
ret = btrfs_del_item(trans, root, path);
} else {
@@ -927,332 +687,28 @@ static int btrfs_discard_extent(struct btrfs_root *root, u64 bytenr,
#endif
}
-static noinline int free_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root,
- struct list_head *del_list)
-{
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_path *path;
- struct btrfs_key key, found_key;
- struct extent_buffer *leaf;
- struct list_head *cur;
- struct pending_extent_op *op;
- struct btrfs_extent_item *ei;
- int ret, num_to_del, extent_slot = 0, found_extent = 0;
- u32 refs;
- u64 bytes_freed = 0;
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- path->reada = 1;
-
-search:
- /* search for the backref for the current ref we want to delete */
- cur = del_list->next;
- op = list_entry(cur, struct pending_extent_op, list);
- ret = lookup_extent_backref(trans, extent_root, path, op->bytenr,
- op->orig_parent,
- extent_root->root_key.objectid,
- op->orig_generation, op->level, 1);
- if (ret) {
- printk(KERN_ERR "btrfs unable to find backref byte nr %llu "
- "root %llu gen %llu owner %u\n",
- (unsigned long long)op->bytenr,
- (unsigned long long)extent_root->root_key.objectid,
- (unsigned long long)op->orig_generation, op->level);
- btrfs_print_leaf(extent_root, path->nodes[0]);
- WARN_ON(1);
- goto out;
- }
-
- extent_slot = path->slots[0];
- num_to_del = 1;
- found_extent = 0;
-
- /*
- * if we aren't the first item on the leaf we can move back one and see
- * if our ref is right next to our extent item
- */
- if (likely(extent_slot)) {
- extent_slot--;
- btrfs_item_key_to_cpu(path->nodes[0], &found_key,
- extent_slot);
- if (found_key.objectid == op->bytenr &&
- found_key.type == BTRFS_EXTENT_ITEM_KEY &&
- found_key.offset == op->num_bytes) {
- num_to_del++;
- found_extent = 1;
- }
- }
-
- /*
- * if we didn't find the extent we need to delete the backref and then
- * search for the extent item key so we can update its ref count
- */
- if (!found_extent) {
- key.objectid = op->bytenr;
- key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = op->num_bytes;
-
- ret = remove_extent_backref(trans, extent_root, path);
- BUG_ON(ret);
- btrfs_release_path(extent_root, path);
- ret = btrfs_search_slot(trans, extent_root, &key, path, -1, 1);
- BUG_ON(ret);
- extent_slot = path->slots[0];
- }
-
- /* this is where we update the ref count for the extent */
- leaf = path->nodes[0];
- ei = btrfs_item_ptr(leaf, extent_slot, struct btrfs_extent_item);
- refs = btrfs_extent_refs(leaf, ei);
- BUG_ON(refs == 0);
- refs--;
- btrfs_set_extent_refs(leaf, ei, refs);
-
- btrfs_mark_buffer_dirty(leaf);
-
- /*
- * This extent needs deleting. The reason cur_slot is extent_slot +
- * num_to_del is because extent_slot points to the slot where the extent
- * is, and if the backref was not right next to the extent we will be
- * deleting at least 1 item, and will want to start searching at the
- * slot directly next to extent_slot. However if we did find the
- * backref next to the extent item them we will be deleting at least 2
- * items and will want to start searching directly after the ref slot
- */
- if (!refs) {
- struct list_head *pos, *n, *end;
- int cur_slot = extent_slot+num_to_del;
- u64 super_used;
- u64 root_used;
-
- path->slots[0] = extent_slot;
- bytes_freed = op->num_bytes;
-
- mutex_lock(&info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root, op->bytenr,
- op->num_bytes, op->level >=
- BTRFS_FIRST_FREE_OBJECTID);
- mutex_unlock(&info->pinned_mutex);
- BUG_ON(ret < 0);
- op->del = ret;
-
- /*
- * we need to see if we can delete multiple things at once, so
- * start looping through the list of extents we are wanting to
- * delete and see if their extent/backref's are right next to
- * eachother and the extents only have 1 ref
- */
- for (pos = cur->next; pos != del_list; pos = pos->next) {
- struct pending_extent_op *tmp;
-
- tmp = list_entry(pos, struct pending_extent_op, list);
-
- /* we only want to delete extent+ref at this stage */
- if (cur_slot >= btrfs_header_nritems(leaf) - 1)
- break;
-
- btrfs_item_key_to_cpu(leaf, &found_key, cur_slot);
- if (found_key.objectid != tmp->bytenr ||
- found_key.type != BTRFS_EXTENT_ITEM_KEY ||
- found_key.offset != tmp->num_bytes)
- break;
-
- /* check to make sure this extent only has one ref */
- ei = btrfs_item_ptr(leaf, cur_slot,
- struct btrfs_extent_item);
- if (btrfs_extent_refs(leaf, ei) != 1)
- break;
-
- btrfs_item_key_to_cpu(leaf, &found_key, cur_slot+1);
- if (found_key.objectid != tmp->bytenr ||
- found_key.type != BTRFS_EXTENT_REF_KEY ||
- found_key.offset != tmp->orig_parent)
- break;
-
- /*
- * the ref is right next to the extent, we can set the
- * ref count to 0 since we will delete them both now
- */
- btrfs_set_extent_refs(leaf, ei, 0);
-
- /* pin down the bytes for this extent */
- mutex_lock(&info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root, tmp->bytenr,
- tmp->num_bytes, tmp->level >=
- BTRFS_FIRST_FREE_OBJECTID);
- mutex_unlock(&info->pinned_mutex);
- BUG_ON(ret < 0);
-
- /*
- * use the del field to tell if we need to go ahead and
- * free up the extent when we delete the item or not.
- */
- tmp->del = ret;
- bytes_freed += tmp->num_bytes;
-
- num_to_del += 2;
- cur_slot += 2;
- }
- end = pos;
-
- /* update the free space counters */
- spin_lock(&info->delalloc_lock);
- super_used = btrfs_super_bytes_used(&info->super_copy);
- btrfs_set_super_bytes_used(&info->super_copy,
- super_used - bytes_freed);
-
- root_used = btrfs_root_used(&extent_root->root_item);
- btrfs_set_root_used(&extent_root->root_item,
- root_used - bytes_freed);
- spin_unlock(&info->delalloc_lock);
-
- /* delete the items */
- ret = btrfs_del_items(trans, extent_root, path,
- path->slots[0], num_to_del);
- BUG_ON(ret);
-
- /*
- * loop through the extents we deleted and do the cleanup work
- * on them
- */
- for (pos = cur, n = pos->next; pos != end;
- pos = n, n = pos->next) {
- struct pending_extent_op *tmp;
- tmp = list_entry(pos, struct pending_extent_op, list);
-
- /*
- * remember tmp->del tells us wether or not we pinned
- * down the extent
- */
- ret = update_block_group(trans, extent_root,
- tmp->bytenr, tmp->num_bytes, 0,
- tmp->del);
- BUG_ON(ret);
-
- list_del_init(&tmp->list);
- unlock_extent(&info->extent_ins, tmp->bytenr,
- tmp->bytenr + tmp->num_bytes - 1,
- GFP_NOFS);
- kfree(tmp);
- }
- } else if (refs && found_extent) {
- /*
- * the ref and extent were right next to eachother, but the
- * extent still has a ref, so just free the backref and keep
- * going
- */
- ret = remove_extent_backref(trans, extent_root, path);
- BUG_ON(ret);
-
- list_del_init(&op->list);
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1, GFP_NOFS);
- kfree(op);
- } else {
- /*
- * the extent has multiple refs and the backref we were looking
- * for was not right next to it, so just unlock and go next,
- * we're good to go
- */
- list_del_init(&op->list);
- unlock_extent(&info->extent_ins, op->bytenr,
- op->bytenr + op->num_bytes - 1, GFP_NOFS);
- kfree(op);
- }
-
- btrfs_release_path(extent_root, path);
- if (!list_empty(del_list))
- goto search;
-
-out:
- btrfs_free_path(path);
- return ret;
-}
-
static int __btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes,
u64 orig_parent, u64 parent,
u64 orig_root, u64 ref_root,
u64 orig_generation, u64 ref_generation,
u64 owner_objectid)
{
int ret;
- struct btrfs_root *extent_root = root->fs_info->extent_root;
- struct btrfs_path *path;
-
- if (root == root->fs_info->extent_root) {
- struct pending_extent_op *extent_op;
- u64 num_bytes;
-
- BUG_ON(owner_objectid >= BTRFS_MAX_LEVEL);
- num_bytes = btrfs_level_size(root, (int)owner_objectid);
- mutex_lock(&root->fs_info->extent_ins_mutex);
- if (test_range_bit(&root->fs_info->extent_ins, bytenr,
- bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
- u64 priv;
- ret = get_state_private(&root->fs_info->extent_ins,
- bytenr, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)
- (unsigned long)priv;
- BUG_ON(extent_op->parent != orig_parent);
- BUG_ON(extent_op->generation != orig_generation);
+ int pin = owner_objectid < BTRFS_FIRST_FREE_OBJECTID;
- extent_op->parent = parent;
- extent_op->generation = ref_generation;
- } else {
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
-
- extent_op->type = PENDING_BACKREF_UPDATE;
- extent_op->bytenr = bytenr;
- extent_op->num_bytes = num_bytes;
- extent_op->parent = parent;
- extent_op->orig_parent = orig_parent;
- extent_op->generation = ref_generation;
- extent_op->orig_generation = orig_generation;
- extent_op->level = (int)owner_objectid;
- INIT_LIST_HEAD(&extent_op->list);
- extent_op->del = 0;
-
- set_extent_bits(&root->fs_info->extent_ins,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- set_state_private(&root->fs_info->extent_ins,
- bytenr, (unsigned long)extent_op);
- }
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- return 0;
- }
-
- path = btrfs_alloc_path();
- if (!path)
- return -ENOMEM;
- ret = lookup_extent_backref(trans, extent_root, path,
- bytenr, orig_parent, orig_root,
- orig_generation, owner_objectid, 1);
- if (ret)
- goto out;
- ret = remove_extent_backref(trans, extent_root, path);
- if (ret)
- goto out;
- ret = insert_extent_backref(trans, extent_root, path, bytenr,
- parent, ref_root, ref_generation,
- owner_objectid);
+ ret = btrfs_update_delayed_ref(trans, bytenr, num_bytes,
+ orig_parent, parent, orig_root,
+ ref_root, orig_generation,
+ ref_generation, owner_objectid, pin);
BUG_ON(ret);
- finish_current_insert(trans, extent_root, 0);
- del_pending_extents(trans, extent_root, 0);
-out:
- btrfs_free_path(path);
return ret;
}
int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
- u64 orig_parent, u64 parent,
+ u64 num_bytes, u64 orig_parent, u64 parent,
u64 ref_root, u64 ref_generation,
u64 owner_objectid)
{
@@ -1260,20 +716,36 @@ int btrfs_update_extent_ref(struct btrfs_trans_handle *trans,
if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
return 0;
- ret = __btrfs_update_extent_ref(trans, root, bytenr, orig_parent,
- parent, ref_root, ref_root,
- ref_generation, ref_generation,
- owner_objectid);
+
+ ret = __btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
+ orig_parent, parent, ref_root,
+ ref_root, ref_generation,
+ ref_generation, owner_objectid);
return ret;
}
-
static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes,
u64 orig_parent, u64 parent,
u64 orig_root, u64 ref_root,
u64 orig_generation, u64 ref_generation,
u64 owner_objectid)
{
+ int ret;
+
+ ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent, ref_root,
+ ref_generation, owner_objectid,
+ BTRFS_ADD_DELAYED_REF, 0);
+ BUG_ON(ret);
+ return ret;
+}
+
+static noinline_for_stack int add_extent_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr,
+ u64 num_bytes, u64 parent, u64 ref_root,
+ u64 ref_generation, u64 owner_objectid,
+ int refs_to_add)
+{
struct btrfs_path *path;
int ret;
struct btrfs_key key;
@@ -1286,17 +758,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = 1;
+ path->leave_spinning = 1;
key.objectid = bytenr;
key.type = BTRFS_EXTENT_ITEM_KEY;
- key.offset = (u64)-1;
+ key.offset = num_bytes;
- ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
- 0, 1);
- if (ret < 0)
+ /* first find the extent item and update its reference count */
+ ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key,
+ path, 0, 1);
+ if (ret < 0) {
+ btrfs_set_path_blocking(path);
return ret;
- BUG_ON(ret == 0 || path->slots[0] == 0);
+ }
- path->slots[0]--;
+ if (ret > 0) {
+ WARN_ON(1);
+ btrfs_free_path(path);
+ return -EIO;
+ }
l = path->nodes[0];
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
@@ -1310,21 +789,24 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
BUG_ON(key.type != BTRFS_EXTENT_ITEM_KEY);
item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
+
refs = btrfs_extent_refs(l, item);
- btrfs_set_extent_refs(l, item, refs + 1);
+ btrfs_set_extent_refs(l, item, refs + refs_to_add);
+ btrfs_unlock_up_safe(path, 1);
+
btrfs_mark_buffer_dirty(path->nodes[0]);
btrfs_release_path(root->fs_info->extent_root, path);
path->reada = 1;
+ path->leave_spinning = 1;
+
+ /* now insert the actual backref */
ret = insert_extent_backref(trans, root->fs_info->extent_root,
path, bytenr, parent,
ref_root, ref_generation,
- owner_objectid);
+ owner_objectid, refs_to_add);
BUG_ON(ret);
- finish_current_insert(trans, root->fs_info->extent_root, 0);
- del_pending_extents(trans, root->fs_info->extent_root, 0);
-
btrfs_free_path(path);
return 0;
}
@@ -1339,68 +821,278 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
if (ref_root == BTRFS_TREE_LOG_OBJECTID &&
owner_objectid < BTRFS_FIRST_FREE_OBJECTID)
return 0;
- ret = __btrfs_inc_extent_ref(trans, root, bytenr, 0, parent,
+
+ ret = __btrfs_inc_extent_ref(trans, root, bytenr, num_bytes, 0, parent,
0, ref_root, 0, ref_generation,
owner_objectid);
return ret;
}
-int btrfs_extent_post_op(struct btrfs_trans_handle *trans,
- struct btrfs_root *root)
+static int drop_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node)
+{
+ int ret = 0;
+ struct btrfs_delayed_ref *ref = btrfs_delayed_node_to_ref(node);
+
+ BUG_ON(node->ref_mod == 0);
+ ret = __btrfs_free_extent(trans, root, node->bytenr, node->num_bytes,
+ node->parent, ref->root, ref->generation,
+ ref->owner_objectid, ref->pin, node->ref_mod);
+
+ return ret;
+}
+
+/* helper function to actually process a single delayed ref entry */
+static noinline int run_one_delayed_ref(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_delayed_ref_node *node,
+ int insert_reserved)
{
- u64 start;
- u64 end;
int ret;
+ struct btrfs_delayed_ref *ref;
+
+ if (node->parent == (u64)-1) {
+ struct btrfs_delayed_ref_head *head;
+ /*
+ * we've hit the end of the chain and we were supposed
+ * to insert this extent into the tree. But, it got
+ * deleted before we ever needed to insert it, so all
+ * we have to do is clean up the accounting
+ */
+ if (insert_reserved) {
+ update_reserved_extents(root, node->bytenr,
+ node->num_bytes, 0);
+ }
+ head = btrfs_delayed_node_to_head(node);
+ mutex_unlock(&head->mutex);
+ return 0;
+ }
- while(1) {
- finish_current_insert(trans, root->fs_info->extent_root, 1);
- del_pending_extents(trans, root->fs_info->extent_root, 1);
+ ref = btrfs_delayed_node_to_ref(node);
+ if (ref->action == BTRFS_ADD_DELAYED_REF) {
+ if (insert_reserved) {
+ struct btrfs_key ins;
- /* is there more work to do? */
- ret = find_first_extent_bit(&root->fs_info->pending_del,
- 0, &start, &end, EXTENT_WRITEBACK);
- if (!ret)
- continue;
- ret = find_first_extent_bit(&root->fs_info->extent_ins,
- 0, &start, &end, EXTENT_WRITEBACK);
- if (!ret)
- continue;
- break;
+ ins.objectid = node->bytenr;
+ ins.offset = node->num_bytes;
+ ins.type = BTRFS_EXTENT_ITEM_KEY;
+
+ /* record the full extent allocation */
+ ret = __btrfs_alloc_reserved_extent(trans, root,
+ node->parent, ref->root,
+ ref->generation, ref->owner_objectid,
+ &ins, node->ref_mod);
+ update_reserved_extents(root, node->bytenr,
+ node->num_bytes, 0);
+ } else {
+ /* just add one backref */
+ ret = add_extent_ref(trans, root, node->bytenr,
+ node->num_bytes,
+ node->parent, ref->root, ref->generation,
+ ref->owner_objectid, node->ref_mod);
+ }
+ BUG_ON(ret);
+ } else if (ref->action == BTRFS_DROP_DELAYED_REF) {
+ WARN_ON(insert_reserved);
+ ret = drop_delayed_ref(trans, root, node);
}
return 0;
}
-int btrfs_lookup_extent_ref(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, u64 bytenr,
- u64 num_bytes, u32 *refs)
+static noinline struct btrfs_delayed_ref_node *
+select_delayed_ref(struct btrfs_delayed_ref_head *head)
{
- struct btrfs_path *path;
+ struct rb_node *node;
+ struct btrfs_delayed_ref_node *ref;
+ int action = BTRFS_ADD_DELAYED_REF;
+again:
+ /*
+ * select delayed ref of type BTRFS_ADD_DELAYED_REF first.
+ * this prevents ref count from going down to zero when
+ * there still are pending delayed ref.
+ */
+ node = rb_prev(&head->node.rb_node);
+ while (1) {
+ if (!node)
+ break;
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (ref->bytenr != head->node.bytenr)
+ break;
+ if (btrfs_delayed_node_to_ref(ref)->action == action)
+ return ref;
+ node = rb_prev(node);
+ }
+ if (action == BTRFS_ADD_DELAYED_REF) {
+ action = BTRFS_DROP_DELAYED_REF;
+ goto again;
+ }
+ return NULL;
+}
+
+static noinline int run_clustered_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct list_head *cluster)
+{
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct btrfs_delayed_ref_head *locked_ref = NULL;
int ret;
- struct btrfs_key key;
- struct extent_buffer *l;
- struct btrfs_extent_item *item;
+ int count = 0;
+ int must_insert_reserved = 0;
- WARN_ON(num_bytes < root->sectorsize);
- path = btrfs_alloc_path();
- path->reada = 1;
- key.objectid = bytenr;
- key.offset = num_bytes;
- btrfs_set_key_type(&key, BTRFS_EXTENT_ITEM_KEY);
- ret = btrfs_search_slot(trans, root->fs_info->extent_root, &key, path,
- 0, 0);
- if (ret < 0)
- goto out;
- if (ret != 0) {
- btrfs_print_leaf(root, path->nodes[0]);
- printk(KERN_INFO "btrfs failed to find block number %llu\n",
- (unsigned long long)bytenr);
- BUG();
+ delayed_refs = &trans->transaction->delayed_refs;
+ while (1) {
+ if (!locked_ref) {
+ /* pick a new head ref from the cluster list */
+ if (list_empty(cluster))
+ break;
+
+ locked_ref = list_entry(cluster->next,
+ struct btrfs_delayed_ref_head, cluster);
+
+ /* grab the lock that says we are going to process
+ * all the refs for this head */
+ ret = btrfs_delayed_ref_lock(trans, locked_ref);
+
+ /*
+ * we may have dropped the spin lock to get the head
+ * mutex lock, and that might have given someone else
+ * time to free the head. If that's true, it has been
+ * removed from our list and we can move on.
+ */
+ if (ret == -EAGAIN) {
+ locked_ref = NULL;
+ count++;
+ continue;
+ }
+ }
+
+ /*
+ * record the must insert reserved flag before we
+ * drop the spin lock.
+ */
+ must_insert_reserved = locked_ref->must_insert_reserved;
+ locked_ref->must_insert_reserved = 0;
+
+ /*
+ * locked_ref is the head node, so we have to go one
+ * node back for any delayed ref updates
+ */
+ ref = select_delayed_ref(locked_ref);
+ if (!ref) {
+ /* All delayed refs have been processed, Go ahead
+ * and send the head node to run_one_delayed_ref,
+ * so that any accounting fixes can happen
+ */
+ ref = &locked_ref->node;
+ list_del_init(&locked_ref->cluster);
+ locked_ref = NULL;
+ }
+
+ ref->in_tree = 0;
+ rb_erase(&ref->rb_node, &delayed_refs->root);
+ delayed_refs->num_entries--;
+ spin_unlock(&delayed_refs->lock);
+
+ ret = run_one_delayed_ref(trans, root, ref,
+ must_insert_reserved);
+ BUG_ON(ret);
+ btrfs_put_delayed_ref(ref);
+
+ count++;
+ cond_resched();
+ spin_lock(&delayed_refs->lock);
+ }
+ return count;
+}
+
+/*
+ * this starts processing the delayed reference count updates and
+ * extent insertions we have queued up so far. count can be
+ * 0, which means to process everything in the tree at the start
+ * of the run (but not newly added entries), or it can be some target
+ * number you'd like to process.
+ */
+int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long count)
+{
+ struct rb_node *node;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct list_head cluster;
+ int ret;
+ int run_all = count == (unsigned long)-1;
+ int run_most = 0;
+
+ if (root == root->fs_info->extent_root)
+ root = root->fs_info->tree_root;
+
+ delayed_refs = &trans->transaction->delayed_refs;
+ INIT_LIST_HEAD(&cluster);
+again:
+ spin_lock(&delayed_refs->lock);
+ if (count == 0) {
+ count = delayed_refs->num_entries * 2;
+ run_most = 1;
+ }
+ while (1) {
+ if (!(run_all || run_most) &&
+ delayed_refs->num_heads_ready < 64)
+ break;
+
+ /*
+ * go find something we can process in the rbtree. We start at
+ * the beginning of the tree, and then build a cluster
+ * of refs to process starting at the first one we are able to
+ * lock
+ */
+ ret = btrfs_find_ref_cluster(trans, &cluster,
+ delayed_refs->run_delayed_start);
+ if (ret)
+ break;
+
+ ret = run_clustered_refs(trans, root, &cluster);
+ BUG_ON(ret < 0);
+
+ count -= min_t(unsigned long, ret, count);
+
+ if (count == 0)
+ break;
+ }
+
+ if (run_all) {
+ node = rb_first(&delayed_refs->root);
+ if (!node)
+ goto out;
+ count = (unsigned long)-1;
+
+ while (node) {
+ ref = rb_entry(node, struct btrfs_delayed_ref_node,
+ rb_node);
+ if (btrfs_delayed_ref_is_head(ref)) {
+ struct btrfs_delayed_ref_head *head;
+
+ head = btrfs_delayed_node_to_head(ref);
+ atomic_inc(&ref->refs);
+
+ spin_unlock(&delayed_refs->lock);
+ mutex_lock(&head->mutex);
+ mutex_unlock(&head->mutex);
+
+ btrfs_put_delayed_ref(ref);
+ cond_resched();
+ goto again;
+ }
+ node = rb_next(node);
+ }
+ spin_unlock(&delayed_refs->lock);
+ schedule_timeout(1);
+ goto again;
}
- l = path->nodes[0];
- item = btrfs_item_ptr(l, path->slots[0], struct btrfs_extent_item);
- *refs = btrfs_extent_refs(l, item);
out:
- btrfs_free_path(path);
+ spin_unlock(&delayed_refs->lock);
return 0;
}
@@ -1624,7 +1316,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
int refi = 0;
int slot;
int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *,
- u64, u64, u64, u64, u64, u64, u64, u64);
+ u64, u64, u64, u64, u64, u64, u64, u64, u64);
ref_root = btrfs_header_owner(buf);
ref_generation = btrfs_header_generation(buf);
@@ -1696,12 +1388,19 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
if (level == 0) {
btrfs_item_key_to_cpu(buf, &key, slot);
+ fi = btrfs_item_ptr(buf, slot,
+ struct btrfs_file_extent_item);
+
+ bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
+ if (bytenr == 0)
+ continue;
ret = process_func(trans, root, bytenr,
- orig_buf->start, buf->start,
- orig_root, ref_root,
- orig_generation, ref_generation,
- key.objectid);
+ btrfs_file_extent_disk_num_bytes(buf, fi),
+ orig_buf->start, buf->start,
+ orig_root, ref_root,
+ orig_generation, ref_generation,
+ key.objectid);
if (ret) {
faili = slot;
@@ -1709,7 +1408,7 @@ noinline int btrfs_inc_ref(struct btrfs_trans_handle *trans,
goto fail;
}
} else {
- ret = process_func(trans, root, bytenr,
+ ret = process_func(trans, root, bytenr, buf->len,
orig_buf->start, buf->start,
orig_root, ref_root,
orig_generation, ref_generation,
@@ -1786,17 +1485,17 @@ int btrfs_update_ref(struct btrfs_trans_handle *trans,
if (bytenr == 0)
continue;
ret = __btrfs_update_extent_ref(trans, root, bytenr,
- orig_buf->start, buf->start,
- orig_root, ref_root,
- orig_generation, ref_generation,
- key.objectid);
+ btrfs_file_extent_disk_num_bytes(buf, fi),
+ orig_buf->start, buf->start,
+ orig_root, ref_root, orig_generation,
+ ref_generation, key.objectid);
if (ret)
goto fail;
} else {
bytenr = btrfs_node_blockptr(buf, slot);
ret = __btrfs_update_extent_ref(trans, root, bytenr,
- orig_buf->start, buf->start,
- orig_root, ref_root,
+ buf->len, orig_buf->start,
+ buf->start, orig_root, ref_root,
orig_generation, ref_generation,
level - 1);
if (ret)
@@ -1815,7 +1514,6 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
struct btrfs_block_group_cache *cache)
{
int ret;
- int pending_ret;
struct btrfs_root *extent_root = root->fs_info->extent_root;
unsigned long bi;
struct extent_buffer *leaf;
@@ -1831,12 +1529,8 @@ static int write_one_cache_group(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
btrfs_release_path(extent_root, path);
fail:
- finish_current_insert(trans, extent_root, 0);
- pending_ret = del_pending_extents(trans, extent_root, 0);
if (ret)
return ret;
- if (pending_ret)
- return pending_ret;
return 0;
}
@@ -2361,6 +2055,8 @@ int btrfs_update_pinned_extents(struct btrfs_root *root,
clear_extent_dirty(&fs_info->pinned_extents,
bytenr, bytenr + num - 1, GFP_NOFS);
}
+ mutex_unlock(&root->fs_info->pinned_mutex);
+
while (num > 0) {
cache = btrfs_lookup_block_group(fs_info, bytenr);
BUG_ON(!cache);
@@ -2452,8 +2148,8 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
u64 end;
int ret;
- mutex_lock(&root->fs_info->pinned_mutex);
while (1) {
+ mutex_lock(&root->fs_info->pinned_mutex);
ret = find_first_extent_bit(unpin, 0, &start, &end,
EXTENT_DIRTY);
if (ret)
@@ -2461,209 +2157,21 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans,
ret = btrfs_discard_extent(root, start, end + 1 - start);
+ /* unlocks the pinned mutex */
btrfs_update_pinned_extents(root, start, end + 1 - start, 0);
clear_extent_dirty(unpin, start, end, GFP_NOFS);
- if (need_resched()) {
- mutex_unlock(&root->fs_info->pinned_mutex);
- cond_resched();
- mutex_lock(&root->fs_info->pinned_mutex);
- }
+ cond_resched();
}
mutex_unlock(&root->fs_info->pinned_mutex);
return ret;
}
-static int finish_current_insert(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all)
-{
- u64 start;
- u64 end;
- u64 priv;
- u64 search = 0;
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct btrfs_path *path;
- struct pending_extent_op *extent_op, *tmp;
- struct list_head insert_list, update_list;
- int ret;
- int num_inserts = 0, max_inserts, restart = 0;
-
- path = btrfs_alloc_path();
- INIT_LIST_HEAD(&insert_list);
- INIT_LIST_HEAD(&update_list);
-
- max_inserts = extent_root->leafsize /
- (2 * sizeof(struct btrfs_key) + 2 * sizeof(struct btrfs_item) +
- sizeof(struct btrfs_extent_ref) +
- sizeof(struct btrfs_extent_item));
-again:
- mutex_lock(&info->extent_ins_mutex);
- while (1) {
- ret = find_first_extent_bit(&info->extent_ins, search, &start,
- &end, EXTENT_WRITEBACK);
- if (ret) {
- if (restart && !num_inserts &&
- list_empty(&update_list)) {
- restart = 0;
- search = 0;
- continue;
- }
- break;
- }
-
- ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS);
- if (!ret) {
- if (all)
- restart = 1;
- search = end + 1;
- if (need_resched()) {
- mutex_unlock(&info->extent_ins_mutex);
- cond_resched();
- mutex_lock(&info->extent_ins_mutex);
- }
- continue;
- }
-
- ret = get_state_private(&info->extent_ins, start, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)(unsigned long) priv;
-
- if (extent_op->type == PENDING_EXTENT_INSERT) {
- num_inserts++;
- list_add_tail(&extent_op->list, &insert_list);
- search = end + 1;
- if (num_inserts == max_inserts) {
- restart = 1;
- break;
- }
- } else if (extent_op->type == PENDING_BACKREF_UPDATE) {
- list_add_tail(&extent_op->list, &update_list);
- search = end + 1;
- } else {
- BUG();
- }
- }
-
- /*
- * process the update list, clear the writeback bit for it, and if
- * somebody marked this thing for deletion then just unlock it and be
- * done, the free_extents will handle it
- */
- list_for_each_entry_safe(extent_op, tmp, &update_list, list) {
- clear_extent_bits(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- if (extent_op->del) {
- list_del_init(&extent_op->list);
- unlock_extent(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes
- - 1, GFP_NOFS);
- kfree(extent_op);
- }
- }
- mutex_unlock(&info->extent_ins_mutex);
-
- /*
- * still have things left on the update list, go ahead an update
- * everything
- */
- if (!list_empty(&update_list)) {
- ret = update_backrefs(trans, extent_root, path, &update_list);
- BUG_ON(ret);
-
- /* we may have COW'ed new blocks, so lets start over */
- if (all)
- restart = 1;
- }
-
- /*
- * if no inserts need to be done, but we skipped some extents and we
- * need to make sure everything is cleaned then reset everything and
- * go back to the beginning
- */
- if (!num_inserts && restart) {
- search = 0;
- restart = 0;
- INIT_LIST_HEAD(&update_list);
- INIT_LIST_HEAD(&insert_list);
- goto again;
- } else if (!num_inserts) {
- goto out;
- }
-
- /*
- * process the insert extents list. Again if we are deleting this
- * extent, then just unlock it, pin down the bytes if need be, and be
- * done with it. Saves us from having to actually insert the extent
- * into the tree and then subsequently come along and delete it
- */
- mutex_lock(&info->extent_ins_mutex);
- list_for_each_entry_safe(extent_op, tmp, &insert_list, list) {
- clear_extent_bits(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- if (extent_op->del) {
- u64 used;
- list_del_init(&extent_op->list);
- unlock_extent(&info->extent_ins, extent_op->bytenr,
- extent_op->bytenr + extent_op->num_bytes
- - 1, GFP_NOFS);
-
- mutex_lock(&extent_root->fs_info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root,
- extent_op->bytenr,
- extent_op->num_bytes, 0);
- mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
- spin_lock(&info->delalloc_lock);
- used = btrfs_super_bytes_used(&info->super_copy);
- btrfs_set_super_bytes_used(&info->super_copy,
- used - extent_op->num_bytes);
- used = btrfs_root_used(&extent_root->root_item);
- btrfs_set_root_used(&extent_root->root_item,
- used - extent_op->num_bytes);
- spin_unlock(&info->delalloc_lock);
-
- ret = update_block_group(trans, extent_root,
- extent_op->bytenr,
- extent_op->num_bytes,
- 0, ret > 0);
- BUG_ON(ret);
- kfree(extent_op);
- num_inserts--;
- }
- }
- mutex_unlock(&info->extent_ins_mutex);
-
- ret = insert_extents(trans, extent_root, path, &insert_list,
- num_inserts);
- BUG_ON(ret);
-
- /*
- * if restart is set for whatever reason we need to go back and start
- * searching through the pending list again.
- *
- * We just inserted some extents, which could have resulted in new
- * blocks being allocated, which would result in new blocks needing
- * updates, so if all is set we _must_ restart to get the updated
- * blocks.
- */
- if (restart || all) {
- INIT_LIST_HEAD(&insert_list);
- INIT_LIST_HEAD(&update_list);
- search = 0;
- restart = 0;
- num_inserts = 0;
- goto again;
- }
-out:
- btrfs_free_path(path);
- return 0;
-}
-
static int pin_down_bytes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, int is_data)
+ struct btrfs_path *path,
+ u64 bytenr, u64 num_bytes, int is_data,
+ struct extent_buffer **must_clean)
{
int err = 0;
struct extent_buffer *buf;
@@ -2686,17 +2194,19 @@ static int pin_down_bytes(struct btrfs_trans_handle *trans,
u64 header_transid = btrfs_header_generation(buf);
if (header_owner != BTRFS_TREE_LOG_OBJECTID &&
header_owner != BTRFS_TREE_RELOC_OBJECTID &&
+ header_owner != BTRFS_DATA_RELOC_TREE_OBJECTID &&
header_transid == trans->transid &&
!btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
- clean_tree_block(NULL, root, buf);
- btrfs_tree_unlock(buf);
- free_extent_buffer(buf);
+ *must_clean = buf;
return 1;
}
btrfs_tree_unlock(buf);
}
free_extent_buffer(buf);
pinit:
+ btrfs_set_path_blocking(path);
+ mutex_lock(&root->fs_info->pinned_mutex);
+ /* unlocks the pinned mutex */
btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
BUG_ON(err < 0);
@@ -2710,7 +2220,8 @@ static int __free_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 bytenr, u64 num_bytes, u64 parent,
u64 root_objectid, u64 ref_generation,
- u64 owner_objectid, int pin, int mark_free)
+ u64 owner_objectid, int pin, int mark_free,
+ int refs_to_drop)
{
struct btrfs_path *path;
struct btrfs_key key;
@@ -2732,6 +2243,7 @@ static int __free_extent(struct btrfs_trans_handle *trans,
return -ENOMEM;
path->reada = 1;
+ path->leave_spinning = 1;
ret = lookup_extent_backref(trans, extent_root, path,
bytenr, parent, root_objectid,
ref_generation, owner_objectid, 1);
@@ -2753,9 +2265,11 @@ static int __free_extent(struct btrfs_trans_handle *trans,
break;
}
if (!found_extent) {
- ret = remove_extent_backref(trans, extent_root, path);
+ ret = remove_extent_backref(trans, extent_root, path,
+ refs_to_drop);
BUG_ON(ret);
btrfs_release_path(extent_root, path);
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, extent_root,
&key, path, -1, 1);
if (ret) {
@@ -2771,8 +2285,9 @@ static int __free_extent(struct btrfs_trans_handle *trans,
btrfs_print_leaf(extent_root, path->nodes[0]);
WARN_ON(1);
printk(KERN_ERR "btrfs unable to find ref byte nr %llu "
- "root %llu gen %llu owner %llu\n",
+ "parent %llu root %llu gen %llu owner %llu\n",
(unsigned long long)bytenr,
+ (unsigned long long)parent,
(unsigned long long)root_objectid,
(unsigned long long)ref_generation,
(unsigned long long)owner_objectid);
@@ -2782,17 +2297,23 @@ static int __free_extent(struct btrfs_trans_handle *trans,
ei = btrfs_item_ptr(leaf, extent_slot,
struct btrfs_extent_item);
refs = btrfs_extent_refs(leaf, ei);
- BUG_ON(refs == 0);
- refs -= 1;
- btrfs_set_extent_refs(leaf, ei, refs);
+ /*
+ * we're not allowed to delete the extent item if there
+ * are other delayed ref updates pending
+ */
+
+ BUG_ON(refs < refs_to_drop);
+ refs -= refs_to_drop;
+ btrfs_set_extent_refs(leaf, ei, refs);
btrfs_mark_buffer_dirty(leaf);
- if (refs == 0 && found_extent && path->slots[0] == extent_slot + 1) {
+ if (refs == 0 && found_extent &&
+ path->slots[0] == extent_slot + 1) {
struct btrfs_extent_ref *ref;
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_extent_ref);
- BUG_ON(btrfs_ref_num_refs(leaf, ref) != 1);
+ BUG_ON(btrfs_ref_num_refs(leaf, ref) != refs_to_drop);
/* if the back ref and the extent are next to each other
* they get deleted below in one shot
*/
@@ -2800,11 +2321,13 @@ static int __free_extent(struct btrfs_trans_handle *trans,
num_to_del = 2;
} else if (found_extent) {
/* otherwise delete the extent back ref */
- ret = remove_extent_backref(trans, extent_root, path);
+ ret = remove_extent_backref(trans, extent_root, path,
+ refs_to_drop);
BUG_ON(ret);
/* if refs are 0, we need to setup the path for deletion */
if (refs == 0) {
btrfs_release_path(extent_root, path);
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, extent_root, &key, path,
-1, 1);
BUG_ON(ret);
@@ -2814,16 +2337,18 @@ static int __free_extent(struct btrfs_trans_handle *trans,
if (refs == 0) {
u64 super_used;
u64 root_used;
+ struct extent_buffer *must_clean = NULL;
if (pin) {
- mutex_lock(&root->fs_info->pinned_mutex);
- ret = pin_down_bytes(trans, root, bytenr, num_bytes,
- owner_objectid >= BTRFS_FIRST_FREE_OBJECTID);
- mutex_unlock(&root->fs_info->pinned_mutex);
+ ret = pin_down_bytes(trans, root, path,
+ bytenr, num_bytes,
+ owner_objectid >= BTRFS_FIRST_FREE_OBJECTID,
+ &must_clean);
if (ret > 0)
mark_free = 1;
BUG_ON(ret < 0);
}
+
/* block accounting for super block */
spin_lock(&info->delalloc_lock);
super_used = btrfs_super_bytes_used(&info->super_copy);
@@ -2835,14 +2360,34 @@ static int __free_extent(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item,
root_used - num_bytes);
spin_unlock(&info->delalloc_lock);
+
+ /*
+ * it is going to be very rare for someone to be waiting
+ * on the block we're freeing. del_items might need to
+ * schedule, so rather than get fancy, just force it
+ * to blocking here
+ */
+ if (must_clean)
+ btrfs_set_lock_blocking(must_clean);
+
ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
num_to_del);
BUG_ON(ret);
btrfs_release_path(extent_root, path);
+ if (must_clean) {
+ clean_tree_block(NULL, root, must_clean);
+ btrfs_tree_unlock(must_clean);
+ free_extent_buffer(must_clean);
+ }
+
if (owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) {
ret = btrfs_del_csums(trans, root, bytenr, num_bytes);
BUG_ON(ret);
+ } else {
+ invalidate_mapping_pages(info->btree_inode->i_mapping,
+ bytenr >> PAGE_CACHE_SHIFT,
+ (bytenr + num_bytes - 1) >> PAGE_CACHE_SHIFT);
}
ret = update_block_group(trans, root, bytenr, num_bytes, 0,
@@ -2850,218 +2395,103 @@ static int __free_extent(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
btrfs_free_path(path);
- finish_current_insert(trans, extent_root, 0);
return ret;
}
/*
- * find all the blocks marked as pending in the radix tree and remove
- * them from the extent map
+ * remove an extent from the root, returns 0 on success
*/
-static int del_pending_extents(struct btrfs_trans_handle *trans,
- struct btrfs_root *extent_root, int all)
+static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ u64 bytenr, u64 num_bytes, u64 parent,
+ u64 root_objectid, u64 ref_generation,
+ u64 owner_objectid, int pin,
+ int refs_to_drop)
{
- int ret;
- int err = 0;
- u64 start;
- u64 end;
- u64 priv;
- u64 search = 0;
- int nr = 0, skipped = 0;
- struct extent_io_tree *pending_del;
- struct extent_io_tree *extent_ins;
- struct pending_extent_op *extent_op;
- struct btrfs_fs_info *info = extent_root->fs_info;
- struct list_head delete_list;
-
- INIT_LIST_HEAD(&delete_list);
- extent_ins = &extent_root->fs_info->extent_ins;
- pending_del = &extent_root->fs_info->pending_del;
-
-again:
- mutex_lock(&info->extent_ins_mutex);
- while (1) {
- ret = find_first_extent_bit(pending_del, search, &start, &end,
- EXTENT_WRITEBACK);
- if (ret) {
- if (all && skipped && !nr) {
- search = 0;
- skipped = 0;
- continue;
- }
- mutex_unlock(&info->extent_ins_mutex);
- break;
- }
-
- ret = try_lock_extent(extent_ins, start, end, GFP_NOFS);
- if (!ret) {
- search = end+1;
- skipped = 1;
-
- if (need_resched()) {
- mutex_unlock(&info->extent_ins_mutex);
- cond_resched();
- mutex_lock(&info->extent_ins_mutex);
- }
-
- continue;
- }
- BUG_ON(ret < 0);
-
- ret = get_state_private(pending_del, start, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)(unsigned long)priv;
-
- clear_extent_bits(pending_del, start, end, EXTENT_WRITEBACK,
- GFP_NOFS);
- if (!test_range_bit(extent_ins, start, end,
- EXTENT_WRITEBACK, 0)) {
- list_add_tail(&extent_op->list, &delete_list);
- nr++;
- } else {
- kfree(extent_op);
-
- ret = get_state_private(&info->extent_ins, start,
- &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)
- (unsigned long)priv;
-
- clear_extent_bits(&info->extent_ins, start, end,
- EXTENT_WRITEBACK, GFP_NOFS);
-
- if (extent_op->type == PENDING_BACKREF_UPDATE) {
- list_add_tail(&extent_op->list, &delete_list);
- search = end + 1;
- nr++;
- continue;
- }
-
- mutex_lock(&extent_root->fs_info->pinned_mutex);
- ret = pin_down_bytes(trans, extent_root, start,
- end + 1 - start, 0);
- mutex_unlock(&extent_root->fs_info->pinned_mutex);
-
- ret = update_block_group(trans, extent_root, start,
- end + 1 - start, 0, ret > 0);
-
- unlock_extent(extent_ins, start, end, GFP_NOFS);
- BUG_ON(ret);
- kfree(extent_op);
- }
- if (ret)
- err = ret;
-
- search = end + 1;
-
- if (need_resched()) {
- mutex_unlock(&info->extent_ins_mutex);
- cond_resched();
- mutex_lock(&info->extent_ins_mutex);
- }
- }
+ WARN_ON(num_bytes < root->sectorsize);
- if (nr) {
- ret = free_extents(trans, extent_root, &delete_list);
- BUG_ON(ret);
- }
+ /*
+ * if metadata always pin
+ * if data pin when any transaction has committed this
+ */
+ if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID ||
+ ref_generation != trans->transid)
+ pin = 1;
- if (all && skipped) {
- INIT_LIST_HEAD(&delete_list);
- search = 0;
- nr = 0;
- goto again;
- }
+ if (ref_generation != trans->transid)
+ pin = 1;
- if (!err)
- finish_current_insert(trans, extent_root, 0);
- return err;
+ return __free_extent(trans, root, bytenr, num_bytes, parent,
+ root_objectid, ref_generation,
+ owner_objectid, pin, pin == 0, refs_to_drop);
}
/*
- * remove an extent from the root, returns 0 on success
+ * when we free an extent, it is possible (and likely) that we free the last
+ * delayed ref for that extent as well. This searches the delayed ref tree for
+ * a given extent, and if there are no other delayed refs to be processed, it
+ * removes it from the tree.
*/
-static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
- struct btrfs_root *root,
- u64 bytenr, u64 num_bytes, u64 parent,
- u64 root_objectid, u64 ref_generation,
- u64 owner_objectid, int pin)
+static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 bytenr)
{
- struct btrfs_root *extent_root = root->fs_info->extent_root;
- int pending_ret;
+ struct btrfs_delayed_ref_head *head;
+ struct btrfs_delayed_ref_root *delayed_refs;
+ struct btrfs_delayed_ref_node *ref;
+ struct rb_node *node;
int ret;
- WARN_ON(num_bytes < root->sectorsize);
- if (root == extent_root) {
- struct pending_extent_op *extent_op = NULL;
-
- mutex_lock(&root->fs_info->extent_ins_mutex);
- if (test_range_bit(&root->fs_info->extent_ins, bytenr,
- bytenr + num_bytes - 1, EXTENT_WRITEBACK, 0)) {
- u64 priv;
- ret = get_state_private(&root->fs_info->extent_ins,
- bytenr, &priv);
- BUG_ON(ret);
- extent_op = (struct pending_extent_op *)
- (unsigned long)priv;
+ delayed_refs = &trans->transaction->delayed_refs;
+ spin_lock(&delayed_refs->lock);
+ head = btrfs_find_delayed_ref_head(trans, bytenr);
+ if (!head)
+ goto out;
- extent_op->del = 1;
- if (extent_op->type == PENDING_EXTENT_INSERT) {
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- return 0;
- }
- }
+ node = rb_prev(&head->node.rb_node);
+ if (!node)
+ goto out;
- if (extent_op) {
- ref_generation = extent_op->orig_generation;
- parent = extent_op->orig_parent;
- }
+ ref = rb_entry(node, struct btrfs_delayed_ref_node, rb_node);
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
-
- extent_op->type = PENDING_EXTENT_DELETE;
- extent_op->bytenr = bytenr;
- extent_op->num_bytes = num_bytes;
- extent_op->parent = parent;
- extent_op->orig_parent = parent;
- extent_op->generation = ref_generation;
- extent_op->orig_generation = ref_generation;
- extent_op->level = (int)owner_objectid;
- INIT_LIST_HEAD(&extent_op->list);
- extent_op->del = 0;
-
- set_extent_bits(&root->fs_info->pending_del,
- bytenr, bytenr + num_bytes - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- set_state_private(&root->fs_info->pending_del,
- bytenr, (unsigned long)extent_op);
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- return 0;
- }
- /* if metadata always pin */
- if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
- if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
- mutex_lock(&root->fs_info->pinned_mutex);
- btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
- mutex_unlock(&root->fs_info->pinned_mutex);
- update_reserved_extents(root, bytenr, num_bytes, 0);
- return 0;
- }
- pin = 1;
- }
+ /* there are still entries for this ref, we can't drop it */
+ if (ref->bytenr == bytenr)
+ goto out;
- /* if data pin when any transaction has committed this */
- if (ref_generation != trans->transid)
- pin = 1;
+ /*
+ * waiting for the lock here would deadlock. If someone else has it
+ * locked they are already in the process of dropping it anyway
+ */
+ if (!mutex_trylock(&head->mutex))
+ goto out;
- ret = __free_extent(trans, root, bytenr, num_bytes, parent,
- root_objectid, ref_generation,
- owner_objectid, pin, pin == 0);
+ /*
+ * at this point we have a head with no other entries. Go
+ * ahead and process it.
+ */
+ head->node.in_tree = 0;
+ rb_erase(&head->node.rb_node, &delayed_refs->root);
- finish_current_insert(trans, root->fs_info->extent_root, 0);
- pending_ret = del_pending_extents(trans, root->fs_info->extent_root, 0);
- return ret ? ret : pending_ret;
+ delayed_refs->num_entries--;
+
+ /*
+ * we don't take a ref on the node because we're removing it from the
+ * tree, so we just steal the ref the tree was holding.
+ */
+ delayed_refs->num_heads--;
+ if (list_empty(&head->cluster))
+ delayed_refs->num_heads_ready--;
+
+ list_del_init(&head->cluster);
+ spin_unlock(&delayed_refs->lock);
+
+ ret = run_one_delayed_ref(trans, root->fs_info->tree_root,
+ &head->node, head->must_insert_reserved);
+ BUG_ON(ret);
+ btrfs_put_delayed_ref(&head->node);
+ return 0;
+out:
+ spin_unlock(&delayed_refs->lock);
+ return 0;
}
int btrfs_free_extent(struct btrfs_trans_handle *trans,
@@ -3072,9 +2502,30 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans,
{
int ret;
- ret = __btrfs_free_extent(trans, root, bytenr, num_bytes, parent,
- root_objectid, ref_generation,
- owner_objectid, pin);
+ /*
+ * tree log blocks never actually go into the extent allocation
+ * tree, just update pinning info and exit early.
+ *
+ * data extents referenced by the tree log do need to have
+ * their reference counts bumped.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID &&
+ owner_objectid < BTRFS_FIRST_FREE_OBJECTID) {
+ mutex_lock(&root->fs_info->pinned_mutex);
+
+ /* unlocks the pinned mutex */
+ btrfs_update_pinned_extents(root, bytenr, num_bytes, 1);
+ update_reserved_extents(root, bytenr, num_bytes, 0);
+ ret = 0;
+ } else {
+ ret = btrfs_add_delayed_ref(trans, bytenr, num_bytes, parent,
+ root_objectid, ref_generation,
+ owner_objectid,
+ BTRFS_DROP_DELAYED_REF, 1);
+ BUG_ON(ret);
+ ret = check_ref_cleanup(trans, root, bytenr);
+ BUG_ON(ret);
+ }
return ret;
}
@@ -3475,10 +2926,10 @@ int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 parent,
u64 root_objectid, u64 ref_generation,
- u64 owner, struct btrfs_key *ins)
+ u64 owner, struct btrfs_key *ins,
+ int ref_mod)
{
int ret;
- int pending_ret;
u64 super_used;
u64 root_used;
u64 num_bytes = ins->offset;
@@ -3503,33 +2954,6 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
btrfs_set_root_used(&root->root_item, root_used + num_bytes);
spin_unlock(&info->delalloc_lock);
- if (root == extent_root) {
- struct pending_extent_op *extent_op;
-
- extent_op = kmalloc(sizeof(*extent_op), GFP_NOFS);
- BUG_ON(!extent_op);
-
- extent_op->type = PENDING_EXTENT_INSERT;
- extent_op->bytenr = ins->objectid;
- extent_op->num_bytes = ins->offset;
- extent_op->parent = parent;
- extent_op->orig_parent = 0;
- extent_op->generation = ref_generation;
- extent_op->orig_generation = 0;
- extent_op->level = (int)owner;
- INIT_LIST_HEAD(&extent_op->list);
- extent_op->del = 0;
-
- mutex_lock(&root->fs_info->extent_ins_mutex);
- set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
- ins->objectid + ins->offset - 1,
- EXTENT_WRITEBACK, GFP_NOFS);
- set_state_private(&root->fs_info->extent_ins,
- ins->objectid, (unsigned long)extent_op);
- mutex_unlock(&root->fs_info->extent_ins_mutex);
- goto update_block;
- }
-
memcpy(&keys[0], ins, sizeof(*ins));
keys[1].objectid = ins->objectid;
keys[1].type = BTRFS_EXTENT_REF_KEY;
@@ -3540,37 +2964,31 @@ static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, extent_root, path, keys,
sizes, 2);
BUG_ON(ret);
extent_item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
- btrfs_set_extent_refs(path->nodes[0], extent_item, 1);
+ btrfs_set_extent_refs(path->nodes[0], extent_item, ref_mod);
ref = btrfs_item_ptr(path->nodes[0], path->slots[0] + 1,
struct btrfs_extent_ref);
btrfs_set_ref_root(path->nodes[0], ref, root_objectid);
btrfs_set_ref_generation(path->nodes[0], ref, ref_generation);
btrfs_set_ref_objectid(path->nodes[0], ref, owner);
- btrfs_set_ref_num_refs(path->nodes[0], ref, 1);
+ btrfs_set_ref_num_refs(path->nodes[0], ref, ref_mod);
btrfs_mark_buffer_dirty(path->nodes[0]);
trans->alloc_exclude_start = 0;
trans->alloc_exclude_nr = 0;
btrfs_free_path(path);
- finish_current_insert(trans, extent_root, 0);
- pending_ret = del_pending_extents(trans, extent_root, 0);
if (ret)
goto out;
- if (pending_ret) {
- ret = pending_ret;
- goto out;
- }
-update_block:
ret = update_block_group(trans, root, ins->objectid,
ins->offset, 1, 0);
if (ret) {
@@ -3592,9 +3010,12 @@ int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
if (root_objectid == BTRFS_TREE_LOG_OBJECTID)
return 0;
- ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
- ref_generation, owner, ins);
- update_reserved_extents(root, ins->objectid, ins->offset, 0);
+
+ ret = btrfs_add_delayed_ref(trans, ins->objectid,
+ ins->offset, parent, root_objectid,
+ ref_generation, owner,
+ BTRFS_ADD_DELAYED_EXTENT, 0);
+ BUG_ON(ret);
return ret;
}
@@ -3621,7 +3042,7 @@ int btrfs_alloc_logged_extent(struct btrfs_trans_handle *trans,
BUG_ON(ret);
put_block_group(block_group);
ret = __btrfs_alloc_reserved_extent(trans, root, parent, root_objectid,
- ref_generation, owner, ins);
+ ref_generation, owner, ins, 1);
return ret;
}
@@ -3640,20 +3061,18 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
u64 search_end, struct btrfs_key *ins, u64 data)
{
int ret;
-
ret = __btrfs_reserve_extent(trans, root, num_bytes,
min_alloc_size, empty_size, hint_byte,
search_end, ins, data);
BUG_ON(ret);
if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
- ret = __btrfs_alloc_reserved_extent(trans, root, parent,
- root_objectid, ref_generation,
- owner_objectid, ins);
+ ret = btrfs_add_delayed_ref(trans, ins->objectid,
+ ins->offset, parent, root_objectid,
+ ref_generation, owner_objectid,
+ BTRFS_ADD_DELAYED_EXTENT, 0);
BUG_ON(ret);
-
- } else {
- update_reserved_extents(root, ins->objectid, ins->offset, 1);
}
+ update_reserved_extents(root, ins->objectid, ins->offset, 1);
return ret;
}
@@ -3789,7 +3208,7 @@ int btrfs_drop_leaf_ref(struct btrfs_trans_handle *trans,
fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
- ret = __btrfs_free_extent(trans, root, disk_bytenr,
+ ret = btrfs_free_extent(trans, root, disk_bytenr,
btrfs_file_extent_disk_num_bytes(leaf, fi),
leaf->start, leaf_owner, leaf_generation,
key.objectid, 0);
@@ -3829,7 +3248,7 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
*/
for (i = 0; i < ref->nritems; i++) {
info = ref->extents + sorted[i].slot;
- ret = __btrfs_free_extent(trans, root, info->bytenr,
+ ret = btrfs_free_extent(trans, root, info->bytenr,
info->num_bytes, ref->bytenr,
ref->owner, ref->generation,
info->objectid, 0);
@@ -3846,12 +3265,13 @@ static noinline int cache_drop_leaf_ref(struct btrfs_trans_handle *trans,
return 0;
}
-static int drop_snap_lookup_refcount(struct btrfs_root *root, u64 start,
+static int drop_snap_lookup_refcount(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 start,
u64 len, u32 *refs)
{
int ret;
- ret = btrfs_lookup_extent_ref(NULL, root, start, len, refs);
+ ret = btrfs_lookup_extent_ref(trans, root, start, len, refs);
BUG_ON(ret);
#if 0 /* some debugging code in case we see problems here */
@@ -3959,7 +3379,8 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
* we just decrement it below and don't update any
* of the refs the leaf points to.
*/
- ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+ ret = drop_snap_lookup_refcount(trans, root, bytenr,
+ blocksize, &refs);
BUG_ON(ret);
if (refs != 1)
continue;
@@ -4010,7 +3431,7 @@ static noinline int drop_level_one_refs(struct btrfs_trans_handle *trans,
*/
for (i = 0; i < refi; i++) {
bytenr = sorted[i].bytenr;
- ret = __btrfs_free_extent(trans, root, bytenr,
+ ret = btrfs_free_extent(trans, root, bytenr,
blocksize, eb->start,
root_owner, root_gen, 0, 1);
BUG_ON(ret);
@@ -4053,7 +3474,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
WARN_ON(*level < 0);
WARN_ON(*level >= BTRFS_MAX_LEVEL);
- ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
+ ret = drop_snap_lookup_refcount(trans, root, path->nodes[*level]->start,
path->nodes[*level]->len, &refs);
BUG_ON(ret);
if (refs > 1)
@@ -4104,7 +3525,8 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
blocksize = btrfs_level_size(root, *level - 1);
- ret = drop_snap_lookup_refcount(root, bytenr, blocksize, &refs);
+ ret = drop_snap_lookup_refcount(trans, root, bytenr,
+ blocksize, &refs);
BUG_ON(ret);
/*
@@ -4119,7 +3541,7 @@ static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
root_gen = btrfs_header_generation(parent);
path->slots[*level]++;
- ret = __btrfs_free_extent(trans, root, bytenr,
+ ret = btrfs_free_extent(trans, root, bytenr,
blocksize, parent->start,
root_owner, root_gen,
*level - 1, 1);
@@ -4165,7 +3587,7 @@ out:
* cleanup and free the reference on the last node
* we processed
*/
- ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
+ ret = btrfs_free_extent(trans, root, bytenr, blocksize,
parent->start, root_owner, root_gen,
*level, 1);
free_extent_buffer(path->nodes[*level]);
@@ -4354,6 +3776,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
struct btrfs_path *path;
int i;
int orig_level;
+ int update_count;
struct btrfs_root_item *root_item = &root->root_item;
WARN_ON(!mutex_is_locked(&root->fs_info->drop_mutex));
@@ -4395,6 +3818,7 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
}
}
while (1) {
+ unsigned long update;
wret = walk_down_tree(trans, root, path, &level);
if (wret > 0)
break;
@@ -4407,12 +3831,21 @@ int btrfs_drop_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root
break;
if (wret < 0)
ret = wret;
- if (trans->transaction->in_commit) {
+ if (trans->transaction->in_commit ||
+ trans->transaction->delayed_refs.flushing) {
ret = -EAGAIN;
break;
}
atomic_inc(&root->fs_info->throttle_gen);
wake_up(&root->fs_info->transaction_throttle);
+ for (update_count = 0; update_count < 16; update_count++) {
+ update = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (update)
+ btrfs_run_delayed_refs(trans, root, update);
+ else
+ break;
+ }
}
for (i = 0; i <= orig_level; i++) {
if (path->nodes[i]) {
@@ -5457,6 +4890,7 @@ static noinline int replace_extents_in_leaf(struct btrfs_trans_handle *trans,
root->root_key.objectid,
trans->transid, key.objectid);
BUG_ON(ret);
+
ret = btrfs_free_extent(trans, root,
bytenr, num_bytes, leaf->start,
btrfs_header_owner(leaf),
@@ -5768,9 +5202,6 @@ static noinline int relocate_tree_block(struct btrfs_trans_handle *trans,
ref_path, NULL, NULL);
BUG_ON(ret);
- if (root == root->fs_info->extent_root)
- btrfs_extent_post_op(trans, root);
-
return 0;
}
@@ -6038,6 +5469,7 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_inode(trans, root, path, objectid);
if (ret)
goto out;
@@ -6208,6 +5640,9 @@ again:
btrfs_remove_leaf_refs(info->tree_root, (u64)-1, 1);
mutex_unlock(&root->fs_info->cleaner_mutex);
+ trans = btrfs_start_transaction(info->tree_root, 1);
+ btrfs_commit_transaction(trans, info->tree_root);
+
while (1) {
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
@@ -6466,7 +5901,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
extent_root = root->fs_info->extent_root;
- root->fs_info->last_trans_new_blockgroup = trans->transid;
+ root->fs_info->last_trans_log_full_commit = trans->transid;
cache = kzalloc(sizeof(*cache), GFP_NOFS);
if (!cache)
@@ -6500,9 +5935,6 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans,
sizeof(cache->item));
BUG_ON(ret);
- finish_current_insert(trans, extent_root, 0);
- ret = del_pending_extents(trans, extent_root, 0);
- BUG_ON(ret);
set_avail_alloc_bits(extent_root->fs_info, type);
return 0;
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index ebe6b29e606..08085af089e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3124,20 +3124,15 @@ void free_extent_buffer(struct extent_buffer *eb)
int clear_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb)
{
- int set;
unsigned long i;
unsigned long num_pages;
struct page *page;
- u64 start = eb->start;
- u64 end = start + eb->len - 1;
-
- set = clear_extent_dirty(tree, start, end, GFP_NOFS);
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++) {
page = extent_buffer_page(eb, i);
- if (!set && !PageDirty(page))
+ if (!PageDirty(page))
continue;
lock_page(page);
@@ -3146,22 +3141,6 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
else
set_page_private(page, EXTENT_PAGE_PRIVATE);
- /*
- * if we're on the last page or the first page and the
- * block isn't aligned on a page boundary, do extra checks
- * to make sure we don't clean page that is partially dirty
- */
- if ((i == 0 && (eb->start & (PAGE_CACHE_SIZE - 1))) ||
- ((i == num_pages - 1) &&
- ((eb->start + eb->len) & (PAGE_CACHE_SIZE - 1)))) {
- start = (u64)page->index << PAGE_CACHE_SHIFT;
- end = start + PAGE_CACHE_SIZE - 1;
- if (test_range_bit(tree, start, end,
- EXTENT_DIRTY, 0)) {
- unlock_page(page);
- continue;
- }
- }
clear_page_dirty_for_io(page);
spin_lock_irq(&page->mapping->tree_lock);
if (!PageDirty(page)) {
@@ -3187,29 +3166,13 @@ int set_extent_buffer_dirty(struct extent_io_tree *tree,
{
unsigned long i;
unsigned long num_pages;
+ int was_dirty = 0;
+ was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags);
num_pages = num_extent_pages(eb->start, eb->len);
- for (i = 0; i < num_pages; i++) {
- struct page *page = extent_buffer_page(eb, i);
- /* writepage may need to do something special for the
- * first page, we have to make sure page->private is
- * properly set. releasepage may drop page->private
- * on us if the page isn't already dirty.
- */
- lock_page(page);
- if (i == 0) {
- set_page_extent_head(page, eb->len);
- } else if (PagePrivate(page) &&
- page->private != EXTENT_PAGE_PRIVATE) {
- set_page_extent_mapped(page);
- }
+ for (i = 0; i < num_pages; i++)
__set_page_dirty_nobuffers(extent_buffer_page(eb, i));
- set_extent_dirty(tree, page_offset(page),
- page_offset(page) + PAGE_CACHE_SIZE - 1,
- GFP_NOFS);
- unlock_page(page);
- }
- return 0;
+ return was_dirty;
}
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
@@ -3789,6 +3752,10 @@ int try_release_extent_buffer(struct extent_io_tree *tree, struct page *page)
ret = 0;
goto out;
}
+ if (test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) {
+ ret = 0;
+ goto out;
+ }
/* at this point we can safely release the extent buffer */
num_pages = num_extent_pages(eb->start, eb->len);
for (i = 0; i < num_pages; i++)
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 1f9df88afbf..5bc20abf3f3 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -25,6 +25,7 @@
/* these are bit numbers for test/set bit */
#define EXTENT_BUFFER_UPTODATE 0
#define EXTENT_BUFFER_BLOCKING 1
+#define EXTENT_BUFFER_DIRTY 2
/*
* page->private values. Every page that is controlled by the extent
@@ -254,6 +255,8 @@ int clear_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb);
int set_extent_buffer_dirty(struct extent_io_tree *tree,
struct extent_buffer *eb);
+int test_extent_buffer_dirty(struct extent_io_tree *tree,
+ struct extent_buffer *eb);
int set_extent_buffer_uptodate(struct extent_io_tree *tree,
struct extent_buffer *eb);
int clear_extent_buffer_uptodate(struct extent_io_tree *tree,
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 964652435fd..9b99886562d 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -52,6 +52,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans,
file_key.offset = pos;
btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
sizeof(*item));
if (ret < 0)
@@ -523,6 +524,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
key.offset = end_byte - 1;
key.type = BTRFS_EXTENT_CSUM_KEY;
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
if (path->slots[0] == 0)
@@ -757,8 +759,10 @@ insert:
} else {
ins_size = csum_size;
}
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &file_key,
ins_size);
+ path->leave_spinning = 0;
if (ret < 0)
goto fail_unlock;
if (ret != 0) {
@@ -776,7 +780,6 @@ found:
item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
btrfs_item_size_nr(leaf, path->slots[0]));
eb_token = NULL;
- cond_resched();
next_sector:
if (!eb_token ||
@@ -817,9 +820,9 @@ next_sector:
eb_token = NULL;
}
btrfs_mark_buffer_dirty(path->nodes[0]);
- cond_resched();
if (total_bytes < sums->len) {
btrfs_release_path(root, path);
+ cond_resched();
goto again;
}
out:
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index dc78954861b..9c9fb46ccd0 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -606,6 +606,7 @@ next_slot:
btrfs_set_key_type(&ins, BTRFS_EXTENT_DATA_KEY);
btrfs_release_path(root, path);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &ins,
sizeof(*extent));
BUG_ON(ret);
@@ -639,17 +640,22 @@ next_slot:
ram_bytes);
btrfs_set_file_extent_type(leaf, extent, found_type);
+ btrfs_unlock_up_safe(path, 1);
btrfs_mark_buffer_dirty(path->nodes[0]);
+ btrfs_set_lock_blocking(path->nodes[0]);
if (disk_bytenr != 0) {
ret = btrfs_update_extent_ref(trans, root,
- disk_bytenr, orig_parent,
+ disk_bytenr,
+ le64_to_cpu(old.disk_num_bytes),
+ orig_parent,
leaf->start,
root->root_key.objectid,
trans->transid, ins.objectid);
BUG_ON(ret);
}
+ path->leave_spinning = 0;
btrfs_release_path(root, path);
if (disk_bytenr != 0)
inode_add_bytes(inode, extent_end - end);
@@ -912,7 +918,7 @@ again:
btrfs_set_file_extent_other_encoding(leaf, fi, 0);
if (orig_parent != leaf->start) {
- ret = btrfs_update_extent_ref(trans, root, bytenr,
+ ret = btrfs_update_extent_ref(trans, root, bytenr, num_bytes,
orig_parent, leaf->start,
root->root_key.objectid,
trans->transid, inode->i_ino);
@@ -1155,6 +1161,20 @@ out_nolock:
page_cache_release(pinned[1]);
*ppos = pos;
+ /*
+ * we want to make sure fsync finds this change
+ * but we haven't joined a transaction running right now.
+ *
+ * Later on, someone is sure to update the inode and get the
+ * real transid recorded.
+ *
+ * We set last_trans now to the fs_info generation + 1,
+ * this will either be one more than the running transaction
+ * or the generation used for the next transaction if there isn't
+ * one running right now.
+ */
+ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
+
if (num_written > 0 && will_write) {
struct btrfs_trans_handle *trans;
@@ -1167,8 +1187,11 @@ out_nolock:
ret = btrfs_log_dentry_safe(trans, root,
file->f_dentry);
if (ret == 0) {
- btrfs_sync_log(trans, root);
- btrfs_end_transaction(trans, root);
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ btrfs_end_transaction(trans, root);
+ else
+ btrfs_commit_transaction(trans, root);
} else {
btrfs_commit_transaction(trans, root);
}
@@ -1185,6 +1208,18 @@ out_nolock:
int btrfs_release_file(struct inode *inode, struct file *filp)
{
+ /*
+ * ordered_data_close is set by settattr when we are about to truncate
+ * a file from a non-zero size to a zero size. This tries to
+ * flush down new bytes that may have been written if the
+ * application were using truncate to replace a file in place.
+ */
+ if (BTRFS_I(inode)->ordered_data_close) {
+ BTRFS_I(inode)->ordered_data_close = 0;
+ btrfs_add_ordered_operation(NULL, BTRFS_I(inode)->root, inode);
+ if (inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(inode->i_mapping);
+ }
if (filp->private_data)
btrfs_ioctl_trans_end(filp);
return 0;
@@ -1260,8 +1295,11 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync)
if (ret > 0) {
ret = btrfs_commit_transaction(trans, root);
} else {
- btrfs_sync_log(trans, root);
- ret = btrfs_end_transaction(trans, root);
+ ret = btrfs_sync_log(trans, root);
+ if (ret == 0)
+ ret = btrfs_end_transaction(trans, root);
+ else
+ ret = btrfs_commit_transaction(trans, root);
}
mutex_lock(&dentry->d_inode->i_mutex);
out:
diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c
index 3d46fa1f29a..6b627c61180 100644
--- a/fs/btrfs/inode-item.c
+++ b/fs/btrfs/inode-item.c
@@ -73,6 +73,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
+
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
ret = -ENOENT;
@@ -127,6 +129,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_item(trans, root, path, &key,
ins_len);
if (ret == -EEXIST) {
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 17e608c4dc7..06d8db5afb0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -134,6 +134,7 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
if (!path)
return -ENOMEM;
+ path->leave_spinning = 1;
btrfs_set_trans_block_group(trans, inode);
key.objectid = inode->i_ino;
@@ -167,9 +168,9 @@ static noinline int insert_inline_extent(struct btrfs_trans_handle *trans,
cur_size = min_t(unsigned long, compressed_size,
PAGE_CACHE_SIZE);
- kaddr = kmap(cpage);
+ kaddr = kmap_atomic(cpage, KM_USER0);
write_extent_buffer(leaf, kaddr, ptr, cur_size);
- kunmap(cpage);
+ kunmap_atomic(kaddr, KM_USER0);
i++;
ptr += cur_size;
@@ -204,7 +205,7 @@ fail:
* does the checks required to make sure the data is small enough
* to fit as an inline extent.
*/
-static int cow_file_range_inline(struct btrfs_trans_handle *trans,
+static noinline int cow_file_range_inline(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct inode *inode, u64 start, u64 end,
size_t compressed_size,
@@ -854,11 +855,6 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page,
u64 cur_end;
int limit = 10 * 1024 * 1042;
- if (!btrfs_test_opt(root, COMPRESS)) {
- return cow_file_range(inode, locked_page, start, end,
- page_started, nr_written, 1);
- }
-
clear_extent_bit(&BTRFS_I(inode)->io_tree, start, end, EXTENT_LOCKED |
EXTENT_DELALLOC, 1, 0, GFP_NOFS);
while (start < end) {
@@ -935,7 +931,8 @@ static noinline int csum_exist_in_range(struct btrfs_root *root,
* If no cow copies or snapshots exist, we write directly to the existing
* blocks on disk
*/
-static int run_delalloc_nocow(struct inode *inode, struct page *locked_page,
+static noinline int run_delalloc_nocow(struct inode *inode,
+ struct page *locked_page,
u64 start, u64 end, int *page_started, int force,
unsigned long *nr_written)
{
@@ -1133,6 +1130,7 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
unsigned long *nr_written)
{
int ret;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
if (btrfs_test_flag(inode, NODATACOW))
ret = run_delalloc_nocow(inode, locked_page, start, end,
@@ -1140,10 +1138,12 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page,
else if (btrfs_test_flag(inode, PREALLOC))
ret = run_delalloc_nocow(inode, locked_page, start, end,
page_started, 0, nr_written);
+ else if (!btrfs_test_opt(root, COMPRESS))
+ ret = cow_file_range(inode, locked_page, start, end,
+ page_started, nr_written, 1);
else
ret = cow_file_range_async(inode, locked_page, start, end,
page_started, nr_written);
-
return ret;
}
@@ -1453,6 +1453,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
ret = btrfs_drop_extents(trans, root, inode, file_pos,
file_pos + num_bytes, file_pos, &hint);
BUG_ON(ret);
@@ -1475,6 +1476,10 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
btrfs_set_file_extent_compression(leaf, fi, compression);
btrfs_set_file_extent_encryption(leaf, fi, encryption);
btrfs_set_file_extent_other_encoding(leaf, fi, other_encoding);
+
+ btrfs_unlock_up_safe(path, 1);
+ btrfs_set_lock_blocking(leaf);
+
btrfs_mark_buffer_dirty(leaf);
inode_add_bytes(inode, num_bytes);
@@ -1487,11 +1492,35 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans,
root->root_key.objectid,
trans->transid, inode->i_ino, &ins);
BUG_ON(ret);
-
btrfs_free_path(path);
+
return 0;
}
+/*
+ * helper function for btrfs_finish_ordered_io, this
+ * just reads in some of the csum leaves to prime them into ram
+ * before we start the transaction. It limits the amount of btree
+ * reads required while inside the transaction.
+ */
+static noinline void reada_csum(struct btrfs_root *root,
+ struct btrfs_path *path,
+ struct btrfs_ordered_extent *ordered_extent)
+{
+ struct btrfs_ordered_sum *sum;
+ u64 bytenr;
+
+ sum = list_entry(ordered_extent->list.next, struct btrfs_ordered_sum,
+ list);
+ bytenr = sum->sums[0].bytenr;
+
+ /*
+ * we don't care about the results, the point of this search is
+ * just to get the btree leaves into ram
+ */
+ btrfs_lookup_csum(NULL, root->fs_info->csum_root, path, bytenr, 0);
+}
+
/* as ordered data IO finishes, this gets called so we can finish
* an ordered extent if the range of bytes in the file it covers are
* fully written.
@@ -1500,8 +1529,9 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_trans_handle *trans;
- struct btrfs_ordered_extent *ordered_extent;
+ struct btrfs_ordered_extent *ordered_extent = NULL;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+ struct btrfs_path *path;
int compressed = 0;
int ret;
@@ -1509,9 +1539,33 @@ static int btrfs_finish_ordered_io(struct inode *inode, u64 start, u64 end)
if (!ret)
return 0;
+ /*
+ * before we join the transaction, try to do some of our IO.
+ * This will limit the amount of IO that we have to do with
+ * the transaction running. We're unlikely to need to do any
+ * IO if the file extents are new, the disk_i_size checks
+ * covers the most common case.
+ */
+ if (start < BTRFS_I(inode)->disk_i_size) {
+ path = btrfs_alloc_path();
+ if (path) {
+ ret = btrfs_lookup_file_extent(NULL, root, path,
+ inode->i_ino,
+ start, 0);
+ ordered_extent = btrfs_lookup_ordered_extent(inode,
+ start);
+ if (!list_empty(&ordered_extent->list)) {
+ btrfs_release_path(root, path);
+ reada_csum(root, path, ordered_extent);
+ }
+ btrfs_free_path(path);
+ }
+ }
+
trans = btrfs_join_transaction(root, 1);
- ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+ if (!ordered_extent)
+ ordered_extent = btrfs_lookup_ordered_extent(inode, start);
BUG_ON(!ordered_extent);
if (test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags))
goto nocow;
@@ -2101,6 +2155,7 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans,
path = btrfs_alloc_path();
BUG_ON(!path);
+ path->leave_spinning = 1;
ret = btrfs_lookup_inode(trans, root, path,
&BTRFS_I(inode)->location, 1);
if (ret) {
@@ -2147,6 +2202,7 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
goto err;
}
+ path->leave_spinning = 1;
di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
name, name_len, -1);
if (IS_ERR(di)) {
@@ -2190,8 +2246,6 @@ int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref_in_log(trans, root, name, name_len,
inode, dir->i_ino);
BUG_ON(ret != 0 && ret != -ENOENT);
- if (ret != -ENOENT)
- BTRFS_I(dir)->log_dirty_trans = trans->transid;
ret = btrfs_del_dir_entries_in_log(trans, root, name, name_len,
dir, index);
@@ -2224,6 +2278,9 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry)
trans = btrfs_start_transaction(root, 1);
btrfs_set_trans_block_group(trans, dir);
+
+ btrfs_record_unlink_dir(trans, dir, dentry->d_inode, 0);
+
ret = btrfs_unlink_inode(trans, root, dir, dentry->d_inode,
dentry->d_name.name, dentry->d_name.len);
@@ -2498,6 +2555,7 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
key.type = (u8)-1;
search_again:
+ path->leave_spinning = 1;
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret < 0)
goto error;
@@ -2644,6 +2702,7 @@ delete:
break;
}
if (found_extent) {
+ btrfs_set_path_blocking(path);
ret = btrfs_free_extent(trans, root, extent_start,
extent_num_bytes,
leaf->start, root_owner,
@@ -2848,11 +2907,21 @@ static int btrfs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- if (S_ISREG(inode->i_mode) &&
- attr->ia_valid & ATTR_SIZE && attr->ia_size > inode->i_size) {
- err = btrfs_cont_expand(inode, attr->ia_size);
- if (err)
- return err;
+ if (S_ISREG(inode->i_mode) && (attr->ia_valid & ATTR_SIZE)) {
+ if (attr->ia_size > inode->i_size) {
+ err = btrfs_cont_expand(inode, attr->ia_size);
+ if (err)
+ return err;
+ } else if (inode->i_size > 0 &&
+ attr->ia_size == 0) {
+
+ /* we're truncating a file that used to have good
+ * data down to zero. Make sure it gets into
+ * the ordered flush list so that any new writes
+ * get down to disk quickly.
+ */
+ BTRFS_I(inode)->ordered_data_close = 1;
+ }
}
err = inode_setattr(inode, attr);
@@ -2984,13 +3053,14 @@ static noinline void init_btrfs_i(struct inode *inode)
bi->disk_i_size = 0;
bi->flags = 0;
bi->index_cnt = (u64)-1;
- bi->log_dirty_trans = 0;
+ bi->last_unlink_trans = 0;
extent_map_tree_init(&BTRFS_I(inode)->extent_tree, GFP_NOFS);
extent_io_tree_init(&BTRFS_I(inode)->io_tree,
inode->i_mapping, GFP_NOFS);
extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
inode->i_mapping, GFP_NOFS);
INIT_LIST_HEAD(&BTRFS_I(inode)->delalloc_inodes);
+ INIT_LIST_HEAD(&BTRFS_I(inode)->ordered_operations);
btrfs_ordered_inode_tree_init(&BTRFS_I(inode)->ordered_tree);
mutex_init(&BTRFS_I(inode)->extent_mutex);
mutex_init(&BTRFS_I(inode)->log_mutex);
@@ -3449,6 +3519,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans,
sizes[0] = sizeof(struct btrfs_inode_item);
sizes[1] = name_len + sizeof(*ref);
+ path->leave_spinning = 1;
ret = btrfs_insert_empty_items(trans, root, path, key, sizes, 2);
if (ret != 0)
goto fail;
@@ -3727,6 +3798,8 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir,
drop_inode = 1;
nr = trans->blocks_used;
+
+ btrfs_log_new_name(trans, inode, NULL, dentry->d_parent);
btrfs_end_transaction_throttle(trans, root);
fail:
if (drop_inode) {
@@ -4363,6 +4436,8 @@ again:
}
ClearPageChecked(page);
set_page_dirty(page);
+
+ BTRFS_I(inode)->last_trans = root->fs_info->generation + 1;
unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
out_unlock:
@@ -4388,6 +4463,27 @@ static void btrfs_truncate(struct inode *inode)
btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
trans = btrfs_start_transaction(root, 1);
+
+ /*
+ * setattr is responsible for setting the ordered_data_close flag,
+ * but that is only tested during the last file release. That
+ * could happen well after the next commit, leaving a great big
+ * window where new writes may get lost if someone chooses to write
+ * to this file after truncating to zero
+ *
+ * The inode doesn't have any dirty data here, and so if we commit
+ * this is a noop. If someone immediately starts writing to the inode
+ * it is very likely we'll catch some of their writes in this
+ * transaction, and the commit will find this file on the ordered
+ * data list with good things to send down.
+ *
+ * This is a best effort solution, there is still a window where
+ * using truncate to replace the contents of the file will
+ * end up with a zero length file after a crash.
+ */
+ if (inode->i_size == 0 && BTRFS_I(inode)->ordered_data_close)
+ btrfs_add_ordered_operation(trans, root, inode);
+
btrfs_set_trans_block_group(trans, inode);
btrfs_i_size_write(inode, inode->i_size);
@@ -4464,12 +4560,15 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
ei->i_acl = BTRFS_ACL_NOT_CACHED;
ei->i_default_acl = BTRFS_ACL_NOT_CACHED;
INIT_LIST_HEAD(&ei->i_orphan);
+ INIT_LIST_HEAD(&ei->ordered_operations);
return &ei->vfs_inode;
}
void btrfs_destroy_inode(struct inode *inode)
{
struct btrfs_ordered_extent *ordered;
+ struct btrfs_root *root = BTRFS_I(inode)->root;
+
WARN_ON(!list_empty(&inode->i_dentry));
WARN_ON(inode->i_data.nrpages);
@@ -4480,13 +4579,24 @@ void btrfs_destroy_inode(struct inode *inode)
BTRFS_I(inode)->i_default_acl != BTRFS_ACL_NOT_CACHED)
posix_acl_release(BTRFS_I(inode)->i_default_acl);
- spin_lock(&BTRFS_I(inode)->root->list_lock);
+ /*
+ * Make sure we're properly removed from the ordered operation
+ * lists.
+ */
+ smp_mb();
+ if (!list_empty(&BTRFS_I(inode)->ordered_operations)) {
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ list_del_init(&BTRFS_I(inode)->ordered_operations);
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ }
+
+ spin_lock(&root->list_lock);
if (!list_empty(&BTRFS_I(inode)->i_orphan)) {
printk(KERN_ERR "BTRFS: inode %lu: inode still on the orphan"
" list\n", inode->i_ino);
dump_stack();
}
- spin_unlock(&BTRFS_I(inode)->root->list_lock);
+ spin_unlock(&root->list_lock);
while (1) {
ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
@@ -4611,8 +4721,36 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_unlock;
+ /*
+ * we're using rename to replace one file with another.
+ * and the replacement file is large. Start IO on it now so
+ * we don't add too much work to the end of the transaction
+ */
+ if (new_inode && old_inode && S_ISREG(old_inode->i_mode) &&
+ new_inode->i_size &&
+ old_inode->i_size > BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT)
+ filemap_flush(old_inode->i_mapping);
+
trans = btrfs_start_transaction(root, 1);
+ /*
+ * make sure the inode gets flushed if it is replacing
+ * something.
+ */
+ if (new_inode && new_inode->i_size &&
+ old_inode && S_ISREG(old_inode->i_mode)) {
+ btrfs_add_ordered_operation(trans, root, old_inode);
+ }
+
+ /*
+ * this is an ugly little race, but the rename is required to make
+ * sure that if we crash, the inode is either at the old name
+ * or the new one. pinning the log transaction lets us make sure
+ * we don't allow a log commit to come in after we unlink the
+ * name but before we add the new name back in.
+ */
+ btrfs_pin_log_trans(root);
+
btrfs_set_trans_block_group(trans, new_dir);
btrfs_inc_nlink(old_dentry->d_inode);
@@ -4620,6 +4758,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
new_dir->i_ctime = new_dir->i_mtime = ctime;
old_inode->i_ctime = ctime;
+ if (old_dentry->d_parent != new_dentry->d_parent)
+ btrfs_record_unlink_dir(trans, old_dir, old_inode, 1);
+
ret = btrfs_unlink_inode(trans, root, old_dir, old_dentry->d_inode,
old_dentry->d_name.name,
old_dentry->d_name.len);
@@ -4651,7 +4792,14 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
if (ret)
goto out_fail;
+ btrfs_log_new_name(trans, old_inode, old_dir,
+ new_dentry->d_parent);
out_fail:
+
+ /* this btrfs_end_log_trans just allows the current
+ * log-sub transaction to complete
+ */
+ btrfs_end_log_trans(root);
btrfs_end_transaction_throttle(trans, root);
out_unlock:
return ret;
diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c
index 47b0a88c12a..a5310c0f41e 100644
--- a/fs/btrfs/locking.c
+++ b/fs/btrfs/locking.c
@@ -71,12 +71,13 @@ void btrfs_clear_lock_blocking(struct extent_buffer *eb)
static int btrfs_spin_on_block(struct extent_buffer *eb)
{
int i;
+
for (i = 0; i < 512; i++) {
- cpu_relax();
if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
return 1;
if (need_resched())
break;
+ cpu_relax();
}
return 0;
}
@@ -95,13 +96,15 @@ int btrfs_try_spin_lock(struct extent_buffer *eb)
{
int i;
- spin_nested(eb);
- if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
- return 1;
- spin_unlock(&eb->lock);
-
+ if (btrfs_spin_on_block(eb)) {
+ spin_nested(eb);
+ if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags))
+ return 1;
+ spin_unlock(&eb->lock);
+ }
/* spin for a bit on the BLOCKING flag */
for (i = 0; i < 2; i++) {
+ cpu_relax();
if (!btrfs_spin_on_block(eb))
break;
@@ -148,6 +151,9 @@ int btrfs_tree_lock(struct extent_buffer *eb)
DEFINE_WAIT(wait);
wait.func = btrfs_wake_function;
+ if (!btrfs_spin_on_block(eb))
+ goto sleep;
+
while(1) {
spin_nested(eb);
@@ -165,9 +171,10 @@ int btrfs_tree_lock(struct extent_buffer *eb)
* spin for a bit, and if the blocking flag goes away,
* loop around
*/
+ cpu_relax();
if (btrfs_spin_on_block(eb))
continue;
-
+sleep:
prepare_to_wait_exclusive(&eb->lock_wq, &wait,
TASK_UNINTERRUPTIBLE);
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 77c2411a5f0..53c87b197d7 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -310,6 +310,16 @@ int btrfs_remove_ordered_extent(struct inode *inode,
spin_lock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
list_del_init(&entry->root_extent_list);
+
+ /*
+ * we have no more ordered extents for this inode and
+ * no dirty pages. We can safely remove it from the
+ * list of ordered extents
+ */
+ if (RB_EMPTY_ROOT(&tree->tree) &&
+ !mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY)) {
+ list_del_init(&BTRFS_I(inode)->ordered_operations);
+ }
spin_unlock(&BTRFS_I(inode)->root->fs_info->ordered_extent_lock);
mutex_unlock(&tree->mutex);
@@ -370,6 +380,68 @@ int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only)
}
/*
+ * this is used during transaction commit to write all the inodes
+ * added to the ordered operation list. These files must be fully on
+ * disk before the transaction commits.
+ *
+ * we have two modes here, one is to just start the IO via filemap_flush
+ * and the other is to wait for all the io. When we wait, we have an
+ * extra check to make sure the ordered operation list really is empty
+ * before we return
+ */
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait)
+{
+ struct btrfs_inode *btrfs_inode;
+ struct inode *inode;
+ struct list_head splice;
+
+ INIT_LIST_HEAD(&splice);
+
+ mutex_lock(&root->fs_info->ordered_operations_mutex);
+ spin_lock(&root->fs_info->ordered_extent_lock);
+again:
+ list_splice_init(&root->fs_info->ordered_operations, &splice);
+
+ while (!list_empty(&splice)) {
+ btrfs_inode = list_entry(splice.next, struct btrfs_inode,
+ ordered_operations);
+
+ inode = &btrfs_inode->vfs_inode;
+
+ list_del_init(&btrfs_inode->ordered_operations);
+
+ /*
+ * the inode may be getting freed (in sys_unlink path).
+ */
+ inode = igrab(inode);
+
+ if (!wait && inode) {
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &root->fs_info->ordered_operations);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ if (inode) {
+ if (wait)
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ else
+ filemap_flush(inode->i_mapping);
+ iput(inode);
+ }
+
+ cond_resched();
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ }
+ if (wait && !list_empty(&root->fs_info->ordered_operations))
+ goto again;
+
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+ mutex_unlock(&root->fs_info->ordered_operations_mutex);
+
+ return 0;
+}
+
+/*
* Used to start IO or wait for a given ordered extent to finish.
*
* If wait is one, this effectively waits on page writeback for all the pages
@@ -726,3 +798,49 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
return ret;
}
+
+/*
+ * add a given inode to the list of inodes that must be fully on
+ * disk before a transaction commit finishes.
+ *
+ * This basically gives us the ext3 style data=ordered mode, and it is mostly
+ * used to make sure renamed files are fully on disk.
+ *
+ * It is a noop if the inode is already fully on disk.
+ *
+ * If trans is not null, we'll do a friendly check for a transaction that
+ * is already flushing things and force the IO down ourselves.
+ */
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode)
+{
+ u64 last_mod;
+
+ last_mod = max(BTRFS_I(inode)->generation, BTRFS_I(inode)->last_trans);
+
+ /*
+ * if this file hasn't been changed since the last transaction
+ * commit, we can safely return without doing anything
+ */
+ if (last_mod < root->fs_info->last_trans_committed)
+ return 0;
+
+ /*
+ * the transaction is already committing. Just start the IO and
+ * don't bother with all of this list nonsense
+ */
+ if (trans && root->fs_info->running_transaction->blocked) {
+ btrfs_wait_ordered_range(inode, 0, (u64)-1);
+ return 0;
+ }
+
+ spin_lock(&root->fs_info->ordered_extent_lock);
+ if (list_empty(&BTRFS_I(inode)->ordered_operations)) {
+ list_add_tail(&BTRFS_I(inode)->ordered_operations,
+ &root->fs_info->ordered_operations);
+ }
+ spin_unlock(&root->fs_info->ordered_extent_lock);
+
+ return 0;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index ab66d5e8d6d..3d31c8827b0 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -155,4 +155,8 @@ int btrfs_wait_on_page_writeback_range(struct address_space *mapping,
int btrfs_fdatawrite_range(struct address_space *mapping, loff_t start,
loff_t end, int sync_mode);
int btrfs_wait_ordered_extents(struct btrfs_root *root, int nocow_only);
+int btrfs_run_ordered_operations(struct btrfs_root *root, int wait);
+int btrfs_add_ordered_operation(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct inode *inode);
#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 4112d53d4f4..664782c6a2d 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -65,6 +65,15 @@ static noinline int join_transaction(struct btrfs_root *root)
cur_trans->use_count = 1;
cur_trans->commit_done = 0;
cur_trans->start_time = get_seconds();
+
+ cur_trans->delayed_refs.root.rb_node = NULL;
+ cur_trans->delayed_refs.num_entries = 0;
+ cur_trans->delayed_refs.num_heads_ready = 0;
+ cur_trans->delayed_refs.num_heads = 0;
+ cur_trans->delayed_refs.flushing = 0;
+ cur_trans->delayed_refs.run_delayed_start = 0;
+ spin_lock_init(&cur_trans->delayed_refs.lock);
+
INIT_LIST_HEAD(&cur_trans->pending_snapshots);
list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
extent_io_tree_init(&cur_trans->dirty_pages,
@@ -182,6 +191,8 @@ static struct btrfs_trans_handle *start_transaction(struct btrfs_root *root,
h->block_group = 0;
h->alloc_exclude_nr = 0;
h->alloc_exclude_start = 0;
+ h->delayed_ref_updates = 0;
+
root->fs_info->running_transaction->use_count++;
mutex_unlock(&root->fs_info->trans_mutex);
return h;
@@ -271,7 +282,6 @@ void btrfs_throttle(struct btrfs_root *root)
if (!root->fs_info->open_ioctl_trans)
wait_current_trans(root);
mutex_unlock(&root->fs_info->trans_mutex);
-
throttle_on_drops(root);
}
@@ -280,6 +290,27 @@ static int __btrfs_end_transaction(struct btrfs_trans_handle *trans,
{
struct btrfs_transaction *cur_trans;
struct btrfs_fs_info *info = root->fs_info;
+ int count = 0;
+
+ while (count < 4) {
+ unsigned long cur = trans->delayed_ref_updates;
+ trans->delayed_ref_updates = 0;
+ if (cur &&
+ trans->transaction->delayed_refs.num_heads_ready > 64) {
+ trans->delayed_ref_updates = 0;
+
+ /*
+ * do a full flush if the transaction is trying
+ * to close
+ */
+ if (trans->transaction->delayed_refs.flushing)
+ cur = 0;
+ btrfs_run_delayed_refs(trans, root, cur);
+ } else {
+ break;
+ }
+ count++;
+ }
mutex_lock(&info->trans_mutex);
cur_trans = info->running_transaction;
@@ -424,9 +455,10 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
u64 old_root_bytenr;
struct btrfs_root *tree_root = root->fs_info->tree_root;
- btrfs_extent_post_op(trans, root);
btrfs_write_dirty_block_groups(trans, root);
- btrfs_extent_post_op(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
while (1) {
old_root_bytenr = btrfs_root_bytenr(&root->root_item);
@@ -438,14 +470,14 @@ static int update_cowonly_root(struct btrfs_trans_handle *trans,
btrfs_header_level(root->node));
btrfs_set_root_generation(&root->root_item, trans->transid);
- btrfs_extent_post_op(trans, root);
-
ret = btrfs_update_root(trans, tree_root,
&root->root_key,
&root->root_item);
BUG_ON(ret);
btrfs_write_dirty_block_groups(trans, root);
- btrfs_extent_post_op(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
}
return 0;
}
@@ -459,15 +491,18 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info = root->fs_info;
struct list_head *next;
struct extent_buffer *eb;
+ int ret;
- btrfs_extent_post_op(trans, fs_info->tree_root);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
eb = btrfs_lock_root_node(fs_info->tree_root);
- btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb, 0);
+ btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, 0, &eb);
btrfs_tree_unlock(eb);
free_extent_buffer(eb);
- btrfs_extent_post_op(trans, fs_info->tree_root);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
while (!list_empty(&fs_info->dirty_cowonly_roots)) {
next = fs_info->dirty_cowonly_roots.next;
@@ -475,6 +510,9 @@ int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
root = list_entry(next, struct btrfs_root, dirty_list);
update_cowonly_root(trans, root);
+
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
}
return 0;
}
@@ -635,6 +673,31 @@ int btrfs_defrag_root(struct btrfs_root *root, int cacheonly)
}
/*
+ * when dropping snapshots, we generate a ton of delayed refs, and it makes
+ * sense not to join the transaction while it is trying to flush the current
+ * queue of delayed refs out.
+ *
+ * This is used by the drop snapshot code only
+ */
+static noinline int wait_transaction_pre_flush(struct btrfs_fs_info *info)
+{
+ DEFINE_WAIT(wait);
+
+ mutex_lock(&info->trans_mutex);
+ while (info->running_transaction &&
+ info->running_transaction->delayed_refs.flushing) {
+ prepare_to_wait(&info->transaction_wait, &wait,
+ TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&info->trans_mutex);
+ schedule();
+ mutex_lock(&info->trans_mutex);
+ finish_wait(&info->transaction_wait, &wait);
+ }
+ mutex_unlock(&info->trans_mutex);
+ return 0;
+}
+
+/*
* Given a list of roots that need to be deleted, call btrfs_drop_snapshot on
* all of them
*/
@@ -661,7 +724,22 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root,
atomic_inc(&root->fs_info->throttles);
while (1) {
+ /*
+ * we don't want to jump in and create a bunch of
+ * delayed refs if the transaction is starting to close
+ */
+ wait_transaction_pre_flush(tree_root->fs_info);
trans = btrfs_start_transaction(tree_root, 1);
+
+ /*
+ * we've joined a transaction, make sure it isn't
+ * closing right now
+ */
+ if (trans->transaction->delayed_refs.flushing) {
+ btrfs_end_transaction(trans, tree_root);
+ continue;
+ }
+
mutex_lock(&root->fs_info->drop_mutex);
ret = btrfs_drop_snapshot(trans, dirty->root);
if (ret != -EAGAIN)
@@ -766,7 +844,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
old = btrfs_lock_root_node(root);
- btrfs_cow_block(trans, root, old, NULL, 0, &old, 0);
+ btrfs_cow_block(trans, root, old, NULL, 0, &old);
btrfs_copy_root(trans, root, old, &tmp, objectid);
btrfs_tree_unlock(old);
@@ -894,12 +972,31 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
struct extent_io_tree *pinned_copy;
DEFINE_WAIT(wait);
int ret;
+ int should_grow = 0;
+ unsigned long now = get_seconds();
+
+ btrfs_run_ordered_operations(root, 0);
+
+ /* make a pass through all the delayed refs we have so far
+ * any runnings procs may add more while we are here
+ */
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ BUG_ON(ret);
+
+ cur_trans = trans->transaction;
+ /*
+ * set the flushing flag so procs in this transaction have to
+ * start sending their work down.
+ */
+ cur_trans->delayed_refs.flushing = 1;
+
+ ret = btrfs_run_delayed_refs(trans, root, 0);
+ BUG_ON(ret);
- INIT_LIST_HEAD(&dirty_fs_roots);
mutex_lock(&root->fs_info->trans_mutex);
- if (trans->transaction->in_commit) {
- cur_trans = trans->transaction;
- trans->transaction->use_count++;
+ INIT_LIST_HEAD(&dirty_fs_roots);
+ if (cur_trans->in_commit) {
+ cur_trans->use_count++;
mutex_unlock(&root->fs_info->trans_mutex);
btrfs_end_transaction(trans, root);
@@ -922,7 +1019,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
trans->transaction->in_commit = 1;
trans->transaction->blocked = 1;
- cur_trans = trans->transaction;
if (cur_trans->list.prev != &root->fs_info->trans_list) {
prev_trans = list_entry(cur_trans->list.prev,
struct btrfs_transaction, list);
@@ -937,6 +1033,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
}
}
+ if (now < cur_trans->start_time || now - cur_trans->start_time < 1)
+ should_grow = 1;
+
do {
int snap_pending = 0;
joined = cur_trans->num_joined;
@@ -949,7 +1048,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
if (cur_trans->num_writers > 1)
timeout = MAX_SCHEDULE_TIMEOUT;
- else
+ else if (should_grow)
timeout = 1;
mutex_unlock(&root->fs_info->trans_mutex);
@@ -959,16 +1058,30 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
BUG_ON(ret);
}
- schedule_timeout(timeout);
+ /*
+ * rename don't use btrfs_join_transaction, so, once we
+ * set the transaction to blocked above, we aren't going
+ * to get any new ordered operations. We can safely run
+ * it here and no for sure that nothing new will be added
+ * to the list
+ */
+ btrfs_run_ordered_operations(root, 1);
+
+ smp_mb();
+ if (cur_trans->num_writers > 1 || should_grow)
+ schedule_timeout(timeout);
mutex_lock(&root->fs_info->trans_mutex);
finish_wait(&cur_trans->writer_wait, &wait);
} while (cur_trans->num_writers > 1 ||
- (cur_trans->num_joined != joined));
+ (should_grow && cur_trans->num_joined != joined));
ret = create_pending_snapshots(trans, root->fs_info);
BUG_ON(ret);
+ ret = btrfs_run_delayed_refs(trans, root, (unsigned long)-1);
+ BUG_ON(ret);
+
WARN_ON(cur_trans != trans->transaction);
/* btrfs_commit_tree_roots is responsible for getting the
@@ -1032,6 +1145,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
btrfs_copy_pinned(root, pinned_copy);
trans->transaction->blocked = 0;
+
wake_up(&root->fs_info->transaction_throttle);
wake_up(&root->fs_info->transaction_wait);
@@ -1058,6 +1172,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
mutex_lock(&root->fs_info->trans_mutex);
cur_trans->commit_done = 1;
+
root->fs_info->last_trans_committed = cur_trans->transid;
wake_up(&cur_trans->commit_wait);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index ea292117f88..94f5bde2b58 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,10 +19,16 @@
#ifndef __BTRFS_TRANSACTION__
#define __BTRFS_TRANSACTION__
#include "btrfs_inode.h"
+#include "delayed-ref.h"
struct btrfs_transaction {
u64 transid;
+ /*
+ * total writers in this transaction, it must be zero before the
+ * transaction can end
+ */
unsigned long num_writers;
+
unsigned long num_joined;
int in_commit;
int use_count;
@@ -34,6 +40,7 @@ struct btrfs_transaction {
wait_queue_head_t writer_wait;
wait_queue_head_t commit_wait;
struct list_head pending_snapshots;
+ struct btrfs_delayed_ref_root delayed_refs;
};
struct btrfs_trans_handle {
@@ -44,6 +51,7 @@ struct btrfs_trans_handle {
u64 block_group;
u64 alloc_exclude_start;
u64 alloc_exclude_nr;
+ unsigned long delayed_ref_updates;
};
struct btrfs_pending_snapshot {
diff --git a/fs/btrfs/tree-defrag.c b/fs/btrfs/tree-defrag.c
index 98d25fa4570..b10eacdb162 100644
--- a/fs/btrfs/tree-defrag.c
+++ b/fs/btrfs/tree-defrag.c
@@ -124,8 +124,6 @@ int btrfs_defrag_leaves(struct btrfs_trans_handle *trans,
}
btrfs_release_path(root, path);
- if (is_extent)
- btrfs_extent_post_op(trans, root);
out:
if (path)
btrfs_free_path(path);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 9c462fbd60f..fc9b87a7975 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -35,6 +35,49 @@
#define LOG_INODE_EXISTS 1
/*
+ * directory trouble cases
+ *
+ * 1) on rename or unlink, if the inode being unlinked isn't in the fsync
+ * log, we must force a full commit before doing an fsync of the directory
+ * where the unlink was done.
+ * ---> record transid of last unlink/rename per directory
+ *
+ * mkdir foo/some_dir
+ * normal commit
+ * rename foo/some_dir foo2/some_dir
+ * mkdir foo/some_dir
+ * fsync foo/some_dir/some_file
+ *
+ * The fsync above will unlink the original some_dir without recording
+ * it in its new location (foo2). After a crash, some_dir will be gone
+ * unless the fsync of some_file forces a full commit
+ *
+ * 2) we must log any new names for any file or dir that is in the fsync
+ * log. ---> check inode while renaming/linking.
+ *
+ * 2a) we must log any new names for any file or dir during rename
+ * when the directory they are being removed from was logged.
+ * ---> check inode and old parent dir during rename
+ *
+ * 2a is actually the more important variant. With the extra logging
+ * a crash might unlink the old name without recreating the new one
+ *
+ * 3) after a crash, we must go through any directories with a link count
+ * of zero and redo the rm -rf
+ *
+ * mkdir f1/foo
+ * normal commit
+ * rm -rf f1/foo
+ * fsync(f1)
+ *
+ * The directory f1 was fully removed from the FS, but fsync was never
+ * called on f1, only its parent dir. After a crash the rm -rf must
+ * be replayed. This must be able to recurse down the entire
+ * directory tree. The inode link count fixup code takes care of the
+ * ugly details.
+ */
+
+/*
* stages for the tree walking. The first
* stage (0) is to only pin down the blocks we find
* the second stage (1) is to make sure that all the inodes
@@ -47,12 +90,17 @@
#define LOG_WALK_REPLAY_INODES 1
#define LOG_WALK_REPLAY_ALL 2
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only);
static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
+static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root,
+ struct btrfs_root *log,
+ struct btrfs_path *path,
+ u64 dirid, int del_all);
/*
* tree logging is a special write ahead log used to make sure that
@@ -133,10 +181,25 @@ static int join_running_log_trans(struct btrfs_root *root)
}
/*
+ * This either makes the current running log transaction wait
+ * until you call btrfs_end_log_trans() or it makes any future
+ * log transactions wait until you call btrfs_end_log_trans()
+ */
+int btrfs_pin_log_trans(struct btrfs_root *root)
+{
+ int ret = -ENOENT;
+
+ mutex_lock(&root->log_mutex);
+ atomic_inc(&root->log_writers);
+ mutex_unlock(&root->log_mutex);
+ return ret;
+}
+
+/*
* indicate we're done making changes to the log tree
* and wake up anyone waiting to do a sync
*/
-static int end_log_trans(struct btrfs_root *root)
+int btrfs_end_log_trans(struct btrfs_root *root)
{
if (atomic_dec_and_test(&root->log_writers)) {
smp_mb();
@@ -203,7 +266,6 @@ static int process_one_buffer(struct btrfs_root *log,
mutex_lock(&log->fs_info->pinned_mutex);
btrfs_update_pinned_extents(log->fs_info->extent_root,
eb->start, eb->len, 1);
- mutex_unlock(&log->fs_info->pinned_mutex);
}
if (btrfs_buffer_uptodate(eb, gen)) {
@@ -603,6 +665,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
ret = link_to_fixup_dir(trans, root, path, location.objectid);
BUG_ON(ret);
+
ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
BUG_ON(ret);
kfree(name);
@@ -804,6 +867,7 @@ conflict_again:
victim_name_len)) {
btrfs_inc_nlink(inode);
btrfs_release_path(root, path);
+
ret = btrfs_unlink_inode(trans, root, dir,
inode, victim_name,
victim_name_len);
@@ -922,13 +986,20 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
key.offset--;
btrfs_release_path(root, path);
}
- btrfs_free_path(path);
+ btrfs_release_path(root, path);
if (nlink != inode->i_nlink) {
inode->i_nlink = nlink;
btrfs_update_inode(trans, root, inode);
}
BTRFS_I(inode)->index_cnt = (u64)-1;
+ if (inode->i_nlink == 0 && S_ISDIR(inode->i_mode)) {
+ ret = replay_dir_deletes(trans, root, NULL, path,
+ inode->i_ino, 1);
+ BUG_ON(ret);
+ }
+ btrfs_free_path(path);
+
return 0;
}
@@ -971,9 +1042,12 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
iput(inode);
- if (key.offset == 0)
- break;
- key.offset--;
+ /*
+ * fixup on a directory may create new entries,
+ * make sure we always look for the highset possible
+ * offset
+ */
+ key.offset = (u64)-1;
}
btrfs_release_path(root, path);
return 0;
@@ -1313,11 +1387,11 @@ again:
read_extent_buffer(eb, name, (unsigned long)(di + 1),
name_len);
log_di = NULL;
- if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
+ if (log && dir_key->type == BTRFS_DIR_ITEM_KEY) {
log_di = btrfs_lookup_dir_item(trans, log, log_path,
dir_key->objectid,
name, name_len, 0);
- } else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
+ } else if (log && dir_key->type == BTRFS_DIR_INDEX_KEY) {
log_di = btrfs_lookup_dir_index_item(trans, log,
log_path,
dir_key->objectid,
@@ -1378,7 +1452,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *log,
struct btrfs_path *path,
- u64 dirid)
+ u64 dirid, int del_all)
{
u64 range_start;
u64 range_end;
@@ -1408,10 +1482,14 @@ again:
range_start = 0;
range_end = 0;
while (1) {
- ret = find_dir_range(log, path, dirid, key_type,
- &range_start, &range_end);
- if (ret != 0)
- break;
+ if (del_all)
+ range_end = (u64)-1;
+ else {
+ ret = find_dir_range(log, path, dirid, key_type,
+ &range_start, &range_end);
+ if (ret != 0)
+ break;
+ }
dir_key.offset = range_start;
while (1) {
@@ -1437,7 +1515,8 @@ again:
break;
ret = check_item_in_log(trans, root, log, path,
- log_path, dir, &found_key);
+ log_path, dir,
+ &found_key);
BUG_ON(ret);
if (found_key.offset == (u64)-1)
break;
@@ -1514,7 +1593,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
mode = btrfs_inode_mode(eb, inode_item);
if (S_ISDIR(mode)) {
ret = replay_dir_deletes(wc->trans,
- root, log, path, key.objectid);
+ root, log, path, key.objectid, 0);
BUG_ON(ret);
}
ret = overwrite_item(wc->trans, root, path,
@@ -1533,6 +1612,17 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
root, inode, inode->i_size,
BTRFS_EXTENT_DATA_KEY);
BUG_ON(ret);
+
+ /* if the nlink count is zero here, the iput
+ * will free the inode. We bump it to make
+ * sure it doesn't get freed until the link
+ * count fixup is done
+ */
+ if (inode->i_nlink == 0) {
+ btrfs_inc_nlink(inode);
+ btrfs_update_inode(wc->trans,
+ root, inode);
+ }
iput(inode);
}
ret = link_to_fixup_dir(wc->trans, root,
@@ -1840,7 +1930,8 @@ static int update_log_root(struct btrfs_trans_handle *trans,
return ret;
}
-static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
+static int wait_log_commit(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, unsigned long transid)
{
DEFINE_WAIT(wait);
int index = transid % 2;
@@ -1854,9 +1945,12 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
prepare_to_wait(&root->log_commit_wait[index],
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (root->log_transid < transid + 2 &&
+
+ if (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && root->log_transid < transid + 2 &&
atomic_read(&root->log_commit[index]))
schedule();
+
finish_wait(&root->log_commit_wait[index], &wait);
mutex_lock(&root->log_mutex);
} while (root->log_transid < transid + 2 &&
@@ -1864,14 +1958,16 @@ static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
return 0;
}
-static int wait_for_writer(struct btrfs_root *root)
+static int wait_for_writer(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root)
{
DEFINE_WAIT(wait);
while (atomic_read(&root->log_writers)) {
prepare_to_wait(&root->log_writer_wait,
&wait, TASK_UNINTERRUPTIBLE);
mutex_unlock(&root->log_mutex);
- if (atomic_read(&root->log_writers))
+ if (root->fs_info->last_trans_log_full_commit !=
+ trans->transid && atomic_read(&root->log_writers))
schedule();
mutex_lock(&root->log_mutex);
finish_wait(&root->log_writer_wait, &wait);
@@ -1882,7 +1978,14 @@ static int wait_for_writer(struct btrfs_root *root)
/*
* btrfs_sync_log does sends a given tree log down to the disk and
* updates the super blocks to record it. When this call is done,
- * you know that any inodes previously logged are safely on disk
+ * you know that any inodes previously logged are safely on disk only
+ * if it returns 0.
+ *
+ * Any other return value means you need to call btrfs_commit_transaction.
+ * Some of the edge cases for fsyncing directories that have had unlinks
+ * or renames done in the past mean that sometimes the only safe
+ * fsync is to commit the whole FS. When btrfs_sync_log returns -EAGAIN,
+ * that has happened.
*/
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
@@ -1896,7 +1999,7 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
mutex_lock(&root->log_mutex);
index1 = root->log_transid % 2;
if (atomic_read(&root->log_commit[index1])) {
- wait_log_commit(root, root->log_transid);
+ wait_log_commit(trans, root, root->log_transid);
mutex_unlock(&root->log_mutex);
return 0;
}
@@ -1904,18 +2007,26 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
/* wait for previous tree log sync to complete */
if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
- wait_log_commit(root, root->log_transid - 1);
+ wait_log_commit(trans, root, root->log_transid - 1);
while (1) {
unsigned long batch = root->log_batch;
mutex_unlock(&root->log_mutex);
schedule_timeout_uninterruptible(1);
mutex_lock(&root->log_mutex);
- wait_for_writer(root);
+
+ wait_for_writer(trans, root);
if (batch == root->log_batch)
break;
}
+ /* bail out if we need to do a full commit */
+ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ ret = -EAGAIN;
+ mutex_unlock(&root->log_mutex);
+ goto out;
+ }
+
ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
BUG_ON(ret);
@@ -1951,16 +2062,29 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
index2 = log_root_tree->log_transid % 2;
if (atomic_read(&log_root_tree->log_commit[index2])) {
- wait_log_commit(log_root_tree, log_root_tree->log_transid);
+ wait_log_commit(trans, log_root_tree,
+ log_root_tree->log_transid);
mutex_unlock(&log_root_tree->log_mutex);
goto out;
}
atomic_set(&log_root_tree->log_commit[index2], 1);
- if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
- wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
+ if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2])) {
+ wait_log_commit(trans, log_root_tree,
+ log_root_tree->log_transid - 1);
+ }
+
+ wait_for_writer(trans, log_root_tree);
- wait_for_writer(log_root_tree);
+ /*
+ * now that we've moved on to the tree of log tree roots,
+ * check the full commit flag again
+ */
+ if (root->fs_info->last_trans_log_full_commit == trans->transid) {
+ mutex_unlock(&log_root_tree->log_mutex);
+ ret = -EAGAIN;
+ goto out_wake_log_root;
+ }
ret = btrfs_write_and_wait_marked_extents(log_root_tree,
&log_root_tree->dirty_log_pages);
@@ -1985,7 +2109,9 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
* in and cause problems either.
*/
write_ctree_super(trans, root->fs_info->tree_root, 2);
+ ret = 0;
+out_wake_log_root:
atomic_set(&log_root_tree->log_commit[index2], 0);
smp_mb();
if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
@@ -1998,7 +2124,8 @@ out:
return 0;
}
-/* * free all the extents used by the tree log. This should be called
+/*
+ * free all the extents used by the tree log. This should be called
* at commit time of the full transaction
*/
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root)
@@ -2132,7 +2259,7 @@ int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
btrfs_free_path(path);
mutex_unlock(&BTRFS_I(dir)->log_mutex);
- end_log_trans(root);
+ btrfs_end_log_trans(root);
return 0;
}
@@ -2159,7 +2286,7 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
dirid, &index);
mutex_unlock(&BTRFS_I(inode)->log_mutex);
- end_log_trans(root);
+ btrfs_end_log_trans(root);
return ret;
}
@@ -2559,7 +2686,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
*
* This handles both files and directories.
*/
-static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
+static int btrfs_log_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct inode *inode,
int inode_only)
{
@@ -2585,28 +2712,17 @@ static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
min_key.offset = 0;
max_key.objectid = inode->i_ino;
+
+ /* today the code can only do partial logging of directories */
+ if (!S_ISDIR(inode->i_mode))
+ inode_only = LOG_INODE_ALL;
+
if (inode_only == LOG_INODE_EXISTS || S_ISDIR(inode->i_mode))
max_key.type = BTRFS_XATTR_ITEM_KEY;
else
max_key.type = (u8)-1;
max_key.offset = (u64)-1;
- /*
- * if this inode has already been logged and we're in inode_only
- * mode, we don't want to delete the things that have already
- * been written to the log.
- *
- * But, if the inode has been through an inode_only log,
- * the logged_trans field is not set. This allows us to catch
- * any new names for this inode in the backrefs by logging it
- * again
- */
- if (inode_only == LOG_INODE_EXISTS &&
- BTRFS_I(inode)->logged_trans == trans->transid) {
- btrfs_free_path(path);
- btrfs_free_path(dst_path);
- goto out;
- }
mutex_lock(&BTRFS_I(inode)->log_mutex);
/*
@@ -2693,7 +2809,6 @@ next_slot:
if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
btrfs_release_path(root, path);
btrfs_release_path(log, dst_path);
- BTRFS_I(inode)->log_dirty_trans = 0;
ret = log_directory_changes(trans, root, inode, path, dst_path);
BUG_ON(ret);
}
@@ -2702,19 +2817,69 @@ next_slot:
btrfs_free_path(path);
btrfs_free_path(dst_path);
-out:
return 0;
}
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only)
+/*
+ * follow the dentry parent pointers up the chain and see if any
+ * of the directories in it require a full commit before they can
+ * be logged. Returns zero if nothing special needs to be done or 1 if
+ * a full commit is required.
+ */
+static noinline int check_parent_dirs_for_sync(struct btrfs_trans_handle *trans,
+ struct inode *inode,
+ struct dentry *parent,
+ struct super_block *sb,
+ u64 last_committed)
{
- int ret;
+ int ret = 0;
+ struct btrfs_root *root;
- start_log_trans(trans, root);
- ret = __btrfs_log_inode(trans, root, inode, inode_only);
- end_log_trans(root);
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed)
+ goto out;
+
+ if (!S_ISDIR(inode->i_mode)) {
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ goto out;
+ inode = parent->d_inode;
+ }
+
+ while (1) {
+ BTRFS_I(inode)->logged_trans = trans->transid;
+ smp_mb();
+
+ if (BTRFS_I(inode)->last_unlink_trans > last_committed) {
+ root = BTRFS_I(inode)->root;
+
+ /*
+ * make sure any commits to the log are forced
+ * to be full commits
+ */
+ root->fs_info->last_trans_log_full_commit =
+ trans->transid;
+ ret = 1;
+ break;
+ }
+
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
+ break;
+
+ if (parent == sb->s_root)
+ break;
+
+ parent = parent->d_parent;
+ inode = parent->d_inode;
+
+ }
+out:
return ret;
}
@@ -2724,31 +2889,65 @@ int btrfs_log_inode(struct btrfs_trans_handle *trans,
* only logging is done of any parent directories that are older than
* the last committed transaction
*/
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry)
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only)
{
- int inode_only = LOG_INODE_ALL;
+ int inode_only = exists_only ? LOG_INODE_EXISTS : LOG_INODE_ALL;
struct super_block *sb;
- int ret;
+ int ret = 0;
+ u64 last_committed = root->fs_info->last_trans_committed;
+
+ sb = inode->i_sb;
+
+ if (root->fs_info->last_trans_log_full_commit >
+ root->fs_info->last_trans_committed) {
+ ret = 1;
+ goto end_no_trans;
+ }
+
+ ret = check_parent_dirs_for_sync(trans, inode, parent,
+ sb, last_committed);
+ if (ret)
+ goto end_no_trans;
start_log_trans(trans, root);
- sb = dentry->d_inode->i_sb;
- while (1) {
- ret = __btrfs_log_inode(trans, root, dentry->d_inode,
- inode_only);
- BUG_ON(ret);
- inode_only = LOG_INODE_EXISTS;
- dentry = dentry->d_parent;
- if (!dentry || !dentry->d_inode || sb != dentry->d_inode->i_sb)
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+ BUG_ON(ret);
+
+ /*
+ * for regular files, if its inode is already on disk, we don't
+ * have to worry about the parents at all. This is because
+ * we can use the last_unlink_trans field to record renames
+ * and other fun in this file.
+ */
+ if (S_ISREG(inode->i_mode) &&
+ BTRFS_I(inode)->generation <= last_committed &&
+ BTRFS_I(inode)->last_unlink_trans <= last_committed)
+ goto no_parent;
+
+ inode_only = LOG_INODE_EXISTS;
+ while (1) {
+ if (!parent || !parent->d_inode || sb != parent->d_inode->i_sb)
break;
- if (BTRFS_I(dentry->d_inode)->generation <=
- root->fs_info->last_trans_committed)
+ inode = parent->d_inode;
+ if (BTRFS_I(inode)->generation >
+ root->fs_info->last_trans_committed) {
+ ret = btrfs_log_inode(trans, root, inode, inode_only);
+ BUG_ON(ret);
+ }
+ if (parent == sb->s_root)
break;
+
+ parent = parent->d_parent;
}
- end_log_trans(root);
- return 0;
+no_parent:
+ ret = 0;
+ btrfs_end_log_trans(root);
+end_no_trans:
+ return ret;
}
/*
@@ -2760,12 +2959,8 @@ int btrfs_log_dentry(struct btrfs_trans_handle *trans,
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry)
{
- u64 gen;
- gen = root->fs_info->last_trans_new_blockgroup;
- if (gen > root->fs_info->last_trans_committed)
- return 1;
- else
- return btrfs_log_dentry(trans, root, dentry);
+ return btrfs_log_inode_parent(trans, root, dentry->d_inode,
+ dentry->d_parent, 0);
}
/*
@@ -2884,3 +3079,94 @@ again:
kfree(log_root_tree);
return 0;
}
+
+/*
+ * there are some corner cases where we want to force a full
+ * commit instead of allowing a directory to be logged.
+ *
+ * They revolve around files there were unlinked from the directory, and
+ * this function updates the parent directory so that a full commit is
+ * properly done if it is fsync'd later after the unlinks are done.
+ */
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename)
+{
+ /*
+ * when we're logging a file, if it hasn't been renamed
+ * or unlinked, and its inode is fully committed on disk,
+ * we don't have to worry about walking up the directory chain
+ * to log its parents.
+ *
+ * So, we use the last_unlink_trans field to put this transid
+ * into the file. When the file is logged we check it and
+ * don't log the parents if the file is fully on disk.
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this directory was already logged any new
+ * names for this file/dir will get recorded
+ */
+ smp_mb();
+ if (BTRFS_I(dir)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * if the inode we're about to unlink was logged,
+ * the log will be properly updated for any new names
+ */
+ if (BTRFS_I(inode)->logged_trans == trans->transid)
+ return;
+
+ /*
+ * when renaming files across directories, if the directory
+ * there we're unlinking from gets fsync'd later on, there's
+ * no way to find the destination directory later and fsync it
+ * properly. So, we have to be conservative and force commits
+ * so the new name gets discovered.
+ */
+ if (for_rename)
+ goto record;
+
+ /* we can safely do the unlink without any special recording */
+ return;
+
+record:
+ BTRFS_I(dir)->last_unlink_trans = trans->transid;
+}
+
+/*
+ * Call this after adding a new name for a file and it will properly
+ * update the log to reflect the new name.
+ *
+ * It will return zero if all goes well, and it will return 1 if a
+ * full transaction commit is required.
+ */
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent)
+{
+ struct btrfs_root * root = BTRFS_I(inode)->root;
+
+ /*
+ * this will force the logging code to walk the dentry chain
+ * up for the file
+ */
+ if (S_ISREG(inode->i_mode))
+ BTRFS_I(inode)->last_unlink_trans = trans->transid;
+
+ /*
+ * if this inode hasn't been logged and directory we're renaming it
+ * from hasn't been logged, we don't need to log it
+ */
+ if (BTRFS_I(inode)->logged_trans <=
+ root->fs_info->last_trans_committed &&
+ (!old_dir || BTRFS_I(old_dir)->logged_trans <=
+ root->fs_info->last_trans_committed))
+ return 0;
+
+ return btrfs_log_inode_parent(trans, root, inode, parent, 1);
+}
+
diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h
index b9409b32ed0..d09c7609e16 100644
--- a/fs/btrfs/tree-log.h
+++ b/fs/btrfs/tree-log.h
@@ -22,14 +22,9 @@
int btrfs_sync_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_free_log(struct btrfs_trans_handle *trans, struct btrfs_root *root);
-int btrfs_log_dentry(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct dentry *dentry);
int btrfs_recover_log_trees(struct btrfs_root *tree_root);
int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct dentry *dentry);
-int btrfs_log_inode(struct btrfs_trans_handle *trans,
- struct btrfs_root *root, struct inode *inode,
- int inode_only);
int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
@@ -38,4 +33,16 @@ int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct inode *inode, u64 dirid);
+int btrfs_join_running_log_trans(struct btrfs_root *root);
+int btrfs_end_log_trans(struct btrfs_root *root);
+int btrfs_pin_log_trans(struct btrfs_root *root);
+int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, struct inode *inode,
+ struct dentry *parent, int exists_only);
+void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans,
+ struct inode *dir, struct inode *inode,
+ int for_rename);
+int btrfs_log_new_name(struct btrfs_trans_handle *trans,
+ struct inode *inode, struct inode *old_dir,
+ struct dentry *parent);
#endif
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index 38f40d55899..53c72ad8587 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -55,7 +55,8 @@ static int ext4_block_in_group(struct super_block *sb, ext4_fsblk_t block,
}
static int ext4_group_used_meta_blocks(struct super_block *sb,
- ext4_group_t block_group)
+ ext4_group_t block_group,
+ struct ext4_group_desc *gdp)
{
ext4_fsblk_t tmp;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -63,10 +64,6 @@ static int ext4_group_used_meta_blocks(struct super_block *sb,
int used_blocks = sbi->s_itb_per_group + 2;
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
- struct ext4_group_desc *gdp;
- struct buffer_head *bh;
-
- gdp = ext4_get_group_desc(sb, block_group, &bh);
if (!ext4_block_in_group(sb, ext4_block_bitmap(sb, gdp),
block_group))
used_blocks--;
@@ -177,7 +174,7 @@ unsigned ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh,
*/
mark_bitmap_end(group_blocks, sb->s_blocksize * 8, bh->b_data);
}
- return free_blocks - ext4_group_used_meta_blocks(sb, block_group);
+ return free_blocks - ext4_group_used_meta_blocks(sb, block_group, gdp);
}
@@ -473,9 +470,8 @@ void ext4_add_groupblocks(handle_t *handle, struct super_block *sb,
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += blocks_freed;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ atomic_add(blocks_freed,
+ &sbi->s_flex_groups[flex_group].free_blocks);
}
/*
* request to reload the buddy with the
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 2df2e40b01a..b64789929a6 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -67,7 +67,8 @@ int ext4_check_dir_entry(const char *function, struct inode *dir,
unsigned int offset)
{
const char *error_msg = NULL;
- const int rlen = ext4_rec_len_from_disk(de->rec_len);
+ const int rlen = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
if (rlen < EXT4_DIR_REC_LEN(1))
error_msg = "rec_len is smaller than minimal";
@@ -178,10 +179,11 @@ revalidate:
* least that it is non-zero. A
* failure will be detected in the
* dirent test below. */
- if (ext4_rec_len_from_disk(de->rec_len)
- < EXT4_DIR_REC_LEN(1))
+ if (ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize) < EXT4_DIR_REC_LEN(1))
break;
- i += ext4_rec_len_from_disk(de->rec_len);
+ i += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
}
offset = i;
filp->f_pos = (filp->f_pos & ~(sb->s_blocksize - 1))
@@ -203,7 +205,8 @@ revalidate:
ret = stored;
goto out;
}
- offset += ext4_rec_len_from_disk(de->rec_len);
+ offset += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
if (le32_to_cpu(de->inode)) {
/* We might block in the next section
* if the data destination is
@@ -225,7 +228,8 @@ revalidate:
goto revalidate;
stored++;
}
- filp->f_pos += ext4_rec_len_from_disk(de->rec_len);
+ filp->f_pos += ext4_rec_len_from_disk(de->rec_len,
+ sb->s_blocksize);
}
offset = 0;
brelse(bh);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 990c9400092..d0f15ef56de 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -33,14 +33,6 @@
#undef EXT4FS_DEBUG
/*
- * Define EXT4_RESERVATION to reserve data blocks for expanding files
- */
-#define EXT4_DEFAULT_RESERVE_BLOCKS 8
-/*max window size: 1024(direct blocks) + 3([t,d]indirect blocks) */
-#define EXT4_MAX_RESERVE_BLOCKS 1027
-#define EXT4_RESERVE_WINDOW_NOT_ALLOCATED 0
-
-/*
* Debug code
*/
#ifdef EXT4FS_DEBUG
@@ -54,8 +46,6 @@
#define ext4_debug(f, a...) do {} while (0)
#endif
-#define EXT4_MULTIBLOCK_ALLOCATOR 1
-
/* prefer goal again. length */
#define EXT4_MB_HINT_MERGE 1
/* blocks already reserved */
@@ -180,8 +170,9 @@ struct ext4_group_desc
*/
struct flex_groups {
- __u32 free_inodes;
- __u32 free_blocks;
+ atomic_t free_inodes;
+ atomic_t free_blocks;
+ atomic_t used_dirs;
};
#define EXT4_BG_INODE_UNINIT 0x0001 /* Inode table/bitmap not in use */
@@ -249,6 +240,30 @@ struct flex_groups {
#define EXT4_FL_USER_VISIBLE 0x000BDFFF /* User visible flags */
#define EXT4_FL_USER_MODIFIABLE 0x000B80FF /* User modifiable flags */
+/* Flags that should be inherited by new inodes from their parent. */
+#define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\
+ EXT4_SYNC_FL | EXT4_IMMUTABLE_FL | EXT4_APPEND_FL |\
+ EXT4_NODUMP_FL | EXT4_NOATIME_FL |\
+ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\
+ EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL)
+
+/* Flags that are appropriate for regular files (all but dir-specific ones). */
+#define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL))
+
+/* Flags that are appropriate for non-directories/regular files. */
+#define EXT4_OTHER_FLMASK (EXT4_NODUMP_FL | EXT4_NOATIME_FL)
+
+/* Mask out flags that are inappropriate for the given type of inode. */
+static inline __u32 ext4_mask_flags(umode_t mode, __u32 flags)
+{
+ if (S_ISDIR(mode))
+ return flags;
+ else if (S_ISREG(mode))
+ return flags & EXT4_REG_FLMASK;
+ else
+ return flags & EXT4_OTHER_FLMASK;
+}
+
/*
* Inode dynamic state flags
*/
@@ -256,6 +271,7 @@ struct flex_groups {
#define EXT4_STATE_NEW 0x00000002 /* inode is newly created */
#define EXT4_STATE_XATTR 0x00000004 /* has in-inode xattrs */
#define EXT4_STATE_NO_EXPAND 0x00000008 /* No space for expansion */
+#define EXT4_STATE_DA_ALLOC_CLOSE 0x00000010 /* Alloc DA blks on close */
/* Used to pass group descriptor data when online resize is done */
struct ext4_new_group_input {
@@ -303,7 +319,9 @@ struct ext4_new_group_data {
#define EXT4_IOC_GROUP_EXTEND _IOW('f', 7, unsigned long)
#define EXT4_IOC_GROUP_ADD _IOW('f', 8, struct ext4_new_group_input)
#define EXT4_IOC_MIGRATE _IO('f', 9)
+ /* note ioctl 10 reserved for an early version of the FIEMAP ioctl */
/* note ioctl 11 reserved for filesystem-independent FIEMAP ioctl */
+#define EXT4_IOC_ALLOC_DA_BLKS _IO('f', 12)
/*
* ioctl commands in 32 bit emulation
@@ -531,7 +549,7 @@ do { \
#define EXT4_MOUNT_NO_UID32 0x02000 /* Disable 32-bit UIDs */
#define EXT4_MOUNT_XATTR_USER 0x04000 /* Extended user attributes */
#define EXT4_MOUNT_POSIX_ACL 0x08000 /* POSIX Access Control Lists */
-#define EXT4_MOUNT_RESERVATION 0x10000 /* Preallocation */
+#define EXT4_MOUNT_NO_AUTO_DA_ALLOC 0x10000 /* No auto delalloc mapping */
#define EXT4_MOUNT_BARRIER 0x20000 /* Use block barriers */
#define EXT4_MOUNT_NOBH 0x40000 /* No bufferheads */
#define EXT4_MOUNT_QUOTA 0x80000 /* Some quota option set */
@@ -666,7 +684,8 @@ struct ext4_super_block {
__u8 s_log_groups_per_flex; /* FLEX_BG group size */
__u8 s_reserved_char_pad2;
__le16 s_reserved_pad;
- __u32 s_reserved[162]; /* Padding to the end of the block */
+ __le64 s_kbytes_written; /* nr of lifetime kilobytes written */
+ __u32 s_reserved[160]; /* Padding to the end of the block */
};
#ifdef __KERNEL__
@@ -814,6 +833,12 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino)
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
/*
+ * Minimum number of groups in a flexgroup before we separate out
+ * directories into the first block group of a flexgroup
+ */
+#define EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME 4
+
+/*
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
@@ -865,24 +890,6 @@ struct ext4_dir_entry_2 {
~EXT4_DIR_ROUND)
#define EXT4_MAX_REC_LEN ((1<<16)-1)
-static inline unsigned ext4_rec_len_from_disk(__le16 dlen)
-{
- unsigned len = le16_to_cpu(dlen);
-
- if (len == EXT4_MAX_REC_LEN || len == 0)
- return 1 << 16;
- return len;
-}
-
-static inline __le16 ext4_rec_len_to_disk(unsigned len)
-{
- if (len == (1 << 16))
- return cpu_to_le16(EXT4_MAX_REC_LEN);
- else if (len > (1 << 16))
- BUG();
- return cpu_to_le16(len);
-}
-
/*
* Hash Tree Directory indexing
* (c) Daniel Phillips, 2001
@@ -970,22 +977,6 @@ void ext4_get_group_no_and_offset(struct super_block *sb, ext4_fsblk_t blocknr,
extern struct proc_dir_entry *ext4_proc_root;
-#ifdef CONFIG_PROC_FS
-extern const struct file_operations ext4_ui_proc_fops;
-
-#define EXT4_PROC_HANDLER(name, var) \
-do { \
- proc = proc_create_data(name, mode, sbi->s_proc, \
- &ext4_ui_proc_fops, &sbi->s_##var); \
- if (proc == NULL) { \
- printk(KERN_ERR "EXT4-fs: can't create %s\n", name); \
- goto err_out; \
- } \
-} while (0)
-#else
-#define EXT4_PROC_HANDLER(name, var)
-#endif
-
/*
* Function prototypes
*/
@@ -1092,6 +1083,7 @@ extern int ext4_can_truncate(struct inode *inode);
extern void ext4_truncate(struct inode *);
extern void ext4_set_inode_flags(struct inode *);
extern void ext4_get_inode_flags(struct ext4_inode_info *);
+extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_meta_trans_blocks(struct inode *, int nrblocks, int idxblocks);
@@ -1107,7 +1099,10 @@ extern long ext4_compat_ioctl(struct file *, unsigned int, unsigned long);
/* migrate.c */
extern int ext4_ext_migrate(struct inode *);
+
/* namei.c */
+extern unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize);
+extern __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize);
extern int ext4_orphan_add(handle_t *, struct inode *);
extern int ext4_orphan_del(handle_t *, struct inode *);
extern int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index 18cb67b2cbb..f0c3ec85bd4 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -241,5 +241,6 @@ extern int ext4_ext_search_left(struct inode *, struct ext4_ext_path *,
extern int ext4_ext_search_right(struct inode *, struct ext4_ext_path *,
ext4_lblk_t *, ext4_fsblk_t *);
extern void ext4_ext_drop_refs(struct ext4_ext_path *);
+extern int ext4_ext_check_inode(struct inode *inode);
#endif /* _EXT4_EXTENTS */
diff --git a/fs/ext4/ext4_i.h b/fs/ext4/ext4_i.h
index e69acc16f5c..4ce2187123a 100644
--- a/fs/ext4/ext4_i.h
+++ b/fs/ext4/ext4_i.h
@@ -33,9 +33,6 @@ typedef __u32 ext4_lblk_t;
/* data type for block group number */
typedef unsigned int ext4_group_t;
-#define rsv_start rsv_window._rsv_start
-#define rsv_end rsv_window._rsv_end
-
/*
* storage for cached extent
*/
@@ -125,6 +122,9 @@ struct ext4_inode_info {
struct list_head i_prealloc_list;
spinlock_t i_prealloc_lock;
+ /* ialloc */
+ ext4_group_t i_last_alloc_group;
+
/* allocation reservation info for delalloc */
unsigned int i_reserved_data_blocks;
unsigned int i_reserved_meta_blocks;
diff --git a/fs/ext4/ext4_sb.h b/fs/ext4/ext4_sb.h
index 039b6ea1a04..57b71fefbcc 100644
--- a/fs/ext4/ext4_sb.h
+++ b/fs/ext4/ext4_sb.h
@@ -62,12 +62,10 @@ struct ext4_sb_info {
struct percpu_counter s_freeinodes_counter;
struct percpu_counter s_dirs_counter;
struct percpu_counter s_dirtyblocks_counter;
- struct blockgroup_lock s_blockgroup_lock;
+ struct blockgroup_lock *s_blockgroup_lock;
struct proc_dir_entry *s_proc;
-
- /* root of the per fs reservation window tree */
- spinlock_t s_rsv_window_lock;
- struct rb_root s_rsv_window_root;
+ struct kobject s_kobj;
+ struct completion s_kobj_unregister;
/* Journaling */
struct inode *s_journal_inode;
@@ -146,6 +144,10 @@ struct ext4_sb_info {
/* locality groups */
struct ext4_locality_group *s_locality_groups;
+ /* for write statistics */
+ unsigned long s_sectors_written_start;
+ u64 s_kbytes_written;
+
unsigned int s_log_groups_per_flex;
struct flex_groups *s_flex_groups;
};
@@ -153,7 +155,7 @@ struct ext4_sb_info {
static inline spinlock_t *
sb_bgl_lock(struct ext4_sb_info *sbi, unsigned int block_group)
{
- return bgl_lock_ptr(&sbi->s_blockgroup_lock, block_group);
+ return bgl_lock_ptr(sbi->s_blockgroup_lock, block_group);
}
#endif /* _EXT4_SB */
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index e0aa4fe4f59..ac77d8b8251 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -152,6 +152,8 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
int depth;
if (path) {
@@ -170,10 +172,31 @@ static ext4_fsblk_t ext4_ext_find_goal(struct inode *inode,
}
/* OK. use inode's group */
- bg_start = (ei->i_block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ /*
+ * If there are at least EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME
+ * block groups per flexgroup, reserve the first block
+ * group for directories and special files. Regular
+ * files will start at the second block group. This
+ * tends to speed up directory access and improves
+ * fsck times.
+ */
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = (block_group * EXT4_BLOCKS_PER_GROUP(inode->i_sb)) +
le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_first_data_block);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -301,7 +324,64 @@ ext4_ext_max_entries(struct inode *inode, int depth)
return max;
}
-static int __ext4_ext_check_header(const char *function, struct inode *inode,
+static int ext4_valid_extent(struct inode *inode, struct ext4_extent *ext)
+{
+ ext4_fsblk_t block = ext_pblock(ext);
+ int len = ext4_ext_get_actual_len(ext);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+ ((block + len) > ext4_blocks_count(es))))
+ return 0;
+ else
+ return 1;
+}
+
+static int ext4_valid_extent_idx(struct inode *inode,
+ struct ext4_extent_idx *ext_idx)
+{
+ ext4_fsblk_t block = idx_pblock(ext_idx);
+ struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es;
+ if (unlikely(block < le32_to_cpu(es->s_first_data_block) ||
+ (block > ext4_blocks_count(es))))
+ return 0;
+ else
+ return 1;
+}
+
+static int ext4_valid_extent_entries(struct inode *inode,
+ struct ext4_extent_header *eh,
+ int depth)
+{
+ struct ext4_extent *ext;
+ struct ext4_extent_idx *ext_idx;
+ unsigned short entries;
+ if (eh->eh_entries == 0)
+ return 1;
+
+ entries = le16_to_cpu(eh->eh_entries);
+
+ if (depth == 0) {
+ /* leaf entries */
+ ext = EXT_FIRST_EXTENT(eh);
+ while (entries) {
+ if (!ext4_valid_extent(inode, ext))
+ return 0;
+ ext++;
+ entries--;
+ }
+ } else {
+ ext_idx = EXT_FIRST_INDEX(eh);
+ while (entries) {
+ if (!ext4_valid_extent_idx(inode, ext_idx))
+ return 0;
+ ext_idx++;
+ entries--;
+ }
+ }
+ return 1;
+}
+
+static int __ext4_ext_check(const char *function, struct inode *inode,
struct ext4_extent_header *eh,
int depth)
{
@@ -329,11 +409,15 @@ static int __ext4_ext_check_header(const char *function, struct inode *inode,
error_msg = "invalid eh_entries";
goto corrupted;
}
+ if (!ext4_valid_extent_entries(inode, eh, depth)) {
+ error_msg = "invalid extent entries";
+ goto corrupted;
+ }
return 0;
corrupted:
ext4_error(inode->i_sb, function,
- "bad header in inode #%lu: %s - magic %x, "
+ "bad header/extent in inode #%lu: %s - magic %x, "
"entries %u, max %u(%u), depth %u(%u)",
inode->i_ino, error_msg, le16_to_cpu(eh->eh_magic),
le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max),
@@ -342,8 +426,13 @@ corrupted:
return -EIO;
}
-#define ext4_ext_check_header(inode, eh, depth) \
- __ext4_ext_check_header(__func__, inode, eh, depth)
+#define ext4_ext_check(inode, eh, depth) \
+ __ext4_ext_check(__func__, inode, eh, depth)
+
+int ext4_ext_check_inode(struct inode *inode)
+{
+ return ext4_ext_check(inode, ext_inode_hdr(inode), ext_depth(inode));
+}
#ifdef EXT_DEBUG
static void ext4_ext_show_path(struct inode *inode, struct ext4_ext_path *path)
@@ -547,9 +636,6 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
eh = ext_inode_hdr(inode);
depth = ext_depth(inode);
- if (ext4_ext_check_header(inode, eh, depth))
- return ERR_PTR(-EIO);
-
/* account possible depth increase */
if (!path) {
@@ -565,6 +651,8 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
i = depth;
/* walk through the tree */
while (i) {
+ int need_to_validate = 0;
+
ext_debug("depth %d: num %d, max %d\n",
ppos, le16_to_cpu(eh->eh_entries), le16_to_cpu(eh->eh_max));
@@ -573,10 +661,17 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_depth = i;
path[ppos].p_ext = NULL;
- bh = sb_bread(inode->i_sb, path[ppos].p_block);
- if (!bh)
+ bh = sb_getblk(inode->i_sb, path[ppos].p_block);
+ if (unlikely(!bh))
goto err;
-
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ goto err;
+ }
+ /* validate the extent entries */
+ need_to_validate = 1;
+ }
eh = ext_block_hdr(bh);
ppos++;
BUG_ON(ppos > depth);
@@ -584,7 +679,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block,
path[ppos].p_hdr = eh;
i--;
- if (ext4_ext_check_header(inode, eh, i))
+ if (need_to_validate && ext4_ext_check(inode, eh, i))
goto err;
}
@@ -1181,7 +1276,7 @@ got_index:
return -EIO;
eh = ext_block_hdr(bh);
/* subtract from p_depth to get proper eh_depth */
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+ if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
put_bh(bh);
return -EIO;
}
@@ -1194,7 +1289,7 @@ got_index:
if (bh == NULL)
return -EIO;
eh = ext_block_hdr(bh);
- if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) {
+ if (ext4_ext_check(inode, eh, path->p_depth - depth)) {
put_bh(bh);
return -EIO;
}
@@ -2137,7 +2232,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
return -ENOMEM;
}
path[0].p_hdr = ext_inode_hdr(inode);
- if (ext4_ext_check_header(inode, path[0].p_hdr, depth)) {
+ if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
err = -EIO;
goto out;
}
@@ -2191,7 +2286,7 @@ static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
err = -EIO;
break;
}
- if (ext4_ext_check_header(inode, ext_block_hdr(bh),
+ if (ext4_ext_check(inode, ext_block_hdr(bh),
depth - i - 1)) {
err = -EIO;
break;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index f731cb545a0..588af8c7724 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -33,9 +33,14 @@
*/
static int ext4_release_file(struct inode *inode, struct file *filp)
{
+ if (EXT4_I(inode)->i_state & EXT4_STATE_DA_ALLOC_CLOSE) {
+ ext4_alloc_da_blocks(inode);
+ EXT4_I(inode)->i_state &= ~EXT4_STATE_DA_ALLOC_CLOSE;
+ }
/* if we are the last writer on the inode, drop the block reservation */
if ((filp->f_mode & FMODE_WRITE) &&
- (atomic_read(&inode->i_writecount) == 1))
+ (atomic_read(&inode->i_writecount) == 1) &&
+ !EXT4_I(inode)->i_reserved_data_blocks)
{
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index fb51b40e3e8..47b84e8df56 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -189,7 +189,6 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
struct ext4_super_block *es;
struct ext4_sb_info *sbi;
int fatal = 0, err, count, cleared;
- ext4_group_t flex_group;
if (atomic_read(&inode->i_count) > 1) {
printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
@@ -268,6 +267,13 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
if (is_directory) {
count = ext4_used_dirs_count(sb, gdp) - 1;
ext4_used_dirs_set(sb, gdp, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f;
+
+ f = ext4_flex_group(sbi, block_group);
+ atomic_dec(&sbi->s_flex_groups[f].free_inodes);
+ }
+
}
gdp->bg_checksum = ext4_group_desc_csum(sbi,
block_group, gdp);
@@ -277,10 +283,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode)
percpu_counter_dec(&sbi->s_dirs_counter);
if (sbi->s_log_groups_per_flex) {
- flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_inodes++;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ ext4_group_t f;
+
+ f = ext4_flex_group(sbi, block_group);
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
}
}
BUFFER_TRACE(bh2, "call ext4_handle_dirty_metadata");
@@ -360,9 +366,9 @@ static int find_group_flex(struct super_block *sb, struct inode *parent,
sbi->s_log_groups_per_flex;
find_close_to_parent:
- flexbg_free_blocks = flex_group[best_flex].free_blocks;
+ flexbg_free_blocks = atomic_read(&flex_group[best_flex].free_blocks);
flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
- if (flex_group[best_flex].free_inodes &&
+ if (atomic_read(&flex_group[best_flex].free_inodes) &&
flex_freeb_ratio > free_block_ratio)
goto found_flexbg;
@@ -375,24 +381,24 @@ find_close_to_parent:
if (i == parent_fbg_group || i == parent_fbg_group - 1)
continue;
- flexbg_free_blocks = flex_group[i].free_blocks;
+ flexbg_free_blocks = atomic_read(&flex_group[i].free_blocks);
flex_freeb_ratio = flexbg_free_blocks * 100 / blocks_per_flex;
if (flex_freeb_ratio > free_block_ratio &&
- flex_group[i].free_inodes) {
+ (atomic_read(&flex_group[i].free_inodes))) {
best_flex = i;
goto found_flexbg;
}
- if (flex_group[best_flex].free_inodes == 0 ||
- (flex_group[i].free_blocks >
- flex_group[best_flex].free_blocks &&
- flex_group[i].free_inodes))
+ if ((atomic_read(&flex_group[best_flex].free_inodes) == 0) ||
+ ((atomic_read(&flex_group[i].free_blocks) >
+ atomic_read(&flex_group[best_flex].free_blocks)) &&
+ atomic_read(&flex_group[i].free_inodes)))
best_flex = i;
}
- if (!flex_group[best_flex].free_inodes ||
- !flex_group[best_flex].free_blocks)
+ if (!atomic_read(&flex_group[best_flex].free_inodes) ||
+ !atomic_read(&flex_group[best_flex].free_blocks))
return -1;
found_flexbg:
@@ -410,6 +416,42 @@ out:
return 0;
}
+struct orlov_stats {
+ __u32 free_inodes;
+ __u32 free_blocks;
+ __u32 used_dirs;
+};
+
+/*
+ * Helper function for Orlov's allocator; returns critical information
+ * for a particular block group or flex_bg. If flex_size is 1, then g
+ * is a block group number; otherwise it is flex_bg number.
+ */
+void get_orlov_stats(struct super_block *sb, ext4_group_t g,
+ int flex_size, struct orlov_stats *stats)
+{
+ struct ext4_group_desc *desc;
+ struct flex_groups *flex_group = EXT4_SB(sb)->s_flex_groups;
+
+ if (flex_size > 1) {
+ stats->free_inodes = atomic_read(&flex_group[g].free_inodes);
+ stats->free_blocks = atomic_read(&flex_group[g].free_blocks);
+ stats->used_dirs = atomic_read(&flex_group[g].used_dirs);
+ return;
+ }
+
+ desc = ext4_get_group_desc(sb, g, NULL);
+ if (desc) {
+ stats->free_inodes = ext4_free_inodes_count(sb, desc);
+ stats->free_blocks = ext4_free_blks_count(sb, desc);
+ stats->used_dirs = ext4_used_dirs_count(sb, desc);
+ } else {
+ stats->free_inodes = 0;
+ stats->free_blocks = 0;
+ stats->used_dirs = 0;
+ }
+}
+
/*
* Orlov's allocator for directories.
*
@@ -425,35 +467,34 @@ out:
* it has too many directories already (max_dirs) or
* it has too few free inodes left (min_inodes) or
* it has too few free blocks left (min_blocks) or
- * it's already running too large debt (max_debt).
* Parent's group is preferred, if it doesn't satisfy these
* conditions we search cyclically through the rest. If none
* of the groups look good we just look for a group with more
* free inodes than average (starting at parent's group).
- *
- * Debt is incremented each time we allocate a directory and decremented
- * when we allocate an inode, within 0--255.
*/
-#define INODE_COST 64
-#define BLOCK_COST 256
-
static int find_group_orlov(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_super_block *es = sbi->s_es;
ext4_group_t ngroups = sbi->s_groups_count;
int inodes_per_group = EXT4_INODES_PER_GROUP(sb);
unsigned int freei, avefreei;
ext4_fsblk_t freeb, avefreeb;
- ext4_fsblk_t blocks_per_dir;
unsigned int ndirs;
- int max_debt, max_dirs, min_inodes;
+ int max_dirs, min_inodes;
ext4_grpblk_t min_blocks;
- ext4_group_t i;
+ ext4_group_t i, grp, g;
struct ext4_group_desc *desc;
+ struct orlov_stats stats;
+ int flex_size = ext4_flex_bg_size(sbi);
+
+ if (flex_size > 1) {
+ ngroups = (ngroups + flex_size - 1) >>
+ sbi->s_log_groups_per_flex;
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
freei = percpu_counter_read_positive(&sbi->s_freeinodes_counter);
avefreei = freei / ngroups;
@@ -462,71 +503,97 @@ static int find_group_orlov(struct super_block *sb, struct inode *parent,
do_div(avefreeb, ngroups);
ndirs = percpu_counter_read_positive(&sbi->s_dirs_counter);
- if ((parent == sb->s_root->d_inode) ||
- (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL)) {
+ if (S_ISDIR(mode) &&
+ ((parent == sb->s_root->d_inode) ||
+ (EXT4_I(parent)->i_flags & EXT4_TOPDIR_FL))) {
int best_ndir = inodes_per_group;
- ext4_group_t grp;
int ret = -1;
get_random_bytes(&grp, sizeof(grp));
parent_group = (unsigned)grp % ngroups;
for (i = 0; i < ngroups; i++) {
- grp = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, grp, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
+ g = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, g, flex_size, &stats);
+ if (!stats.free_inodes)
continue;
- if (ext4_used_dirs_count(sb, desc) >= best_ndir)
+ if (stats.used_dirs >= best_ndir)
continue;
- if (ext4_free_inodes_count(sb, desc) < avefreei)
+ if (stats.free_inodes < avefreei)
continue;
- if (ext4_free_blks_count(sb, desc) < avefreeb)
+ if (stats.free_blocks < avefreeb)
continue;
- *group = grp;
+ grp = g;
ret = 0;
- best_ndir = ext4_used_dirs_count(sb, desc);
+ best_ndir = stats.used_dirs;
+ }
+ if (ret)
+ goto fallback;
+ found_flex_bg:
+ if (flex_size == 1) {
+ *group = grp;
+ return 0;
+ }
+
+ /*
+ * We pack inodes at the beginning of the flexgroup's
+ * inode tables. Block allocation decisions will do
+ * something similar, although regular files will
+ * start at 2nd block group of the flexgroup. See
+ * ext4_ext_find_goal() and ext4_find_near().
+ */
+ grp *= flex_size;
+ for (i = 0; i < flex_size; i++) {
+ if (grp+i >= sbi->s_groups_count)
+ break;
+ desc = ext4_get_group_desc(sb, grp+i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = grp+i;
+ return 0;
+ }
}
- if (ret == 0)
- return ret;
goto fallback;
}
- blocks_per_dir = ext4_blocks_count(es) - freeb;
- do_div(blocks_per_dir, ndirs);
-
max_dirs = ndirs / ngroups + inodes_per_group / 16;
- min_inodes = avefreei - inodes_per_group / 4;
- min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb) / 4;
-
- max_debt = EXT4_BLOCKS_PER_GROUP(sb);
- max_debt /= max_t(int, blocks_per_dir, BLOCK_COST);
- if (max_debt * INODE_COST > inodes_per_group)
- max_debt = inodes_per_group / INODE_COST;
- if (max_debt > 255)
- max_debt = 255;
- if (max_debt == 0)
- max_debt = 1;
+ min_inodes = avefreei - inodes_per_group*flex_size / 4;
+ if (min_inodes < 1)
+ min_inodes = 1;
+ min_blocks = avefreeb - EXT4_BLOCKS_PER_GROUP(sb)*flex_size / 4;
+
+ /*
+ * Start looking in the flex group where we last allocated an
+ * inode for this parent directory
+ */
+ if (EXT4_I(parent)->i_last_alloc_group != ~0) {
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ if (flex_size > 1)
+ parent_group >>= sbi->s_log_groups_per_flex;
+ }
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
- if (!desc || !ext4_free_inodes_count(sb, desc))
- continue;
- if (ext4_used_dirs_count(sb, desc) >= max_dirs)
+ grp = (parent_group + i) % ngroups;
+ get_orlov_stats(sb, grp, flex_size, &stats);
+ if (stats.used_dirs >= max_dirs)
continue;
- if (ext4_free_inodes_count(sb, desc) < min_inodes)
+ if (stats.free_inodes < min_inodes)
continue;
- if (ext4_free_blks_count(sb, desc) < min_blocks)
+ if (stats.free_blocks < min_blocks)
continue;
- return 0;
+ goto found_flex_bg;
}
fallback:
+ ngroups = sbi->s_groups_count;
+ avefreei = freei / ngroups;
+ parent_group = EXT4_I(parent)->i_block_group;
for (i = 0; i < ngroups; i++) {
- *group = (parent_group + i) % ngroups;
- desc = ext4_get_group_desc(sb, *group, NULL);
+ grp = (parent_group + i) % ngroups;
+ desc = ext4_get_group_desc(sb, grp, NULL);
if (desc && ext4_free_inodes_count(sb, desc) &&
- ext4_free_inodes_count(sb, desc) >= avefreei)
+ ext4_free_inodes_count(sb, desc) >= avefreei) {
+ *group = grp;
return 0;
+ }
}
if (avefreei) {
@@ -542,12 +609,51 @@ fallback:
}
static int find_group_other(struct super_block *sb, struct inode *parent,
- ext4_group_t *group)
+ ext4_group_t *group, int mode)
{
ext4_group_t parent_group = EXT4_I(parent)->i_block_group;
ext4_group_t ngroups = EXT4_SB(sb)->s_groups_count;
struct ext4_group_desc *desc;
- ext4_group_t i;
+ ext4_group_t i, last;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(sb));
+
+ /*
+ * Try to place the inode is the same flex group as its
+ * parent. If we can't find space, use the Orlov algorithm to
+ * find another flex group, and store that information in the
+ * parent directory's inode information so that use that flex
+ * group for future allocations.
+ */
+ if (flex_size > 1) {
+ int retry = 0;
+
+ try_again:
+ parent_group &= ~(flex_size-1);
+ last = parent_group + flex_size;
+ if (last > ngroups)
+ last = ngroups;
+ for (i = parent_group; i < last; i++) {
+ desc = ext4_get_group_desc(sb, i, NULL);
+ if (desc && ext4_free_inodes_count(sb, desc)) {
+ *group = i;
+ return 0;
+ }
+ }
+ if (!retry && EXT4_I(parent)->i_last_alloc_group != ~0) {
+ retry = 1;
+ parent_group = EXT4_I(parent)->i_last_alloc_group;
+ goto try_again;
+ }
+ /*
+ * If this didn't work, use the Orlov search algorithm
+ * to find a new flex group; we pass in the mode to
+ * avoid the topdir algorithms.
+ */
+ *group = parent_group + flex_size;
+ if (*group > ngroups)
+ *group = 0;
+ return find_group_orlov(sb, parent, group, mode);
+ }
/*
* Try to place the inode in its parent directory
@@ -665,6 +771,11 @@ static int ext4_claim_inode(struct super_block *sb,
if (S_ISDIR(mode)) {
count = ext4_used_dirs_count(sb, gdp) + 1;
ext4_used_dirs_set(sb, gdp, count);
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t f = ext4_flex_group(sbi, group);
+
+ atomic_inc(&sbi->s_flex_groups[f].free_inodes);
+ }
}
gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
err_ret:
@@ -716,10 +827,10 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
sbi = EXT4_SB(sb);
es = sbi->s_es;
- if (sbi->s_log_groups_per_flex) {
+ if (sbi->s_log_groups_per_flex && test_opt(sb, OLDALLOC)) {
ret2 = find_group_flex(sb, dir, &group);
if (ret2 == -1) {
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);
if (ret2 == 0 && once)
once = 0;
printk(KERN_NOTICE "ext4: find_group_flex "
@@ -733,11 +844,12 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode)
if (test_opt(sb, OLDALLOC))
ret2 = find_group_dir(sb, dir, &group);
else
- ret2 = find_group_orlov(sb, dir, &group);
+ ret2 = find_group_orlov(sb, dir, &group, mode);
} else
- ret2 = find_group_other(sb, dir, &group);
+ ret2 = find_group_other(sb, dir, &group, mode);
got_group:
+ EXT4_I(dir)->i_last_alloc_group = group;
err = -ENOSPC;
if (ret2 == -1)
goto out;
@@ -858,9 +970,7 @@ got:
if (sbi->s_log_groups_per_flex) {
flex_group = ext4_flex_group(sbi, group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_inodes--;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ atomic_dec(&sbi->s_flex_groups[flex_group].free_inodes);
}
inode->i_uid = current_fsuid();
@@ -885,19 +995,16 @@ got:
ei->i_disksize = 0;
/*
- * Don't inherit extent flag from directory. We set extent flag on
- * newly created directory and file only if -o extent mount option is
- * specified
+ * Don't inherit extent flag from directory, amongst others. We set
+ * extent flag on newly created directory and file only if -o extent
+ * mount option is specified
*/
- ei->i_flags = EXT4_I(dir)->i_flags & ~(EXT4_INDEX_FL|EXT4_EXTENTS_FL);
- if (S_ISLNK(mode))
- ei->i_flags &= ~(EXT4_IMMUTABLE_FL|EXT4_APPEND_FL);
- /* dirsync only applies to directories */
- if (!S_ISDIR(mode))
- ei->i_flags &= ~EXT4_DIRSYNC_FL;
+ ei->i_flags =
+ ext4_mask_flags(mode, EXT4_I(dir)->i_flags & EXT4_FL_INHERITED);
ei->i_file_acl = 0;
ei->i_dtime = 0;
ei->i_block_group = group;
+ ei->i_last_alloc_group = ~0;
ext4_set_inode_flags(inode);
if (IS_DIRSYNC(inode))
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index dd82ff39006..a2e7952bc5f 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -371,6 +371,34 @@ static int ext4_block_to_path(struct inode *inode,
return n;
}
+static int __ext4_check_blockref(const char *function, struct inode *inode,
+ unsigned int *p, unsigned int max) {
+
+ unsigned int maxblocks = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es);
+ unsigned int *bref = p;
+ while (bref < p+max) {
+ if (unlikely(*bref >= maxblocks)) {
+ ext4_error(inode->i_sb, function,
+ "block reference %u >= max (%u) "
+ "in inode #%lu, offset=%d",
+ *bref, maxblocks,
+ inode->i_ino, (int)(bref-p));
+ return -EIO;
+ }
+ bref++;
+ }
+ return 0;
+}
+
+
+#define ext4_check_indirect_blockref(inode, bh) \
+ __ext4_check_blockref(__func__, inode, (__le32 *)(bh)->b_data, \
+ EXT4_ADDR_PER_BLOCK((inode)->i_sb))
+
+#define ext4_check_inode_blockref(inode) \
+ __ext4_check_blockref(__func__, inode, EXT4_I(inode)->i_data, \
+ EXT4_NDIR_BLOCKS)
+
/**
* ext4_get_branch - read the chain of indirect blocks leading to data
* @inode: inode in question
@@ -415,9 +443,22 @@ static Indirect *ext4_get_branch(struct inode *inode, int depth,
if (!p->key)
goto no_block;
while (--depth) {
- bh = sb_bread(sb, le32_to_cpu(p->key));
- if (!bh)
+ bh = sb_getblk(sb, le32_to_cpu(p->key));
+ if (unlikely(!bh))
goto failure;
+
+ if (!bh_uptodate_or_lock(bh)) {
+ if (bh_submit_read(bh) < 0) {
+ put_bh(bh);
+ goto failure;
+ }
+ /* validate block references */
+ if (ext4_check_indirect_blockref(inode, bh)) {
+ put_bh(bh);
+ goto failure;
+ }
+ }
+
add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
/* Reader: end */
if (!p->key)
@@ -459,6 +500,8 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
ext4_fsblk_t bg_start;
ext4_fsblk_t last_block;
ext4_grpblk_t colour;
+ ext4_group_t block_group;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(inode->i_sb));
/* Try to find previous block */
for (p = ind->p - 1; p >= start; p--) {
@@ -474,9 +517,22 @@ static ext4_fsblk_t ext4_find_near(struct inode *inode, Indirect *ind)
* It is going to be referred to from the inode itself? OK, just put it
* into the same cylinder group then.
*/
- bg_start = ext4_group_first_block_no(inode->i_sb, ei->i_block_group);
+ block_group = ei->i_block_group;
+ if (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) {
+ block_group &= ~(flex_size-1);
+ if (S_ISREG(inode->i_mode))
+ block_group++;
+ }
+ bg_start = ext4_group_first_block_no(inode->i_sb, block_group);
last_block = ext4_blocks_count(EXT4_SB(inode->i_sb)->s_es) - 1;
+ /*
+ * If we are doing delayed allocation, we don't need take
+ * colour into account.
+ */
+ if (test_opt(inode->i_sb, DELALLOC))
+ return bg_start;
+
if (bg_start + EXT4_BLOCKS_PER_GROUP(inode->i_sb) <= last_block)
colour = (current->pid % 16) *
(EXT4_BLOCKS_PER_GROUP(inode->i_sb) / 16);
@@ -1052,9 +1108,16 @@ static void ext4_da_update_reserve_space(struct inode *inode, int used)
/*
* free those over-booking quota for metadata blocks
*/
-
if (mdb_free)
vfs_dq_release_reservation_block(inode, mdb_free);
+
+ /*
+ * If we have done all the pending block allocations and if
+ * there aren't any writers on the inode, we can discard the
+ * inode's preallocations.
+ */
+ if (!total && (atomic_read(&inode->i_writecount) == 0))
+ ext4_discard_preallocations(inode);
}
/*
@@ -1688,9 +1751,10 @@ static void ext4_da_page_release_reservation(struct page *page,
struct mpage_da_data {
struct inode *inode;
- struct buffer_head lbh; /* extent of blocks */
+ sector_t b_blocknr; /* start block number of extent */
+ size_t b_size; /* size of extent */
+ unsigned long b_state; /* state of the extent */
unsigned long first_page, next_page; /* extent of pages */
- get_block_t *get_block;
struct writeback_control *wbc;
int io_done;
int pages_written;
@@ -1704,7 +1768,6 @@ struct mpage_da_data {
* @mpd->inode: inode
* @mpd->first_page: first page of the extent
* @mpd->next_page: page after the last page of the extent
- * @mpd->get_block: the filesystem's block mapper function
*
* By the time mpage_da_submit_io() is called we expect all blocks
* to be allocated. this may be wrong if allocation failed.
@@ -1724,7 +1787,7 @@ static int mpage_da_submit_io(struct mpage_da_data *mpd)
/*
* We need to start from the first_page to the next_page - 1
* to make sure we also write the mapped dirty buffer_heads.
- * If we look at mpd->lbh.b_blocknr we would only be looking
+ * If we look at mpd->b_blocknr we would only be looking
* at the currently mapped buffer_heads.
*/
index = mpd->first_page;
@@ -1914,68 +1977,111 @@ static void ext4_print_free_blocks(struct inode *inode)
return;
}
+#define EXT4_DELALLOC_RSVED 1
+static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh_result, int create)
+{
+ int ret;
+ unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
+ loff_t disksize = EXT4_I(inode)->i_disksize;
+ handle_t *handle = NULL;
+
+ handle = ext4_journal_current_handle();
+ BUG_ON(!handle);
+ ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
+ bh_result, create, 0, EXT4_DELALLOC_RSVED);
+ if (ret <= 0)
+ return ret;
+
+ bh_result->b_size = (ret << inode->i_blkbits);
+
+ if (ext4_should_order_data(inode)) {
+ int retval;
+ retval = ext4_jbd2_file_inode(handle, inode);
+ if (retval)
+ /*
+ * Failed to add inode for ordered mode. Don't
+ * update file size
+ */
+ return retval;
+ }
+
+ /*
+ * Update on-disk size along with block allocation we don't
+ * use 'extend_disksize' as size may change within already
+ * allocated block -bzzz
+ */
+ disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
+ if (disksize > i_size_read(inode))
+ disksize = i_size_read(inode);
+ if (disksize > EXT4_I(inode)->i_disksize) {
+ ext4_update_i_disksize(inode, disksize);
+ ret = ext4_mark_inode_dirty(handle, inode);
+ return ret;
+ }
+ return 0;
+}
+
/*
* mpage_da_map_blocks - go through given space
*
- * @mpd->lbh - bh describing space
- * @mpd->get_block - the filesystem's block mapper function
+ * @mpd - bh describing space
*
* The function skips space we know is already mapped to disk blocks.
*
*/
-static int mpage_da_map_blocks(struct mpage_da_data *mpd)
+static int mpage_da_map_blocks(struct mpage_da_data *mpd)
{
int err = 0;
struct buffer_head new;
- struct buffer_head *lbh = &mpd->lbh;
sector_t next;
/*
* We consider only non-mapped and non-allocated blocks
*/
- if (buffer_mapped(lbh) && !buffer_delay(lbh))
+ if ((mpd->b_state & (1 << BH_Mapped)) &&
+ !(mpd->b_state & (1 << BH_Delay)))
return 0;
- new.b_state = lbh->b_state;
+ new.b_state = mpd->b_state;
new.b_blocknr = 0;
- new.b_size = lbh->b_size;
- next = lbh->b_blocknr;
+ new.b_size = mpd->b_size;
+ next = mpd->b_blocknr;
/*
* If we didn't accumulate anything
* to write simply return
*/
if (!new.b_size)
return 0;
- err = mpd->get_block(mpd->inode, next, &new, 1);
- if (err) {
- /* If get block returns with error
- * we simply return. Later writepage
- * will redirty the page and writepages
- * will find the dirty page again
+ err = ext4_da_get_block_write(mpd->inode, next, &new, 1);
+ if (err) {
+ /*
+ * If get block returns with error we simply
+ * return. Later writepage will redirty the page and
+ * writepages will find the dirty page again
*/
if (err == -EAGAIN)
return 0;
if (err == -ENOSPC &&
- ext4_count_free_blocks(mpd->inode->i_sb)) {
+ ext4_count_free_blocks(mpd->inode->i_sb)) {
mpd->retval = err;
return 0;
}
/*
- * get block failure will cause us
- * to loop in writepages. Because
- * a_ops->writepage won't be able to
- * make progress. The page will be redirtied
- * by writepage and writepages will again
- * try to write the same.
+ * get block failure will cause us to loop in
+ * writepages, because a_ops->writepage won't be able
+ * to make progress. The page will be redirtied by
+ * writepage and writepages will again try to write
+ * the same.
*/
printk(KERN_EMERG "%s block allocation failed for inode %lu "
"at logical offset %llu with max blocks "
"%zd with error %d\n",
__func__, mpd->inode->i_ino,
(unsigned long long)next,
- lbh->b_size >> mpd->inode->i_blkbits, err);
+ mpd->b_size >> mpd->inode->i_blkbits, err);
printk(KERN_EMERG "This should not happen.!! "
"Data will be lost\n");
if (err == -ENOSPC) {
@@ -1983,7 +2089,7 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
}
/* invlaidate all the pages */
ext4_da_block_invalidatepages(mpd, next,
- lbh->b_size >> mpd->inode->i_blkbits);
+ mpd->b_size >> mpd->inode->i_blkbits);
return err;
}
BUG_ON(new.b_size == 0);
@@ -1995,7 +2101,8 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* If blocks are delayed marked, we need to
* put actual blocknr and drop delayed bit
*/
- if (buffer_delay(lbh) || buffer_unwritten(lbh))
+ if ((mpd->b_state & (1 << BH_Delay)) ||
+ (mpd->b_state & (1 << BH_Unwritten)))
mpage_put_bnr_to_bhs(mpd, next, &new);
return 0;
@@ -2014,12 +2121,11 @@ static int mpage_da_map_blocks(struct mpage_da_data *mpd)
* the function is used to collect contig. blocks in same state
*/
static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
- sector_t logical, struct buffer_head *bh)
+ sector_t logical, size_t b_size,
+ unsigned long b_state)
{
sector_t next;
- size_t b_size = bh->b_size;
- struct buffer_head *lbh = &mpd->lbh;
- int nrblocks = lbh->b_size >> mpd->inode->i_blkbits;
+ int nrblocks = mpd->b_size >> mpd->inode->i_blkbits;
/* check if thereserved journal credits might overflow */
if (!(EXT4_I(mpd->inode)->i_flags & EXT4_EXTENTS_FL)) {
@@ -2046,19 +2152,19 @@ static void mpage_add_bh_to_extent(struct mpage_da_data *mpd,
/*
* First block in the extent
*/
- if (lbh->b_size == 0) {
- lbh->b_blocknr = logical;
- lbh->b_size = b_size;
- lbh->b_state = bh->b_state & BH_FLAGS;
+ if (mpd->b_size == 0) {
+ mpd->b_blocknr = logical;
+ mpd->b_size = b_size;
+ mpd->b_state = b_state & BH_FLAGS;
return;
}
- next = lbh->b_blocknr + nrblocks;
+ next = mpd->b_blocknr + nrblocks;
/*
* Can we merge the block to our big extent?
*/
- if (logical == next && (bh->b_state & BH_FLAGS) == lbh->b_state) {
- lbh->b_size += b_size;
+ if (logical == next && (b_state & BH_FLAGS) == mpd->b_state) {
+ mpd->b_size += b_size;
return;
}
@@ -2087,7 +2193,7 @@ static int __mpage_da_writepage(struct page *page,
{
struct mpage_da_data *mpd = data;
struct inode *inode = mpd->inode;
- struct buffer_head *bh, *head, fake;
+ struct buffer_head *bh, *head;
sector_t logical;
if (mpd->io_done) {
@@ -2129,9 +2235,9 @@ static int __mpage_da_writepage(struct page *page,
/*
* ... and blocks
*/
- mpd->lbh.b_size = 0;
- mpd->lbh.b_state = 0;
- mpd->lbh.b_blocknr = 0;
+ mpd->b_size = 0;
+ mpd->b_state = 0;
+ mpd->b_blocknr = 0;
}
mpd->next_page = page->index + 1;
@@ -2139,16 +2245,8 @@ static int __mpage_da_writepage(struct page *page,
(PAGE_CACHE_SHIFT - inode->i_blkbits);
if (!page_has_buffers(page)) {
- /*
- * There is no attached buffer heads yet (mmap?)
- * we treat the page asfull of dirty blocks
- */
- bh = &fake;
- bh->b_size = PAGE_CACHE_SIZE;
- bh->b_state = 0;
- set_buffer_dirty(bh);
- set_buffer_uptodate(bh);
- mpage_add_bh_to_extent(mpd, logical, bh);
+ mpage_add_bh_to_extent(mpd, logical, PAGE_CACHE_SIZE,
+ (1 << BH_Dirty) | (1 << BH_Uptodate));
if (mpd->io_done)
return MPAGE_DA_EXTENT_TAIL;
} else {
@@ -2166,8 +2264,10 @@ static int __mpage_da_writepage(struct page *page,
* with the page in ext4_da_writepage
*/
if (buffer_dirty(bh) &&
- (!buffer_mapped(bh) || buffer_delay(bh))) {
- mpage_add_bh_to_extent(mpd, logical, bh);
+ (!buffer_mapped(bh) || buffer_delay(bh))) {
+ mpage_add_bh_to_extent(mpd, logical,
+ bh->b_size,
+ bh->b_state);
if (mpd->io_done)
return MPAGE_DA_EXTENT_TAIL;
} else if (buffer_dirty(bh) && (buffer_mapped(bh))) {
@@ -2179,9 +2279,8 @@ static int __mpage_da_writepage(struct page *page,
* unmapped buffer_head later we need to
* use the b_state flag of that buffer_head.
*/
- if (mpd->lbh.b_size == 0)
- mpd->lbh.b_state =
- bh->b_state & BH_FLAGS;
+ if (mpd->b_size == 0)
+ mpd->b_state = bh->b_state & BH_FLAGS;
}
logical++;
} while ((bh = bh->b_this_page) != head);
@@ -2191,51 +2290,6 @@ static int __mpage_da_writepage(struct page *page,
}
/*
- * mpage_da_writepages - walk the list of dirty pages of the given
- * address space, allocates non-allocated blocks, maps newly-allocated
- * blocks to existing bhs and issue IO them
- *
- * @mapping: address space structure to write
- * @wbc: subtract the number of written pages from *@wbc->nr_to_write
- * @get_block: the filesystem's block mapper function.
- *
- * This is a library function, which implements the writepages()
- * address_space_operation.
- */
-static int mpage_da_writepages(struct address_space *mapping,
- struct writeback_control *wbc,
- struct mpage_da_data *mpd)
-{
- int ret;
-
- if (!mpd->get_block)
- return generic_writepages(mapping, wbc);
-
- mpd->lbh.b_size = 0;
- mpd->lbh.b_state = 0;
- mpd->lbh.b_blocknr = 0;
- mpd->first_page = 0;
- mpd->next_page = 0;
- mpd->io_done = 0;
- mpd->pages_written = 0;
- mpd->retval = 0;
-
- ret = write_cache_pages(mapping, wbc, __mpage_da_writepage, mpd);
- /*
- * Handle last extent of pages
- */
- if (!mpd->io_done && mpd->next_page != mpd->first_page) {
- if (mpage_da_map_blocks(mpd) == 0)
- mpage_da_submit_io(mpd);
-
- mpd->io_done = 1;
- ret = MPAGE_DA_EXTENT_TAIL;
- }
- wbc->nr_to_write -= mpd->pages_written;
- return ret;
-}
-
-/*
* this is a special callback for ->write_begin() only
* it's intention is to return mapped block or reserve space
*/
@@ -2274,51 +2328,6 @@ static int ext4_da_get_block_prep(struct inode *inode, sector_t iblock,
return ret;
}
-#define EXT4_DELALLOC_RSVED 1
-static int ext4_da_get_block_write(struct inode *inode, sector_t iblock,
- struct buffer_head *bh_result, int create)
-{
- int ret;
- unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
- loff_t disksize = EXT4_I(inode)->i_disksize;
- handle_t *handle = NULL;
-
- handle = ext4_journal_current_handle();
- BUG_ON(!handle);
- ret = ext4_get_blocks_wrap(handle, inode, iblock, max_blocks,
- bh_result, create, 0, EXT4_DELALLOC_RSVED);
- if (ret > 0) {
-
- bh_result->b_size = (ret << inode->i_blkbits);
-
- if (ext4_should_order_data(inode)) {
- int retval;
- retval = ext4_jbd2_file_inode(handle, inode);
- if (retval)
- /*
- * Failed to add inode for ordered
- * mode. Don't update file size
- */
- return retval;
- }
-
- /*
- * Update on-disk size along with block allocation
- * we don't use 'extend_disksize' as size may change
- * within already allocated block -bzzz
- */
- disksize = ((loff_t) iblock + ret) << inode->i_blkbits;
- if (disksize > i_size_read(inode))
- disksize = i_size_read(inode);
- if (disksize > EXT4_I(inode)->i_disksize) {
- ext4_update_i_disksize(inode, disksize);
- ret = ext4_mark_inode_dirty(handle, inode);
- return ret;
- }
- ret = 0;
- }
- return ret;
-}
static int ext4_bh_unmapped_or_delay(handle_t *handle, struct buffer_head *bh)
{
@@ -2569,8 +2578,38 @@ retry:
dump_stack();
goto out_writepages;
}
- mpd.get_block = ext4_da_get_block_write;
- ret = mpage_da_writepages(mapping, wbc, &mpd);
+
+ /*
+ * Now call __mpage_da_writepage to find the next
+ * contiguous region of logical blocks that need
+ * blocks to be allocated by ext4. We don't actually
+ * submit the blocks for I/O here, even though
+ * write_cache_pages thinks it will, and will set the
+ * pages as clean for write before calling
+ * __mpage_da_writepage().
+ */
+ mpd.b_size = 0;
+ mpd.b_state = 0;
+ mpd.b_blocknr = 0;
+ mpd.first_page = 0;
+ mpd.next_page = 0;
+ mpd.io_done = 0;
+ mpd.pages_written = 0;
+ mpd.retval = 0;
+ ret = write_cache_pages(mapping, wbc, __mpage_da_writepage,
+ &mpd);
+ /*
+ * If we have a contigous extent of pages and we
+ * haven't done the I/O yet, map the blocks and submit
+ * them for I/O.
+ */
+ if (!mpd.io_done && mpd.next_page != mpd.first_page) {
+ if (mpage_da_map_blocks(&mpd) == 0)
+ mpage_da_submit_io(&mpd);
+ mpd.io_done = 1;
+ ret = MPAGE_DA_EXTENT_TAIL;
+ }
+ wbc->nr_to_write -= mpd.pages_written;
ext4_journal_stop(handle);
@@ -2846,6 +2885,48 @@ out:
return;
}
+/*
+ * Force all delayed allocation blocks to be allocated for a given inode.
+ */
+int ext4_alloc_da_blocks(struct inode *inode)
+{
+ if (!EXT4_I(inode)->i_reserved_data_blocks &&
+ !EXT4_I(inode)->i_reserved_meta_blocks)
+ return 0;
+
+ /*
+ * We do something simple for now. The filemap_flush() will
+ * also start triggering a write of the data blocks, which is
+ * not strictly speaking necessary (and for users of
+ * laptop_mode, not even desirable). However, to do otherwise
+ * would require replicating code paths in:
+ *
+ * ext4_da_writepages() ->
+ * write_cache_pages() ---> (via passed in callback function)
+ * __mpage_da_writepage() -->
+ * mpage_add_bh_to_extent()
+ * mpage_da_map_blocks()
+ *
+ * The problem is that write_cache_pages(), located in
+ * mm/page-writeback.c, marks pages clean in preparation for
+ * doing I/O, which is not desirable if we're not planning on
+ * doing I/O at all.
+ *
+ * We could call write_cache_pages(), and then redirty all of
+ * the pages by calling redirty_page_for_writeback() but that
+ * would be ugly in the extreme. So instead we would need to
+ * replicate parts of the code in the above functions,
+ * simplifying them becuase we wouldn't actually intend to
+ * write out the pages, but rather only collect contiguous
+ * logical block extents, call the multi-block allocator, and
+ * then update the buffer heads with the block allocations.
+ *
+ * For now, though, we'll cheat by calling filemap_flush(),
+ * which will map the blocks, and start the I/O, but not
+ * actually wait for the I/O to complete.
+ */
+ return filemap_flush(inode->i_mapping);
+}
/*
* bmap() is special. It gets used by applications such as lilo and by
@@ -3868,6 +3949,9 @@ void ext4_truncate(struct inode *inode)
if (!ext4_can_truncate(inode))
return;
+ if (inode->i_size == 0 && !test_opt(inode->i_sb, NO_AUTO_DA_ALLOC))
+ ei->i_state |= EXT4_STATE_DA_ALLOC_CLOSE;
+
if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL) {
ext4_ext_truncate(inode);
return;
@@ -4110,12 +4194,7 @@ make_io:
unsigned num;
table = ext4_inode_table(sb, gdp);
- /* Make sure s_inode_readahead_blks is a power of 2 */
- while (EXT4_SB(sb)->s_inode_readahead_blks &
- (EXT4_SB(sb)->s_inode_readahead_blks-1))
- EXT4_SB(sb)->s_inode_readahead_blks =
- (EXT4_SB(sb)->s_inode_readahead_blks &
- (EXT4_SB(sb)->s_inode_readahead_blks-1));
+ /* s_inode_readahead_blks is always a power of 2 */
b = block & ~(EXT4_SB(sb)->s_inode_readahead_blks-1);
if (table > b)
b = table;
@@ -4287,6 +4366,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
ei->i_disksize = inode->i_size;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
ei->i_block_group = iloc.block_group;
+ ei->i_last_alloc_group = ~0;
/*
* NOTE! The in-memory inode i_data array is in little-endian order
* even on big-endian machines: we do NOT byteswap the block numbers!
@@ -4329,6 +4409,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
(__u64)(le32_to_cpu(raw_inode->i_version_hi)) << 32;
}
+ if (ei->i_flags & EXT4_EXTENTS_FL) {
+ /* Validate extent which is part of inode */
+ ret = ext4_ext_check_inode(inode);
+ } else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
+ (S_ISLNK(inode->i_mode) &&
+ !ext4_inode_is_fast_symlink(inode))) {
+ /* Validate block references which are part of inode */
+ ret = ext4_check_inode_blockref(inode);
+ }
+ if (ret) {
+ brelse(bh);
+ goto bad_inode;
+ }
+
if (S_ISREG(inode->i_mode)) {
inode->i_op = &ext4_file_inode_operations;
inode->i_fop = &ext4_file_operations;
@@ -4345,7 +4439,8 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
inode->i_op = &ext4_symlink_inode_operations;
ext4_set_aops(inode);
}
- } else {
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
inode->i_op = &ext4_special_inode_operations;
if (raw_inode->i_block[0])
init_special_inode(inode, inode->i_mode,
@@ -4353,6 +4448,13 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
else
init_special_inode(inode, inode->i_mode,
new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
+ } else {
+ brelse(bh);
+ ret = -EIO;
+ ext4_error(inode->i_sb, __func__,
+ "bogus i_mode (%o) for inode=%lu",
+ inode->i_mode, inode->i_ino);
+ goto bad_inode;
}
brelse(iloc.bh);
ext4_set_inode_flags(inode);
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index 42dc83fb247..91e75f7a9e7 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -48,8 +48,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
if (err)
return err;
- if (!S_ISDIR(inode->i_mode))
- flags &= ~EXT4_DIRSYNC_FL;
+ flags = ext4_mask_flags(inode->i_mode, flags);
err = -EPERM;
mutex_lock(&inode->i_mutex);
@@ -263,6 +262,20 @@ setversion_out:
return err;
}
+ case EXT4_IOC_ALLOC_DA_BLKS:
+ {
+ int err;
+ if (!is_owner_or_cap(inode))
+ return -EACCES;
+
+ err = mnt_want_write(filp->f_path.mnt);
+ if (err)
+ return err;
+ err = ext4_alloc_da_blocks(inode);
+ mnt_drop_write(filp->f_path.mnt);
+ return err;
+ }
+
default:
return -ENOTTY;
}
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index b038188bd03..f871677a798 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -46,22 +46,23 @@
* The allocation request involve request for multiple number of blocks
* near to the goal(block) value specified.
*
- * During initialization phase of the allocator we decide to use the group
- * preallocation or inode preallocation depending on the size file. The
- * size of the file could be the resulting file size we would have after
- * allocation or the current file size which ever is larger. If the size is
- * less that sbi->s_mb_stream_request we select the group
- * preallocation. The default value of s_mb_stream_request is 16
- * blocks. This can also be tuned via
- * /proc/fs/ext4/<partition>/stream_req. The value is represented in terms
- * of number of blocks.
+ * During initialization phase of the allocator we decide to use the
+ * group preallocation or inode preallocation depending on the size of
+ * the file. The size of the file could be the resulting file size we
+ * would have after allocation, or the current file size, which ever
+ * is larger. If the size is less than sbi->s_mb_stream_request we
+ * select to use the group preallocation. The default value of
+ * s_mb_stream_request is 16 blocks. This can also be tuned via
+ * /sys/fs/ext4/<partition>/mb_stream_req. The value is represented in
+ * terms of number of blocks.
*
* The main motivation for having small file use group preallocation is to
- * ensure that we have small file closer in the disk.
+ * ensure that we have small files closer together on the disk.
*
- * First stage the allocator looks at the inode prealloc list
- * ext4_inode_info->i_prealloc_list contain list of prealloc spaces for
- * this particular inode. The inode prealloc space is represented as:
+ * First stage the allocator looks at the inode prealloc list,
+ * ext4_inode_info->i_prealloc_list, which contains list of prealloc
+ * spaces for this particular inode. The inode prealloc space is
+ * represented as:
*
* pa_lstart -> the logical start block for this prealloc space
* pa_pstart -> the physical start block for this prealloc space
@@ -121,29 +122,29 @@
* list. In case of inode preallocation we follow a list of heuristics
* based on file size. This can be found in ext4_mb_normalize_request. If
* we are doing a group prealloc we try to normalize the request to
- * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is set to
+ * sbi->s_mb_group_prealloc. Default value of s_mb_group_prealloc is
* 512 blocks. This can be tuned via
- * /proc/fs/ext4/<partition/group_prealloc. The value is represented in
+ * /sys/fs/ext4/<partition/mb_group_prealloc. The value is represented in
* terms of number of blocks. If we have mounted the file system with -O
* stripe=<value> option the group prealloc request is normalized to the
* stripe value (sbi->s_stripe)
*
- * The regular allocator(using the buddy cache) support few tunables.
+ * The regular allocator(using the buddy cache) supports few tunables.
*
- * /proc/fs/ext4/<partition>/min_to_scan
- * /proc/fs/ext4/<partition>/max_to_scan
- * /proc/fs/ext4/<partition>/order2_req
+ * /sys/fs/ext4/<partition>/mb_min_to_scan
+ * /sys/fs/ext4/<partition>/mb_max_to_scan
+ * /sys/fs/ext4/<partition>/mb_order2_req
*
- * The regular allocator use buddy scan only if the request len is power of
+ * The regular allocator uses buddy scan only if the request len is power of
* 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The
* value of s_mb_order2_reqs can be tuned via
- * /proc/fs/ext4/<partition>/order2_req. If the request len is equal to
+ * /sys/fs/ext4/<partition>/mb_order2_req. If the request len is equal to
* stripe size (sbi->s_stripe), we try to search for contigous block in
- * stripe size. This should result in better allocation on RAID setup. If
- * not we search in the specific group using bitmap for best extents. The
- * tunable min_to_scan and max_to_scan controll the behaviour here.
+ * stripe size. This should result in better allocation on RAID setups. If
+ * not, we search in the specific group using bitmap for best extents. The
+ * tunable min_to_scan and max_to_scan control the behaviour here.
* min_to_scan indicate how long the mballoc __must__ look for a best
- * extent and max_to_scanindicate how long the mballoc __can__ look for a
+ * extent and max_to_scan indicates how long the mballoc __can__ look for a
* best extent in the found extents. Searching for the blocks starts with
* the group specified as the goal value in allocation context via
* ac_g_ex. Each group is first checked based on the criteria whether it
@@ -337,8 +338,6 @@ static void ext4_mb_generate_from_pa(struct super_block *sb, void *bitmap,
ext4_group_t group);
static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
ext4_group_t group);
-static int ext4_mb_init_per_dev_proc(struct super_block *sb);
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb);
static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
@@ -1726,6 +1725,7 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
{
unsigned free, fragments;
unsigned i, bits;
+ int flex_size = ext4_flex_bg_size(EXT4_SB(ac->ac_sb));
struct ext4_group_desc *desc;
struct ext4_group_info *grp = ext4_get_group_info(ac->ac_sb, group);
@@ -1747,6 +1747,12 @@ static int ext4_mb_good_group(struct ext4_allocation_context *ac,
if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))
return 0;
+ /* Avoid using the first bg of a flexgroup for data files */
+ if ((ac->ac_flags & EXT4_MB_HINT_DATA) &&
+ (flex_size >= EXT4_FLEX_SIZE_DIR_ALLOC_SCHEME) &&
+ ((group % flex_size) == 0))
+ return 0;
+
bits = ac->ac_sb->s_blocksize_bits + 1;
for (i = ac->ac_2order; i <= bits; i++)
if (grp->bb_counters[i] > 0)
@@ -1971,7 +1977,7 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
/*
* We search using buddy data only if the order of the request
* is greater than equal to the sbi_s_mb_order2_reqs
- * You can tune it via /proc/fs/ext4/<partition>/order2_req
+ * You can tune it via /sys/fs/ext4/<partition>/mb_order2_req
*/
if (i >= sbi->s_mb_order2_reqs) {
/*
@@ -2693,7 +2699,7 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
i = (sb->s_blocksize_bits + 2) * sizeof(unsigned int);
sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL);
if (sbi->s_mb_maxs == NULL) {
- kfree(sbi->s_mb_maxs);
+ kfree(sbi->s_mb_offsets);
return -ENOMEM;
}
@@ -2746,7 +2752,6 @@ int ext4_mb_init(struct super_block *sb, int needs_recovery)
spin_lock_init(&lg->lg_prealloc_lock);
}
- ext4_mb_init_per_dev_proc(sb);
ext4_mb_history_init(sb);
if (sbi->s_journal)
@@ -2829,7 +2834,6 @@ int ext4_mb_release(struct super_block *sb)
free_percpu(sbi->s_locality_groups);
ext4_mb_history_release(sb);
- ext4_mb_destroy_per_dev_proc(sb);
return 0;
}
@@ -2890,62 +2894,6 @@ static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
mb_debug("freed %u blocks in %u structures\n", count, count2);
}
-#define EXT4_MB_STATS_NAME "stats"
-#define EXT4_MB_MAX_TO_SCAN_NAME "max_to_scan"
-#define EXT4_MB_MIN_TO_SCAN_NAME "min_to_scan"
-#define EXT4_MB_ORDER2_REQ "order2_req"
-#define EXT4_MB_STREAM_REQ "stream_req"
-#define EXT4_MB_GROUP_PREALLOC "group_prealloc"
-
-static int ext4_mb_init_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
- mode_t mode = S_IFREG | S_IRUGO | S_IWUSR;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct proc_dir_entry *proc;
-
- if (sbi->s_proc == NULL)
- return -EINVAL;
-
- EXT4_PROC_HANDLER(EXT4_MB_STATS_NAME, mb_stats);
- EXT4_PROC_HANDLER(EXT4_MB_MAX_TO_SCAN_NAME, mb_max_to_scan);
- EXT4_PROC_HANDLER(EXT4_MB_MIN_TO_SCAN_NAME, mb_min_to_scan);
- EXT4_PROC_HANDLER(EXT4_MB_ORDER2_REQ, mb_order2_reqs);
- EXT4_PROC_HANDLER(EXT4_MB_STREAM_REQ, mb_stream_request);
- EXT4_PROC_HANDLER(EXT4_MB_GROUP_PREALLOC, mb_group_prealloc);
- return 0;
-
-err_out:
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
- remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
- remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
- remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
- remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
- return -ENOMEM;
-#else
- return 0;
-#endif
-}
-
-static int ext4_mb_destroy_per_dev_proc(struct super_block *sb)
-{
-#ifdef CONFIG_PROC_FS
- struct ext4_sb_info *sbi = EXT4_SB(sb);
-
- if (sbi->s_proc == NULL)
- return -EINVAL;
-
- remove_proc_entry(EXT4_MB_GROUP_PREALLOC, sbi->s_proc);
- remove_proc_entry(EXT4_MB_STREAM_REQ, sbi->s_proc);
- remove_proc_entry(EXT4_MB_ORDER2_REQ, sbi->s_proc);
- remove_proc_entry(EXT4_MB_MIN_TO_SCAN_NAME, sbi->s_proc);
- remove_proc_entry(EXT4_MB_MAX_TO_SCAN_NAME, sbi->s_proc);
- remove_proc_entry(EXT4_MB_STATS_NAME, sbi->s_proc);
-#endif
- return 0;
-}
-
int __init init_ext4_mballoc(void)
{
ext4_pspace_cachep =
@@ -3096,9 +3044,8 @@ ext4_mb_mark_diskspace_used(struct ext4_allocation_context *ac,
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi,
ac->ac_b_ex.fe_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks -= ac->ac_b_ex.fe_len;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ atomic_sub(ac->ac_b_ex.fe_len,
+ &sbi->s_flex_groups[flex_group].free_blocks);
}
err = ext4_handle_dirty_metadata(handle, NULL, bitmap_bh);
@@ -3116,7 +3063,7 @@ out_err:
* here we normalize request for locality group
* Group request are normalized to s_strip size if we set the same via mount
* option. If not we set it to s_mb_group_prealloc which can be configured via
- * /proc/fs/ext4/<partition>/group_prealloc
+ * /sys/fs/ext4/<partition>/mb_group_prealloc
*
* XXX: should we try to preallocate more than the group has now?
*/
@@ -3608,8 +3555,11 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac,
spin_unlock(&pa->pa_lock);
grp_blk = pa->pa_pstart;
- /* If linear, pa_pstart may be in the next group when pa is used up */
- if (pa->pa_linear)
+ /*
+ * If doing group-based preallocation, pa_pstart may be in the
+ * next group when pa is used up
+ */
+ if (pa->pa_type == MB_GROUP_PA)
grp_blk--;
ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL);
@@ -3704,7 +3654,7 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac)
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
- pa->pa_linear = 0;
+ pa->pa_type = MB_INODE_PA;
mb_debug("new inode pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -3767,7 +3717,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac)
INIT_LIST_HEAD(&pa->pa_inode_list);
INIT_LIST_HEAD(&pa->pa_group_list);
pa->pa_deleted = 0;
- pa->pa_linear = 1;
+ pa->pa_type = MB_GROUP_PA;
mb_debug("new group pa %p: %llu/%u for %u\n", pa,
pa->pa_pstart, pa->pa_len, pa->pa_lstart);
@@ -4021,7 +3971,7 @@ repeat:
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);
- if (pa->pa_linear)
+ if (pa->pa_type == MB_GROUP_PA)
ext4_mb_release_group_pa(&e4b, pa, ac);
else
ext4_mb_release_inode_pa(&e4b, bitmap_bh, pa, ac);
@@ -4121,7 +4071,7 @@ repeat:
spin_unlock(&ei->i_prealloc_lock);
list_for_each_entry_safe(pa, tmp, &list, u.pa_tmp_list) {
- BUG_ON(pa->pa_linear != 0);
+ BUG_ON(pa->pa_type != MB_INODE_PA);
ext4_get_group_no_and_offset(sb, pa->pa_pstart, &group, NULL);
err = ext4_mb_load_buddy(sb, group, &e4b);
@@ -4232,7 +4182,7 @@ static inline void ext4_mb_show_ac(struct ext4_allocation_context *ac)
* file is determined by the current size or the resulting size after
* allocation which ever is larger
*
- * One can tune this size via /proc/fs/ext4/<partition>/stream_req
+ * One can tune this size via /sys/fs/ext4/<partition>/mb_stream_req
*/
static void ext4_mb_group_or_file(struct ext4_allocation_context *ac)
{
@@ -4373,7 +4323,7 @@ ext4_mb_discard_lg_preallocations(struct super_block *sb,
continue;
}
/* only lg prealloc space */
- BUG_ON(!pa->pa_linear);
+ BUG_ON(pa->pa_type != MB_GROUP_PA);
/* seems this one can be freed ... */
pa->pa_deleted = 1;
@@ -4442,7 +4392,7 @@ static void ext4_mb_add_n_trim(struct ext4_allocation_context *ac)
pa_inode_list) {
spin_lock(&tmp_pa->pa_lock);
if (tmp_pa->pa_deleted) {
- spin_unlock(&pa->pa_lock);
+ spin_unlock(&tmp_pa->pa_lock);
continue;
}
if (!added && pa->pa_free < tmp_pa->pa_free) {
@@ -4479,7 +4429,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
{
struct ext4_prealloc_space *pa = ac->ac_pa;
if (pa) {
- if (pa->pa_linear) {
+ if (pa->pa_type == MB_GROUP_PA) {
/* see comment in ext4_mb_use_group_pa() */
spin_lock(&pa->pa_lock);
pa->pa_pstart += ac->ac_b_ex.fe_len;
@@ -4499,7 +4449,7 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac)
* doesn't grow big. We need to release
* alloc_semp before calling ext4_mb_add_n_trim()
*/
- if (pa->pa_linear && likely(pa->pa_free)) {
+ if ((pa->pa_type == MB_GROUP_PA) && likely(pa->pa_free)) {
spin_lock(pa->pa_obj_lock);
list_del_rcu(&pa->pa_inode_list);
spin_unlock(pa->pa_obj_lock);
@@ -4936,9 +4886,7 @@ do_more:
if (sbi->s_log_groups_per_flex) {
ext4_group_t flex_group = ext4_flex_group(sbi, block_group);
- spin_lock(sb_bgl_lock(sbi, flex_group));
- sbi->s_flex_groups[flex_group].free_blocks += count;
- spin_unlock(sb_bgl_lock(sbi, flex_group));
+ atomic_add(count, &sbi->s_flex_groups[flex_group].free_blocks);
}
ext4_mb_release_desc(&e4b);
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 10a2921baf1..dd9e6cd5f6c 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -132,12 +132,15 @@ struct ext4_prealloc_space {
ext4_lblk_t pa_lstart; /* log. block */
unsigned short pa_len; /* len of preallocated chunk */
unsigned short pa_free; /* how many blocks are free */
- unsigned short pa_linear; /* consumed in one direction
- * strictly, for grp prealloc */
+ unsigned short pa_type; /* pa type. inode or group */
spinlock_t *pa_obj_lock;
struct inode *pa_inode; /* hack, for history only */
};
+enum {
+ MB_INODE_PA = 0,
+ MB_GROUP_PA = 1
+};
struct ext4_free_extent {
ext4_lblk_t fe_logical;
@@ -247,7 +250,6 @@ static inline void ext4_mb_store_history(struct ext4_allocation_context *ac)
#define in_range(b, first, len) ((b) >= (first) && (b) <= (first) + (len) - 1)
-struct buffer_head *read_block_bitmap(struct super_block *, ext4_group_t);
static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
struct ext4_free_extent *fex)
{
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 83410244d3e..22098e1cd08 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -161,12 +161,12 @@ static struct dx_frame *dx_probe(const struct qstr *d_name,
struct dx_frame *frame,
int *err);
static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct ext4_dir_entry_2 *de, int size,
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
struct dx_hash_info *hinfo, struct dx_map_entry map[]);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
- struct dx_map_entry *offsets, int count);
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size);
+ struct dx_map_entry *offsets, int count, unsigned blocksize);
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize);
static void dx_insert_block(struct dx_frame *frame,
u32 hash, ext4_lblk_t block);
static int ext4_htree_next_block(struct inode *dir, __u32 hash,
@@ -180,14 +180,38 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
struct inode *inode);
+unsigned int ext4_rec_len_from_disk(__le16 dlen, unsigned blocksize)
+{
+ unsigned len = le16_to_cpu(dlen);
+
+ if (len == EXT4_MAX_REC_LEN || len == 0)
+ return blocksize;
+ return (len & 65532) | ((len & 3) << 16);
+}
+
+__le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
+{
+ if ((len > blocksize) || (blocksize > (1 << 18)) || (len & 3))
+ BUG();
+ if (len < 65536)
+ return cpu_to_le16(len);
+ if (len == blocksize) {
+ if (blocksize == 65536)
+ return cpu_to_le16(EXT4_MAX_REC_LEN);
+ else
+ return cpu_to_le16(0);
+ }
+ return cpu_to_le16((len & 65532) | ((len >> 16) & 3));
+}
+
/*
* p is at least 6 bytes before the end of page
*/
static inline struct ext4_dir_entry_2 *
-ext4_next_entry(struct ext4_dir_entry_2 *p)
+ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
{
return (struct ext4_dir_entry_2 *)((char *)p +
- ext4_rec_len_from_disk(p->rec_len));
+ ext4_rec_len_from_disk(p->rec_len, blocksize));
}
/*
@@ -294,7 +318,7 @@ static struct stats dx_show_leaf(struct dx_hash_info *hinfo, struct ext4_dir_ent
space += EXT4_DIR_REC_LEN(de->name_len);
names++;
}
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, size);
}
printk("(%i)\n", names);
return (struct stats) { names, space, 1 };
@@ -585,7 +609,7 @@ static int htree_dirblock_to_tree(struct file *dir_file,
top = (struct ext4_dir_entry_2 *) ((char *) de +
dir->i_sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de)) {
+ for (; de < top; de = ext4_next_entry(de, dir->i_sb->s_blocksize)) {
if (!ext4_check_dir_entry("htree_dirblock_to_tree", dir, de, bh,
(block<<EXT4_BLOCK_SIZE_BITS(dir->i_sb))
+((char *)de - bh->b_data))) {
@@ -663,7 +687,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
}
if (start_hash < 2 || (start_hash ==2 && start_minor_hash==0)) {
de = (struct ext4_dir_entry_2 *) frames[0].bh->b_data;
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
if ((err = ext4_htree_store_dirent(dir_file, 2, 0, de)) != 0)
goto errout;
count++;
@@ -713,15 +737,15 @@ errout:
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
- struct dx_hash_info *hinfo, struct dx_map_entry *map_tail)
+static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize,
+ struct dx_hash_info *hinfo,
+ struct dx_map_entry *map_tail)
{
int count = 0;
char *base = (char *) de;
struct dx_hash_info h = *hinfo;
- while ((char *) de < base + size)
- {
+ while ((char *) de < base + blocksize) {
if (de->name_len && de->inode) {
ext4fs_dirhash(de->name, de->name_len, &h);
map_tail--;
@@ -732,7 +756,7 @@ static int dx_make_map (struct ext4_dir_entry_2 *de, int size,
cond_resched();
}
/* XXX: do we need to check rec_len == 0 case? -Chris */
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, blocksize);
}
return count;
}
@@ -832,7 +856,8 @@ static inline int search_dirblock(struct buffer_head *bh,
return 1;
}
/* prevent looping on a bad block */
- de_len = ext4_rec_len_from_disk(de->rec_len);
+ de_len = ext4_rec_len_from_disk(de->rec_len,
+ dir->i_sb->s_blocksize);
if (de_len <= 0)
return -1;
offset += de_len;
@@ -996,7 +1021,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q
de = (struct ext4_dir_entry_2 *) bh->b_data;
top = (struct ext4_dir_entry_2 *) ((char *) de + sb->s_blocksize -
EXT4_DIR_REC_LEN(0));
- for (; de < top; de = ext4_next_entry(de)) {
+ for (; de < top; de = ext4_next_entry(de, sb->s_blocksize)) {
int off = (block << EXT4_BLOCK_SIZE_BITS(sb))
+ ((char *) de - bh->b_data);
@@ -1052,8 +1077,16 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, stru
return ERR_PTR(-EIO);
}
inode = ext4_iget(dir->i_sb, ino);
- if (IS_ERR(inode))
- return ERR_CAST(inode);
+ if (unlikely(IS_ERR(inode))) {
+ if (PTR_ERR(inode) == -ESTALE) {
+ ext4_error(dir->i_sb, __func__,
+ "deleted inode referenced: %u",
+ ino);
+ return ERR_PTR(-EIO);
+ } else {
+ return ERR_CAST(inode);
+ }
+ }
}
return d_splice_alias(inode, dentry);
}
@@ -1109,7 +1142,8 @@ static inline void ext4_set_de_type(struct super_block *sb,
* Returns pointer to last entry moved.
*/
static struct ext4_dir_entry_2 *
-dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
+dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count,
+ unsigned blocksize)
{
unsigned rec_len = 0;
@@ -1118,7 +1152,7 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
rec_len = EXT4_DIR_REC_LEN(de->name_len);
memcpy (to, de, rec_len);
((struct ext4_dir_entry_2 *) to)->rec_len =
- ext4_rec_len_to_disk(rec_len);
+ ext4_rec_len_to_disk(rec_len, blocksize);
de->inode = 0;
map++;
to += rec_len;
@@ -1130,19 +1164,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count)
* Compact each dir entry in the range to the minimal rec_len.
* Returns pointer to last entry in range.
*/
-static struct ext4_dir_entry_2* dx_pack_dirents(char *base, int size)
+static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize)
{
struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base;
unsigned rec_len = 0;
prev = to = de;
- while ((char*)de < base + size) {
- next = ext4_next_entry(de);
+ while ((char*)de < base + blocksize) {
+ next = ext4_next_entry(de, blocksize);
if (de->inode && de->name_len) {
rec_len = EXT4_DIR_REC_LEN(de->name_len);
if (de > to)
memmove(to, de, rec_len);
- to->rec_len = ext4_rec_len_to_disk(rec_len);
+ to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize);
prev = to;
to = (struct ext4_dir_entry_2 *) (((char *) to) + rec_len);
}
@@ -1215,10 +1249,12 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir,
hash2, split, count-split));
/* Fancy dance to stay within two buffers */
- de2 = dx_move_dirents(data1, data2, map + split, count - split);
+ de2 = dx_move_dirents(data1, data2, map + split, count - split, blocksize);
de = dx_pack_dirents(data1, blocksize);
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
- de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2);
+ de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ blocksize);
+ de2->rec_len = ext4_rec_len_to_disk(data2 + blocksize - (char *) de2,
+ blocksize);
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data1, blocksize, 1));
dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1));
@@ -1268,6 +1304,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
const char *name = dentry->d_name.name;
int namelen = dentry->d_name.len;
unsigned int offset = 0;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
unsigned short reclen;
int nlen, rlen, err;
char *top;
@@ -1275,7 +1312,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
reclen = EXT4_DIR_REC_LEN(namelen);
if (!de) {
de = (struct ext4_dir_entry_2 *)bh->b_data;
- top = bh->b_data + dir->i_sb->s_blocksize - reclen;
+ top = bh->b_data + blocksize - reclen;
while ((char *) de <= top) {
if (!ext4_check_dir_entry("ext4_add_entry", dir, de,
bh, offset)) {
@@ -1287,7 +1324,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
return -EEXIST;
}
nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
if ((de->inode? rlen - nlen: rlen) >= reclen)
break;
de = (struct ext4_dir_entry_2 *)((char *)de + rlen);
@@ -1306,11 +1343,11 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry,
/* By now the buffer is marked for journaling */
nlen = EXT4_DIR_REC_LEN(de->name_len);
- rlen = ext4_rec_len_from_disk(de->rec_len);
+ rlen = ext4_rec_len_from_disk(de->rec_len, blocksize);
if (de->inode) {
struct ext4_dir_entry_2 *de1 = (struct ext4_dir_entry_2 *)((char *)de + nlen);
- de1->rec_len = ext4_rec_len_to_disk(rlen - nlen);
- de->rec_len = ext4_rec_len_to_disk(nlen);
+ de1->rec_len = ext4_rec_len_to_disk(rlen - nlen, blocksize);
+ de->rec_len = ext4_rec_len_to_disk(nlen, blocksize);
de = de1;
}
de->file_type = EXT4_FT_UNKNOWN;
@@ -1380,7 +1417,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
/* The 0th block becomes the root, move the dirents out */
fde = &root->dotdot;
de = (struct ext4_dir_entry_2 *)((char *)fde +
- ext4_rec_len_from_disk(fde->rec_len));
+ ext4_rec_len_from_disk(fde->rec_len, blocksize));
if ((char *) de >= (((char *) root) + blocksize)) {
ext4_error(dir->i_sb, __func__,
"invalid rec_len for '..' in inode %lu",
@@ -1402,12 +1439,14 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry,
memcpy (data1, de, len);
de = (struct ext4_dir_entry_2 *) data1;
top = data1 + len;
- while ((char *)(de2 = ext4_next_entry(de)) < top)
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
de = de2;
- de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de);
+ de->rec_len = ext4_rec_len_to_disk(data1 + blocksize - (char *) de,
+ blocksize);
/* Initialize the root; the dot dirents already exist */
de = (struct ext4_dir_entry_2 *) (&root->dotdot);
- de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2));
+ de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2),
+ blocksize);
memset (&root->info, 0, sizeof(root->info));
root->info.info_length = sizeof(root->info);
root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version;
@@ -1488,7 +1527,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry,
return retval;
de = (struct ext4_dir_entry_2 *) bh->b_data;
de->inode = 0;
- de->rec_len = ext4_rec_len_to_disk(blocksize);
+ de->rec_len = ext4_rec_len_to_disk(blocksize, blocksize);
return add_dirent_to_buf(handle, dentry, inode, de, bh);
}
@@ -1551,7 +1590,8 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry,
goto cleanup;
node2 = (struct dx_node *)(bh2->b_data);
entries2 = node2->entries;
- node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize);
+ node2->fake.rec_len = ext4_rec_len_to_disk(sb->s_blocksize,
+ sb->s_blocksize);
node2->fake.inode = 0;
BUFFER_TRACE(frame->bh, "get_write_access");
err = ext4_journal_get_write_access(handle, frame->bh);
@@ -1639,6 +1679,7 @@ static int ext4_delete_entry(handle_t *handle,
struct buffer_head *bh)
{
struct ext4_dir_entry_2 *de, *pde;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
int i;
i = 0;
@@ -1652,8 +1693,11 @@ static int ext4_delete_entry(handle_t *handle,
ext4_journal_get_write_access(handle, bh);
if (pde)
pde->rec_len = ext4_rec_len_to_disk(
- ext4_rec_len_from_disk(pde->rec_len) +
- ext4_rec_len_from_disk(de->rec_len));
+ ext4_rec_len_from_disk(pde->rec_len,
+ blocksize) +
+ ext4_rec_len_from_disk(de->rec_len,
+ blocksize),
+ blocksize);
else
de->inode = 0;
dir->i_version++;
@@ -1661,9 +1705,9 @@ static int ext4_delete_entry(handle_t *handle,
ext4_handle_dirty_metadata(handle, dir, bh);
return 0;
}
- i += ext4_rec_len_from_disk(de->rec_len);
+ i += ext4_rec_len_from_disk(de->rec_len, blocksize);
pde = de;
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, blocksize);
}
return -ENOENT;
}
@@ -1793,6 +1837,7 @@ static int ext4_mkdir(struct inode *dir, struct dentry *dentry, int mode)
struct inode *inode;
struct buffer_head *dir_block;
struct ext4_dir_entry_2 *de;
+ unsigned int blocksize = dir->i_sb->s_blocksize;
int err, retries = 0;
if (EXT4_DIR_LINK_MAX(dir))
@@ -1824,13 +1869,14 @@ retry:
de = (struct ext4_dir_entry_2 *) dir_block->b_data;
de->inode = cpu_to_le32(inode->i_ino);
de->name_len = 1;
- de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len));
+ de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len),
+ blocksize);
strcpy(de->name, ".");
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
- de = ext4_next_entry(de);
+ de = ext4_next_entry(de, blocksize);
de->inode = cpu_to_le32(dir->i_ino);
- de->rec_len = ext4_rec_len_to_disk(inode->i_sb->s_blocksize -
- EXT4_DIR_REC_LEN(1));
+ de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(1),
+ blocksize);
de->name_len = 2;
strcpy(de->name, "..");
ext4_set_de_type(dir->i_sb, de, S_IFDIR);
@@ -1885,7 +1931,7 @@ static int empty_dir(struct inode *inode)
return 1;
}
de = (struct ext4_dir_entry_2 *) bh->b_data;
- de1 = ext4_next_entry(de);
+ de1 = ext4_next_entry(de, sb->s_blocksize);
if (le32_to_cpu(de->inode) != inode->i_ino ||
!le32_to_cpu(de1->inode) ||
strcmp(".", de->name) ||
@@ -1896,9 +1942,9 @@ static int empty_dir(struct inode *inode)
brelse(bh);
return 1;
}
- offset = ext4_rec_len_from_disk(de->rec_len) +
- ext4_rec_len_from_disk(de1->rec_len);
- de = ext4_next_entry(de1);
+ offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize) +
+ ext4_rec_len_from_disk(de1->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de1, sb->s_blocksize);
while (offset < inode->i_size) {
if (!bh ||
(void *) de >= (void *) (bh->b_data+sb->s_blocksize)) {
@@ -1927,8 +1973,8 @@ static int empty_dir(struct inode *inode)
brelse(bh);
return 0;
}
- offset += ext4_rec_len_from_disk(de->rec_len);
- de = ext4_next_entry(de);
+ offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
+ de = ext4_next_entry(de, sb->s_blocksize);
}
brelse(bh);
return 1;
@@ -2297,8 +2343,8 @@ retry:
return err;
}
-#define PARENT_INO(buffer) \
- (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer))->inode)
+#define PARENT_INO(buffer, size) \
+ (ext4_next_entry((struct ext4_dir_entry_2 *)(buffer), size)->inode)
/*
* Anybody can rename anything with this: the permission checks are left to the
@@ -2311,7 +2357,7 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *old_inode, *new_inode;
struct buffer_head *old_bh, *new_bh, *dir_bh;
struct ext4_dir_entry_2 *old_de, *new_de;
- int retval;
+ int retval, force_da_alloc = 0;
old_bh = new_bh = dir_bh = NULL;
@@ -2358,7 +2404,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
dir_bh = ext4_bread(handle, old_inode, 0, 0, &retval);
if (!dir_bh)
goto end_rename;
- if (le32_to_cpu(PARENT_INO(dir_bh->b_data)) != old_dir->i_ino)
+ if (le32_to_cpu(PARENT_INO(dir_bh->b_data,
+ old_dir->i_sb->s_blocksize)) != old_dir->i_ino)
goto end_rename;
retval = -EMLINK;
if (!new_inode && new_dir != old_dir &&
@@ -2430,7 +2477,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
if (dir_bh) {
BUFFER_TRACE(dir_bh, "get_write_access");
ext4_journal_get_write_access(handle, dir_bh);
- PARENT_INO(dir_bh->b_data) = cpu_to_le32(new_dir->i_ino);
+ PARENT_INO(dir_bh->b_data, new_dir->i_sb->s_blocksize) =
+ cpu_to_le32(new_dir->i_ino);
BUFFER_TRACE(dir_bh, "call ext4_handle_dirty_metadata");
ext4_handle_dirty_metadata(handle, old_dir, dir_bh);
ext4_dec_count(handle, old_dir);
@@ -2449,6 +2497,8 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry,
ext4_mark_inode_dirty(handle, new_inode);
if (!new_inode->i_nlink)
ext4_orphan_add(handle, new_inode);
+ if (!test_opt(new_dir->i_sb, NO_AUTO_DA_ALLOC))
+ force_da_alloc = 1;
}
retval = 0;
@@ -2457,6 +2507,8 @@ end_rename:
brelse(old_bh);
brelse(new_bh);
ext4_journal_stop(handle);
+ if (retval == 0 && force_da_alloc)
+ ext4_alloc_da_blocks(old_inode);
return retval;
}
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index c06886abd65..546c7dd869e 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -938,10 +938,10 @@ int ext4_group_add(struct super_block *sb, struct ext4_new_group_data *input)
if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_FLEX_BG)) {
ext4_group_t flex_group;
flex_group = ext4_flex_group(sbi, input->group);
- sbi->s_flex_groups[flex_group].free_blocks +=
- input->free_blocks_count;
- sbi->s_flex_groups[flex_group].free_inodes +=
- EXT4_INODES_PER_GROUP(sb);
+ atomic_add(input->free_blocks_count,
+ &sbi->s_flex_groups[flex_group].free_blocks);
+ atomic_add(EXT4_INODES_PER_GROUP(sb),
+ &sbi->s_flex_groups[flex_group].free_inodes);
}
ext4_handle_dirty_metadata(handle, NULL, sbi->s_sbh);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index f7371a6a923..9987bba99db 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -35,6 +35,7 @@
#include <linux/quotaops.h>
#include <linux/seq_file.h>
#include <linux/proc_fs.h>
+#include <linux/ctype.h>
#include <linux/marker.h>
#include <linux/log2.h>
#include <linux/crc16.h>
@@ -48,6 +49,7 @@
#include "group.h"
struct proc_dir_entry *ext4_proc_root;
+static struct kset *ext4_kset;
static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
unsigned long journal_devnum);
@@ -577,9 +579,9 @@ static void ext4_put_super(struct super_block *sb)
ext4_commit_super(sb, es, 1);
}
if (sbi->s_proc) {
- remove_proc_entry("inode_readahead_blks", sbi->s_proc);
remove_proc_entry(sb->s_id, ext4_proc_root);
}
+ kobject_del(&sbi->s_kobj);
for (i = 0; i < sbi->s_gdb_count; i++)
brelse(sbi->s_group_desc[i]);
@@ -615,6 +617,17 @@ static void ext4_put_super(struct super_block *sb)
ext4_blkdev_remove(sbi);
}
sb->s_fs_info = NULL;
+ /*
+ * Now that we are completely done shutting down the
+ * superblock, we need to actually destroy the kobject.
+ */
+ unlock_kernel();
+ unlock_super(sb);
+ kobject_put(&sbi->s_kobj);
+ wait_for_completion(&sbi->s_kobj_unregister);
+ lock_super(sb);
+ lock_kernel();
+ kfree(sbi->s_blockgroup_lock);
kfree(sbi);
return;
}
@@ -803,8 +816,6 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
seq_puts(seq, ",noacl");
#endif
- if (!test_opt(sb, RESERVATION))
- seq_puts(seq, ",noreservation");
if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
seq_printf(seq, ",commit=%u",
(unsigned) (sbi->s_commit_interval / HZ));
@@ -855,6 +866,9 @@ static int ext4_show_options(struct seq_file *seq, struct vfsmount *vfs)
if (test_opt(sb, DATA_ERR_ABORT))
seq_puts(seq, ",data_err=abort");
+ if (test_opt(sb, NO_AUTO_DA_ALLOC))
+ seq_puts(seq, ",noauto_da_alloc");
+
ext4_show_quota_options(seq, sb);
return 0;
}
@@ -1004,7 +1018,7 @@ enum {
Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
- Opt_reservation, Opt_noreservation, Opt_noload, Opt_nobh, Opt_bh,
+ Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
Opt_journal_update, Opt_journal_dev,
Opt_journal_checksum, Opt_journal_async_commit,
@@ -1012,8 +1026,8 @@ enum {
Opt_data_err_abort, Opt_data_err_ignore,
Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_quota, Opt_noquota,
- Opt_ignore, Opt_barrier, Opt_err, Opt_resize, Opt_usrquota,
- Opt_grpquota, Opt_i_version,
+ Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err, Opt_resize,
+ Opt_usrquota, Opt_grpquota, Opt_i_version,
Opt_stripe, Opt_delalloc, Opt_nodelalloc,
Opt_inode_readahead_blks, Opt_journal_ioprio
};
@@ -1039,8 +1053,6 @@ static const match_table_t tokens = {
{Opt_nouser_xattr, "nouser_xattr"},
{Opt_acl, "acl"},
{Opt_noacl, "noacl"},
- {Opt_reservation, "reservation"},
- {Opt_noreservation, "noreservation"},
{Opt_noload, "noload"},
{Opt_nobh, "nobh"},
{Opt_bh, "bh"},
@@ -1068,6 +1080,8 @@ static const match_table_t tokens = {
{Opt_quota, "quota"},
{Opt_usrquota, "usrquota"},
{Opt_barrier, "barrier=%u"},
+ {Opt_barrier, "barrier"},
+ {Opt_nobarrier, "nobarrier"},
{Opt_i_version, "i_version"},
{Opt_stripe, "stripe=%u"},
{Opt_resize, "resize"},
@@ -1075,6 +1089,9 @@ static const match_table_t tokens = {
{Opt_nodelalloc, "nodelalloc"},
{Opt_inode_readahead_blks, "inode_readahead_blks=%u"},
{Opt_journal_ioprio, "journal_ioprio=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc=%u"},
+ {Opt_auto_da_alloc, "auto_da_alloc"},
+ {Opt_noauto_da_alloc, "noauto_da_alloc"},
{Opt_err, NULL},
};
@@ -1207,12 +1224,6 @@ static int parse_options(char *options, struct super_block *sb,
"not supported\n");
break;
#endif
- case Opt_reservation:
- set_opt(sbi->s_mount_opt, RESERVATION);
- break;
- case Opt_noreservation:
- clear_opt(sbi->s_mount_opt, RESERVATION);
- break;
case Opt_journal_update:
/* @@@ FIXME */
/* Eventually we will want to be able to create
@@ -1415,9 +1426,14 @@ set_qf_format:
case Opt_abort:
set_opt(sbi->s_mount_opt, ABORT);
break;
+ case Opt_nobarrier:
+ clear_opt(sbi->s_mount_opt, BARRIER);
+ break;
case Opt_barrier:
- if (match_int(&args[0], &option))
- return 0;
+ if (match_int(&args[0], &option)) {
+ set_opt(sbi->s_mount_opt, BARRIER);
+ break;
+ }
if (option)
set_opt(sbi->s_mount_opt, BARRIER);
else
@@ -1463,6 +1479,11 @@ set_qf_format:
return 0;
if (option < 0 || option > (1 << 30))
return 0;
+ if (option & (option - 1)) {
+ printk(KERN_ERR "EXT4-fs: inode_readahead_blks"
+ " must be a power of 2\n");
+ return 0;
+ }
sbi->s_inode_readahead_blks = option;
break;
case Opt_journal_ioprio:
@@ -1473,6 +1494,19 @@ set_qf_format:
*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
option);
break;
+ case Opt_noauto_da_alloc:
+ set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+ break;
+ case Opt_auto_da_alloc:
+ if (match_int(&args[0], &option)) {
+ clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+ break;
+ }
+ if (option)
+ clear_opt(sbi->s_mount_opt, NO_AUTO_DA_ALLOC);
+ else
+ set_opt(sbi->s_mount_opt,NO_AUTO_DA_ALLOC);
+ break;
default:
printk(KERN_ERR
"EXT4-fs: Unrecognized mount option \"%s\" "
@@ -1612,10 +1646,12 @@ static int ext4_fill_flex_info(struct super_block *sb)
gdp = ext4_get_group_desc(sb, i, &bh);
flex_group = ext4_flex_group(sbi, i);
- sbi->s_flex_groups[flex_group].free_inodes +=
- ext4_free_inodes_count(sb, gdp);
- sbi->s_flex_groups[flex_group].free_blocks +=
- ext4_free_blks_count(sb, gdp);
+ atomic_set(&sbi->s_flex_groups[flex_group].free_inodes,
+ ext4_free_inodes_count(sb, gdp));
+ atomic_set(&sbi->s_flex_groups[flex_group].free_blocks,
+ ext4_free_blks_count(sb, gdp));
+ atomic_set(&sbi->s_flex_groups[flex_group].used_dirs,
+ ext4_used_dirs_count(sb, gdp));
}
return 1;
@@ -1991,6 +2027,181 @@ static unsigned long ext4_get_stripe_size(struct ext4_sb_info *sbi)
return 0;
}
+/* sysfs supprt */
+
+struct ext4_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct ext4_attr *, struct ext4_sb_info *, char *);
+ ssize_t (*store)(struct ext4_attr *, struct ext4_sb_info *,
+ const char *, size_t);
+ int offset;
+};
+
+static int parse_strtoul(const char *buf,
+ unsigned long max, unsigned long *value)
+{
+ char *endp;
+
+ while (*buf && isspace(*buf))
+ buf++;
+ *value = simple_strtoul(buf, &endp, 0);
+ while (*endp && isspace(*endp))
+ endp++;
+ if (*endp || *value > max)
+ return -EINVAL;
+
+ return 0;
+}
+
+static ssize_t delayed_allocation_blocks_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ char *buf)
+{
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ (s64) percpu_counter_sum(&sbi->s_dirtyblocks_counter));
+}
+
+static ssize_t session_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ return snprintf(buf, PAGE_SIZE, "%lu\n",
+ (part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ sbi->s_sectors_written_start) >> 1);
+}
+
+static ssize_t lifetime_write_kbytes_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ struct super_block *sb = sbi->s_buddy_cache->i_sb;
+
+ return snprintf(buf, PAGE_SIZE, "%llu\n",
+ sbi->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1));
+}
+
+static ssize_t inode_readahead_blks_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned long t;
+
+ if (parse_strtoul(buf, 0x40000000, &t))
+ return -EINVAL;
+
+ /* inode_readahead_blks must be a power of 2 */
+ if (t & (t-1))
+ return -EINVAL;
+
+ sbi->s_inode_readahead_blks = t;
+ return count;
+}
+
+static ssize_t sbi_ui_show(struct ext4_attr *a,
+ struct ext4_sb_info *sbi, char *buf)
+{
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+
+ return snprintf(buf, PAGE_SIZE, "%u\n", *ui);
+}
+
+static ssize_t sbi_ui_store(struct ext4_attr *a,
+ struct ext4_sb_info *sbi,
+ const char *buf, size_t count)
+{
+ unsigned int *ui = (unsigned int *) (((char *) sbi) + a->offset);
+ unsigned long t;
+
+ if (parse_strtoul(buf, 0xffffffff, &t))
+ return -EINVAL;
+ *ui = t;
+ return count;
+}
+
+#define EXT4_ATTR_OFFSET(_name,_mode,_show,_store,_elname) \
+static struct ext4_attr ext4_attr_##_name = { \
+ .attr = {.name = __stringify(_name), .mode = _mode }, \
+ .show = _show, \
+ .store = _store, \
+ .offset = offsetof(struct ext4_sb_info, _elname), \
+}
+#define EXT4_ATTR(name, mode, show, store) \
+static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store)
+
+#define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL)
+#define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store)
+#define EXT4_RW_ATTR_SBI_UI(name, elname) \
+ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname)
+#define ATTR_LIST(name) &ext4_attr_##name.attr
+
+EXT4_RO_ATTR(delayed_allocation_blocks);
+EXT4_RO_ATTR(session_write_kbytes);
+EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
+ inode_readahead_blks_store, s_inode_readahead_blks);
+EXT4_RW_ATTR_SBI_UI(mb_stats, s_mb_stats);
+EXT4_RW_ATTR_SBI_UI(mb_max_to_scan, s_mb_max_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_min_to_scan, s_mb_min_to_scan);
+EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
+EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
+EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
+
+static struct attribute *ext4_attrs[] = {
+ ATTR_LIST(delayed_allocation_blocks),
+ ATTR_LIST(session_write_kbytes),
+ ATTR_LIST(lifetime_write_kbytes),
+ ATTR_LIST(inode_readahead_blks),
+ ATTR_LIST(mb_stats),
+ ATTR_LIST(mb_max_to_scan),
+ ATTR_LIST(mb_min_to_scan),
+ ATTR_LIST(mb_order2_req),
+ ATTR_LIST(mb_stream_req),
+ ATTR_LIST(mb_group_prealloc),
+ NULL,
+};
+
+static ssize_t ext4_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+ return a->show ? a->show(a, sbi, buf) : 0;
+}
+
+static ssize_t ext4_attr_store(struct kobject *kobj,
+ struct attribute *attr,
+ const char *buf, size_t len)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ struct ext4_attr *a = container_of(attr, struct ext4_attr, attr);
+
+ return a->store ? a->store(a, sbi, buf, len) : 0;
+}
+
+static void ext4_sb_release(struct kobject *kobj)
+{
+ struct ext4_sb_info *sbi = container_of(kobj, struct ext4_sb_info,
+ s_kobj);
+ complete(&sbi->s_kobj_unregister);
+}
+
+
+static struct sysfs_ops ext4_attr_ops = {
+ .show = ext4_attr_show,
+ .store = ext4_attr_store,
+};
+
+static struct kobj_type ext4_ktype = {
+ .default_attrs = ext4_attrs,
+ .sysfs_ops = &ext4_attr_ops,
+ .release = ext4_sb_release,
+};
+
static int ext4_fill_super(struct super_block *sb, void *data, int silent)
__releases(kernel_lock)
__acquires(kernel_lock)
@@ -2021,12 +2232,21 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi = kzalloc(sizeof(*sbi), GFP_KERNEL);
if (!sbi)
return -ENOMEM;
+
+ sbi->s_blockgroup_lock =
+ kzalloc(sizeof(struct blockgroup_lock), GFP_KERNEL);
+ if (!sbi->s_blockgroup_lock) {
+ kfree(sbi);
+ return -ENOMEM;
+ }
sb->s_fs_info = sbi;
sbi->s_mount_opt = 0;
sbi->s_resuid = EXT4_DEF_RESUID;
sbi->s_resgid = EXT4_DEF_RESGID;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sb_block = sb_block;
+ sbi->s_sectors_written_start = part_stat_read(sb->s_bdev->bd_part,
+ sectors[1]);
unlock_kernel();
@@ -2064,6 +2284,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sb->s_magic = le16_to_cpu(es->s_magic);
if (sb->s_magic != EXT4_SUPER_MAGIC)
goto cantfind_ext4;
+ sbi->s_kbytes_written = le64_to_cpu(es->s_kbytes_written);
/* Set defaults before we parse the mount options */
def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
@@ -2101,7 +2322,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME;
sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME;
- set_opt(sbi->s_mount_opt, RESERVATION);
set_opt(sbi->s_mount_opt, BARRIER);
/*
@@ -2325,14 +2545,9 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
#ifdef CONFIG_PROC_FS
if (ext4_proc_root)
sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
-
- if (sbi->s_proc)
- proc_create_data("inode_readahead_blks", 0644, sbi->s_proc,
- &ext4_ui_proc_fops,
- &sbi->s_inode_readahead_blks);
#endif
- bgl_lock_init(&sbi->s_blockgroup_lock);
+ bgl_lock_init(sbi->s_blockgroup_lock);
for (i = 0; i < db_count; i++) {
block = descriptor_loc(sb, logical_sb_block, i);
@@ -2564,6 +2779,16 @@ no_journal:
goto failed_mount4;
}
+ sbi->s_kobj.kset = ext4_kset;
+ init_completion(&sbi->s_kobj_unregister);
+ err = kobject_init_and_add(&sbi->s_kobj, &ext4_ktype, NULL,
+ "%s", sb->s_id);
+ if (err) {
+ ext4_mb_release(sb);
+ ext4_ext_release(sb);
+ goto failed_mount4;
+ };
+
/*
* akpm: core read_super() calls in here with the superblock locked.
* That deadlocks, because orphan cleanup needs to lock the superblock
@@ -2618,7 +2843,6 @@ failed_mount2:
kfree(sbi->s_group_desc);
failed_mount:
if (sbi->s_proc) {
- remove_proc_entry("inode_readahead_blks", sbi->s_proc);
remove_proc_entry(sb->s_id, ext4_proc_root);
}
#ifdef CONFIG_QUOTA
@@ -2913,6 +3137,10 @@ static int ext4_commit_super(struct super_block *sb,
set_buffer_uptodate(sbh);
}
es->s_wtime = cpu_to_le32(get_seconds());
+ es->s_kbytes_written =
+ cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
+ ((part_stat_read(sb->s_bdev->bd_part, sectors[1]) -
+ EXT4_SB(sb)->s_sectors_written_start) >> 1));
ext4_free_blocks_count_set(es, percpu_counter_sum_positive(
&EXT4_SB(sb)->s_freeblocks_counter));
es->s_free_inodes_count = cpu_to_le32(percpu_counter_sum_positive(
@@ -3647,45 +3875,6 @@ static int ext4_get_sb(struct file_system_type *fs_type,
return get_sb_bdev(fs_type, flags, dev_name, data, ext4_fill_super, mnt);
}
-#ifdef CONFIG_PROC_FS
-static int ext4_ui_proc_show(struct seq_file *m, void *v)
-{
- unsigned int *p = m->private;
-
- seq_printf(m, "%u\n", *p);
- return 0;
-}
-
-static int ext4_ui_proc_open(struct inode *inode, struct file *file)
-{
- return single_open(file, ext4_ui_proc_show, PDE(inode)->data);
-}
-
-static ssize_t ext4_ui_proc_write(struct file *file, const char __user *buf,
- size_t cnt, loff_t *ppos)
-{
- unsigned long *p = PDE(file->f_path.dentry->d_inode)->data;
- char str[32];
-
- if (cnt >= sizeof(str))
- return -EINVAL;
- if (copy_from_user(str, buf, cnt))
- return -EFAULT;
-
- *p = simple_strtoul(str, NULL, 0);
- return cnt;
-}
-
-const struct file_operations ext4_ui_proc_fops = {
- .owner = THIS_MODULE,
- .open = ext4_ui_proc_open,
- .read = seq_read,
- .llseek = seq_lseek,
- .release = single_release,
- .write = ext4_ui_proc_write,
-};
-#endif
-
static struct file_system_type ext4_fs_type = {
.owner = THIS_MODULE,
.name = "ext4",
@@ -3719,6 +3908,9 @@ static int __init init_ext4_fs(void)
{
int err;
+ ext4_kset = kset_create_and_add("ext4", NULL, fs_kobj);
+ if (!ext4_kset)
+ return -ENOMEM;
ext4_proc_root = proc_mkdir("fs/ext4", NULL);
err = init_ext4_mballoc();
if (err)
@@ -3760,6 +3952,7 @@ static void __exit exit_ext4_fs(void)
exit_ext4_xattr();
exit_ext4_mballoc();
remove_proc_entry("fs/ext4", NULL);
+ kset_unregister(ext4_kset);
}
MODULE_AUTHOR("Remy Card, Stephen Tweedie, Andrew Morton, Andreas Dilger, Theodore Ts'o and others");
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 62804e57a44..4ea72377c7a 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -367,6 +367,7 @@ void jbd2_journal_commit_transaction(journal_t *journal)
int tag_bytes = journal_tag_bytes(journal);
struct buffer_head *cbh = NULL; /* For transactional checksums */
__u32 crc32_sum = ~0;
+ int write_op = WRITE;
/*
* First job: lock down the current transaction and wait for
@@ -401,6 +402,8 @@ void jbd2_journal_commit_transaction(journal_t *journal)
spin_lock(&journal->j_state_lock);
commit_transaction->t_state = T_LOCKED;
+ if (commit_transaction->t_synchronous_commit)
+ write_op = WRITE_SYNC;
stats.u.run.rs_wait = commit_transaction->t_max_wait;
stats.u.run.rs_locked = jiffies;
stats.u.run.rs_running = jbd2_time_diff(commit_transaction->t_start,
@@ -680,7 +683,7 @@ start_journal_io:
clear_buffer_dirty(bh);
set_buffer_uptodate(bh);
bh->b_end_io = journal_end_buffer_io_sync;
- submit_bh(WRITE, bh);
+ submit_bh(write_op, bh);
}
cond_resched();
stats.u.run.rs_blocks_logged += bufs;
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 257ff262576..bbe6d592d8b 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -55,6 +55,25 @@
* need do nothing.
* RevokeValid set, Revoked set:
* buffer has been revoked.
+ *
+ * Locking rules:
+ * We keep two hash tables of revoke records. One hashtable belongs to the
+ * running transaction (is pointed to by journal->j_revoke), the other one
+ * belongs to the committing transaction. Accesses to the second hash table
+ * happen only from the kjournald and no other thread touches this table. Also
+ * journal_switch_revoke_table() which switches which hashtable belongs to the
+ * running and which to the committing transaction is called only from
+ * kjournald. Therefore we need no locks when accessing the hashtable belonging
+ * to the committing transaction.
+ *
+ * All users operating on the hash table belonging to the running transaction
+ * have a handle to the transaction. Therefore they are safe from kjournald
+ * switching hash tables under them. For operations on the lists of entries in
+ * the hash table j_revoke_lock is used.
+ *
+ * Finally, also replay code uses the hash tables but at this moment noone else
+ * can touch them (filesystem isn't mounted yet) and hence no locking is
+ * needed.
*/
#ifndef __KERNEL__
@@ -401,8 +420,6 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr,
* the second time we would still have a pending revoke to cancel. So,
* do not trust the Revoked bit on buffers unless RevokeValid is also
* set.
- *
- * The caller must have the journal locked.
*/
int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh)
{
@@ -480,10 +497,7 @@ void jbd2_journal_switch_revoke_table(journal_t *journal)
/*
* Write revoke records to the journal for all entries in the current
* revoke hash, deleting the entries as we go.
- *
- * Called with the journal lock held.
*/
-
void jbd2_journal_write_revoke_records(journal_t *journal,
transaction_t *transaction)
{
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index 28ce21d8598..996ffda06bf 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1315,6 +1315,8 @@ int jbd2_journal_stop(handle_t *handle)
}
}
+ if (handle->h_sync)
+ transaction->t_synchronous_commit = 1;
current->journal_info = NULL;
spin_lock(&journal->j_state_lock);
spin_lock(&transaction->t_handle_lock);
diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c
index aedc47a264c..1f3b0fc0d35 100644
--- a/fs/lockd/clntlock.c
+++ b/fs/lockd/clntlock.c
@@ -139,55 +139,6 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout)
return 0;
}
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap,
- struct in6_addr *addr_mapped)
-{
- const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
-
- switch (sap->sa_family) {
- case AF_INET6:
- return &((const struct sockaddr_in6 *)sap)->sin6_addr;
- case AF_INET:
- ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped);
- return addr_mapped;
- }
-
- return NULL;
-}
-
-/*
- * If lockd is using a PF_INET6 listener, all incoming requests appear
- * to come from AF_INET6 remotes. The address of AF_INET remotes are
- * mapped to AF_INET6 automatically by the network layer. In case the
- * user passed an AF_INET server address at mount time, ensure both
- * addresses are AF_INET6 before comparing them.
- */
-static int nlmclnt_cmp_addr(const struct nlm_host *host,
- const struct sockaddr *sap)
-{
- const struct in6_addr *addr1;
- const struct in6_addr *addr2;
- struct in6_addr addr1_mapped;
- struct in6_addr addr2_mapped;
-
- addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped);
- if (likely(addr1 != NULL)) {
- addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped);
- if (likely(addr2 != NULL))
- return ipv6_addr_equal(addr1, addr2);
- }
-
- return 0;
-}
-#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
-static int nlmclnt_cmp_addr(const struct nlm_host *host,
- const struct sockaddr *sap)
-{
- return nlm_cmp_addr(nlm_addr(host), sap);
-}
-#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */
-
/*
* The server lockd has called us back to tell us the lock was granted
*/
@@ -215,7 +166,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock)
*/
if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid)
continue;
- if (!nlmclnt_cmp_addr(block->b_host, addr))
+ if (!nlm_cmp_addr(nlm_addr(block->b_host), addr))
continue;
if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0)
continue;
diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c
index 5e2c4d5ac82..6d5d4a4169e 100644
--- a/fs/lockd/mon.c
+++ b/fs/lockd/mon.c
@@ -16,6 +16,8 @@
#include <linux/sunrpc/svc.h>
#include <linux/lockd/lockd.h>
+#include <asm/unaligned.h>
+
#define NLMDBG_FACILITY NLMDBG_MONITOR
#define NSM_PROGRAM 100024
#define NSM_VERSION 1
@@ -274,10 +276,12 @@ static void nsm_init_private(struct nsm_handle *nsm)
{
u64 *p = (u64 *)&nsm->sm_priv.data;
struct timespec ts;
+ s64 ns;
ktime_get_ts(&ts);
- *p++ = timespec_to_ns(&ts);
- *p = (unsigned long)nsm;
+ ns = timespec_to_ns(&ts);
+ put_unaligned(ns, p);
+ put_unaligned((unsigned long)nsm, p + 1);
}
static struct nsm_handle *nsm_create_handle(const struct sockaddr *sap,
diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c
index 64f1c31b585..abf83881f68 100644
--- a/fs/lockd/svc.c
+++ b/fs/lockd/svc.c
@@ -53,17 +53,6 @@ static struct svc_rqst *nlmsvc_rqst;
unsigned long nlmsvc_timeout;
/*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if (defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)) && \
- defined(CONFIG_SUNRPC_REGISTER_V4)
-static const sa_family_t nlmsvc_family = AF_INET6;
-#else /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-static const sa_family_t nlmsvc_family = AF_INET;
-#endif /* (CONFIG_IPV6 || CONFIG_IPV6_MODULE) && CONFIG_SUNRPC_REGISTER_V4 */
-
-/*
* These can be set at insmod time (useful for NFS as root filesystem),
* and also changed through the sysctl interface. -- Jamie Lokier, Aug 2003
*/
@@ -204,19 +193,30 @@ lockd(void *vrqstp)
return 0;
}
-static int create_lockd_listener(struct svc_serv *serv, char *name,
- unsigned short port)
+static int create_lockd_listener(struct svc_serv *serv, const char *name,
+ const int family, const unsigned short port)
{
struct svc_xprt *xprt;
- xprt = svc_find_xprt(serv, name, 0, 0);
+ xprt = svc_find_xprt(serv, name, family, 0);
if (xprt == NULL)
- return svc_create_xprt(serv, name, port, SVC_SOCK_DEFAULTS);
-
+ return svc_create_xprt(serv, name, family, port,
+ SVC_SOCK_DEFAULTS);
svc_xprt_put(xprt);
return 0;
}
+static int create_lockd_family(struct svc_serv *serv, const int family)
+{
+ int err;
+
+ err = create_lockd_listener(serv, "udp", family, nlm_udpport);
+ if (err < 0)
+ return err;
+
+ return create_lockd_listener(serv, "tcp", family, nlm_tcpport);
+}
+
/*
* Ensure there are active UDP and TCP listeners for lockd.
*
@@ -232,13 +232,15 @@ static int make_socks(struct svc_serv *serv)
static int warned;
int err;
- err = create_lockd_listener(serv, "udp", nlm_udpport);
+ err = create_lockd_family(serv, PF_INET);
if (err < 0)
goto out_err;
- err = create_lockd_listener(serv, "tcp", nlm_tcpport);
- if (err < 0)
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ err = create_lockd_family(serv, PF_INET6);
+ if (err < 0 && err != -EAFNOSUPPORT)
goto out_err;
+#endif /* CONFIG_IPV6 || CONFIG_IPV6_MODULE */
warned = 0;
return 0;
@@ -274,7 +276,7 @@ int lockd_up(void)
"lockd_up: no pid, %d users??\n", nlmsvc_users);
error = -ENOMEM;
- serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, nlmsvc_family, NULL);
+ serv = svc_create(&nlmsvc_program, LOCKD_BUFSIZE, NULL);
if (!serv) {
printk(KERN_WARNING "lockd_up: create service failed\n");
goto out;
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 3e634f2a108..a886e692ddd 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -38,19 +38,10 @@ static struct svc_program nfs4_callback_program;
unsigned int nfs_callback_set_tcpport;
unsigned short nfs_callback_tcpport;
+unsigned short nfs_callback_tcpport6;
static const int nfs_set_port_min = 0;
static const int nfs_set_port_max = 65535;
-/*
- * If the kernel has IPv6 support available, always listen for
- * both AF_INET and AF_INET6 requests.
- */
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const sa_family_t nfs_callback_family = AF_INET6;
-#else
-static const sa_family_t nfs_callback_family = AF_INET;
-#endif
-
static int param_set_port(const char *val, struct kernel_param *kp)
{
char *endp;
@@ -116,19 +107,29 @@ int nfs_callback_up(void)
mutex_lock(&nfs_callback_mutex);
if (nfs_callback_info.users++ || nfs_callback_info.task != NULL)
goto out;
- serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE,
- nfs_callback_family, NULL);
+ serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, NULL);
ret = -ENOMEM;
if (!serv)
goto out_err;
- ret = svc_create_xprt(serv, "tcp", nfs_callback_set_tcpport,
- SVC_SOCK_ANONYMOUS);
+ ret = svc_create_xprt(serv, "tcp", PF_INET,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
if (ret <= 0)
goto out_err;
nfs_callback_tcpport = ret;
dprintk("NFS: Callback listener port = %u (af %u)\n",
- nfs_callback_tcpport, nfs_callback_family);
+ nfs_callback_tcpport, PF_INET);
+
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ ret = svc_create_xprt(serv, "tcp", PF_INET6,
+ nfs_callback_set_tcpport, SVC_SOCK_ANONYMOUS);
+ if (ret > 0) {
+ nfs_callback_tcpport6 = ret;
+ dprintk("NFS: Callback listener port = %u (af %u)\n",
+ nfs_callback_tcpport6, PF_INET6);
+ } else if (ret != -EAFNOSUPPORT)
+ goto out_err;
+#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
nfs_callback_info.rqst = svc_prepare_thread(serv, &serv->sv_pools[0]);
if (IS_ERR(nfs_callback_info.rqst)) {
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index bb25d2135ff..e110e286a26 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -72,5 +72,6 @@ extern void nfs_callback_down(void);
extern unsigned int nfs_callback_set_tcpport;
extern unsigned short nfs_callback_tcpport;
+extern unsigned short nfs_callback_tcpport6;
#endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 2277421656e..aba38017bde 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -224,38 +224,6 @@ void nfs_put_client(struct nfs_client *clp)
}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-static const struct in6_addr *nfs_map_ipv4_addr(const struct sockaddr *sa, struct in6_addr *addr_mapped)
-{
- switch (sa->sa_family) {
- default:
- return NULL;
- case AF_INET6:
- return &((const struct sockaddr_in6 *)sa)->sin6_addr;
- break;
- case AF_INET:
- ipv6_addr_set_v4mapped(((const struct sockaddr_in *)sa)->sin_addr.s_addr,
- addr_mapped);
- return addr_mapped;
- }
-}
-
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct in6_addr *addr1;
- const struct in6_addr *addr2;
- struct in6_addr addr1_mapped;
- struct in6_addr addr2_mapped;
-
- addr1 = nfs_map_ipv4_addr(sa1, &addr1_mapped);
- if (likely(addr1 != NULL)) {
- addr2 = nfs_map_ipv4_addr(sa2, &addr2_mapped);
- if (likely(addr2 != NULL))
- return ipv6_addr_equal(addr1, addr2);
- }
- return 0;
-}
-
/*
* Test if two ip6 socket addresses refer to the same socket by
* comparing relevant fields. The padding bytes specifically, are not
@@ -267,38 +235,21 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
*
* The caller should ensure both socket addresses are AF_INET6.
*/
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
{
- const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2;
+ const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+ const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
- if (!ipv6_addr_equal(&saddr1->sin6_addr,
- &saddr1->sin6_addr))
+ if (ipv6_addr_scope(&sin1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
+ sin1->sin6_scope_id != sin2->sin6_scope_id)
return 0;
- if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL &&
- saddr1->sin6_scope_id != saddr2->sin6_scope_id)
- return 0;
- return saddr1->sin6_port == saddr2->sin6_port;
-}
-#else
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1,
- const struct sockaddr_in *sa2)
-{
- return sa1->sin_addr.s_addr == sa2->sin_addr.s_addr;
-}
-static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (unlikely(sa1->sa_family != AF_INET || sa2->sa_family != AF_INET))
- return 0;
- return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1,
- (const struct sockaddr_in *)sa2);
+ return ipv6_addr_equal(&sin1->sin6_addr, &sin1->sin6_addr);
}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
- const struct sockaddr * sa2)
+#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
+static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
{
return 0;
}
@@ -311,20 +262,57 @@ static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1,
*
* The caller should ensure both socket addresses are AF_INET.
*/
+static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
+{
+ const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+ const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
+
+ return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
+}
+
+static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
+{
+ const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
+ const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
+
+ return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
+ (sin1->sin6_port == sin2->sin6_port);
+}
+
static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
const struct sockaddr *sa2)
{
- const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2;
+ const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
+ const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
- if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr)
+ return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
+ (sin1->sin_port == sin2->sin_port);
+}
+
+/*
+ * Test if two socket addresses represent the same actual socket,
+ * by comparing (only) relevant fields, excluding the port number.
+ */
+static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
+ const struct sockaddr *sa2)
+{
+ if (sa1->sa_family != sa2->sa_family)
return 0;
- return saddr1->sin_port == saddr2->sin_port;
+
+ switch (sa1->sa_family) {
+ case AF_INET:
+ return nfs_sockaddr_match_ipaddr4(sa1, sa2);
+ case AF_INET6:
+ return nfs_sockaddr_match_ipaddr6(sa1, sa2);
+ }
+ return 0;
}
/*
* Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields.
+ * by comparing (only) relevant fields, including the port number.
*/
static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
const struct sockaddr *sa2)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 78bf72fc1db..370b190a09d 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1624,8 +1624,7 @@ static int nfs_rename(struct inode *old_dir, struct dentry *old_dentry,
} else if (atomic_read(&new_dentry->d_count) > 1)
/* dentry still busy? */
goto out;
- } else
- nfs_drop_nlink(new_inode);
+ }
go_ahead:
/*
@@ -1638,10 +1637,8 @@ go_ahead:
}
nfs_inode_return_delegation(old_inode);
- if (new_inode != NULL) {
+ if (new_inode != NULL)
nfs_inode_return_delegation(new_inode);
- d_delete(new_dentry);
- }
error = NFS_PROTO(old_dir)->rename(old_dir, &old_dentry->d_name,
new_dir, &new_dentry->d_name);
@@ -1650,6 +1647,8 @@ out:
if (rehash)
d_rehash(rehash);
if (!error) {
+ if (new_inode != NULL)
+ nfs_drop_nlink(new_inode);
d_move(old_dentry, new_dentry);
nfs_set_verifier(new_dentry,
nfs_save_change_attribute(new_dir));
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cec79392e4b..0abf3f331f5 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -64,11 +64,7 @@ const struct file_operations nfs_file_operations = {
.write = do_sync_write,
.aio_read = nfs_file_read,
.aio_write = nfs_file_write,
-#ifdef CONFIG_MMU
.mmap = nfs_file_mmap,
-#else
- .mmap = generic_file_mmap,
-#endif
.open = nfs_file_open,
.flush = nfs_file_flush,
.release = nfs_file_release,
@@ -141,9 +137,6 @@ nfs_file_release(struct inode *inode, struct file *filp)
dentry->d_parent->d_name.name,
dentry->d_name.name);
- /* Ensure that dirty pages are flushed out with the right creds */
- if (filp->f_mode & FMODE_WRITE)
- nfs_wb_all(dentry->d_inode);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
return nfs_release(inode, filp);
}
@@ -235,7 +228,6 @@ nfs_file_flush(struct file *file, fl_owner_t id)
struct nfs_open_context *ctx = nfs_file_open_context(file);
struct dentry *dentry = file->f_path.dentry;
struct inode *inode = dentry->d_inode;
- int status;
dprintk("NFS: flush(%s/%s)\n",
dentry->d_parent->d_name.name,
@@ -245,11 +237,8 @@ nfs_file_flush(struct file *file, fl_owner_t id)
return 0;
nfs_inc_stats(inode, NFSIOS_VFSFLUSH);
- /* Ensure that data+attribute caches are up to date after close() */
- status = nfs_do_fsync(ctx, inode);
- if (!status)
- nfs_revalidate_inode(NFS_SERVER(inode), inode);
- return status;
+ /* Flush writes to the server and return any errors */
+ return nfs_do_fsync(ctx, inode);
}
static ssize_t
@@ -304,11 +293,13 @@ nfs_file_mmap(struct file * file, struct vm_area_struct * vma)
dprintk("NFS: mmap(%s/%s)\n",
dentry->d_parent->d_name.name, dentry->d_name.name);
- status = nfs_revalidate_mapping(inode, file->f_mapping);
+ /* Note: generic_file_mmap() returns ENOSYS on nommu systems
+ * so we call that before revalidating the mapping
+ */
+ status = generic_file_mmap(file, vma);
if (!status) {
vma->vm_ops = &nfs_file_vm_ops;
- vma->vm_flags |= VM_CAN_NONLINEAR;
- file_accessed(file);
+ status = nfs_revalidate_mapping(inode, file->f_mapping);
}
return status;
}
@@ -354,6 +345,15 @@ static int nfs_write_begin(struct file *file, struct address_space *mapping,
file->f_path.dentry->d_name.name,
mapping->host->i_ino, len, (long long) pos);
+ /*
+ * Prevent starvation issues if someone is doing a consistency
+ * sync-to-disk
+ */
+ ret = wait_on_bit(&NFS_I(mapping->host)->flags, NFS_INO_FLUSHING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (ret)
+ return ret;
+
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
diff --git a/fs/nfs/getroot.c b/fs/nfs/getroot.c
index b7c9b2df1f2..46177cb8706 100644
--- a/fs/nfs/getroot.c
+++ b/fs/nfs/getroot.c
@@ -156,7 +156,7 @@ int nfs4_path_walk(struct nfs_server *server,
return ret;
}
- if (fattr.type != NFDIR) {
+ if (!S_ISDIR(fattr.mode)) {
printk(KERN_ERR "nfs4_get_root:"
" getroot encountered non-directory\n");
return -ENOTDIR;
@@ -213,7 +213,7 @@ eat_dot_dir:
return ret;
}
- if (fattr.type != NFDIR) {
+ if (!S_ISDIR(fattr.mode)) {
printk(KERN_ERR "nfs4_get_root:"
" lookupfh encountered non-directory\n");
return -ENOTDIR;
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0c381686171..a834d1d850b 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -66,6 +66,18 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
}
/**
+ * nfs_wait_bit_killable - helper for functions that are sleeping on bit locks
+ * @word: long word containing the bit lock
+ */
+int nfs_wait_bit_killable(void *word)
+{
+ if (fatal_signal_pending(current))
+ return -ERESTARTSYS;
+ schedule();
+ return 0;
+}
+
+/**
* nfs_compat_user_ino64 - returns the user-visible inode number
* @fileid: 64-bit fileid
*
@@ -249,13 +261,10 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
struct inode *inode = ERR_PTR(-ENOENT);
unsigned long hash;
- if ((fattr->valid & NFS_ATTR_FATTR) == 0)
+ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) == 0)
goto out_no_inode;
-
- if (!fattr->nlink) {
- printk("NFS: Buggy server - nlink == 0!\n");
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) == 0)
goto out_no_inode;
- }
hash = nfs_fattr_to_ino_t(fattr);
@@ -291,7 +300,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
&& fattr->size <= NFS_LIMIT_READDIRPLUS)
set_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags);
/* Deal with crossing mountpoints */
- if (!nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
+ if ((fattr->valid & NFS_ATTR_FATTR_FSID)
+ && !nfs_fsid_equal(&NFS_SB(sb)->fsid, &fattr->fsid)) {
if (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)
inode->i_op = &nfs_referral_inode_operations;
else
@@ -304,28 +314,45 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr)
else
init_special_inode(inode, inode->i_mode, fattr->rdev);
+ memset(&inode->i_atime, 0, sizeof(inode->i_atime));
+ memset(&inode->i_mtime, 0, sizeof(inode->i_mtime));
+ memset(&inode->i_ctime, 0, sizeof(inode->i_ctime));
+ nfsi->change_attr = 0;
+ inode->i_size = 0;
+ inode->i_nlink = 0;
+ inode->i_uid = -2;
+ inode->i_gid = -2;
+ inode->i_blocks = 0;
+ memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+
nfsi->read_cache_jiffies = fattr->time_start;
nfsi->attr_gencount = fattr->gencount;
- inode->i_atime = fattr->atime;
- inode->i_mtime = fattr->mtime;
- inode->i_ctime = fattr->ctime;
- if (fattr->valid & NFS_ATTR_FATTR_V4)
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ inode->i_atime = fattr->atime;
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ inode->i_mtime = fattr->mtime;
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ inode->i_ctime = fattr->ctime;
+ if (fattr->valid & NFS_ATTR_FATTR_CHANGE)
nfsi->change_attr = fattr->change_attr;
- inode->i_size = nfs_size_to_loff_t(fattr->size);
- inode->i_nlink = fattr->nlink;
- inode->i_uid = fattr->uid;
- inode->i_gid = fattr->gid;
- if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE)
+ inode->i_size = nfs_size_to_loff_t(fattr->size);
+ if (fattr->valid & NFS_ATTR_FATTR_NLINK)
+ inode->i_nlink = fattr->nlink;
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER)
+ inode->i_uid = fattr->uid;
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP)
+ inode->i_gid = fattr->gid;
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ inode->i_blocks = fattr->du.nfs2.blocks;
+ if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- } else {
- inode->i_blocks = fattr->du.nfs2.blocks;
}
nfsi->attrtimeo = NFS_MINATTRTIMEO(inode);
nfsi->attrtimeo_timestamp = now;
- memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
nfsi->access_cache = RB_ROOT;
unlock_new_inode(inode);
@@ -514,6 +541,32 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat)
return err;
}
+/**
+ * nfs_close_context - Common close_context() routine NFSv2/v3
+ * @ctx: pointer to context
+ * @is_sync: is this a synchronous close
+ *
+ * always ensure that the attributes are up to date if we're mounted
+ * with close-to-open semantics
+ */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+ struct inode *inode;
+ struct nfs_server *server;
+
+ if (!(ctx->mode & FMODE_WRITE))
+ return;
+ if (!is_sync)
+ return;
+ inode = ctx->path.dentry->d_inode;
+ if (!list_empty(&NFS_I(inode)->open_files))
+ return;
+ server = NFS_SERVER(inode);
+ if (server->flags & NFS_MOUNT_NOCTO)
+ return;
+ nfs_revalidate_inode(server, inode);
+}
+
static struct nfs_open_context *alloc_nfs_open_context(struct vfsmount *mnt, struct dentry *dentry, struct rpc_cred *cred)
{
struct nfs_open_context *ctx;
@@ -540,24 +593,15 @@ struct nfs_open_context *get_nfs_open_context(struct nfs_open_context *ctx)
return ctx;
}
-static void __put_nfs_open_context(struct nfs_open_context *ctx, int wait)
+static void __put_nfs_open_context(struct nfs_open_context *ctx, int is_sync)
{
- struct inode *inode;
-
- if (ctx == NULL)
- return;
+ struct inode *inode = ctx->path.dentry->d_inode;
- inode = ctx->path.dentry->d_inode;
if (!atomic_dec_and_lock(&ctx->count, &inode->i_lock))
return;
list_del(&ctx->list);
spin_unlock(&inode->i_lock);
- if (ctx->state != NULL) {
- if (wait)
- nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
- else
- nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
- }
+ NFS_PROTO(inode)->close_context(ctx, is_sync);
if (ctx->cred != NULL)
put_rpccred(ctx->cred);
path_put(&ctx->path);
@@ -670,9 +714,6 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
if (NFS_STALE(inode))
goto out;
- if (NFS_STALE(inode))
- goto out;
-
nfs_inc_stats(inode, NFSIOS_INODEREVALIDATE);
status = NFS_PROTO(inode)->getattr(server, NFS_FH(inode), &fattr);
if (status != 0) {
@@ -815,25 +856,31 @@ static void nfs_wcc_update_inode(struct inode *inode, struct nfs_fattr *fattr)
{
struct nfs_inode *nfsi = NFS_I(inode);
- if ((fattr->valid & NFS_ATTR_WCC_V4) != 0 &&
- nfsi->change_attr == fattr->pre_change_attr) {
+ if ((fattr->valid & NFS_ATTR_FATTR_PRECHANGE)
+ && (fattr->valid & NFS_ATTR_FATTR_CHANGE)
+ && nfsi->change_attr == fattr->pre_change_attr) {
nfsi->change_attr = fattr->change_attr;
if (S_ISDIR(inode->i_mode))
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
}
/* If we have atomic WCC data, we may update some attributes */
- if ((fattr->valid & NFS_ATTR_WCC) != 0) {
- if (timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
+ if ((fattr->valid & NFS_ATTR_FATTR_PRECTIME)
+ && (fattr->valid & NFS_ATTR_FATTR_CTIME)
+ && timespec_equal(&inode->i_ctime, &fattr->pre_ctime))
memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- if (timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
+
+ if ((fattr->valid & NFS_ATTR_FATTR_PREMTIME)
+ && (fattr->valid & NFS_ATTR_FATTR_MTIME)
+ && timespec_equal(&inode->i_mtime, &fattr->pre_mtime)) {
memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
if (S_ISDIR(inode->i_mode))
nfsi->cache_validity |= NFS_INO_INVALID_DATA;
- }
- if (i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size) &&
- nfsi->npages == 0)
- i_size_write(inode, nfs_size_to_loff_t(fattr->size));
}
+ if ((fattr->valid & NFS_ATTR_FATTR_PRESIZE)
+ && (fattr->valid & NFS_ATTR_FATTR_SIZE)
+ && i_size_read(inode) == nfs_size_to_loff_t(fattr->pre_size)
+ && nfsi->npages == 0)
+ i_size_write(inode, nfs_size_to_loff_t(fattr->size));
}
/**
@@ -853,35 +900,39 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
/* Has the inode gone and changed behind our back? */
- if (nfsi->fileid != fattr->fileid
- || (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT)) {
+ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
+ return -EIO;
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
return -EIO;
- }
- if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
nfsi->change_attr != fattr->change_attr)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
/* Verify a few of the more important attributes */
- if (!timespec_equal(&inode->i_mtime, &fattr->mtime))
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) && !timespec_equal(&inode->i_mtime, &fattr->mtime))
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
- cur_size = i_size_read(inode);
- new_isize = nfs_size_to_loff_t(fattr->size);
- if (cur_size != new_isize && nfsi->npages == 0)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ cur_size = i_size_read(inode);
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ if (cur_size != new_isize && nfsi->npages == 0)
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_REVAL_PAGECACHE;
+ }
/* Have any file permissions changed? */
- if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)
- || inode->i_uid != fattr->uid
- || inode->i_gid != fattr->gid)
+ if ((fattr->valid & NFS_ATTR_FATTR_MODE) && (inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO))
+ invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+ if ((fattr->valid & NFS_ATTR_FATTR_OWNER) && inode->i_uid != fattr->uid)
+ invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
+ if ((fattr->valid & NFS_ATTR_FATTR_GROUP) && inode->i_gid != fattr->gid)
invalid |= NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL;
/* Has the link count changed? */
- if (inode->i_nlink != fattr->nlink)
+ if ((fattr->valid & NFS_ATTR_FATTR_NLINK) && inode->i_nlink != fattr->nlink)
invalid |= NFS_INO_INVALID_ATTR;
- if (!timespec_equal(&inode->i_atime, &fattr->atime))
+ if ((fattr->valid & NFS_ATTR_FATTR_ATIME) && !timespec_equal(&inode->i_atime, &fattr->atime))
invalid |= NFS_INO_INVALID_ATIME;
if (invalid != 0)
@@ -893,11 +944,15 @@ static int nfs_check_inode_attributes(struct inode *inode, struct nfs_fattr *fat
static int nfs_ctime_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
{
+ if (!(fattr->valid & NFS_ATTR_FATTR_CTIME))
+ return 0;
return timespec_compare(&fattr->ctime, &inode->i_ctime) > 0;
}
static int nfs_size_need_update(const struct inode *inode, const struct nfs_fattr *fattr)
{
+ if (!(fattr->valid & NFS_ATTR_FATTR_SIZE))
+ return 0;
return nfs_size_to_loff_t(fattr->size) > i_size_read(inode);
}
@@ -1033,20 +1088,31 @@ int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fa
/* Don't do a WCC update if these attributes are already stale */
if ((fattr->valid & NFS_ATTR_FATTR) == 0 ||
!nfs_inode_attrs_need_update(inode, fattr)) {
- fattr->valid &= ~(NFS_ATTR_WCC_V4|NFS_ATTR_WCC);
+ fattr->valid &= ~(NFS_ATTR_FATTR_PRECHANGE
+ | NFS_ATTR_FATTR_PRESIZE
+ | NFS_ATTR_FATTR_PREMTIME
+ | NFS_ATTR_FATTR_PRECTIME);
goto out_noforce;
}
- if ((fattr->valid & NFS_ATTR_FATTR_V4) != 0 &&
- (fattr->valid & NFS_ATTR_WCC_V4) == 0) {
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECHANGE) == 0) {
fattr->pre_change_attr = NFS_I(inode)->change_attr;
- fattr->valid |= NFS_ATTR_WCC_V4;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
}
- if ((fattr->valid & NFS_ATTR_FATTR) != 0 &&
- (fattr->valid & NFS_ATTR_WCC) == 0) {
+ if ((fattr->valid & NFS_ATTR_FATTR_CTIME) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRECTIME) == 0) {
memcpy(&fattr->pre_ctime, &inode->i_ctime, sizeof(fattr->pre_ctime));
+ fattr->valid |= NFS_ATTR_FATTR_PRECTIME;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_MTIME) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PREMTIME) == 0) {
memcpy(&fattr->pre_mtime, &inode->i_mtime, sizeof(fattr->pre_mtime));
+ fattr->valid |= NFS_ATTR_FATTR_PREMTIME;
+ }
+ if ((fattr->valid & NFS_ATTR_FATTR_SIZE) != 0 &&
+ (fattr->valid & NFS_ATTR_FATTR_PRESIZE) == 0) {
fattr->pre_size = i_size_read(inode);
- fattr->valid |= NFS_ATTR_WCC;
+ fattr->valid |= NFS_ATTR_FATTR_PRESIZE;
}
out_noforce:
status = nfs_post_op_update_inode_locked(inode, fattr);
@@ -1078,18 +1144,18 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
__func__, inode->i_sb->s_id, inode->i_ino,
atomic_read(&inode->i_count), fattr->valid);
- if (nfsi->fileid != fattr->fileid)
+ if ((fattr->valid & NFS_ATTR_FATTR_FILEID) && nfsi->fileid != fattr->fileid)
goto out_fileid;
/*
* Make sure the inode's type hasn't changed.
*/
- if ((inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
+ if ((fattr->valid & NFS_ATTR_FATTR_TYPE) && (inode->i_mode & S_IFMT) != (fattr->mode & S_IFMT))
goto out_changed;
server = NFS_SERVER(inode);
/* Update the fsid? */
- if (S_ISDIR(inode->i_mode) &&
+ if (S_ISDIR(inode->i_mode) && (fattr->valid & NFS_ATTR_FATTR_FSID) &&
!nfs_fsid_equal(&server->fsid, &fattr->fsid) &&
!test_bit(NFS_INO_MOUNTPOINT, &nfsi->flags))
server->fsid = fattr->fsid;
@@ -1099,14 +1165,27 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
*/
nfsi->read_cache_jiffies = fattr->time_start;
- nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR | NFS_INO_INVALID_ATIME
- | NFS_INO_REVAL_PAGECACHE);
+ if ((fattr->valid & NFS_ATTR_FATTR_CHANGE) || (fattr->valid & (NFS_ATTR_FATTR_MTIME|NFS_ATTR_FATTR_CTIME)))
+ nfsi->cache_validity &= ~(NFS_INO_INVALID_ATTR
+ | NFS_INO_INVALID_ATIME
+ | NFS_INO_REVAL_PAGECACHE);
/* Do atomic weak cache consistency updates */
nfs_wcc_update_inode(inode, fattr);
/* More cache consistency checks */
- if (!(fattr->valid & NFS_ATTR_FATTR_V4)) {
+ if (fattr->valid & NFS_ATTR_FATTR_CHANGE) {
+ if (nfsi->change_attr != fattr->change_attr) {
+ dprintk("NFS: change_attr change on server for file %s/%ld\n",
+ inode->i_sb->s_id, inode->i_ino);
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ if (S_ISDIR(inode->i_mode))
+ nfs_force_lookup_revalidate(inode);
+ nfsi->change_attr = fattr->change_attr;
+ }
+ }
+
+ if (fattr->valid & NFS_ATTR_FATTR_MTIME) {
/* NFSv2/v3: Check if the mtime agrees */
if (!timespec_equal(&inode->i_mtime, &fattr->mtime)) {
dprintk("NFS: mtime change on server for file %s/%ld\n",
@@ -1114,59 +1193,80 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
if (S_ISDIR(inode->i_mode))
nfs_force_lookup_revalidate(inode);
+ memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
}
+ }
+ if (fattr->valid & NFS_ATTR_FATTR_CTIME) {
/* If ctime has changed we should definitely clear access+acl caches */
- if (!timespec_equal(&inode->i_ctime, &fattr->ctime))
+ if (!timespec_equal(&inode->i_ctime, &fattr->ctime)) {
invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
- } else if (nfsi->change_attr != fattr->change_attr) {
- dprintk("NFS: change_attr change on server for file %s/%ld\n",
- inode->i_sb->s_id, inode->i_ino);
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
- if (S_ISDIR(inode->i_mode))
- nfs_force_lookup_revalidate(inode);
+ /* and probably clear data for a directory too as utimes can cause
+ * havoc with our cache.
+ */
+ if (S_ISDIR(inode->i_mode)) {
+ invalid |= NFS_INO_INVALID_DATA;
+ nfs_force_lookup_revalidate(inode);
+ }
+ memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
+ }
}
/* Check if our cached file size is stale */
- new_isize = nfs_size_to_loff_t(fattr->size);
- cur_isize = i_size_read(inode);
- if (new_isize != cur_isize) {
- /* Do we perhaps have any outstanding writes, or has
- * the file grown beyond our last write? */
- if (nfsi->npages == 0 || new_isize > cur_isize) {
- i_size_write(inode, new_isize);
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ if (fattr->valid & NFS_ATTR_FATTR_SIZE) {
+ new_isize = nfs_size_to_loff_t(fattr->size);
+ cur_isize = i_size_read(inode);
+ if (new_isize != cur_isize) {
+ /* Do we perhaps have any outstanding writes, or has
+ * the file grown beyond our last write? */
+ if (nfsi->npages == 0 || new_isize > cur_isize) {
+ i_size_write(inode, new_isize);
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_DATA;
+ }
+ dprintk("NFS: isize change on server for file %s/%ld\n",
+ inode->i_sb->s_id, inode->i_ino);
}
- dprintk("NFS: isize change on server for file %s/%ld\n",
- inode->i_sb->s_id, inode->i_ino);
}
- memcpy(&inode->i_mtime, &fattr->mtime, sizeof(inode->i_mtime));
- memcpy(&inode->i_ctime, &fattr->ctime, sizeof(inode->i_ctime));
- memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
- nfsi->change_attr = fattr->change_attr;
-
- if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO) ||
- inode->i_uid != fattr->uid ||
- inode->i_gid != fattr->gid)
- invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ if (fattr->valid & NFS_ATTR_FATTR_ATIME)
+ memcpy(&inode->i_atime, &fattr->atime, sizeof(inode->i_atime));
- if (inode->i_nlink != fattr->nlink)
- invalid |= NFS_INO_INVALID_ATTR;
+ if (fattr->valid & NFS_ATTR_FATTR_MODE) {
+ if ((inode->i_mode & S_IALLUGO) != (fattr->mode & S_IALLUGO)) {
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ inode->i_mode = fattr->mode;
+ }
+ }
+ if (fattr->valid & NFS_ATTR_FATTR_OWNER) {
+ if (inode->i_uid != fattr->uid) {
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ inode->i_uid = fattr->uid;
+ }
+ }
+ if (fattr->valid & NFS_ATTR_FATTR_GROUP) {
+ if (inode->i_gid != fattr->gid) {
+ invalid |= NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ACCESS|NFS_INO_INVALID_ACL;
+ inode->i_gid = fattr->gid;
+ }
+ }
- inode->i_mode = fattr->mode;
- inode->i_nlink = fattr->nlink;
- inode->i_uid = fattr->uid;
- inode->i_gid = fattr->gid;
+ if (fattr->valid & NFS_ATTR_FATTR_NLINK) {
+ if (inode->i_nlink != fattr->nlink) {
+ invalid |= NFS_INO_INVALID_ATTR;
+ if (S_ISDIR(inode->i_mode))
+ invalid |= NFS_INO_INVALID_DATA;
+ inode->i_nlink = fattr->nlink;
+ }
+ }
- if (fattr->valid & (NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4)) {
+ if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) {
/*
* report the blocks in 512byte units
*/
inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used);
- } else {
- inode->i_blocks = fattr->du.nfs2.blocks;
}
+ if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED)
+ inode->i_blocks = fattr->du.nfs2.blocks;
/* Update attrtimeo value if we're out of the unstable period */
if (invalid & NFS_INO_INVALID_ATTR) {
@@ -1274,7 +1374,6 @@ static void init_once(void *foo)
INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
INIT_RADIX_TREE(&nfsi->nfs_page_tree, GFP_ATOMIC);
- nfsi->ncommit = 0;
nfsi->npages = 0;
atomic_set(&nfsi->silly_count, 1);
INIT_HLIST_HEAD(&nfsi->silly_list);
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 340ede8f608..2041f68ff1c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -152,6 +152,9 @@ extern __be32 *nfs4_decode_dirent(__be32 *p, struct nfs_entry *entry, int plus);
extern struct rpc_procinfo nfs4_procedures[];
#endif
+/* proc.c */
+void nfs_close_context(struct nfs_open_context *ctx, int is_sync);
+
/* dir.c */
extern int nfs_access_cache_shrinker(int nr_to_scan, gfp_t gfp_mask);
@@ -165,6 +168,7 @@ extern void nfs_clear_inode(struct inode *);
extern void nfs4_clear_inode(struct inode *);
#endif
void nfs_zap_acl_cache(struct inode *inode);
+extern int nfs_wait_bit_killable(void *word);
/* super.c */
void nfs_parse_ip_address(char *, size_t, struct sockaddr *, size_t *);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index 28bab67d151..c862c9340f9 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -120,8 +120,8 @@ xdr_decode_time(__be32 *p, struct timespec *timep)
static __be32 *
xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
{
- u32 rdev;
- fattr->type = (enum nfs_ftype) ntohl(*p++);
+ u32 rdev, type;
+ type = ntohl(*p++);
fattr->mode = ntohl(*p++);
fattr->nlink = ntohl(*p++);
fattr->uid = ntohl(*p++);
@@ -136,10 +136,9 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
p = xdr_decode_time(p, &fattr->atime);
p = xdr_decode_time(p, &fattr->mtime);
p = xdr_decode_time(p, &fattr->ctime);
- fattr->valid |= NFS_ATTR_FATTR;
+ fattr->valid |= NFS_ATTR_FATTR_V2;
fattr->rdev = new_decode_dev(rdev);
- if (fattr->type == NFCHR && rdev == NFS2_FIFO_DEV) {
- fattr->type = NFFIFO;
+ if (type == NFCHR && rdev == NFS2_FIFO_DEV) {
fattr->mode = (fattr->mode & ~S_IFMT) | S_IFIFO;
fattr->rdev = 0;
}
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index c55be7a7679..b82fe6847f1 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -834,4 +834,5 @@ const struct nfs_rpc_ops nfs_v3_clientops = {
.commit_done = nfs3_commit_done,
.lock = nfs3_proc_lock,
.clear_acl_cache = nfs3_forget_cached_acls,
+ .close_context = nfs_close_context,
};
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 6cdeacffde4..e6a1932c711 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -91,19 +91,15 @@
/*
* Map file type to S_IFMT bits
*/
-static struct {
- unsigned int mode;
- unsigned int nfs2type;
-} nfs_type2fmt[] = {
- { 0, NFNON },
- { S_IFREG, NFREG },
- { S_IFDIR, NFDIR },
- { S_IFBLK, NFBLK },
- { S_IFCHR, NFCHR },
- { S_IFLNK, NFLNK },
- { S_IFSOCK, NFSOCK },
- { S_IFIFO, NFFIFO },
- { 0, NFBAD }
+static const umode_t nfs_type2fmt[] = {
+ [NF3BAD] = 0,
+ [NF3REG] = S_IFREG,
+ [NF3DIR] = S_IFDIR,
+ [NF3BLK] = S_IFBLK,
+ [NF3CHR] = S_IFCHR,
+ [NF3LNK] = S_IFLNK,
+ [NF3SOCK] = S_IFSOCK,
+ [NF3FIFO] = S_IFIFO,
};
/*
@@ -148,13 +144,12 @@ static __be32 *
xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
{
unsigned int type, major, minor;
- int fmode;
+ umode_t fmode;
type = ntohl(*p++);
- if (type >= NF3BAD)
- type = NF3BAD;
- fmode = nfs_type2fmt[type].mode;
- fattr->type = nfs_type2fmt[type].nfs2type;
+ if (type > NF3FIFO)
+ type = NF3NON;
+ fmode = nfs_type2fmt[type];
fattr->mode = (ntohl(*p++) & ~S_IFMT) | fmode;
fattr->nlink = ntohl(*p++);
fattr->uid = ntohl(*p++);
@@ -177,7 +172,7 @@ xdr_decode_fattr(__be32 *p, struct nfs_fattr *fattr)
p = xdr_decode_time3(p, &fattr->ctime);
/* Update the mode bits */
- fattr->valid |= (NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3);
+ fattr->valid |= NFS_ATTR_FATTR_V3;
return p;
}
@@ -233,7 +228,9 @@ xdr_decode_wcc_attr(__be32 *p, struct nfs_fattr *fattr)
p = xdr_decode_hyper(p, &fattr->pre_size);
p = xdr_decode_time3(p, &fattr->pre_mtime);
p = xdr_decode_time3(p, &fattr->pre_ctime);
- fattr->valid |= NFS_ATTR_WCC;
+ fattr->valid |= NFS_ATTR_FATTR_PRESIZE
+ | NFS_ATTR_FATTR_PREMTIME
+ | NFS_ATTR_FATTR_PRECTIME;
return p;
}
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 8dde84b988d..97bacccff57 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -193,14 +193,6 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
kunmap_atomic(start, KM_USER0);
}
-static int nfs4_wait_bit_killable(void *word)
-{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
- schedule();
- return 0;
-}
-
static int nfs4_wait_clnt_recover(struct nfs_client *clp)
{
int res;
@@ -208,7 +200,7 @@ static int nfs4_wait_clnt_recover(struct nfs_client *clp)
might_sleep();
res = wait_on_bit(&clp->cl_state, NFS4CLNT_MANAGER_RUNNING,
- nfs4_wait_bit_killable, TASK_KILLABLE);
+ nfs_wait_bit_killable, TASK_KILLABLE);
return res;
}
@@ -1439,7 +1431,7 @@ int nfs4_do_close(struct path *path, struct nfs4_state *state, int wait)
if (calldata->arg.seqid == NULL)
goto out_free_calldata;
calldata->arg.fmode = 0;
- calldata->arg.bitmask = server->attr_bitmask;
+ calldata->arg.bitmask = server->cache_consistency_bitmask;
calldata->res.fattr = &calldata->fattr;
calldata->res.seqid = calldata->arg.seqid;
calldata->res.server = server;
@@ -1580,6 +1572,15 @@ out_drop:
return 0;
}
+void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
+{
+ if (ctx->state == NULL)
+ return;
+ if (is_sync)
+ nfs4_close_sync(&ctx->path, ctx->state, ctx->mode);
+ else
+ nfs4_close_state(&ctx->path, ctx->state, ctx->mode);
+}
static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *fhandle)
{
@@ -1600,6 +1601,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f
server->caps |= NFS_CAP_HARDLINKS;
if (res.has_symlinks != 0)
server->caps |= NFS_CAP_SYMLINKS;
+ memcpy(server->cache_consistency_bitmask, res.attr_bitmask, sizeof(server->cache_consistency_bitmask));
+ server->cache_consistency_bitmask[0] &= FATTR4_WORD0_CHANGE|FATTR4_WORD0_SIZE;
+ server->cache_consistency_bitmask[1] &= FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY;
server->acl_bitmask = res.acl_bitmask;
}
return status;
@@ -2079,7 +2083,7 @@ static void nfs4_proc_unlink_setup(struct rpc_message *msg, struct inode *dir)
struct nfs_removeargs *args = msg->rpc_argp;
struct nfs_removeres *res = msg->rpc_resp;
- args->bitmask = server->attr_bitmask;
+ args->bitmask = server->cache_consistency_bitmask;
res->server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_REMOVE];
}
@@ -2323,7 +2327,7 @@ static int _nfs4_proc_readdir(struct dentry *dentry, struct rpc_cred *cred,
.pages = &page,
.pgbase = 0,
.count = count,
- .bitmask = NFS_SERVER(dentry->d_inode)->attr_bitmask,
+ .bitmask = NFS_SERVER(dentry->d_inode)->cache_consistency_bitmask,
};
struct nfs4_readdir_res res;
struct rpc_message msg = {
@@ -2552,7 +2556,7 @@ static void nfs4_proc_write_setup(struct nfs_write_data *data, struct rpc_messag
{
struct nfs_server *server = NFS_SERVER(data->inode);
- data->args.bitmask = server->attr_bitmask;
+ data->args.bitmask = server->cache_consistency_bitmask;
data->res.server = server;
data->timestamp = jiffies;
@@ -2575,7 +2579,7 @@ static void nfs4_proc_commit_setup(struct nfs_write_data *data, struct rpc_messa
{
struct nfs_server *server = NFS_SERVER(data->inode);
- data->args.bitmask = server->attr_bitmask;
+ data->args.bitmask = server->cache_consistency_bitmask;
data->res.server = server;
msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_COMMIT];
}
@@ -3678,6 +3682,19 @@ ssize_t nfs4_listxattr(struct dentry *dentry, char *buf, size_t buflen)
return len;
}
+static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr)
+{
+ if (!((fattr->valid & NFS_ATTR_FATTR_FILEID) &&
+ (fattr->valid & NFS_ATTR_FATTR_FSID) &&
+ (fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL)))
+ return;
+
+ fattr->valid |= NFS_ATTR_FATTR_TYPE | NFS_ATTR_FATTR_MODE |
+ NFS_ATTR_FATTR_NLINK;
+ fattr->mode = S_IFDIR | S_IRUGO | S_IXUGO;
+ fattr->nlink = 2;
+}
+
int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
struct nfs4_fs_locations *fs_locations, struct page *page)
{
@@ -3704,6 +3721,7 @@ int nfs4_proc_fs_locations(struct inode *dir, const struct qstr *name,
fs_locations->server = server;
fs_locations->nlocations = 0;
status = rpc_call_sync(server->client, &msg, 0);
+ nfs_fixup_referral_attributes(&fs_locations->fattr);
dprintk("%s: returned status = %d\n", __func__, status);
return status;
}
@@ -3767,6 +3785,7 @@ const struct nfs_rpc_ops nfs_v4_clientops = {
.commit_done = nfs4_commit_done,
.lock = nfs4_proc_lock,
.clear_acl_cache = nfs4_zap_acl_attr,
+ .close_context = nfs4_close_context,
};
/*
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 2022fe47966..0298e909559 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -62,8 +62,14 @@ static LIST_HEAD(nfs4_clientid_list);
static int nfs4_init_client(struct nfs_client *clp, struct rpc_cred *cred)
{
- int status = nfs4_proc_setclientid(clp, NFS4_CALLBACK,
- nfs_callback_tcpport, cred);
+ unsigned short port;
+ int status;
+
+ port = nfs_callback_tcpport;
+ if (clp->cl_addr.ss_family == AF_INET6)
+ port = nfs_callback_tcpport6;
+
+ status = nfs4_proc_setclientid(clp, NFS4_CALLBACK, port, cred);
if (status == 0)
status = nfs4_proc_setclientid_confirm(clp, cred);
if (status == 0)
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index d1e4c8f8a0a..1690f0e44b9 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -522,20 +522,17 @@ static int nfs4_stat_to_errno(int);
decode_lookup_maxsz + \
decode_fs_locations_maxsz)
-static struct {
- unsigned int mode;
- unsigned int nfs2type;
-} nfs_type2fmt[] = {
- { 0, NFNON },
- { S_IFREG, NFREG },
- { S_IFDIR, NFDIR },
- { S_IFBLK, NFBLK },
- { S_IFCHR, NFCHR },
- { S_IFLNK, NFLNK },
- { S_IFSOCK, NFSOCK },
- { S_IFIFO, NFFIFO },
- { 0, NFNON },
- { 0, NFNON },
+static const umode_t nfs_type2fmt[] = {
+ [NF4BAD] = 0,
+ [NF4REG] = S_IFREG,
+ [NF4DIR] = S_IFDIR,
+ [NF4BLK] = S_IFBLK,
+ [NF4CHR] = S_IFCHR,
+ [NF4LNK] = S_IFLNK,
+ [NF4SOCK] = S_IFSOCK,
+ [NF4FIFO] = S_IFIFO,
+ [NF4ATTRDIR] = 0,
+ [NF4NAMEDATTR] = 0,
};
struct compound_hdr {
@@ -2160,6 +2157,7 @@ static int decode_attr_supported(struct xdr_stream *xdr, uint32_t *bitmap, uint3
static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *type)
{
__be32 *p;
+ int ret = 0;
*type = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_TYPE - 1U)))
@@ -2172,14 +2170,16 @@ static int decode_attr_type(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *
return -EIO;
}
bitmap[0] &= ~FATTR4_WORD0_TYPE;
+ ret = NFS_ATTR_FATTR_TYPE;
}
- dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type].nfs2type);
- return 0;
+ dprintk("%s: type=0%o\n", __func__, nfs_type2fmt[*type]);
+ return ret;
}
static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *change)
{
__be32 *p;
+ int ret = 0;
*change = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_CHANGE - 1U)))
@@ -2188,15 +2188,17 @@ static int decode_attr_change(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
READ_BUF(8);
READ64(*change);
bitmap[0] &= ~FATTR4_WORD0_CHANGE;
+ ret = NFS_ATTR_FATTR_CHANGE;
}
dprintk("%s: change attribute=%Lu\n", __func__,
(unsigned long long)*change);
- return 0;
+ return ret;
}
static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *size)
{
__be32 *p;
+ int ret = 0;
*size = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_SIZE - 1U)))
@@ -2205,9 +2207,10 @@ static int decode_attr_size(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *
READ_BUF(8);
READ64(*size);
bitmap[0] &= ~FATTR4_WORD0_SIZE;
+ ret = NFS_ATTR_FATTR_SIZE;
}
dprintk("%s: file size=%Lu\n", __func__, (unsigned long long)*size);
- return 0;
+ return ret;
}
static int decode_attr_link_support(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2245,6 +2248,7 @@ static int decode_attr_symlink_support(struct xdr_stream *xdr, uint32_t *bitmap,
static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_fsid *fsid)
{
__be32 *p;
+ int ret = 0;
fsid->major = 0;
fsid->minor = 0;
@@ -2255,11 +2259,12 @@ static int decode_attr_fsid(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs
READ64(fsid->major);
READ64(fsid->minor);
bitmap[0] &= ~FATTR4_WORD0_FSID;
+ ret = NFS_ATTR_FATTR_FSID;
}
dprintk("%s: fsid=(0x%Lx/0x%Lx)\n", __func__,
(unsigned long long)fsid->major,
(unsigned long long)fsid->minor);
- return 0;
+ return ret;
}
static int decode_attr_lease_time(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res)
@@ -2297,6 +2302,7 @@ static int decode_attr_aclsupport(struct xdr_stream *xdr, uint32_t *bitmap, uint
static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
{
__be32 *p;
+ int ret = 0;
*fileid = 0;
if (unlikely(bitmap[0] & (FATTR4_WORD0_FILEID - 1U)))
@@ -2305,14 +2311,16 @@ static int decode_attr_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t
READ_BUF(8);
READ64(*fileid);
bitmap[0] &= ~FATTR4_WORD0_FILEID;
+ ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
- return 0;
+ return ret;
}
static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *fileid)
{
__be32 *p;
+ int ret = 0;
*fileid = 0;
if (unlikely(bitmap[1] & (FATTR4_WORD1_MOUNTED_ON_FILEID - 1U)))
@@ -2321,9 +2329,10 @@ static int decode_attr_mounted_on_fileid(struct xdr_stream *xdr, uint32_t *bitma
READ_BUF(8);
READ64(*fileid);
bitmap[1] &= ~FATTR4_WORD1_MOUNTED_ON_FILEID;
+ ret = NFS_ATTR_FATTR_FILEID;
}
dprintk("%s: fileid=%Lu\n", __func__, (unsigned long long)*fileid);
- return 0;
+ return ret;
}
static int decode_attr_files_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2479,6 +2488,8 @@ static int decode_attr_fs_locations(struct xdr_stream *xdr, uint32_t *bitmap, st
if (res->nlocations < NFS4_FS_LOCATIONS_MAXENTRIES)
res->nlocations++;
}
+ if (res->nlocations != 0)
+ status = NFS_ATTR_FATTR_V4_REFERRAL;
out:
dprintk("%s: fs_locations done, error = %d\n", __func__, status);
return status;
@@ -2580,26 +2591,30 @@ static int decode_attr_maxwrite(struct xdr_stream *xdr, uint32_t *bitmap, uint32
return status;
}
-static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *mode)
+static int decode_attr_mode(struct xdr_stream *xdr, uint32_t *bitmap, umode_t *mode)
{
+ uint32_t tmp;
__be32 *p;
+ int ret = 0;
*mode = 0;
if (unlikely(bitmap[1] & (FATTR4_WORD1_MODE - 1U)))
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_MODE)) {
READ_BUF(4);
- READ32(*mode);
- *mode &= ~S_IFMT;
+ READ32(tmp);
+ *mode = tmp & ~S_IFMT;
bitmap[1] &= ~FATTR4_WORD1_MODE;
+ ret = NFS_ATTR_FATTR_MODE;
}
dprintk("%s: file mode=0%o\n", __func__, (unsigned int)*mode);
- return 0;
+ return ret;
}
static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *nlink)
{
__be32 *p;
+ int ret = 0;
*nlink = 1;
if (unlikely(bitmap[1] & (FATTR4_WORD1_NUMLINKS - 1U)))
@@ -2608,15 +2623,17 @@ static int decode_attr_nlink(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t
READ_BUF(4);
READ32(*nlink);
bitmap[1] &= ~FATTR4_WORD1_NUMLINKS;
+ ret = NFS_ATTR_FATTR_NLINK;
}
dprintk("%s: nlink=%u\n", __func__, (unsigned int)*nlink);
- return 0;
+ return ret;
}
static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *uid)
{
uint32_t len;
__be32 *p;
+ int ret = 0;
*uid = -2;
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER - 1U)))
@@ -2626,7 +2643,9 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
READ32(len);
READ_BUF(len);
if (len < XDR_MAX_NETOBJ) {
- if (nfs_map_name_to_uid(clp, (char *)p, len, uid) != 0)
+ if (nfs_map_name_to_uid(clp, (char *)p, len, uid) == 0)
+ ret = NFS_ATTR_FATTR_OWNER;
+ else
dprintk("%s: nfs_map_name_to_uid failed!\n",
__func__);
} else
@@ -2635,13 +2654,14 @@ static int decode_attr_owner(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
bitmap[1] &= ~FATTR4_WORD1_OWNER;
}
dprintk("%s: uid=%d\n", __func__, (int)*uid);
- return 0;
+ return ret;
}
static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nfs_client *clp, uint32_t *gid)
{
uint32_t len;
__be32 *p;
+ int ret = 0;
*gid = -2;
if (unlikely(bitmap[1] & (FATTR4_WORD1_OWNER_GROUP - 1U)))
@@ -2651,7 +2671,9 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
READ32(len);
READ_BUF(len);
if (len < XDR_MAX_NETOBJ) {
- if (nfs_map_group_to_gid(clp, (char *)p, len, gid) != 0)
+ if (nfs_map_group_to_gid(clp, (char *)p, len, gid) == 0)
+ ret = NFS_ATTR_FATTR_GROUP;
+ else
dprintk("%s: nfs_map_group_to_gid failed!\n",
__func__);
} else
@@ -2660,13 +2682,14 @@ static int decode_attr_group(struct xdr_stream *xdr, uint32_t *bitmap, struct nf
bitmap[1] &= ~FATTR4_WORD1_OWNER_GROUP;
}
dprintk("%s: gid=%d\n", __func__, (int)*gid);
- return 0;
+ return ret;
}
static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rdev)
{
uint32_t major = 0, minor = 0;
__be32 *p;
+ int ret = 0;
*rdev = MKDEV(0,0);
if (unlikely(bitmap[1] & (FATTR4_WORD1_RAWDEV - 1U)))
@@ -2681,9 +2704,10 @@ static int decode_attr_rdev(struct xdr_stream *xdr, uint32_t *bitmap, dev_t *rde
if (MAJOR(tmp) == major && MINOR(tmp) == minor)
*rdev = tmp;
bitmap[1] &= ~ FATTR4_WORD1_RAWDEV;
+ ret = NFS_ATTR_FATTR_RDEV;
}
dprintk("%s: rdev=(0x%x:0x%x)\n", __func__, major, minor);
- return 0;
+ return ret;
}
static int decode_attr_space_avail(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *res)
@@ -2740,6 +2764,7 @@ static int decode_attr_space_total(struct xdr_stream *xdr, uint32_t *bitmap, uin
static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint64_t *used)
{
__be32 *p;
+ int ret = 0;
*used = 0;
if (unlikely(bitmap[1] & (FATTR4_WORD1_SPACE_USED - 1U)))
@@ -2748,10 +2773,11 @@ static int decode_attr_space_used(struct xdr_stream *xdr, uint32_t *bitmap, uint
READ_BUF(8);
READ64(*used);
bitmap[1] &= ~FATTR4_WORD1_SPACE_USED;
+ ret = NFS_ATTR_FATTR_SPACE_USED;
}
dprintk("%s: space used=%Lu\n", __func__,
(unsigned long long)*used);
- return 0;
+ return ret;
}
static int decode_attr_time(struct xdr_stream *xdr, struct timespec *time)
@@ -2778,6 +2804,8 @@ static int decode_attr_time_access(struct xdr_stream *xdr, uint32_t *bitmap, str
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_TIME_ACCESS)) {
status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_ATIME;
bitmap[1] &= ~FATTR4_WORD1_TIME_ACCESS;
}
dprintk("%s: atime=%ld\n", __func__, (long)time->tv_sec);
@@ -2794,6 +2822,8 @@ static int decode_attr_time_metadata(struct xdr_stream *xdr, uint32_t *bitmap, s
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_TIME_METADATA)) {
status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_CTIME;
bitmap[1] &= ~FATTR4_WORD1_TIME_METADATA;
}
dprintk("%s: ctime=%ld\n", __func__, (long)time->tv_sec);
@@ -2810,6 +2840,8 @@ static int decode_attr_time_modify(struct xdr_stream *xdr, uint32_t *bitmap, str
return -EIO;
if (likely(bitmap[1] & FATTR4_WORD1_TIME_MODIFY)) {
status = decode_attr_time(xdr, time);
+ if (status == 0)
+ status = NFS_ATTR_FATTR_MTIME;
bitmap[1] &= ~FATTR4_WORD1_TIME_MODIFY;
}
dprintk("%s: mtime=%ld\n", __func__, (long)time->tv_sec);
@@ -2994,63 +3026,116 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr, cons
uint32_t attrlen,
bitmap[2] = {0},
type;
- int status, fmode = 0;
+ int status;
+ umode_t fmode = 0;
uint64_t fileid;
- if ((status = decode_op_hdr(xdr, OP_GETATTR)) != 0)
- goto xdr_error;
- if ((status = decode_attr_bitmap(xdr, bitmap)) != 0)
+ status = decode_op_hdr(xdr, OP_GETATTR);
+ if (status < 0)
goto xdr_error;
- fattr->bitmap[0] = bitmap[0];
- fattr->bitmap[1] = bitmap[1];
+ status = decode_attr_bitmap(xdr, bitmap);
+ if (status < 0)
+ goto xdr_error;
- if ((status = decode_attr_length(xdr, &attrlen, &savep)) != 0)
+ status = decode_attr_length(xdr, &attrlen, &savep);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_type(xdr, bitmap, &type)) != 0)
+ status = decode_attr_type(xdr, bitmap, &type);
+ if (status < 0)
goto xdr_error;
- fattr->type = nfs_type2fmt[type].nfs2type;
- fmode = nfs_type2fmt[type].mode;
+ fattr->mode = 0;
+ if (status != 0) {
+ fattr->mode |= nfs_type2fmt[type];
+ fattr->valid |= status;
+ }
- if ((status = decode_attr_change(xdr, bitmap, &fattr->change_attr)) != 0)
+ status = decode_attr_change(xdr, bitmap, &fattr->change_attr);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_size(xdr, bitmap, &fattr->size)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_size(xdr, bitmap, &fattr->size);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_fsid(xdr, bitmap, &fattr->fsid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_fsid(xdr, bitmap, &fattr->fsid);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_fileid(xdr, bitmap, &fattr->fileid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_fileid(xdr, bitmap, &fattr->fileid);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
+ fattr->valid |= status;
+
+ status = decode_attr_fs_locations(xdr, bitmap, container_of(fattr,
struct nfs4_fs_locations,
- fattr))) != 0)
+ fattr));
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_mode(xdr, bitmap, &fattr->mode)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_mode(xdr, bitmap, &fmode);
+ if (status < 0)
goto xdr_error;
- fattr->mode |= fmode;
- if ((status = decode_attr_nlink(xdr, bitmap, &fattr->nlink)) != 0)
+ if (status != 0) {
+ fattr->mode |= fmode;
+ fattr->valid |= status;
+ }
+
+ status = decode_attr_nlink(xdr, bitmap, &fattr->nlink);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_owner(xdr, bitmap, server->nfs_client, &fattr->uid);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_group(xdr, bitmap, server->nfs_client, &fattr->gid);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_rdev(xdr, bitmap, &fattr->rdev)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_rdev(xdr, bitmap, &fattr->rdev);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_space_used(xdr, bitmap, &fattr->du.nfs3.used);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_time_access(xdr, bitmap, &fattr->atime)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_time_access(xdr, bitmap, &fattr->atime);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_time_metadata(xdr, bitmap, &fattr->ctime);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_time_modify(xdr, bitmap, &fattr->mtime);
+ if (status < 0)
goto xdr_error;
- if ((status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid)) != 0)
+ fattr->valid |= status;
+
+ status = decode_attr_mounted_on_fileid(xdr, bitmap, &fileid);
+ if (status < 0)
goto xdr_error;
- if (fattr->fileid == 0 && fileid != 0)
+ if (status != 0 && !(fattr->valid & status)) {
fattr->fileid = fileid;
- if ((status = verify_attr_len(xdr, savep, attrlen)) == 0)
- fattr->valid = NFS_ATTR_FATTR | NFS_ATTR_FATTR_V3 | NFS_ATTR_FATTR_V4;
+ fattr->valid |= status;
+ }
+
+ status = verify_attr_len(xdr, savep, attrlen);
xdr_error:
dprintk("%s: xdr returned %d\n", __func__, -status);
return status;
@@ -4078,9 +4163,7 @@ static int nfs4_xdr_dec_setattr(struct rpc_rqst *rqstp, __be32 *p, struct nfs_se
status = decode_setattr(&xdr, res);
if (status)
goto out;
- status = decode_getfattr(&xdr, res->fattr, res->server);
- if (status == NFS4ERR_DELAY)
- status = 0;
+ decode_getfattr(&xdr, res->fattr, res->server);
out:
return status;
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 7f079209d70..e2975939126 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -176,17 +176,6 @@ void nfs_release_request(struct nfs_page *req)
kref_put(&req->wb_kref, nfs_free_request);
}
-static int nfs_wait_bit_killable(void *word)
-{
- int ret = 0;
-
- if (fatal_signal_pending(current))
- ret = -ERESTARTSYS;
- else
- schedule();
- return ret;
-}
-
/**
* nfs_wait_on_request - Wait for a request to complete.
* @req: request to wait upon.
diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c
index 193465210d7..7be72d90d49 100644
--- a/fs/nfs/proc.c
+++ b/fs/nfs/proc.c
@@ -663,4 +663,5 @@ const struct nfs_rpc_ops nfs_v2_clientops = {
.commit_setup = nfs_proc_commit_setup,
.lock = nfs_proc_lock,
.lock_check_bounds = nfs_lock_check_bounds,
+ .close_context = nfs_close_context,
};
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d6686f4786d..0942fcbbad3 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1018,6 +1018,7 @@ static int nfs_parse_mount_options(char *raw,
case Opt_rdma:
mnt->flags |= NFS_MOUNT_TCP; /* for side protocols */
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+ xprt_load_transport(p);
break;
case Opt_acl:
mnt->flags &= ~NFS_MOUNT_NOACL;
@@ -1205,12 +1206,14 @@ static int nfs_parse_mount_options(char *raw,
/* vector side protocols to TCP */
mnt->flags |= NFS_MOUNT_TCP;
mnt->nfs_server.protocol = XPRT_TRANSPORT_RDMA;
+ xprt_load_transport(string);
break;
default:
errors++;
dfprintk(MOUNT, "NFS: unrecognized "
"transport protocol\n");
}
+ kfree(string);
break;
case Opt_mountproto:
string = match_strdup(args);
@@ -1218,7 +1221,6 @@ static int nfs_parse_mount_options(char *raw,
goto out_nomem;
token = match_token(string,
nfs_xprt_protocol_tokens, args);
- kfree(string);
switch (token) {
case Opt_xprt_udp:
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index 9f9845859fc..e560a78995a 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -313,19 +313,34 @@ static int nfs_writepages_callback(struct page *page, struct writeback_control *
int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
+ unsigned long *bitlock = &NFS_I(inode)->flags;
struct nfs_pageio_descriptor pgio;
int err;
+ /* Stop dirtying of new pages while we sync */
+ err = wait_on_bit_lock(bitlock, NFS_INO_FLUSHING,
+ nfs_wait_bit_killable, TASK_KILLABLE);
+ if (err)
+ goto out_err;
+
nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES);
nfs_pageio_init_write(&pgio, inode, wb_priority(wbc));
err = write_cache_pages(mapping, wbc, nfs_writepages_callback, &pgio);
nfs_pageio_complete(&pgio);
+
+ clear_bit_unlock(NFS_INO_FLUSHING, bitlock);
+ smp_mb__after_clear_bit();
+ wake_up_bit(bitlock, NFS_INO_FLUSHING);
+
if (err < 0)
- return err;
- if (pgio.pg_error < 0)
- return pgio.pg_error;
+ goto out_err;
+ err = pgio.pg_error;
+ if (err < 0)
+ goto out_err;
return 0;
+out_err:
+ return err;
}
/*
@@ -404,7 +419,6 @@ nfs_mark_request_commit(struct nfs_page *req)
struct nfs_inode *nfsi = NFS_I(inode);
spin_lock(&inode->i_lock);
- nfsi->ncommit++;
set_bit(PG_CLEAN, &(req)->wb_flags);
radix_tree_tag_set(&nfsi->nfs_page_tree,
req->wb_index,
@@ -524,6 +538,12 @@ static void nfs_cancel_commit_list(struct list_head *head)
}
#if defined(CONFIG_NFS_V3) || defined(CONFIG_NFS_V4)
+static int
+nfs_need_commit(struct nfs_inode *nfsi)
+{
+ return radix_tree_tagged(&nfsi->nfs_page_tree, NFS_PAGE_TAG_COMMIT);
+}
+
/*
* nfs_scan_commit - Scan an inode for commit requests
* @inode: NFS inode to scan
@@ -538,16 +558,18 @@ static int
nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
{
struct nfs_inode *nfsi = NFS_I(inode);
- int res = 0;
- if (nfsi->ncommit != 0) {
- res = nfs_scan_list(nfsi, dst, idx_start, npages,
- NFS_PAGE_TAG_COMMIT);
- nfsi->ncommit -= res;
- }
- return res;
+ if (!nfs_need_commit(nfsi))
+ return 0;
+
+ return nfs_scan_list(nfsi, dst, idx_start, npages, NFS_PAGE_TAG_COMMIT);
}
#else
+static inline int nfs_need_commit(struct nfs_inode *nfsi)
+{
+ return 0;
+}
+
static inline int nfs_scan_commit(struct inode *inode, struct list_head *dst, pgoff_t idx_start, unsigned int npages)
{
return 0;
@@ -820,7 +842,7 @@ static int nfs_write_rpcsetup(struct nfs_page *req,
data->args.stable = NFS_UNSTABLE;
if (how & FLUSH_STABLE) {
data->args.stable = NFS_DATA_SYNC;
- if (!NFS_I(inode)->ncommit)
+ if (!nfs_need_commit(NFS_I(inode)))
data->args.stable = NFS_FILE_SYNC;
}
@@ -1425,18 +1447,13 @@ static int nfs_write_mapping(struct address_space *mapping, int how)
{
struct writeback_control wbc = {
.bdi = mapping->backing_dev_info,
- .sync_mode = WB_SYNC_NONE,
+ .sync_mode = WB_SYNC_ALL,
.nr_to_write = LONG_MAX,
.range_start = 0,
.range_end = LLONG_MAX,
.for_writepages = 1,
};
- int ret;
- ret = __nfs_write_mapping(mapping, &wbc, how);
- if (ret < 0)
- return ret;
- wbc.sync_mode = WB_SYNC_ALL;
return __nfs_write_mapping(mapping, &wbc, how);
}
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 3d93b2064ce..a4ed8644d69 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -938,10 +938,12 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
char transport[16];
int port;
if (sscanf(buf, "%15s %4d", transport, &port) == 2) {
+ if (port < 1 || port > 65535)
+ return -EINVAL;
err = nfsd_create_serv();
if (!err) {
err = svc_create_xprt(nfsd_serv,
- transport, port,
+ transport, PF_INET, port,
SVC_SOCK_ANONYMOUS);
if (err == -ENOENT)
/* Give a reasonable perror msg for
@@ -960,7 +962,7 @@ static ssize_t __write_ports(struct file *file, char *buf, size_t size)
char transport[16];
int port;
if (sscanf(&buf[1], "%15s %4d", transport, &port) == 2) {
- if (port == 0)
+ if (port < 1 || port > 65535)
return -EINVAL;
if (nfsd_serv) {
xprt = svc_find_xprt(nfsd_serv, transport,
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 07e4f5d7baa..bc3567bab8c 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -229,7 +229,6 @@ int nfsd_create_serv(void)
atomic_set(&nfsd_busy, 0);
nfsd_serv = svc_create_pooled(&nfsd_program, nfsd_max_blksize,
- AF_INET,
nfsd_last_thread, nfsd, THIS_MODULE);
if (nfsd_serv == NULL)
err = -ENOMEM;
@@ -244,7 +243,7 @@ static int nfsd_init_socks(int port)
if (!list_empty(&nfsd_serv->sv_permsocks))
return 0;
- error = svc_create_xprt(nfsd_serv, "udp", port,
+ error = svc_create_xprt(nfsd_serv, "udp", PF_INET, port,
SVC_SOCK_DEFAULTS);
if (error < 0)
return error;
@@ -253,7 +252,7 @@ static int nfsd_init_socks(int port)
if (error < 0)
return error;
- error = svc_create_xprt(nfsd_serv, "tcp", port,
+ error = svc_create_xprt(nfsd_serv, "tcp", PF_INET, port,
SVC_SOCK_DEFAULTS);
if (error < 0)
return error;
diff --git a/include/drm/drm_crtc_helper.h b/include/drm/drm_crtc_helper.h
index c7d4b2e606a..ec073d8288d 100644
--- a/include/drm/drm_crtc_helper.h
+++ b/include/drm/drm_crtc_helper.h
@@ -33,7 +33,6 @@
#ifndef __DRM_CRTC_HELPER_H__
#define __DRM_CRTC_HELPER_H__
-#include <linux/i2c.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/idr.h>
@@ -92,7 +91,7 @@ struct drm_connector_helper_funcs {
extern int drm_helper_probe_single_connector_modes(struct drm_connector *connector, uint32_t maxX, uint32_t maxY);
extern void drm_helper_disable_unused_functions(struct drm_device *dev);
extern int drm_helper_hotplug_stage_two(struct drm_device *dev);
-extern bool drm_helper_initial_config(struct drm_device *dev, bool can_grow);
+extern bool drm_helper_initial_config(struct drm_device *dev);
extern int drm_crtc_helper_set_config(struct drm_mode_set *set);
extern bool drm_crtc_helper_set_mode(struct drm_crtc *crtc,
struct drm_display_mode *mode,
diff --git a/include/drm/drm_os_linux.h b/include/drm/drm_os_linux.h
index 013551d03c0..26641e95e0a 100644
--- a/include/drm/drm_os_linux.h
+++ b/include/drm/drm_os_linux.h
@@ -7,12 +7,12 @@
#include <linux/delay.h>
#ifndef readq
-static u64 readq(void __iomem *reg)
+static inline u64 readq(void __iomem *reg)
{
return ((u64) readl(reg)) | (((u64) readl(reg + 4UL)) << 32);
}
-static void writeq(u64 val, void __iomem *reg)
+static inline void writeq(u64 val, void __iomem *reg)
{
writel(val & 0xffffffff, reg);
writel(val >> 32, reg + 0x4UL);
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 4d248b3f132..8815a3456b3 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -649,6 +649,12 @@ struct transaction_s
int t_handle_count;
/*
+ * This transaction is being forced and some process is
+ * waiting for it to finish.
+ */
+ int t_synchronous_commit:1;
+
+ /*
* For use by the filesystem to store fs-specific data
* structures associated with the transaction
*/
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 8cc8807f77d..bde2557c2a9 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -166,8 +166,7 @@ struct nfs_inode {
*/
struct radix_tree_root nfs_page_tree;
- unsigned long ncommit,
- npages;
+ unsigned long npages;
/* Open contexts for shared mmap writes */
struct list_head open_files;
@@ -207,6 +206,7 @@ struct nfs_inode {
#define NFS_INO_STALE (1) /* possible stale inode */
#define NFS_INO_ACL_LRU_SET (2) /* Inode is on the LRU list */
#define NFS_INO_MOUNTPOINT (3) /* inode is remote mountpoint */
+#define NFS_INO_FLUSHING (4) /* inode is flushing out data */
static inline struct nfs_inode *NFS_I(const struct inode *inode)
{
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 9bb81aec91c..29b1e40dce9 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -106,6 +106,11 @@ struct nfs_server {
u32 attr_bitmask[2];/* V4 bitmask representing the set
of attributes supported on this
filesystem */
+ u32 cache_consistency_bitmask[2];
+ /* V4 bitmask representing the subset
+ of change attribute, size, ctime
+ and mtime attributes supported by
+ the server */
u32 acl_bitmask; /* V4 bitmask representing the ACEs
that are supported on this
filesystem */
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 43a713fce11..b89c34e40bc 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -27,12 +27,8 @@ static inline int nfs_fsid_equal(const struct nfs_fsid *a, const struct nfs_fsid
}
struct nfs_fattr {
- unsigned short valid; /* which fields are valid */
- __u64 pre_size; /* pre_op_attr.size */
- struct timespec pre_mtime; /* pre_op_attr.mtime */
- struct timespec pre_ctime; /* pre_op_attr.ctime */
- enum nfs_ftype type; /* always use NFSv2 types */
- __u32 mode;
+ unsigned int valid; /* which fields are valid */
+ umode_t mode;
__u32 nlink;
__u32 uid;
__u32 gid;
@@ -52,19 +48,55 @@ struct nfs_fattr {
struct timespec atime;
struct timespec mtime;
struct timespec ctime;
- __u32 bitmap[2]; /* NFSv4 returned attribute bitmap */
__u64 change_attr; /* NFSv4 change attribute */
__u64 pre_change_attr;/* pre-op NFSv4 change attribute */
+ __u64 pre_size; /* pre_op_attr.size */
+ struct timespec pre_mtime; /* pre_op_attr.mtime */
+ struct timespec pre_ctime; /* pre_op_attr.ctime */
unsigned long time_start;
unsigned long gencount;
};
-#define NFS_ATTR_WCC 0x0001 /* pre-op WCC data */
-#define NFS_ATTR_FATTR 0x0002 /* post-op attributes */
-#define NFS_ATTR_FATTR_V3 0x0004 /* NFSv3 attributes */
-#define NFS_ATTR_FATTR_V4 0x0008 /* NFSv4 change attribute */
-#define NFS_ATTR_WCC_V4 0x0010 /* pre-op change attribute */
-#define NFS_ATTR_FATTR_V4_REFERRAL 0x0020 /* NFSv4 referral */
+#define NFS_ATTR_FATTR_TYPE (1U << 0)
+#define NFS_ATTR_FATTR_MODE (1U << 1)
+#define NFS_ATTR_FATTR_NLINK (1U << 2)
+#define NFS_ATTR_FATTR_OWNER (1U << 3)
+#define NFS_ATTR_FATTR_GROUP (1U << 4)
+#define NFS_ATTR_FATTR_RDEV (1U << 5)
+#define NFS_ATTR_FATTR_SIZE (1U << 6)
+#define NFS_ATTR_FATTR_PRESIZE (1U << 7)
+#define NFS_ATTR_FATTR_BLOCKS_USED (1U << 8)
+#define NFS_ATTR_FATTR_SPACE_USED (1U << 9)
+#define NFS_ATTR_FATTR_FSID (1U << 10)
+#define NFS_ATTR_FATTR_FILEID (1U << 11)
+#define NFS_ATTR_FATTR_ATIME (1U << 12)
+#define NFS_ATTR_FATTR_MTIME (1U << 13)
+#define NFS_ATTR_FATTR_CTIME (1U << 14)
+#define NFS_ATTR_FATTR_PREMTIME (1U << 15)
+#define NFS_ATTR_FATTR_PRECTIME (1U << 16)
+#define NFS_ATTR_FATTR_CHANGE (1U << 17)
+#define NFS_ATTR_FATTR_PRECHANGE (1U << 18)
+#define NFS_ATTR_FATTR_V4_REFERRAL (1U << 19) /* NFSv4 referral */
+
+#define NFS_ATTR_FATTR (NFS_ATTR_FATTR_TYPE \
+ | NFS_ATTR_FATTR_MODE \
+ | NFS_ATTR_FATTR_NLINK \
+ | NFS_ATTR_FATTR_OWNER \
+ | NFS_ATTR_FATTR_GROUP \
+ | NFS_ATTR_FATTR_RDEV \
+ | NFS_ATTR_FATTR_SIZE \
+ | NFS_ATTR_FATTR_FSID \
+ | NFS_ATTR_FATTR_FILEID \
+ | NFS_ATTR_FATTR_ATIME \
+ | NFS_ATTR_FATTR_MTIME \
+ | NFS_ATTR_FATTR_CTIME)
+#define NFS_ATTR_FATTR_V2 (NFS_ATTR_FATTR \
+ | NFS_ATTR_FATTR_BLOCKS_USED)
+#define NFS_ATTR_FATTR_V3 (NFS_ATTR_FATTR \
+ | NFS_ATTR_FATTR_SPACE_USED)
+#define NFS_ATTR_FATTR_V4 (NFS_ATTR_FATTR \
+ | NFS_ATTR_FATTR_SPACE_USED \
+ | NFS_ATTR_FATTR_CHANGE)
/*
* Info on the file system
@@ -836,6 +868,7 @@ struct nfs_rpc_ops {
int (*lock)(struct file *, int, struct file_lock *);
int (*lock_check_bounds)(const struct file_lock *);
void (*clear_acl_cache)(struct inode *);
+ void (*close_context)(struct nfs_open_context *ctx, int);
};
/*
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index 3435d24bfe5..d3a4c023193 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -69,7 +69,6 @@ struct svc_serv {
struct list_head sv_tempsocks; /* all temporary sockets */
int sv_tmpcnt; /* count of temporary sockets */
struct timer_list sv_temptimer; /* timer for aging temporary sockets */
- sa_family_t sv_family; /* listener's address family */
char * sv_name; /* service name */
@@ -385,19 +384,19 @@ struct svc_procedure {
/*
* Function prototypes.
*/
-struct svc_serv *svc_create(struct svc_program *, unsigned int, sa_family_t,
+struct svc_serv *svc_create(struct svc_program *, unsigned int,
void (*shutdown)(struct svc_serv *));
struct svc_rqst *svc_prepare_thread(struct svc_serv *serv,
struct svc_pool *pool);
void svc_exit_thread(struct svc_rqst *);
struct svc_serv * svc_create_pooled(struct svc_program *, unsigned int,
- sa_family_t, void (*shutdown)(struct svc_serv *),
+ void (*shutdown)(struct svc_serv *),
svc_thread_fn, struct module *);
int svc_set_num_threads(struct svc_serv *, struct svc_pool *, int);
void svc_destroy(struct svc_serv *);
int svc_process(struct svc_rqst *);
-int svc_register(const struct svc_serv *, const unsigned short,
- const unsigned short);
+int svc_register(const struct svc_serv *, const int,
+ const unsigned short, const unsigned short);
void svc_wake_up(struct svc_serv *);
void svc_reserve(struct svc_rqst *rqstp, int space);
diff --git a/include/linux/sunrpc/svc_xprt.h b/include/linux/sunrpc/svc_xprt.h
index 0127daca435..0d9cb6ef28b 100644
--- a/include/linux/sunrpc/svc_xprt.h
+++ b/include/linux/sunrpc/svc_xprt.h
@@ -71,7 +71,8 @@ int svc_reg_xprt_class(struct svc_xprt_class *);
void svc_unreg_xprt_class(struct svc_xprt_class *);
void svc_xprt_init(struct svc_xprt_class *, struct svc_xprt *,
struct svc_serv *);
-int svc_create_xprt(struct svc_serv *, char *, unsigned short, int);
+int svc_create_xprt(struct svc_serv *, const char *, const int,
+ const unsigned short, int);
void svc_xprt_enqueue(struct svc_xprt *xprt);
void svc_xprt_received(struct svc_xprt *);
void svc_xprt_put(struct svc_xprt *xprt);
@@ -80,7 +81,8 @@ void svc_close_xprt(struct svc_xprt *xprt);
void svc_delete_xprt(struct svc_xprt *xprt);
int svc_port_is_privileged(struct sockaddr *sin);
int svc_print_xprts(char *buf, int maxlen);
-struct svc_xprt *svc_find_xprt(struct svc_serv *, char *, int, int);
+struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
+ const sa_family_t af, const unsigned short port);
int svc_xprt_names(struct svc_serv *serv, char *buf, int buflen);
static inline void svc_xprt_get(struct svc_xprt *xprt)
@@ -88,29 +90,32 @@ static inline void svc_xprt_get(struct svc_xprt *xprt)
kref_get(&xprt->xpt_ref);
}
static inline void svc_xprt_set_local(struct svc_xprt *xprt,
- struct sockaddr *sa, int salen)
+ const struct sockaddr *sa,
+ const size_t salen)
{
memcpy(&xprt->xpt_local, sa, salen);
xprt->xpt_locallen = salen;
}
static inline void svc_xprt_set_remote(struct svc_xprt *xprt,
- struct sockaddr *sa, int salen)
+ const struct sockaddr *sa,
+ const size_t salen)
{
memcpy(&xprt->xpt_remote, sa, salen);
xprt->xpt_remotelen = salen;
}
-static inline unsigned short svc_addr_port(struct sockaddr *sa)
+static inline unsigned short svc_addr_port(const struct sockaddr *sa)
{
- unsigned short ret = 0;
+ const struct sockaddr_in *sin = (const struct sockaddr_in *)sa;
+ const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sa;
+
switch (sa->sa_family) {
case AF_INET:
- ret = ntohs(((struct sockaddr_in *)sa)->sin_port);
- break;
+ return ntohs(sin->sin_port);
case AF_INET6:
- ret = ntohs(((struct sockaddr_in6 *)sa)->sin6_port);
- break;
+ return ntohs(sin6->sin6_port);
}
- return ret;
+
+ return 0;
}
static inline size_t svc_addr_len(struct sockaddr *sa)
@@ -124,36 +129,39 @@ static inline size_t svc_addr_len(struct sockaddr *sa)
return -EAFNOSUPPORT;
}
-static inline unsigned short svc_xprt_local_port(struct svc_xprt *xprt)
+static inline unsigned short svc_xprt_local_port(const struct svc_xprt *xprt)
{
- return svc_addr_port((struct sockaddr *)&xprt->xpt_local);
+ return svc_addr_port((const struct sockaddr *)&xprt->xpt_local);
}
-static inline unsigned short svc_xprt_remote_port(struct svc_xprt *xprt)
+static inline unsigned short svc_xprt_remote_port(const struct svc_xprt *xprt)
{
- return svc_addr_port((struct sockaddr *)&xprt->xpt_remote);
+ return svc_addr_port((const struct sockaddr *)&xprt->xpt_remote);
}
-static inline char *__svc_print_addr(struct sockaddr *addr,
- char *buf, size_t len)
+static inline char *__svc_print_addr(const struct sockaddr *addr,
+ char *buf, const size_t len)
{
+ const struct sockaddr_in *sin = (const struct sockaddr_in *)addr;
+ const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)addr;
+
switch (addr->sa_family) {
case AF_INET:
- snprintf(buf, len, "%pI4, port=%u",
- &((struct sockaddr_in *)addr)->sin_addr,
- ntohs(((struct sockaddr_in *) addr)->sin_port));
+ snprintf(buf, len, "%pI4, port=%u", &sin->sin_addr,
+ ntohs(sin->sin_port));
break;
case AF_INET6:
snprintf(buf, len, "%pI6, port=%u",
- &((struct sockaddr_in6 *)addr)->sin6_addr,
- ntohs(((struct sockaddr_in6 *) addr)->sin6_port));
+ &sin6->sin6_addr,
+ ntohs(sin6->sin6_port));
break;
default:
snprintf(buf, len, "unknown address type: %d", addr->sa_family);
break;
}
+
return buf;
}
#endif /* SUNRPC_SVC_XPRT_H */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index 11fc71d50c1..1758d9f5b5c 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -235,6 +235,7 @@ static inline __be32 *xprt_skip_transport_header(struct rpc_xprt *xprt, __be32 *
*/
int xprt_register_transport(struct xprt_class *type);
int xprt_unregister_transport(struct xprt_class *type);
+int xprt_load_transport(const char *);
void xprt_set_retrans_timeout_def(struct rpc_task *task);
void xprt_set_retrans_timeout_rtt(struct rpc_task *task);
void xprt_wake_pending_tasks(struct rpc_xprt *xprt, int status);
@@ -259,6 +260,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie);
#define XPRT_BOUND (4)
#define XPRT_BINDING (5)
#define XPRT_CLOSING (6)
+#define XPRT_CONNECTION_ABORT (7)
static inline void xprt_set_connected(struct rpc_xprt *xprt)
{
diff --git a/net/sunrpc/Kconfig b/net/sunrpc/Kconfig
index 5592883e1e4..afd91c78ce8 100644
--- a/net/sunrpc/Kconfig
+++ b/net/sunrpc/Kconfig
@@ -17,28 +17,6 @@ config SUNRPC_XPRT_RDMA
If unsure, say N.
-config SUNRPC_REGISTER_V4
- bool "Register local RPC services via rpcbind v4 (EXPERIMENTAL)"
- depends on SUNRPC && EXPERIMENTAL
- default n
- help
- Sun added support for registering RPC services at an IPv6
- address by creating two new versions of the rpcbind protocol
- (RFC 1833).
-
- This option enables support in the kernel RPC server for
- registering kernel RPC services via version 4 of the rpcbind
- protocol. If you enable this option, you must run a portmapper
- daemon that supports rpcbind protocol version 4.
-
- Serving NFS over IPv6 from knfsd (the kernel's NFS server)
- requires that you enable this option and use a portmapper that
- supports rpcbind version 4.
-
- If unsure, say N to get traditional behavior (register kernel
- RPC services using only rpcbind version 2). Distributions
- using the legacy Linux portmapper daemon must say N here.
-
config RPCSEC_GSS_KRB5
tristate "Secure RPC: Kerberos V mechanism (EXPERIMENTAL)"
depends on SUNRPC && EXPERIMENTAL
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 836f15c0c4a..5abab094441 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1032,27 +1032,20 @@ call_connect_status(struct rpc_task *task)
dprint_status(task);
task->tk_status = 0;
- if (status >= 0) {
+ if (status >= 0 || status == -EAGAIN) {
clnt->cl_stats->netreconn++;
task->tk_action = call_transmit;
return;
}
- /* Something failed: remote service port may have changed */
- rpc_force_rebind(clnt);
-
switch (status) {
- case -ENOTCONN:
- case -EAGAIN:
- task->tk_action = call_bind;
- if (!RPC_IS_SOFT(task))
- return;
/* if soft mounted, test if we've timed out */
case -ETIMEDOUT:
task->tk_action = call_timeout;
- return;
+ break;
+ default:
+ rpc_exit(task, -EIO);
}
- rpc_exit(task, -EIO);
}
/*
@@ -1105,14 +1098,26 @@ static void
call_transmit_status(struct rpc_task *task)
{
task->tk_action = call_status;
- /*
- * Special case: if we've been waiting on the socket's write_space()
- * callback, then don't call xprt_end_transmit().
- */
- if (task->tk_status == -EAGAIN)
- return;
- xprt_end_transmit(task);
- rpc_task_force_reencode(task);
+ switch (task->tk_status) {
+ case -EAGAIN:
+ break;
+ default:
+ xprt_end_transmit(task);
+ /*
+ * Special cases: if we've been waiting on the
+ * socket's write_space() callback, or if the
+ * socket just returned a connection error,
+ * then hold onto the transport lock.
+ */
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ENOTCONN:
+ case -EHOSTDOWN:
+ case -EHOSTUNREACH:
+ case -ENETUNREACH:
+ case -EPIPE:
+ rpc_task_force_reencode(task);
+ }
}
/*
@@ -1152,9 +1157,12 @@ call_status(struct rpc_task *task)
xprt_conditional_disconnect(task->tk_xprt,
req->rq_connect_cookie);
break;
+ case -ECONNRESET:
case -ECONNREFUSED:
- case -ENOTCONN:
rpc_force_rebind(clnt);
+ rpc_delay(task, 3*HZ);
+ case -EPIPE:
+ case -ENOTCONN:
task->tk_action = call_bind;
break;
case -EAGAIN:
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index 03ae007641e..beee6da3303 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -63,9 +63,16 @@ enum {
* r_owner
*
* The "owner" is allowed to unset a service in the rpcbind database.
- * We always use the following (arbitrary) fixed string.
+ *
+ * For AF_LOCAL SET/UNSET requests, rpcbind treats this string as a
+ * UID which it maps to a local user name via a password lookup.
+ * In all other cases it is ignored.
+ *
+ * For SET/UNSET requests, user space provides a value, even for
+ * network requests, and GETADDR uses an empty string. We follow
+ * those precedents here.
*/
-#define RPCB_OWNER_STRING "rpcb"
+#define RPCB_OWNER_STRING "0"
#define RPCB_MAXOWNERLEN sizeof(RPCB_OWNER_STRING)
static void rpcb_getport_done(struct rpc_task *, void *);
@@ -124,12 +131,6 @@ static const struct sockaddr_in rpcb_inaddr_loopback = {
.sin_port = htons(RPCBIND_PORT),
};
-static const struct sockaddr_in6 rpcb_in6addr_loopback = {
- .sin6_family = AF_INET6,
- .sin6_addr = IN6ADDR_LOOPBACK_INIT,
- .sin6_port = htons(RPCBIND_PORT),
-};
-
static struct rpc_clnt *rpcb_create_local(struct sockaddr *addr,
size_t addrlen, u32 version)
{
@@ -176,9 +177,10 @@ static struct rpc_clnt *rpcb_create(char *hostname, struct sockaddr *srvaddr,
return rpc_create(&args);
}
-static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
- u32 version, struct rpc_message *msg)
+static int rpcb_register_call(const u32 version, struct rpc_message *msg)
{
+ struct sockaddr *addr = (struct sockaddr *)&rpcb_inaddr_loopback;
+ size_t addrlen = sizeof(rpcb_inaddr_loopback);
struct rpc_clnt *rpcb_clnt;
int result, error = 0;
@@ -192,7 +194,7 @@ static int rpcb_register_call(struct sockaddr *addr, size_t addrlen,
error = PTR_ERR(rpcb_clnt);
if (error < 0) {
- printk(KERN_WARNING "RPC: failed to contact local rpcbind "
+ dprintk("RPC: failed to contact local rpcbind "
"server (errno %d).\n", -error);
return error;
}
@@ -254,25 +256,23 @@ int rpcb_register(u32 prog, u32 vers, int prot, unsigned short port)
if (port)
msg.rpc_proc = &rpcb_procedures2[RPCBPROC_SET];
- return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
- sizeof(rpcb_inaddr_loopback),
- RPCBVERS_2, &msg);
+ return rpcb_register_call(RPCBVERS_2, &msg);
}
/*
* Fill in AF_INET family-specific arguments to register
*/
-static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
- struct rpc_message *msg)
+static int rpcb_register_inet4(const struct sockaddr *sap,
+ struct rpc_message *msg)
{
+ const struct sockaddr_in *sin = (const struct sockaddr_in *)sap;
struct rpcbind_args *map = msg->rpc_argp;
- unsigned short port = ntohs(address_to_register->sin_port);
+ unsigned short port = ntohs(sin->sin_port);
char buf[32];
/* Construct AF_INET universal address */
snprintf(buf, sizeof(buf), "%pI4.%u.%u",
- &address_to_register->sin_addr.s_addr,
- port >> 8, port & 0xff);
+ &sin->sin_addr.s_addr, port >> 8, port & 0xff);
map->r_addr = buf;
dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
@@ -284,29 +284,27 @@ static int rpcb_register_netid4(struct sockaddr_in *address_to_register,
if (port)
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
- return rpcb_register_call((struct sockaddr *)&rpcb_inaddr_loopback,
- sizeof(rpcb_inaddr_loopback),
- RPCBVERS_4, msg);
+ return rpcb_register_call(RPCBVERS_4, msg);
}
/*
* Fill in AF_INET6 family-specific arguments to register
*/
-static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
- struct rpc_message *msg)
+static int rpcb_register_inet6(const struct sockaddr *sap,
+ struct rpc_message *msg)
{
+ const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *)sap;
struct rpcbind_args *map = msg->rpc_argp;
- unsigned short port = ntohs(address_to_register->sin6_port);
+ unsigned short port = ntohs(sin6->sin6_port);
char buf[64];
/* Construct AF_INET6 universal address */
- if (ipv6_addr_any(&address_to_register->sin6_addr))
+ if (ipv6_addr_any(&sin6->sin6_addr))
snprintf(buf, sizeof(buf), "::.%u.%u",
port >> 8, port & 0xff);
else
snprintf(buf, sizeof(buf), "%pI6.%u.%u",
- &address_to_register->sin6_addr,
- port >> 8, port & 0xff);
+ &sin6->sin6_addr, port >> 8, port & 0xff);
map->r_addr = buf;
dprintk("RPC: %sregistering [%u, %u, %s, '%s'] with "
@@ -318,9 +316,21 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
if (port)
msg->rpc_proc = &rpcb_procedures4[RPCBPROC_SET];
- return rpcb_register_call((struct sockaddr *)&rpcb_in6addr_loopback,
- sizeof(rpcb_in6addr_loopback),
- RPCBVERS_4, msg);
+ return rpcb_register_call(RPCBVERS_4, msg);
+}
+
+static int rpcb_unregister_all_protofamilies(struct rpc_message *msg)
+{
+ struct rpcbind_args *map = msg->rpc_argp;
+
+ dprintk("RPC: unregistering [%u, %u, '%s'] with "
+ "local rpcbind\n",
+ map->r_prog, map->r_vers, map->r_netid);
+
+ map->r_addr = "";
+ msg->rpc_proc = &rpcb_procedures4[RPCBPROC_UNSET];
+
+ return rpcb_register_call(RPCBVERS_4, msg);
}
/**
@@ -340,10 +350,11 @@ static int rpcb_register_netid6(struct sockaddr_in6 *address_to_register,
* invoke this function once for each [program, version, address,
* netid] tuple they wish to advertise.
*
- * Callers may also unregister RPC services that are no longer
- * available by setting the port number in the passed-in address
- * to zero. Callers pass a netid of "" to unregister all
- * transport netids associated with [program, version, address].
+ * Callers may also unregister RPC services that are registered at a
+ * specific address by setting the port number in @address to zero.
+ * They may unregister all registered protocol families at once for
+ * a service by passing a NULL @address argument. If @netid is ""
+ * then all netids for [program, version, address] are unregistered.
*
* This function uses rpcbind protocol version 4 to contact the
* local rpcbind daemon. The local rpcbind daemon must support
@@ -378,13 +389,14 @@ int rpcb_v4_register(const u32 program, const u32 version,
.rpc_argp = &map,
};
+ if (address == NULL)
+ return rpcb_unregister_all_protofamilies(&msg);
+
switch (address->sa_family) {
case AF_INET:
- return rpcb_register_netid4((struct sockaddr_in *)address,
- &msg);
+ return rpcb_register_inet4(address, &msg);
case AF_INET6:
- return rpcb_register_netid6((struct sockaddr_in6 *)address,
- &msg);
+ return rpcb_register_inet6(address, &msg);
}
return -EAFNOSUPPORT;
@@ -579,7 +591,7 @@ void rpcb_getport_async(struct rpc_task *task)
map->r_xprt = xprt_get(xprt);
map->r_netid = rpc_peeraddr2str(clnt, RPC_DISPLAY_NETID);
map->r_addr = rpc_peeraddr2str(rpcb_clnt, RPC_DISPLAY_UNIVERSAL_ADDR);
- map->r_owner = RPCB_OWNER_STRING; /* ignored for GETADDR */
+ map->r_owner = "";
map->r_status = -EIO;
child = rpcb_call_async(rpcb_clnt, map, proc);
@@ -703,11 +715,16 @@ static int rpcb_decode_getaddr(struct rpc_rqst *req, __be32 *p,
*portp = 0;
addr_len = ntohl(*p++);
+ if (addr_len == 0) {
+ dprintk("RPC: rpcb_decode_getaddr: "
+ "service is not registered\n");
+ return 0;
+ }
+
/*
- * Simple sanity check. The smallest possible universal
- * address is an IPv4 address string containing 11 bytes.
+ * Simple sanity check.
*/
- if (addr_len < 11 || addr_len > RPCBIND_MAXUADDRLEN)
+ if (addr_len > RPCBIND_MAXUADDRLEN)
goto out_err;
/*
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index bb507e2bb94..9f2f2412a2f 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -359,7 +359,7 @@ svc_pool_for_cpu(struct svc_serv *serv, int cpu)
*/
static struct svc_serv *
__svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
- sa_family_t family, void (*shutdown)(struct svc_serv *serv))
+ void (*shutdown)(struct svc_serv *serv))
{
struct svc_serv *serv;
unsigned int vers;
@@ -368,7 +368,6 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
if (!(serv = kzalloc(sizeof(*serv), GFP_KERNEL)))
return NULL;
- serv->sv_family = family;
serv->sv_name = prog->pg_name;
serv->sv_program = prog;
serv->sv_nrthreads = 1;
@@ -427,21 +426,21 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
struct svc_serv *
svc_create(struct svc_program *prog, unsigned int bufsize,
- sa_family_t family, void (*shutdown)(struct svc_serv *serv))
+ void (*shutdown)(struct svc_serv *serv))
{
- return __svc_create(prog, bufsize, /*npools*/1, family, shutdown);
+ return __svc_create(prog, bufsize, /*npools*/1, shutdown);
}
EXPORT_SYMBOL_GPL(svc_create);
struct svc_serv *
svc_create_pooled(struct svc_program *prog, unsigned int bufsize,
- sa_family_t family, void (*shutdown)(struct svc_serv *serv),
+ void (*shutdown)(struct svc_serv *serv),
svc_thread_fn func, struct module *mod)
{
struct svc_serv *serv;
unsigned int npools = svc_pool_map_get();
- serv = __svc_create(prog, bufsize, npools, family, shutdown);
+ serv = __svc_create(prog, bufsize, npools, shutdown);
if (serv != NULL) {
serv->sv_function = func;
@@ -719,8 +718,6 @@ svc_exit_thread(struct svc_rqst *rqstp)
}
EXPORT_SYMBOL_GPL(svc_exit_thread);
-#ifdef CONFIG_SUNRPC_REGISTER_V4
-
/*
* Register an "inet" protocol family netid with the local
* rpcbind daemon via an rpcbind v4 SET request.
@@ -735,12 +732,13 @@ static int __svc_rpcb_register4(const u32 program, const u32 version,
const unsigned short protocol,
const unsigned short port)
{
- struct sockaddr_in sin = {
+ const struct sockaddr_in sin = {
.sin_family = AF_INET,
.sin_addr.s_addr = htonl(INADDR_ANY),
.sin_port = htons(port),
};
- char *netid;
+ const char *netid;
+ int error;
switch (protocol) {
case IPPROTO_UDP:
@@ -750,13 +748,23 @@ static int __svc_rpcb_register4(const u32 program, const u32 version,
netid = RPCBIND_NETID_TCP;
break;
default:
- return -EPROTONOSUPPORT;
+ return -ENOPROTOOPT;
}
- return rpcb_v4_register(program, version,
- (struct sockaddr *)&sin, netid);
+ error = rpcb_v4_register(program, version,
+ (const struct sockaddr *)&sin, netid);
+
+ /*
+ * User space didn't support rpcbind v4, so retry this
+ * registration request with the legacy rpcbind v2 protocol.
+ */
+ if (error == -EPROTONOSUPPORT)
+ error = rpcb_register(program, version, protocol, port);
+
+ return error;
}
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
/*
* Register an "inet6" protocol family netid with the local
* rpcbind daemon via an rpcbind v4 SET request.
@@ -771,12 +779,13 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
const unsigned short protocol,
const unsigned short port)
{
- struct sockaddr_in6 sin6 = {
+ const struct sockaddr_in6 sin6 = {
.sin6_family = AF_INET6,
.sin6_addr = IN6ADDR_ANY_INIT,
.sin6_port = htons(port),
};
- char *netid;
+ const char *netid;
+ int error;
switch (protocol) {
case IPPROTO_UDP:
@@ -786,12 +795,22 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
netid = RPCBIND_NETID_TCP6;
break;
default:
- return -EPROTONOSUPPORT;
+ return -ENOPROTOOPT;
}
- return rpcb_v4_register(program, version,
- (struct sockaddr *)&sin6, netid);
+ error = rpcb_v4_register(program, version,
+ (const struct sockaddr *)&sin6, netid);
+
+ /*
+ * User space didn't support rpcbind version 4, so we won't
+ * use a PF_INET6 listener.
+ */
+ if (error == -EPROTONOSUPPORT)
+ error = -EAFNOSUPPORT;
+
+ return error;
}
+#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
/*
* Register a kernel RPC service via rpcbind version 4.
@@ -799,69 +818,43 @@ static int __svc_rpcb_register6(const u32 program, const u32 version,
* Returns zero on success; a negative errno value is returned
* if any error occurs.
*/
-static int __svc_register(const u32 program, const u32 version,
- const sa_family_t family,
+static int __svc_register(const char *progname,
+ const u32 program, const u32 version,
+ const int family,
const unsigned short protocol,
const unsigned short port)
{
- int error;
+ int error = -EAFNOSUPPORT;
switch (family) {
- case AF_INET:
- return __svc_rpcb_register4(program, version,
+ case PF_INET:
+ error = __svc_rpcb_register4(program, version,
protocol, port);
- case AF_INET6:
+ break;
+#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+ case PF_INET6:
error = __svc_rpcb_register6(program, version,
protocol, port);
- if (error < 0)
- return error;
-
- /*
- * Work around bug in some versions of Linux rpcbind
- * which don't allow registration of both inet and
- * inet6 netids.
- *
- * Error return ignored for now.
- */
- __svc_rpcb_register4(program, version,
- protocol, port);
- return 0;
+#endif /* defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) */
}
- return -EAFNOSUPPORT;
-}
-
-#else /* CONFIG_SUNRPC_REGISTER_V4 */
-
-/*
- * Register a kernel RPC service via rpcbind version 2.
- *
- * Returns zero on success; a negative errno value is returned
- * if any error occurs.
- */
-static int __svc_register(const u32 program, const u32 version,
- sa_family_t family,
- const unsigned short protocol,
- const unsigned short port)
-{
- if (family != AF_INET)
- return -EAFNOSUPPORT;
-
- return rpcb_register(program, version, protocol, port);
+ if (error < 0)
+ printk(KERN_WARNING "svc: failed to register %sv%u RPC "
+ "service (errno %d).\n", progname, version, -error);
+ return error;
}
-#endif /* CONFIG_SUNRPC_REGISTER_V4 */
-
/**
* svc_register - register an RPC service with the local portmapper
* @serv: svc_serv struct for the service to register
+ * @family: protocol family of service's listener socket
* @proto: transport protocol number to advertise
* @port: port to advertise
*
- * Service is registered for any address in serv's address family
+ * Service is registered for any address in the passed-in protocol family
*/
-int svc_register(const struct svc_serv *serv, const unsigned short proto,
- const unsigned short port)
+int svc_register(const struct svc_serv *serv, const int family,
+ const unsigned short proto, const unsigned short port)
{
struct svc_program *progp;
unsigned int i;
@@ -879,15 +872,15 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto,
i,
proto == IPPROTO_UDP? "udp" : "tcp",
port,
- serv->sv_family,
+ family,
progp->pg_vers[i]->vs_hidden?
" (but not telling portmap)" : "");
if (progp->pg_vers[i]->vs_hidden)
continue;
- error = __svc_register(progp->pg_prog, i,
- serv->sv_family, proto, port);
+ error = __svc_register(progp->pg_name, progp->pg_prog,
+ i, family, proto, port);
if (error < 0)
break;
}
@@ -896,38 +889,31 @@ int svc_register(const struct svc_serv *serv, const unsigned short proto,
return error;
}
-#ifdef CONFIG_SUNRPC_REGISTER_V4
-
+/*
+ * If user space is running rpcbind, it should take the v4 UNSET
+ * and clear everything for this [program, version]. If user space
+ * is running portmap, it will reject the v4 UNSET, but won't have
+ * any "inet6" entries anyway. So a PMAP_UNSET should be sufficient
+ * in this case to clear all existing entries for [program, version].
+ */
static void __svc_unregister(const u32 program, const u32 version,
const char *progname)
{
- struct sockaddr_in6 sin6 = {
- .sin6_family = AF_INET6,
- .sin6_addr = IN6ADDR_ANY_INIT,
- .sin6_port = 0,
- };
int error;
- error = rpcb_v4_register(program, version,
- (struct sockaddr *)&sin6, "");
- dprintk("svc: %s(%sv%u), error %d\n",
- __func__, progname, version, error);
-}
-
-#else /* CONFIG_SUNRPC_REGISTER_V4 */
+ error = rpcb_v4_register(program, version, NULL, "");
-static void __svc_unregister(const u32 program, const u32 version,
- const char *progname)
-{
- int error;
+ /*
+ * User space didn't support rpcbind v4, so retry this
+ * request with the legacy rpcbind v2 protocol.
+ */
+ if (error == -EPROTONOSUPPORT)
+ error = rpcb_register(program, version, 0, 0);
- error = rpcb_register(program, version, 0, 0);
dprintk("svc: %s(%sv%u), error %d\n",
__func__, progname, version, error);
}
-#endif /* CONFIG_SUNRPC_REGISTER_V4 */
-
/*
* All netids, bind addresses and ports registered for [program, version]
* are removed from the local rpcbind database (if the service is not
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index e588df5d6b3..2819ee093f3 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -161,7 +161,9 @@ EXPORT_SYMBOL_GPL(svc_xprt_init);
static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
struct svc_serv *serv,
- unsigned short port, int flags)
+ const int family,
+ const unsigned short port,
+ int flags)
{
struct sockaddr_in sin = {
.sin_family = AF_INET,
@@ -176,12 +178,12 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
struct sockaddr *sap;
size_t len;
- switch (serv->sv_family) {
- case AF_INET:
+ switch (family) {
+ case PF_INET:
sap = (struct sockaddr *)&sin;
len = sizeof(sin);
break;
- case AF_INET6:
+ case PF_INET6:
sap = (struct sockaddr *)&sin6;
len = sizeof(sin6);
break;
@@ -192,7 +194,8 @@ static struct svc_xprt *__svc_xpo_create(struct svc_xprt_class *xcl,
return xcl->xcl_ops->xpo_create(serv, sap, len, flags);
}
-int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
+int svc_create_xprt(struct svc_serv *serv, const char *xprt_name,
+ const int family, const unsigned short port,
int flags)
{
struct svc_xprt_class *xcl;
@@ -209,7 +212,7 @@ int svc_create_xprt(struct svc_serv *serv, char *xprt_name, unsigned short port,
goto err;
spin_unlock(&svc_xprt_class_lock);
- newxprt = __svc_xpo_create(xcl, serv, port, flags);
+ newxprt = __svc_xpo_create(xcl, serv, family, port, flags);
if (IS_ERR(newxprt)) {
module_put(xcl->xcl_owner);
return PTR_ERR(newxprt);
@@ -1033,7 +1036,13 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
return dr;
}
-/*
+/**
+ * svc_find_xprt - find an RPC transport instance
+ * @serv: pointer to svc_serv to search
+ * @xcl_name: C string containing transport's class name
+ * @af: Address family of transport's local address
+ * @port: transport's IP port number
+ *
* Return the transport instance pointer for the endpoint accepting
* connections/peer traffic from the specified transport class,
* address family and port.
@@ -1042,14 +1051,14 @@ static struct svc_deferred_req *svc_deferred_dequeue(struct svc_xprt *xprt)
* wild-card, and will result in matching the first transport in the
* service's list that has a matching class name.
*/
-struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
- int af, int port)
+struct svc_xprt *svc_find_xprt(struct svc_serv *serv, const char *xcl_name,
+ const sa_family_t af, const unsigned short port)
{
struct svc_xprt *xprt;
struct svc_xprt *found = NULL;
/* Sanity check the args */
- if (!serv || !xcl_name)
+ if (serv == NULL || xcl_name == NULL)
return found;
spin_lock_bh(&serv->sv_lock);
@@ -1058,7 +1067,7 @@ struct svc_xprt *svc_find_xprt(struct svc_serv *serv, char *xcl_name,
continue;
if (af != AF_UNSPEC && af != xprt->xpt_local.ss_family)
continue;
- if (port && port != svc_xprt_local_port(xprt))
+ if (port != 0 && port != svc_xprt_local_port(xprt))
continue;
found = xprt;
svc_xprt_get(xprt);
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 5763e6460fe..9d504234af4 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -1110,7 +1110,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
struct svc_sock *svsk;
struct sock *inet;
int pmap_register = !(flags & SVC_SOCK_ANONYMOUS);
- int val;
dprintk("svc: svc_setup_socket %p\n", sock);
if (!(svsk = kzalloc(sizeof(*svsk), GFP_KERNEL))) {
@@ -1122,7 +1121,7 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
/* Register socket with portmapper */
if (*errp >= 0 && pmap_register)
- *errp = svc_register(serv, inet->sk_protocol,
+ *errp = svc_register(serv, inet->sk_family, inet->sk_protocol,
ntohs(inet_sk(inet)->sport));
if (*errp < 0) {
@@ -1143,18 +1142,6 @@ static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
else
svc_tcp_init(svsk, serv);
- /*
- * We start one listener per sv_serv. We want AF_INET
- * requests to be automatically shunted to our AF_INET6
- * listener using a mapped IPv4 address. Make sure
- * no-one starts an equivalent IPv4 listener, which
- * would steal our incoming connections.
- */
- val = 0;
- if (serv->sv_family == AF_INET6)
- kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
- (char *)&val, sizeof(val));
-
dprintk("svc: svc_setup_socket created %p (inet %p)\n",
svsk, svsk->sk_sk);
@@ -1222,6 +1209,8 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
struct sockaddr_storage addr;
struct sockaddr *newsin = (struct sockaddr *)&addr;
int newlen;
+ int family;
+ int val;
RPC_IFDEBUG(char buf[RPC_MAX_ADDRBUFLEN]);
dprintk("svc: svc_create_socket(%s, %d, %s)\n",
@@ -1233,14 +1222,35 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv,
"sockets supported\n");
return ERR_PTR(-EINVAL);
}
+
type = (protocol == IPPROTO_UDP)? SOCK_DGRAM : SOCK_STREAM;
+ switch (sin->sa_family) {
+ case AF_INET6:
+ family = PF_INET6;
+ break;
+ case AF_INET:
+ family = PF_INET;
+ break;
+ default:
+ return ERR_PTR(-EINVAL);
+ }
- error = sock_create_kern(sin->sa_family, type, protocol, &sock);
+ error = sock_create_kern(family, type, protocol, &sock);
if (error < 0)
return ERR_PTR(error);
svc_reclassify_socket(sock);
+ /*
+ * If this is an PF_INET6 listener, we want to avoid
+ * getting requests from IPv4 remotes. Those should
+ * be shunted to a PF_INET listener via rpcbind.
+ */
+ val = 1;
+ if (family == PF_INET6)
+ kernel_setsockopt(sock, SOL_IPV6, IPV6_V6ONLY,
+ (char *)&val, sizeof(val));
+
if (type == SOCK_STREAM)
sock->sk->sk_reuse = 1; /* allow address reuse */
error = kernel_bind(sock, sin, len);
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 62098d101a1..a0bfe53f162 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -152,6 +152,37 @@ out:
EXPORT_SYMBOL_GPL(xprt_unregister_transport);
/**
+ * xprt_load_transport - load a transport implementation
+ * @transport_name: transport to load
+ *
+ * Returns:
+ * 0: transport successfully loaded
+ * -ENOENT: transport module not available
+ */
+int xprt_load_transport(const char *transport_name)
+{
+ struct xprt_class *t;
+ char module_name[sizeof t->name + 5];
+ int result;
+
+ result = 0;
+ spin_lock(&xprt_list_lock);
+ list_for_each_entry(t, &xprt_list, list) {
+ if (strcmp(t->name, transport_name) == 0) {
+ spin_unlock(&xprt_list_lock);
+ goto out;
+ }
+ }
+ spin_unlock(&xprt_list_lock);
+ strcpy(module_name, "xprt");
+ strncat(module_name, transport_name, sizeof t->name);
+ result = request_module(module_name);
+out:
+ return result;
+}
+EXPORT_SYMBOL_GPL(xprt_load_transport);
+
+/**
* xprt_reserve_xprt - serialize write access to transports
* @task: task that is requesting access to the transport
*
@@ -580,7 +611,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt)
dprintk("RPC: disconnected transport %p\n", xprt);
spin_lock_bh(&xprt->transport_lock);
xprt_clear_connected(xprt);
- xprt_wake_pending_tasks(xprt, -ENOTCONN);
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
spin_unlock_bh(&xprt->transport_lock);
}
EXPORT_SYMBOL_GPL(xprt_disconnect_done);
@@ -598,7 +629,7 @@ void xprt_force_disconnect(struct rpc_xprt *xprt)
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
queue_work(rpciod_workqueue, &xprt->task_cleanup);
- xprt_wake_pending_tasks(xprt, -ENOTCONN);
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
spin_unlock_bh(&xprt->transport_lock);
}
@@ -625,7 +656,7 @@ void xprt_conditional_disconnect(struct rpc_xprt *xprt, unsigned int cookie)
/* Try to schedule an autoclose RPC call */
if (test_and_set_bit(XPRT_LOCKED, &xprt->state) == 0)
queue_work(rpciod_workqueue, &xprt->task_cleanup);
- xprt_wake_pending_tasks(xprt, -ENOTCONN);
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
out:
spin_unlock_bh(&xprt->transport_lock);
}
@@ -695,9 +726,8 @@ static void xprt_connect_status(struct rpc_task *task)
}
switch (task->tk_status) {
- case -ENOTCONN:
- dprintk("RPC: %5u xprt_connect_status: connection broken\n",
- task->tk_pid);
+ case -EAGAIN:
+ dprintk("RPC: %5u xprt_connect_status: retrying\n", task->tk_pid);
break;
case -ETIMEDOUT:
dprintk("RPC: %5u xprt_connect_status: connect attempt timed "
@@ -818,15 +848,8 @@ int xprt_prepare_transmit(struct rpc_task *task)
err = req->rq_received;
goto out_unlock;
}
- if (!xprt->ops->reserve_xprt(task)) {
+ if (!xprt->ops->reserve_xprt(task))
err = -EAGAIN;
- goto out_unlock;
- }
-
- if (!xprt_connected(xprt)) {
- err = -ENOTCONN;
- goto out_unlock;
- }
out_unlock:
spin_unlock_bh(&xprt->transport_lock);
return err;
@@ -870,32 +893,26 @@ void xprt_transmit(struct rpc_task *task)
req->rq_connect_cookie = xprt->connect_cookie;
req->rq_xtime = jiffies;
status = xprt->ops->send_request(task);
- if (status == 0) {
- dprintk("RPC: %5u xmit complete\n", task->tk_pid);
- spin_lock_bh(&xprt->transport_lock);
+ if (status != 0) {
+ task->tk_status = status;
+ return;
+ }
- xprt->ops->set_retrans_timeout(task);
+ dprintk("RPC: %5u xmit complete\n", task->tk_pid);
+ spin_lock_bh(&xprt->transport_lock);
- xprt->stat.sends++;
- xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
- xprt->stat.bklog_u += xprt->backlog.qlen;
+ xprt->ops->set_retrans_timeout(task);
- /* Don't race with disconnect */
- if (!xprt_connected(xprt))
- task->tk_status = -ENOTCONN;
- else if (!req->rq_received)
- rpc_sleep_on(&xprt->pending, task, xprt_timer);
- spin_unlock_bh(&xprt->transport_lock);
- return;
- }
+ xprt->stat.sends++;
+ xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs;
+ xprt->stat.bklog_u += xprt->backlog.qlen;
- /* Note: at this point, task->tk_sleeping has not yet been set,
- * hence there is no danger of the waking up task being put on
- * schedq, and being picked up by a parallel run of rpciod().
- */
- task->tk_status = status;
- if (status == -ECONNREFUSED)
- rpc_sleep_on(&xprt->sending, task, NULL);
+ /* Don't race with disconnect */
+ if (!xprt_connected(xprt))
+ task->tk_status = -ENOTCONN;
+ else if (!req->rq_received)
+ rpc_sleep_on(&xprt->pending, task, xprt_timer);
+ spin_unlock_bh(&xprt->transport_lock);
}
static inline void do_xprt_reserve(struct rpc_task *task)
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 14106d26bb9..e5e28d1946a 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -310,6 +310,19 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
__func__, pad, destp, rqst->rq_slen, curlen);
copy_len = rqst->rq_snd_buf.page_len;
+
+ if (rqst->rq_snd_buf.tail[0].iov_len) {
+ curlen = rqst->rq_snd_buf.tail[0].iov_len;
+ if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
+ memmove(destp + copy_len,
+ rqst->rq_snd_buf.tail[0].iov_base, curlen);
+ r_xprt->rx_stats.pullup_copy_count += curlen;
+ }
+ dprintk("RPC: %s: tail destp 0x%p len %d\n",
+ __func__, destp + copy_len, curlen);
+ rqst->rq_svec[0].iov_len += curlen;
+ }
+
r_xprt->rx_stats.pullup_copy_count += copy_len;
npages = PAGE_ALIGN(rqst->rq_snd_buf.page_base+copy_len) >> PAGE_SHIFT;
for (i = 0; copy_len && i < npages; i++) {
@@ -332,17 +345,6 @@ rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
destp += curlen;
copy_len -= curlen;
}
- if (rqst->rq_snd_buf.tail[0].iov_len) {
- curlen = rqst->rq_snd_buf.tail[0].iov_len;
- if (destp != rqst->rq_snd_buf.tail[0].iov_base) {
- memcpy(destp,
- rqst->rq_snd_buf.tail[0].iov_base, curlen);
- r_xprt->rx_stats.pullup_copy_count += curlen;
- }
- dprintk("RPC: %s: tail destp 0x%p len %d curlen %d\n",
- __func__, destp, copy_len, curlen);
- rqst->rq_svec[0].iov_len += curlen;
- }
/* header now contains entire send message */
return pad;
}
@@ -656,7 +658,7 @@ rpcrdma_inline_fixup(struct rpc_rqst *rqst, char *srcp, int copy_len, int pad)
if (curlen > rqst->rq_rcv_buf.tail[0].iov_len)
curlen = rqst->rq_rcv_buf.tail[0].iov_len;
if (rqst->rq_rcv_buf.tail[0].iov_base != srcp)
- memcpy(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
+ memmove(rqst->rq_rcv_buf.tail[0].iov_base, srcp, curlen);
dprintk("RPC: %s: tail srcp 0x%p len %d curlen %d\n",
__func__, srcp, copy_len, curlen);
rqst->rq_rcv_buf.tail[0].iov_len = curlen;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index a3334e3b73c..6c26a675435 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -191,7 +191,6 @@ static int map_xdr(struct svcxprt_rdma *xprt,
struct xdr_buf *xdr,
struct svc_rdma_req_map *vec)
{
- int sge_max = (xdr->len+PAGE_SIZE-1) / PAGE_SIZE + 3;
int sge_no;
u32 sge_bytes;
u32 page_bytes;
@@ -235,7 +234,11 @@ static int map_xdr(struct svcxprt_rdma *xprt,
sge_no++;
}
- BUG_ON(sge_no > sge_max);
+ dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
+ "page_base %u page_len %u head_len %zu tail_len %zu\n",
+ sge_no, page_no, xdr->page_base, xdr->page_len,
+ xdr->head[0].iov_len, xdr->tail[0].iov_len);
+
vec->count = sge_no;
return 0;
}
@@ -579,7 +582,6 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->sge[page_no+1].length = 0;
}
BUG_ON(sge_no > rdma->sc_max_sge);
- BUG_ON(sge_no > ctxt->count);
memset(&send_wr, 0, sizeof send_wr);
ctxt->wr_op = IB_WR_SEND;
send_wr.wr_id = (unsigned long)ctxt;
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 568330eebbf..d40ff50887a 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -49,6 +49,9 @@ unsigned int xprt_tcp_slot_table_entries = RPC_DEF_SLOT_TABLE;
unsigned int xprt_min_resvport = RPC_DEF_MIN_RESVPORT;
unsigned int xprt_max_resvport = RPC_DEF_MAX_RESVPORT;
+#define XS_TCP_LINGER_TO (15U * HZ)
+static unsigned int xs_tcp_fin_timeout __read_mostly = XS_TCP_LINGER_TO;
+
/*
* We can register our own files under /proc/sys/sunrpc by
* calling register_sysctl_table() again. The files in that
@@ -117,6 +120,14 @@ static ctl_table xs_tunables_table[] = {
.extra2 = &xprt_max_resvport_limit
},
{
+ .procname = "tcp_fin_timeout",
+ .data = &xs_tcp_fin_timeout,
+ .maxlen = sizeof(xs_tcp_fin_timeout),
+ .mode = 0644,
+ .proc_handler = &proc_dointvec_jiffies,
+ .strategy = sysctl_jiffies
+ },
+ {
.ctl_name = 0,
},
};
@@ -521,11 +532,12 @@ static void xs_nospace_callback(struct rpc_task *task)
* @task: task to put to sleep
*
*/
-static void xs_nospace(struct rpc_task *task)
+static int xs_nospace(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct rpc_xprt *xprt = req->rq_xprt;
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+ int ret = 0;
dprintk("RPC: %5u xmit incomplete (%u left of %u)\n",
task->tk_pid, req->rq_slen - req->rq_bytes_sent,
@@ -537,6 +549,7 @@ static void xs_nospace(struct rpc_task *task)
/* Don't race with disconnect */
if (xprt_connected(xprt)) {
if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
+ ret = -EAGAIN;
/*
* Notify TCP that we're limited by the application
* window size
@@ -548,10 +561,11 @@ static void xs_nospace(struct rpc_task *task)
}
} else {
clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
- task->tk_status = -ENOTCONN;
+ ret = -ENOTCONN;
}
spin_unlock_bh(&xprt->transport_lock);
+ return ret;
}
/**
@@ -594,6 +608,8 @@ static int xs_udp_send_request(struct rpc_task *task)
/* Still some bytes left; set up for a retry later. */
status = -EAGAIN;
}
+ if (!transport->sock)
+ goto out;
switch (status) {
case -ENOTSOCK:
@@ -601,21 +617,19 @@ static int xs_udp_send_request(struct rpc_task *task)
/* Should we call xs_close() here? */
break;
case -EAGAIN:
- xs_nospace(task);
+ status = xs_nospace(task);
break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
case -ENETUNREACH:
case -EPIPE:
case -ECONNREFUSED:
/* When the server has died, an ICMP port unreachable message
* prompts ECONNREFUSED. */
clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
- break;
- default:
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
- dprintk("RPC: sendmsg returned unrecognized error %d\n",
- -status);
}
-
+out:
return status;
}
@@ -697,6 +711,8 @@ static int xs_tcp_send_request(struct rpc_task *task)
status = -EAGAIN;
break;
}
+ if (!transport->sock)
+ goto out;
switch (status) {
case -ENOTSOCK:
@@ -704,23 +720,19 @@ static int xs_tcp_send_request(struct rpc_task *task)
/* Should we call xs_close() here? */
break;
case -EAGAIN:
- xs_nospace(task);
+ status = xs_nospace(task);
break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
case -ECONNRESET:
+ case -EPIPE:
xs_tcp_shutdown(xprt);
case -ECONNREFUSED:
case -ENOTCONN:
- case -EPIPE:
- status = -ENOTCONN;
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
- break;
- default:
- dprintk("RPC: sendmsg returned unrecognized error %d\n",
- -status);
clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
- xs_tcp_shutdown(xprt);
}
-
+out:
return status;
}
@@ -767,23 +779,13 @@ static void xs_restore_old_callbacks(struct sock_xprt *transport, struct sock *s
sk->sk_error_report = transport->old_error_report;
}
-/**
- * xs_close - close a socket
- * @xprt: transport
- *
- * This is used when all requests are complete; ie, no DRC state remains
- * on the server we want to save.
- */
-static void xs_close(struct rpc_xprt *xprt)
+static void xs_reset_transport(struct sock_xprt *transport)
{
- struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct socket *sock = transport->sock;
struct sock *sk = transport->inet;
- if (!sk)
- goto clear_close_wait;
-
- dprintk("RPC: xs_close xprt %p\n", xprt);
+ if (sk == NULL)
+ return;
write_lock_bh(&sk->sk_callback_lock);
transport->inet = NULL;
@@ -797,8 +799,25 @@ static void xs_close(struct rpc_xprt *xprt)
sk->sk_no_check = 0;
sock_release(sock);
-clear_close_wait:
+}
+
+/**
+ * xs_close - close a socket
+ * @xprt: transport
+ *
+ * This is used when all requests are complete; ie, no DRC state remains
+ * on the server we want to save.
+ */
+static void xs_close(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
+
+ dprintk("RPC: xs_close xprt %p\n", xprt);
+
+ xs_reset_transport(transport);
+
smp_mb__before_clear_bit();
+ clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
clear_bit(XPRT_CLOSING, &xprt->state);
smp_mb__after_clear_bit();
@@ -1126,6 +1145,47 @@ out:
read_unlock(&sk->sk_callback_lock);
}
+/*
+ * Do the equivalent of linger/linger2 handling for dealing with
+ * broken servers that don't close the socket in a timely
+ * fashion
+ */
+static void xs_tcp_schedule_linger_timeout(struct rpc_xprt *xprt,
+ unsigned long timeout)
+{
+ struct sock_xprt *transport;
+
+ if (xprt_test_and_set_connecting(xprt))
+ return;
+ set_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+ transport = container_of(xprt, struct sock_xprt, xprt);
+ queue_delayed_work(rpciod_workqueue, &transport->connect_worker,
+ timeout);
+}
+
+static void xs_tcp_cancel_linger_timeout(struct rpc_xprt *xprt)
+{
+ struct sock_xprt *transport;
+
+ transport = container_of(xprt, struct sock_xprt, xprt);
+
+ if (!test_bit(XPRT_CONNECTION_ABORT, &xprt->state) ||
+ !cancel_delayed_work(&transport->connect_worker))
+ return;
+ clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+ xprt_clear_connecting(xprt);
+}
+
+static void xs_sock_mark_closed(struct rpc_xprt *xprt)
+{
+ smp_mb__before_clear_bit();
+ clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
+ clear_bit(XPRT_CLOSING, &xprt->state);
+ smp_mb__after_clear_bit();
+ /* Mark transport as closed and wake up all pending tasks */
+ xprt_disconnect_done(xprt);
+}
+
/**
* xs_tcp_state_change - callback to handle TCP socket state changes
* @sk: socket whose state has changed
@@ -1158,7 +1218,7 @@ static void xs_tcp_state_change(struct sock *sk)
transport->tcp_flags =
TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID;
- xprt_wake_pending_tasks(xprt, 0);
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
}
spin_unlock_bh(&xprt->transport_lock);
break;
@@ -1171,10 +1231,10 @@ static void xs_tcp_state_change(struct sock *sk)
clear_bit(XPRT_CONNECTED, &xprt->state);
clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
smp_mb__after_clear_bit();
+ xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
break;
case TCP_CLOSE_WAIT:
/* The server initiated a shutdown of the socket */
- set_bit(XPRT_CLOSING, &xprt->state);
xprt_force_disconnect(xprt);
case TCP_SYN_SENT:
xprt->connect_cookie++;
@@ -1187,40 +1247,35 @@ static void xs_tcp_state_change(struct sock *sk)
xprt->reestablish_timeout = XS_TCP_INIT_REEST_TO;
break;
case TCP_LAST_ACK:
+ set_bit(XPRT_CLOSING, &xprt->state);
+ xs_tcp_schedule_linger_timeout(xprt, xs_tcp_fin_timeout);
smp_mb__before_clear_bit();
clear_bit(XPRT_CONNECTED, &xprt->state);
smp_mb__after_clear_bit();
break;
case TCP_CLOSE:
- smp_mb__before_clear_bit();
- clear_bit(XPRT_CLOSE_WAIT, &xprt->state);
- clear_bit(XPRT_CLOSING, &xprt->state);
- smp_mb__after_clear_bit();
- /* Mark transport as closed and wake up all pending tasks */
- xprt_disconnect_done(xprt);
+ xs_tcp_cancel_linger_timeout(xprt);
+ xs_sock_mark_closed(xprt);
}
out:
read_unlock(&sk->sk_callback_lock);
}
/**
- * xs_tcp_error_report - callback mainly for catching RST events
+ * xs_error_report - callback mainly for catching socket errors
* @sk: socket
*/
-static void xs_tcp_error_report(struct sock *sk)
+static void xs_error_report(struct sock *sk)
{
struct rpc_xprt *xprt;
read_lock(&sk->sk_callback_lock);
- if (sk->sk_err != ECONNRESET || sk->sk_state != TCP_ESTABLISHED)
- goto out;
if (!(xprt = xprt_from_sock(sk)))
goto out;
dprintk("RPC: %s client %p...\n"
"RPC: error %d\n",
__func__, xprt, sk->sk_err);
-
- xprt_force_disconnect(xprt);
+ xprt_wake_pending_tasks(xprt, -EAGAIN);
out:
read_unlock(&sk->sk_callback_lock);
}
@@ -1494,6 +1549,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_user_data = xprt;
sk->sk_data_ready = xs_udp_data_ready;
sk->sk_write_space = xs_udp_write_space;
+ sk->sk_error_report = xs_error_report;
sk->sk_no_check = UDP_CSUM_NORCV;
sk->sk_allocation = GFP_ATOMIC;
@@ -1526,9 +1582,10 @@ static void xs_udp_connect_worker4(struct work_struct *work)
goto out;
/* Start by resetting any existing state */
- xs_close(xprt);
+ xs_reset_transport(transport);
- if ((err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
+ err = sock_create_kern(PF_INET, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ if (err < 0) {
dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
goto out;
}
@@ -1545,8 +1602,8 @@ static void xs_udp_connect_worker4(struct work_struct *work)
xs_udp_finish_connecting(xprt, sock);
status = 0;
out:
- xprt_wake_pending_tasks(xprt, status);
xprt_clear_connecting(xprt);
+ xprt_wake_pending_tasks(xprt, status);
}
/**
@@ -1567,9 +1624,10 @@ static void xs_udp_connect_worker6(struct work_struct *work)
goto out;
/* Start by resetting any existing state */
- xs_close(xprt);
+ xs_reset_transport(transport);
- if ((err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock)) < 0) {
+ err = sock_create_kern(PF_INET6, SOCK_DGRAM, IPPROTO_UDP, &sock);
+ if (err < 0) {
dprintk("RPC: can't create UDP transport socket (%d).\n", -err);
goto out;
}
@@ -1586,18 +1644,17 @@ static void xs_udp_connect_worker6(struct work_struct *work)
xs_udp_finish_connecting(xprt, sock);
status = 0;
out:
- xprt_wake_pending_tasks(xprt, status);
xprt_clear_connecting(xprt);
+ xprt_wake_pending_tasks(xprt, status);
}
/*
* We need to preserve the port number so the reply cache on the server can
* find our cached RPC replies when we get around to reconnecting.
*/
-static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
+static void xs_abort_connection(struct rpc_xprt *xprt, struct sock_xprt *transport)
{
int result;
- struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
struct sockaddr any;
dprintk("RPC: disconnecting xprt %p to reuse port\n", xprt);
@@ -1609,11 +1666,24 @@ static void xs_tcp_reuse_connection(struct rpc_xprt *xprt)
memset(&any, 0, sizeof(any));
any.sa_family = AF_UNSPEC;
result = kernel_connect(transport->sock, &any, sizeof(any), 0);
- if (result)
+ if (!result)
+ xs_sock_mark_closed(xprt);
+ else
dprintk("RPC: AF_UNSPEC connect return code %d\n",
result);
}
+static void xs_tcp_reuse_connection(struct rpc_xprt *xprt, struct sock_xprt *transport)
+{
+ unsigned int state = transport->inet->sk_state;
+
+ if (state == TCP_CLOSE && transport->sock->state == SS_UNCONNECTED)
+ return;
+ if ((1 << state) & (TCPF_ESTABLISHED|TCPF_SYN_SENT))
+ return;
+ xs_abort_connection(xprt, transport);
+}
+
static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
{
struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt);
@@ -1629,7 +1699,7 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
sk->sk_data_ready = xs_tcp_data_ready;
sk->sk_state_change = xs_tcp_state_change;
sk->sk_write_space = xs_tcp_write_space;
- sk->sk_error_report = xs_tcp_error_report;
+ sk->sk_error_report = xs_error_report;
sk->sk_allocation = GFP_ATOMIC;
/* socket options */
@@ -1657,37 +1727,42 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
}
/**
- * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
- * @work: RPC transport to connect
+ * xs_tcp_setup_socket - create a TCP socket and connect to a remote endpoint
+ * @xprt: RPC transport to connect
+ * @transport: socket transport to connect
+ * @create_sock: function to create a socket of the correct type
*
* Invoked by a work queue tasklet.
*/
-static void xs_tcp_connect_worker4(struct work_struct *work)
+static void xs_tcp_setup_socket(struct rpc_xprt *xprt,
+ struct sock_xprt *transport,
+ struct socket *(*create_sock)(struct rpc_xprt *,
+ struct sock_xprt *))
{
- struct sock_xprt *transport =
- container_of(work, struct sock_xprt, connect_worker.work);
- struct rpc_xprt *xprt = &transport->xprt;
struct socket *sock = transport->sock;
- int err, status = -EIO;
+ int status = -EIO;
if (xprt->shutdown)
goto out;
if (!sock) {
- /* start from scratch */
- if ((err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
- dprintk("RPC: can't create TCP transport socket (%d).\n", -err);
+ clear_bit(XPRT_CONNECTION_ABORT, &xprt->state);
+ sock = create_sock(xprt, transport);
+ if (IS_ERR(sock)) {
+ status = PTR_ERR(sock);
goto out;
}
- xs_reclassify_socket4(sock);
+ } else {
+ int abort_and_exit;
- if (xs_bind4(transport, sock) < 0) {
- sock_release(sock);
- goto out;
- }
- } else
+ abort_and_exit = test_and_clear_bit(XPRT_CONNECTION_ABORT,
+ &xprt->state);
/* "close" the socket, preserving the local port */
- xs_tcp_reuse_connection(xprt);
+ xs_tcp_reuse_connection(xprt, transport);
+
+ if (abort_and_exit)
+ goto out_eagain;
+ }
dprintk("RPC: worker connecting xprt %p to address: %s\n",
xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
@@ -1696,83 +1771,104 @@ static void xs_tcp_connect_worker4(struct work_struct *work)
dprintk("RPC: %p connect status %d connected %d sock state %d\n",
xprt, -status, xprt_connected(xprt),
sock->sk->sk_state);
- if (status < 0) {
- switch (status) {
- case -EINPROGRESS:
- case -EALREADY:
- goto out_clear;
- case -ECONNREFUSED:
- case -ECONNRESET:
- /* retry with existing socket, after a delay */
- break;
- default:
- /* get rid of existing socket, and retry */
- xs_tcp_shutdown(xprt);
- }
+ switch (status) {
+ case -ECONNREFUSED:
+ case -ECONNRESET:
+ case -ENETUNREACH:
+ /* retry with existing socket, after a delay */
+ case 0:
+ case -EINPROGRESS:
+ case -EALREADY:
+ xprt_clear_connecting(xprt);
+ return;
}
+ /* get rid of existing socket, and retry */
+ xs_tcp_shutdown(xprt);
+ printk("%s: connect returned unhandled error %d\n",
+ __func__, status);
+out_eagain:
+ status = -EAGAIN;
out:
- xprt_wake_pending_tasks(xprt, status);
-out_clear:
xprt_clear_connecting(xprt);
+ xprt_wake_pending_tasks(xprt, status);
+}
+
+static struct socket *xs_create_tcp_sock4(struct rpc_xprt *xprt,
+ struct sock_xprt *transport)
+{
+ struct socket *sock;
+ int err;
+
+ /* start from scratch */
+ err = sock_create_kern(PF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (err < 0) {
+ dprintk("RPC: can't create TCP transport socket (%d).\n",
+ -err);
+ goto out_err;
+ }
+ xs_reclassify_socket4(sock);
+
+ if (xs_bind4(transport, sock) < 0) {
+ sock_release(sock);
+ goto out_err;
+ }
+ return sock;
+out_err:
+ return ERR_PTR(-EIO);
}
/**
- * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
+ * xs_tcp_connect_worker4 - connect a TCP socket to a remote endpoint
* @work: RPC transport to connect
*
* Invoked by a work queue tasklet.
*/
-static void xs_tcp_connect_worker6(struct work_struct *work)
+static void xs_tcp_connect_worker4(struct work_struct *work)
{
struct sock_xprt *transport =
container_of(work, struct sock_xprt, connect_worker.work);
struct rpc_xprt *xprt = &transport->xprt;
- struct socket *sock = transport->sock;
- int err, status = -EIO;
- if (xprt->shutdown)
- goto out;
+ xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock4);
+}
- if (!sock) {
- /* start from scratch */
- if ((err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock)) < 0) {
- dprintk("RPC: can't create TCP transport socket (%d).\n", -err);
- goto out;
- }
- xs_reclassify_socket6(sock);
+static struct socket *xs_create_tcp_sock6(struct rpc_xprt *xprt,
+ struct sock_xprt *transport)
+{
+ struct socket *sock;
+ int err;
+
+ /* start from scratch */
+ err = sock_create_kern(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (err < 0) {
+ dprintk("RPC: can't create TCP transport socket (%d).\n",
+ -err);
+ goto out_err;
+ }
+ xs_reclassify_socket6(sock);
- if (xs_bind6(transport, sock) < 0) {
- sock_release(sock);
- goto out;
- }
- } else
- /* "close" the socket, preserving the local port */
- xs_tcp_reuse_connection(xprt);
+ if (xs_bind6(transport, sock) < 0) {
+ sock_release(sock);
+ goto out_err;
+ }
+ return sock;
+out_err:
+ return ERR_PTR(-EIO);
+}
- dprintk("RPC: worker connecting xprt %p to address: %s\n",
- xprt, xprt->address_strings[RPC_DISPLAY_ALL]);
+/**
+ * xs_tcp_connect_worker6 - connect a TCP socket to a remote endpoint
+ * @work: RPC transport to connect
+ *
+ * Invoked by a work queue tasklet.
+ */
+static void xs_tcp_connect_worker6(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, connect_worker.work);
+ struct rpc_xprt *xprt = &transport->xprt;
- status = xs_tcp_finish_connecting(xprt, sock);
- dprintk("RPC: %p connect status %d connected %d sock state %d\n",
- xprt, -status, xprt_connected(xprt), sock->sk->sk_state);
- if (status < 0) {
- switch (status) {
- case -EINPROGRESS:
- case -EALREADY:
- goto out_clear;
- case -ECONNREFUSED:
- case -ECONNRESET:
- /* retry with existing socket, after a delay */
- break;
- default:
- /* get rid of existing socket, and retry */
- xs_tcp_shutdown(xprt);
- }
- }
-out:
- xprt_wake_pending_tasks(xprt, status);
-out_clear:
- xprt_clear_connecting(xprt);
+ xs_tcp_setup_socket(xprt, transport, xs_create_tcp_sock6);
}
/**
@@ -1817,9 +1913,6 @@ static void xs_tcp_connect(struct rpc_task *task)
{
struct rpc_xprt *xprt = task->tk_xprt;
- /* Initiate graceful shutdown of the socket if not already done */
- if (test_bit(XPRT_CONNECTED, &xprt->state))
- xs_tcp_shutdown(xprt);
/* Exit if we need to wait for socket shutdown to complete */
if (test_bit(XPRT_CLOSING, &xprt->state))
return;