From 0e2bedaa394f74fa9f75ee937488c33d90039b5a Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 30 Jan 2009 21:24:41 +0000 Subject: [CIFS] ipv6_addr_equal for address comparison Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/connect.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2209be94305..005df85219a 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -35,6 +34,7 @@ #include #include #include +#include #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -1379,8 +1379,8 @@ cifs_find_tcp_session(struct sockaddr_storage *addr) server->addr.sockAddr.sin_addr.s_addr)) continue; else if (addr->ss_family == AF_INET6 && - memcmp(&server->addr.sockAddr6.sin6_addr, - &addr6->sin6_addr, sizeof(addr6->sin6_addr))) + !ipv6_addr_equal(&server->addr.sockAddr6.sin6_addr, + &addr6->sin6_addr)) continue; ++server->srv_count; -- cgit v1.2.3 From 0bf2f3aec5474da80a60e1baca629af87ecb67b6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Fri, 6 Feb 2009 11:45:46 +0000 Subject: CRED: Fix SUID exec regression The patch: commit a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d CRED: Make execve() take advantage of copy-on-write credentials moved the place in which the 'safeness' of a SUID/SGID exec was performed to before de_thread() was called. This means that LSM_UNSAFE_SHARE is now calculated incorrectly. This flag is set if any of the usage counts for fs_struct, files_struct and sighand_struct are greater than 1 at the time the determination is made. All of which are true for threads created by the pthread library. However, since we wish to make the security calculation before irrevocably damaging the process so that we can return it an error code in the case where we decide we want to reject the exec request on this basis, we have to make the determination before calling de_thread(). So, instead, we count up the number of threads (CLONE_THREAD) that are sharing our fs_struct (CLONE_FS), files_struct (CLONE_FILES) and sighand_structs (CLONE_SIGHAND/CLONE_THREAD) with us. These will be killed by de_thread() and so can be discounted by check_unsafe_exec(). We do have to be careful because CLONE_THREAD does not imply FS or FILES. We _assume_ that there will be no extra references to these structs held by the threads we're going to kill. This can be tested with the attached pair of programs. Build the two programs using the Makefile supplied, and run ./test1 as a non-root user. If successful, you should see something like: [dhowells@andromeda tmp]$ ./test1 --TEST1-- uid=4043, euid=4043 suid=4043 exec ./test2 --TEST2-- uid=4043, euid=0 suid=0 SUCCESS - Correct effective user ID and if unsuccessful, something like: [dhowells@andromeda tmp]$ ./test1 --TEST1-- uid=4043, euid=4043 suid=4043 exec ./test2 --TEST2-- uid=4043, euid=4043 suid=4043 ERROR - Incorrect effective user ID! The non-root user ID you see will depend on the user you run as. [test1.c] #include #include #include #include static void *thread_func(void *arg) { while (1) {} } int main(int argc, char **argv) { pthread_t tid; uid_t uid, euid, suid; printf("--TEST1--\n"); getresuid(&uid, &euid, &suid); printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid); if (pthread_create(&tid, NULL, thread_func, NULL) < 0) { perror("pthread_create"); exit(1); } printf("exec ./test2\n"); execlp("./test2", "test2", NULL); perror("./test2"); _exit(1); } [test2.c] #include #include #include int main(int argc, char **argv) { uid_t uid, euid, suid; getresuid(&uid, &euid, &suid); printf("--TEST2--\n"); printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid); if (euid != 0) { fprintf(stderr, "ERROR - Incorrect effective user ID!\n"); exit(1); } printf("SUCCESS - Correct effective user ID\n"); exit(0); } [Makefile] CFLAGS = -D_GNU_SOURCE -Wall -Werror -Wunused all: test1 test2 test1: test1.c gcc $(CFLAGS) -o test1 test1.c -lpthread test2: test2.c gcc $(CFLAGS) -o test2 test2.c sudo chown root.root test2 sudo chmod +s test2 Reported-by: David Smith Signed-off-by: David Howells Acked-by: David Smith Signed-off-by: James Morris --- fs/compat.c | 2 +- fs/exec.c | 28 ++++++++++++++++++++++------ fs/internal.h | 2 +- 3 files changed, 24 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/compat.c b/fs/compat.c index 65a070e705a..d0145ca2757 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -1407,7 +1407,7 @@ int compat_do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + check_unsafe_exec(bprm, current->files); file = open_exec(filename); retval = PTR_ERR(file); diff --git a/fs/exec.c b/fs/exec.c index 0dd60a01f1b..929b58004b7 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1049,16 +1049,32 @@ EXPORT_SYMBOL(install_exec_creds); * - the caller must hold current->cred_exec_mutex to protect against * PTRACE_ATTACH */ -void check_unsafe_exec(struct linux_binprm *bprm) +void check_unsafe_exec(struct linux_binprm *bprm, struct files_struct *files) { - struct task_struct *p = current; + struct task_struct *p = current, *t; + unsigned long flags; + unsigned n_fs, n_files, n_sighand; bprm->unsafe = tracehook_unsafe_exec(p); - if (atomic_read(&p->fs->count) > 1 || - atomic_read(&p->files->count) > 1 || - atomic_read(&p->sighand->count) > 1) + n_fs = 1; + n_files = 1; + n_sighand = 1; + lock_task_sighand(p, &flags); + for (t = next_thread(p); t != p; t = next_thread(t)) { + if (t->fs == p->fs) + n_fs++; + if (t->files == files) + n_files++; + n_sighand++; + } + + if (atomic_read(&p->fs->count) > n_fs || + atomic_read(&p->files->count) > n_files || + atomic_read(&p->sighand->count) > n_sighand) bprm->unsafe |= LSM_UNSAFE_SHARE; + + unlock_task_sighand(p, &flags); } /* @@ -1273,7 +1289,7 @@ int do_execve(char * filename, bprm->cred = prepare_exec_creds(); if (!bprm->cred) goto out_unlock; - check_unsafe_exec(bprm); + check_unsafe_exec(bprm, displaced); file = open_exec(filename); retval = PTR_ERR(file); diff --git a/fs/internal.h b/fs/internal.h index 53af885f173..0d8ac497b3d 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -43,7 +43,7 @@ extern void __init chrdev_init(void); /* * exec.c */ -extern void check_unsafe_exec(struct linux_binprm *); +extern void check_unsafe_exec(struct linux_binprm *, struct files_struct *); /* * namespace.c -- cgit v1.2.3 From 766ccb9ed406c230d13c145def08ebea1b932982 Mon Sep 17 00:00:00 2001 From: Cornelia Huck Date: Tue, 20 Jan 2009 15:31:31 +0100 Subject: async: Rename _special -> _domain for clarity. Rename the async_*_special() functions to async_*_domain(), which describes the purpose of these functions much better. [Broke up long lines to silence checkpatch] Signed-off-by: Cornelia Huck Signed-off-by: Arjan van de Ven --- fs/super.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 645e5403f2a..61dce001dd5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -301,7 +301,7 @@ void generic_shutdown_super(struct super_block *sb) /* * wait for asynchronous fs operations to finish before going further */ - async_synchronize_full_special(&sb->s_async_list); + async_synchronize_full_domain(&sb->s_async_list); /* bad name - it should be evict_inodes() */ invalidate_inodes(sb); @@ -470,7 +470,7 @@ restart: sb->s_count++; spin_unlock(&sb_lock); down_read(&sb->s_umount); - async_synchronize_full_special(&sb->s_async_list); + async_synchronize_full_domain(&sb->s_async_list); if (sb->s_root && (wait || sb->s_dirt)) sb->s_op->sync_fs(sb, wait); up_read(&sb->s_umount); -- cgit v1.2.3 From 9d9b87c1218be78ddecbc85ec3bb91c79c1d56ab Mon Sep 17 00:00:00 2001 From: "J. Bruce Fields" Date: Wed, 4 Feb 2009 17:35:38 -0500 Subject: lockd: fix regression in lockd's handling of blocked locks If a client requests a blocking lock, is denied, then requests it again, then here in nlmsvc_lock() we will call vfs_lock_file() without FL_SLEEP set, because we've already queued a block and don't need the locks code to do it again. But that means vfs_lock_file() will return -EAGAIN instead of FILE_LOCK_DENIED. So we still need to translate that -EAGAIN return into a nlm_lck_blocked error in this case, and put ourselves back on lockd's block list. The bug was introduced by bde74e4bc64415b1 "locks: add special return value for asynchronous locks". Thanks to Frank van Maarseveen for the report; his original test case was essentially for i in `seq 30`; do flock /nfsmount/foo sleep 10 & done Tested-by: Frank van Maarseveen Reported-by: Frank van Maarseveen Cc: Miklos Szeredi Signed-off-by: J. Bruce Fields --- fs/lockd/svclock.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index 6063a8e4b9f..763b78a6e9d 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -427,7 +427,7 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; case -EAGAIN: ret = nlm_lck_denied; - goto out; + break; case FILE_LOCK_DEFERRED: if (wait) break; @@ -443,6 +443,10 @@ nlmsvc_lock(struct svc_rqst *rqstp, struct nlm_file *file, goto out; } + ret = nlm_lck_denied; + if (!wait) + goto out; + ret = nlm_lck_blocked; /* Append to list of blocked */ -- cgit v1.2.3 From 284b066af41579f62649048fdec5c5e7091703e6 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 9 Feb 2009 16:22:03 -0500 Subject: Btrfs: don't use spin_is_contended Btrfs was using spin_is_contended to see if it should drop locks before doing extent allocations during btrfs_search_slot. The idea was to avoid expensive searches in the tree unless the lock was actually contended. But, spin_is_contended is specific to the ticket spinlocks on x86, so this is causing compile errors everywhere else. In practice, the contention could easily appear some time after we started doing the extent allocation, and it makes more sense to always drop the lock instead. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 3 +-- fs/btrfs/locking.c | 22 ---------------------- fs/btrfs/locking.h | 2 -- 3 files changed, 1 insertion(+), 26 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 551177c0011..35443cc4b9a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1530,8 +1530,7 @@ again: * for higher level blocks, try not to allocate blocks * with the block and the parent locks held. */ - if (level > 0 && !prealloc_block.objectid && - btrfs_path_lock_waiting(p, level)) { + if (level > 0 && !prealloc_block.objectid) { u32 size = b->len; u64 hint = b->start; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 68fd9ccf180..9ebe9385129 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -236,25 +236,3 @@ int btrfs_tree_locked(struct extent_buffer *eb) return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || spin_is_locked(&eb->lock); } - -/* - * btrfs_search_slot uses this to decide if it should drop its locks - * before doing something expensive like allocating free blocks for cow. - */ -int btrfs_path_lock_waiting(struct btrfs_path *path, int level) -{ - int i; - struct extent_buffer *eb; - - for (i = level; i <= level + 1 && i < BTRFS_MAX_LEVEL; i++) { - eb = path->nodes[i]; - if (!eb) - break; - smp_mb(); - if (spin_is_contended(&eb->lock) || - waitqueue_active(&eb->lock_wq)) - return 1; - } - return 0; -} - diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index d92e707f587..6bb0afbff92 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -26,8 +26,6 @@ int btrfs_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_lock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); -int btrfs_path_lock_waiting(struct btrfs_path *path, int level); - void btrfs_set_lock_blocking(struct extent_buffer *eb); void btrfs_clear_lock_blocking(struct extent_buffer *eb); #endif -- cgit v1.2.3 From c88ccea3143975294f5a52097546bcbb75975f52 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2009 11:27:46 -0500 Subject: jbd2: Fix return value of jbd2_journal_start_commit() The function jbd2_journal_start_commit() returns 1 if either a transaction is committing or the function has queued a transaction commit. But it returns 0 if we raced with somebody queueing the transaction commit as well. This resulted in ext4_sync_fs() not functioning correctly (description from Arthur Jones): In the case of a data=ordered umount with pending long symlinks which are delayed due to a long list of other I/O on the backing block device, this causes the buffer associated with the long symlinks to not be moved to the inode dirty list in the second phase of fsync_super. Then, before they can be dirtied again, kjournald exits, seeing the UMOUNT flag and the dirty pages are never written to the backing block device, causing long symlink corruption and exposing new or previously freed block data to userspace. This can be reproduced with a script created by Eric Sandeen : #!/bin/bash umount /mnt/test2 mount /dev/sdb4 /mnt/test2 rm -f /mnt/test2/* dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512 touch /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename ln -s /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename /mnt/test2/link umount /mnt/test2 mount /dev/sdb4 /mnt/test2 ls /mnt/test2/ This patch fixes jbd2_journal_start_commit() to always return 1 when there's a transaction committing or queued for commit. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" CC: Eric Sandeen CC: linux-ext4@vger.kernel.org --- fs/jbd2/journal.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index eb343008ede..58144102bf2 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -450,7 +450,7 @@ int __jbd2_log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. Returns true if a transaction commit was started. */ int __jbd2_log_start_commit(journal_t *journal, tid_t target) { @@ -518,7 +518,8 @@ int jbd2_journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). Returns true - * if a transaction was started, and fills its tid in at *ptid + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid */ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) { @@ -528,15 +529,19 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; - ret = __jbd2_log_start_commit(journal, tid); - if (ret && ptid) + __jbd2_log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) *ptid = tid; - } else if (journal->j_committing_transaction && ptid) { + ret = 1; + } else if (journal->j_committing_transaction) { /* * If ext3_write_super() recently started a commit, then we * have to wait for completion of that transaction */ - *ptid = journal->j_committing_transaction->t_tid; + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; ret = 1; } spin_unlock(&journal->j_state_lock); -- cgit v1.2.3 From 9eddacf9e9c03578ef2c07c9534423e823d677f8 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2009 06:46:05 -0500 Subject: Revert "ext4: wait on all pending commits in ext4_sync_fs()" This undoes commit 14ce0cb411c88681ab8f3a4c9caa7f42e97a3184. Since jbd2_journal_start_commit() is now fixed to return 1 when we started a transaction commit, there's some transaction waiting to be committed or there's a transaction already committing, we don't need to call ext4_force_commit() in ext4_sync_fs(). Furthermore ext4_force_commit() can unnecessarily create sync transaction which is expensive so it's worthwhile to remove it when we can. http://bugzilla.kernel.org/show_bug.cgi?id=12224 Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Cc: Eric Sandeen Cc: linux-ext4@vger.kernel.org --- fs/ext4/super.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index e5f06a5f045..a5732c58f67 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3046,14 +3046,17 @@ static void ext4_write_super(struct super_block *sb) static int ext4_sync_fs(struct super_block *sb, int wait) { int ret = 0; + tid_t target; trace_mark(ext4_sync_fs, "dev %s wait %d", sb->s_id, wait); sb->s_dirt = 0; if (EXT4_SB(sb)->s_journal) { - if (wait) - ret = ext4_force_commit(sb); - else - jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, NULL); + if (jbd2_journal_start_commit(EXT4_SB(sb)->s_journal, + &target)) { + if (wait) + jbd2_log_wait_commit(EXT4_SB(sb)->s_journal, + target); + } } else { ext4_commit_super(sb, EXT4_SB(sb)->s_es, wait); } -- cgit v1.2.3 From 7f5aa215088b817add9c71914b83650bdd49f8a9 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 10 Feb 2009 11:15:34 -0500 Subject: jbd2: Avoid possible NULL dereference in jbd2_journal_begin_ordered_truncate() If we race with commit code setting i_transaction to NULL, we could possibly dereference it. Proper locking requires the journal pointer (to access journal->j_list_lock), which we don't have. So we have to change the prototype of the function so that filesystem passes us the journal pointer. Also add a more detailed comment about why the function jbd2_journal_begin_ordered_truncate() does what it does and how it should be used. Thanks to Dan Carpenter for pointing to the suspitious code. Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" Acked-by: Joel Becker CC: linux-ext4@vger.kernel.org CC: ocfs2-devel@oss.oracle.com CC: mfasheh@suse.de CC: Dan Carpenter --- fs/ext4/inode.c | 6 ++++-- fs/jbd2/transaction.c | 42 +++++++++++++++++++++++++++++++----------- fs/ocfs2/journal.h | 6 ++++-- 3 files changed, 39 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 03ba20be132..658c4a7f257 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -47,8 +47,10 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&EXT4_I(inode)->jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + EXT4_SB(inode->i_sb)->s_journal, + &EXT4_I(inode)->jinode, + new_size); } static void ext4_invalidatepage(struct page *page, unsigned long offset); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 46b4e347ed7..28ce21d8598 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -2129,26 +2129,46 @@ done: } /* - * This function must be called when inode is journaled in ordered mode - * before truncation happens. It starts writeout of truncated part in - * case it is in the committing transaction so that we stand to ordered - * mode consistency guarantees. + * File truncate and transaction commit interact with each other in a + * non-trivial way. If a transaction writing data block A is + * committing, we cannot discard the data by truncate until we have + * written them. Otherwise if we crashed after the transaction with + * write has committed but before the transaction with truncate has + * committed, we could see stale data in block A. This function is a + * helper to solve this problem. It starts writeout of the truncated + * part in case it is in the committing transaction. + * + * Filesystem code must call this function when inode is journaled in + * ordered mode before truncation happens and after the inode has been + * placed on orphan list with the new inode size. The second condition + * avoids the race that someone writes new data and we start + * committing the transaction after this function has been called but + * before a transaction for truncate is started (and furthermore it + * allows us to optimize the case where the addition to orphan list + * happens in the same transaction as write --- we don't have to write + * any data in such case). */ -int jbd2_journal_begin_ordered_truncate(struct jbd2_inode *inode, +int jbd2_journal_begin_ordered_truncate(journal_t *journal, + struct jbd2_inode *jinode, loff_t new_size) { - journal_t *journal; - transaction_t *commit_trans; + transaction_t *inode_trans, *commit_trans; int ret = 0; - if (!inode->i_transaction && !inode->i_next_transaction) + /* This is a quick check to avoid locking if not necessary */ + if (!jinode->i_transaction) goto out; - journal = inode->i_transaction->t_journal; + /* Locks are here just to force reading of recent values, it is + * enough that the transaction was not committing before we started + * a transaction adding the inode to orphan list */ spin_lock(&journal->j_state_lock); commit_trans = journal->j_committing_transaction; spin_unlock(&journal->j_state_lock); - if (inode->i_transaction == commit_trans) { - ret = filemap_fdatawrite_range(inode->i_vfs_inode->i_mapping, + spin_lock(&journal->j_list_lock); + inode_trans = jinode->i_transaction; + spin_unlock(&journal->j_list_lock); + if (inode_trans == commit_trans) { + ret = filemap_fdatawrite_range(jinode->i_vfs_inode->i_mapping, new_size, LLONG_MAX); if (ret) jbd2_journal_abort(journal, ret); diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index 3c3532e1307..172850a9a12 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -513,8 +513,10 @@ static inline int ocfs2_jbd2_file_inode(handle_t *handle, struct inode *inode) static inline int ocfs2_begin_ordered_truncate(struct inode *inode, loff_t new_size) { - return jbd2_journal_begin_ordered_truncate(&OCFS2_I(inode)->ip_jinode, - new_size); + return jbd2_journal_begin_ordered_truncate( + OCFS2_SB(inode->i_sb)->journal->j_journal, + &OCFS2_I(inode)->ip_jinode, + new_size); } #endif /* OCFS2_JOURNAL_H */ -- cgit v1.2.3 From 7be2baaa0322c59ba888aa5260a8c130666acd41 Mon Sep 17 00:00:00 2001 From: Wei Yongjun Date: Tue, 10 Feb 2009 09:53:42 -0500 Subject: ext4: Fix to read empty directory blocks correctly in 64k The rec_len field in the directory entry is 16 bits, so there was a problem representing rec_len for filesystems with a 64k block size in the case where the directory entry takes the entire 64k block. Unfortunately, there were two schemes that were proposed; one where all zeros meant 65536 and one where all ones (65535) meant 65536. E2fsprogs used 0, whereas the kernel used 65535. Oops. Fortunately this case happens extremely rarely, with the most common case being the lost+found directory, created by mke2fs. So we will be liberal in what we accept, and accept both encodings, but we will continue to encode 65536 as 65535. This will require a change in e2fsprogs, but with fortunately ext4 filesystems normally have the dir_index feature enabled, which precludes having a completely empty directory block. Signed-off-by: Wei Yongjun Signed-off-by: "Theodore Ts'o" --- fs/ext4/ext4.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index aafc9eba1c2..b0c87dce66a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -868,7 +868,7 @@ static inline unsigned ext4_rec_len_from_disk(__le16 dlen) { unsigned len = le16_to_cpu(dlen); - if (len == EXT4_MAX_REC_LEN) + if (len == EXT4_MAX_REC_LEN || len == 0) return 1 << 16; return len; } -- cgit v1.2.3 From ba4439165f0f0d25b2fe065cf0c1ff8130b802eb Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Tue, 10 Feb 2009 11:14:34 -0500 Subject: ext4: Fix lockdep warning We should not call ext4_mb_add_n_trim while holding alloc_semp. ============================================= [ INFO: possible recursive locking detected ] 2.6.29-rc4-git1-dirty #124 --------------------------------------------- ffsb/3116 is trying to acquire lock: (&meta_group_info[i]->alloc_sem){----}, at: [] ext4_mb_load_buddy+0xd2/0x343 but task is already holding lock: (&meta_group_info[i]->alloc_sem){----}, at: [] ext4_mb_load_buddy+0xd2/0x343 http://bugzilla.kernel.org/show_bug.cgi?id=12672 Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index deba54f6cbe..c962d069050 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -4476,23 +4476,26 @@ static int ext4_mb_release_context(struct ext4_allocation_context *ac) pa->pa_free -= ac->ac_b_ex.fe_len; pa->pa_len -= ac->ac_b_ex.fe_len; spin_unlock(&pa->pa_lock); - /* - * We want to add the pa to the right bucket. - * Remove it from the list and while adding - * make sure the list to which we are adding - * doesn't grow big. - */ - if (likely(pa->pa_free)) { - spin_lock(pa->pa_obj_lock); - list_del_rcu(&pa->pa_inode_list); - spin_unlock(pa->pa_obj_lock); - ext4_mb_add_n_trim(ac); - } } - ext4_mb_put_pa(ac, ac->ac_sb, pa); } if (ac->alloc_semp) up_read(ac->alloc_semp); + if (pa) { + /* + * We want to add the pa to the right bucket. + * Remove it from the list and while adding + * make sure the list to which we are adding + * doesn't grow big. We need to release + * alloc_semp before calling ext4_mb_add_n_trim() + */ + if (pa->pa_linear && likely(pa->pa_free)) { + spin_lock(pa->pa_obj_lock); + list_del_rcu(&pa->pa_inode_list); + spin_unlock(pa->pa_obj_lock); + ext4_mb_add_n_trim(ac); + } + ext4_mb_put_pa(ac, ac->ac_sb, pa); + } if (ac->ac_bitmap_page) page_cache_release(ac->ac_bitmap_page); if (ac->ac_buddy_page) -- cgit v1.2.3 From 5a6fe125950676015f5108fb71b2a67441755003 Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Tue, 10 Feb 2009 14:02:27 +0000 Subject: Do not account for the address space used by hugetlbfs using VM_ACCOUNT When overcommit is disabled, the core VM accounts for pages used by anonymous shared, private mappings and special mappings. It keeps track of VMAs that should be accounted for with VM_ACCOUNT and VMAs that never had a reserve with VM_NORESERVE. Overcommit for hugetlbfs is much riskier than overcommit for base pages due to contiguity requirements. It avoids overcommiting on both shared and private mappings using reservation counters that are checked and updated during mmap(). This ensures (within limits) that hugepages exist in the future when faults occurs or it is too easy to applications to be SIGKILLed. As hugetlbfs makes its own reservations of a different unit to the base page size, VM_ACCOUNT should never be set. Even if the units were correct, we would double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may be set because an application can request no reserves be made for hugetlbfs at the risk of getting killed later. With commit fc8744adc870a8d4366908221508bb113d8b72ee, VM_NORESERVE and VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This breaks the accounting for both the core VM and hugetlbfs, can trigger an OOM storm when hugepage pools are too small lockups and corrupted counters otherwise are used. This patch brings hugetlbfs more in line with how the core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set. Signed-off-by: Mel Gorman Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 6903d37af03..9b800d97a68 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (hugetlb_reserve_pages(inode, vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma)) + len >> huge_page_shift(h), vma, + vma->vm_flags)) goto out; ret = 0; @@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void) can_do_mlock()); } -struct file *hugetlb_file_setup(const char *name, size_t size) +struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag) { int error = -ENOMEM; struct file *file; @@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size) error = -ENOMEM; if (hugetlb_reserve_pages(inode, 0, - size >> huge_page_shift(hstate_inode(inode)), NULL)) + size >> huge_page_shift(hstate_inode(inode)), NULL, + acctflag)) goto out_inode; d_instantiate(dentry, inode); -- cgit v1.2.3 From 8fe4cd0dc5ea43760c59eb256404188272cc95dd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Feb 2009 13:04:25 -0800 Subject: jbd: fix return value of journal_start_commit() journal_start_commit() returns 1 if either a transaction is committing or the function has queued a transaction commit. But it returns 0 if we raced with somebody queueing the transaction commit as well. This resulted in ext3_sync_fs() not functioning correctly (description from Arthur Jones): In the case of a data=ordered umount with pending long symlinks which are delayed due to a long list of other I/O on the backing block device, this causes the buffer associated with the long symlinks to not be moved to the inode dirty list in the second phase of fsync_super. Then, before they can be dirtied again, kjournald exits, seeing the UMOUNT flag and the dirty pages are never written to the backing block device, causing long symlink corruption and exposing new or previously freed block data to userspace. This can be reproduced with a script created by Eric Sandeen : #!/bin/bash umount /mnt/test2 mount /dev/sdb4 /mnt/test2 rm -f /mnt/test2/* dd if=/dev/zero of=/mnt/test2/bigfile bs=1M count=512 touch /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename ln -s /mnt/test2/thisisveryveryveryveryveryveryveryveryveryveryveryveryveryveryveryverylongfilename /mnt/test2/link umount /mnt/test2 mount /dev/sdb4 /mnt/test2 ls /mnt/test2/ This patch fixes journal_start_commit() to always return 1 when there's a transaction committing or queued for commit. Cc: Eric Sandeen Cc: Mike Snitzer Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/jbd/journal.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 9e4fa52d7dc..e79c07812af 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -427,7 +427,7 @@ int __log_space_left(journal_t *journal) } /* - * Called under j_state_lock. Returns true if a transaction was started. + * Called under j_state_lock. Returns true if a transaction commit was started. */ int __log_start_commit(journal_t *journal, tid_t target) { @@ -495,7 +495,8 @@ int journal_force_commit_nested(journal_t *journal) /* * Start a commit of the current running transaction (if any). Returns true - * if a transaction was started, and fills its tid in at *ptid + * if a transaction is going to be committed (or is currently already + * committing), and fills its tid in at *ptid */ int journal_start_commit(journal_t *journal, tid_t *ptid) { @@ -505,15 +506,19 @@ int journal_start_commit(journal_t *journal, tid_t *ptid) if (journal->j_running_transaction) { tid_t tid = journal->j_running_transaction->t_tid; - ret = __log_start_commit(journal, tid); - if (ret && ptid) + __log_start_commit(journal, tid); + /* There's a running transaction and we've just made sure + * it's commit has been scheduled. */ + if (ptid) *ptid = tid; - } else if (journal->j_committing_transaction && ptid) { + ret = 1; + } else if (journal->j_committing_transaction) { /* * If ext3_write_super() recently started a commit, then we * have to wait for completion of that transaction */ - *ptid = journal->j_committing_transaction->t_tid; + if (ptid) + *ptid = journal->j_committing_transaction->t_tid; ret = 1; } spin_unlock(&journal->j_state_lock); -- cgit v1.2.3 From 02ac597c9b86af49b2016aa98aee20ab59dbf0d2 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 11 Feb 2009 13:04:26 -0800 Subject: ext3: revert "ext3: wait on all pending commits in ext3_sync_fs" This reverts commit c87591b719737b4e91eb1a9fa8fd55a4ff1886d6. Since journal_start_commit() is now fixed to return 1 when we started a transaction commit, there's some transaction waiting to be committed or there's a transaction already committing, we don't need to call ext3_force_commit() in ext3_sync_fs(). Furthermore ext3_force_commit() can unnecessarily create sync transaction which is expensive so it's worthwhile to remove it when we can. Cc: Eric Sandeen Cc: Signed-off-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext3/super.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext3/super.c b/fs/ext3/super.c index b70d90e08a3..4a970411a45 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -2428,12 +2428,13 @@ static void ext3_write_super (struct super_block * sb) static int ext3_sync_fs(struct super_block *sb, int wait) { - sb->s_dirt = 0; - if (wait) - ext3_force_commit(sb); - else - journal_start_commit(EXT3_SB(sb)->s_journal, NULL); + tid_t target; + sb->s_dirt = 0; + if (journal_start_commit(EXT3_SB(sb)->s_journal, &target)) { + if (wait) + log_wait_commit(EXT3_SB(sb)->s_journal, target); + } return 0; } -- cgit v1.2.3 From 0e4a9b59282914fe057ab17027f55123964bc2e2 Mon Sep 17 00:00:00 2001 From: Carsten Otte Date: Wed, 11 Feb 2009 13:04:37 -0800 Subject: ext2/xip: refuse to change xip flag during remount with busy inodes For a reason that I was unable to understand in three months of debugging, mount ext2 -o remount stopped working properly when remounting from regular operation to xip, or the other way around. According to a git bisect search, the problem was introduced with the VM_MIXEDMAP/PTE_SPECIAL rework in the vm: commit 70688e4dd1647f0ceb502bbd5964fa344c5eb411 Author: Nick Piggin Date: Mon Apr 28 02:13:02 2008 -0700 xip: support non-struct page backed memory In the failing scenario, the filesystem is mounted read only via root= kernel parameter on s390x. During remount (in rc.sysinit), the inodes of the bash binary and its libraries are busy and cannot be invalidated (the bash which is running rc.sysinit resides on subject filesystem). Afterwards, another bash process (running ifup-eth) recurses into a subshell, runs dup_mm (via fork). Some of the mappings in this bash process were created from inodes that could not be invalidated during remount. Both parent and child process crash some time later due to inconsistencies in their address spaces. The issue seems to be timing sensitive, various attempts to recreate it have failed. This patch refuses to change the xip flag during remount in case some inodes cannot be invalidated. This patch keeps users from running into that issue. [akpm@linux-foundation.org: cleanup] Signed-off-by: Carsten Otte Cc: Nick Piggin Cc: Jared Hulbert Cc: Martin Schwidefsky Cc: Heiko Carstens Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/super.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext2/super.c b/fs/ext2/super.c index da8bdeaa2e6..7c6e3606f0e 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1185,9 +1185,12 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) es = sbi->s_es; if (((sbi->s_mount_opt & EXT2_MOUNT_XIP) != (old_mount_opt & EXT2_MOUNT_XIP)) && - invalidate_inodes(sb)) - ext2_warning(sb, __func__, "busy inodes while remounting "\ - "xip remain in cache (no functional problem)"); + invalidate_inodes(sb)) { + ext2_warning(sb, __func__, "refusing change of xip flag " + "with busy inodes while remounting"); + sbi->s_mount_opt &= ~EXT2_MOUNT_XIP; + sbi->s_mount_opt |= old_mount_opt & EXT2_MOUNT_XIP; + } if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; if (*flags & MS_RDONLY) { -- cgit v1.2.3 From eb099670895f22970cd143875467c2768d6d87e5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Feb 2009 09:27:38 -0500 Subject: Btrfs: make sure all pending extent operations are complete Theres a slight problem with finish_current_insert, if we set all to 1 and then go through and don't actually skip any of the extents on the pending list, we could exit right after we've added new extents. This is a problem because by inserting the new extents we could have gotten new COW's to happen and such, so we may have some pending updates to do or even more inserts to do after that. So this patch will only exit if we have never skipped any of the extents in the pending list, and we have no extents to insert, this will make sure that all of the pending work is truly done before we return. I've been running with this patch for a few days with all of my other testing and have not seen issues. Thanks, Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 71 +++++++++++++++++++++++++++++--------------------- 1 file changed, 42 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7527523c2d2..376656f65b3 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1323,8 +1323,25 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, int btrfs_extent_post_op(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - finish_current_insert(trans, root->fs_info->extent_root, 1); - del_pending_extents(trans, root->fs_info->extent_root, 1); + u64 start; + u64 end; + int ret; + + while(1) { + finish_current_insert(trans, root->fs_info->extent_root, 1); + del_pending_extents(trans, root->fs_info->extent_root, 1); + + /* is there more work to do? */ + ret = find_first_extent_bit(&root->fs_info->pending_del, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + ret = find_first_extent_bit(&root->fs_info->extent_ins, + 0, &start, &end, EXTENT_WRITEBACK); + if (!ret) + continue; + break; + } return 0; } @@ -2211,13 +2228,12 @@ static int finish_current_insert(struct btrfs_trans_handle *trans, u64 end; u64 priv; u64 search = 0; - u64 skipped = 0; struct btrfs_fs_info *info = extent_root->fs_info; struct btrfs_path *path; struct pending_extent_op *extent_op, *tmp; struct list_head insert_list, update_list; int ret; - int num_inserts = 0, max_inserts; + int num_inserts = 0, max_inserts, restart = 0; path = btrfs_alloc_path(); INIT_LIST_HEAD(&insert_list); @@ -2233,19 +2249,19 @@ again: ret = find_first_extent_bit(&info->extent_ins, search, &start, &end, EXTENT_WRITEBACK); if (ret) { - if (skipped && all && !num_inserts && + if (restart && !num_inserts && list_empty(&update_list)) { - skipped = 0; + restart = 0; search = 0; continue; } - mutex_unlock(&info->extent_ins_mutex); break; } ret = try_lock_extent(&info->extent_ins, start, end, GFP_NOFS); if (!ret) { - skipped = 1; + if (all) + restart = 1; search = end + 1; if (need_resched()) { mutex_unlock(&info->extent_ins_mutex); @@ -2264,7 +2280,7 @@ again: list_add_tail(&extent_op->list, &insert_list); search = end + 1; if (num_inserts == max_inserts) { - mutex_unlock(&info->extent_ins_mutex); + restart = 1; break; } } else if (extent_op->type == PENDING_BACKREF_UPDATE) { @@ -2280,7 +2296,6 @@ again: * somebody marked this thing for deletion then just unlock it and be * done, the free_extents will handle it */ - mutex_lock(&info->extent_ins_mutex); list_for_each_entry_safe(extent_op, tmp, &update_list, list) { clear_extent_bits(&info->extent_ins, extent_op->bytenr, extent_op->bytenr + extent_op->num_bytes - 1, @@ -2302,6 +2317,10 @@ again: if (!list_empty(&update_list)) { ret = update_backrefs(trans, extent_root, path, &update_list); BUG_ON(ret); + + /* we may have COW'ed new blocks, so lets start over */ + if (all) + restart = 1; } /* @@ -2309,9 +2328,9 @@ again: * need to make sure everything is cleaned then reset everything and * go back to the beginning */ - if (!num_inserts && all && skipped) { + if (!num_inserts && restart) { search = 0; - skipped = 0; + restart = 0; INIT_LIST_HEAD(&update_list); INIT_LIST_HEAD(&insert_list); goto again; @@ -2368,27 +2387,19 @@ again: BUG_ON(ret); /* - * if we broke out of the loop in order to insert stuff because we hit - * the maximum number of inserts at a time we can handle, then loop - * back and pick up where we left off + * if restart is set for whatever reason we need to go back and start + * searching through the pending list again. + * + * We just inserted some extents, which could have resulted in new + * blocks being allocated, which would result in new blocks needing + * updates, so if all is set we _must_ restart to get the updated + * blocks. */ - if (num_inserts == max_inserts) { - INIT_LIST_HEAD(&insert_list); - INIT_LIST_HEAD(&update_list); - num_inserts = 0; - goto again; - } - - /* - * again, if we need to make absolutely sure there are no more pending - * extent operations left and we know that we skipped some, go back to - * the beginning and do it all again - */ - if (all && skipped) { + if (restart || all) { INIT_LIST_HEAD(&insert_list); INIT_LIST_HEAD(&update_list); search = 0; - skipped = 0; + restart = 0; num_inserts = 0; goto again; } @@ -2709,6 +2720,8 @@ again: goto again; } + if (!err) + finish_current_insert(trans, extent_root, 0); return err; } -- cgit v1.2.3 From b288052e1779261ae80138074989ef50358c4e58 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 09:37:35 -0500 Subject: Btrfs: process mount options on mount -o remount, Btrfs wasn't parsing any new mount options during remount, making it difficult to set mount options on a root drive. Signed-off-by: Chris Mason --- fs/btrfs/super.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f3fd7e2cbc3..66b8341e2db 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -511,6 +511,10 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) struct btrfs_root *root = btrfs_sb(sb); int ret; + ret = btrfs_parse_options(root, data); + if (ret) + return -EINVAL; + if ((*flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) return 0; -- cgit v1.2.3 From 536ac8ae86e68bb5574d7cc81c7d229a86b82601 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 09:41:38 -0500 Subject: Btrfs: use larger metadata clusters in ssd mode Larger metadata clusters can significantly improve writeback performance on ssd drives with large erasure blocks. The larger clusters make it more likely a given IO will completely overwrite the ssd block, so it doesn't have to do an internal rwm cycle. On spinning media, lager metadata clusters end up spreading out the metadata more over time, which makes fsck slower, so we don't want this to be the default. Signed-off-by: Chris Mason --- fs/btrfs/extent-tree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 376656f65b3..c59e12036e2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2872,7 +2872,8 @@ static noinline int find_free_extent(struct btrfs_trans_handle *trans, if (data & BTRFS_BLOCK_GROUP_METADATA) { last_ptr = &root->fs_info->last_alloc; - empty_cluster = 64 * 1024; + if (!btrfs_test_opt(root, SSD)) + empty_cluster = 64 * 1024; } if ((data & BTRFS_BLOCK_GROUP_DATA) && btrfs_test_opt(root, SSD)) -- cgit v1.2.3 From e1df36d2f18254d0690a0fbe036cece74ec311b8 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 09:45:08 -0500 Subject: Btrfs: don't clean old snapshots on sync(1) Cleaning old snapshots can make sync(1) somewhat slow, and some users and applications still use it in a global fsync kind of workload. This patch changes btrfs not to clean old snapshots during sync, which is safe from a FS consistency point of view. The major downside is that it makes it difficult to tell when old snapshots have been reaped and the space they were using has been reclaimed. A new ioctl will be added for this purpose instead. Signed-off-by: Chris Mason --- fs/btrfs/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 66b8341e2db..19a4daf03cc 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -379,7 +379,6 @@ int btrfs_sync_fs(struct super_block *sb, int wait) btrfs_start_delalloc_inodes(root); btrfs_wait_ordered_extents(root, 0); - btrfs_clean_old_snapshots(root); trans = btrfs_start_transaction(root, 1); ret = btrfs_commit_transaction(trans, root); sb->s_dirt = 0; -- cgit v1.2.3 From b335b0034e252e79ec2e9c6697f5d663c4627bec Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 12 Feb 2009 10:06:04 -0500 Subject: Btrfs: Avoid using __GFP_HIGHMEM with slab allocator btrfs_releasepage may call kmem_cache_alloc indirectly, and provide same GFP flags it gets to kmem_cache_alloc. So it's possible to use __GFP_HIGHMEM with the slab allocator. Signed-off-by: Yan Zheng --- fs/btrfs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8f0706210a4..638bcb5e49f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4263,7 +4263,7 @@ static int btrfs_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) return 0; - return __btrfs_releasepage(page, gfp_flags); + return __btrfs_releasepage(page, gfp_flags & GFP_NOFS); } static void btrfs_invalidatepage(struct page *page, unsigned long offset) -- cgit v1.2.3 From 7951f3cefbd711f4429a0cd014aa83a844c399a0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 12 Feb 2009 10:06:15 -0500 Subject: Btrfs: balance_level checks !child after access The BUG_ON() is in the wrong spot. Signed-off-by: Jeff Mahoney Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 35443cc4b9a..6674692f702 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -917,9 +917,9 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* promote the child to a root */ child = read_node_slot(root, mid, 0); + BUG_ON(!child); btrfs_tree_lock(child); btrfs_set_lock_blocking(child); - BUG_ON(!child); ret = btrfs_cow_block(trans, root, child, mid, 0, &child, 0); BUG_ON(ret); -- cgit v1.2.3 From e00f7308658622fbd483cb0d9fe41165bf9050d0 Mon Sep 17 00:00:00 2001 From: Jeff Mahoney Date: Thu, 12 Feb 2009 14:11:25 -0500 Subject: Btrfs: remove btrfs_init_path btrfs_init_path was initially used when the path objects were on the stack. Now all the work is done by btrfs_alloc_path and btrfs_init_path isn't required. This patch removes it, and just uses kmem_cache_zalloc to zero out the object. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 11 ++--------- fs/btrfs/ctree.h | 1 - fs/btrfs/inode-map.c | 1 - fs/btrfs/inode.c | 2 -- 4 files changed, 2 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 6674692f702..c8f4c540cc2 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -38,19 +38,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, static int del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level, int slot); -inline void btrfs_init_path(struct btrfs_path *p) -{ - memset(p, 0, sizeof(*p)); -} - struct btrfs_path *btrfs_alloc_path(void) { struct btrfs_path *path; - path = kmem_cache_alloc(btrfs_path_cachep, GFP_NOFS); - if (path) { - btrfs_init_path(path); + path = kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS); + if (path) path->reada = 1; - } return path; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 531db112c8b..3f7a8058df2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1834,7 +1834,6 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); -void btrfs_init_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); void btrfs_clear_path_blocking(struct btrfs_path *p); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 2aa79873eb4..cc7334d833c 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -84,7 +84,6 @@ int btrfs_find_free_objectid(struct btrfs_trans_handle *trans, search_key.type = 0; search_key.offset = 0; - btrfs_init_path(path); start_found = 0; ret = btrfs_search_slot(trans, root, &search_key, path, 0, 0); if (ret < 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 638bcb5e49f..3cee77ae03c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2531,8 +2531,6 @@ noinline int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, key.offset = (u64)-1; key.type = (u8)-1; - btrfs_init_path(path); - search_again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) -- cgit v1.2.3 From a48ddf08ba9bab91efd95e458737afa9d7699623 Mon Sep 17 00:00:00 2001 From: Qinghuang Feng Date: Thu, 12 Feb 2009 14:25:23 -0500 Subject: Btrfs: remove unused code in split_state() These two lines are not used, remove them. Signed-off-by: Qinghuang Feng Signed-off-by: Chris Mason --- fs/btrfs/extent_io.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 37d43b516b7..ebe6b29e606 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -415,8 +415,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, node = tree_insert(&tree->state, prealloc->end, &prealloc->rb_node); if (node) { - struct extent_state *found; - found = rb_entry(node, struct extent_state, rb_node); free_extent_state(prealloc); return -EEXIST; } -- cgit v1.2.3 From 3f3420df505e47751ef76a652b5cb660e5360d6f Mon Sep 17 00:00:00 2001 From: Julia Lawall Date: Thu, 12 Feb 2009 10:16:03 -0500 Subject: Btrfs: fs/btrfs/volumes.c: remove useless kzalloc The call to kzalloc is followed by a kmalloc whose result is stored in the same variable. The semantic match that finds the problem is as follows: (http://www.emn.fr/x-info/coccinelle/) // @r exists@ local idexpression x; statement S; expression E; identifier f,l; position p1,p2; expression *ptr != NULL; @@ ( if ((x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...)) == NULL) S | x@p1 = \(kmalloc\|kzalloc\|kcalloc\)(...); ... if (x == NULL) S ) <... when != x when != if (...) { <+...x...+> } x->f = E ...> ( return \(0\|<+...x...+>\|ptr\); | return@p2 ...; ) @script:python@ p1 << r.p1; p2 << r.p2; @@ print "* file: %s kmalloc %s return %s" % (p1[0].file,p1[0].line,p2[0].line) // Signed-off-by: Julia Lawall Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 4 ---- 1 file changed, 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bcd14ebccae..c793b6f50d8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2894,10 +2894,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, free_extent_map(em); } - map = kzalloc(sizeof(*map), GFP_NOFS); - if (!map) - return -ENOMEM; - em = alloc_extent_map(GFP_NOFS); if (!em) return -ENOMEM; -- cgit v1.2.3 From 4008c04a07c73ec3cb1be4c1391d2159a8f75d6d Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Thu, 12 Feb 2009 14:09:45 -0500 Subject: Btrfs: make a lockdep class for the extent buffer locks Btrfs is currently using spin_lock_nested with a nested value based on the tree depth of the block. But, this doesn't quite work because the max tree depth is bigger than what spin_lock_nested can deal with, and because locks are sometimes taken before the level field is filled in. The solution here is to use lockdep_set_class_and_name instead, and to set the class before unlocking the pages when the block is read from the disk and just after init of a freshly allocated tree block. btrfs_clear_path_blocking is also changed to take the locks in the proper order, and it also makes sure all the locks currently held are properly set to blocking before it tries to retake the spinlocks. Otherwise, lockdep gets upset about bad lock orderin. The lockdep magic cam from Peter Zijlstra Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 45 ++++++++++++++++++++++++++++++++++----------- fs/btrfs/ctree.h | 10 +++------- fs/btrfs/disk-io.c | 46 +++++++++++++++++++++++++++++++++++++++++++++- fs/btrfs/disk-io.h | 10 ++++++++++ fs/btrfs/extent-tree.c | 7 +++++-- fs/btrfs/locking.c | 11 ----------- fs/btrfs/volumes.c | 2 ++ 7 files changed, 99 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index c8f4c540cc2..42491d728e9 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -62,14 +62,38 @@ noinline void btrfs_set_path_blocking(struct btrfs_path *p) /* * reset all the locked nodes in the patch to spinning locks. + * + * held is used to keep lockdep happy, when lockdep is enabled + * we set held to a blocking lock before we go around and + * retake all the spinlocks in the path. You can safely use NULL + * for held */ -noinline void btrfs_clear_path_blocking(struct btrfs_path *p) +noinline void btrfs_clear_path_blocking(struct btrfs_path *p, + struct extent_buffer *held) { int i; - for (i = 0; i < BTRFS_MAX_LEVEL; i++) { + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* lockdep really cares that we take all of these spinlocks + * in the right order. If any of the locks in the path are not + * currently blocking, it is going to complain. So, make really + * really sure by forcing the path to blocking before we clear + * the path blocking. + */ + if (held) + btrfs_set_lock_blocking(held); + btrfs_set_path_blocking(p); +#endif + + for (i = BTRFS_MAX_LEVEL - 1; i >= 0; i--) { if (p->nodes[i] && p->locks[i]) btrfs_clear_lock_blocking(p->nodes[i]); } + +#ifdef CONFIG_DEBUG_LOCK_ALLOC + if (held) + btrfs_clear_lock_blocking(held); +#endif } /* this also releases the path */ @@ -279,7 +303,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, trans->transid, level, &ins); BUG_ON(ret); cow = btrfs_init_new_buffer(trans, root, prealloc_dest, - buf->len); + buf->len, level); } else { cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, @@ -1559,7 +1583,7 @@ cow_done: if (!p->skip_locking) p->locks[level] = 1; - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); /* * we have a lock on b and as long as we aren't changing @@ -1598,7 +1622,7 @@ cow_done: btrfs_set_path_blocking(p); sret = split_node(trans, root, p, level); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); BUG_ON(sret > 0); if (sret) { @@ -1618,7 +1642,7 @@ cow_done: btrfs_set_path_blocking(p); sret = balance_level(trans, root, p, level); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); if (sret) { ret = sret; @@ -1681,13 +1705,13 @@ cow_done: if (!p->skip_locking) { int lret; - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); lret = btrfs_try_spin_lock(b); if (!lret) { btrfs_set_path_blocking(p); btrfs_tree_lock(b); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, b); } } } else { @@ -1699,7 +1723,7 @@ cow_done: btrfs_set_path_blocking(p); sret = split_leaf(trans, root, key, p, ins_len, ret == 0); - btrfs_clear_path_blocking(p); + btrfs_clear_path_blocking(p, NULL); BUG_ON(sret > 0); if (sret) { @@ -3919,7 +3943,6 @@ find_next_key: btrfs_release_path(root, path); goto again; } else { - btrfs_clear_path_blocking(path); goto out; } } @@ -3939,7 +3962,7 @@ find_next_key: path->locks[level - 1] = 1; path->nodes[level - 1] = cur; unlock_up(path, level, 1); - btrfs_clear_path_blocking(path); + btrfs_clear_path_blocking(path, NULL); } out: if (ret == 0) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 3f7a8058df2..766b31ae318 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -43,11 +43,7 @@ struct btrfs_ordered_sum; #define BTRFS_ACL_NOT_CACHED ((void *)-1) -#ifdef CONFIG_LOCKDEP -# define BTRFS_MAX_LEVEL 7 -#else -# define BTRFS_MAX_LEVEL 8 -#endif +#define BTRFS_MAX_LEVEL 8 /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -1715,7 +1711,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, u64 empty_size); struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr, u32 blocksize, + int level); int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 num_bytes, u64 parent, u64 min_bytes, @@ -1835,7 +1832,6 @@ void btrfs_release_path(struct btrfs_root *root, struct btrfs_path *p); struct btrfs_path *btrfs_alloc_path(void); void btrfs_free_path(struct btrfs_path *p); void btrfs_set_path_blocking(struct btrfs_path *p); -void btrfs_clear_path_blocking(struct btrfs_path *p); void btrfs_unlock_up_safe(struct btrfs_path *p, int level); int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 5aebddd7119..adda739a021 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -75,6 +75,40 @@ struct async_submit_bio { struct btrfs_work work; }; +/* These are used to set the lockdep class on the extent buffer locks. + * The class is set by the readpage_end_io_hook after the buffer has + * passed csum validation but before the pages are unlocked. + * + * The lockdep class is also set by btrfs_init_new_buffer on freshly + * allocated blocks. + * + * The class is based on the level in the tree block, which allows lockdep + * to know that lower nodes nest inside the locks of higher nodes. + * + * We also add a check to make sure the highest level of the tree is + * the same as our lockdep setup here. If BTRFS_MAX_LEVEL changes, this + * code needs update as well. + */ +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# if BTRFS_MAX_LEVEL != 8 +# error +# endif +static struct lock_class_key btrfs_eb_class[BTRFS_MAX_LEVEL + 1]; +static const char *btrfs_eb_name[BTRFS_MAX_LEVEL + 1] = { + /* leaf */ + "btrfs-extent-00", + "btrfs-extent-01", + "btrfs-extent-02", + "btrfs-extent-03", + "btrfs-extent-04", + "btrfs-extent-05", + "btrfs-extent-06", + "btrfs-extent-07", + /* highest possible level */ + "btrfs-extent-08", +}; +#endif + /* * extents on the btree inode are pretty simple, there's one extent * that covers the entire device @@ -347,6 +381,15 @@ static int check_tree_block_fsid(struct btrfs_root *root, return ret; } +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level) +{ + lockdep_set_class_and_name(&eb->lock, + &btrfs_eb_class[level], + btrfs_eb_name[level]); +} +#endif + static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, struct extent_state *state) { @@ -392,6 +435,8 @@ static int btree_readpage_end_io_hook(struct page *page, u64 start, u64 end, } found_level = btrfs_header_level(eb); + btrfs_set_buffer_lockdep_class(eb, found_level); + ret = csum_tree_block(root, eb, 1); if (ret) ret = -EIO; @@ -1777,7 +1822,6 @@ struct btrfs_root *open_ctree(struct super_block *sb, ret = find_and_setup_root(tree_root, fs_info, BTRFS_DEV_TREE_OBJECTID, dev_root); dev_root->track_dirty = 1; - if (ret) goto fail_extent_root; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 494a56eb298..95029db227b 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -101,4 +101,14 @@ int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_add_log_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btree_lock_page_hook(struct page *page); + + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, int level); +#else +static inline void btrfs_set_buffer_lockdep_class(struct extent_buffer *eb, + int level) +{ +} +#endif #endif diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index c59e12036e2..cd86bffbdc9 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3416,7 +3416,8 @@ int btrfs_alloc_extent(struct btrfs_trans_handle *trans, struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr, u32 blocksize, + int level) { struct extent_buffer *buf; @@ -3424,6 +3425,7 @@ struct extent_buffer *btrfs_init_new_buffer(struct btrfs_trans_handle *trans, if (!buf) return ERR_PTR(-ENOMEM); btrfs_set_header_generation(buf, trans->transid); + btrfs_set_buffer_lockdep_class(buf, level); btrfs_tree_lock(buf); clean_tree_block(trans, root, buf); @@ -3467,7 +3469,8 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - buf = btrfs_init_new_buffer(trans, root, ins.objectid, blocksize); + buf = btrfs_init_new_buffer(trans, root, ins.objectid, + blocksize, level); return buf; } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9ebe9385129..85506c4a3af 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -25,21 +25,10 @@ #include "extent_io.h" #include "locking.h" -/* - * btrfs_header_level() isn't free, so don't call it when lockdep isn't - * on - */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC -static inline void spin_nested(struct extent_buffer *eb) -{ - spin_lock_nested(&eb->lock, BTRFS_MAX_LEVEL - btrfs_header_level(eb)); -} -#else static inline void spin_nested(struct extent_buffer *eb) { spin_lock(&eb->lock); } -#endif /* * Setting a lock to blocking will drop the spinlock and set the diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c793b6f50d8..1316139bf9e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -3102,6 +3102,8 @@ int btrfs_read_sys_array(struct btrfs_root *root) if (!sb) return -ENOMEM; btrfs_set_buffer_uptodate(sb); + btrfs_set_buffer_lockdep_class(sb, 0); + write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE); array_size = btrfs_super_sys_array_size(super_copy); -- cgit v1.2.3 From 2456242530a21cfee82646ebeeda65d3f74faa4c Mon Sep 17 00:00:00 2001 From: Yan Zheng Date: Thu, 12 Feb 2009 14:14:53 -0500 Subject: Btrfs: hold trans_mutex when using btrfs_record_root_in_trans btrfs_record_root_in_trans needs the trans_mutex held to make sure two callers don't race to setup the root in a given transaction. This adds it to all the places that were missing it. Signed-off-by: Yan Zheng --- fs/btrfs/extent-tree.c | 2 ++ fs/btrfs/transaction.c | 2 ++ fs/btrfs/tree-log.c | 2 ++ 3 files changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index cd86bffbdc9..0a5d796c9f7 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -5658,7 +5658,9 @@ static noinline int relocate_one_extent(struct btrfs_root *extent_root, prev_block = block_start; } + mutex_lock(&extent_root->fs_info->trans_mutex); btrfs_record_root_in_trans(found_root); + mutex_unlock(&extent_root->fs_info->trans_mutex); if (ref_path->owner_objectid >= BTRFS_FIRST_FREE_OBJECTID) { /* * try to update data extent references while diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index 919172de5c9..4112d53d4f4 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -688,7 +688,9 @@ static noinline int drop_dirty_roots(struct btrfs_root *tree_root, num_bytes -= btrfs_root_used(&dirty->root->root_item); bytes_used = btrfs_root_used(&root->root_item); if (num_bytes) { + mutex_lock(&root->fs_info->trans_mutex); btrfs_record_root_in_trans(root); + mutex_unlock(&root->fs_info->trans_mutex); btrfs_set_root_used(&root->root_item, bytes_used - num_bytes); } diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 20794290256..9c462fbd60f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -2832,7 +2832,9 @@ again: BUG_ON(!wc.replay_dest); wc.replay_dest->log_root = log; + mutex_lock(&fs_info->trans_mutex); btrfs_record_root_in_trans(wc.replay_dest); + mutex_unlock(&fs_info->trans_mutex); ret = walk_log_tree(trans, log, &wc); BUG_ON(ret); -- cgit v1.2.3 From efab0b5d3eed6aa71f8e3233e4e11774eedc04dc Mon Sep 17 00:00:00 2001 From: Andres Salomon Date: Wed, 11 Feb 2009 13:27:02 -0800 Subject: [JFFS2] force the jffs2 GC daemon to behave a bit better I've noticed some pretty poor behavior on OLPC machines after bootup, when gdm/X are starting. The GCD monopolizes the scheduler (which in turns means it gets to do more nand i/o), which results in processes taking much much longer than they should to start. As an example, on an OLPC machine going from OFW to a usable X (via auto-login gdm) takes 2m 30s. The majority of this time is consumed by the switch into graphical mode. With this patch, we cut a full 60s off of bootup time. After bootup, things are much snappier as well. Note that we have seen a CRC node error with this patch that causes the machine to fail to boot, but we've also seen that problem without this patch. Signed-off-by: Andres Salomon Signed-off-by: Andrew Morton Signed-off-by: David Woodhouse --- fs/jffs2/background.c | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 3cceef4ad2b..e9580104b6b 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -95,13 +95,17 @@ static int jffs2_garbage_collect_thread(void *_c) spin_unlock(&c->erase_completion_lock); - /* This thread is purely an optimisation. But if it runs when - other things could be running, it actually makes things a - lot worse. Use yield() and put it at the back of the runqueue - every time. Especially during boot, pulling an inode in - with read_inode() is much preferable to having the GC thread - get there first. */ - yield(); + /* Problem - immediately after bootup, the GCD spends a lot + * of time in places like jffs2_kill_fragtree(); so much so + * that userspace processes (like gdm and X) are starved + * despite plenty of cond_resched()s and renicing. Yield() + * doesn't help, either (presumably because userspace and GCD + * are generally competing for a higher latency resource - + * disk). + * This forces the GCD to slow the hell down. Pulling an + * inode in with read_inode() is much preferable to having + * the GC thread get there first. */ + schedule_timeout_interruptible(msecs_to_jiffies(50)); /* Put_super will send a SIGKILL and then wait on the sem. */ -- cgit v1.2.3 From d794bf8e0936dce45104565cd48c571061f4c1e3 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 14 Feb 2009 10:31:16 -0500 Subject: ext4: Initialize preallocation list_head's properly When creating a new ext4_prealloc_space structure, we have to initialize its list_head pointers before we add them to any prealloc lists. Otherwise, with list debug enabled, we will get list corruption warnings. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index c962d069050..4415beeb0b6 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3693,6 +3693,8 @@ ext4_mb_new_inode_pa(struct ext4_allocation_context *ac) pa->pa_free = pa->pa_len; atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); + INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 0; @@ -3755,6 +3757,7 @@ ext4_mb_new_group_pa(struct ext4_allocation_context *ac) atomic_set(&pa->pa_count, 1); spin_lock_init(&pa->pa_lock); INIT_LIST_HEAD(&pa->pa_inode_list); + INIT_LIST_HEAD(&pa->pa_group_list); pa->pa_deleted = 0; pa->pa_linear = 1; -- cgit v1.2.3 From 2acf2c261b823d9d9ed954f348b97620297a36b5 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Sat, 14 Feb 2009 10:42:58 -0500 Subject: ext4: Implement range_cyclic in ext4_da_writepages instead of write_cache_pages With delayed allocation we lock the page in write_cache_pages() and try to build an in memory extent of contiguous blocks. This is needed so that we can get large contiguous blocks request. If range_cyclic mode is enabled, write_cache_pages() will loop back to the 0 index if no I/O has been done yet, and try to start writing from the beginning of the range. That causes an attempt to take the page lock of lower index page while holding the page lock of higher index page, which can cause a dead lock with another writeback thread. The solution is to implement the range_cyclic behavior in ext4_da_writepages() instead. http://bugzilla.kernel.org/show_bug.cgi?id=12579 Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 658c4a7f257..cbd2ca99d11 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2439,6 +2439,7 @@ static int ext4_da_writepages(struct address_space *mapping, int no_nrwrite_index_update; int pages_written = 0; long pages_skipped; + int range_cyclic, cycled = 1, io_done = 0; int needed_blocks, ret = 0, nr_to_writebump = 0; struct ext4_sb_info *sbi = EXT4_SB(mapping->host->i_sb); @@ -2490,9 +2491,15 @@ static int ext4_da_writepages(struct address_space *mapping, if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; - if (wbc->range_cyclic) + range_cyclic = wbc->range_cyclic; + if (wbc->range_cyclic) { index = mapping->writeback_index; - else + if (index) + cycled = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = LLONG_MAX; + wbc->range_cyclic = 0; + } else index = wbc->range_start >> PAGE_CACHE_SHIFT; mpd.wbc = wbc; @@ -2506,6 +2513,7 @@ static int ext4_da_writepages(struct address_space *mapping, wbc->no_nrwrite_index_update = 1; pages_skipped = wbc->pages_skipped; +retry: while (!ret && wbc->nr_to_write > 0) { /* @@ -2548,6 +2556,7 @@ static int ext4_da_writepages(struct address_space *mapping, pages_written += mpd.pages_written; wbc->pages_skipped = pages_skipped; ret = 0; + io_done = 1; } else if (wbc->nr_to_write) /* * There is no more writeout needed @@ -2556,6 +2565,13 @@ static int ext4_da_writepages(struct address_space *mapping, */ break; } + if (!io_done && !cycled) { + cycled = 1; + index = 0; + wbc->range_start = index << PAGE_CACHE_SHIFT; + wbc->range_end = mapping->writeback_index - 1; + goto retry; + } if (pages_skipped != wbc->pages_skipped) printk(KERN_EMERG "This should not happen leaving %s " "with nr_to_write = %ld ret = %d\n", @@ -2563,6 +2579,7 @@ static int ext4_da_writepages(struct address_space *mapping, /* Update index */ index += pages_written; + wbc->range_cyclic = range_cyclic; if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) /* * set the writeback_index so that range_cyclic -- cgit v1.2.3 From 090542641de833c6f756895fc2f139f046e298f9 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Sun, 15 Feb 2009 20:02:19 -0500 Subject: ext4: Fix NULL dereference in ext4_ext_migrate()'s error handling This was found through a code checker (http://repo.or.cz/w/smatch.git/). It looks like you might be able to trigger the error by trying to migrate a readonly file system. Signed-off-by: Dan Carpenter Signed-off-by: "Theodore Ts'o" --- fs/ext4/migrate.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index 734abca25e3..fe64d9f7985 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -481,7 +481,7 @@ int ext4_ext_migrate(struct inode *inode) + 1); if (IS_ERR(handle)) { retval = PTR_ERR(handle); - goto err_out; + return retval; } tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode, @@ -489,8 +489,7 @@ int ext4_ext_migrate(struct inode *inode) if (IS_ERR(tmp_inode)) { retval = -ENOMEM; ext4_journal_stop(handle); - tmp_inode = NULL; - goto err_out; + return retval; } i_size_write(tmp_inode, i_size_read(inode)); /* @@ -618,8 +617,7 @@ err_out: ext4_journal_stop(handle); - if (tmp_inode) - iput(tmp_inode); + iput(tmp_inode); return retval; } -- cgit v1.2.3 From 1a88b5364b535edaa321d70a566e358390ff0872 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 16 Feb 2009 02:38:12 +0000 Subject: Fix incomplete __mntput locking Getting this wrong caused WARNING: at fs/namespace.c:636 mntput_no_expire+0xac/0xf2() due to optimistically checking cpu_writer->mnt outside the spinlock. Here's what we really want: * we know that nobody will set cpu_writer->mnt to mnt from now on * all changes to that sucker are done under cpu_writer->lock * we want the laziest equivalent of spin_lock(&cpu_writer->lock); if (likely(cpu_writer->mnt != mnt)) { spin_unlock(&cpu_writer->lock); continue; } /* do stuff */ that would make sure we won't miss earlier setting of ->mnt done by another CPU. Anyway, for now we just move the spin_lock() earlier and move the test into the properly locked region. Signed-off-by: Al Viro Reported-and-tested-by: Li Zefan Signed-off-by: Linus Torvalds --- fs/namespace.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/namespace.c b/fs/namespace.c index 228d8c4bfd1..06f8e63f6cb 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -614,9 +614,11 @@ static inline void __mntput(struct vfsmount *mnt) */ for_each_possible_cpu(cpu) { struct mnt_writer *cpu_writer = &per_cpu(mnt_writers, cpu); - if (cpu_writer->mnt != mnt) - continue; spin_lock(&cpu_writer->lock); + if (cpu_writer->mnt != mnt) { + spin_unlock(&cpu_writer->lock); + continue; + } atomic_add(cpu_writer->count, &mnt->__mnt_writers); cpu_writer->count = 0; /* -- cgit v1.2.3 From a60e78e57a17d55bbd5a96da16fe9649d364b987 Mon Sep 17 00:00:00 2001 From: Subhash Peddamallu Date: Mon, 16 Feb 2009 10:27:07 +0100 Subject: fs/bio: bio_alloc_bioset: pass right object ptr to mempool_free When freeing from bio pool use right ptr to account for bs->front_pad, instead of bio ptr, Signed-off-by: Subhash Peddamallu Signed-off-by: Jens Axboe --- fs/bio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 062299acbcc..72ab251cdb9 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -302,9 +302,10 @@ void bio_init(struct bio *bio) struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { struct bio *bio = NULL; + void *p; if (bs) { - void *p = mempool_alloc(bs->bio_pool, gfp_mask); + p = mempool_alloc(bs->bio_pool, gfp_mask); if (p) bio = p + bs->front_pad; @@ -329,7 +330,7 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) } if (unlikely(!bvl)) { if (bs) - mempool_free(bio, bs->bio_pool); + mempool_free(p, bs->bio_pool); else kfree(bio); bio = NULL; -- cgit v1.2.3 From 78f707bfc723552e8309b7c38a8d0cc51012e813 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Tue, 17 Feb 2009 13:59:08 +0100 Subject: block: revert part of 18ce3751ccd488c78d3827e9f6bf54e6322676fb The above commit added WRITE_SYNC and switched various places to using that for committing writes that will be waited upon immediately after submission. However, this causes a performance regression with AS and CFQ for ext3 at least, since sync_dirty_buffer() will submit some writes with WRITE_SYNC while ext3 has sumitted others dependent writes without the sync flag set. This causes excessive anticipation/idling in the IO scheduler because sync and async writes get interleaved, causing a big performance regression for the below test case (which is meant to simulate sqlite like behaviour). ---- test case ---- int main(int argc, char **argv) { int fdes, i; FILE *fp; struct timeval start; struct timeval end; struct timeval res; gettimeofday(&start, NULL); for (i=0; i --- fs/buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 665d446b25b..62b57e330b6 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -3108,7 +3108,7 @@ int sync_dirty_buffer(struct buffer_head *bh) if (test_clear_buffer_dirty(bh)) { get_bh(bh); bh->b_end_io = end_buffer_write_sync; - ret = submit_bh(WRITE_SYNC, bh); + ret = submit_bh(WRITE, bh); wait_on_buffer(bh); if (buffer_eopnotsupp(bh)) { clear_buffer_eopnotsupp(bh); -- cgit v1.2.3 From 8f19d472935c83d823fa4cf02bcc0a7b9952db30 Mon Sep 17 00:00:00 2001 From: Eric Biederman Date: Wed, 18 Feb 2009 14:48:16 -0800 Subject: seq_file: properly cope with pread Currently seq_read assumes that the offset passed to it is always the offset it passed to user space. In the case pread this assumption is broken and we do the wrong thing when presented with pread. To solve this I introduce an offset cache inside of struct seq_file so we know where our logical file position is. Then in seq_read if we try to read from another offset we reset our data structures and attempt to go to the offset user space wanted. [akpm@linux-foundation.org: restore FMODE_PWRITE] [pjt@google.com: seq_open needs its fmode opened up to take advantage of this] Signed-off-by: Eric Biederman Cc: Alexey Dobriyan Cc: Al Viro Cc: Paul Turner Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/seq_file.c | 36 ++++++++++++++++++++++++++++++++---- 1 file changed, 32 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/seq_file.c b/fs/seq_file.c index 5267098532b..a1a4cfe1921 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -48,8 +48,16 @@ int seq_open(struct file *file, const struct seq_operations *op) */ file->f_version = 0; - /* SEQ files support lseek, but not pread/pwrite */ - file->f_mode &= ~(FMODE_PREAD | FMODE_PWRITE); + /* + * seq_files support lseek() and pread(). They do not implement + * write() at all, but we clear FMODE_PWRITE here for historical + * reasons. + * + * If a client of seq_files a) implements file.write() and b) wishes to + * support pwrite() then that client will need to implement its own + * file.open() which calls seq_open() and then sets FMODE_PWRITE. + */ + file->f_mode &= ~FMODE_PWRITE; return 0; } EXPORT_SYMBOL(seq_open); @@ -131,6 +139,22 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) int err = 0; mutex_lock(&m->lock); + + /* Don't assume *ppos is where we left it */ + if (unlikely(*ppos != m->read_pos)) { + m->read_pos = *ppos; + while ((err = traverse(m, *ppos)) == -EAGAIN) + ; + if (err) { + /* With prejudice... */ + m->read_pos = 0; + m->version = 0; + m->index = 0; + m->count = 0; + goto Done; + } + } + /* * seq_file->op->..m_start/m_stop/m_next may do special actions * or optimisations based on the file->f_version, so we want to @@ -230,8 +254,10 @@ Fill: Done: if (!copied) copied = err; - else + else { *ppos += copied; + m->read_pos += copied; + } file->f_version = m->version; mutex_unlock(&m->lock); return copied; @@ -266,16 +292,18 @@ loff_t seq_lseek(struct file *file, loff_t offset, int origin) if (offset < 0) break; retval = offset; - if (offset != file->f_pos) { + if (offset != m->read_pos) { while ((retval=traverse(m, offset)) == -EAGAIN) ; if (retval) { /* with extreme prejudice... */ file->f_pos = 0; + m->read_pos = 0; m->version = 0; m->index = 0; m->count = 0; } else { + m->read_pos = offset; retval = file->f_pos = offset; } } -- cgit v1.2.3 From 610d18f4128ebbd88845d0fc60cce67b49af881e Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 18 Feb 2009 14:48:18 -0800 Subject: timerfd: add flags check As requested by Michael, add a missing check for valid flags in timerfd_settime(), and make it return EINVAL in case some extra bits are set. Michael said: If this is to be any use to userland apps that want to check flag support (perhaps it is too late already), then the sooner we get it into the kernel the better: 2.6.29 would be good; earlier stables as well would be even better. [akpm@linux-foundation.org: remove unused TFD_FLAGS_SET] Acked-by: Michael Kerrisk Signed-off-by: Davide Libenzi Cc: [2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/timerfd.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/timerfd.c b/fs/timerfd.c index 6a123b8ff3f..b042bd7034b 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -186,10 +186,9 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) BUILD_BUG_ON(TFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(TFD_NONBLOCK != O_NONBLOCK); - if (flags & ~(TFD_CLOEXEC | TFD_NONBLOCK)) - return -EINVAL; - if (clockid != CLOCK_MONOTONIC && - clockid != CLOCK_REALTIME) + if ((flags & ~TFD_CREATE_FLAGS) || + (clockid != CLOCK_MONOTONIC && + clockid != CLOCK_REALTIME)) return -EINVAL; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); @@ -201,7 +200,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) hrtimer_init(&ctx->tmr, clockid, HRTIMER_MODE_ABS); ufd = anon_inode_getfd("[timerfd]", &timerfd_fops, ctx, - flags & (O_CLOEXEC | O_NONBLOCK)); + flags & TFD_SHARED_FCNTL_FLAGS); if (ufd < 0) kfree(ctx); @@ -219,7 +218,8 @@ SYSCALL_DEFINE4(timerfd_settime, int, ufd, int, flags, if (copy_from_user(&ktmr, utmr, sizeof(ktmr))) return -EFAULT; - if (!timespec_valid(&ktmr.it_value) || + if ((flags & ~TFD_SETTIME_FLAGS) || + !timespec_valid(&ktmr.it_value) || !timespec_valid(&ktmr.it_interval)) return -EINVAL; -- cgit v1.2.3 From 1cf6e7d83bf334cc5916137862c920a97aabc018 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Wed, 18 Feb 2009 14:48:18 -0800 Subject: mm: task dirty accounting fix YAMAMOTO-san noticed that task_dirty_inc doesn't seem to be called properly for cases where set_page_dirty is not used to dirty a page (eg. mark_buffer_dirty). Additionally, there is some inconsistency about when task_dirty_inc is called. It is used for dirty balancing, however it even gets called for __set_page_dirty_no_writeback. So rather than increment it in a set_page_dirty wrapper, move it down to exactly where the dirty page accounting stats are incremented. Cc: YAMAMOTO Takashi Signed-off-by: Nick Piggin Acked-by: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/buffer.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 665d446b25b..ff4d1cdd779 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -777,6 +777,7 @@ static int __set_page_dirty(struct page *page, __inc_zone_page_state(page, NR_FILE_DIRTY); __inc_bdi_stat(mapping->backing_dev_info, BDI_RECLAIMABLE); + task_dirty_inc(current); task_io_account_write(PAGE_CACHE_SIZE); } radix_tree_tag_set(&mapping->page_tree, -- cgit v1.2.3 From ada723dcd681e2dffd7d73345cc8fda0eb0df9bd Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Wed, 18 Feb 2009 14:48:30 -0800 Subject: fs/super.c: add lockdep annotation to s_umount Li Zefan said: Thread 1: for ((; ;)) { mount -t cpuset xxx /mnt > /dev/null 2>&1 cat /mnt/cpus > /dev/null 2>&1 umount /mnt > /dev/null 2>&1 } Thread 2: for ((; ;)) { mount -t cpuset xxx /mnt > /dev/null 2>&1 umount /mnt > /dev/null 2>&1 } (Note: It is irrelevant which cgroup subsys is used.) After a while a lockdep warning showed up: ============================================= [ INFO: possible recursive locking detected ] 2.6.28 #479 --------------------------------------------- mount/13554 is trying to acquire lock: (&type->s_umount_key#19){--..}, at: [] sget+0x5e/0x321 but task is already holding lock: (&type->s_umount_key#19){--..}, at: [] sget+0x1e2/0x321 other info that might help us debug this: 1 lock held by mount/13554: #0: (&type->s_umount_key#19){--..}, at: [] sget+0x1e2/0x321 stack backtrace: Pid: 13554, comm: mount Not tainted 2.6.28-mc #479 Call Trace: [] validate_chain+0x4c6/0xbbd [] __lock_acquire+0x676/0x700 [] lock_acquire+0x5d/0x7a [] ? sget+0x5e/0x321 [] down_write+0x34/0x50 [] ? sget+0x5e/0x321 [] sget+0x5e/0x321 [] ? cgroup_set_super+0x0/0x3e [] ? cgroup_test_super+0x0/0x2f [] cgroup_get_sb+0x98/0x2e7 [] cpuset_get_sb+0x4a/0x5f [] vfs_kern_mount+0x40/0x7b [] do_kern_mount+0x37/0xbf [] do_mount+0x5c3/0x61a [] ? copy_mount_options+0x2c/0x111 [] sys_mount+0x69/0xa0 [] sysenter_do_call+0x12/0x31 The cause is after alloc_super() and then retry, an old entry in list fs_supers is found, so grab_super(old) is called, but both functions hold s_umount lock: struct super_block *sget(...) { ... retry: spin_lock(&sb_lock); if (test) { list_for_each_entry(old, &type->fs_supers, s_instances) { if (!test(old, data)) continue; if (!grab_super(old)) <--- 2nd: down_write(&old->s_umount); goto retry; if (s) destroy_super(s); return old; } } if (!s) { spin_unlock(&sb_lock); s = alloc_super(type); <--- 1th: down_write(&s->s_umount) if (!s) return ERR_PTR(-ENOMEM); goto retry; } ... } It seems like a false positive, and seems like VFS but not cgroup needs to be fixed. Peter said: We can simply put the new s_umount instance in a but lockdep doesn't particularly cares about subclass order. If there's any issue with the callers of sget() assuming the s_umount lock being of sublcass 0, then there is another annotation we can use to fix that, but lets not bother with that if this is sufficient. Addresses http://bugzilla.kernel.org/show_bug.cgi?id=12673 Signed-off-by: Peter Zijlstra Tested-by: Li Zefan Reported-by: Li Zefan Cc: Al Viro Cc: Paul Menage Cc: Arjan van de Ven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 17 ++++++++++++++++- 1 file changed, 16 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 61dce001dd5..8349ed6b141 100644 --- a/fs/super.c +++ b/fs/super.c @@ -82,7 +82,22 @@ static struct super_block *alloc_super(struct file_system_type *type) * lock ordering than usbfs: */ lockdep_set_class(&s->s_lock, &type->s_lock_key); - down_write(&s->s_umount); + /* + * sget() can have s_umount recursion. + * + * When it cannot find a suitable sb, it allocates a new + * one (this one), and tries again to find a suitable old + * one. + * + * In case that succeeds, it will acquire the s_umount + * lock of the old one. Since these are clearly distrinct + * locks, and this object isn't exposed yet, there's no + * risk of deadlocks. + * + * Annotate this by putting this lock in a different + * subclass. + */ + down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); s->s_count = S_BIAS; atomic_set(&s->s_active, 1); mutex_init(&s->s_vfs_rename_mutex); -- cgit v1.2.3 From 2db69a9340da12a4db44edb7506dd68799aeff55 Mon Sep 17 00:00:00 2001 From: Bill Nottingham Date: Wed, 18 Feb 2009 14:48:39 -0800 Subject: vt: Declare PIO_CMAP/GIO_CMAP as compatbile ioctls. Otherwise, these don't work when called from 32-bit userspace on 64-bit kernels. Cc: Jiri Kosina Cc: Alan Cox Cc: [2.6.25.x, 2.6.26.x, 2.6.27.x, 2.6.28.x] Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 9c6d815dd19..39bd4d38e88 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1938,6 +1938,8 @@ ULONG_IOCTL(SET_BITMAP_FILE) /* Big K */ COMPATIBLE_IOCTL(PIO_FONT) COMPATIBLE_IOCTL(GIO_FONT) +COMPATIBLE_IOCTL(PIO_CMAP) +COMPATIBLE_IOCTL(GIO_CMAP) ULONG_IOCTL(KDSIGACCEPT) COMPATIBLE_IOCTL(KDGETKEYCODE) COMPATIBLE_IOCTL(KDSETKEYCODE) -- cgit v1.2.3 From f04b30de3c82528f1ab4c58b3dd4c975f5341901 Mon Sep 17 00:00:00 2001 From: Ingo Molnar Date: Wed, 18 Feb 2009 14:48:43 -0800 Subject: inotify: fix GFP_KERNEL related deadlock Enhanced lockdep coverage of __GFP_NOFS turned up this new lockdep assert: [ 1093.677775] [ 1093.677781] ================================= [ 1093.680031] [ INFO: inconsistent lock state ] [ 1093.680031] 2.6.29-rc5-tip-01504-gb49eca1-dirty #1 [ 1093.680031] --------------------------------- [ 1093.680031] inconsistent {RECLAIM_FS-ON-W} -> {IN-RECLAIM_FS-W} usage. [ 1093.680031] kswapd0/308 [HC0[0]:SC0[0]:HE1:SE1] takes: [ 1093.680031] (&inode->inotify_mutex){+.+.?.}, at: [] inotify_inode_is_dead+0x20/0x80 [ 1093.680031] {RECLAIM_FS-ON-W} state was registered at: [ 1093.680031] [] mark_held_locks+0x43/0x5b [ 1093.680031] [] lockdep_trace_alloc+0x6c/0x6e [ 1093.680031] [] kmem_cache_alloc+0x20/0x150 [ 1093.680031] [] idr_pre_get+0x27/0x6c [ 1093.680031] [] inotify_handle_get_wd+0x25/0xad [ 1093.680031] [] inotify_add_watch+0x7a/0x129 [ 1093.680031] [] sys_inotify_add_watch+0x20f/0x250 [ 1093.680031] [] sysenter_do_call+0x12/0x35 [ 1093.680031] [] 0xffffffff [ 1093.680031] irq event stamp: 60417 [ 1093.680031] hardirqs last enabled at (60417): [] call_rcu+0x53/0x59 [ 1093.680031] hardirqs last disabled at (60416): [] call_rcu+0x17/0x59 [ 1093.680031] softirqs last enabled at (59656): [] __do_softirq+0x157/0x16b [ 1093.680031] softirqs last disabled at (59651): [] do_softirq+0x74/0x15d [ 1093.680031] [ 1093.680031] other info that might help us debug this: [ 1093.680031] 2 locks held by kswapd0/308: [ 1093.680031] #0: (shrinker_rwsem){++++..}, at: [] shrink_slab+0x36/0x189 [ 1093.680031] #1: (&type->s_umount_key#4){+++++.}, at: [] shrink_dcache_memory+0x110/0x1fb [ 1093.680031] [ 1093.680031] stack backtrace: [ 1093.680031] Pid: 308, comm: kswapd0 Not tainted 2.6.29-rc5-tip-01504-gb49eca1-dirty #1 [ 1093.680031] Call Trace: [ 1093.680031] [] valid_state+0x12a/0x13d [ 1093.680031] [] mark_lock+0xc1/0x1e9 [ 1093.680031] [] ? check_usage_forwards+0x0/0x3f [ 1093.680031] [] __lock_acquire+0x2c6/0xac8 [ 1093.680031] [] ? register_lock_class+0x17/0x228 [ 1093.680031] [] lock_acquire+0x5d/0x7a [ 1093.680031] [] ? inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] __mutex_lock_common+0x3a/0x4cb [ 1093.680031] [] ? inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] mutex_lock_nested+0x2e/0x36 [ 1093.680031] [] ? inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] inotify_inode_is_dead+0x20/0x80 [ 1093.680031] [] dentry_iput+0x90/0xc2 [ 1093.680031] [] d_kill+0x21/0x45 [ 1093.680031] [] __shrink_dcache_sb+0x27f/0x355 [ 1093.680031] [] shrink_dcache_memory+0x15e/0x1fb [ 1093.680031] [] shrink_slab+0x121/0x189 [ 1093.680031] [] kswapd+0x39f/0x561 [ 1093.680031] [] ? isolate_pages_global+0x0/0x233 [ 1093.680031] [] ? autoremove_wake_function+0x0/0x43 [ 1093.680031] [] ? kswapd+0x0/0x561 [ 1093.680031] [] kthread+0x41/0x82 [ 1093.680031] [] ? kthread+0x0/0x82 [ 1093.680031] [] kernel_thread_helper+0x7/0x10 inotify_handle_get_wd() does idr_pre_get() which does a kmem_cache_alloc() without __GFP_FS - and is hence deadlockable under extreme MM pressure. Signed-off-by: Ingo Molnar Acked-by: Peter Zijlstra Cc: MinChan Kim Cc: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/inotify/inotify.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/notify/inotify/inotify.c b/fs/notify/inotify/inotify.c index dae3f28f30d..331f2e88e28 100644 --- a/fs/notify/inotify/inotify.c +++ b/fs/notify/inotify/inotify.c @@ -156,7 +156,7 @@ static int inotify_handle_get_wd(struct inotify_handle *ih, int ret; do { - if (unlikely(!idr_pre_get(&ih->idr, GFP_KERNEL))) + if (unlikely(!idr_pre_get(&ih->idr, GFP_NOFS))) return -ENOSPC; ret = idr_get_new_above(&ih->idr, watch, ih->last_wd+1, &watch->wd); } while (ret == -EAGAIN); -- cgit v1.2.3 From 7fdf582447aa01658b624adc0a51a31e4278b68c Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Wed, 18 Feb 2009 15:41:28 -0600 Subject: Revert "[XFS] use scalable vmap API" This reverts commit 95f8e302c04c0b0c6de35ab399a5551605eeb006. This commit caused regression. We'll try to fix use of new vmap API for next release. Signed-off-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index d71dc44e21e..0b2177a9fbd 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -264,7 +264,7 @@ xfs_buf_free( uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - vm_unmap_ram(bp->b_addr - bp->b_offset, bp->b_page_count); + vunmap(bp->b_addr - bp->b_offset); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -386,8 +386,8 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1, PAGE_KERNEL); + bp->b_addr = vmap(bp->b_pages, bp->b_page_count, + VM_MAP, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) return -ENOMEM; bp->b_addr += bp->b_offset; -- cgit v1.2.3 From 27e88bf6af7d42adf790f7b2ed7d65475f191cf2 Mon Sep 17 00:00:00 2001 From: Felix Blyakher Date: Wed, 18 Feb 2009 15:56:51 -0600 Subject: Revert "[XFS] remove old vmap cache" This reverts commit d2859751cd0bf586941ffa7308635a293f943c17. This commit caused regression. We'll try to fix use of new vmap API for next release. Signed-off-by: Christoph Hellwig Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 75 +++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 74 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 0b2177a9fbd..cb329edc925 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -165,6 +165,75 @@ test_page_region( return (mask && (page_private(page) & mask) == mask); } +/* + * Mapping of multi-page buffers into contiguous virtual space + */ + +typedef struct a_list { + void *vm_addr; + struct a_list *next; +} a_list_t; + +static a_list_t *as_free_head; +static int as_list_len; +static DEFINE_SPINLOCK(as_lock); + +/* + * Try to batch vunmaps because they are costly. + */ +STATIC void +free_address( + void *addr) +{ + a_list_t *aentry; + +#ifdef CONFIG_XEN + /* + * Xen needs to be able to make sure it can get an exclusive + * RO mapping of pages it wants to turn into a pagetable. If + * a newly allocated page is also still being vmap()ed by xfs, + * it will cause pagetable construction to fail. This is a + * quick workaround to always eagerly unmap pages so that Xen + * is happy. + */ + vunmap(addr); + return; +#endif + + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); + if (likely(aentry)) { + spin_lock(&as_lock); + aentry->next = as_free_head; + aentry->vm_addr = addr; + as_free_head = aentry; + as_list_len++; + spin_unlock(&as_lock); + } else { + vunmap(addr); + } +} + +STATIC void +purge_addresses(void) +{ + a_list_t *aentry, *old; + + if (as_free_head == NULL) + return; + + spin_lock(&as_lock); + aentry = as_free_head; + as_free_head = NULL; + as_list_len = 0; + spin_unlock(&as_lock); + + while ((old = aentry) != NULL) { + vunmap(aentry->vm_addr); + aentry = aentry->next; + kfree(old); + } +} + /* * Internal xfs_buf_t object manipulation */ @@ -264,7 +333,7 @@ xfs_buf_free( uint i; if ((bp->b_flags & XBF_MAPPED) && (bp->b_page_count > 1)) - vunmap(bp->b_addr - bp->b_offset); + free_address(bp->b_addr - bp->b_offset); for (i = 0; i < bp->b_page_count; i++) { struct page *page = bp->b_pages[i]; @@ -386,6 +455,8 @@ _xfs_buf_map_pages( bp->b_addr = page_address(bp->b_pages[0]) + bp->b_offset; bp->b_flags |= XBF_MAPPED; } else if (flags & XBF_MAPPED) { + if (as_list_len > 64) + purge_addresses(); bp->b_addr = vmap(bp->b_pages, bp->b_page_count, VM_MAP, PAGE_KERNEL); if (unlikely(bp->b_addr == NULL)) @@ -1672,6 +1743,8 @@ xfsbufd( count++; } + if (as_list_len > 0) + purge_addresses(); if (count) blk_run_address_space(target->bt_mapping); -- cgit v1.2.3 From 2cfbd50b536c878e58ab3681c4e944fa3d99b415 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Fri, 20 Feb 2009 10:55:10 -0500 Subject: Btrfs: check file pointer in btrfs_sync_file fsync can be called by NFS with a null file pointer, and btrfs was oopsing in this case. Signed-off-by: Chris Mason --- fs/btrfs/file.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 3e8023efaff..872f104576e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1222,7 +1222,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) /* * ok we haven't committed the transaction yet, lets do a commit */ - if (file->private_data) + if (file && file->private_data) btrfs_ioctl_trans_end(file); trans = btrfs_start_transaction(root, 1); @@ -1231,7 +1231,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) goto out; } - ret = btrfs_log_dentry_safe(trans, root, file->f_dentry); + ret = btrfs_log_dentry_safe(trans, root, dentry); if (ret < 0) goto out; @@ -1245,7 +1245,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&file->f_dentry->d_inode->i_mutex); + mutex_unlock(&dentry->d_inode->i_mutex); if (ret > 0) { ret = btrfs_commit_transaction(trans, root); @@ -1253,7 +1253,7 @@ int btrfs_sync_file(struct file *file, struct dentry *dentry, int datasync) btrfs_sync_log(trans, root); ret = btrfs_end_transaction(trans, root); } - mutex_lock(&file->f_dentry->d_inode->i_mutex); + mutex_lock(&dentry->d_inode->i_mutex); out: return ret > 0 ? EIO : ret; } -- cgit v1.2.3 From 6a63209fc02d5483371f07e4913ee8abad608051 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 20 Feb 2009 11:00:09 -0500 Subject: Btrfs: add better -ENOSPC handling This is a step in the direction of better -ENOSPC handling. Instead of checking the global bytes counter we check the space_info bytes counters to make sure we have enough space. If we don't we go ahead and try to allocate a new chunk, and then if that fails we return -ENOSPC. This patch adds two counters to btrfs_space_info, bytes_delalloc and bytes_may_use. bytes_delalloc account for extents we've actually setup for delalloc and will be allocated at some point down the line. bytes_may_use is to keep track of how many bytes we may use for delalloc at some point. When we actually set the extent_bit for the delalloc bytes we subtract the reserved bytes from the bytes_may_use counter. This keeps us from not actually being able to allocate space for any delalloc bytes. Signed-off-by: Josef Bacik --- fs/btrfs/btrfs_inode.h | 8 ++ fs/btrfs/ctree.h | 40 ++++++--- fs/btrfs/extent-tree.c | 215 +++++++++++++++++++++++++++++++++++++++++++++---- fs/btrfs/file.c | 16 +++- fs/btrfs/inode.c | 62 ++++---------- fs/btrfs/ioctl.c | 6 +- 6 files changed, 271 insertions(+), 76 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index a8c9693b75a..72677ce2b74 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -66,6 +66,9 @@ struct btrfs_inode { */ struct list_head delalloc_inodes; + /* the space_info for where this inode's data allocations are done */ + struct btrfs_space_info *space_info; + /* full 64 bit generation number, struct vfs_inode doesn't have a big * enough field for this. */ @@ -94,6 +97,11 @@ struct btrfs_inode { */ u64 delalloc_bytes; + /* total number of bytes that may be used for this inode for + * delalloc + */ + u64 reserved_bytes; + /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 766b31ae318..82491ba8fa4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -596,13 +596,27 @@ struct btrfs_block_group_item { struct btrfs_space_info { u64 flags; - u64 total_bytes; - u64 bytes_used; - u64 bytes_pinned; - u64 bytes_reserved; - u64 bytes_readonly; - int full; - int force_alloc; + + u64 total_bytes; /* total bytes in the space */ + u64 bytes_used; /* total bytes used on disk */ + u64 bytes_pinned; /* total bytes pinned, will be freed when the + transaction finishes */ + u64 bytes_reserved; /* total bytes the allocator has reserved for + current allocations */ + u64 bytes_readonly; /* total bytes that are read only */ + + /* delalloc accounting */ + u64 bytes_delalloc; /* number of bytes reserved for allocation, + this space is not necessarily reserved yet + by the allocator */ + u64 bytes_may_use; /* number of bytes that may be used for + delalloc */ + + int full; /* indicates that we cannot allocate any more + chunks for this space */ + int force_alloc; /* set if we need to force a chunk alloc for + this space */ + struct list_head list; /* for block groups in our same type */ @@ -1782,6 +1796,16 @@ int btrfs_add_dead_reloc_root(struct btrfs_root *root); int btrfs_cleanup_reloc_trees(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +int btrfs_check_metadata_free_space(struct btrfs_root *root); +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct inode *inode, u64 bytes); +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes); /* ctree.c */ int btrfs_previous_item(struct btrfs_root *root, struct btrfs_path *path, u64 min_objectid, @@ -2027,8 +2051,6 @@ int btrfs_merge_bio_hook(struct page *page, unsigned long offset, unsigned long btrfs_force_ra(struct address_space *mapping, struct file_ra_state *ra, struct file *file, pgoff_t offset, pgoff_t last_index); -int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, - int for_del); int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page); int btrfs_readpage(struct file *file, struct page *page); void btrfs_delete_inode(struct inode *inode); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 0a5d796c9f7..e11875e97c2 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -60,6 +60,10 @@ static int update_block_group(struct btrfs_trans_handle *trans, u64 bytenr, u64 num_bytes, int alloc, int mark_free); +static int do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_root *extent_root, u64 alloc_bytes, + u64 flags, int force); + static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits) { return (cache->flags & bits) == bits; @@ -1909,6 +1913,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->bytes_pinned = 0; found->bytes_reserved = 0; found->bytes_readonly = 0; + found->bytes_delalloc = 0; found->full = 0; found->force_alloc = 0; *space_info = found; @@ -1972,6 +1977,196 @@ u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags) return flags; } +static u64 btrfs_get_alloc_profile(struct btrfs_root *root, u64 data) +{ + struct btrfs_fs_info *info = root->fs_info; + u64 alloc_profile; + + if (data) { + alloc_profile = info->avail_data_alloc_bits & + info->data_alloc_profile; + data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; + } else if (root == root->fs_info->chunk_root) { + alloc_profile = info->avail_system_alloc_bits & + info->system_alloc_profile; + data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; + } else { + alloc_profile = info->avail_metadata_alloc_bits & + info->metadata_alloc_profile; + data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; + } + + return btrfs_reduce_alloc_profile(root, data); +} + +void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *inode) +{ + u64 alloc_target; + + alloc_target = btrfs_get_alloc_profile(root, 1); + BTRFS_I(inode)->space_info = __find_space_info(root->fs_info, + alloc_target); +} + +/* + * for now this just makes sure we have at least 5% of our metadata space free + * for use. + */ +int btrfs_check_metadata_free_space(struct btrfs_root *root) +{ + struct btrfs_fs_info *info = root->fs_info; + struct btrfs_space_info *meta_sinfo; + u64 alloc_target, thresh; + + /* get the space info for where the metadata will live */ + alloc_target = btrfs_get_alloc_profile(root, 0); + meta_sinfo = __find_space_info(info, alloc_target); + + /* + * if the metadata area isn't maxed out then there is no sense in + * checking how much is used, since we can always allocate a new chunk + */ + if (!meta_sinfo->full) + return 0; + + spin_lock(&meta_sinfo->lock); + thresh = meta_sinfo->total_bytes * 95; + + do_div(thresh, 100); + + if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { + spin_unlock(&meta_sinfo->lock); + return -ENOSPC; + } + spin_unlock(&meta_sinfo->lock); + + return 0; +} + +/* + * This will check the space that the inode allocates from to make sure we have + * enough space for bytes. + */ +int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + int ret = 0; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; +again: + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + if (data_sinfo->total_bytes - data_sinfo->bytes_used - + data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - + data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - + data_sinfo->bytes_may_use < bytes) { + /* + * if we don't have enough free bytes in this space then we need + * to alloc a new chunk. + */ + if (!data_sinfo->full) { + u64 alloc_target; + struct btrfs_trans_handle *trans; + + data_sinfo->force_alloc = 1; + spin_unlock(&data_sinfo->lock); + + alloc_target = btrfs_get_alloc_profile(root, 1); + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + bytes + 2 * 1024 * 1024, + alloc_target, 0); + btrfs_end_transaction(trans, root); + if (ret) + return ret; + goto again; + } + spin_unlock(&data_sinfo->lock); + printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" + ", %llu bytes_used, %llu bytes_reserved, " + "%llu bytes_pinned, %llu bytes_readonly, %llu may use" + "%llu total\n", bytes, data_sinfo->bytes_delalloc, + data_sinfo->bytes_used, data_sinfo->bytes_reserved, + data_sinfo->bytes_pinned, data_sinfo->bytes_readonly, + data_sinfo->bytes_may_use, data_sinfo->total_bytes); + return -ENOSPC; + } + data_sinfo->bytes_may_use += bytes; + BTRFS_I(inode)->reserved_bytes += bytes; + spin_unlock(&data_sinfo->lock); + + return btrfs_check_metadata_free_space(root); +} + +/* + * if there was an error for whatever reason after calling + * btrfs_check_data_free_space, call this so we can cleanup the counters. + */ +void btrfs_free_reserved_data_space(struct btrfs_root *root, + struct inode *inode, u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* make sure bytes are sectorsize aligned */ + bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); + + data_sinfo = BTRFS_I(inode)->space_info; + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + spin_unlock(&data_sinfo->lock); +} + +/* called when we are adding a delalloc extent to the inode's io_tree */ +void btrfs_delalloc_reserve_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *data_sinfo; + + /* get the space info for where this inode will be storing its data */ + data_sinfo = BTRFS_I(inode)->space_info; + + /* make sure we have enough space to handle the data first */ + spin_lock(&data_sinfo->lock); + data_sinfo->bytes_delalloc += bytes; + + /* + * we are adding a delalloc extent without calling + * btrfs_check_data_free_space first. This happens on a weird + * writepage condition, but shouldn't hurt our accounting + */ + if (unlikely(bytes > BTRFS_I(inode)->reserved_bytes)) { + data_sinfo->bytes_may_use -= BTRFS_I(inode)->reserved_bytes; + BTRFS_I(inode)->reserved_bytes = 0; + } else { + data_sinfo->bytes_may_use -= bytes; + BTRFS_I(inode)->reserved_bytes -= bytes; + } + + spin_unlock(&data_sinfo->lock); +} + +/* called when we are clearing an delalloc extent from the inode's io_tree */ +void btrfs_delalloc_free_space(struct btrfs_root *root, struct inode *inode, + u64 bytes) +{ + struct btrfs_space_info *info; + + info = BTRFS_I(inode)->space_info; + + spin_lock(&info->lock); + info->bytes_delalloc -= bytes; + spin_unlock(&info->lock); +} + static int do_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 alloc_bytes, u64 flags, int force) @@ -3105,6 +3300,10 @@ static void dump_space_info(struct btrfs_space_info *info, u64 bytes) (unsigned long long)(info->total_bytes - info->bytes_used - info->bytes_pinned - info->bytes_reserved), (info->full) ? "" : "not "); + printk(KERN_INFO "space_info total=%llu, pinned=%llu, delalloc=%llu," + " may_use=%llu, used=%llu\n", info->total_bytes, + info->bytes_pinned, info->bytes_delalloc, info->bytes_may_use, + info->bytes_used); down_read(&info->groups_sem); list_for_each_entry(cache, &info->block_groups, list) { @@ -3131,24 +3330,10 @@ static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans, { int ret; u64 search_start = 0; - u64 alloc_profile; struct btrfs_fs_info *info = root->fs_info; - if (data) { - alloc_profile = info->avail_data_alloc_bits & - info->data_alloc_profile; - data = BTRFS_BLOCK_GROUP_DATA | alloc_profile; - } else if (root == root->fs_info->chunk_root) { - alloc_profile = info->avail_system_alloc_bits & - info->system_alloc_profile; - data = BTRFS_BLOCK_GROUP_SYSTEM | alloc_profile; - } else { - alloc_profile = info->avail_metadata_alloc_bits & - info->metadata_alloc_profile; - data = BTRFS_BLOCK_GROUP_METADATA | alloc_profile; - } + data = btrfs_get_alloc_profile(root, data); again: - data = btrfs_reduce_alloc_profile(root, data); /* * the only place that sets empty_size is btrfs_realloc_node, which * is not called recursively on allocations diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 872f104576e..dc78954861b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1091,19 +1091,24 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, WARN_ON(num_pages > nrptrs); memset(pages, 0, sizeof(struct page *) * nrptrs); - ret = btrfs_check_free_space(root, write_bytes, 0); + ret = btrfs_check_data_free_space(root, inode, write_bytes); if (ret) goto out; ret = prepare_pages(root, file, pages, num_pages, pos, first_index, last_index, write_bytes); - if (ret) + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); goto out; + } ret = btrfs_copy_from_user(pos, num_pages, write_bytes, pages, buf); if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); btrfs_drop_pages(pages, num_pages); goto out; } @@ -1111,8 +1116,11 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, ret = dirty_and_release_pages(NULL, root, file, pages, num_pages, pos, write_bytes); btrfs_drop_pages(pages, num_pages); - if (ret) + if (ret) { + btrfs_free_reserved_data_space(root, inode, + write_bytes); goto out; + } if (will_write) { btrfs_fdatawrite_range(inode->i_mapping, pos, @@ -1136,6 +1144,8 @@ static ssize_t btrfs_file_write(struct file *file, const char __user *buf, } out: mutex_unlock(&inode->i_mutex); + if (ret) + err = ret; out_nolock: kfree(pages); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 3cee77ae03c..7d4f948bc22 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -101,34 +101,6 @@ static int btrfs_init_inode_security(struct inode *inode, struct inode *dir) return err; } -/* - * a very lame attempt at stopping writes when the FS is 85% full. There - * are countless ways this is incorrect, but it is better than nothing. - */ -int btrfs_check_free_space(struct btrfs_root *root, u64 num_required, - int for_del) -{ - u64 total; - u64 used; - u64 thresh; - int ret = 0; - - spin_lock(&root->fs_info->delalloc_lock); - total = btrfs_super_total_bytes(&root->fs_info->super_copy); - used = btrfs_super_bytes_used(&root->fs_info->super_copy); - if (for_del) - thresh = total * 90; - else - thresh = total * 85; - - do_div(thresh, 100); - - if (used + root->fs_info->delalloc_bytes + num_required > thresh) - ret = -ENOSPC; - spin_unlock(&root->fs_info->delalloc_lock); - return ret; -} - /* * this does all the hard work for inserting an inline extent into * the btree. The caller should have done a btrfs_drop_extents so that @@ -1190,6 +1162,7 @@ static int btrfs_set_bit_hook(struct inode *inode, u64 start, u64 end, */ if (!(old & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; + btrfs_delalloc_reserve_space(root, inode, end - start + 1); spin_lock(&root->fs_info->delalloc_lock); BTRFS_I(inode)->delalloc_bytes += end - start + 1; root->fs_info->delalloc_bytes += end - start + 1; @@ -1223,9 +1196,12 @@ static int btrfs_clear_bit_hook(struct inode *inode, u64 start, u64 end, (unsigned long long)end - start + 1, (unsigned long long) root->fs_info->delalloc_bytes); + btrfs_delalloc_free_space(root, inode, (u64)-1); root->fs_info->delalloc_bytes = 0; BTRFS_I(inode)->delalloc_bytes = 0; } else { + btrfs_delalloc_free_space(root, inode, + end - start + 1); root->fs_info->delalloc_bytes -= end - start + 1; BTRFS_I(inode)->delalloc_bytes -= end - start + 1; } @@ -2245,10 +2221,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) root = BTRFS_I(dir)->root; - ret = btrfs_check_free_space(root, 1, 1); - if (ret) - goto fail; - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); @@ -2261,7 +2233,6 @@ static int btrfs_unlink(struct inode *dir, struct dentry *dentry) nr = trans->blocks_used; btrfs_end_transaction_throttle(trans, root); -fail: btrfs_btree_balance_dirty(root, nr); return ret; } @@ -2284,10 +2255,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) return -ENOTEMPTY; } - ret = btrfs_check_free_space(root, 1, 1); - if (ret) - goto fail; - trans = btrfs_start_transaction(root, 1); btrfs_set_trans_block_group(trans, dir); @@ -2304,7 +2271,6 @@ static int btrfs_rmdir(struct inode *dir, struct dentry *dentry) fail_trans: nr = trans->blocks_used; ret = btrfs_end_transaction_throttle(trans, root); -fail: btrfs_btree_balance_dirty(root, nr); if (ret && !err) @@ -2818,7 +2784,7 @@ int btrfs_cont_expand(struct inode *inode, loff_t size) if (size <= hole_start) return 0; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) return err; @@ -3014,6 +2980,7 @@ static noinline void init_btrfs_i(struct inode *inode) bi->last_trans = 0; bi->logged_trans = 0; bi->delalloc_bytes = 0; + bi->reserved_bytes = 0; bi->disk_i_size = 0; bi->flags = 0; bi->index_cnt = (u64)-1; @@ -3035,6 +3002,7 @@ static int btrfs_init_locked_inode(struct inode *inode, void *p) inode->i_ino = args->ino; init_btrfs_i(inode); BTRFS_I(inode)->root = args->root; + btrfs_set_inode_space_info(args->root, inode); return 0; } @@ -3455,6 +3423,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, BTRFS_I(inode)->index_cnt = 2; BTRFS_I(inode)->root = root; BTRFS_I(inode)->generation = trans->transid; + btrfs_set_inode_space_info(root, inode); if (mode & S_IFDIR) owner = 0; @@ -3602,7 +3571,7 @@ static int btrfs_mknod(struct inode *dir, struct dentry *dentry, if (!new_valid_dev(rdev)) return -EINVAL; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; @@ -3665,7 +3634,7 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, u64 objectid; u64 index = 0; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; trans = btrfs_start_transaction(root, 1); @@ -3733,7 +3702,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, return -ENOENT; btrfs_inc_nlink(inode); - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto fail; err = btrfs_set_inode_index(dir, &index); @@ -3779,7 +3748,7 @@ static int btrfs_mkdir(struct inode *dir, struct dentry *dentry, int mode) u64 index = 0; unsigned long nr = 1; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto out_unlock; @@ -4336,7 +4305,7 @@ int btrfs_page_mkwrite(struct vm_area_struct *vma, struct page *page) u64 page_start; u64 page_end; - ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0); + ret = btrfs_check_data_free_space(root, inode, PAGE_CACHE_SIZE); if (ret) goto out; @@ -4349,6 +4318,7 @@ again: if ((page->mapping != inode->i_mapping) || (page_start >= size)) { + btrfs_free_reserved_data_space(root, inode, PAGE_CACHE_SIZE); /* page got truncated out from underneath us */ goto out_unlock; } @@ -4631,7 +4601,7 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) return -EXDEV; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto out_unlock; @@ -4749,7 +4719,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(root)) return -ENAMETOOLONG; - err = btrfs_check_free_space(root, 1, 0); + err = btrfs_check_metadata_free_space(root); if (err) goto out_fail; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 988fdc8b49e..bca729fc80c 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -70,7 +70,7 @@ static noinline int create_subvol(struct btrfs_root *root, u64 index = 0; unsigned long nr = 1; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto fail_commit; @@ -203,7 +203,7 @@ static int create_snapshot(struct btrfs_root *root, struct dentry *dentry, if (!root->ref_cows) return -EINVAL; - ret = btrfs_check_free_space(root, 1, 0); + ret = btrfs_check_metadata_free_space(root); if (ret) goto fail_unlock; @@ -374,7 +374,7 @@ static int btrfs_defrag_file(struct file *file) unsigned long i; int ret; - ret = btrfs_check_free_space(root, inode->i_size, 0); + ret = btrfs_check_data_free_space(root, inode, inode->i_size); if (ret) return -ENOSPC; -- cgit v1.2.3 From 4e06bdd6cbd5105376e7caf4e683ed131e777389 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 20 Feb 2009 10:59:53 -0500 Subject: Btrfs: try committing transaction before returning ENOSPC This fixes a problem where we could return -ENOSPC when we may actually have plenty of space, the space is just pinned. Instead of returning -ENOSPC immediately, commit the transaction first and then try and do the allocation again. This patch also does chunk allocation for metadata if we pass the 80% threshold for metadata space. This will help with stack usage since the chunk allocation will happen early on, instead of when the allocation is happening. Signed-off-by: Josef Bacik --- fs/btrfs/extent-tree.c | 57 +++++++++++++++++++++++++++++++++++++++++--------- 1 file changed, 47 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index e11875e97c2..6b5966aacf4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2017,26 +2017,49 @@ int btrfs_check_metadata_free_space(struct btrfs_root *root) struct btrfs_fs_info *info = root->fs_info; struct btrfs_space_info *meta_sinfo; u64 alloc_target, thresh; + int committed = 0, ret; /* get the space info for where the metadata will live */ alloc_target = btrfs_get_alloc_profile(root, 0); meta_sinfo = __find_space_info(info, alloc_target); - /* - * if the metadata area isn't maxed out then there is no sense in - * checking how much is used, since we can always allocate a new chunk - */ - if (!meta_sinfo->full) - return 0; - +again: spin_lock(&meta_sinfo->lock); - thresh = meta_sinfo->total_bytes * 95; + if (!meta_sinfo->full) + thresh = meta_sinfo->total_bytes * 80; + else + thresh = meta_sinfo->total_bytes * 95; do_div(thresh, 100); if (meta_sinfo->bytes_used + meta_sinfo->bytes_reserved + meta_sinfo->bytes_pinned + meta_sinfo->bytes_readonly > thresh) { + struct btrfs_trans_handle *trans; + if (!meta_sinfo->full) { + meta_sinfo->force_alloc = 1; + spin_unlock(&meta_sinfo->lock); + + trans = btrfs_start_transaction(root, 1); + if (!trans) + return -ENOMEM; + + ret = do_chunk_alloc(trans, root->fs_info->extent_root, + 2 * 1024 * 1024, alloc_target, 0); + btrfs_end_transaction(trans, root); + goto again; + } spin_unlock(&meta_sinfo->lock); + + if (!committed) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) + return -ENOMEM; + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } return -ENOSPC; } spin_unlock(&meta_sinfo->lock); @@ -2052,7 +2075,7 @@ int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes) { struct btrfs_space_info *data_sinfo; - int ret = 0; + int ret = 0, committed = 0; /* make sure bytes are sectorsize aligned */ bytes = (bytes + root->sectorsize - 1) & ~((u64)root->sectorsize - 1); @@ -2065,13 +2088,14 @@ again: data_sinfo->bytes_delalloc - data_sinfo->bytes_reserved - data_sinfo->bytes_pinned - data_sinfo->bytes_readonly - data_sinfo->bytes_may_use < bytes) { + struct btrfs_trans_handle *trans; + /* * if we don't have enough free bytes in this space then we need * to alloc a new chunk. */ if (!data_sinfo->full) { u64 alloc_target; - struct btrfs_trans_handle *trans; data_sinfo->force_alloc = 1; spin_unlock(&data_sinfo->lock); @@ -2090,6 +2114,19 @@ again: goto again; } spin_unlock(&data_sinfo->lock); + + /* commit the current transaction and try again */ + if (!committed) { + committed = 1; + trans = btrfs_join_transaction(root, 1); + if (!trans) + return -ENOMEM; + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + goto again; + } + printk(KERN_ERR "no space left, need %llu, %llu delalloc bytes" ", %llu bytes_used, %llu bytes_reserved, " "%llu bytes_pinned, %llu bytes_readonly, %llu may use" -- cgit v1.2.3 From e4cce94c9c8797b08faf6a79396df4d175e377fa Mon Sep 17 00:00:00 2001 From: Igor Mammedov Date: Tue, 10 Feb 2009 14:10:26 +0300 Subject: [CIFS] Prevent OOPs when mounting with remote prefixpath. Fixes OOPs with message 'kernel BUG at fs/cifs/cifs_dfs_ref.c:274!'. Checks if the prefixpath in an accesible while we are still in cifs_mount and fails with reporting a error if we can't access the prefixpath Should fix Samba bugs 6086 and 5861 and kernel bug 12192 Signed-off-by: Igor Mammedov Acked-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 1 + fs/cifs/connect.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ fs/cifs/inode.c | 4 ++-- 3 files changed, 48 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 382ba629880..ec9f9c1c7d8 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -42,6 +42,7 @@ extern void _FreeXid(unsigned int); #define GetXid() (int)_GetXid(); cFYI(1,("CIFS VFS: in %s as Xid: %d with uid: %d",__func__, xid,current_fsuid())); #define FreeXid(curr_xid) {_FreeXid(curr_xid); cFYI(1,("CIFS VFS: leaving %s (xid = %d) rc = %d",__func__,curr_xid,(int)rc));} extern char *build_path_from_dentry(struct dentry *); +extern char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb); extern char *build_wildcard_path_from_dentry(struct dentry *direntry); /* extern void renew_parental_timestamps(struct dentry *direntry);*/ extern int SendReceive(const unsigned int /* xid */ , struct cifsSesInfo *, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 005df85219a..da0f4ffa061 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -2180,6 +2180,33 @@ static void setup_cifs_sb(struct smb_vol *pvolume_info, "mount option supported")); } +static int +is_path_accessible(int xid, struct cifsTconInfo *tcon, + struct cifs_sb_info *cifs_sb, const char *full_path) +{ + int rc; + __u64 inode_num; + FILE_ALL_INFO *pfile_info; + + rc = CIFSGetSrvInodeNumber(xid, tcon, full_path, &inode_num, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc != -EOPNOTSUPP) + return rc; + + pfile_info = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); + if (pfile_info == NULL) + return -ENOMEM; + + rc = CIFSSMBQPathInfo(xid, tcon, full_path, pfile_info, + 0 /* not legacy */, cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + kfree(pfile_info); + return rc; +} + int cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, char *mount_data, const char *devname) @@ -2190,6 +2217,7 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, struct cifsSesInfo *pSesInfo = NULL; struct cifsTconInfo *tcon = NULL; struct TCP_Server_Info *srvTcp = NULL; + char *full_path; xid = GetXid(); @@ -2426,6 +2454,23 @@ mount_fail_check: cifs_sb->rsize = min(cifs_sb->rsize, (tcon->ses->server->maxBuf - MAX_CIFS_HDR_SIZE)); + if (!rc && cifs_sb->prepathlen) { + /* build_path_to_root works only when we have a valid tcon */ + full_path = cifs_build_path_to_root(cifs_sb); + if (full_path == NULL) { + rc = -ENOMEM; + goto mount_fail_check; + } + rc = is_path_accessible(xid, tcon, cifs_sb, full_path); + if (rc) { + cERROR(1, ("Path %s in not accessible: %d", + full_path, rc)); + kfree(full_path); + goto mount_fail_check; + } + kfree(full_path); + } + /* volume_info->password is freed above when existing session found (in which case it is not needed anymore) but when new sesion is created the password ptr is put in the new session structure (in which case the diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index bcf7b518466..7342bfb02ae 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -621,7 +621,7 @@ static const struct inode_operations cifs_ipc_inode_ops = { .lookup = cifs_lookup, }; -static char *build_path_to_root(struct cifs_sb_info *cifs_sb) +char *cifs_build_path_to_root(struct cifs_sb_info *cifs_sb) { int pplen = cifs_sb->prepathlen; int dfsplen; @@ -678,7 +678,7 @@ struct inode *cifs_iget(struct super_block *sb, unsigned long ino) return inode; cifs_sb = CIFS_SB(inode->i_sb); - full_path = build_path_to_root(cifs_sb); + full_path = cifs_build_path_to_root(cifs_sb); if (full_path == NULL) return ERR_PTR(-ENOMEM); -- cgit v1.2.3 From 132ac7b77cc95a22d6118d327c96586759fbf006 Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 10 Feb 2009 07:33:57 -0500 Subject: cifs: refactor new_inode() calls and inode initialization Move new inode creation into a separate routine and refactor the callers to take advantage of it. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 ++ fs/cifs/inode.c | 96 +++++++++++++++++++++++++++++++++-------------------- fs/cifs/readdir.c | 54 ++++++++++++++---------------- 3 files changed, 86 insertions(+), 66 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index ec9f9c1c7d8..62fd5bd499f 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -92,6 +92,8 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); +extern struct inode *cifs_new_inode(struct super_block *sb, + unsigned long *inum); extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, FILE_ALL_INFO *pfile_info, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7342bfb02ae..c7674f595ad 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -199,6 +199,49 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat, pfnd_dat->Gid = cpu_to_le64(pinode->i_gid); } +/** + * cifs_new inode - create new inode, initialize, and hash it + * @sb - pointer to superblock + * @inum - if valid pointer and serverino is enabled, replace i_ino with val + * + * Create a new inode, initialize it for CIFS and hash it. Returns the new + * inode or NULL if one couldn't be allocated. + * + * If the share isn't mounted with "serverino" or inum is a NULL pointer then + * we'll just use the inode number assigned by new_inode(). Note that this can + * mean i_ino collisions since the i_ino assigned by new_inode is not + * guaranteed to be unique. + */ +struct inode * +cifs_new_inode(struct super_block *sb, unsigned long *inum) +{ + struct inode *inode; + + inode = new_inode(sb); + if (inode == NULL) + return NULL; + + /* + * BB: Is i_ino == 0 legal? Here, we assume that it is. If it isn't we + * stop passing inum as ptr. Are there sanity checks we can use to + * ensure that the server is really filling in that field? Also, + * if serverino is disabled, perhaps we should be using iunique()? + */ + if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) + inode->i_ino = *inum; + + /* + * must set this here instead of cifs_alloc_inode since VFS will + * clobber i_flags + */ + if (sb->s_flags & MS_NOATIME) + inode->i_flags |= S_NOATIME | S_NOCMTIME; + + insert_inode_hash(inode); + + return inode; +} + int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *full_path, struct super_block *sb, int xid) { @@ -233,22 +276,12 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = new_inode(sb); + *pinode = cifs_new_inode(sb, (unsigned long *) + &find_data.UniqueId); if (*pinode == NULL) { rc = -ENOMEM; goto cgiiu_exit; } - /* Is an i_ino of zero legal? */ - /* note ino incremented to unique num in new_inode */ - /* Are there sanity checks we can use to ensure that - the server is really filling in that field? */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - (*pinode)->i_ino = (unsigned long)find_data.UniqueId; - - if (sb->s_flags & MS_NOATIME) - (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; - - insert_inode_hash(*pinode); } inode = *pinode; @@ -465,11 +498,8 @@ int cifs_get_inode_info(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = new_inode(sb); - if (*pinode == NULL) { - rc = -ENOMEM; - goto cgii_exit; - } + __u64 inode_num; + /* Is an i_ino of zero legal? Can we use that to check if the server supports returning inode numbers? Are there other sanity checks we can use to ensure that @@ -486,7 +516,6 @@ int cifs_get_inode_info(struct inode **pinode, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { int rc1 = 0; - __u64 inode_num; rc1 = CIFSGetSrvInodeNumber(xid, pTcon, full_path, &inode_num, @@ -496,12 +525,17 @@ int cifs_get_inode_info(struct inode **pinode, if (rc1) { cFYI(1, ("GetSrvInodeNum rc %d", rc1)); /* BB EOPNOSUPP disable SERVER_INUM? */ - } else /* do we need cast or hash to ino? */ - (*pinode)->i_ino = inode_num; - } /* else ino incremented to unique num in new_inode*/ - if (sb->s_flags & MS_NOATIME) - (*pinode)->i_flags |= S_NOATIME | S_NOCMTIME; - insert_inode_hash(*pinode); + } + *pinode = cifs_new_inode(sb, (unsigned long *) + &inode_num); + } else { + *pinode = cifs_new_inode(sb, NULL); + } + + if (*pinode == NULL) { + rc = -ENOMEM; + goto cgii_exit; + } } inode = *pinode; cifsInfo = CIFS_I(inode); @@ -1114,24 +1148,14 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) else direntry->d_op = &cifs_dentry_ops; - newinode = new_inode(inode->i_sb); + newinode = cifs_new_inode(inode->i_sb, (unsigned long *) + &pInfo->UniqueId); if (newinode == NULL) { kfree(pInfo); goto mkdir_get_info; } - /* Is an i_ino of zero legal? */ - /* Are there sanity checks we can use to ensure that - the server is really filling in that field? */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) { - newinode->i_ino = - (unsigned long)pInfo->UniqueId; - } /* note ino incremented to unique num in new_inode */ - if (inode->i_sb->s_flags & MS_NOATIME) - newinode->i_flags |= S_NOATIME | S_NOCMTIME; newinode->i_nlink = 2; - - insert_inode_hash(newinode); d_instantiate(direntry, newinode); /* we already checked in POSIXCreate whether diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 9f51f9bf029..02a20221e84 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -56,35 +56,34 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) } #endif /* DEBUG2 */ -/* Returns one if new inode created (which therefore needs to be hashed) */ +/* Returns 1 if new inode created, 2 if both dentry and inode were */ /* Might check in the future if inode number changed so we can rehash inode */ -static int construct_dentry(struct qstr *qstring, struct file *file, - struct inode **ptmp_inode, struct dentry **pnew_dentry) +static int +construct_dentry(struct qstr *qstring, struct file *file, + struct inode **ptmp_inode, struct dentry **pnew_dentry, + unsigned long *inum) { - struct dentry *tmp_dentry; - struct cifs_sb_info *cifs_sb; - struct cifsTconInfo *pTcon; + struct dentry *tmp_dentry = NULL; + struct super_block *sb = file->f_path.dentry->d_sb; int rc = 0; cFYI(1, ("For %s", qstring->name)); - cifs_sb = CIFS_SB(file->f_path.dentry->d_sb); - pTcon = cifs_sb->tcon; qstring->hash = full_name_hash(qstring->name, qstring->len); tmp_dentry = d_lookup(file->f_path.dentry, qstring); if (tmp_dentry) { + /* BB: overwrite old name? i.e. tmp_dentry->d_name and + * tmp_dentry->d_name.len?? + */ cFYI(0, ("existing dentry with inode 0x%p", tmp_dentry->d_inode)); *ptmp_inode = tmp_dentry->d_inode; -/* BB overwrite old name? i.e. tmp_dentry->d_name and tmp_dentry->d_name.len??*/ if (*ptmp_inode == NULL) { - *ptmp_inode = new_inode(file->f_path.dentry->d_sb); + *ptmp_inode = cifs_new_inode(sb, inum); if (*ptmp_inode == NULL) return rc; rc = 1; } - if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME) - (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME; } else { tmp_dentry = d_alloc(file->f_path.dentry, qstring); if (tmp_dentry == NULL) { @@ -93,15 +92,14 @@ static int construct_dentry(struct qstr *qstring, struct file *file, return rc; } - *ptmp_inode = new_inode(file->f_path.dentry->d_sb); - if (pTcon->nocase) + if (CIFS_SB(sb)->tcon->nocase) tmp_dentry->d_op = &cifs_ci_dentry_ops; else tmp_dentry->d_op = &cifs_dentry_ops; + + *ptmp_inode = cifs_new_inode(sb, inum); if (*ptmp_inode == NULL) return rc; - if (file->f_path.dentry->d_sb->s_flags & MS_NOATIME) - (*ptmp_inode)->i_flags |= S_NOATIME | S_NOCMTIME; rc = 2; } @@ -842,9 +840,7 @@ static int cifs_get_name_from_search_buf(struct qstr *pqst, len = strnlen(filename, PATH_MAX); } - /* BB fixme - hash low and high 32 bits if not 64 bit arch BB */ - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) - *pinum = pFindData->UniqueId; + *pinum = pFindData->UniqueId; } else if (level == SMB_FIND_FILE_DIRECTORY_INFO) { FILE_DIRECTORY_INFO *pFindData = (FILE_DIRECTORY_INFO *)current_entry; @@ -940,20 +936,18 @@ static int cifs_filldir(char *pfindEntry, struct file *file, if (rc) return rc; - rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry); + /* only these two infolevels return valid inode numbers */ + if (pCifsF->srch_inf.info_level == SMB_FIND_FILE_UNIX || + pCifsF->srch_inf.info_level == SMB_FIND_FILE_ID_FULL_DIR_INFO) + rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry, + &inum); + else + rc = construct_dentry(&qstring, file, &tmp_inode, &tmp_dentry, + NULL); + if ((tmp_inode == NULL) || (tmp_dentry == NULL)) return -ENOMEM; - if (rc) { - /* inode created, we need to hash it with right inode number */ - if (inum != 0) { - /* BB fixme - hash the 2 32 quantities bits together if - * necessary BB */ - tmp_inode->i_ino = inum; - } - insert_inode_hash(tmp_inode); - } - /* we pass in rc below, indicating whether it is a new inode, so we can figure out whether to invalidate the inode cached data if the file has changed */ -- cgit v1.2.3 From 950ec52880fab89b957c7dc45e8b8476dd63741f Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Feb 2009 08:08:26 -0500 Subject: cifs: properly handle case where CIFSGetSrvInodeNumber fails ...if it does then we pass a pointer to an unintialized variable for the inode number to cifs_new_inode. Have it pass a NULL pointer instead. Also tweak the function prototypes to reduce the amount of casting. Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 3 +-- fs/cifs/inode.c | 20 ++++++++++---------- fs/cifs/readdir.c | 6 +++--- 3 files changed, 14 insertions(+), 15 deletions(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 62fd5bd499f..446e62cbece 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -92,8 +92,7 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); -extern struct inode *cifs_new_inode(struct super_block *sb, - unsigned long *inum); +extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum); extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, FILE_ALL_INFO *pfile_info, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index c7674f595ad..475115c7cc7 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -213,7 +213,7 @@ static void fill_fake_finddataunix(FILE_UNIX_BASIC_INFO *pfnd_dat, * guaranteed to be unique. */ struct inode * -cifs_new_inode(struct super_block *sb, unsigned long *inum) +cifs_new_inode(struct super_block *sb, __u64 *inum) { struct inode *inode; @@ -228,7 +228,7 @@ cifs_new_inode(struct super_block *sb, unsigned long *inum) * if serverino is disabled, perhaps we should be using iunique()? */ if (inum && (CIFS_SB(sb)->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM)) - inode->i_ino = *inum; + inode->i_ino = (unsigned long) *inum; /* * must set this here instead of cifs_alloc_inode since VFS will @@ -276,8 +276,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { - *pinode = cifs_new_inode(sb, (unsigned long *) - &find_data.UniqueId); + *pinode = cifs_new_inode(sb, &find_data.UniqueId); if (*pinode == NULL) { rc = -ENOMEM; goto cgiiu_exit; @@ -499,6 +498,7 @@ int cifs_get_inode_info(struct inode **pinode, /* get new inode */ if (*pinode == NULL) { __u64 inode_num; + __u64 *pinum = &inode_num; /* Is an i_ino of zero legal? Can we use that to check if the server supports returning inode numbers? Are @@ -518,20 +518,20 @@ int cifs_get_inode_info(struct inode **pinode, int rc1 = 0; rc1 = CIFSGetSrvInodeNumber(xid, pTcon, - full_path, &inode_num, + full_path, pinum, cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); if (rc1) { cFYI(1, ("GetSrvInodeNum rc %d", rc1)); + pinum = NULL; /* BB EOPNOSUPP disable SERVER_INUM? */ } - *pinode = cifs_new_inode(sb, (unsigned long *) - &inode_num); } else { - *pinode = cifs_new_inode(sb, NULL); + pinum = NULL; } + *pinode = cifs_new_inode(sb, pinum); if (*pinode == NULL) { rc = -ENOMEM; goto cgii_exit; @@ -1148,8 +1148,8 @@ int cifs_mkdir(struct inode *inode, struct dentry *direntry, int mode) else direntry->d_op = &cifs_dentry_ops; - newinode = cifs_new_inode(inode->i_sb, (unsigned long *) - &pInfo->UniqueId); + newinode = cifs_new_inode(inode->i_sb, + &pInfo->UniqueId); if (newinode == NULL) { kfree(pInfo); goto mkdir_get_info; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 02a20221e84..c2c01ff4c32 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -61,7 +61,7 @@ static inline void dump_cifs_file_struct(struct file *file, char *label) static int construct_dentry(struct qstr *qstring, struct file *file, struct inode **ptmp_inode, struct dentry **pnew_dentry, - unsigned long *inum) + __u64 *inum) { struct dentry *tmp_dentry = NULL; struct super_block *sb = file->f_path.dentry->d_sb; @@ -820,7 +820,7 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, /* inode num, inode type and filename returned */ static int cifs_get_name_from_search_buf(struct qstr *pqst, char *current_entry, __u16 level, unsigned int unicode, - struct cifs_sb_info *cifs_sb, int max_len, ino_t *pinum) + struct cifs_sb_info *cifs_sb, int max_len, __u64 *pinum) { int rc = 0; unsigned int len = 0; @@ -903,7 +903,7 @@ static int cifs_filldir(char *pfindEntry, struct file *file, struct qstr qstring; struct cifsFileInfo *pCifsF; unsigned int obj_type; - ino_t inum; + __u64 inum; struct cifs_sb_info *cifs_sb; struct inode *tmp_inode; struct dentry *tmp_dentry; -- cgit v1.2.3 From 44f68fadd865bb288ebdcea2b602f0b1cab27a0c Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Wed, 11 Feb 2009 08:08:28 -0500 Subject: cifs: posix fill in inode needed by posix open function needed to prepare for posix open Signed-off-by: Jeff Layton Signed-off-by: Steve French --- fs/cifs/cifsproto.h | 2 ++ fs/cifs/inode.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 446e62cbece..083dfc57c7a 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -92,6 +92,8 @@ extern u64 cifs_UnixTimeToNT(struct timespec); extern __le64 cnvrtDosCifsTm(__u16 date, __u16 time); extern struct timespec cnvrtDosUnixTm(__u16 date, __u16 time); +extern void posix_fill_in_inode(struct inode *tmp_inode, + FILE_UNIX_BASIC_INFO *pData, int isNewInode); extern struct inode *cifs_new_inode(struct super_block *sb, __u64 *inum); extern int cifs_get_inode_info(struct inode **pinode, const unsigned char *search_path, diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 475115c7cc7..4690a360c85 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -1051,7 +1051,7 @@ out_reval: return rc; } -static void posix_fill_in_inode(struct inode *tmp_inode, +void posix_fill_in_inode(struct inode *tmp_inode, FILE_UNIX_BASIC_INFO *pData, int isNewInode) { struct cifsInodeInfo *cifsInfo = CIFS_I(tmp_inode); -- cgit v1.2.3 From 69765529d701c838df19ea1f5ad2f33a528261ae Mon Sep 17 00:00:00 2001 From: Steve French Date: Tue, 17 Feb 2009 01:29:40 +0000 Subject: [CIFS] Fix oops in cifs_strfromUCS_le mounting to servers which do not specify their OS Fixes kernel bug #10451 http://bugzilla.kernel.org/show_bug.cgi?id=10451 Certain NAS appliances do not set the operating system or network operating system fields in the session setup response on the wire. cifs was oopsing on the unexpected zero length response fields (when trying to null terminate a zero length field). This fixes the oops. Acked-by: Jeff Layton CC: stable Signed-off-by: Steve French --- fs/cifs/CHANGES | 3 ++- fs/cifs/sess.c | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 73ac7ebd1df..1cfa72ef1f3 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -7,7 +7,8 @@ specified and user does not have access to query information about the top of the share. Fix problem in 2.6.28 resolving DFS paths to Samba servers (worked to Windows). Fix rmdir so that pending search (readdir) requests do not get invalid results which include the now -removed directory. +removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable +when using DFS. Version 1.55 ------------ diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 5f22de7b79a..b234407a300 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -228,7 +228,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, kfree(ses->serverOS); /* UTF-8 string will not grow more than four times as big as UCS-16 */ - ses->serverOS = kzalloc(4 * len, GFP_KERNEL); + ses->serverOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); if (ses->serverOS != NULL) cifs_strfromUCS_le(ses->serverOS, (__le16 *)data, len, nls_cp); data += 2 * (len + 1); @@ -241,7 +241,7 @@ static int decode_unicode_ssetup(char **pbcc_area, int bleft, return rc; kfree(ses->serverNOS); - ses->serverNOS = kzalloc(4 * len, GFP_KERNEL); /* BB this is wrong length FIXME BB */ + ses->serverNOS = kzalloc((4 * len) + 2 /* trailing null */, GFP_KERNEL); if (ses->serverNOS != NULL) { cifs_strfromUCS_le(ses->serverNOS, (__le16 *)data, len, nls_cp); -- cgit v1.2.3 From c3b2a0c640bff7df85d79fb4f89674949a267ec2 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 20 Feb 2009 04:32:45 +0000 Subject: [CIFS] improve posix semantics of file create Samba server added support for a new posix open/create/mkdir operation a year or so ago, and we added support to cifs for mkdir to use it, but had not added the corresponding code to file create. The following patch helps improve the performance of the cifs create path (to Samba and servers which support the cifs posix protocol extensions). Using Connectathon basic test1, with 2000 files, the performance improved about 15%, and also helped reduce network traffic (17% fewer SMBs sent over the wire) due to saving a network round trip for the SetPathInfo on every file create. It should also help the semantics (and probably the performance) of write (e.g. when posix byte range locks are on the file) on file handles opened with posix create, and adds support for a few flags which would have to be ignored otherwise. Signed-off-by: Steve French --- fs/cifs/CHANGES | 4 +- fs/cifs/dir.c | 307 +++++++++++++++++++++++++++++++++++++------------------- 2 files changed, 208 insertions(+), 103 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 1cfa72ef1f3..72063f5e56b 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -8,7 +8,9 @@ top of the share. Fix problem in 2.6.28 resolving DFS paths to Samba servers (worked to Windows). Fix rmdir so that pending search (readdir) requests do not get invalid results which include the now removed directory. Fix oops in cifs_dfs_ref.c when prefixpath is not reachable -when using DFS. +when using DFS. Add better file create support to servers which support +the CIFS POSIX protocol extensions (this adds support for new flags +on create, and improves semantics for write of locked ranges). Version 1.55 ------------ diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 964aad03c5a..89fb7283265 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -3,7 +3,7 @@ * * vfs operations that deal with dentries * - * Copyright (C) International Business Machines Corp., 2002,2008 + * Copyright (C) International Business Machines Corp., 2002,2009 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -129,6 +129,78 @@ cifs_bp_rename_retry: return full_path; } +static int cifs_posix_open(char *full_path, struct inode **pinode, + struct super_block *sb, int mode, int oflags, + int *poplock, __u16 *pnetfid, int xid) +{ + int rc; + __u32 oplock; + FILE_UNIX_BASIC_INFO *presp_data; + __u32 posix_flags = 0; + struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + + cFYI(1, ("posix open %s", full_path)); + + presp_data = kzalloc(sizeof(FILE_UNIX_BASIC_INFO), GFP_KERNEL); + if (presp_data == NULL) + return -ENOMEM; + +/* So far cifs posix extensions can only map the following flags. + There are other valid fmode oflags such as FMODE_LSEEK, FMODE_PREAD, but + so far we do not seem to need them, and we can treat them as local only */ + if ((oflags & (FMODE_READ | FMODE_WRITE)) == + (FMODE_READ | FMODE_WRITE)) + posix_flags = SMB_O_RDWR; + else if (oflags & FMODE_READ) + posix_flags = SMB_O_RDONLY; + else if (oflags & FMODE_WRITE) + posix_flags = SMB_O_WRONLY; + if (oflags & O_CREAT) + posix_flags |= SMB_O_CREAT; + if (oflags & O_EXCL) + posix_flags |= SMB_O_EXCL; + if (oflags & O_TRUNC) + posix_flags |= SMB_O_TRUNC; + if (oflags & O_APPEND) + posix_flags |= SMB_O_APPEND; + if (oflags & O_SYNC) + posix_flags |= SMB_O_SYNC; + if (oflags & O_DIRECTORY) + posix_flags |= SMB_O_DIRECTORY; + if (oflags & O_NOFOLLOW) + posix_flags |= SMB_O_NOFOLLOW; + if (oflags & O_DIRECT) + posix_flags |= SMB_O_DIRECT; + + + rc = CIFSPOSIXCreate(xid, cifs_sb->tcon, posix_flags, mode, + pnetfid, presp_data, &oplock, full_path, + cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & + CIFS_MOUNT_MAP_SPECIAL_CHR); + if (rc) + goto posix_open_ret; + + if (presp_data->Type == cpu_to_le32(-1)) + goto posix_open_ret; /* open ok, caller does qpathinfo */ + + /* get new inode and set it up */ + if (!pinode) + goto posix_open_ret; /* caller does not need info */ + + *pinode = cifs_new_inode(sb, &presp_data->UniqueId); + + /* We do not need to close the file if new_inode fails since + the caller will retry qpathinfo as long as inode is null */ + if (*pinode == NULL) + goto posix_open_ret; + + posix_fill_in_inode(*pinode, presp_data, 1); + +posix_open_ret: + kfree(presp_data); + return rc; +} + static void setup_cifs_dentry(struct cifsTconInfo *tcon, struct dentry *direntry, struct inode *newinode) @@ -150,7 +222,14 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, int xid; int create_options = CREATE_NOT_DIR; int oplock = 0; - /* BB below access is too much for the mknod to request */ + int oflags; + /* + * BB below access is probably too much for mknod to request + * but we have to do query and setpathinfo so requesting + * less could fail (unless we want to request getatr and setatr + * permissions (only). At least for POSIX we do not have to + * request so much. + */ int desiredAccess = GENERIC_READ | GENERIC_WRITE; __u16 fileHandle; struct cifs_sb_info *cifs_sb; @@ -174,13 +253,43 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, } mode &= ~current->fs->umask; + if (oplockEnabled) + oplock = REQ_OPLOCK; - if (nd && (nd->flags & LOOKUP_OPEN)) { - int oflags = nd->intent.open.flags; + if (nd && (nd->flags & LOOKUP_OPEN)) + oflags = nd->intent.open.flags; + else + oflags = FMODE_READ; + + if (tcon->unix_ext && (tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_POSIX_PATH_OPS_CAP & + le64_to_cpu(tcon->fsUnixInfo.Capability))) { + rc = cifs_posix_open(full_path, &newinode, inode->i_sb, + mode, oflags, &oplock, &fileHandle, xid); + /* EIO could indicate that (posix open) operation is not + supported, despite what server claimed in capability + negotation. EREMOTE indicates DFS junction, which is not + handled in posix open */ + + if ((rc == 0) && (newinode == NULL)) + goto cifs_create_get_file_info; /* query inode info */ + else if (rc == 0) /* success, no need to query */ + goto cifs_create_set_dentry; + else if ((rc != -EIO) && (rc != -EREMOTE) && + (rc != -EOPNOTSUPP)) /* path not found or net err */ + goto cifs_create_out; + /* else fallthrough to retry, using older open call, this is + case where server does not support this SMB level, and + falsely claims capability (also get here for DFS case + which should be rare for path not covered on files) */ + } + if (nd && (nd->flags & LOOKUP_OPEN)) { + /* if the file is going to stay open, then we + need to set the desired access properly */ desiredAccess = 0; if (oflags & FMODE_READ) - desiredAccess |= GENERIC_READ; + desiredAccess |= GENERIC_READ; /* is this too little? */ if (oflags & FMODE_WRITE) { desiredAccess |= GENERIC_WRITE; if (!(oflags & FMODE_READ)) @@ -199,8 +308,6 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, /* BB add processing to set equivalent of mode - e.g. via CreateX with ACLs */ - if (oplockEnabled) - oplock = REQ_OPLOCK; buf = kmalloc(sizeof(FILE_ALL_INFO), GFP_KERNEL); if (buf == NULL) { @@ -233,116 +340,112 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, } if (rc) { cFYI(1, ("cifs_create returned 0x%x", rc)); - } else { - /* If Open reported that we actually created a file - then we now have to set the mode if possible */ - if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { - struct cifs_unix_set_info_args args = { + goto cifs_create_out; + } + + /* If Open reported that we actually created a file + then we now have to set the mode if possible */ + if ((tcon->unix_ext) && (oplock & CIFS_CREATE_ACTION)) { + struct cifs_unix_set_info_args args = { .mode = mode, .ctime = NO_CHANGE_64, .atime = NO_CHANGE_64, .mtime = NO_CHANGE_64, .device = 0, - }; + }; - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { - args.uid = (__u64) current_fsuid(); - if (inode->i_mode & S_ISGID) - args.gid = (__u64) inode->i_gid; - else - args.gid = (__u64) current_fsgid(); - } else { - args.uid = NO_CHANGE_64; - args.gid = NO_CHANGE_64; - } - CIFSSMBUnixSetInfo(xid, tcon, full_path, &args, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + args.uid = (__u64) current_fsuid(); + if (inode->i_mode & S_ISGID) + args.gid = (__u64) inode->i_gid; + else + args.gid = (__u64) current_fsgid(); } else { - /* BB implement mode setting via Windows security - descriptors e.g. */ - /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ - - /* Could set r/o dos attribute if mode & 0222 == 0 */ + args.uid = NO_CHANGE_64; + args.gid = NO_CHANGE_64; } + CIFSSMBUnixSetInfo(xid, tcon, full_path, &args, + cifs_sb->local_nls, + cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + } else { + /* BB implement mode setting via Windows security + descriptors e.g. */ + /* CIFSSMBWinSetPerms(xid,tcon,path,mode,-1,-1,nls);*/ - /* server might mask mode so we have to query for it */ - if (tcon->unix_ext) - rc = cifs_get_inode_info_unix(&newinode, full_path, - inode->i_sb, xid); - else { - rc = cifs_get_inode_info(&newinode, full_path, - buf, inode->i_sb, xid, - &fileHandle); - if (newinode) { - if (cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_DYNPERM) - newinode->i_mode = mode; - if ((oplock & CIFS_CREATE_ACTION) && - (cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_SET_UID)) { - newinode->i_uid = current_fsuid(); - if (inode->i_mode & S_ISGID) - newinode->i_gid = - inode->i_gid; - else - newinode->i_gid = - current_fsgid(); - } + /* Could set r/o dos attribute if mode & 0222 == 0 */ + } + +cifs_create_get_file_info: + /* server might mask mode so we have to query for it */ + if (tcon->unix_ext) + rc = cifs_get_inode_info_unix(&newinode, full_path, + inode->i_sb, xid); + else { + rc = cifs_get_inode_info(&newinode, full_path, buf, + inode->i_sb, xid, &fileHandle); + if (newinode) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; + if ((oplock & CIFS_CREATE_ACTION) && + (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { + newinode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + newinode->i_gid = inode->i_gid; + else + newinode->i_gid = current_fsgid(); } } + } - if (rc != 0) { - cFYI(1, ("Create worked, get_inode_info failed rc = %d", - rc)); - } else - setup_cifs_dentry(tcon, direntry, newinode); - - if ((nd == NULL /* nfsd case - nfs srv does not set nd */) || - (!(nd->flags & LOOKUP_OPEN))) { - /* mknod case - do not leave file open */ - CIFSSMBClose(xid, tcon, fileHandle); - } else if (newinode) { - struct cifsFileInfo *pCifsFile = - kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); - - if (pCifsFile == NULL) - goto cifs_create_out; - pCifsFile->netfid = fileHandle; - pCifsFile->pid = current->tgid; - pCifsFile->pInode = newinode; - pCifsFile->invalidHandle = false; - pCifsFile->closePend = false; - init_MUTEX(&pCifsFile->fh_sem); - mutex_init(&pCifsFile->lock_mutex); - INIT_LIST_HEAD(&pCifsFile->llist); - atomic_set(&pCifsFile->wrtPending, 0); - - /* set the following in open now +cifs_create_set_dentry: + if (rc == 0) + setup_cifs_dentry(tcon, direntry, newinode); + else + cFYI(1, ("Create worked, get_inode_info failed rc = %d", rc)); + + /* nfsd case - nfs srv does not set nd */ + if ((nd == NULL) || (!(nd->flags & LOOKUP_OPEN))) { + /* mknod case - do not leave file open */ + CIFSSMBClose(xid, tcon, fileHandle); + } else if (newinode) { + struct cifsFileInfo *pCifsFile = + kzalloc(sizeof(struct cifsFileInfo), GFP_KERNEL); + + if (pCifsFile == NULL) + goto cifs_create_out; + pCifsFile->netfid = fileHandle; + pCifsFile->pid = current->tgid; + pCifsFile->pInode = newinode; + pCifsFile->invalidHandle = false; + pCifsFile->closePend = false; + init_MUTEX(&pCifsFile->fh_sem); + mutex_init(&pCifsFile->lock_mutex); + INIT_LIST_HEAD(&pCifsFile->llist); + atomic_set(&pCifsFile->wrtPending, 0); + + /* set the following in open now pCifsFile->pfile = file; */ - write_lock(&GlobalSMBSeslock); - list_add(&pCifsFile->tlist, &tcon->openFileList); - pCifsInode = CIFS_I(newinode); - if (pCifsInode) { - /* if readable file instance put first in list*/ - if (write_only) { - list_add_tail(&pCifsFile->flist, - &pCifsInode->openFileList); - } else { - list_add(&pCifsFile->flist, - &pCifsInode->openFileList); - } - if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { - pCifsInode->clientCanCacheAll = true; - pCifsInode->clientCanCacheRead = true; - cFYI(1, ("Exclusive Oplock inode %p", - newinode)); - } else if ((oplock & 0xF) == OPLOCK_READ) - pCifsInode->clientCanCacheRead = true; + write_lock(&GlobalSMBSeslock); + list_add(&pCifsFile->tlist, &tcon->openFileList); + pCifsInode = CIFS_I(newinode); + if (pCifsInode) { + /* if readable file instance put first in list*/ + if (write_only) { + list_add_tail(&pCifsFile->flist, + &pCifsInode->openFileList); + } else { + list_add(&pCifsFile->flist, + &pCifsInode->openFileList); } - write_unlock(&GlobalSMBSeslock); + if ((oplock & 0xF) == OPLOCK_EXCLUSIVE) { + pCifsInode->clientCanCacheAll = true; + pCifsInode->clientCanCacheRead = true; + cFYI(1, ("Exclusive Oplock inode %p", + newinode)); + } else if ((oplock & 0xF) == OPLOCK_READ) + pCifsInode->clientCanCacheRead = true; } + write_unlock(&GlobalSMBSeslock); } cifs_create_out: kfree(buf); -- cgit v1.2.3 From eca6acf91552a9b2e997cc76339115c95eac0217 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 20 Feb 2009 05:43:09 +0000 Subject: [CIFS] Fix multiuser mounts so server does not invalidate earlier security contexts When two different users mount the same Windows 2003 Server share using CIFS, the first session mounted can be invalidated. Some servers invalidate the first smb session when a second similar user (e.g. two users who get mapped by server to "guest") authenticates an smb session from the same client. By making sure that we set the 2nd and subsequent vc numbers to nonzero values, this ensures that we will not have this problem. Fixes Samba bug 6004, problem description follows: How to reproduce: - configure an "open share" (full permissions to Guest user) on Windows 2003 Server (I couldn't reproduce the problem with Samba server or Windows older than 2003) - mount the share twice with different users who will be authenticated as guest. noacl,noperm,user=john,dir_mode=0700,domain=DOMAIN,rw noacl,noperm,user=jeff,dir_mode=0700,domain=DOMAIN,rw Result: - just the mount point mounted last is accessible: Signed-off-by: Steve French --- fs/cifs/CHANGES | 10 +++++++ fs/cifs/cifsfs.h | 2 +- fs/cifs/cifsglob.h | 6 +++- fs/cifs/cifssmb.c | 7 +++-- fs/cifs/sess.c | 87 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 5 files changed, 105 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index 72063f5e56b..851388fafc7 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,13 @@ +Version 1.57 +------------ +Improve support for multiple security contexts to the same server. We +used to use the same "vcnumber" for all connections which could cause +the server to treat subsequent connections, especially those that +are authenticated as guest, as reconnections, invalidating the earlier +user's smb session. This fix allows cifs to mount multiple times to the +same server with different userids without risking invalidating earlier +established security contexts. + Version 1.56 ------------ Add "forcemandatorylock" mount option to allow user to use mandatory diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 7ac481841f8..2b1d28a9ee2 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -100,5 +100,5 @@ extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern const struct export_operations cifs_export_ops; #endif /* EXPERIMENTAL */ -#define CIFS_VERSION "1.56" +#define CIFS_VERSION "1.57" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 94c1ca0ec95..e004f6db5fc 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -164,9 +164,12 @@ struct TCP_Server_Info { /* multiplexed reads or writes */ unsigned int maxBuf; /* maxBuf specifies the maximum */ /* message size the server can send or receive for non-raw SMBs */ - unsigned int maxRw; /* maxRw specifies the maximum */ + unsigned int max_rw; /* maxRw specifies the maximum */ /* message size the server can send or receive for */ /* SMB_COM_WRITE_RAW or SMB_COM_READ_RAW. */ + unsigned int max_vcs; /* maximum number of smb sessions, at least + those that can be specified uniquely with + vcnumbers */ char sessid[4]; /* unique token id for this session */ /* (returned on Negotiate */ int capabilities; /* allow selective disabling of caps by smb sess */ @@ -210,6 +213,7 @@ struct cifsSesInfo { unsigned overrideSecFlg; /* if non-zero override global sec flags */ __u16 ipc_tid; /* special tid for connection to IPC share */ __u16 flags; + __u16 vcnum; char *serverOS; /* name of operating system underlying server */ char *serverNOS; /* name of network operating system of server */ char *serverDomain; /* security realm of server */ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 552642a507c..939e2f76b95 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -528,14 +528,15 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) server->maxReq = le16_to_cpu(rsp->MaxMpxCount); server->maxBuf = min((__u32)le16_to_cpu(rsp->MaxBufSize), (__u32)CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); + server->max_vcs = le16_to_cpu(rsp->MaxNumberVcs); GETU32(server->sessid) = le32_to_cpu(rsp->SessionKey); /* even though we do not use raw we might as well set this accurately, in case we ever find a need for it */ if ((le16_to_cpu(rsp->RawMode) & RAW_ENABLE) == RAW_ENABLE) { - server->maxRw = 0xFF00; + server->max_rw = 0xFF00; server->capabilities = CAP_MPX_MODE | CAP_RAW_MODE; } else { - server->maxRw = 0;/* we do not need to use raw anyway */ + server->max_rw = 0;/* do not need to use raw anyway */ server->capabilities = CAP_MPX_MODE; } tmp = (__s16)le16_to_cpu(rsp->ServerTimeZone); @@ -638,7 +639,7 @@ CIFSSMBNegotiate(unsigned int xid, struct cifsSesInfo *ses) /* probably no need to store and check maxvcs */ server->maxBuf = min(le32_to_cpu(pSMBr->MaxBufferSize), (__u32) CIFSMaxBufSize + MAX_CIFS_HDR_SIZE); - server->maxRw = le32_to_cpu(pSMBr->MaxRawSize); + server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cFYI(DBG2, ("Max buf = %d", ses->server->maxBuf)); GETU32(ses->server->sessid) = le32_to_cpu(pSMBr->SessionKey); server->capabilities = le32_to_cpu(pSMBr->Capabilities); diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index b234407a300..5c68b4282be 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -34,15 +34,99 @@ extern void SMBNTencrypt(unsigned char *passwd, unsigned char *c8, unsigned char *p24); +/* Checks if this is the first smb session to be reconnected after + the socket has been reestablished (so we know whether to use vc 0). + Called while holding the cifs_tcp_ses_lock, so do not block */ +static bool is_first_ses_reconnect(struct cifsSesInfo *ses) +{ + struct list_head *tmp; + struct cifsSesInfo *tmp_ses; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifsSesInfo, + smb_ses_list); + if (tmp_ses->need_reconnect == false) + return false; + } + /* could not find a session that was already connected, + this must be the first one we are reconnecting */ + return true; +} + +/* + * vc number 0 is treated specially by some servers, and should be the + * first one we request. After that we can use vcnumbers up to maxvcs, + * one for each smb session (some Windows versions set maxvcs incorrectly + * so maxvc=1 can be ignored). If we have too many vcs, we can reuse + * any vc but zero (some servers reset the connection on vcnum zero) + * + */ +static __le16 get_next_vcnum(struct cifsSesInfo *ses) +{ + __u16 vcnum = 0; + struct list_head *tmp; + struct cifsSesInfo *tmp_ses; + __u16 max_vcs = ses->server->max_vcs; + __u16 i; + int free_vc_found = 0; + + /* Quoting the MS-SMB specification: "Windows-based SMB servers set this + field to one but do not enforce this limit, which allows an SMB client + to establish more virtual circuits than allowed by this value ... but + other server implementations can enforce this limit." */ + if (max_vcs < 2) + max_vcs = 0xFFFF; + + write_lock(&cifs_tcp_ses_lock); + if ((ses->need_reconnect) && is_first_ses_reconnect(ses)) + goto get_vc_num_exit; /* vcnum will be zero */ + for (i = ses->server->srv_count - 1; i < max_vcs; i++) { + if (i == 0) /* this is the only connection, use vc 0 */ + break; + + free_vc_found = 1; + + list_for_each(tmp, &ses->server->smb_ses_list) { + tmp_ses = list_entry(tmp, struct cifsSesInfo, + smb_ses_list); + if (tmp_ses->vcnum == i) { + free_vc_found = 0; + break; /* found duplicate, try next vcnum */ + } + } + if (free_vc_found) + break; /* we found a vcnumber that will work - use it */ + } + + if (i == 0) + vcnum = 0; /* for most common case, ie if one smb session, use + vc zero. Also for case when no free vcnum, zero + is safest to send (some clients only send zero) */ + else if (free_vc_found == 0) + vcnum = 1; /* we can not reuse vc=0 safely, since some servers + reset all uids on that, but 1 is ok. */ + else + vcnum = i; + ses->vcnum = vcnum; +get_vc_num_exit: + write_unlock(&cifs_tcp_ses_lock); + + return le16_to_cpu(vcnum); +} + static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) { __u32 capabilities = 0; /* init fields common to all four types of SessSetup */ - /* note that header is initialized to zero in header_assemble */ + /* Note that offsets for first seven fields in req struct are same */ + /* in CIFS Specs so does not matter which of 3 forms of struct */ + /* that we use in next few lines */ + /* Note that header is initialized to zero in header_assemble */ pSMB->req.AndXCommand = 0xFF; pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + pSMB->req.VcNumber = get_next_vcnum(ses); /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ @@ -71,7 +155,6 @@ static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) if (ses->capabilities & CAP_UNIX) capabilities |= CAP_UNIX; - /* BB check whether to init vcnum BB */ return capabilities; } -- cgit v1.2.3 From 4c41bd0ec953954158f92bed5d3062645062b98e Mon Sep 17 00:00:00 2001 From: Thomas Gleixner Date: Mon, 16 Feb 2009 21:29:31 +0100 Subject: [JFFS2] fix mount crash caused by removed nodes At scan time we observed following scenario: node A inserted node B inserted node C inserted -> sets overlapped flag on node B node A is removed due to CRC failure -> overlapped flag on node B remains while (tn->overlapped) tn = tn_prev(tn); ==> crash, when tn_prev(B) is referenced. When the ultimate node is removed at scan time and the overlapped flag is set on the penultimate node, then nothing updates the overlapped flag of that node. The overlapped iterators blindly expect that the ultimate node does not have the overlapped flag set, which causes the scan code to crash. It would be a huge overhead to go through the node chain on node removal and fix up the overlapped flags, so detecting such a case on the fly in the overlapped iterators is a simpler and reliable solution. Cc: stable@kernel.org Signed-off-by: Thomas Gleixner Signed-off-by: David Woodhouse --- fs/jffs2/readinode.c | 42 +++++++++++++++++++++++++++++++++--------- 1 file changed, 33 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/jffs2/readinode.c b/fs/jffs2/readinode.c index 6ca08ad887c..1fc1e92356e 100644 --- a/fs/jffs2/readinode.c +++ b/fs/jffs2/readinode.c @@ -220,7 +220,7 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, struct jffs2_tmp_dnode_info *tn) { uint32_t fn_end = tn->fn->ofs + tn->fn->size; - struct jffs2_tmp_dnode_info *this; + struct jffs2_tmp_dnode_info *this, *ptn; dbg_readinode("insert fragment %#04x-%#04x, ver %u at %08x\n", tn->fn->ofs, fn_end, tn->version, ref_offset(tn->fn->raw)); @@ -251,11 +251,18 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, if (this) { /* If the node is coincident with another at a lower address, back up until the other node is found. It may be relevant */ - while (this->overlapped) - this = tn_prev(this); - - /* First node should never be marked overlapped */ - BUG_ON(!this); + while (this->overlapped) { + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + this->overlapped = 0; + break; + } + this = ptn; + } dbg_readinode("'this' found %#04x-%#04x (%s)\n", this->fn->ofs, this->fn->ofs + this->fn->size, this->fn ? "data" : "hole"); } @@ -360,7 +367,17 @@ static int jffs2_add_tn_to_tree(struct jffs2_sb_info *c, } if (!this->overlapped) break; - this = tn_prev(this); + + ptn = tn_prev(this); + if (!ptn) { + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + this->overlapped = 0; + break; + } + this = ptn; } } @@ -456,8 +473,15 @@ static int jffs2_build_inode_fragtree(struct jffs2_sb_info *c, eat_last(&rii->tn_root, &last->rb); ver_insert(&ver_root, last); - if (unlikely(last->overlapped)) - continue; + if (unlikely(last->overlapped)) { + if (pen) + continue; + /* + * We killed a node which set the overlapped + * flags during the scan. Fix it up. + */ + last->overlapped = 0; + } /* Now we have a bunch of nodes in reverse version order, in the tree at ver_root. Most of the time, -- cgit v1.2.3 From 05bf9e839d9de4e8a094274a0a2fd07beb47eaf1 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 21 Feb 2009 12:13:24 -0500 Subject: ext4: Add fallback for find_group_flex This is a workaround for find_group_flex() which badly needs to be replaced. One of its problems (besides ignoring the Orlov algorithm) is that it is a bit hyperactive about returning failure under suspicious circumstances. This can lead to spurious ENOSPC failures even when there are inodes still available. Work around this for now by retrying the search using find_group_other() if find_group_flex() returns -1. If find_group_other() succeeds when find_group_flex() has failed, log a warning message. A better block/inode allocator that will fix this problem for real has been queued up for the next merge window. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 4fb86a0061d..f18a919be70 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -715,6 +715,13 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) if (sbi->s_log_groups_per_flex) { ret2 = find_group_flex(sb, dir, &group); + if (ret2 == -1) { + ret2 = find_group_other(sb, dir, &group); + if (ret2 == 0 && printk_ratelimit()) + printk(KERN_NOTICE "ext4: find_group_flex " + "failed, fallback succeeded dir %lu\n", + dir->i_ino); + } goto got_group; } -- cgit v1.2.3 From ebd3610b110bbb18ea6f9f2aeed1e1068c537227 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 22 Feb 2009 21:09:59 -0500 Subject: ext4: Fix deadlock in ext4_write_begin() and ext4_da_write_begin() Functions ext4_write_begin() and ext4_da_write_begin() call grab_cache_page_write_begin() without AOP_FLAG_NOFS. Thus it can happen that page reclaim is triggered in that function and it recurses back into the filesystem (or some other filesystem). But this can lead to various problems as a transaction is already started at that point. Add the necessary flag. http://bugzilla.kernel.org/show_bug.cgi?id=11688 Signed-off-by: Jan Kara Signed-off-by: "Theodore Ts'o" --- fs/ext4/inode.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cbd2ca99d11..51cdd13e1c3 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1368,6 +1368,10 @@ retry: goto out; } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { ext4_journal_stop(handle); @@ -1377,7 +1381,7 @@ retry: *pagep = page; ret = block_write_begin(file, mapping, pos, len, flags, pagep, fsdata, - ext4_get_block); + ext4_get_block); if (!ret && ext4_should_journal_data(inode)) { ret = walk_page_buffers(handle, page_buffers(page), @@ -2667,6 +2671,9 @@ retry: ret = PTR_ERR(handle); goto out; } + /* We cannot recurse into the filesystem as the transaction is already + * started */ + flags |= AOP_FLAG_NOFS; page = grab_cache_page_write_begin(mapping, index, flags); if (!page) { -- cgit v1.2.3 From cac711211a039ae2e2dc6322ffb3c2279d093bf1 Mon Sep 17 00:00:00 2001 From: Krzysztof Sachanowicz Date: Mon, 23 Feb 2009 22:21:55 +0100 Subject: proc: proc_get_inode should de_put when inode already initialized de_get is called before every proc_get_inode, but corresponding de_put is called only when dropping last reference to an inode. This might cause something like remove_proc_entry: /proc/stats busy, count=14496 to be printed to the syslog. The fix is to call de_put in case of an already initialized inode in proc_get_inode. Signed-off-by: Krzysztof Sachanowicz Tested-by: Marcin Pilipczuk Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/proc/inode.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 3e76bb9b3ad..d8bb5c671f4 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -485,8 +485,10 @@ struct inode *proc_get_inode(struct super_block *sb, unsigned int ino, } } unlock_new_inode(inode); - } else + } else { module_put(de->owner); + de_put(de); + } return inode; out_ino: -- cgit v1.2.3 From e07a4b9217d1e97d2f3a62b6b070efdc61212110 Mon Sep 17 00:00:00 2001 From: Helge Bahmann Date: Fri, 20 Feb 2009 16:24:12 +0300 Subject: proc: fix PG_locked reporting in /proc/kpageflags Expr always evaluates to zero. Cc: Matt Mackall Signed-off-by: Andrew Morton Signed-off-by: Alexey Dobriyan --- fs/proc/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 767d95a6d1b..2d1345112a4 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -107,7 +107,7 @@ static ssize_t kpageflags_read(struct file *file, char __user *buf, else kflags = ppage->flags; - uflags = kpf_copy_bit(KPF_LOCKED, PG_locked, kflags) | + uflags = kpf_copy_bit(kflags, KPF_LOCKED, PG_locked) | kpf_copy_bit(kflags, KPF_ERROR, PG_error) | kpf_copy_bit(kflags, KPF_REFERENCED, PG_referenced) | kpf_copy_bit(kflags, KPF_UPTODATE, PG_uptodate) | -- cgit v1.2.3 From 8b1a8ff8b321a9384304aeea4dbdb9747daf7ee8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 28 Feb 2009 00:08:53 -0500 Subject: ext4: Remove duplicate call to ext4_commit_super() in ext4_freeze() Commit c4be0c1d added error checking to ext4_freeze() when calling ext4_commit_super(). Unfortunately the patch failed to remove the original call to ext4_commit_super(), with the net result that when freezing the filesystem, the superblock gets written twice, the first time without error checking. Signed-off-by: "Theodore Ts'o" --- fs/ext4/super.c | 1 - 1 file changed, 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a5732c58f67..39d1993cfa1 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -3091,7 +3091,6 @@ static int ext4_freeze(struct super_block *sb) /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); - ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); error = ext4_commit_super(sb, EXT4_SB(sb)->s_es, 1); if (error) goto out; -- cgit v1.2.3 From d8ae4601a4b7ea1fa17fa395c3468c0e144d1275 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 28 Feb 2009 09:50:01 -0500 Subject: ext4: Reorder fs/Makefile so that ext2 root fs's are mounted using ext2 In fs/Makefile, ext3 was placed before ext2 so that a root filesystem that possessed a journal, it would be mounted as ext3 instead of ext2. This was necessary because a cleanly unmounted ext3 filesystem was fully backwards compatible with ext2, and could be mounted by ext2 --- but it was desirable that it be mounted with ext3 so that the journaling would be enabled. The ext4 filesystem supports new incompatible features, so there is no danger of an ext4 filesystem being mistaken for an ext2 filesystem. At that point, the relative ordering of ext4 with respect to ext2 didn't matter until ext4 gained the ability to mount filesystems without a journal starting in 2.6.29-rc1. Now that this is the case, given that ext4 is before ext2, it means that root filesystems that were using the plain-jane ext2 format are getting mounted using the ext4 filesystem driver, which is a change in behavior which could be surprising to users. It's doubtful that there are that many ext2-only root filesystem users that would also have ext4 compiled into the kernel, but to adhere to the principle of least surprise, the correct ordering in fs/Makefile is ext3, followed by ext2, and finally ext4. Signed-off-by: "Theodore Ts'o" --- fs/Makefile | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/Makefile b/fs/Makefile index 38bc735c67a..dc20db34867 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -69,10 +69,12 @@ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT3_FS) += ext3/ # Before ext2 so root fs can be ext3 -obj-$(CONFIG_EXT4_FS) += ext4/ # Before ext2 so root fs can be ext4 +obj-$(CONFIG_EXT2_FS) += ext2/ +# We place ext4 after ext2 so plain ext2 root fs's are mounted using ext2 +# unless explicitly requested by rootfstype +obj-$(CONFIG_EXT4_FS) += ext4/ obj-$(CONFIG_JBD) += jbd/ obj-$(CONFIG_JBD2) += jbd2/ -obj-$(CONFIG_EXT2_FS) += ext2/ obj-$(CONFIG_CRAMFS) += cramfs/ obj-$(CONFIG_SQUASHFS) += squashfs/ obj-y += ramfs/ -- cgit v1.2.3 From 8f64b32eb73fbfe9f38c4123121b63ee409278a7 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Thu, 26 Feb 2009 00:57:35 -0500 Subject: ext4: don't call jbd2_journal_force_commit_nested without journal Running without a journal, I oopsed when I ran out of space, because we called jbd2_journal_force_commit_nested() from ext4_should_retry_alloc() without a journal. This should take care of it, I think. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/balloc.c | 4 +++- fs/ext4/inode.c | 2 +- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 9a50b8052dc..de9459b4cb9 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -609,7 +609,9 @@ int ext4_claim_free_blocks(struct ext4_sb_info *sbi, */ int ext4_should_retry_alloc(struct super_block *sb, int *retries) { - if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || (*retries)++ > 3) + if (!ext4_has_free_blocks(EXT4_SB(sb), 1) || + (*retries)++ > 3 || + !EXT4_SB(sb)->s_journal) return 0; jbd_debug(1, "%s: retrying operation after ENOSPC\n", sb->s_id); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 51cdd13e1c3..c7fed5b1874 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -2544,7 +2544,7 @@ retry: ext4_journal_stop(handle); - if (mpd.retval == -ENOSPC) { + if ((mpd.retval == -ENOSPC) && sbi->s_journal) { /* commit the transaction which would * free blocks released in the transaction * and try again -- cgit v1.2.3 From b2bf96833c5782befc3e7700f791fde754a47b01 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Feb 2009 08:50:26 +0100 Subject: block: fix bogus gcc warning for uninitialized var usage Newer gcc throw this warning: fs/bio.c: In function ?bio_alloc_bioset?: fs/bio.c:305: warning: ?p? may be used uninitialized in this function since it cannot figure out that 'p' is only ever used if 'bs' is non-NULL. Signed-off-by: Jens Axboe --- fs/bio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 72ab251cdb9..124b95c4d58 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -302,7 +302,7 @@ void bio_init(struct bio *bio) struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { struct bio *bio = NULL; - void *p; + void *uninitialized_var(p); if (bs) { p = mempool_alloc(bs->bio_pool, gfp_mask); -- cgit v1.2.3 From 47be12e4eec84c1846f29af64fe25a396b57a026 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 9 Jan 2009 07:32:48 +0800 Subject: ocfs2: Access and dirty the buffer_head in mark_written. In __ocfs2_mark_extent_written, when we meet with the situation of c_split_covers_rec, the old solution just replace the extent record and forget to access and dirty the buffer_head. This will cause a problem when the unwritten extent is in an extent block. So access and dirty it. Signed-off-by: Tao Ma Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 27 ++++++++++++++++++++++++++- 1 file changed, 26 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 60fe74035db..3a9e5deed74 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -4796,6 +4796,29 @@ out: return ret; } +static int ocfs2_replace_extent_rec(struct inode *inode, + handle_t *handle, + struct ocfs2_path *path, + struct ocfs2_extent_list *el, + int split_index, + struct ocfs2_extent_rec *split_rec) +{ + int ret; + + ret = ocfs2_path_bh_journal_access(handle, inode, path, + path_num_items(path) - 1); + if (ret) { + mlog_errno(ret); + goto out; + } + + el->l_recs[split_index] = *split_rec; + + ocfs2_journal_dirty(handle, path_leaf_bh(path)); +out: + return ret; +} + /* * Mark part or all of the extent record at split_index in the leaf * pointed to by path as written. This removes the unwritten @@ -4885,7 +4908,9 @@ static int __ocfs2_mark_extent_written(struct inode *inode, if (ctxt.c_contig_type == CONTIG_NONE) { if (ctxt.c_split_covers_rec) - el->l_recs[split_index] = *split_rec; + ret = ocfs2_replace_extent_rec(inode, handle, + path, el, + split_index, split_rec); else ret = ocfs2_split_and_insert(inode, handle, path, et, &last_eb_bh, split_index, -- cgit v1.2.3 From 7dc102b737e9f49dac426161294cb2d326a97d8e Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 3 Feb 2009 12:37:13 -0800 Subject: ocfs2/dlm: Retract fix for race between purge and migrate Mainline commit d4f7e650e55af6b235871126f747da88600e8040 attempts to delay the dlm_thread from sending the drop ref message if the lockres is being migrated. The problem is that we make the dlm_thread wait for the migration to complete. This causes a deadlock as dlm_thread also participates in the lockres migration process. A better fix for the original oss bugzilla#1012 is in testing. Signed-off-by: Sunil Mushran Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmthread.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmthread.c b/fs/ocfs2/dlm/dlmthread.c index d1295203029..4060bb328bc 100644 --- a/fs/ocfs2/dlm/dlmthread.c +++ b/fs/ocfs2/dlm/dlmthread.c @@ -181,8 +181,7 @@ static int dlm_purge_lockres(struct dlm_ctxt *dlm, spin_lock(&res->spinlock); /* This ensures that clear refmap is sent after the set */ - __dlm_wait_on_lockres_flags(res, (DLM_LOCK_RES_SETREF_INPROG | - DLM_LOCK_RES_MIGRATING)); + __dlm_wait_on_lockres_flags(res, DLM_LOCK_RES_SETREF_INPROG); spin_unlock(&res->spinlock); /* clear our bit from the master's refmap, ignore errors */ -- cgit v1.2.3 From c74ff8bb2235d848beb67fcfddae71ecbe3f92b1 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 3 Feb 2009 12:37:14 -0800 Subject: ocfs2: Cleanup the lockname print in dlmglue.c The dentry lock has a different format than other locks. This patch fixes ocfs2_log_dlm_error() macro to make it print the dentry lock correctly. Signed-off-by: Sunil Mushran Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/dlmglue.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 206a2370876..7219a86d34c 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -320,9 +320,14 @@ static void ocfs2_schedule_blocked_lock(struct ocfs2_super *osb, struct ocfs2_lock_res *lockres); static inline void ocfs2_recover_from_dlm_error(struct ocfs2_lock_res *lockres, int convert); -#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ - mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ - _err, _func, _lockres->l_name); \ +#define ocfs2_log_dlm_error(_func, _err, _lockres) do { \ + if ((_lockres)->l_type != OCFS2_LOCK_TYPE_DENTRY) \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %s\n", \ + _err, _func, _lockres->l_name); \ + else \ + mlog(ML_ERROR, "DLM error %d while calling %s on resource %.*s%08x\n", \ + _err, _func, OCFS2_DENTRY_LOCK_INO_START - 1, (_lockres)->l_name, \ + (unsigned int)ocfs2_get_dentry_lock_ino(_lockres)); \ } while (0) static int ocfs2_downconvert_thread(void *arg); static void ocfs2_downconvert_on_unlock(struct ocfs2_super *osb, -- cgit v1.2.3 From dabc47de7a23f57522dc762d9d2ad875700d3497 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 3 Feb 2009 12:37:15 -0800 Subject: ocfs2/dlm: Use ast_lock to protect ast_list The code was using dlm->spinlock instead of dlm->ast_lock to protect the ast_list. This patch fixes the issue. Signed-off-by: Sunil Mushran Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmunlock.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmunlock.c b/fs/ocfs2/dlm/dlmunlock.c index 86ca085ef32..fcf879ed693 100644 --- a/fs/ocfs2/dlm/dlmunlock.c +++ b/fs/ocfs2/dlm/dlmunlock.c @@ -117,11 +117,11 @@ static enum dlm_status dlmunlock_common(struct dlm_ctxt *dlm, else BUG_ON(res->owner == dlm->node_num); - spin_lock(&dlm->spinlock); + spin_lock(&dlm->ast_lock); /* We want to be sure that we're not freeing a lock * that still has AST's pending... */ in_use = !list_empty(&lock->ast_list); - spin_unlock(&dlm->spinlock); + spin_unlock(&dlm->ast_lock); if (in_use) { mlog(ML_ERROR, "lockres %.*s: Someone is calling dlmunlock " "while waiting for an ast!", res->lockname.len, -- cgit v1.2.3 From 53ecd25e148615e0ed2a72635cc76f4773f97f90 Mon Sep 17 00:00:00 2001 From: Sunil Mushran Date: Tue, 3 Feb 2009 12:37:16 -0800 Subject: ocfs2/dlm: Make dlm_assert_master_handler() kill itself instead of the asserter In dlm_assert_master_handler(), if we get an incorrect assert master from a node that, we reply with EINVAL asking the asserter to die. The problem is that an assert is sent after so many hoops, it is invariably the node that thinks the asserter is wrong, is actually wrong. So instead of killing the asserter, this patch kills the assertee. This patch papers over a race that is still being addressed. Signed-off-by: Sunil Mushran Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/dlm/dlmmaster.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 54e182a27ca..0a281394785 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1849,12 +1849,12 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, if (!mle) { if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN && res->owner != assert->node_idx) { - mlog(ML_ERROR, "assert_master from " - "%u, but current owner is " - "%u! (%.*s)\n", - assert->node_idx, res->owner, - namelen, name); - goto kill; + mlog(ML_ERROR, "DIE! Mastery assert from %u, " + "but current owner is %u! (%.*s)\n", + assert->node_idx, res->owner, namelen, + name); + __dlm_print_one_lock_resource(res); + BUG(); } } else if (mle->type != DLM_MLE_MIGRATION) { if (res->owner != DLM_LOCK_RES_OWNER_UNKNOWN) { -- cgit v1.2.3 From 89a907afe073b8971a83d0ad54f391542b64d327 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 17 Feb 2009 04:39:28 +0800 Subject: ocfs2: Use the right access_* method in ctime update of xattr. In ctime updating of xattr, it use the wrong type of access for inode, so use ocfs2_journal_access_di instead. Reported-and-Tested-by: Tristan Ye Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 915039fffe6..e3933158e1d 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2592,8 +2592,9 @@ static int __ocfs2_xattr_set_handle(struct inode *inode, if (!ret) { /* Update inode ctime. */ - ret = ocfs2_journal_access(ctxt->handle, inode, xis->inode_bh, - OCFS2_JOURNAL_ACCESS_WRITE); + ret = ocfs2_journal_access_di(ctxt->handle, inode, + xis->inode_bh, + OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { mlog_errno(ret); goto out; -- cgit v1.2.3 From c8b9cf9a7cd25ba65166116d0a958f0bc709f0a7 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Tue, 24 Feb 2009 17:40:26 -0800 Subject: ocfs2: lock the metaecc process for xattr bucket For other metadata in ocfs2, metaecc is checked in ocfs2_read_blocks with io_mutex held. While for xattr bucket, it is calculated by the whole buckets. So we have to add a spin_lock to prevent multiple processes calculating metaecc. Signed-off-by: Tao Ma Tested-by: Tristan Ye Signed-off-by: Mark Fasheh --- fs/ocfs2/ocfs2.h | 3 +++ fs/ocfs2/super.c | 1 + fs/ocfs2/xattr.c | 4 ++++ 3 files changed, 8 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 077384135f4..946d3c34b90 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -341,6 +341,9 @@ struct ocfs2_super struct ocfs2_node_map osb_recovering_orphan_dirs; unsigned int *osb_orphan_wipes; wait_queue_head_t osb_wipe_event; + + /* used to protect metaecc calculation check of xattr. */ + spinlock_t osb_xattr_lock; }; #define OCFS2_SB(sb) ((struct ocfs2_super *)(sb)->s_fs_info) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index b1cb38fbe80..1c3acc4654d 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1747,6 +1747,7 @@ static int ocfs2_initialize_super(struct super_block *sb, INIT_LIST_HEAD(&osb->blocked_lock_list); osb->blocked_lock_count = 0; spin_lock_init(&osb->osb_lock); + spin_lock_init(&osb->osb_xattr_lock); ocfs2_init_inode_steal_slot(osb); atomic_set(&osb->alloc_stats.moves, 0); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index e3933158e1d..a7c167905c5 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -274,10 +274,12 @@ static int ocfs2_read_xattr_bucket(struct ocfs2_xattr_bucket *bucket, bucket->bu_blocks, bucket->bu_bhs, 0, NULL); if (!rc) { + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); rc = ocfs2_validate_meta_ecc_bhs(bucket->bu_inode->i_sb, bucket->bu_bhs, bucket->bu_blocks, &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); if (rc) mlog_errno(rc); } @@ -310,9 +312,11 @@ static void ocfs2_xattr_bucket_journal_dirty(handle_t *handle, { int i; + spin_lock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); ocfs2_compute_meta_ecc_bhs(bucket->bu_inode->i_sb, bucket->bu_bhs, bucket->bu_blocks, &bucket_xh(bucket)->xh_check); + spin_unlock(&OCFS2_SB(bucket->bu_inode->i_sb)->osb_xattr_lock); for (i = 0; i < bucket->bu_blocks; i++) ocfs2_journal_dirty(handle, bucket->bu_bhs[i]); -- cgit v1.2.3 From 4442f518269c6b3686fcbcadad22dc4475309b16 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Fri, 20 Feb 2009 11:11:50 +0800 Subject: ocfs2: set gap to seperate entry and value when xattr in bucket This patch set a gap (4 bytes) between xattr entry and name/value when xattr in bucket. This gap use to seperate entry and name/value when a bucket is full. It had already been set when xattr in inode/block. Signed-off-by: Tiger Yang Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index a7c167905c5..4ddd788add6 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -82,13 +82,14 @@ struct ocfs2_xattr_set_ctxt { #define OCFS2_XATTR_ROOT_SIZE (sizeof(struct ocfs2_xattr_def_value_root)) #define OCFS2_XATTR_INLINE_SIZE 80 +#define OCFS2_XATTR_HEADER_GAP 4 #define OCFS2_XATTR_FREE_IN_IBODY (OCFS2_MIN_XATTR_INLINE_SIZE \ - sizeof(struct ocfs2_xattr_header) \ - - sizeof(__u32)) + - OCFS2_XATTR_HEADER_GAP) #define OCFS2_XATTR_FREE_IN_BLOCK(ptr) ((ptr)->i_sb->s_blocksize \ - sizeof(struct ocfs2_xattr_block) \ - sizeof(struct ocfs2_xattr_header) \ - - sizeof(__u32)) + - OCFS2_XATTR_HEADER_GAP) static struct ocfs2_xattr_def_value_root def_xv = { .xv.xr_list.l_count = cpu_to_le16(1), @@ -1511,7 +1512,7 @@ static int ocfs2_xattr_set_entry(struct inode *inode, last += 1; } - free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; if (free < 0) return -EIO; @@ -2194,7 +2195,7 @@ static int ocfs2_xattr_can_be_in_inode(struct inode *inode, last += 1; } - free = min_offs - ((void *)last - xs->base) - sizeof(__u32); + free = min_offs - ((void *)last - xs->base) - OCFS2_XATTR_HEADER_GAP; if (free < 0) return 0; @@ -5065,8 +5066,8 @@ try_again: xh_free_start = le16_to_cpu(xh->xh_free_start); header_size = sizeof(struct ocfs2_xattr_header) + count * sizeof(struct ocfs2_xattr_entry); - max_free = OCFS2_XATTR_BUCKET_SIZE - - le16_to_cpu(xh->xh_name_value_len) - header_size; + max_free = OCFS2_XATTR_BUCKET_SIZE - header_size - + le16_to_cpu(xh->xh_name_value_len) - OCFS2_XATTR_HEADER_GAP; mlog_bug_on_msg(header_size > blocksize, "bucket %llu has header size " "of %u which exceed block size\n", @@ -5099,7 +5100,7 @@ try_again: need = 0; } - free = xh_free_start - header_size; + free = xh_free_start - header_size - OCFS2_XATTR_HEADER_GAP; /* * We need to make sure the new name/value pair * can exist in the same block. @@ -5132,7 +5133,8 @@ try_again: } xh_free_start = le16_to_cpu(xh->xh_free_start); - free = xh_free_start - header_size; + free = xh_free_start - header_size + - OCFS2_XATTR_HEADER_GAP; if (xh_free_start % blocksize < need) free -= xh_free_start % blocksize; -- cgit v1.2.3 From 28d57d437786eb3e44f1ca3f0f41e7cfe29c6dd4 Mon Sep 17 00:00:00 2001 From: wengang wang Date: Fri, 13 Feb 2009 10:11:47 +0800 Subject: ocfs2: add IO error check in ocfs2_get_sector() Check for IO error in ocfs2_get_sector(). Signed-off-by: Wengang Wang Signed-off-by: Mark Fasheh --- fs/ocfs2/super.c | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 1c3acc4654d..7ac83a81ee5 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1537,6 +1537,13 @@ static int ocfs2_get_sector(struct super_block *sb, unlock_buffer(*bh); ll_rw_block(READ, 1, bh); wait_on_buffer(*bh); + if (!buffer_uptodate(*bh)) { + mlog_errno(-EIO); + brelse(*bh); + *bh = NULL; + return -EIO; + } + return 0; } -- cgit v1.2.3 From adc487204a9373d2b5a535412466326036147a72 Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Fri, 27 Feb 2009 14:02:59 -0800 Subject: EXPORT_SYMBOL(d_obtain_alias) rather than EXPORT_SYMBOL_GPL Commit 4ea3ada2955e4519befa98ff55dd62d6dfbd1705 declares d_obtain_alias() as EXPORT_SYMBOL_GPL where it's supposed to replace d_alloc_anon which was previously declared as EXPORT_SYMBOL and thus available to any loadable module. This patch reverts that. Signed-off-by: Benny Halevy Acked-by: Linus Torvalds Cc: Christoph Hellwig Cc: "J. Bruce Fields" Cc: Trond Myklebust Acked-by: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dcache.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/dcache.c b/fs/dcache.c index 937df0fb0da..07e2d4a44bd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1180,7 +1180,7 @@ struct dentry *d_obtain_alias(struct inode *inode) iput(inode); return res; } -EXPORT_SYMBOL_GPL(d_obtain_alias); +EXPORT_SYMBOL(d_obtain_alias); /** * d_splice_alias - splice a disconnected dentry into the tree if one exists -- cgit v1.2.3 From 5cf8cf4146de03de67d1a8aefbece66b65f255cc Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 26 Feb 2009 21:32:51 +0100 Subject: Fix FREEZE/THAW compat_ioctl regression Commit 8e961870bb9804110d5c8211d5d9d500451c4518 removed the FREEZE/THAW handling in xfs_compat_ioctl but never added any compat handler back, so now any freeze/thaw request from a 32-bit binary ond 64-bit userspace will fail. As these ioctls are 32/64-bit compatible two simple COMPATIBLE_IOCTL entries in fs/compat_ioctl.c will do the job. Signed-off-by: Christoph Hellwig Signed-off-by: Linus Torvalds --- fs/compat_ioctl.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index 39bd4d38e88..45e59d3c7f1 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -1913,6 +1913,9 @@ COMPATIBLE_IOCTL(FIONREAD) /* This is also TIOCINQ */ /* 0x00 */ COMPATIBLE_IOCTL(FIBMAP) COMPATIBLE_IOCTL(FIGETBSZ) +/* 'X' - originally XFS but some now in the VFS */ +COMPATIBLE_IOCTL(FIFREEZE) +COMPATIBLE_IOCTL(FITHAW) /* RAID */ COMPATIBLE_IOCTL(RAID_VERSION) COMPATIBLE_IOCTL(GET_ARRAY_INFO) -- cgit v1.2.3 From 7ce9d5d1f3c8736511daa413c64985a05b2feee3 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Wed, 4 Mar 2009 18:38:18 -0500 Subject: ext4: fix ext4_free_inode() vs. ext4_claim_inode() race I was seeing fsck errors on inode bitmaps after a 4 thread dbench run on a 4 cpu machine: Inode bitmap differences: -50736 -(50752--50753) etc... I believe that this is because ext4_free_inode() uses atomic bitops, and although ext4_new_inode() *used* to also use atomic bitops for synchronization, commit 393418676a7602e1d7d3f6e560159c65c8cbd50e changed this to use the sb_bgl_lock, so that we could also synchronize against read_inode_bitmap and initialization of uninit inode tables. However, that change left ext4_free_inode using atomic bitops, which I think leaves no synchronization between setting & unsetting bits in the inode table. The below patch fixes it for me, although I wonder if we're getting at all heavy-handed with this spinlock... Signed-off-by: Eric Sandeen Reviewed-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index f18a919be70..627f8c3337a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -188,7 +188,7 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) struct ext4_group_desc *gdp; struct ext4_super_block *es; struct ext4_sb_info *sbi; - int fatal = 0, err, count; + int fatal = 0, err, count, cleared; ext4_group_t flex_group; if (atomic_read(&inode->i_count) > 1) { @@ -248,8 +248,10 @@ void ext4_free_inode(handle_t *handle, struct inode *inode) goto error_return; /* Ok, now we can actually update the inode bitmaps.. */ - if (!ext4_clear_bit_atomic(sb_bgl_lock(sbi, block_group), - bit, bitmap_bh->b_data)) + spin_lock(sb_bgl_lock(sbi, block_group)); + cleared = ext4_clear_bit(bit, bitmap_bh->b_data); + spin_unlock(sb_bgl_lock(sbi, block_group)); + if (!cleared) ext4_error(sb, "ext4_free_inode", "bit already cleared for inode %lu", ino); else { -- cgit v1.2.3 From 118e1ef6fabfc023126e6075f6ac0fc729cb5285 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 5 Mar 2009 00:31:12 +0000 Subject: Squashfs: Fix oops when reading fsfuzzer corrupted filesystems This fixes a code regression caused by the recent mainlining changes. The recent code changes call zlib_inflate repeatedly, decompressing into separate 4K buffers, this code didn't check for the possibility that zlib_inflate might ask for too many buffers when decompressing corrupted data. Signed-off-by: Phillip Lougher --- fs/squashfs/block.c | 13 +++++++++++-- fs/squashfs/cache.c | 4 ++-- fs/squashfs/squashfs.h | 2 +- fs/squashfs/super.c | 2 +- 4 files changed, 15 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index c837dfc2b3c..321728f48f2 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -80,7 +80,7 @@ static struct buffer_head *get_block_length(struct super_block *sb, * generated a larger block - this does occasionally happen with zlib). */ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, - int length, u64 *next_index, int srclength) + int length, u64 *next_index, int srclength, int pages) { struct squashfs_sb_info *msblk = sb->s_fs_info; struct buffer_head **bh; @@ -185,6 +185,14 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, } if (msblk->stream.avail_out == 0) { + if (page == pages) { + ERROR("zlib_inflate tried to " + "decompress too much data, " + "expected %d bytes. Zlib " + "data probably corrupt\n", + srclength); + goto release_mutex; + } msblk->stream.next_out = buffer[page++]; msblk->stream.avail_out = PAGE_CACHE_SIZE; } @@ -268,7 +276,8 @@ block_release: put_bh(bh[k]); read_failure: - ERROR("sb_bread failed reading block 0x%llx\n", cur_index); + ERROR("squashfs_read_data failed to read block 0x%llx\n", + (unsigned long long) index); kfree(bh); return -EIO; } diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index f29eda16d25..1c4739e33af 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -119,7 +119,7 @@ struct squashfs_cache_entry *squashfs_cache_get(struct super_block *sb, entry->length = squashfs_read_data(sb, entry->data, block, length, &entry->next_index, - cache->block_size); + cache->block_size, cache->pages); spin_lock(&cache->lock); @@ -406,7 +406,7 @@ int squashfs_read_table(struct super_block *sb, void *buffer, u64 block, for (i = 0; i < pages; i++, buffer += PAGE_CACHE_SIZE) data[i] = buffer; res = squashfs_read_data(sb, data, block, length | - SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length); + SQUASHFS_COMPRESSED_BIT_BLOCK, NULL, length, pages); kfree(data); return res; } diff --git a/fs/squashfs/squashfs.h b/fs/squashfs/squashfs.h index 6b2515d027d..0e9feb6adf7 100644 --- a/fs/squashfs/squashfs.h +++ b/fs/squashfs/squashfs.h @@ -34,7 +34,7 @@ static inline struct squashfs_inode_info *squashfs_i(struct inode *inode) /* block.c */ extern int squashfs_read_data(struct super_block *, void **, u64, int, u64 *, - int); + int, int); /* cache.c */ extern struct squashfs_cache *squashfs_cache_init(char *, int, int); diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 071df5b5b49..681ec0d8379 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -389,7 +389,7 @@ static int __init init_squashfs_fs(void) return err; } - printk(KERN_INFO "squashfs: version 4.0 (2009/01/03) " + printk(KERN_INFO "squashfs: version 4.0 (2009/01/31) " "Phillip Lougher\n"); return 0; -- cgit v1.2.3 From f4f8056a862a9950320429dfda708c88b4ce6025 Mon Sep 17 00:00:00 2001 From: Roel Kluin Date: Thu, 5 Mar 2009 00:55:31 +0000 Subject: Squashfs: frag_size should be signed, as it can hold an error result Signed-off-by: Roel Kluin Signed-off-by: Phillip Lougher --- fs/squashfs/inode.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/inode.c b/fs/squashfs/inode.c index 7a63398bb85..9101dbde39e 100644 --- a/fs/squashfs/inode.c +++ b/fs/squashfs/inode.c @@ -133,7 +133,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) type = le16_to_cpu(sqshb_ino->inode_type); switch (type) { case SQUASHFS_REG_TYPE: { - unsigned int frag_offset, frag_size, frag; + unsigned int frag_offset, frag; + int frag_size; u64 frag_blk; struct squashfs_reg_inode *sqsh_ino = &squashfs_ino.reg; @@ -175,7 +176,8 @@ int squashfs_read_inode(struct inode *inode, long long ino) break; } case SQUASHFS_LREG_TYPE: { - unsigned int frag_offset, frag_size, frag; + unsigned int frag_offset, frag; + int frag_size; u64 frag_blk; struct squashfs_lreg_inode *sqsh_ino = &squashfs_ino.lreg; -- cgit v1.2.3 From ff392c497b43ddedbab5627b53928a654cc5486e Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:36 -0500 Subject: xfs: prevent kernel crash due to corrupted inode log format Andras Korn reported an oops on log replay causes by a corrupted xfs_inode_log_format_t passing a 0 size to kmem_zalloc. This patch handles to small or too large numbers of log regions gracefully by rejecting the log replay with a useful error message. Signed-off-by: Christoph Hellwig Reported-by: Andras Korn Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/xfs_log_recover.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b1047de2fff..61af610d79b 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -1455,10 +1455,19 @@ xlog_recover_add_to_trans( item = item->ri_prev; if (item->ri_total == 0) { /* first region to be added */ - item->ri_total = in_f->ilf_size; - ASSERT(item->ri_total <= XLOG_MAX_REGIONS_IN_ITEM); - item->ri_buf = kmem_zalloc((item->ri_total * - sizeof(xfs_log_iovec_t)), KM_SLEEP); + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xlog_warn( + "XFS: bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + return XFS_ERROR(EIO); + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); } ASSERT(item->ri_total > item->ri_cnt); /* Description region is ri_buf[0] */ -- cgit v1.2.3 From 7d46be4a25fdfb503c20bad60a618adebfe2ac5c Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:35 -0500 Subject: xfs: prevent lockdep false positive in xfs_iget_cache_miss The inode can't be locked by anyone else as we just created it a few lines above and it's not been added to any lookup data structure yet. So use a trylock that must succeed to get around the lockdep warnings. Signed-off-by: Christoph Hellwig Reported-by: Alexander Beregalov Reviewed-by: Eric Sandeen Reviewed-by: Felix Blyakher Signed-off-by: Felix Blyakher --- fs/xfs/xfs_iget.c | 15 ++++++++++----- 1 file changed, 10 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index e2fb6210d4c..478e587087f 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -246,9 +246,6 @@ xfs_iget_cache_miss( goto out_destroy; } - if (lock_flags) - xfs_ilock(ip, lock_flags); - /* * Preload the radix tree so we can insert safely under the * write spinlock. Note that we cannot sleep inside the preload @@ -256,7 +253,16 @@ xfs_iget_cache_miss( */ if (radix_tree_preload(GFP_KERNEL)) { error = EAGAIN; - goto out_unlock; + goto out_destroy; + } + + /* + * Because the inode hasn't been added to the radix-tree yet it can't + * be found by another thread, so we can do the non-sleeping lock here. + */ + if (lock_flags) { + if (!xfs_ilock_nowait(ip, lock_flags)) + BUG(); } mask = ~(((XFS_INODE_CLUSTER_SIZE(mp) >> mp->m_sb.sb_inodelog)) - 1); @@ -284,7 +290,6 @@ xfs_iget_cache_miss( out_preload_end: write_unlock(&pag->pag_ici_lock); radix_tree_preload_end(); -out_unlock: if (lock_flags) xfs_iunlock(ip, lock_flags); out_destroy: -- cgit v1.2.3 From c141b2928fe20396a9ecdec85526e4b66ae96c90 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 3 Mar 2009 14:48:37 -0500 Subject: xfs: only issues a cache flush on unmount if barriers are enabled Currently we unconditionally issue a flush from xfs_free_buftarg, but since 2.6.29-rc1 this gives a warning in the style of end_request: I/O error, dev vdb, sector 0 Signed-off-by: Christoph Hellwig Reviewed-by: Eric Sandeen Signed-off-by: Felix Blyakher --- fs/xfs/linux-2.6/xfs_buf.c | 12 ++++++++++-- fs/xfs/linux-2.6/xfs_buf.h | 2 +- fs/xfs/linux-2.6/xfs_super.c | 10 +++++----- 3 files changed, 16 insertions(+), 8 deletions(-) (limited to 'fs') diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index cb329edc925..aa1016bb913 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -34,6 +34,12 @@ #include #include +#include "xfs_sb.h" +#include "xfs_inum.h" +#include "xfs_ag.h" +#include "xfs_dmapi.h" +#include "xfs_mount.h" + static kmem_zone_t *xfs_buf_zone; STATIC int xfsbufd(void *); STATIC int xfsbufd_wakeup(int, gfp_t); @@ -1435,10 +1441,12 @@ xfs_unregister_buftarg( void xfs_free_buftarg( - xfs_buftarg_t *btp) + struct xfs_mount *mp, + struct xfs_buftarg *btp) { xfs_flush_buftarg(btp, 1); - xfs_blkdev_issue_flush(btp); + if (mp->m_flags & XFS_MOUNT_BARRIER) + xfs_blkdev_issue_flush(btp); xfs_free_bufhash(btp); iput(btp->bt_mapping->host); diff --git a/fs/xfs/linux-2.6/xfs_buf.h b/fs/xfs/linux-2.6/xfs_buf.h index 288ae7c4c80..9b4d666ad31 100644 --- a/fs/xfs/linux-2.6/xfs_buf.h +++ b/fs/xfs/linux-2.6/xfs_buf.h @@ -413,7 +413,7 @@ static inline int XFS_bwrite(xfs_buf_t *bp) * Handling of buftargs. */ extern xfs_buftarg_t *xfs_alloc_buftarg(struct block_device *, int); -extern void xfs_free_buftarg(xfs_buftarg_t *); +extern void xfs_free_buftarg(struct xfs_mount *, struct xfs_buftarg *); extern void xfs_wait_buftarg(xfs_buftarg_t *); extern int xfs_setsize_buftarg(xfs_buftarg_t *, unsigned int, unsigned int); extern int xfs_flush_buftarg(xfs_buftarg_t *, int); diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index c71e226da7f..32ae5028e96 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -734,15 +734,15 @@ xfs_close_devices( { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) { struct block_device *logdev = mp->m_logdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_logdev_targp); + xfs_free_buftarg(mp, mp->m_logdev_targp); xfs_blkdev_put(logdev); } if (mp->m_rtdev_targp) { struct block_device *rtdev = mp->m_rtdev_targp->bt_bdev; - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); xfs_blkdev_put(rtdev); } - xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); } /* @@ -811,9 +811,9 @@ xfs_open_devices( out_free_rtdev_targ: if (mp->m_rtdev_targp) - xfs_free_buftarg(mp->m_rtdev_targp); + xfs_free_buftarg(mp, mp->m_rtdev_targp); out_free_ddev_targ: - xfs_free_buftarg(mp->m_ddev_targp); + xfs_free_buftarg(mp, mp->m_ddev_targp); out_close_rtdev: if (rtdev) xfs_blkdev_put(rtdev); -- cgit v1.2.3 From b9447ef80bd301b932ac4d85c9622e929de5fd62 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Mon, 9 Mar 2009 11:45:38 -0400 Subject: Btrfs: fix spinlock assertions on UP systems btrfs_tree_locked was being used to make sure a given extent_buffer was properly locked in a few places. But, it wasn't correct for UP compiled kernels. This switches it to using assert_spin_locked instead, and renames it to btrfs_assert_tree_locked to better reflect how it was really being used. Signed-off-by: Chris Mason --- fs/btrfs/ctree.c | 10 +++++----- fs/btrfs/disk-io.c | 4 ++-- fs/btrfs/extent-tree.c | 4 ++-- fs/btrfs/locking.c | 6 +++--- fs/btrfs/locking.h | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 42491d728e9..37f31b5529a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -277,7 +277,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (*cow_ret == buf) unlock_orig = 1; - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); if (parent) parent_start = parent->start; @@ -2365,7 +2365,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (slot >= btrfs_header_nritems(upper) - 1) return 1; - WARN_ON(!btrfs_tree_locked(path->nodes[1])); + btrfs_assert_tree_locked(path->nodes[1]); right = read_node_slot(root, upper, slot + 1); btrfs_tree_lock(right); @@ -2562,7 +2562,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (right_nritems == 0) return 1; - WARN_ON(!btrfs_tree_locked(path->nodes[1])); + btrfs_assert_tree_locked(path->nodes[1]); left = read_node_slot(root, path->nodes[1], slot - 1); btrfs_tree_lock(left); @@ -4101,7 +4101,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) next = read_node_slot(root, c, slot); if (!path->skip_locking) { - WARN_ON(!btrfs_tree_locked(c)); + btrfs_assert_tree_locked(c); btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } @@ -4126,7 +4126,7 @@ int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path) reada_for_search(root, path, level, slot, 0); next = read_node_slot(root, next, 0); if (!path->skip_locking) { - WARN_ON(!btrfs_tree_locked(path->nodes[level])); + btrfs_assert_tree_locked(path->nodes[level]); btrfs_tree_lock(next); btrfs_set_lock_blocking(next); } diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index adda739a021..3e18175248e 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -857,7 +857,7 @@ int clean_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *btree_inode = root->fs_info->btree_inode; if (btrfs_header_generation(buf) == root->fs_info->running_transaction->transid) { - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); /* ugh, clear_extent_buffer_dirty can be expensive */ btrfs_set_lock_blocking(buf); @@ -2361,7 +2361,7 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf) btrfs_set_lock_blocking(buf); - WARN_ON(!btrfs_tree_locked(buf)); + btrfs_assert_tree_locked(buf); if (transid != root->fs_info->generation) { printk(KERN_CRIT "btrfs transid mismatch buffer %llu, " "found %llu running %llu\n", diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 6b5966aacf4..9abf81f71c4 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4418,13 +4418,13 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); BUG_ON(!path); - BUG_ON(!btrfs_tree_locked(parent)); + btrfs_assert_tree_locked(parent); parent_level = btrfs_header_level(parent); extent_buffer_get(parent); path->nodes[parent_level] = parent; path->slots[parent_level] = btrfs_header_nritems(parent); - BUG_ON(!btrfs_tree_locked(node)); + btrfs_assert_tree_locked(node); level = btrfs_header_level(node); extent_buffer_get(node); path->nodes[level] = node; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 85506c4a3af..47b0a88c12a 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -220,8 +220,8 @@ int btrfs_tree_unlock(struct extent_buffer *eb) return 0; } -int btrfs_tree_locked(struct extent_buffer *eb) +void btrfs_assert_tree_locked(struct extent_buffer *eb) { - return test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags) || - spin_is_locked(&eb->lock); + if (!test_bit(EXTENT_BUFFER_BLOCKING, &eb->bflags)) + assert_spin_locked(&eb->lock); } diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 6bb0afbff92..6c4ce457168 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -21,11 +21,11 @@ int btrfs_tree_lock(struct extent_buffer *eb); int btrfs_tree_unlock(struct extent_buffer *eb); -int btrfs_tree_locked(struct extent_buffer *eb); int btrfs_try_tree_lock(struct extent_buffer *eb); int btrfs_try_spin_lock(struct extent_buffer *eb); void btrfs_set_lock_blocking(struct extent_buffer *eb); void btrfs_clear_lock_blocking(struct extent_buffer *eb); +void btrfs_assert_tree_locked(struct extent_buffer *eb); #endif -- cgit v1.2.3 From 4184ea7f908d95f329febc3665cf66da8568b467 Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 10 Mar 2009 12:39:20 -0400 Subject: Btrfs: Fix locking around adding new space_info Storage allocated to different raid levels in btrfs is tracked by a btrfs_space_info structure, and all of the current space_infos are collected into a list_head. Most filesystems have 3 or 4 of these structs total, and the list is only changed when new raid levels are added or at unmount time. This commit adds rcu locking on the list head, and properly frees things at unmount time. It also clears the space_info->full flag whenever new space is added to the FS. The locking for the space info list goes like this: reads: protected by rcu_read_lock() writes: protected by the chunk_mutex At unmount time we don't need special locking because all the readers are gone. Signed-off-by: Chris Mason --- fs/btrfs/ctree.h | 9 +++++++++ fs/btrfs/extent-tree.c | 45 ++++++++++++++++++++++++++++++++++++++++++--- fs/btrfs/volumes.c | 2 ++ 3 files changed, 53 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 82491ba8fa4..5e1d4e30e9d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -784,7 +784,14 @@ struct btrfs_fs_info { struct list_head dirty_cowonly_roots; struct btrfs_fs_devices *fs_devices; + + /* + * the space_info list is almost entirely read only. It only changes + * when we add a new raid type to the FS, and that happens + * very rarely. RCU is used to protect it. + */ struct list_head space_info; + spinlock_t delalloc_lock; spinlock_t new_trans_lock; u64 delalloc_bytes; @@ -1797,6 +1804,8 @@ int btrfs_cleanup_reloc_trees(struct btrfs_root *root); int btrfs_reloc_clone_csums(struct inode *inode, u64 file_pos, u64 len); u64 btrfs_reduce_alloc_profile(struct btrfs_root *root, u64 flags); void btrfs_set_inode_space_info(struct btrfs_root *root, struct inode *ionde); +void btrfs_clear_space_info_full(struct btrfs_fs_info *info); + int btrfs_check_metadata_free_space(struct btrfs_root *root); int btrfs_check_data_free_space(struct btrfs_root *root, struct inode *inode, u64 bytes); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 9abf81f71c4..fefe83ad205 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -20,6 +20,7 @@ #include #include #include +#include #include "compat.h" #include "hash.h" #include "crc32c.h" @@ -330,13 +331,33 @@ static struct btrfs_space_info *__find_space_info(struct btrfs_fs_info *info, { struct list_head *head = &info->space_info; struct btrfs_space_info *found; - list_for_each_entry(found, head, list) { - if (found->flags == flags) + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) { + if (found->flags == flags) { + rcu_read_unlock(); return found; + } } + rcu_read_unlock(); return NULL; } +/* + * after adding space to the filesystem, we need to clear the full flags + * on all the space infos. + */ +void btrfs_clear_space_info_full(struct btrfs_fs_info *info) +{ + struct list_head *head = &info->space_info; + struct btrfs_space_info *found; + + rcu_read_lock(); + list_for_each_entry_rcu(found, head, list) + found->full = 0; + rcu_read_unlock(); +} + static u64 div_factor(u64 num, int factor) { if (factor == 10) @@ -1903,7 +1924,6 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - list_add(&found->list, &info->space_info); INIT_LIST_HEAD(&found->block_groups); init_rwsem(&found->groups_sem); spin_lock_init(&found->lock); @@ -1917,6 +1937,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, found->full = 0; found->force_alloc = 0; *space_info = found; + list_add_rcu(&found->list, &info->space_info); return 0; } @@ -6320,6 +6341,7 @@ out: int btrfs_free_block_groups(struct btrfs_fs_info *info) { struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; struct rb_node *n; spin_lock(&info->block_group_cache_lock); @@ -6341,6 +6363,23 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) spin_lock(&info->block_group_cache_lock); } spin_unlock(&info->block_group_cache_lock); + + /* now that all the block groups are freed, go through and + * free all the space_info structs. This is only called during + * the final stages of unmount, and so we know nobody is + * using them. We call synchronize_rcu() once before we start, + * just to be on the safe side. + */ + synchronize_rcu(); + + while(!list_empty(&info->space_info)) { + space_info = list_entry(info->space_info.next, + struct btrfs_space_info, + list); + + list_del(&space_info->list); + kfree(space_info); + } return 0; } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1316139bf9e..7aa3810d7f6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1459,6 +1459,8 @@ static int __btrfs_grow_device(struct btrfs_trans_handle *trans, device->fs_devices->total_rw_bytes += diff; device->total_bytes = new_size; + btrfs_clear_space_info_full(device->dev_root->fs_info); + return btrfs_update_device(trans, device); } -- cgit v1.2.3 From 913d952eb573c3d1f7487e83b5590e13e7cae2bd Mon Sep 17 00:00:00 2001 From: Chris Mason Date: Tue, 10 Mar 2009 13:17:18 -0400 Subject: Btrfs: Clear space_info full when adding new devices The full flag on the space info structs tells the allocator not to try and allocate more chunks because the devices in the FS are fully allocated. When more devices are added, we need to clear the full flag so the allocator knows it has more space available. Signed-off-by: Chris Mason --- fs/btrfs/volumes.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'fs') diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7aa3810d7f6..dd06e18e5aa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1374,6 +1374,12 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) ret = btrfs_add_device(trans, root, device); } + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + unlock_chunks(root); btrfs_commit_transaction(trans, root); -- cgit v1.2.3 From 395a87bfefbc400011417e9eaae33169f9f036c0 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Tue, 10 Mar 2009 18:18:47 -0400 Subject: ext4: fix header check in ext4_ext_search_right() for deep extent trees. The ext4_ext_search_right() function is confusing; it uses a "depth" variable which is 0 at the root and maximum at the leaves, but the on-disk metadata uses a "depth" (actually eh_depth) which is opposite: maximum at the root, and 0 at the leaves. The ext4_ext_check_header() function is given a depth and checks the header agaisnt that depth; it expects the on-disk semantics, but we are giving it the opposite in the while loop in this function. We should be giving it the on-disk notion of "depth" which we can get from (p_depth - depth) - and if you look, the last (more commonly hit) call to ext4_ext_check_header() does just this. Sending in the wrong depth results in (incorrect) messages about corruption: EXT4-fs error (device sdb1): ext4_ext_search_right: bad header in inode #2621457: unexpected eh_depth - magic f30a, entries 340, max 340(0), depth 1(2) http://bugzilla.kernel.org/show_bug.cgi?id=12821 Reported-by: David Dindorp Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/extents.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index e2eab196875..e0aa4fe4f59 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -1122,7 +1122,8 @@ ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, struct ext4_extent_idx *ix; struct ext4_extent *ex; ext4_fsblk_t block; - int depth, ee_len; + int depth; /* Note, NOT eh_depth; depth from top of tree */ + int ee_len; BUG_ON(path == NULL); depth = path->p_depth; @@ -1179,7 +1180,8 @@ got_index: if (bh == NULL) return -EIO; eh = ext_block_hdr(bh); - if (ext4_ext_check_header(inode, eh, depth)) { + /* subtract from p_depth to get proper eh_depth */ + if (ext4_ext_check_header(inode, eh, path->p_depth - depth)) { put_bh(bh); return -EIO; } -- cgit v1.2.3 From 260219cc48cfb22486e5d0d706c978228a080d63 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Tue, 10 Mar 2009 12:55:51 -0700 Subject: devpts: remove graffiti Very annoying when working with containters. Signed-off-by: Alexey Dobriyan Cc: Alan Cox Cc: "H. Peter Anvin" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/devpts/inode.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 5f3231b9633..bff4052b05e 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -198,9 +198,6 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; - - printk(KERN_DEBUG "Created ptmx node in devpts ino %lu\n", - inode->i_ino); out: mutex_unlock(&root->d_inode->i_mutex); return rc; @@ -369,8 +366,6 @@ static int new_pts_mount(struct file_system_type *fs_type, int flags, struct pts_fs_info *fsi; struct pts_mount_opts *opts; - printk(KERN_NOTICE "devpts: newinstance mount\n"); - err = get_sb_nodev(fs_type, flags, data, devpts_fill_super, mnt); if (err) return err; -- cgit v1.2.3 From ef95d31e6de6be9602ce950b85fb7ab8af46ae42 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:17 -0400 Subject: NFS: Fix misparsing of nfsv4 fs_locations attribute (take 2) The changeset ea31a4437c59219bf3ea946d58984b01a45a289c (nfs: Fix misparsing of nfsv4 fs_locations attribute) causes the mountpath that is calculated at the beginning of try_location() to be clobbered when we later strncpy a non-nul terminated hostname using an incorrect buffer length. Signed-off-by: Trond Myklebust --- fs/nfs/nfs4namespace.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c index 30befc39b3c..2a2a0a7143a 100644 --- a/fs/nfs/nfs4namespace.c +++ b/fs/nfs/nfs4namespace.c @@ -21,7 +21,9 @@ #define NFSDBG_FACILITY NFSDBG_VFS /* - * Check if fs_root is valid + * Convert the NFSv4 pathname components into a standard posix path. + * + * Note that the resulting string will be placed at the end of the buffer */ static inline char *nfs4_pathname_string(const struct nfs4_pathname *pathname, char *buffer, ssize_t buflen) @@ -99,21 +101,20 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, { struct vfsmount *mnt = ERR_PTR(-ENOENT); char *mnt_path; - int page2len; + unsigned int maxbuflen; unsigned int s; mnt_path = nfs4_pathname_string(&location->rootpath, page2, PAGE_SIZE); if (IS_ERR(mnt_path)) return mnt; mountdata->mnt_path = mnt_path; - page2 += strlen(mnt_path) + 1; - page2len = PAGE_SIZE - strlen(mnt_path) - 1; + maxbuflen = mnt_path - 1 - page2; for (s = 0; s < location->nservers; s++) { const struct nfs4_string *buf = &location->servers[s]; struct sockaddr_storage addr; - if (buf->len <= 0 || buf->len >= PAGE_SIZE) + if (buf->len <= 0 || buf->len >= maxbuflen) continue; mountdata->addr = (struct sockaddr *)&addr; @@ -126,8 +127,8 @@ static struct vfsmount *try_location(struct nfs_clone_mount *mountdata, continue; nfs_set_port(mountdata->addr, NFS_PORT); - strncpy(page2, buf->data, page2len); - page2[page2len] = '\0'; + memcpy(page2, buf->data, buf->len); + page2[buf->len] = '\0'; mountdata->hostname = page2; snprintf(page, PAGE_SIZE, "%s:%s", -- cgit v1.2.3 From ae46141ff08f1965b17c531b571953c39ce8b9e2 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Tue, 10 Mar 2009 20:33:18 -0400 Subject: NFSv3: Fix posix ACL code Fix a memory leak due to allocation in the XDR layer. In cases where the RPC call needs to be retransmitted, we end up allocating new pages without clearing the old ones. Fix this by moving the allocation into nfs3_proc_setacls(). Also fix an issue discovered by Kevin Rudd, whereby the amount of memory reserved for the acls in the xdr_buf->head was miscalculated, and causing corruption. Signed-off-by: Trond Myklebust --- fs/nfs/nfs3acl.c | 27 +++++++++++++++++++++------ fs/nfs/nfs3xdr.c | 34 +++++++++++++--------------------- 2 files changed, 34 insertions(+), 27 deletions(-) (limited to 'fs') diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index cef62557c87..6bbf0e6daad 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -292,7 +292,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, { struct nfs_server *server = NFS_SERVER(inode); struct nfs_fattr fattr; - struct page *pages[NFSACL_MAXPAGES] = { }; + struct page *pages[NFSACL_MAXPAGES]; struct nfs3_setaclargs args = { .inode = inode, .mask = NFS_ACL, @@ -303,7 +303,7 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, .rpc_argp = &args, .rpc_resp = &fattr, }; - int status, count; + int status; status = -EOPNOTSUPP; if (!nfs_server_capable(inode, NFS_CAP_ACLS)) @@ -319,6 +319,20 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, if (S_ISDIR(inode->i_mode)) { args.mask |= NFS_DFACL; args.acl_default = dfacl; + args.len = nfsacl_size(acl, dfacl); + } else + args.len = nfsacl_size(acl, NULL); + + if (args.len > NFS_ACL_INLINE_BUFSIZE) { + unsigned int npages = 1 + ((args.len - 1) >> PAGE_SHIFT); + + status = -ENOMEM; + do { + args.pages[args.npages] = alloc_page(GFP_KERNEL); + if (args.pages[args.npages] == NULL) + goto out_freepages; + args.npages++; + } while (args.npages < npages); } dprintk("NFS call setacl\n"); @@ -329,10 +343,6 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, nfs_zap_acl_cache(inode); dprintk("NFS reply setacl: %d\n", status); - /* pages may have been allocated at the xdr layer. */ - for (count = 0; count < NFSACL_MAXPAGES && args.pages[count]; count++) - __free_page(args.pages[count]); - switch (status) { case 0: status = nfs_refresh_inode(inode, &fattr); @@ -346,6 +356,11 @@ static int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, case -ENOTSUPP: status = -EOPNOTSUPP; } +out_freepages: + while (args.npages != 0) { + args.npages--; + __free_page(args.pages[args.npages]); + } out: return status; } diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 11cdddec143..6cdeacffde4 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -82,8 +82,10 @@ #define NFS3_commitres_sz (1+NFS3_wcc_data_sz+2) #define ACL3_getaclargs_sz (NFS3_fh_sz+1) -#define ACL3_setaclargs_sz (NFS3_fh_sz+1+2*(2+5*3)) -#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+2*(2+5*3)) +#define ACL3_setaclargs_sz (NFS3_fh_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) +#define ACL3_getaclres_sz (1+NFS3_post_op_attr_sz+1+ \ + XDR_QUADLEN(NFS_ACL_INLINE_BUFSIZE)) #define ACL3_setaclres_sz (1+NFS3_post_op_attr_sz) /* @@ -703,28 +705,18 @@ nfs3_xdr_setaclargs(struct rpc_rqst *req, __be32 *p, struct nfs3_setaclargs *args) { struct xdr_buf *buf = &req->rq_snd_buf; - unsigned int base, len_in_head, len = nfsacl_size( - (args->mask & NFS_ACL) ? args->acl_access : NULL, - (args->mask & NFS_DFACL) ? args->acl_default : NULL); - int count, err; + unsigned int base; + int err; p = xdr_encode_fhandle(p, NFS_FH(args->inode)); *p++ = htonl(args->mask); - base = (char *)p - (char *)buf->head->iov_base; - /* put as much of the acls into head as possible. */ - len_in_head = min_t(unsigned int, buf->head->iov_len - base, len); - len -= len_in_head; - req->rq_slen = xdr_adjust_iovec(req->rq_svec, p + (len_in_head >> 2)); - - for (count = 0; (count << PAGE_SHIFT) < len; count++) { - args->pages[count] = alloc_page(GFP_KERNEL); - if (!args->pages[count]) { - while (count) - __free_page(args->pages[--count]); - return -ENOMEM; - } - } - xdr_encode_pages(buf, args->pages, 0, len); + req->rq_slen = xdr_adjust_iovec(req->rq_svec, p); + base = req->rq_slen; + + if (args->npages != 0) + xdr_encode_pages(buf, args->pages, 0, args->len); + else + req->rq_slen += args->len; err = nfsacl_encode(buf, base, args->inode, (args->mask & NFS_ACL) ? -- cgit v1.2.3 From 57df675c60c5cf0748ddba9c7f85afde1530d74d Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Tue, 10 Mar 2009 20:33:20 -0400 Subject: NLM: Fix GRANT callback address comparison when IPv6 is enabled The NFS mount command may pass an AF_INET server address to lockd. If lockd happens to be using a PF_INET6 listener, the nlm_cmp_addr() in nlmclnt_grant() will fail to match requests from that host because they will all have a mapped IPv4 AF_INET6 address. Adopt the same solution used in nfs_sockaddr_match_ipaddr() for NFSv4 callbacks: if either address is AF_INET, map it to an AF_INET6 address before doing the comparison. Signed-off-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/lockd/clntlock.c | 51 ++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 50 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/lockd/clntlock.c b/fs/lockd/clntlock.c index 1f3b0fc0d35..aedc47a264c 100644 --- a/fs/lockd/clntlock.c +++ b/fs/lockd/clntlock.c @@ -139,6 +139,55 @@ int nlmclnt_block(struct nlm_wait *block, struct nlm_rqst *req, long timeout) return 0; } +#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) +static const struct in6_addr *nlmclnt_map_v4addr(const struct sockaddr *sap, + struct in6_addr *addr_mapped) +{ + const struct sockaddr_in *sin = (const struct sockaddr_in *)sap; + + switch (sap->sa_family) { + case AF_INET6: + return &((const struct sockaddr_in6 *)sap)->sin6_addr; + case AF_INET: + ipv6_addr_set_v4mapped(sin->sin_addr.s_addr, addr_mapped); + return addr_mapped; + } + + return NULL; +} + +/* + * If lockd is using a PF_INET6 listener, all incoming requests appear + * to come from AF_INET6 remotes. The address of AF_INET remotes are + * mapped to AF_INET6 automatically by the network layer. In case the + * user passed an AF_INET server address at mount time, ensure both + * addresses are AF_INET6 before comparing them. + */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + const struct in6_addr *addr1; + const struct in6_addr *addr2; + struct in6_addr addr1_mapped; + struct in6_addr addr2_mapped; + + addr1 = nlmclnt_map_v4addr(nlm_addr(host), &addr1_mapped); + if (likely(addr1 != NULL)) { + addr2 = nlmclnt_map_v4addr(sap, &addr2_mapped); + if (likely(addr2 != NULL)) + return ipv6_addr_equal(addr1, addr2); + } + + return 0; +} +#else /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ +static int nlmclnt_cmp_addr(const struct nlm_host *host, + const struct sockaddr *sap) +{ + return nlm_cmp_addr(nlm_addr(host), sap); +} +#endif /* !(CONFIG_IPV6 || CONFIG_IPV6_MODULE) */ + /* * The server lockd has called us back to tell us the lock was granted */ @@ -166,7 +215,7 @@ __be32 nlmclnt_grant(const struct sockaddr *addr, const struct nlm_lock *lock) */ if (fl_blocked->fl_u.nfs_fl.owner->pid != lock->svid) continue; - if (!nlm_cmp_addr(nlm_addr(block->b_host), addr)) + if (!nlmclnt_cmp_addr(block->b_host, addr)) continue; if (nfs_compare_fh(NFS_FH(fl_blocked->fl_file->f_path.dentry->d_inode) ,fh) != 0) continue; -- cgit v1.2.3 From a71ee337b31271e701f689d544b6153b75609bc5 Mon Sep 17 00:00:00 2001 From: Suresh Jayaraman Date: Tue, 10 Mar 2009 20:33:21 -0400 Subject: NFS: Handle -ESTALE error in access() Hi Trond, I have been looking at a bugreport where trying to open applications on KDE on a NFS mounted home fails temporarily. There have been multiple reports on different kernel versions pointing to this common issue: http://bugzilla.kernel.org/show_bug.cgi?id=12557 https://bugs.launchpad.net/ubuntu/+source/linux/+bug/269954 http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=508866.html This issue can be reproducible consistently by doing this on a NFS mounted home (KDE): 1. Open 2 xterm sessions 2. From one of the xterm session, do "ssh -X " 3. "stat ~/.Xauthority" on the remote SSH session 4. Close the two xterm sessions 5. On the server do a "stat ~/.Xauthority" 6. Now on the client, try to open xterm This will fail. Even if the filehandle had become stale, the NFS client should invalidate the cache/inode and should repeat LOOKUP. Looking at the packet capture when the failure occurs shows that there were two subsequent ACCESS() calls with the same filehandle and both fails with -ESTALE error. I have tested the fix below. Now the client issue a LOOKUP after the ACCESS() call fails with -ESTALE. If all this makes sense to you, can you consider this for inclusion? Thanks, If the server returns an -ESTALE error due to stale filehandle in response to an ACCESS() call, we need to invalidate the cache and inode so that LOOKUP() can be retried. Without this change, the nfs client retries ACCESS() with the same filehandle, fails again and could lead to temporary failure of applications running on nfs mounted home. Signed-off-by: Suresh Jayaraman Signed-off-by: Trond Myklebust --- fs/nfs/dir.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index e35c8199f82..672368f865c 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -1892,8 +1892,14 @@ static int nfs_do_access(struct inode *inode, struct rpc_cred *cred, int mask) cache.cred = cred; cache.jiffies = jiffies; status = NFS_PROTO(inode)->access(inode, &cache); - if (status != 0) + if (status != 0) { + if (status == -ESTALE) { + nfs_zap_caches(inode); + if (!S_ISDIR(inode->i_mode)) + set_bit(NFS_INO_STALE, &NFS_I(inode)->flags); + } return status; + } nfs_access_add_cache(inode, &cache); out: if ((mask & ~cache.mask & (MAY_READ | MAY_WRITE | MAY_EXEC)) == 0) -- cgit v1.2.3 From d7371c41b0cda782256b1df759df4e8d4724584c Mon Sep 17 00:00:00 2001 From: Ian Dall Date: Tue, 10 Mar 2009 20:33:22 -0400 Subject: Bug 11061, NFS mounts dropped Addresses: http://bugzilla.kernel.org/show_bug.cgi?id=11061 sockaddr structures can't be reliably compared using memcmp() because there are padding bytes in the structure which can't be guaranteed to be the same even when the sockaddr structures refer to the same socket. Instead compare all the relevant fields. In the case of IPv6 sin6_flowinfo is not compared because it only affects QoS and sin6_scope_id is only compared if the address is "link local" because "link local" addresses need only be unique to a specific link. Signed-off-by: Ian Dall Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 63 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 62 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 9b728f3565a..06654b831d1 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -272,6 +272,65 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, } #endif +/* + * Test if two ip4 socket addresses refer to the same socket, by + * comparing relevant fields. The padding bytes specifically, are + * not compared. + * + * The caller should ensure both socket addresses are AF_INET. + */ +static int nfs_sockaddr_cmp_ip4(const struct sockaddr_in * saddr1, + const struct sockaddr_in * saddr2) +{ + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) + return 0; + return saddr1->sin_port == saddr2->sin_port; +} + +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_cmp_ip6 (const struct sockaddr_in6 * saddr1, + const struct sockaddr_in6 * saddr2) +{ + if (!ipv6_addr_equal(&saddr1->sin6_addr, + &saddr1->sin6_addr)) + return 0; + if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + saddr1->sin6_scope_id != saddr2->sin6_scope_id) + return 0; + return saddr1->sin6_port == saddr2->sin6_port; +} + +/* + * Test if two socket addresses represent the same actual socket, + * by comparing (only) relevant fields. + */ +static int nfs_sockaddr_cmp(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + if (sa1->sa_family != sa2->sa_family) + return 0; + + switch (sa1->sa_family) { + case AF_INET: + return nfs_sockaddr_cmp_ip4((const struct sockaddr_in *) sa1, + (const struct sockaddr_in *) sa2); + case AF_INET6: + return nfs_sockaddr_cmp_ip6((const struct sockaddr_in6 *) sa1, + (const struct sockaddr_in6 *) sa2); + } + return 0; +} + /* * Find a client by IP address and protocol version * - returns NULL if no such client @@ -344,8 +403,10 @@ struct nfs_client *nfs_find_client_next(struct nfs_client *clp) static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *data) { struct nfs_client *clp; + const struct sockaddr *sap = data->addr; list_for_each_entry(clp, &nfs_client_list, cl_share_link) { + const struct sockaddr *clap = (struct sockaddr *)&clp->cl_addr; /* Don't match clients that failed to initialise properly */ if (clp->cl_cons_state < 0) continue; @@ -358,7 +419,7 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat continue; /* Match the full socket address */ - if (memcmp(&clp->cl_addr, data->addr, sizeof(clp->cl_addr)) != 0) + if (!nfs_sockaddr_cmp(sap, clap)) continue; atomic_inc(&clp->cl_count); -- cgit v1.2.3 From ad3bdefe877afb47480418fdb05ecd42842de65e Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Wed, 11 Mar 2009 09:00:04 +0800 Subject: proc: fix kflags to uflags copying in /proc/kpageflags Fix kpf_copy_bit(src,dst) to be kpf_copy_bit(dst,src) to match the actual call patterns, e.g. kpf_copy_bit(kflags, KPF_LOCKED, PG_locked). This misplacement of src/dst only affected reporting of PG_writeback, PG_reclaim and PG_buddy. For others kflags==uflags so not affected. Signed-off-by: Wu Fengguang Reviewed-by: KOSAKI Motohiro Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/proc/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/page.c b/fs/proc/page.c index 2d1345112a4..e9983837d08 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -80,7 +80,7 @@ static const struct file_operations proc_kpagecount_operations = { #define KPF_RECLAIM 9 #define KPF_BUDDY 10 -#define kpf_copy_bit(flags, srcpos, dstpos) (((flags >> srcpos) & 1) << dstpos) +#define kpf_copy_bit(flags, dstpos, srcpos) (((flags >> srcpos) & 1) << dstpos) static ssize_t kpageflags_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) -- cgit v1.2.3 From 3a95ea1155c5d44aa58dde2f64f0ddafe27fd1fb Mon Sep 17 00:00:00 2001 From: OGAWA Hirofumi Date: Thu, 12 Mar 2009 02:03:23 +0900 Subject: Fix _fat_bmap() locking On swapon() path, it has already i_mutex. So, this uses i_alloc_sem instead of it. Signed-off-by: OGAWA Hirofumi Reported-by: Laurent GUERBY Signed-off-by: Linus Torvalds --- fs/fat/inode.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/fat/inode.c b/fs/fat/inode.c index 6b74d09adbe..de0004fe6e0 100644 --- a/fs/fat/inode.c +++ b/fs/fat/inode.c @@ -202,9 +202,9 @@ static sector_t _fat_bmap(struct address_space *mapping, sector_t block) sector_t blocknr; /* fat_get_cluster() assumes the requested blocknr isn't truncated. */ - mutex_lock(&mapping->host->i_mutex); + down_read(&mapping->host->i_alloc_sem); blocknr = generic_block_bmap(mapping, block, fat_get_block); - mutex_unlock(&mapping->host->i_mutex); + up_read(&mapping->host->i_alloc_sem); return blocknr; } -- cgit v1.2.3 From 363911d027d1de1c6df79eb3f487f5476b9619f4 Mon Sep 17 00:00:00 2001 From: Phillip Lougher Date: Thu, 12 Mar 2009 03:23:48 +0000 Subject: Squashfs: Valid filesystems are flagged as bad by the corrupted fs patch The corrupted filesystem patch added a check against zlib trying to output too much data in the presence of data corruption. This check triggered if zlib_inflate asked to be called again (Z_OK) with avail_out == 0 and no more output buffers available. This check proves to be rather dumb, as it incorrectly catches the case where zlib has generated all the output, but there are still input bytes to be processed. This patch does a number of things. It removes the original check and replaces it with code to not move to the next output buffer if there are no more output buffers available, relying on zlib to error if it wants an extra output buffer in the case of data corruption. It also replaces the Z_NO_FLUSH flag with the more correct Z_SYNC_FLUSH flag, and makes the error messages more understandable to non-technical users. Signed-off-by: Phillip Lougher Reported-by: Stefan Lippers-Hollmann --- fs/squashfs/block.c | 21 ++++----------------- 1 file changed, 4 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 321728f48f2..2a796031034 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -184,15 +184,7 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, offset = 0; } - if (msblk->stream.avail_out == 0) { - if (page == pages) { - ERROR("zlib_inflate tried to " - "decompress too much data, " - "expected %d bytes. Zlib " - "data probably corrupt\n", - srclength); - goto release_mutex; - } + if (msblk->stream.avail_out == 0 && page < pages) { msblk->stream.next_out = buffer[page++]; msblk->stream.avail_out = PAGE_CACHE_SIZE; } @@ -209,25 +201,20 @@ int squashfs_read_data(struct super_block *sb, void **buffer, u64 index, zlib_init = 1; } - zlib_err = zlib_inflate(&msblk->stream, Z_NO_FLUSH); + zlib_err = zlib_inflate(&msblk->stream, Z_SYNC_FLUSH); if (msblk->stream.avail_in == 0 && k < b) put_bh(bh[k++]); } while (zlib_err == Z_OK); if (zlib_err != Z_STREAM_END) { - ERROR("zlib_inflate returned unexpected result" - " 0x%x, srclength %d, avail_in %d," - " avail_out %d\n", zlib_err, srclength, - msblk->stream.avail_in, - msblk->stream.avail_out); + ERROR("zlib_inflate error, data probably corrupt\n"); goto release_mutex; } zlib_err = zlib_inflateEnd(&msblk->stream); if (zlib_err != Z_OK) { - ERROR("zlib_inflateEnd returned unexpected result 0x%x," - " srclength %d\n", zlib_err, srclength); + ERROR("zlib_inflate error, data probably corrupt\n"); goto release_mutex; } length = msblk->stream.total_out; -- cgit v1.2.3 From 2842c3b5449f31470b61db716f1926b594fb6156 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Thu, 12 Mar 2009 12:20:01 -0400 Subject: ext4: Print the find_group_flex() warning only once This is a short-term warning, and even printk_ratelimit() can result in too much noise in system logs. So only print it once as a warning. Signed-off-by: "Theodore Ts'o" --- fs/ext4/ialloc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 627f8c3337a..2d2b3585ee9 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -698,6 +698,7 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) struct inode *ret; ext4_group_t i; int free = 0; + static int once = 1; ext4_group_t flex_group; /* Cannot create files in a deleted directory */ @@ -719,7 +720,8 @@ struct inode *ext4_new_inode(handle_t *handle, struct inode *dir, int mode) ret2 = find_group_flex(sb, dir, &group); if (ret2 == -1) { ret2 = find_group_other(sb, dir, &group); - if (ret2 == 0 && printk_ratelimit()) + if (ret2 == 0 && once) + once = 0; printk(KERN_NOTICE "ext4: find_group_flex " "failed, fallback succeeded dir %lu\n", dir->i_ino); -- cgit v1.2.3 From 9f4c899c0d90e1b51b6864834f3877b47c161a0e Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Thu, 12 Mar 2009 14:51:32 -0400 Subject: NFS: Fix the fix to Bugzilla #11061, when IPv6 isn't defined... Stephen Rothwell reports: Today's linux-next build (powerpc ppc64_defconfig) failed like this: fs/built-in.o: In function `.nfs_get_client': client.c:(.text+0x115010): undefined reference to `.__ipv6_addr_type' Fix by moving the IPV6 specific parts of commit d7371c41b0cda782256b1df759df4e8d4724584c ("Bug 11061, NFS mounts dropped") into the '#ifdef IPV6..." section. Also fix up a couple of formatting issues. Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 68 +++++++++++++++++++++++++++++++++------------------------ 1 file changed, 39 insertions(+), 29 deletions(-) (limited to 'fs') diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 06654b831d1..574158ae239 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -255,6 +255,32 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, } return 0; } + +/* + * Test if two ip6 socket addresses refer to the same socket by + * comparing relevant fields. The padding bytes specifically, are not + * compared. sin6_flowinfo is not compared because it only affects QoS + * and sin6_scope_id is only compared if the address is "link local" + * because "link local" addresses need only be unique to a specific + * link. Conversely, ordinary unicast addresses might have different + * sin6_scope_id. + * + * The caller should ensure both socket addresses are AF_INET6. + */ +static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1, + const struct sockaddr *sa2) +{ + const struct sockaddr_in6 *saddr1 = (const struct sockaddr_in6 *)sa1; + const struct sockaddr_in6 *saddr2 = (const struct sockaddr_in6 *)sa2; + + if (!ipv6_addr_equal(&saddr1->sin6_addr, + &saddr1->sin6_addr)) + return 0; + if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && + saddr1->sin6_scope_id != saddr2->sin6_scope_id) + return 0; + return saddr1->sin6_port == saddr2->sin6_port; +} #else static int nfs_sockaddr_match_ipaddr4(const struct sockaddr_in *sa1, const struct sockaddr_in *sa2) @@ -270,6 +296,12 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, return nfs_sockaddr_match_ipaddr4((const struct sockaddr_in *)sa1, (const struct sockaddr_in *)sa2); } + +static int nfs_sockaddr_cmp_ip6(const struct sockaddr * sa1, + const struct sockaddr * sa2) +{ + return 0; +} #endif /* @@ -279,37 +311,17 @@ static int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1, * * The caller should ensure both socket addresses are AF_INET. */ -static int nfs_sockaddr_cmp_ip4(const struct sockaddr_in * saddr1, - const struct sockaddr_in * saddr2) +static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1, + const struct sockaddr *sa2) { + const struct sockaddr_in *saddr1 = (const struct sockaddr_in *)sa1; + const struct sockaddr_in *saddr2 = (const struct sockaddr_in *)sa2; + if (saddr1->sin_addr.s_addr != saddr2->sin_addr.s_addr) return 0; return saddr1->sin_port == saddr2->sin_port; } -/* - * Test if two ip6 socket addresses refer to the same socket by - * comparing relevant fields. The padding bytes specifically, are not - * compared. sin6_flowinfo is not compared because it only affects QoS - * and sin6_scope_id is only compared if the address is "link local" - * because "link local" addresses need only be unique to a specific - * link. Conversely, ordinary unicast addresses might have different - * sin6_scope_id. - * - * The caller should ensure both socket addresses are AF_INET6. - */ -static int nfs_sockaddr_cmp_ip6 (const struct sockaddr_in6 * saddr1, - const struct sockaddr_in6 * saddr2) -{ - if (!ipv6_addr_equal(&saddr1->sin6_addr, - &saddr1->sin6_addr)) - return 0; - if (ipv6_addr_scope(&saddr1->sin6_addr) == IPV6_ADDR_SCOPE_LINKLOCAL && - saddr1->sin6_scope_id != saddr2->sin6_scope_id) - return 0; - return saddr1->sin6_port == saddr2->sin6_port; -} - /* * Test if two socket addresses represent the same actual socket, * by comparing (only) relevant fields. @@ -322,11 +334,9 @@ static int nfs_sockaddr_cmp(const struct sockaddr *sa1, switch (sa1->sa_family) { case AF_INET: - return nfs_sockaddr_cmp_ip4((const struct sockaddr_in *) sa1, - (const struct sockaddr_in *) sa2); + return nfs_sockaddr_cmp_ip4(sa1, sa2); case AF_INET6: - return nfs_sockaddr_cmp_ip6((const struct sockaddr_in6 *) sa1, - (const struct sockaddr_in6 *) sa2); + return nfs_sockaddr_cmp_ip6(sa1, sa2); } return 0; } -- cgit v1.2.3 From e5bc49ba7439b9726006d031d440cba96819f0f8 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Thu, 12 Mar 2009 14:31:28 -0700 Subject: pipe_rdwr_fasync: fix the error handling to prevent the leak/crash If the second fasync_helper() fails, pipe_rdwr_fasync() returns the error but leaves the file on ->fasync_readers. This was always wrong, but since 233e70f4228e78eb2f80dc6650f65d3ae3dbf17c "saner FASYNC handling on file close" we have the new problem. Because in this case setfl() doesn't set FASYNC bit, __fput() will not do ->fasync(0), and we leak fasync_struct with ->fa_file pointing to the freed file. Signed-off-by: Oleg Nesterov Cc: Al Viro Cc: Andi Kleen Cc: Jonathan Corbet Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/pipe.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/pipe.c b/fs/pipe.c index 3a48ba5179d..14f502b89cf 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -699,12 +699,12 @@ pipe_rdwr_fasync(int fd, struct file *filp, int on) int retval; mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); - - if (retval >= 0) + if (retval >= 0) { retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - + if (retval < 0) /* this can happen only if on == T */ + fasync_helper(-1, filp, 0, &pipe->fasync_readers); + } mutex_unlock(&inode->i_mutex); if (retval < 0) -- cgit v1.2.3 From a3cfbb53b1764a3d1f58ddc032737ab9edaa7d41 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Thu, 12 Mar 2009 14:31:29 -0700 Subject: vfs: add missing unlock in sget() In sget(), destroy_super(s) is called with s->s_umount held, which makes lockdep unhappy. Signed-off-by: Li Zefan Cc: Al Viro Acked-by: Peter Zijlstra Cc: Paul Menage Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/super.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/super.c b/fs/super.c index 8349ed6b141..6ce501447ad 100644 --- a/fs/super.c +++ b/fs/super.c @@ -371,8 +371,10 @@ retry: continue; if (!grab_super(old)) goto retry; - if (s) + if (s) { + up_write(&s->s_umount); destroy_super(s); + } return old; } } @@ -387,6 +389,7 @@ retry: err = set(s, data); if (err) { spin_unlock(&sb_lock); + up_write(&s->s_umount); destroy_super(s); return ERR_PTR(err); } -- cgit v1.2.3 From 7ef0d7377cb287e08f3ae94cebc919448e1f5dff Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Thu, 12 Mar 2009 14:31:38 -0700 Subject: fs: new inode i_state corruption fix There was a report of a data corruption http://lkml.org/lkml/2008/11/14/121. There is a script included to reproduce the problem. During testing, I encountered a number of strange things with ext3, so I tried ext2 to attempt to reduce complexity of the problem. I found that fsstress would quickly hang in wait_on_inode, waiting for I_LOCK to be cleared, even though instrumentation showed that unlock_new_inode had already been called for that inode. This points to memory scribble, or synchronisation problme. i_state of I_NEW inodes is not protected by inode_lock because other processes are not supposed to touch them until I_LOCK (and I_NEW) is cleared. Adding WARN_ON(inode->i_state & I_NEW) to sites where we modify i_state revealed that generic_sync_sb_inodes is picking up new inodes from the inode lists and passing them to __writeback_single_inode without waiting for I_NEW. Subsequently modifying i_state causes corruption. In my case it would look like this: CPU0 CPU1 unlock_new_inode() __sync_single_inode() reg <- inode->i_state reg -> reg & ~(I_LOCK|I_NEW) reg <- inode->i_state reg -> inode->i_state reg -> reg | I_SYNC reg -> inode->i_state Non-atomic RMW on CPU1 overwrites CPU0 store and sets I_LOCK|I_NEW again. Fix for this is rather than wait for I_NEW inodes, just skip over them: inodes concurrently being created are not subject to data integrity operations, and should not significantly contribute to dirty memory either. After this change, I'm unable to reproduce any of the added warnings or hangs after ~1hour of running. Previously, the new warnings would start immediately and hang would happen in under 5 minutes. I'm also testing on ext3 now, and so far no problems there either. I don't know whether this fixes the problem reported above, but it fixes a real problem for me. Cc: "Jorge Boncompte [DTI2]" Reported-by: Adrian Hunter Cc: Jan Kara Cc: Signed-off-by: Nick Piggin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fs-writeback.c | 9 ++++++++- fs/inode.c | 7 +++++++ 2 files changed, 15 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index e5eaa62fd17..e3fe9918faa 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -274,6 +274,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) int ret; BUG_ON(inode->i_state & I_SYNC); + WARN_ON(inode->i_state & I_NEW); /* Set I_SYNC, reset I_DIRTY */ dirty = inode->i_state & I_DIRTY; @@ -298,6 +299,7 @@ __sync_single_inode(struct inode *inode, struct writeback_control *wbc) } spin_lock(&inode_lock); + WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_SYNC; if (!(inode->i_state & I_FREEING)) { if (!(inode->i_state & I_DIRTY) && @@ -470,6 +472,11 @@ void generic_sync_sb_inodes(struct super_block *sb, break; } + if (inode->i_state & I_NEW) { + requeue_io(inode); + continue; + } + if (wbc->nonblocking && bdi_write_congested(bdi)) { wbc->encountered_congestion = 1; if (!sb_is_blkdev_sb(sb)) @@ -531,7 +538,7 @@ void generic_sync_sb_inodes(struct super_block *sb, list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { struct address_space *mapping; - if (inode->i_state & (I_FREEING|I_WILL_FREE)) + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) continue; mapping = inode->i_mapping; if (mapping->nrpages == 0) diff --git a/fs/inode.c b/fs/inode.c index 913ab2d9a5d..826fb0b9d1c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -359,6 +359,7 @@ static int invalidate_list(struct list_head *head, struct list_head *dispose) invalidate_inode_buffers(inode); if (!atomic_read(&inode->i_count)) { list_move(&inode->i_list, dispose); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; count++; continue; @@ -460,6 +461,7 @@ static void prune_icache(int nr_to_scan) continue; } list_move(&inode->i_list, &freeable); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; nr_pruned++; } @@ -656,6 +658,7 @@ void unlock_new_inode(struct inode *inode) * just created it (so there can be no old holders * that haven't tested I_LOCK). */ + WARN_ON((inode->i_state & (I_LOCK|I_NEW)) != (I_LOCK|I_NEW)); inode->i_state &= ~(I_LOCK|I_NEW); wake_up_inode(inode); } @@ -1145,6 +1148,7 @@ void generic_delete_inode(struct inode *inode) list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); @@ -1186,16 +1190,19 @@ static void generic_forget_inode(struct inode *inode) spin_unlock(&inode_lock); return; } + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_WILL_FREE; spin_unlock(&inode_lock); write_inode_now(inode, 1); spin_lock(&inode_lock); + WARN_ON(inode->i_state & I_NEW); inode->i_state &= ~I_WILL_FREE; inodes_stat.nr_unused--; hlist_del_init(&inode->i_hash); } list_del_init(&inode->i_list); list_del_init(&inode->i_sb_list); + WARN_ON(inode->i_state & I_NEW); inode->i_state |= I_FREEING; inodes_stat.nr_inodes--; spin_unlock(&inode_lock); -- cgit v1.2.3 From 6c9fd1dc0a597e575617a7de7086c8a3efa8f524 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Fri, 6 Mar 2009 10:19:30 +0800 Subject: ocfs2: reserve xattr block for new directory with inline data If this is a new directory with inline data, we choose to reserve the entire inline area for directory contents and force an external xattr block. Signed-off-by: Tiger Yang Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 4ddd788add6..c63efb5ef13 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -547,8 +547,12 @@ int ocfs2_calc_xattr_init(struct inode *dir, * when blocksize = 512, may reserve one more cluser for * xattr bucket, otherwise reserve one metadata block * for them is ok. + * If this is a new directory with inline data, + * we choose to reserve the entire inline area for + * directory contents and force an external xattr block. */ if (dir->i_sb->s_blocksize == OCFS2_MIN_BLOCKSIZE || + (S_ISDIR(mode) && ocfs2_supports_inline_data(osb)) || (s_size + a_size) > OCFS2_XATTR_FREE_IN_IBODY) { ret = ocfs2_reserve_new_metadata_blocks(osb, 1, xattr_ac); if (ret) { -- cgit v1.2.3 From d9ae49d6e2b1ac9166e58ae3c9345135604beaa6 Mon Sep 17 00:00:00 2001 From: Tiger Yang Date: Thu, 5 Mar 2009 11:06:15 +0800 Subject: ocfs2: tweak to get the maximum inline data size with xattr Replace max_inline_data with max_inline_data_with_xattr to ensure it correct when xattr inlined. Signed-off-by: Tiger Yang Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/aops.c | 7 +++++-- fs/ocfs2/namei.c | 3 ++- fs/ocfs2/ocfs2_fs.h | 6 ------ 3 files changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index a067a6cffb0..8e1709a679b 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -227,7 +227,7 @@ int ocfs2_read_inline_data(struct inode *inode, struct page *page, size = i_size_read(inode); if (size > PAGE_CACHE_SIZE || - size > ocfs2_max_inline_data(inode->i_sb)) { + size > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) { ocfs2_error(inode->i_sb, "Inode %llu has with inline data has bad size: %Lu", (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -1555,6 +1555,7 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, int ret, written = 0; loff_t end = pos + len; struct ocfs2_inode_info *oi = OCFS2_I(inode); + struct ocfs2_dinode *di = NULL; mlog(0, "Inode %llu, write of %u bytes at off %llu. features: 0x%x\n", (unsigned long long)oi->ip_blkno, len, (unsigned long long)pos, @@ -1587,7 +1588,9 @@ static int ocfs2_try_to_write_inline_data(struct address_space *mapping, /* * Check whether the write can fit. */ - if (mmap_page || end > ocfs2_max_inline_data(inode->i_sb)) + di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + if (mmap_page || + end > ocfs2_max_inline_data_with_xattr(inode->i_sb, di)) return 0; do_inline_write: diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 084aba86c3b..4b11762f249 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -532,7 +532,8 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, fe->i_dyn_features = cpu_to_le16(feat | OCFS2_INLINE_DATA_FL); - fe->id2.i_data.id_count = cpu_to_le16(ocfs2_max_inline_data(osb->sb)); + fe->id2.i_data.id_count = cpu_to_le16( + ocfs2_max_inline_data_with_xattr(osb->sb, fe)); } else { fel = &fe->id2.i_list; fel->l_tree_depth = 0; diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index c7ae45aaa36..2332ef740f4 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -1070,12 +1070,6 @@ static inline int ocfs2_fast_symlink_chars(struct super_block *sb) offsetof(struct ocfs2_dinode, id2.i_symlink); } -static inline int ocfs2_max_inline_data(struct super_block *sb) -{ - return sb->s_blocksize - - offsetof(struct ocfs2_dinode, id2.i_data.id_data); -} - static inline int ocfs2_max_inline_data_with_xattr(struct super_block *sb, struct ocfs2_dinode *di) { -- cgit v1.2.3 From 74e77eb30d0ecbb12964d005b439c8b84a505b84 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Mar 2009 06:24:23 +0800 Subject: ocfs2: Fix a bug found by sparse check. We need to use le32_to_cpu to test rec->e_cpos in ocfs2_dinode_insert_check. Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/alloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 3a9e5deed74..19e3a96aa02 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -176,7 +176,8 @@ static int ocfs2_dinode_insert_check(struct inode *inode, BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && - (OCFS2_I(inode)->ip_clusters != rec->e_cpos), + (OCFS2_I(inode)->ip_clusters != + le32_to_cpu(rec->e_cpos)), "Device %s, asking for sparse allocation: inode %llu, " "cpos %u, clusters %u\n", osb->dev_str, -- cgit v1.2.3 From 712e53e46a1da35fcd88c05aa0c675b10f7c0e9d Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Thu, 12 Mar 2009 08:37:34 +0800 Subject: ocfs2: Use xs->bucket to set xattr value outside A long time ago, xs->base is allocated a 4K size and all the contents in the bucket are copied to the it. Now we use ocfs2_xattr_bucket to abstract xattr bucket and xs->base is initialized to the start of the bu_bhs[0]. So xs->base + offset will overflow when the value root is stored outside the first block. Then why we can survive the xattr test by now? It is because we always read the bucket contiguously now and kernel mm allocate continguous memory for us. We are lucky, but we should fix it. So just get the right value root as other callers do. Signed-off-by: Tao Ma Acked-by: Joel Becker Signed-off-by: Mark Fasheh --- fs/ocfs2/xattr.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c63efb5ef13..2563df89fc2 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -4795,19 +4795,33 @@ static int ocfs2_xattr_bucket_set_value_outside(struct inode *inode, char *val, int value_len) { - int offset; + int ret, offset, block_off; struct ocfs2_xattr_value_root *xv; struct ocfs2_xattr_entry *xe = xs->here; + struct ocfs2_xattr_header *xh = bucket_xh(xs->bucket); + void *base; BUG_ON(!xs->base || !xe || ocfs2_xattr_is_local(xe)); - offset = le16_to_cpu(xe->xe_name_offset) + - OCFS2_XATTR_SIZE(xe->xe_name_len); + ret = ocfs2_xattr_bucket_get_name_value(inode, xh, + xe - xh->xh_entries, + &block_off, + &offset); + if (ret) { + mlog_errno(ret); + goto out; + } - xv = (struct ocfs2_xattr_value_root *)(xs->base + offset); + base = bucket_block(xs->bucket, block_off); + xv = (struct ocfs2_xattr_value_root *)(base + offset + + OCFS2_XATTR_SIZE(xe->xe_name_len)); - return __ocfs2_xattr_set_value_outside(inode, handle, - xv, val, value_len); + ret = __ocfs2_xattr_set_value_outside(inode, handle, + xv, val, value_len); + if (ret) + mlog_errno(ret); +out: + return ret; } static int ocfs2_rm_xattr_cluster(struct inode *inode, -- cgit v1.2.3 From 8d03c7a0c550e7ab24cadcef5e66656bfadec8b9 Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Sat, 14 Mar 2009 11:51:46 -0400 Subject: ext4: fix bogus BUG_ONs in in mballoc code Thiemo Nagel reported that: # dd if=/dev/zero of=image.ext4 bs=1M count=2 # mkfs.ext4 -v -F -b 1024 -m 0 -g 512 -G 4 -I 128 -N 1 \ -O large_file,dir_index,flex_bg,extent,sparse_super image.ext4 # mount -o loop image.ext4 mnt/ # dd if=/dev/zero of=mnt/file oopsed, with a BUG_ON in ext4_mb_normalize_request because size == EXT4_BLOCKS_PER_GROUP It appears to me (esp. after talking to Andreas) that the BUG_ON is bogus; a request of exactly EXT4_BLOCKS_PER_GROUP should be allowed, though larger sizes do indicate a problem. Fix that an another (apparently rare) codepath with a similar check. Reported-by: Thiemo Nagel Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 4415beeb0b6..41f4348b62f 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -1447,7 +1447,7 @@ static void ext4_mb_measure_extent(struct ext4_allocation_context *ac, struct ext4_free_extent *gex = &ac->ac_g_ex; BUG_ON(ex->fe_len <= 0); - BUG_ON(ex->fe_len >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(ex->fe_len > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ex->fe_start >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); BUG_ON(ac->ac_status != AC_STATUS_CONTINUE); @@ -3292,7 +3292,7 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, } BUG_ON(start + size <= ac->ac_o_ex.fe_logical && start > ac->ac_o_ex.fe_logical); - BUG_ON(size <= 0 || size >= EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); + BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ -- cgit v1.2.3 From 020fe22ff14320927f394de222cbb11708bcc7a8 Mon Sep 17 00:00:00 2001 From: Enrik Berkhan Date: Fri, 13 Mar 2009 13:51:56 -0700 Subject: nommu: ramfs: pages allocated to an inode's pagecache may get wrongly discarded The pages attached to a ramfs inode's pagecache by truncation from nothing - as done by SYSV SHM for example - may get discarded under memory pressure. The problem is that the pages are not marked dirty. Anything that creates data in an MMU-based ramfs will cause the pages holding that data will cause the set_page_dirty() aop to be called. For the NOMMU-based mmap, set_page_dirty() may be called by write(), but it won't be called by page-writing faults on writable mmaps, and it isn't called by ramfs_nommu_expand_for_mapping() when a file is being truncated from nothing to allocate a contiguous run. The solution is to mark the pages dirty at the point of allocation by the truncation code. Signed-off-by: Enrik Berkhan Signed-off-by: David Howells Cc: Peter Zijlstra Cc: Nick Piggin Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index b9b567a2837..90d72bead55 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -114,6 +114,9 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) if (!pagevec_add(&lru_pvec, page)) __pagevec_lru_add_file(&lru_pvec); + /* prevent the page from being discarded on memory pressure */ + SetPageDirty(page); + unlock_page(page); } -- cgit v1.2.3 From 15e7b8767605dc0cb9bd4594caabfec392385210 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Fri, 13 Mar 2009 13:51:58 -0700 Subject: nommu: ramfs: don't leak pages when adding to page cache fails When a ramfs nommu mapping is expanded, contiguous pages are allocated and added to the pagecache. The caller's reference is then passed on by moving whole pagevecs to the file lru list. If the page cache adding fails, make sure that the error path also moves the pagevec contents which might still contain up to PAGEVEC_SIZE successfully added pages, of which we would leak references otherwise. Signed-off-by: Johannes Weiner Cc: David Howells Cc: Enrik Berkhan Cc: Nick Piggin Cc: Peter Zijlstra Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ramfs/file-nommu.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c index 90d72bead55..5d7c7ececa6 100644 --- a/fs/ramfs/file-nommu.c +++ b/fs/ramfs/file-nommu.c @@ -129,6 +129,7 @@ int ramfs_nommu_expand_for_mapping(struct inode *inode, size_t newsize) return -EFBIG; add_error: + pagevec_lru_add_file(&lru_pvec); page_cache_release(pages + loop); for (loop++; loop < npages; loop++) __free_page(pages + loop); -- cgit v1.2.3 From 84814d642a4f1f294bd675ab11aae1ca54c6cedb Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 13 Mar 2009 13:51:59 -0700 Subject: eCryptfs: don't encrypt file key with filename key eCryptfs has file encryption keys (FEK), file encryption key encryption keys (FEKEK), and filename encryption keys (FNEK). The per-file FEK is encrypted with one or more FEKEKs and stored in the header of the encrypted file. I noticed that the FEK is also being encrypted by the FNEK. This is a problem if a user wants to use a different FNEK than their FEKEK, as their file contents will still be accessible with the FNEK. This is a minimalistic patch which prevents the FNEKs signatures from being copied to the inode signatures list. Ultimately, it keeps the FEK from being encrypted with a FNEK. Signed-off-by: Tyler Hicks Cc: Serge Hallyn Acked-by: Dustin Kirkland Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 2 ++ fs/ecryptfs/ecryptfs_kernel.h | 3 ++- fs/ecryptfs/keystore.c | 3 ++- fs/ecryptfs/main.c | 5 +++-- 4 files changed, 9 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index f6caeb1d110..bdca1f4b3a3 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -946,6 +946,8 @@ static int ecryptfs_copy_mount_wide_sigs_to_inode_sigs( list_for_each_entry(global_auth_tok, &mount_crypt_stat->global_auth_tok_list, mount_crypt_stat_list) { + if (global_auth_tok->flags & ECRYPTFS_AUTH_TOK_FNEK) + continue; rc = ecryptfs_add_keysig(crypt_stat, global_auth_tok->sig); if (rc) { printk(KERN_ERR "Error adding keysig; rc = [%d]\n", rc); diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index c11fc95714a..eb2267eca1f 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -328,6 +328,7 @@ struct ecryptfs_dentry_info { */ struct ecryptfs_global_auth_tok { #define ECRYPTFS_AUTH_TOK_INVALID 0x00000001 +#define ECRYPTFS_AUTH_TOK_FNEK 0x00000002 u32 flags; struct list_head mount_crypt_stat_list; struct key *global_auth_tok_key; @@ -696,7 +697,7 @@ ecryptfs_write_header_metadata(char *virt, int ecryptfs_add_keysig(struct ecryptfs_crypt_stat *crypt_stat, char *sig); int ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, - char *sig); + char *sig, u32 global_auth_tok_flags); int ecryptfs_get_global_auth_tok_for_sig( struct ecryptfs_global_auth_tok **global_auth_tok, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, char *sig); diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index ff539420cc6..e4a6223c314 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -2375,7 +2375,7 @@ struct kmem_cache *ecryptfs_global_auth_tok_cache; int ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, - char *sig) + char *sig, u32 global_auth_tok_flags) { struct ecryptfs_global_auth_tok *new_auth_tok; int rc = 0; @@ -2389,6 +2389,7 @@ ecryptfs_add_global_auth_tok(struct ecryptfs_mount_crypt_stat *mount_crypt_stat, goto out; } memcpy(new_auth_tok->sig, sig, ECRYPTFS_SIG_SIZE_HEX); + new_auth_tok->flags = global_auth_tok_flags; new_auth_tok->sig[ECRYPTFS_SIG_SIZE_HEX] = '\0'; mutex_lock(&mount_crypt_stat->global_auth_tok_list_mutex); list_add(&new_auth_tok->mount_crypt_stat_list, diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 789cf2e1be1..aed56c25539 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -319,7 +319,7 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) case ecryptfs_opt_ecryptfs_sig: sig_src = args[0].from; rc = ecryptfs_add_global_auth_tok(mount_crypt_stat, - sig_src); + sig_src, 0); if (rc) { printk(KERN_ERR "Error attempting to register " "global sig; rc = [%d]\n", rc); @@ -370,7 +370,8 @@ static int ecryptfs_parse_options(struct super_block *sb, char *options) ECRYPTFS_SIG_SIZE_HEX] = '\0'; rc = ecryptfs_add_global_auth_tok( mount_crypt_stat, - mount_crypt_stat->global_default_fnek_sig); + mount_crypt_stat->global_default_fnek_sig, + ECRYPTFS_AUTH_TOK_FNEK); if (rc) { printk(KERN_ERR "Error attempting to register " "global fnek sig [%s]; rc = [%d]\n", -- cgit v1.2.3 From 87092698c665e0a358caf9825ae13114343027e8 Mon Sep 17 00:00:00 2001 From: un'ichi Nomura Date: Mon, 9 Mar 2009 10:40:52 +0100 Subject: block: Add gfp_mask parameter to bio_integrity_clone() Stricter gfp_mask might be required for clone allocation. For example, request-based dm may clone bio in interrupt context so it has to use GFP_ATOMIC. Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Acked-by: Martin K. Petersen Cc: Alasdair G Kergon Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 5 +++-- fs/bio.c | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 549b0144da1..fe2b1aa2464 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -685,19 +685,20 @@ EXPORT_SYMBOL(bio_integrity_split); * bio_integrity_clone - Callback for cloning bios with integrity metadata * @bio: New bio * @bio_src: Original bio + * @gfp_mask: Memory allocation mask * @bs: bio_set to allocate bip from * * Description: Called to allocate a bip when cloning a bio */ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, - struct bio_set *bs) + gfp_t gfp_mask, struct bio_set *bs) { struct bio_integrity_payload *bip_src = bio_src->bi_integrity; struct bio_integrity_payload *bip; BUG_ON(bip_src == NULL); - bip = bio_integrity_alloc_bioset(bio, GFP_NOIO, bip_src->bip_vcnt, bs); + bip = bio_integrity_alloc_bioset(bio, gfp_mask, bip_src->bip_vcnt, bs); if (bip == NULL) return -EIO; diff --git a/fs/bio.c b/fs/bio.c index 124b95c4d58..cf747378b97 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -463,7 +463,7 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) if (bio_integrity(bio)) { int ret; - ret = bio_integrity_clone(b, bio, fs_bio_set); + ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); if (ret < 0) return NULL; -- cgit v1.2.3 From 059ea3318c8ede71851a52b4359fbf1ab0cec301 Mon Sep 17 00:00:00 2001 From: Li Zefan Date: Mon, 9 Mar 2009 10:42:45 +0100 Subject: block: fix memory leak in bio_clone() If bio_integrity_clone() fails, bio_clone() returns NULL without freeing the newly allocated bio. Signed-off-by: Li Zefan Signed-off-by: Jens Axboe --- fs/bio.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index cf747378b97..d4f06327c81 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -465,8 +465,10 @@ struct bio *bio_clone(struct bio *bio, gfp_t gfp_mask) ret = bio_integrity_clone(b, bio, gfp_mask, fs_bio_set); - if (ret < 0) + if (ret < 0) { + bio_put(b); return NULL; + } } return b; -- cgit v1.2.3 From ee6f779b9e0851e2f7da292a9f58e0095edf615a Mon Sep 17 00:00:00 2001 From: Zhang Le Date: Mon, 16 Mar 2009 14:44:31 +0800 Subject: filp->f_pos not correctly updated in proc_task_readdir filp->f_pos only get updated at the end of the function. Thus d_off of those dirents who are in the middle will be 0, and this will cause a problem in glibc's readdir implementation, specifically endless loop. Because when overflow occurs, f_pos will be set to next dirent to read, however it will be 0, unless the next one is the last one. So it will start over again and again. There is a sample program in man 2 gendents. This is the output of the program running on a multithread program's task dir before this patch is applied: $ ./a.out /proc/3807/task --------------- nread=128 --------------- i-node# file type d_reclen d_off d_name 506442 directory 16 1 . 506441 directory 16 0 .. 506443 directory 16 0 3807 506444 directory 16 0 3809 506445 directory 16 0 3812 506446 directory 16 0 3861 506447 directory 16 0 3862 506448 directory 16 8 3863 This is the output after this patch is applied $ ./a.out /proc/3807/task --------------- nread=128 --------------- i-node# file type d_reclen d_off d_name 506442 directory 16 1 . 506441 directory 16 2 .. 506443 directory 16 3 3807 506444 directory 16 4 3809 506445 directory 16 5 3812 506446 directory 16 6 3861 506447 directory 16 7 3862 506448 directory 16 8 3863 Signed-off-by: Zhang Le Acked-by: Al Viro Signed-off-by: Linus Torvalds --- fs/proc/base.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index 0c9de19a163..cc6ea2329e7 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3066,7 +3066,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi int retval = -ENOENT; ino_t ino; int tid; - unsigned long pos = filp->f_pos; /* avoiding "long long" filp->f_pos */ struct pid_namespace *ns; task = get_proc_task(inode); @@ -3083,18 +3082,18 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi goto out_no_task; retval = 0; - switch (pos) { + switch (filp->f_pos) { case 0: ino = inode->i_ino; - if (filldir(dirent, ".", 1, pos, ino, DT_DIR) < 0) + if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ case 1: ino = parent_ino(dentry); - if (filldir(dirent, "..", 2, pos, ino, DT_DIR) < 0) + if (filldir(dirent, "..", 2, filp->f_pos, ino, DT_DIR) < 0) goto out; - pos++; + filp->f_pos++; /* fall through */ } @@ -3104,9 +3103,9 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi ns = filp->f_dentry->d_sb->s_fs_info; tid = (int)filp->f_version; filp->f_version = 0; - for (task = first_tid(leader, tid, pos - 2, ns); + for (task = first_tid(leader, tid, filp->f_pos - 2, ns); task; - task = next_tid(task), pos++) { + task = next_tid(task), filp->f_pos++) { tid = task_pid_nr_ns(task, ns); if (proc_task_fill_cache(filp, dirent, filldir, task, tid) < 0) { /* returning this tgid failed, save it as the first @@ -3117,7 +3116,6 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi } } out: - filp->f_pos = pos; put_task_struct(leader); out_no_task: return retval; -- cgit v1.2.3 From d33a1976fbee1ee321d6f014333d8f03a39d526c Mon Sep 17 00:00:00 2001 From: Eric Sandeen Date: Mon, 16 Mar 2009 23:25:40 -0400 Subject: ext4: fix bb_prealloc_list corruption due to wrong group locking This is for Red Hat bug 490026: EXT4 panic, list corruption in ext4_mb_new_inode_pa ext4_lock_group(sb, group) is supposed to protect this list for each group, and a common code flow to remove an album is like this: ext4_get_group_no_and_offset(sb, pa->pa_pstart, &grp, NULL); ext4_lock_group(sb, grp); list_del(&pa->pa_group_list); ext4_unlock_group(sb, grp); so it's critical that we get the right group number back for this prealloc context, to lock the right group (the one associated with this pa) and prevent concurrent list manipulation. however, ext4_mb_put_pa() passes in (pa->pa_pstart - 1) with a comment, "-1 is to protect from crossing allocation group". This makes sense for the group_pa, where pa_pstart is advanced by the length which has been used (in ext4_mb_release_context()), and when the entire length has been used, pa_pstart has been advanced to the first block of the next group. However, for inode_pa, pa_pstart is never advanced; it's just set once to the first block in the group and not moved after that. So in this case, if we subtract one in ext4_mb_put_pa(), we are actually locking the *previous* group, and opening the race with the other threads which do not subtract off the extra block. Signed-off-by: Eric Sandeen Signed-off-by: "Theodore Ts'o" --- fs/ext4/mballoc.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 41f4348b62f..9f61e62f435 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3589,6 +3589,7 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, struct super_block *sb, struct ext4_prealloc_space *pa) { ext4_group_t grp; + ext4_fsblk_t grp_blk; if (!atomic_dec_and_test(&pa->pa_count) || pa->pa_free != 0) return; @@ -3603,8 +3604,12 @@ static void ext4_mb_put_pa(struct ext4_allocation_context *ac, pa->pa_deleted = 1; spin_unlock(&pa->pa_lock); - /* -1 is to protect from crossing allocation group */ - ext4_get_group_no_and_offset(sb, pa->pa_pstart - 1, &grp, NULL); + grp_blk = pa->pa_pstart; + /* If linear, pa_pstart may be in the next group when pa is used up */ + if (pa->pa_linear) + grp_blk--; + + ext4_get_group_no_and_offset(sb, grp_blk, &grp, NULL); /* * possible race: -- cgit v1.2.3 From ee568b25ee9e160b32d1aef73d8b2ee9c05d34db Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Tue, 17 Mar 2009 10:02:35 -0700 Subject: Avoid 64-bit "switch()" statements on 32-bit architectures Commit ee6f779b9e0851e2f7da292a9f58e0095edf615a ("filp->f_pos not correctly updated in proc_task_readdir") changed the proc code to use filp->f_pos directly, rather than through a temporary variable. In the process, that caused the operations to be done on the full 64 bits, even though the offset is never that big. That's all fine and dandy per se, but for some unfathomable reason gcc generates absolutely horrid code when using 64-bit values in switch() statements. To the point of actually calling out to gcc helper functions like __cmpdi2 rather than just doing the trivial comparisons directly the way gcc does for normal compares. At which point we get link failures, because we really don't want to support that kind of crazy code. Fix this by just casting the f_pos value to "unsigned long", which is plenty big enough for /proc, and avoids the gcc code generation issue. Reported-by: Alexey Dobriyan Cc: Zhang Le Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/proc/base.c b/fs/proc/base.c index cc6ea2329e7..beaa0ce3b82 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3082,7 +3082,7 @@ static int proc_task_readdir(struct file * filp, void * dirent, filldir_t filldi goto out_no_task; retval = 0; - switch (filp->f_pos) { + switch ((unsigned long)filp->f_pos) { case 0: ino = inode->i_ino; if (filldir(dirent, ".", 1, filp->f_pos, ino, DT_DIR) < 0) -- cgit v1.2.3 From 84f09f46b4ee9e4e9b6381f8af31817516d2091b Mon Sep 17 00:00:00 2001 From: Benny Halevy Date: Wed, 4 Mar 2009 23:05:35 +0200 Subject: NFSD: provide encode routine for OP_OPENATTR Although this operation is unsupported by our implementation we still need to provide an encode routine for it to merely encode its (error) status back in the compound reply. Thanks for Bill Baker at sun.com for testing with the Sun OpenSolaris' client, finding, and reporting this bug at Connectathon 2009. This bug was introduced in 2.6.27 Signed-off-by: Benny Halevy Cc: stable@kernel.org Signed-off-by: J. Bruce Fields --- fs/nfsd/nfs4xdr.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index f65953be39c..9250067943d 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2596,6 +2596,7 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_LOOKUPP] = (nfsd4_enc)nfsd4_encode_noop, [OP_NVERIFY] = (nfsd4_enc)nfsd4_encode_noop, [OP_OPEN] = (nfsd4_enc)nfsd4_encode_open, + [OP_OPENATTR] = (nfsd4_enc)nfsd4_encode_noop, [OP_OPEN_CONFIRM] = (nfsd4_enc)nfsd4_encode_open_confirm, [OP_OPEN_DOWNGRADE] = (nfsd4_enc)nfsd4_encode_open_downgrade, [OP_PUTFH] = (nfsd4_enc)nfsd4_encode_noop, -- cgit v1.2.3 From a8e7d49aa7be728c4ae241a75a2a124cdcabc0c5 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 19 Mar 2009 11:32:05 -0700 Subject: Fix race in create_empty_buffers() vs __set_page_dirty_buffers() Nick Piggin noticed this (very unlikely) race between setting a page dirty and creating the buffers for it - we need to hold the mapping private_lock until we've set the page dirty bit in order to make sure that create_empty_buffers() might not build up a set of buffers without the dirty bits set when the page is dirty. I doubt anybody has ever hit this race (and it didn't solve the issue Nick was looking at), but as Nick says: "Still, it does appear to solve a real race, which we should close." Acked-by: Nick Piggin Signed-off-by: Linus Torvalds --- fs/buffer.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) (limited to 'fs') diff --git a/fs/buffer.c b/fs/buffer.c index 9f697419ed8..891e1c78e4f 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -760,15 +760,9 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode); * If warn is true, then emit a warning if the page is not uptodate and has * not been truncated. */ -static int __set_page_dirty(struct page *page, +static void __set_page_dirty(struct page *page, struct address_space *mapping, int warn) { - if (unlikely(!mapping)) - return !TestSetPageDirty(page); - - if (TestSetPageDirty(page)) - return 0; - spin_lock_irq(&mapping->tree_lock); if (page->mapping) { /* Race with truncate? */ WARN_ON_ONCE(warn && !PageUptodate(page)); @@ -785,8 +779,6 @@ static int __set_page_dirty(struct page *page, } spin_unlock_irq(&mapping->tree_lock); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); - - return 1; } /* @@ -816,6 +808,7 @@ static int __set_page_dirty(struct page *page, */ int __set_page_dirty_buffers(struct page *page) { + int newly_dirty; struct address_space *mapping = page_mapping(page); if (unlikely(!mapping)) @@ -831,9 +824,12 @@ int __set_page_dirty_buffers(struct page *page) bh = bh->b_this_page; } while (bh != head); } + newly_dirty = !TestSetPageDirty(page); spin_unlock(&mapping->private_lock); - return __set_page_dirty(page, mapping, 1); + if (newly_dirty) + __set_page_dirty(page, mapping, 1); + return newly_dirty; } EXPORT_SYMBOL(__set_page_dirty_buffers); @@ -1262,8 +1258,11 @@ void mark_buffer_dirty(struct buffer_head *bh) return; } - if (!test_set_buffer_dirty(bh)) - __set_page_dirty(bh->b_page, page_mapping(bh->b_page), 0); + if (!test_set_buffer_dirty(bh)) { + struct page *page = bh->b_page; + if (!TestSetPageDirty(page)) + __set_page_dirty(page, page_mapping(page), 0); + } } /* -- cgit v1.2.3 From 87c3a86e1c220121d0ced59d1a71e78ed9abc6dd Mon Sep 17 00:00:00 2001 From: Davide Libenzi Date: Wed, 18 Mar 2009 17:04:19 -0700 Subject: eventfd: remove fput() call from possible IRQ context Remove a source of fput() call from inside IRQ context. Myself, like Eric, wasn't able to reproduce an fput() call from IRQ context, but Jeff said he was able to, with the attached test program. Independently from this, the bug is conceptually there, so we might be better off fixing it. This patch adds an optimization similar to the one we already do on ->ki_filp, on ->ki_eventfd. Playing with ->f_count directly is not pretty in general, but the alternative here would be to add a brand new delayed fput() infrastructure, that I'm not sure is worth it. Signed-off-by: Davide Libenzi Cc: Benjamin LaHaise Cc: Trond Myklebust Cc: Eric Dumazet Signed-off-by: Jeff Moyer Cc: Zach Brown Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 37 +++++++++++++++++++++++++++---------- 1 file changed, 27 insertions(+), 10 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 8fa77e23394..4a9d4d641fb 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -443,7 +443,7 @@ static struct kiocb *__aio_get_req(struct kioctx *ctx) req->private = NULL; req->ki_iovec = NULL; INIT_LIST_HEAD(&req->ki_run_list); - req->ki_eventfd = ERR_PTR(-EINVAL); + req->ki_eventfd = NULL; /* Check if the completion queue has enough free space to * accept an event from this io. @@ -485,8 +485,6 @@ static inline void really_put_req(struct kioctx *ctx, struct kiocb *req) { assert_spin_locked(&ctx->ctx_lock); - if (!IS_ERR(req->ki_eventfd)) - fput(req->ki_eventfd); if (req->ki_dtor) req->ki_dtor(req); if (req->ki_iovec != &req->ki_inline_vec) @@ -508,8 +506,11 @@ static void aio_fput_routine(struct work_struct *data) list_del(&req->ki_list); spin_unlock_irq(&fput_lock); - /* Complete the fput */ - __fput(req->ki_filp); + /* Complete the fput(s) */ + if (req->ki_filp != NULL) + __fput(req->ki_filp); + if (req->ki_eventfd != NULL) + __fput(req->ki_eventfd); /* Link the iocb into the context's free list */ spin_lock_irq(&ctx->ctx_lock); @@ -527,12 +528,14 @@ static void aio_fput_routine(struct work_struct *data) */ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) { + int schedule_putreq = 0; + dprintk(KERN_DEBUG "aio_put(%p): f_count=%ld\n", req, atomic_long_read(&req->ki_filp->f_count)); assert_spin_locked(&ctx->ctx_lock); - req->ki_users --; + req->ki_users--; BUG_ON(req->ki_users < 0); if (likely(req->ki_users)) return 0; @@ -540,10 +543,23 @@ static int __aio_put_req(struct kioctx *ctx, struct kiocb *req) req->ki_cancel = NULL; req->ki_retry = NULL; - /* Must be done under the lock to serialise against cancellation. - * Call this aio_fput as it duplicates fput via the fput_work. + /* + * Try to optimize the aio and eventfd file* puts, by avoiding to + * schedule work in case it is not __fput() time. In normal cases, + * we would not be holding the last reference to the file*, so + * this function will be executed w/out any aio kthread wakeup. */ - if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) { + if (unlikely(atomic_long_dec_and_test(&req->ki_filp->f_count))) + schedule_putreq++; + else + req->ki_filp = NULL; + if (req->ki_eventfd != NULL) { + if (unlikely(atomic_long_dec_and_test(&req->ki_eventfd->f_count))) + schedule_putreq++; + else + req->ki_eventfd = NULL; + } + if (unlikely(schedule_putreq)) { get_ioctx(ctx); spin_lock(&fput_lock); list_add(&req->ki_list, &fput_head); @@ -1009,7 +1025,7 @@ int aio_complete(struct kiocb *iocb, long res, long res2) * eventfd. The eventfd_signal() function is safe to be called * from IRQ context. */ - if (!IS_ERR(iocb->ki_eventfd)) + if (iocb->ki_eventfd != NULL) eventfd_signal(iocb->ki_eventfd, 1); put_rq: @@ -1608,6 +1624,7 @@ static int io_submit_one(struct kioctx *ctx, struct iocb __user *user_iocb, req->ki_eventfd = eventfd_fget((int) iocb->aio_resfd); if (IS_ERR(req->ki_eventfd)) { ret = PTR_ERR(req->ki_eventfd); + req->ki_eventfd = NULL; goto out_put_req; } } -- cgit v1.2.3 From 65c24491b4fef017c64e39ec64384fde5e05e0a0 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Wed, 18 Mar 2009 17:04:21 -0700 Subject: aio: lookup_ioctx can return the wrong value when looking up a bogus context The libaio test harness turned up a problem whereby lookup_ioctx on a bogus io context was returning the 1 valid io context from the list (harness/cases/3.p). Because of that, an extra put_iocontext was done, and when the process exited, it hit a BUG_ON in the put_iocontext macro called from exit_aio (since we expect a users count of 1 and instead get 0). The problem was introduced by "aio: make the lookup_ioctx() lockless" (commit abf137dd7712132ee56d5b3143c2ff61a72a5faa). Thanks to Zach for pointing out that hlist_for_each_entry_rcu will not return with a NULL tpos at the end of the loop, even if the entry was not found. Signed-off-by: Jeff Moyer Acked-by: Zach Brown Acked-by: Jens Axboe Cc: Benjamin LaHaise Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/aio.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/aio.c b/fs/aio.c index 4a9d4d641fb..76da1253795 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -587,7 +587,7 @@ int aio_put_req(struct kiocb *req) static struct kioctx *lookup_ioctx(unsigned long ctx_id) { struct mm_struct *mm = current->mm; - struct kioctx *ctx = NULL; + struct kioctx *ctx, *ret = NULL; struct hlist_node *n; rcu_read_lock(); @@ -595,12 +595,13 @@ static struct kioctx *lookup_ioctx(unsigned long ctx_id) hlist_for_each_entry_rcu(ctx, n, &mm->ioctx_list, list) { if (ctx->user_id == ctx_id && !ctx->dead) { get_ioctx(ctx); + ret = ctx; break; } } rcu_read_unlock(); - return ctx; + return ret; } /* -- cgit v1.2.3 From 8faece5f906725c10e7a1f6caf84452abadbdc7b Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Mar 2009 01:25:09 -0500 Subject: eCryptfs: Allocate a variable number of pages for file headers When allocating the memory used to store the eCryptfs header contents, a single, zeroed page was being allocated with get_zeroed_page(). However, the size of an eCryptfs header is either PAGE_CACHE_SIZE or ECRYPTFS_MINIMUM_HEADER_EXTENT_SIZE (8192), whichever is larger, and is stored in the file's private_data->crypt_stat->num_header_bytes_at_front field. ecryptfs_write_metadata_to_contents() was using num_header_bytes_at_front to decide how many bytes should be written to the lower filesystem for the file header. Unfortunately, at least 8K was being written from the page, despite the chance of the single, zeroed page being smaller than 8K. This resulted in random areas of kernel memory being written between the 0x1000 and 0x1FFF bytes offsets in the eCryptfs file headers if PAGE_SIZE was 4K. This patch allocates a variable number of pages, calculated with num_header_bytes_at_front, and passes the number of allocated pages along to ecryptfs_write_metadata_to_contents(). Thanks to Florian Streibelt for reporting the data leak and working with me to find the problem. 2.6.28 is the only kernel release with this vulnerability. Corresponds to CVE-2009-0787 Signed-off-by: Tyler Hicks Acked-by: Dustin Kirkland Reviewed-by: Eric Sandeen Reviewed-by: Eugene Teo Cc: Greg KH Cc: dann frazier Cc: Serge E. Hallyn Cc: Florian Streibelt Cc: stable@kernel.org Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 39 ++++++++++++++++++++++++++------------- 1 file changed, 26 insertions(+), 13 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index bdca1f4b3a3..75bee99de0f 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1324,14 +1324,13 @@ static int ecryptfs_write_headers_virt(char *page_virt, size_t max, } static int -ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, - struct dentry *ecryptfs_dentry, - char *virt) +ecryptfs_write_metadata_to_contents(struct dentry *ecryptfs_dentry, + char *virt, size_t virt_len) { int rc; rc = ecryptfs_write_lower(ecryptfs_dentry->d_inode, virt, - 0, crypt_stat->num_header_bytes_at_front); + 0, virt_len); if (rc) printk(KERN_ERR "%s: Error attempting to write header " "information to lower file; rc = [%d]\n", __func__, @@ -1341,7 +1340,6 @@ ecryptfs_write_metadata_to_contents(struct ecryptfs_crypt_stat *crypt_stat, static int ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, - struct ecryptfs_crypt_stat *crypt_stat, char *page_virt, size_t size) { int rc; @@ -1351,6 +1349,17 @@ ecryptfs_write_metadata_to_xattr(struct dentry *ecryptfs_dentry, return rc; } +static unsigned long ecryptfs_get_zeroed_pages(gfp_t gfp_mask, + unsigned int order) +{ + struct page *page; + + page = alloc_pages(gfp_mask | __GFP_ZERO, order); + if (page) + return (unsigned long) page_address(page); + return 0; +} + /** * ecryptfs_write_metadata * @ecryptfs_dentry: The eCryptfs dentry @@ -1367,7 +1376,9 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) { struct ecryptfs_crypt_stat *crypt_stat = &ecryptfs_inode_to_private(ecryptfs_dentry->d_inode)->crypt_stat; + unsigned int order; char *virt; + size_t virt_len; size_t size = 0; int rc = 0; @@ -1383,33 +1394,35 @@ int ecryptfs_write_metadata(struct dentry *ecryptfs_dentry) rc = -EINVAL; goto out; } + virt_len = crypt_stat->num_header_bytes_at_front; + order = get_order(virt_len); /* Released in this function */ - virt = (char *)get_zeroed_page(GFP_KERNEL); + virt = (char *)ecryptfs_get_zeroed_pages(GFP_KERNEL, order); if (!virt) { printk(KERN_ERR "%s: Out of memory\n", __func__); rc = -ENOMEM; goto out; } - rc = ecryptfs_write_headers_virt(virt, PAGE_CACHE_SIZE, &size, - crypt_stat, ecryptfs_dentry); + rc = ecryptfs_write_headers_virt(virt, virt_len, &size, crypt_stat, + ecryptfs_dentry); if (unlikely(rc)) { printk(KERN_ERR "%s: Error whilst writing headers; rc = [%d]\n", __func__, rc); goto out_free; } if (crypt_stat->flags & ECRYPTFS_METADATA_IN_XATTR) - rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, - crypt_stat, virt, size); + rc = ecryptfs_write_metadata_to_xattr(ecryptfs_dentry, virt, + size); else - rc = ecryptfs_write_metadata_to_contents(crypt_stat, - ecryptfs_dentry, virt); + rc = ecryptfs_write_metadata_to_contents(ecryptfs_dentry, virt, + virt_len); if (rc) { printk(KERN_ERR "%s: Error writing metadata out to lower file; " "rc = [%d]\n", __func__, rc); goto out_free; } out_free: - free_page((unsigned long)virt); + free_pages((unsigned long)virt, order); out: return rc; } -- cgit v1.2.3 From 2aac0cf88681bfa092f731553bc7fbd23516be73 Mon Sep 17 00:00:00 2001 From: Tyler Hicks Date: Fri, 20 Mar 2009 02:23:57 -0500 Subject: eCryptfs: NULL crypt_stat dereference during lookup If ecryptfs_encrypted_view or ecryptfs_xattr_metadata were being specified as mount options, a NULL pointer dereference of crypt_stat was possible during lookup. This patch moves the crypt_stat assignment into ecryptfs_lookup_and_interpose_lower(), ensuring that crypt_stat will not be NULL before we attempt to dereference it. Thanks to Dan Carpenter and his static analysis tool, smatch, for finding this bug. Signed-off-by: Tyler Hicks Acked-by: Dustin Kirkland Cc: Dan Carpenter Cc: Serge Hallyn Signed-off-by: Linus Torvalds --- fs/ecryptfs/crypto.c | 10 ++++++---- fs/ecryptfs/ecryptfs_kernel.h | 1 - fs/ecryptfs/inode.c | 32 ++++++++++++-------------------- 3 files changed, 18 insertions(+), 25 deletions(-) (limited to 'fs') diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 75bee99de0f..8b65f289ee0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -2221,17 +2221,19 @@ int ecryptfs_decode_and_decrypt_filename(char **plaintext_name, struct dentry *ecryptfs_dir_dentry, const char *name, size_t name_size) { + struct ecryptfs_mount_crypt_stat *mount_crypt_stat = + &ecryptfs_superblock_to_private( + ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; char *decoded_name; size_t decoded_name_size; size_t packet_size; int rc = 0; - if ((name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) + if ((mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) + && !(mount_crypt_stat->flags & ECRYPTFS_ENCRYPTED_VIEW_ENABLED) + && (name_size > ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) && (strncmp(name, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX, ECRYPTFS_FNEK_ENCRYPTED_FILENAME_PREFIX_SIZE) == 0)) { - struct ecryptfs_mount_crypt_stat *mount_crypt_stat = - &ecryptfs_superblock_to_private( - ecryptfs_dir_dentry->d_sb)->mount_crypt_stat; const char *orig_name = name; size_t orig_name_size = name_size; diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index eb2267eca1f..ac749d4d644 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -620,7 +620,6 @@ int ecryptfs_interpose(struct dentry *hidden_dentry, u32 flags); int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct ecryptfs_crypt_stat *crypt_stat, struct inode *ecryptfs_dir_inode, struct nameidata *ecryptfs_nd); int ecryptfs_decode_and_decrypt_filename(char **decrypted_name, diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 5697899a168..55b3145b807 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -246,7 +246,6 @@ out: */ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct dentry *lower_dentry, - struct ecryptfs_crypt_stat *crypt_stat, struct inode *ecryptfs_dir_inode, struct nameidata *ecryptfs_nd) { @@ -254,6 +253,7 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, struct vfsmount *lower_mnt; struct inode *lower_inode; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; + struct ecryptfs_crypt_stat *crypt_stat; char *page_virt = NULL; u64 file_size; int rc = 0; @@ -314,6 +314,11 @@ int ecryptfs_lookup_and_interpose_lower(struct dentry *ecryptfs_dentry, goto out_free_kmem; } } + crypt_stat = &ecryptfs_inode_to_private( + ecryptfs_dentry->d_inode)->crypt_stat; + /* TODO: lock for crypt_stat comparison */ + if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) + ecryptfs_set_default_sizes(crypt_stat); rc = ecryptfs_read_and_validate_header_region(page_virt, ecryptfs_dentry->d_inode); if (rc) { @@ -362,9 +367,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, { char *encrypted_and_encoded_name = NULL; size_t encrypted_and_encoded_name_size; - struct ecryptfs_crypt_stat *crypt_stat = NULL; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; - struct ecryptfs_inode_info *inode_info; struct dentry *lower_dir_dentry, *lower_dentry; int rc = 0; @@ -388,26 +391,15 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, } if (lower_dentry->d_inode) goto lookup_and_interpose; - inode_info = ecryptfs_inode_to_private(ecryptfs_dentry->d_inode); - if (inode_info) { - crypt_stat = &inode_info->crypt_stat; - /* TODO: lock for crypt_stat comparison */ - if (!(crypt_stat->flags & ECRYPTFS_POLICY_APPLIED)) - ecryptfs_set_default_sizes(crypt_stat); - } - if (crypt_stat) - mount_crypt_stat = crypt_stat->mount_crypt_stat; - else - mount_crypt_stat = &ecryptfs_superblock_to_private( - ecryptfs_dentry->d_sb)->mount_crypt_stat; - if (!(crypt_stat && (crypt_stat->flags & ECRYPTFS_ENCRYPT_FILENAMES)) - && !(mount_crypt_stat && (mount_crypt_stat->flags - & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) + mount_crypt_stat = &ecryptfs_superblock_to_private( + ecryptfs_dentry->d_sb)->mount_crypt_stat; + if (!(mount_crypt_stat + && (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES))) goto lookup_and_interpose; dput(lower_dentry); rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &encrypted_and_encoded_name_size, - crypt_stat, mount_crypt_stat, ecryptfs_dentry->d_name.name, + NULL, mount_crypt_stat, ecryptfs_dentry->d_name.name, ecryptfs_dentry->d_name.len); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " @@ -426,7 +418,7 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, } lookup_and_interpose: rc = ecryptfs_lookup_and_interpose_lower(ecryptfs_dentry, lower_dentry, - crypt_stat, ecryptfs_dir_inode, + ecryptfs_dir_inode, ecryptfs_nd); goto out; out_d_drop: -- cgit v1.2.3 From f762dd68218665bb87d4e4a0eeac86fde7530293 Mon Sep 17 00:00:00 2001 From: Gertjan van Wingerde Date: Sat, 21 Mar 2009 23:18:57 +0100 Subject: Update my email address Update all previous incarnations of my email address to the correct one. Signed-off-by: Gertjan van Wingerde Signed-off-by: Linus Torvalds --- fs/minix/inode.c | 2 +- fs/ufs/super.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/minix/inode.c b/fs/minix/inode.c index d1d1eb84679..618865b3128 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -3,7 +3,7 @@ * * Copyright (C) 1991, 1992 Linus Torvalds * - * Copyright (C) 1996 Gertjan van Wingerde (gertjan@cs.vu.nl) + * Copyright (C) 1996 Gertjan van Wingerde * Minix V2 fs support. * * Modified for 680x0 by Andreas Schwab diff --git a/fs/ufs/super.c b/fs/ufs/super.c index e65212dfb60..261a1c2f22d 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -41,7 +41,7 @@ * Stefan Reinauer * * Module usage counts added on 96/04/29 by - * Gertjan van Wingerde + * Gertjan van Wingerde * * Clean swab support on 19970406 by * Francois-Rene Rideau -- cgit v1.2.3