From 5e1f8c9e20a92743eefc9a82c2db835213905e26 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 28 Oct 2008 13:21:55 -0400 Subject: ext3: Add support for non-native signed/unsigned htree hash algorithms The original ext3 hash algorithms assumed that variables of type char were signed, as God and K&R intended. Unfortunately, this assumption is not true on some architectures. Userspace support for marking filesystems with non-native signed/unsigned chars was added two years ago, but the kernel-side support was never added (until now). Signed-off-by: "Theodore Ts'o" Cc: akpm@linux-foundation.org Cc: linux-kernel@vger.kernel.org --- include/linux/ext3_fs.h | 28 +++++++++++++++++++++++++++- include/linux/ext3_fs_sb.h | 1 + 2 files changed, 28 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/ext3_fs.h b/include/linux/ext3_fs.h index d14f0291848..9004794a35f 100644 --- a/include/linux/ext3_fs.h +++ b/include/linux/ext3_fs.h @@ -353,6 +353,13 @@ struct ext3_inode { #define EXT3_ERROR_FS 0x0002 /* Errors detected */ #define EXT3_ORPHAN_FS 0x0004 /* Orphans being recovered */ +/* + * Misc. filesystem flags + */ +#define EXT2_FLAGS_SIGNED_HASH 0x0001 /* Signed dirhash in use */ +#define EXT2_FLAGS_UNSIGNED_HASH 0x0002 /* Unsigned dirhash in use */ +#define EXT2_FLAGS_TEST_FILESYS 0x0004 /* to test development code */ + /* * Mount flags */ @@ -489,7 +496,23 @@ struct ext3_super_block { __u16 s_reserved_word_pad; __le32 s_default_mount_opts; __le32 s_first_meta_bg; /* First metablock block group */ - __u32 s_reserved[190]; /* Padding to the end of the block */ + __le32 s_mkfs_time; /* When the filesystem was created */ + __le32 s_jnl_blocks[17]; /* Backup of the journal inode */ + /* 64bit support valid if EXT4_FEATURE_COMPAT_64BIT */ +/*150*/ __le32 s_blocks_count_hi; /* Blocks count */ + __le32 s_r_blocks_count_hi; /* Reserved blocks count */ + __le32 s_free_blocks_count_hi; /* Free blocks count */ + __le16 s_min_extra_isize; /* All inodes have at least # bytes */ + __le16 s_want_extra_isize; /* New inodes should reserve # bytes */ + __le32 s_flags; /* Miscellaneous flags */ + __le16 s_raid_stride; /* RAID stride */ + __le16 s_mmp_interval; /* # seconds to wait in MMP checking */ + __le64 s_mmp_block; /* Block for multi-mount protection */ + __le32 s_raid_stripe_width; /* blocks on all data disks (N*stride)*/ + __u8 s_log_groups_per_flex; /* FLEX_BG group size */ + __u8 s_reserved_char_pad2; + __le16 s_reserved_pad; + __u32 s_reserved[162]; /* Padding to the end of the block */ }; #ifdef __KERNEL__ @@ -694,6 +717,9 @@ static inline __le16 ext3_rec_len_to_disk(unsigned len) #define DX_HASH_LEGACY 0 #define DX_HASH_HALF_MD4 1 #define DX_HASH_TEA 2 +#define DX_HASH_LEGACY_UNSIGNED 3 +#define DX_HASH_HALF_MD4_UNSIGNED 4 +#define DX_HASH_TEA_UNSIGNED 5 #ifdef __KERNEL__ diff --git a/include/linux/ext3_fs_sb.h b/include/linux/ext3_fs_sb.h index e024e38248f..a4e9216b3a6 100644 --- a/include/linux/ext3_fs_sb.h +++ b/include/linux/ext3_fs_sb.h @@ -57,6 +57,7 @@ struct ext3_sb_info { u32 s_next_generation; u32 s_hash_seed[4]; int s_def_hash_version; + int s_hash_unsigned; /* 3 if hash should be signed, 0 if not */ struct percpu_counter s_freeblocks_counter; struct percpu_counter s_freeinodes_counter; struct percpu_counter s_dirs_counter; -- cgit v1.2.3 From 171bbfbeab7730031eec8025341401fabe540bd5 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Tue, 25 Nov 2008 17:42:31 -0500 Subject: jbd2: Add BH_JBDPrivateStart Add this so that file systems using JBD2 can safely allocate unused b_state bits. In this case, we add it so that Ocfs2 can define a single bit for tracking the validation state of a buffer. Signed-off-by: Mark Fasheh Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index c7d106ef22e..f3664574548 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -329,6 +329,7 @@ enum jbd_state_bits { BH_State, /* Pins most journal_head state */ BH_JournalHead, /* Pins bh->b_private and jh->b_bh */ BH_Unshadow, /* Dummy bit, for BJ_Shadow wakeup filtering */ + BH_JBDPrivateStart, /* First bit available for private use by FS */ }; BUFFER_FNS(JBD, jbd) -- cgit v1.2.3 From e07f7183a486cf9783d1f8c9d2997b5b39eeb2d4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 26 Nov 2008 01:14:26 -0500 Subject: jbd2: improve jbd2 fsync batching This patch removes the static sleep time in favor of a more self optimizing approach where we measure the average amount of time it takes to commit a transaction to disk and the ammount of time a transaction has been running. If somebody does a sync write or an fsync() traditionally we would sleep for 1 jiffies, which depending on the value of HZ could be a significant amount of time compared to how long it takes to commit a transaction to the underlying storage. With this patch instead of sleeping for a jiffie, we check to see if the amount of time this transaction has been running is less than the average commit time, and if it is we sleep for the delta using schedule_hrtimeout to give us a higher precision sleep time. This greatly benefits high end storage where you could end up sleeping for longer than it takes to commit the transaction and therefore sitting idle instead of allowing the transaction to be committed by keeping the sleep time to a minimum so you are sure to always be doing something. Signed-off-by: Josef Bacik Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index f3664574548..ab8cef130c2 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -637,6 +637,11 @@ struct transaction_s */ unsigned long t_expires; + /* + * When this transaction started, in nanoseconds [no locking] + */ + ktime_t t_start_time; + /* * How many handles used this transaction? [t_handle_lock] */ @@ -939,8 +944,18 @@ struct journal_s struct buffer_head **j_wbuf; int j_wbufsize; + /* + * this is the pid of hte last person to run a synchronous operation + * through the journal + */ pid_t j_last_sync_writer; + /* + * the average amount of time in nanoseconds it takes to commit a + * transaction to disk. [j_state_lock] + */ + u64 j_average_commit_time; + /* This function is called when a transaction is closed */ void (*j_commit_callback)(journal_t *, transaction_t *); -- cgit v1.2.3 From 30773840c19cea60dcef39545960d541b1ac1cf8 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 20:27:38 -0500 Subject: ext4: add fsync batch tuning knobs Add new mount options, min_batch_time and max_batch_time, which controls how long the jbd2 layer should wait for additional filesystem operations to get batched with a synchronous write transaction. Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index ab8cef130c2..a3cd647ea1b 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -956,6 +956,14 @@ struct journal_s */ u64 j_average_commit_time; + /* + * minimum and maximum times that we should wait for + * additional filesystem operations to get batched into a + * synchronous handle in microseconds + */ + u32 j_min_batch_time; + u32 j_max_batch_time; + /* This function is called when a transaction is closed */ void (*j_commit_callback)(journal_t *, transaction_t *); -- cgit v1.2.3 From 1a0d3786dd57dbd74f340322054c3d618b999dcf Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 5 Nov 2008 00:09:22 -0500 Subject: jbd2: Remove a large array of bh's from the stack of the checkpoint routine jbd2_log_do_checkpoint()n is one of the kernel's largest stack users. Move the array of buffer head's from the stack of jbd2_log_do_checkpoint() to the in-core journal structure. Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 10 ++++++++++ 1 file changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index a3cd647ea1b..004c9a8d63e 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -687,6 +687,8 @@ jbd2_time_diff(unsigned long start, unsigned long end) return end + (MAX_JIFFY_OFFSET - start); } +#define JBD2_NR_BATCH 64 + /** * struct journal_s - The journal_s type is the concrete type associated with * journal_t. @@ -830,6 +832,14 @@ struct journal_s /* Semaphore for locking against concurrent checkpoints */ struct mutex j_checkpoint_mutex; + /* + * List of buffer heads used by the checkpoint routine. This + * was moved from jbd2_log_do_checkpoint() to reduce stack + * usage. Access to this array is controlled by the + * j_checkpoint_mutex. [j_checkpoint_mutex] + */ + struct buffer_head *j_chkpt_bhs[JBD2_NR_BATCH]; + /* * Journal head: identifies the first unused block in the journal. * [j_state_lock] -- cgit v1.2.3 From fb68407b0d9efba962c03f55009c797e22f024bc Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Thu, 6 Nov 2008 17:50:21 -0500 Subject: jbd2: Call journal commit callback without holding j_list_lock Avoid freeing the transaction in __jbd2_journal_drop_transaction() so the journal commit callback can run without holding j_list_lock, to avoid lock contention on this spinlock. Signed-off-by: Aneesh Kumar K.V Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 004c9a8d63e..9d82084a160 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1179,8 +1179,8 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_do_checkpoint(journal_t *journal); void __jbd2_log_wait_for_space(journal_t *journal); -extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); -extern int jbd2_cleanup_journal_tail(journal_t *); +extern void __jbd2_journal_drop_transaction(journal_t *, transaction_t *); +extern int jbd2_cleanup_journal_tail(journal_t *); /* Debugging code only: */ -- cgit v1.2.3 From 87d8fe1ee6b8d2f95076142d58c440dba4e7bdc2 Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Sat, 3 Jan 2009 09:47:09 -0500 Subject: add releasepage hooks to block devices which can be used by file systems Implement blkdev_releasepage() to release the buffer_heads and pages after we release private data belonging to a mounted filesystem. Cc: Toshiyuki Okajima Cc: linux-fsdevel@vger.kernel.org Signed-off-by: "Theodore Ts'o" --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index f2a3010140e..0f54ae0f0cc 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -565,6 +565,7 @@ struct address_space { struct block_device { dev_t bd_dev; /* not a kdev_t - it's a search key */ struct inode * bd_inode; /* will die */ + struct super_block * bd_super; int bd_openers; struct mutex bd_mutex; /* open/close mutex */ struct semaphore bd_mount_sem; @@ -1385,6 +1386,7 @@ struct super_operations { ssize_t (*quota_read)(struct super_block *, int, char *, size_t, loff_t); ssize_t (*quota_write)(struct super_block *, int, const char *, size_t, loff_t); #endif + int (*bdev_try_to_free_page)(struct super_block*, struct page*, gfp_t); }; /* -- cgit v1.2.3 From c31910672376dfb8d020e32afa7249763bcd924a Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Tue, 6 Jan 2009 11:14:25 -0500 Subject: ext4: Remove code to create the journal inode This code has been obsolete in quite some time, since the supported method for adding a journal inode is to use tune2fs (or to creating new filesystem with a journal via mke2fs or mkfs.ext4). Signed-off-by: "Theodore Ts'o" --- include/linux/jbd2.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include') diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h index 9d82084a160..adef1c9940d 100644 --- a/include/linux/jbd2.h +++ b/include/linux/jbd2.h @@ -1104,7 +1104,6 @@ extern int jbd2_journal_set_features (journal_t *, unsigned long, unsigned long, unsigned long); extern void jbd2_journal_clear_features (journal_t *, unsigned long, unsigned long, unsigned long); -extern int jbd2_journal_create (journal_t *); extern int jbd2_journal_load (journal_t *journal); extern int jbd2_journal_destroy (journal_t *); extern int jbd2_journal_recover (journal_t *journal); -- cgit v1.2.3 From b3881f74b31b7d47d0f1c4d89ac3e7f0b9c05e3e Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Mon, 5 Jan 2009 22:46:26 -0500 Subject: ext4: Add mount option to set kjournald's I/O priority Signed-off-by: "Theodore Ts'o" Cc: Jens Axboe --- include/linux/ioprio.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h index f98a656b17e..76dad480884 100644 --- a/include/linux/ioprio.h +++ b/include/linux/ioprio.h @@ -86,4 +86,6 @@ static inline int task_nice_ioclass(struct task_struct *task) */ extern int ioprio_best(unsigned short aprio, unsigned short bprio); +extern int set_task_ioprio(struct task_struct *task, int ioprio); + #endif -- cgit v1.2.3