From 3d7a641e544e428191667e8b1f83f96fa46dbd65 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:10:23 +0000 Subject: SLOW_WORK: Wait for outstanding work items belonging to a module to clear Wait for outstanding slow work items belonging to a module to clear when unregistering that module as a user of the facility. This prevents the put_ref code of a work item from being taken away before it returns. Signed-off-by: David Howells --- include/linux/slow-work.h | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index b65c8881f07..9adb2b30754 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -24,6 +24,9 @@ struct slow_work; * The operations used to support slow work items */ struct slow_work_ops { + /* owner */ + struct module *owner; + /* get a ref on a work item * - return 0 if successful, -ve if not */ @@ -42,6 +45,7 @@ struct slow_work_ops { * queued */ struct slow_work { + struct module *owner; /* the owning module */ unsigned long flags; #define SLOW_WORK_PENDING 0 /* item pending (further) execution */ #define SLOW_WORK_EXECUTING 1 /* item currently executing */ @@ -84,8 +88,8 @@ static inline void vslow_work_init(struct slow_work *work, } extern int slow_work_enqueue(struct slow_work *work); -extern int slow_work_register_user(void); -extern void slow_work_unregister_user(void); +extern int slow_work_register_user(struct module *owner); +extern void slow_work_unregister_user(struct module *owner); #ifdef CONFIG_SYSCTL extern ctl_table slow_work_sysctls[]; -- cgit v1.2.3 From 0160950297c08f8233c89b9f9e7dd59cfb080809 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Nov 2009 18:10:43 +0000 Subject: SLOW_WORK: Add support for cancellation of slow work Add support for cancellation of queued slow work and delayed slow work items. The cancellation functions will wait for items that are pending or undergoing execution to be discarded by the slow work facility. Attempting to enqueue work that is in the process of being cancelled will result in ECANCELED. Signed-off-by: Jens Axboe Signed-off-by: David Howells --- include/linux/slow-work.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index 9adb2b30754..eef20182d5b 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -51,6 +51,7 @@ struct slow_work { #define SLOW_WORK_EXECUTING 1 /* item currently executing */ #define SLOW_WORK_ENQ_DEFERRED 2 /* item enqueue deferred */ #define SLOW_WORK_VERY_SLOW 3 /* item is very slow */ +#define SLOW_WORK_CANCELLING 4 /* item is being cancelled, don't enqueue */ const struct slow_work_ops *ops; /* operations table for this item */ struct list_head link; /* link in queue */ }; @@ -88,6 +89,7 @@ static inline void vslow_work_init(struct slow_work *work, } extern int slow_work_enqueue(struct slow_work *work); +extern void slow_work_cancel(struct slow_work *work); extern int slow_work_register_user(struct module *owner); extern void slow_work_unregister_user(struct module *owner); -- cgit v1.2.3 From 6b8268b17a1ffc942bc72d7d00274e433d6b6719 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 19 Nov 2009 18:10:47 +0000 Subject: SLOW_WORK: Add delayed_slow_work support This adds support for starting slow work with a delay, similar to the functionality we have for workqueues. Signed-off-by: Jens Axboe Signed-off-by: David Howells --- include/linux/slow-work.h | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) (limited to 'include') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index eef20182d5b..b245b9a9cc0 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -17,6 +17,7 @@ #ifdef CONFIG_SLOW_WORK #include +#include struct slow_work; @@ -52,10 +53,16 @@ struct slow_work { #define SLOW_WORK_ENQ_DEFERRED 2 /* item enqueue deferred */ #define SLOW_WORK_VERY_SLOW 3 /* item is very slow */ #define SLOW_WORK_CANCELLING 4 /* item is being cancelled, don't enqueue */ +#define SLOW_WORK_DELAYED 5 /* item is struct delayed_slow_work with active timer */ const struct slow_work_ops *ops; /* operations table for this item */ struct list_head link; /* link in queue */ }; +struct delayed_slow_work { + struct slow_work work; + struct timer_list timer; +}; + /** * slow_work_init - Initialise a slow work item * @work: The work item to initialise @@ -71,6 +78,20 @@ static inline void slow_work_init(struct slow_work *work, INIT_LIST_HEAD(&work->link); } +/** + * slow_work_init - Initialise a delayed slow work item + * @work: The work item to initialise + * @ops: The operations to use to handle the slow work item + * + * Initialise a delayed slow work item. + */ +static inline void delayed_slow_work_init(struct delayed_slow_work *dwork, + const struct slow_work_ops *ops) +{ + init_timer(&dwork->timer); + slow_work_init(&dwork->work, ops); +} + /** * vslow_work_init - Initialise a very slow work item * @work: The work item to initialise @@ -93,6 +114,14 @@ extern void slow_work_cancel(struct slow_work *work); extern int slow_work_register_user(struct module *owner); extern void slow_work_unregister_user(struct module *owner); +extern int delayed_slow_work_enqueue(struct delayed_slow_work *dwork, + unsigned long delay); + +static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork) +{ + slow_work_cancel(&dwork->work); +} + #ifdef CONFIG_SYSCTL extern ctl_table slow_work_sysctls[]; #endif -- cgit v1.2.3 From 8fba10a42d191de612e60e7009c8f0313f90a9b3 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:10:51 +0000 Subject: SLOW_WORK: Allow the work items to be viewed through a /proc file Allow the executing and queued work items to be viewed through a /proc file for debugging purposes. The contents look something like the following: THR PID ITEM ADDR FL MARK DESC === ===== ================ == ===== ========== 0 3005 ffff880023f52348 a 952ms FSC: OBJ17d3: LOOK 1 3006 ffff880024e33668 2 160ms FSC: OBJ17e5 OP60d3b: Write1/Store fl=2 2 3165 ffff8800296dd180 a 424ms FSC: OBJ17e4: LOOK 3 4089 ffff8800262c8d78 a 212ms FSC: OBJ17ea: CRTN 4 4090 ffff88002792bed8 2 388ms FSC: OBJ17e8 OP60d36: Write1/Store fl=2 5 4092 ffff88002a0ef308 2 388ms FSC: OBJ17e7 OP60d2e: Write1/Store fl=2 6 4094 ffff88002abaf4b8 2 132ms FSC: OBJ17e2 OP60d4e: Write1/Store fl=2 7 4095 ffff88002bb188e0 a 388ms FSC: OBJ17e9: CRTN vsq - ffff880023d99668 1 308ms FSC: OBJ17e0 OP60f91: Write1/EnQ fl=2 vsq - ffff8800295d1740 1 212ms FSC: OBJ16be OP4d4b6: Write1/EnQ fl=2 vsq - ffff880025ba3308 1 160ms FSC: OBJ179a OP58dec: Write1/EnQ fl=2 vsq - ffff880024ec83e0 1 160ms FSC: OBJ17ae OP599f2: Write1/EnQ fl=2 vsq - ffff880026618e00 1 160ms FSC: OBJ17e6 OP60d33: Write1/EnQ fl=2 vsq - ffff880025a2a4b8 1 132ms FSC: OBJ16a2 OP4d583: Write1/EnQ fl=2 vsq - ffff880023cbe6d8 9 212ms FSC: OBJ17eb: LOOK vsq - ffff880024d37590 9 212ms FSC: OBJ17ec: LOOK vsq - ffff880027746cb0 9 212ms FSC: OBJ17ed: LOOK vsq - ffff880024d37ae8 9 212ms FSC: OBJ17ee: LOOK vsq - ffff880024d37cb0 9 212ms FSC: OBJ17ef: LOOK vsq - ffff880025036550 9 212ms FSC: OBJ17f0: LOOK vsq - ffff8800250368e0 9 212ms FSC: OBJ17f1: LOOK vsq - ffff880025036aa8 9 212ms FSC: OBJ17f2: LOOK In the 'THR' column, executing items show the thread they're occupying and queued threads indicate which queue they're on. 'PID' shows the process ID of a slow-work thread that's executing something. 'FL' shows the work item flags. 'MARK' indicates how long since an item was queued or began executing. Lastly, the 'DESC' column permits the owner of an item to give some information. Signed-off-by: David Howells --- include/linux/slow-work.h | 11 +++++++++++ 1 file changed, 11 insertions(+) (limited to 'include') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index b245b9a9cc0..f41485145ed 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -20,6 +20,9 @@ #include struct slow_work; +#ifdef CONFIG_SLOW_WORK_PROC +struct seq_file; +#endif /* * The operations used to support slow work items @@ -38,6 +41,11 @@ struct slow_work_ops { /* execute a work item */ void (*execute)(struct slow_work *work); + +#ifdef CONFIG_SLOW_WORK_PROC + /* describe a work item for /proc */ + void (*desc)(struct slow_work *work, struct seq_file *m); +#endif }; /* @@ -56,6 +64,9 @@ struct slow_work { #define SLOW_WORK_DELAYED 5 /* item is struct delayed_slow_work with active timer */ const struct slow_work_ops *ops; /* operations table for this item */ struct list_head link; /* link in queue */ +#ifdef CONFIG_SLOW_WORK_PROC + struct timespec mark; /* jiffies at which queued or exec begun */ +#endif }; struct delayed_slow_work { -- cgit v1.2.3 From 31ba99d304494cb28fa8671ccc769c5543e1165d Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:10:53 +0000 Subject: SLOW_WORK: Allow the owner of a work item to determine if it is queued or not Add a function (slow_work_is_queued()) to permit the owner of a work item to determine if the item is queued or not. The work item is counted as being queued if it is actually on the queue, not just if it is pending. If it is executing and pending, then it is not on the queue, but will rather be put back on the queue when execution finishes. This permits a caller to quickly work out if it may be able to put another, dependent work item on the queue behind it, or whether it will have to wait till that is finished. This can be used by CacheFiles to work out whether the creation a new object can be immediately deferred when it has to wait for an old object to be deleted, or whether a wait must take place. If a wait is necessary, then the slow-work thread can otherwise get blocked, preventing the deletion from taking place. Signed-off-by: David Howells --- include/linux/slow-work.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index f41485145ed..bfd3ab4c889 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -120,6 +120,25 @@ static inline void vslow_work_init(struct slow_work *work, INIT_LIST_HEAD(&work->link); } +/** + * slow_work_is_queued - Determine if a slow work item is on the work queue + * work: The work item to test + * + * Determine if the specified slow-work item is on the work queue. This + * returns true if it is actually on the queue. + * + * If the item is executing and has been marked for requeue when execution + * finishes, then false will be returned. + * + * Anyone wishing to wait for completion of execution can wait on the + * SLOW_WORK_EXECUTING bit. + */ +static inline bool slow_work_is_queued(struct slow_work *work) +{ + unsigned long flags = work->flags; + return flags & SLOW_WORK_PENDING && !(flags & SLOW_WORK_EXECUTING); +} + extern int slow_work_enqueue(struct slow_work *work); extern void slow_work_cancel(struct slow_work *work); extern int slow_work_register_user(struct module *owner); -- cgit v1.2.3 From 3bde31a4ac225cb5805be02eff6eaaf7e0766ccd Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:10:57 +0000 Subject: SLOW_WORK: Allow a requeueable work item to sleep till the thread is needed Add a function to allow a requeueable work item to sleep till the thread processing it is needed by the slow-work facility to perform other work. Sometimes a work item can't progress immediately, but must wait for the completion of another work item that's currently being processed by another slow-work thread. In some circumstances, the waiting item could instead - theoretically - put itself back on the queue and yield its thread back to the slow-work facility, thus waiting till it gets processing time again before attempting to progress. This would allow other work items processing time on that thread. However, this only works if there is something on the queue for it to queue behind - otherwise it will just get a thread again immediately, and will end up cycling between the queue and the thread, eating up valuable CPU time. So, slow_work_sleep_till_thread_needed() is provided such that an item can put itself on a wait queue that will wake it up when the event it is actually interested in occurs, then call this function in lieu of calling schedule(). This function will then sleep until either the item's event occurs or another work item appears on the queue. If another work item is queued, but the item's event hasn't occurred, then the work item should requeue itself and yield the thread back to the slow-work facility by returning. This can be used by CacheFiles for an object that is being created on one thread to wait for an object being deleted on another thread where there is nothing on the queue for the creation to go and wait behind. As soon as an item appears on the queue that could be given thread time instead, CacheFiles can stick the creating object back on the queue and return to the slow-work facility - assuming the object deletion didn't also complete. Signed-off-by: David Howells --- include/linux/slow-work.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/slow-work.h b/include/linux/slow-work.h index bfd3ab4c889..5035a269173 100644 --- a/include/linux/slow-work.h +++ b/include/linux/slow-work.h @@ -152,6 +152,9 @@ static inline void delayed_slow_work_cancel(struct delayed_slow_work *dwork) slow_work_cancel(&dwork->work); } +extern bool slow_work_sleep_till_thread_needed(struct slow_work *work, + signed long *_timeout); + #ifdef CONFIG_SYSCTL extern ctl_table slow_work_sysctls[]; #endif -- cgit v1.2.3 From 440f0affe247e9990c8f8778f1861da4fd7d5e50 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:01 +0000 Subject: FS-Cache: Annotate slow-work runqueue proc lines for FS-Cache work items Annotate slow-work runqueue proc lines for FS-Cache work items. Objects include the object ID and the state. Operations include the object ID, the operation ID and the operation type and state. Signed-off-by: David Howells --- include/linux/fscache-cache.h | 12 ++++++++++++ 1 file changed, 12 insertions(+) (limited to 'include') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 84d3532dd3e..7a9847ccd19 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -102,6 +102,16 @@ struct fscache_operation { /* operation releaser */ fscache_operation_release_t release; + +#ifdef CONFIG_SLOW_WORK_PROC + const char *name; /* operation name */ + const char *state; /* operation state */ +#define fscache_set_op_name(OP, N) do { (OP)->name = (N); } while(0) +#define fscache_set_op_state(OP, S) do { (OP)->state = (S); } while(0) +#else +#define fscache_set_op_name(OP, N) do { } while(0) +#define fscache_set_op_state(OP, S) do { } while(0) +#endif }; extern atomic_t fscache_op_debug_id; @@ -125,6 +135,7 @@ static inline void fscache_operation_init(struct fscache_operation *op, op->debug_id = atomic_inc_return(&fscache_op_debug_id); op->release = release; INIT_LIST_HEAD(&op->pend_link); + fscache_set_op_state(op, "Init"); } /** @@ -337,6 +348,7 @@ struct fscache_object { FSCACHE_OBJECT_RECYCLING, /* retiring object */ FSCACHE_OBJECT_WITHDRAWING, /* withdrawing object */ FSCACHE_OBJECT_DEAD, /* object is now dead */ + FSCACHE_OBJECT__NSTATES } state; int debug_id; /* debugging ID */ -- cgit v1.2.3 From 4fbf4291aa15926cd4fdca0ffe9122e89d0459db Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:04 +0000 Subject: FS-Cache: Allow the current state of all objects to be dumped Allow the current state of all fscache objects to be dumped by doing: cat /proc/fs/fscache/objects By default, all objects and all fields will be shown. This can be restricted by adding a suitable key to one of the caller's keyrings (such as the session keyring): keyctl add user fscache:objlist "" @s The are: K Show hexdump of object key (don't show if not given) A Show hexdump of object aux data (don't show if not given) And paired restrictions: C Show objects that have a cookie c Show objects that don't have a cookie B Show objects that are busy b Show objects that aren't busy W Show objects that have pending writes w Show objects that don't have pending writes R Show objects that have outstanding reads r Show objects that don't have outstanding reads S Show objects that have slow work queued s Show objects that don't have slow work queued If neither side of a restriction pair is given, then both are implied. For example: keyctl add user fscache:objlist KB @s shows objects that are busy, and lists their object keys, but does not dump their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is not implied. Signed-off-by: David Howells --- include/linux/fscache-cache.h | 13 +++++++++++++ 1 file changed, 13 insertions(+) (limited to 'include') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 7a9847ccd19..184cbdfbcc9 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -91,6 +91,8 @@ struct fscache_operation { #define FSCACHE_OP_WAITING 4 /* cleared when op is woken */ #define FSCACHE_OP_EXCLUSIVE 5 /* exclusive op, other ops must wait */ #define FSCACHE_OP_DEAD 6 /* op is now dead */ +#define FSCACHE_OP_DEC_READ_CNT 7 /* decrement object->n_reads on destruction */ +#define FSCACHE_OP_KEEP_FLAGS 0xc0 /* flags to keep when repurposing an op */ atomic_t usage; unsigned debug_id; /* debugging ID */ @@ -357,6 +359,7 @@ struct fscache_object { int n_obj_ops; /* number of object ops outstanding on object */ int n_in_progress; /* number of ops in progress */ int n_exclusive; /* number of exclusive ops queued */ + atomic_t n_reads; /* number of read ops in progress */ spinlock_t lock; /* state and operations lock */ unsigned long lookup_jif; /* time at which lookup started */ @@ -370,6 +373,7 @@ struct fscache_object { #define FSCACHE_OBJECT_EV_RELEASE 4 /* T if netfs requested object release */ #define FSCACHE_OBJECT_EV_RETIRE 5 /* T if netfs requested object retirement */ #define FSCACHE_OBJECT_EV_WITHDRAW 6 /* T if cache requested object withdrawal */ +#define FSCACHE_OBJECT_EVENTS_MASK 0x7f /* mask of all events*/ unsigned long flags; #define FSCACHE_OBJECT_LOCK 0 /* T if object is busy being processed */ @@ -385,6 +389,9 @@ struct fscache_object { struct list_head dependents; /* FIFO of dependent objects */ struct list_head dep_link; /* link in parent's dependents list */ struct list_head pending_ops; /* unstarted operations on this object */ +#ifdef CONFIG_FSCACHE_OBJECT_LIST + struct rb_node objlist_link; /* link in global object list */ +#endif pgoff_t store_limit; /* current storage limit */ }; @@ -434,6 +441,12 @@ void fscache_object_init(struct fscache_object *object, extern void fscache_object_lookup_negative(struct fscache_object *object); extern void fscache_obtained_object(struct fscache_object *object); +#ifdef CONFIG_FSCACHE_OBJECT_LIST +extern void fscache_object_destroy(struct fscache_object *object); +#else +#define fscache_object_destroy(object) do {} while(0) +#endif + /** * fscache_object_destroyed - Note destruction of an object in a cache * @cache: The cache from which the object came -- cgit v1.2.3 From 1bccf513ac49d44604ba1cddcc29f5886e70f1b6 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:25 +0000 Subject: FS-Cache: Fix lock misorder in fscache_write_op() FS-Cache has two structs internally for keeping track of the internal state of a cached file: the fscache_cookie struct, which represents the netfs's state, and fscache_object struct, which represents the cache's state. Each has a pointer that points to the other (when both are in existence), and each has a spinlock for pointer maintenance. Since netfs operations approach these structures from the cookie side, they get the cookie lock first, then the object lock. Cache operations, on the other hand, approach from the object side, and get the object lock first. It is not then permitted for a cache operation to get the cookie lock whilst it is holding the object lock lest deadlock occur; instead, it must do one of two things: (1) increment the cookie usage counter, drop the object lock and then get both locks in order, or (2) simply hold the object lock as certain parts of the cookie may not be altered whilst the object lock is held. It is also not permitted to follow either pointer without holding the lock at the end you start with. To break the pointers between the cookie and the object, both locks must be held. fscache_write_op(), however, violates the locking rules: It attempts to get the cookie lock without (a) checking that the cookie pointer is a valid pointer, and (b) holding the object lock to protect the cookie pointer whilst it follows it. This is so that it can access the pending page store tree without interference from __fscache_write_page(). This is fixed by splitting the cookie lock, such that the page store tracking tree is protected by its own lock, and checking that the cookie pointer is non-NULL before we attempt to follow it whilst holding the object lock. The new lock is subordinate to both the cookie lock and the object lock, and so should be taken after those. Signed-off-by: David Howells --- include/linux/fscache-cache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 184cbdfbcc9..f3aa4bdafef 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -310,6 +310,7 @@ struct fscache_cookie { atomic_t usage; /* number of users of this cookie */ atomic_t n_children; /* number of children of this cookie */ spinlock_t lock; + spinlock_t stores_lock; /* lock on page store tree */ struct hlist_head backing_objects; /* object(s) backing this file/index */ const struct fscache_cookie_def *def; /* definition */ struct fscache_cookie *parent; /* parent of this entry */ -- cgit v1.2.3 From 201a15428bd54f83eccec8b7c64a04b8f9431204 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:35 +0000 Subject: FS-Cache: Handle pages pending storage that get evicted under OOM conditions Handle netfs pages that the vmscan algorithm wants to evict from the pagecache under OOM conditions, but that are waiting for write to the cache. Under these conditions, vmscan calls the releasepage() function of the netfs, asking if a page can be discarded. The problem is typified by the following trace of a stuck process: kslowd005 D 0000000000000000 0 4253 2 0x00000080 ffff88001b14f370 0000000000000046 ffff880020d0d000 0000000000000007 0000000000000006 0000000000000001 ffff88001b14ffd8 ffff880020d0d2a8 000000000000ddf0 00000000000118c0 00000000000118c0 ffff880020d0d2a8 Call Trace: [] __fscache_wait_on_page_write+0x8b/0xa7 [fscache] [] ? autoremove_wake_function+0x0/0x34 [] ? __fscache_check_page_write+0x63/0x70 [fscache] [] nfs_fscache_release_page+0x4e/0xc4 [nfs] [] nfs_release_page+0x3c/0x41 [nfs] [] try_to_release_page+0x32/0x3b [] shrink_page_list+0x316/0x4ac [] shrink_inactive_list+0x392/0x67c [] ? __mutex_unlock_slowpath+0x100/0x10b [] ? trace_hardirqs_on_caller+0x10c/0x130 [] ? mutex_unlock+0x9/0xb [] shrink_list+0x8d/0x8f [] shrink_zone+0x278/0x33c [] ? ktime_get_ts+0xad/0xba [] try_to_free_pages+0x22e/0x392 [] ? isolate_pages_global+0x0/0x212 [] __alloc_pages_nodemask+0x3dc/0x5cf [] grab_cache_page_write_begin+0x65/0xaa [] ext3_write_begin+0x78/0x1eb [] generic_file_buffered_write+0x109/0x28c [] ? current_fs_time+0x22/0x29 [] __generic_file_aio_write+0x350/0x385 [] ? generic_file_aio_write+0x4a/0xae [] generic_file_aio_write+0x60/0xae [] do_sync_write+0xe3/0x120 [] ? autoremove_wake_function+0x0/0x34 [] ? __dentry_open+0x1a5/0x2b8 [] ? dentry_open+0x82/0x89 [] cachefiles_write_page+0x298/0x335 [cachefiles] [] fscache_write_op+0x178/0x2c2 [fscache] [] fscache_op_execute+0x7a/0xd1 [fscache] [] slow_work_execute+0x18f/0x2d1 [] slow_work_thread+0x1c5/0x308 [] ? autoremove_wake_function+0x0/0x34 [] ? slow_work_thread+0x0/0x308 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? tg_shares_up+0x171/0x227 [] ? kthread+0x0/0x82 [] ? child_rip+0x0/0x20 In the above backtrace, the following is happening: (1) A page storage operation is being executed by a slow-work thread (fscache_write_op()). (2) FS-Cache farms the operation out to the cache to perform (cachefiles_write_page()). (3) CacheFiles is then calling Ext3 to perform the actual write, using Ext3's standard write (do_sync_write()) under KERNEL_DS directly from the netfs page. (4) However, for Ext3 to perform the write, it must allocate some memory, in particular, it must allocate at least one page cache page into which it can copy the data from the netfs page. (5) Under OOM conditions, the memory allocator can't immediately come up with a page, so it uses vmscan to find something to discard (try_to_free_pages()). (6) vmscan finds a clean netfs page it might be able to discard (possibly the one it's trying to write out). (7) The netfs is called to throw the page away (nfs_release_page()) - but it's called with __GFP_WAIT, so the netfs decides to wait for the store to complete (__fscache_wait_on_page_write()). (8) This blocks a slow-work processing thread - possibly against itself. The system ends up stuck because it can't write out any netfs pages to the cache without allocating more memory. To avoid this, we make FS-Cache cancel some writes that aren't in the middle of actually being performed. This means that some data won't make it into the cache this time. To support this, a new FS-Cache function is added fscache_maybe_release_page() that replaces what the netfs releasepage() functions used to do with respect to the cache. The decisions fscache_maybe_release_page() makes are counted and displayed through /proc/fs/fscache/stats on a line labelled "VmScan". There are four counters provided: "nos=N" - pages that weren't pending storage; "gon=N" - pages that were pending storage when we first looked, but weren't by the time we got the object lock; "bsy=N" - pages that we ignored as they were actively being written when we looked; and "can=N" - pages that we cancelled the storage of. What I'd really like to do is alter the behaviour of the cancellation heuristics, depending on how necessary it is to expel pages. If there are plenty of other pages that aren't waiting to be written to the cache that could be ejected first, then it would be nice to hold up on immediate cancellation of cache writes - but I don't see a way of doing that. Signed-off-by: David Howells --- include/linux/fscache-cache.h | 1 + include/linux/fscache.h | 27 +++++++++++++++++++++++++++ 2 files changed, 28 insertions(+) (limited to 'include') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index f3aa4bdafef..4750d5fb419 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -317,6 +317,7 @@ struct fscache_cookie { void *netfs_data; /* back pointer to netfs */ struct radix_tree_root stores; /* pages to be stored on this cookie */ #define FSCACHE_COOKIE_PENDING_TAG 0 /* pages tag: pending write to cache */ +#define FSCACHE_COOKIE_STORING_TAG 1 /* pages tag: writing to cache */ unsigned long flags; #define FSCACHE_COOKIE_LOOKING_UP 0 /* T if non-index cookie being looked up still */ diff --git a/include/linux/fscache.h b/include/linux/fscache.h index 6d8ee466e0a..595ce49288b 100644 --- a/include/linux/fscache.h +++ b/include/linux/fscache.h @@ -202,6 +202,8 @@ extern int __fscache_write_page(struct fscache_cookie *, struct page *, gfp_t); extern void __fscache_uncache_page(struct fscache_cookie *, struct page *); extern bool __fscache_check_page_write(struct fscache_cookie *, struct page *); extern void __fscache_wait_on_page_write(struct fscache_cookie *, struct page *); +extern bool __fscache_maybe_release_page(struct fscache_cookie *, struct page *, + gfp_t); /** * fscache_register_netfs - Register a filesystem as desiring caching services @@ -615,4 +617,29 @@ void fscache_wait_on_page_write(struct fscache_cookie *cookie, __fscache_wait_on_page_write(cookie, page); } +/** + * fscache_maybe_release_page - Consider releasing a page, cancelling a store + * @cookie: The cookie representing the cache object + * @page: The netfs page that is being cached. + * @gfp: The gfp flags passed to releasepage() + * + * Consider releasing a page for the vmscan algorithm, on behalf of the netfs's + * releasepage() call. A storage request on the page may cancelled if it is + * not currently being processed. + * + * The function returns true if the page no longer has a storage request on it, + * and false if a storage request is left in place. If true is returned, the + * page will have been passed to fscache_uncache_page(). If false is returned + * the page cannot be freed yet. + */ +static inline +bool fscache_maybe_release_page(struct fscache_cookie *cookie, + struct page *page, + gfp_t gfp) +{ + if (fscache_cookie_valid(cookie) && PageFsCache(page)) + return __fscache_maybe_release_page(cookie, page, gfp); + return false; +} + #endif /* _LINUX_FSCACHE_H */ -- cgit v1.2.3 From 60d543ca724be155c2b6166e36a00c80b21bd810 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:45 +0000 Subject: FS-Cache: Start processing an object's operations on that object's death Start processing an object's operations when that object moves into the DYING state as the object cannot be destroyed until all its outstanding operations have completed. Furthermore, make sure that read and allocation operations handle being woken up on a dead object. Such events are recorded in the Allocs.abt and Retrvls.abt statistics as viewable through /proc/fs/fscache/stats. The code for waiting for object activation for the read and allocation operations is also extracted into its own function as it is much the same in all cases, differing only in the stats incremented. Signed-off-by: David Howells --- include/linux/fscache-cache.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 4750d5fb419..907bb56c588 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -404,6 +404,10 @@ extern const char *fscache_object_states[]; (obj)->state >= FSCACHE_OBJECT_AVAILABLE && \ (obj)->state < FSCACHE_OBJECT_DYING) +#define fscache_object_is_dead(obj) \ + (test_bit(FSCACHE_IOERROR, &(obj)->cache->flags) && \ + (obj)->state >= FSCACHE_OBJECT_DYING) + extern const struct slow_work_ops fscache_object_slow_work_ops; /** -- cgit v1.2.3 From a17754fb8c28af19cd70dcbec6d5b0773b94e0c1 Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:11:52 +0000 Subject: CacheFiles: Don't write a full page if there's only a partial page to cache cachefiles_write_page() writes a full page to the backing file for the last page of the netfs file, even if the netfs file's last page is only a partial page. This causes the EOF on the backing file to be extended beyond the EOF of the netfs, and thus the backing file will be truncated by cachefiles_attr_changed() called from cachefiles_lookup_object(). So we need to limit the write we make to the backing file on that last page such that it doesn't push the EOF too far. Also, if a backing file that has a partial page at the end is expanded, we discard the partial page and refetch it on the basis that we then have a hole in the file with invalid data, and should the power go out... A better way to deal with this could be to record a note that the partial page contains invalid data until the correct data is written into it. This isn't a problem for netfs's that discard the whole backing file if the file size changes (such as NFS). Signed-off-by: David Howells --- include/linux/fscache-cache.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 907bb56c588..5db50002f3b 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -395,6 +395,7 @@ struct fscache_object { struct rb_node objlist_link; /* link in global object list */ #endif pgoff_t store_limit; /* current storage limit */ + loff_t store_limit_l; /* current storage limit */ }; extern const char *fscache_object_states[]; @@ -439,6 +440,7 @@ void fscache_object_init(struct fscache_object *object, object->events = object->event_mask = 0; object->flags = 0; object->store_limit = 0; + object->store_limit_l = 0; object->cache = cache; object->cookie = cookie; object->parent = NULL; @@ -491,6 +493,7 @@ static inline void fscache_object_lookup_error(struct fscache_object *object) static inline void fscache_set_store_limit(struct fscache_object *object, loff_t i_size) { + object->store_limit_l = i_size; object->store_limit = i_size >> PAGE_SHIFT; if (i_size & ~PAGE_MASK) object->store_limit++; -- cgit v1.2.3 From fee096deb4f33897937b974cb2c5168bab7935be Mon Sep 17 00:00:00 2001 From: David Howells Date: Thu, 19 Nov 2009 18:12:05 +0000 Subject: CacheFiles: Catch an overly long wait for an old active object Catch an overly long wait for an old, dying active object when we want to replace it with a new one. The probability is that all the slow-work threads are hogged, and the delete can't get a look in. What we do instead is: (1) if there's nothing in the slow work queue, we sleep until either the dying object has finished dying or there is something in the slow work queue behind which we can queue our object. (2) if there is something in the slow work queue, we return ETIMEDOUT to fscache_lookup_object(), which then puts us back on the slow work queue, presumably behind the deletion that we're blocked by. We are then deferred for a while until we work our way back through the queue - without blocking a slow-work thread unnecessarily. A backtrace similar to the following may appear in the log without this patch: INFO: task kslowd004:5711 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. kslowd004 D 0000000000000000 0 5711 2 0x00000080 ffff88000340bb80 0000000000000046 ffff88002550d000 0000000000000000 ffff88002550d000 0000000000000007 ffff88000340bfd8 ffff88002550d2a8 000000000000ddf0 00000000000118c0 00000000000118c0 ffff88002550d2a8 Call Trace: [] ? trace_hardirqs_on+0xd/0xf [] ? cachefiles_wait_bit+0x0/0xd [cachefiles] [] cachefiles_wait_bit+0x9/0xd [cachefiles] [] __wait_on_bit+0x43/0x76 [] ? ext3_xattr_get+0x1ec/0x270 [] out_of_line_wait_on_bit+0x69/0x74 [] ? cachefiles_wait_bit+0x0/0xd [cachefiles] [] ? wake_bit_function+0x0/0x2e [] cachefiles_mark_object_active+0x203/0x23b [cachefiles] [] cachefiles_walk_to_object+0x558/0x827 [cachefiles] [] cachefiles_lookup_object+0xac/0x12a [cachefiles] [] fscache_lookup_object+0x1c7/0x214 [fscache] [] fscache_object_state_machine+0xa5/0x52d [fscache] [] fscache_object_slow_work_execute+0x5f/0xa0 [fscache] [] slow_work_execute+0x18f/0x2d1 [] slow_work_thread+0x1c5/0x308 [] ? autoremove_wake_function+0x0/0x34 [] ? slow_work_thread+0x0/0x308 [] kthread+0x7a/0x82 [] child_rip+0xa/0x20 [] ? restore_args+0x0/0x30 [] ? kthread+0x0/0x82 [] ? child_rip+0x0/0x20 1 lock held by kslowd004/5711: #0: (&sb->s_type->i_mutex_key#7/1){+.+.+.}, at: [] cachefiles_walk_to_object+0x1b3/0x827 [cachefiles] Signed-off-by: David Howells --- include/linux/fscache-cache.h | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/fscache-cache.h b/include/linux/fscache-cache.h index 5db50002f3b..7be0c6fbe88 100644 --- a/include/linux/fscache-cache.h +++ b/include/linux/fscache-cache.h @@ -234,8 +234,10 @@ struct fscache_cache_ops { struct fscache_object *(*alloc_object)(struct fscache_cache *cache, struct fscache_cookie *cookie); - /* look up the object for a cookie */ - void (*lookup_object)(struct fscache_object *object); + /* look up the object for a cookie + * - return -ETIMEDOUT to be requeued + */ + int (*lookup_object)(struct fscache_object *object); /* finished looking up */ void (*lookup_complete)(struct fscache_object *object); -- cgit v1.2.3