aboutsummaryrefslogtreecommitdiff
path: root/Documentation/filesystems
diff options
context:
space:
mode:
Diffstat (limited to 'Documentation/filesystems')
-rw-r--r--Documentation/filesystems/00-INDEX14
-rw-r--r--Documentation/filesystems/Locking18
-rw-r--r--Documentation/filesystems/caching/fscache.txt110
-rw-r--r--Documentation/filesystems/caching/netfs-api.txt21
-rw-r--r--Documentation/filesystems/dentry-locking.txt3
-rw-r--r--Documentation/filesystems/exofs.txt23
-rw-r--r--Documentation/filesystems/ext3.txt4
-rw-r--r--Documentation/filesystems/ext4.txt12
-rw-r--r--Documentation/filesystems/nfs/00-INDEX16
-rw-r--r--Documentation/filesystems/nfs/Exporting (renamed from Documentation/filesystems/Exporting)0
-rw-r--r--Documentation/filesystems/nfs/knfsd-stats.txt (renamed from Documentation/filesystems/knfsd-stats.txt)0
-rw-r--r--Documentation/filesystems/nfs/nfs-rdma.txt (renamed from Documentation/filesystems/nfs-rdma.txt)0
-rw-r--r--Documentation/filesystems/nfs/nfs.txt (renamed from Documentation/filesystems/nfs.txt)0
-rw-r--r--Documentation/filesystems/nfs/nfs41-server.txt (renamed from Documentation/filesystems/nfs41-server.txt)14
-rw-r--r--Documentation/filesystems/nfs/nfsroot.txt (renamed from Documentation/filesystems/nfsroot.txt)0
-rw-r--r--Documentation/filesystems/nfs/rpc-cache.txt (renamed from Documentation/filesystems/rpc-cache.txt)0
-rw-r--r--Documentation/filesystems/nilfs2.txt12
-rw-r--r--Documentation/filesystems/ocfs2.txt6
-rw-r--r--Documentation/filesystems/porting2
-rw-r--r--Documentation/filesystems/proc.txt71
-rw-r--r--Documentation/filesystems/seq_file.txt4
-rw-r--r--Documentation/filesystems/sharedsubtree.txt16
-rw-r--r--Documentation/filesystems/sysfs.txt12
-rw-r--r--Documentation/filesystems/vfs.txt2
24 files changed, 286 insertions, 74 deletions
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index d362aa543b2..5139b8c9d5a 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -1,7 +1,5 @@
00-INDEX
- this file (info on some of the filesystems supported by linux).
-Exporting
- - explanation of how to make filesystems exportable.
Locking
- info on locking rules as they pertain to Linux VFS.
9p.txt
@@ -36,6 +34,8 @@ dnotify.txt
- info about directory notification in Linux.
ecryptfs.txt
- docs on eCryptfs: stacked cryptographic filesystem for Linux.
+exofs.txt
+ - info, usage, mount options, design about EXOFS.
ext2.txt
- info, mount options and specifications for the Ext2 filesystem.
ext3.txt
@@ -68,12 +68,8 @@ mandatory-locking.txt
- info on the Linux implementation of Sys V mandatory file locking.
ncpfs.txt
- info on Novell Netware(tm) filesystem using NCP protocol.
-nfs41-server.txt
- - info on the Linux server implementation of NFSv4 minor version 1.
-nfs-rdma.txt
- - how to install and setup the Linux NFS/RDMA client and server software.
-nfsroot.txt
- - short guide on setting up a diskless box with NFS root filesystem.
+nfs/
+ - nfs-related documentation.
nilfs2.txt
- info and mount options for the NILFS2 filesystem.
ntfs.txt
@@ -92,8 +88,6 @@ relay.txt
- info on relay, for efficient streaming from kernel to user space.
romfs.txt
- description of the ROMFS filesystem.
-rpc-cache.txt
- - introduction to the caching mechanisms in the sunrpc layer.
seq_file.txt
- how to use the seq_file API
sharedsubtree.txt
diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking
index 18b9d0ca063..06bbbed7120 100644
--- a/Documentation/filesystems/Locking
+++ b/Documentation/filesystems/Locking
@@ -460,13 +460,6 @@ in sys_read() and friends.
--------------------------- dquot_operations -------------------------------
prototypes:
- int (*initialize) (struct inode *, int);
- int (*drop) (struct inode *);
- int (*alloc_space) (struct inode *, qsize_t, int);
- int (*alloc_inode) (const struct inode *, unsigned long);
- int (*free_space) (struct inode *, qsize_t);
- int (*free_inode) (const struct inode *, unsigned long);
- int (*transfer) (struct inode *, struct iattr *);
int (*write_dquot) (struct dquot *);
int (*acquire_dquot) (struct dquot *);
int (*release_dquot) (struct dquot *);
@@ -479,13 +472,6 @@ a proper locking wrt the filesystem and call the generic quota operations.
What filesystem should expect from the generic quota functions:
FS recursion Held locks when called
-initialize: yes maybe dqonoff_sem
-drop: yes -
-alloc_space: ->mark_dirty() -
-alloc_inode: ->mark_dirty() -
-free_space: ->mark_dirty() -
-free_inode: ->mark_dirty() -
-transfer: yes -
write_dquot: yes dqonoff_sem or dqptr_sem
acquire_dquot: yes dqonoff_sem or dqptr_sem
release_dquot: yes dqonoff_sem or dqptr_sem
@@ -495,10 +481,6 @@ write_info: yes dqonoff_sem
FS recursion means calling ->quota_read() and ->quota_write() from superblock
operations.
-->alloc_space(), ->alloc_inode(), ->free_space(), ->free_inode() are called
-only directly by the filesystem and do not call any fs functions only
-the ->mark_dirty() operation.
-
More details about quota locking can be found in fs/dquot.c.
--------------------------- vm_operations_struct -----------------------------
diff --git a/Documentation/filesystems/caching/fscache.txt b/Documentation/filesystems/caching/fscache.txt
index 9e94b9491d8..a91e2e2095b 100644
--- a/Documentation/filesystems/caching/fscache.txt
+++ b/Documentation/filesystems/caching/fscache.txt
@@ -235,6 +235,7 @@ proc files.
neg=N Number of negative lookups made
pos=N Number of positive lookups made
crt=N Number of objects created by lookup
+ tmo=N Number of lookups timed out and requeued
Updates n=N Number of update cookie requests seen
nul=N Number of upd reqs given a NULL parent
run=N Number of upd reqs granted CPU time
@@ -250,8 +251,10 @@ proc files.
ok=N Number of successful alloc reqs
wt=N Number of alloc reqs that waited on lookup completion
nbf=N Number of alloc reqs rejected -ENOBUFS
+ int=N Number of alloc reqs aborted -ERESTARTSYS
ops=N Number of alloc reqs submitted
owt=N Number of alloc reqs waited for CPU time
+ abt=N Number of alloc reqs aborted due to object death
Retrvls n=N Number of retrieval (read) requests seen
ok=N Number of successful retr reqs
wt=N Number of retr reqs that waited on lookup completion
@@ -261,6 +264,7 @@ proc files.
oom=N Number of retr reqs failed -ENOMEM
ops=N Number of retr reqs submitted
owt=N Number of retr reqs waited for CPU time
+ abt=N Number of retr reqs aborted due to object death
Stores n=N Number of storage (write) requests seen
ok=N Number of successful store reqs
agn=N Number of store reqs on a page already pending storage
@@ -268,12 +272,37 @@ proc files.
oom=N Number of store reqs failed -ENOMEM
ops=N Number of store reqs submitted
run=N Number of store reqs granted CPU time
+ pgs=N Number of pages given store req processing time
+ rxd=N Number of store reqs deleted from tracking tree
+ olm=N Number of store reqs over store limit
+ VmScan nos=N Number of release reqs against pages with no pending store
+ gon=N Number of release reqs against pages stored by time lock granted
+ bsy=N Number of release reqs ignored due to in-progress store
+ can=N Number of page stores cancelled due to release req
Ops pend=N Number of times async ops added to pending queues
run=N Number of times async ops given CPU time
enq=N Number of times async ops queued for processing
+ can=N Number of async ops cancelled
+ rej=N Number of async ops rejected due to object lookup/create failure
dfr=N Number of async ops queued for deferred release
rel=N Number of async ops released
gc=N Number of deferred-release async ops garbage collected
+ CacheOp alo=N Number of in-progress alloc_object() cache ops
+ luo=N Number of in-progress lookup_object() cache ops
+ luc=N Number of in-progress lookup_complete() cache ops
+ gro=N Number of in-progress grab_object() cache ops
+ upo=N Number of in-progress update_object() cache ops
+ dro=N Number of in-progress drop_object() cache ops
+ pto=N Number of in-progress put_object() cache ops
+ syn=N Number of in-progress sync_cache() cache ops
+ atc=N Number of in-progress attr_changed() cache ops
+ rap=N Number of in-progress read_or_alloc_page() cache ops
+ ras=N Number of in-progress read_or_alloc_pages() cache ops
+ alp=N Number of in-progress allocate_page() cache ops
+ als=N Number of in-progress allocate_pages() cache ops
+ wrp=N Number of in-progress write_page() cache ops
+ ucp=N Number of in-progress uncache_page() cache ops
+ dsp=N Number of in-progress dissociate_pages() cache ops
(*) /proc/fs/fscache/histogram
@@ -299,6 +328,87 @@ proc files.
jiffy range covered, and the SECS field the equivalent number of seconds.
+===========
+OBJECT LIST
+===========
+
+If CONFIG_FSCACHE_OBJECT_LIST is enabled, the FS-Cache facility will maintain a
+list of all the objects currently allocated and allow them to be viewed
+through:
+
+ /proc/fs/fscache/objects
+
+This will look something like:
+
+ [root@andromeda ~]# head /proc/fs/fscache/objects
+ OBJECT PARENT STAT CHLDN OPS OOP IPR EX READS EM EV F S | NETFS_COOKIE_DEF TY FL NETFS_DATA OBJECT_KEY, AUX_DATA
+ ======== ======== ==== ===== === === === == ===== == == = = | ================ == == ================ ================
+ 17e4b 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88001dd82820 010006017edcf8bbc93b43298fdfbe71e50b57b13a172c0117f38472, e567634700000000000000000000000063f2404a000000000000000000000000c9030000000000000000000063f2404a
+ 1693a 2 ACTV 0 0 0 0 0 0 7b 4 0 8 | NFS.fh DT 0 ffff88002db23380 010006017edcf8bbc93b43298fdfbe71e50b57b1e0162c01a2df0ea6, 420ebc4a000000000000000000000000420ebc4a0000000000000000000000000e1801000000000000000000420ebc4a
+
+where the first set of columns before the '|' describe the object:
+
+ COLUMN DESCRIPTION
+ ======= ===============================================================
+ OBJECT Object debugging ID (appears as OBJ%x in some debug messages)
+ PARENT Debugging ID of parent object
+ STAT Object state
+ CHLDN Number of child objects of this object
+ OPS Number of outstanding operations on this object
+ OOP Number of outstanding child object management operations
+ IPR
+ EX Number of outstanding exclusive operations
+ READS Number of outstanding read operations
+ EM Object's event mask
+ EV Events raised on this object
+ F Object flags
+ S Object slow-work work item flags
+
+and the second set of columns describe the object's cookie, if present:
+
+ COLUMN DESCRIPTION
+ =============== =======================================================
+ NETFS_COOKIE_DEF Name of netfs cookie definition
+ TY Cookie type (IX - index, DT - data, hex - special)
+ FL Cookie flags
+ NETFS_DATA Netfs private data stored in the cookie
+ OBJECT_KEY Object key } 1 column, with separating comma
+ AUX_DATA Object aux data } presence may be configured
+
+The data shown may be filtered by attaching the a key to an appropriate keyring
+before viewing the file. Something like:
+
+ keyctl add user fscache:objlist <restrictions> @s
+
+where <restrictions> are a selection of the following letters:
+
+ K Show hexdump of object key (don't show if not given)
+ A Show hexdump of object aux data (don't show if not given)
+
+and the following paired letters:
+
+ C Show objects that have a cookie
+ c Show objects that don't have a cookie
+ B Show objects that are busy
+ b Show objects that aren't busy
+ W Show objects that have pending writes
+ w Show objects that don't have pending writes
+ R Show objects that have outstanding reads
+ r Show objects that don't have outstanding reads
+ S Show objects that have slow work queued
+ s Show objects that don't have slow work queued
+
+If neither side of a letter pair is given, then both are implied. For example:
+
+ keyctl add user fscache:objlist KB @s
+
+shows objects that are busy, and lists their object keys, but does not dump
+their auxiliary data. It also implies "CcWwRrSs", but as 'B' is given, 'b' is
+not implied.
+
+By default all objects and all fields will be shown.
+
+
=========
DEBUGGING
=========
diff --git a/Documentation/filesystems/caching/netfs-api.txt b/Documentation/filesystems/caching/netfs-api.txt
index 2666b1ed5e9..1902c57b72e 100644
--- a/Documentation/filesystems/caching/netfs-api.txt
+++ b/Documentation/filesystems/caching/netfs-api.txt
@@ -641,7 +641,7 @@ data file must be retired (see the relinquish cookie function below).
Furthermore, note that this does not cancel the asynchronous read or write
operation started by the read/alloc and write functions, so the page
-invalidation and release functions must use:
+invalidation functions must use:
bool fscache_check_page_write(struct fscache_cookie *cookie,
struct page *page);
@@ -654,6 +654,25 @@ to see if a page is being written to the cache, and:
to wait for it to finish if it is.
+When releasepage() is being implemented, a special FS-Cache function exists to
+manage the heuristics of coping with vmscan trying to eject pages, which may
+conflict with the cache trying to write pages to the cache (which may itself
+need to allocate memory):
+
+ bool fscache_maybe_release_page(struct fscache_cookie *cookie,
+ struct page *page,
+ gfp_t gfp);
+
+This takes the netfs cookie, and the page and gfp arguments as supplied to
+releasepage(). It will return false if the page cannot be released yet for
+some reason and if it returns true, the page has been uncached and can now be
+released.
+
+To make a page available for release, this function may wait for an outstanding
+storage request to complete, or it may attempt to cancel the storage request -
+in which case the page will not be stored in the cache this time.
+
+
==========================
INDEX AND DATA FILE UPDATE
==========================
diff --git a/Documentation/filesystems/dentry-locking.txt b/Documentation/filesystems/dentry-locking.txt
index 4c0c575a401..79334ed5daa 100644
--- a/Documentation/filesystems/dentry-locking.txt
+++ b/Documentation/filesystems/dentry-locking.txt
@@ -62,7 +62,8 @@ changes are :
2. Insertion of a dentry into the hash table is done using
hlist_add_head_rcu() which take care of ordering the writes - the
writes to the dentry must be visible before the dentry is
- inserted. This works in conjunction with hlist_for_each_rcu() while
+ inserted. This works in conjunction with hlist_for_each_rcu(),
+ which has since been replaced by hlist_for_each_entry_rcu(), while
walking the hash chain. The only requirement is that all
initialization to the dentry must be done before
hlist_add_head_rcu() since we don't have dcache_lock protection
diff --git a/Documentation/filesystems/exofs.txt b/Documentation/filesystems/exofs.txt
index 0ced74c2f73..abd2a9b5b78 100644
--- a/Documentation/filesystems/exofs.txt
+++ b/Documentation/filesystems/exofs.txt
@@ -60,13 +60,13 @@ USAGE
mkfs.exofs --pid=65536 --format /dev/osd0
- The --format is optional if not specified no OSD_FORMAT will be
- preformed and a clean file system will be created in the specified pid,
+ The --format is optional. If not specified, no OSD_FORMAT will be
+ performed and a clean file system will be created in the specified pid,
in the available space of the target. (Use --format=size_in_meg to limit
the total LUN space available)
- If pid already exist it will be deleted and a new one will be created in it's
- place. Be careful.
+ If pid already exists, it will be deleted and a new one will be created in
+ its place. Be careful.
An exofs lives inside a single OSD partition. You can create multiple exofs
filesystems on the same device using multiple pids.
@@ -81,7 +81,7 @@ USAGE
7. For reference (See do-exofs example script):
do-exofs start - an example of how to perform the above steps.
- do-exofs stop - an example of how to unmount the file system.
+ do-exofs stop - an example of how to unmount the file system.
do-exofs format - an example of how to format and mkfs a new exofs.
8. Extra compilation flags (uncomment in fs/exofs/Kbuild):
@@ -104,8 +104,8 @@ Where:
exofs specific options: Options are separated by commas (,)
pid=<integer> - The partition number to mount/create as
container of the filesystem.
- This option is mandatory
- to=<integer> - Timeout in ticks for a single command
+ This option is mandatory.
+ to=<integer> - Timeout in ticks for a single command.
default is (60 * HZ) [for debugging only]
===============================================================================
@@ -116,7 +116,7 @@ DESIGN
with a special ID (defined in common.h).
Information included in the file system control block is used to fill the
in-memory superblock structure at mount time. This object is created before
- the file system is used by mkexofs.c It contains information such as:
+ the file system is used by mkexofs.c. It contains information such as:
- The file system's magic number
- The next inode number to be allocated
@@ -134,8 +134,8 @@ DESIGN
attributes. This applies to both regular files and other types (directories,
device files, symlinks, etc.).
-* Credentials are generated per object (inode and superblock) when they is
- created in memory (read off disk or created). The credential works for all
+* Credentials are generated per object (inode and superblock) when they are
+ created in memory (read from disk or created). The credential works for all
operations and is used as long as the object remains in memory.
* Async OSD operations are used whenever possible, but the target may execute
@@ -145,7 +145,8 @@ DESIGN
from executing in reverse order:
- The following are handled with the OBJ_CREATED and OBJ_2BCREATED
flags. OBJ_CREATED is set when we know the object exists on the OSD -
- in create's callback function, and when we successfully do a read_inode.
+ in create's callback function, and when we successfully do a
+ read_inode.
OBJ_2BCREATED is set in the beginning of the create function, so we
know that we should wait.
- create/delete: delete should wait until the object is created
diff --git a/Documentation/filesystems/ext3.txt b/Documentation/filesystems/ext3.txt
index 05d5cf1d743..867c5b50cb4 100644
--- a/Documentation/filesystems/ext3.txt
+++ b/Documentation/filesystems/ext3.txt
@@ -32,8 +32,8 @@ journal_dev=devnum When the external journal device's major/minor numbers
identified through its new major/minor numbers encoded
in devnum.
-noload Don't load the journal on mounting. Note that this forces
- mount of inconsistent filesystem, which can lead to
+norecovery Don't load the journal on mounting. Note that this forces
+noload mount of inconsistent filesystem, which can lead to
various problems.
data=journal All data are committed into the journal prior to being
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 6d94e0696f8..e1def1786e5 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -153,8 +153,8 @@ journal_dev=devnum When the external journal device's major/minor numbers
identified through its new major/minor numbers encoded
in devnum.
-noload Don't load the journal on mounting. Note that
- if the filesystem was not unmounted cleanly,
+norecovery Don't load the journal on mounting. Note that
+noload if the filesystem was not unmounted cleanly,
skipping the journal replay will lead to the
filesystem containing inconsistencies that can
lead to any number of problems.
@@ -196,7 +196,7 @@ nobarrier This also requires an IO stack which can support
also be used to enable or disable barriers, for
consistency with other ext4 mount options.
-inode_readahead=n This tuning parameter controls the maximum
+inode_readahead_blks=n This tuning parameter controls the maximum
number of inode table blocks that ext4's inode
table readahead algorithm will pre-read into
the buffer cache. The default value is 32 blocks.
@@ -353,6 +353,12 @@ noauto_da_alloc replacing existing files via patterns such as
system crashes before the delayed allocation
blocks are forced to disk.
+discard Controls whether ext4 should issue discard/TRIM
+nodiscard(*) commands to the underlying block device when
+ blocks are freed. This is useful for SSD devices
+ and sparse/thinly-provisioned LUNs, but it is off
+ by default until sufficient testing has been done.
+
Data Mode
=========
There are 3 different data modes:
diff --git a/Documentation/filesystems/nfs/00-INDEX b/Documentation/filesystems/nfs/00-INDEX
new file mode 100644
index 00000000000..2f68cd68876
--- /dev/null
+++ b/Documentation/filesystems/nfs/00-INDEX
@@ -0,0 +1,16 @@
+00-INDEX
+ - this file (nfs-related documentation).
+Exporting
+ - explanation of how to make filesystems exportable.
+knfsd-stats.txt
+ - statistics which the NFS server makes available to user space.
+nfs.txt
+ - nfs client, and DNS resolution for fs_locations.
+nfs41-server.txt
+ - info on the Linux server implementation of NFSv4 minor version 1.
+nfs-rdma.txt
+ - how to install and setup the Linux NFS/RDMA client and server software
+nfsroot.txt
+ - short guide on setting up a diskless box with NFS root filesystem.
+rpc-cache.txt
+ - introduction to the caching mechanisms in the sunrpc layer.
diff --git a/Documentation/filesystems/Exporting b/Documentation/filesystems/nfs/Exporting
index 87019d2b598..87019d2b598 100644
--- a/Documentation/filesystems/Exporting
+++ b/Documentation/filesystems/nfs/Exporting
diff --git a/Documentation/filesystems/knfsd-stats.txt b/Documentation/filesystems/nfs/knfsd-stats.txt
index 64ced5149d3..64ced5149d3 100644
--- a/Documentation/filesystems/knfsd-stats.txt
+++ b/Documentation/filesystems/nfs/knfsd-stats.txt
diff --git a/Documentation/filesystems/nfs-rdma.txt b/Documentation/filesystems/nfs/nfs-rdma.txt
index e386f7e4bce..e386f7e4bce 100644
--- a/Documentation/filesystems/nfs-rdma.txt
+++ b/Documentation/filesystems/nfs/nfs-rdma.txt
diff --git a/Documentation/filesystems/nfs.txt b/Documentation/filesystems/nfs/nfs.txt
index f50f26ce6cd..f50f26ce6cd 100644
--- a/Documentation/filesystems/nfs.txt
+++ b/Documentation/filesystems/nfs/nfs.txt
diff --git a/Documentation/filesystems/nfs41-server.txt b/Documentation/filesystems/nfs/nfs41-server.txt
index 5920fe26e6f..6a53a84afc7 100644
--- a/Documentation/filesystems/nfs41-server.txt
+++ b/Documentation/filesystems/nfs/nfs41-server.txt
@@ -17,8 +17,7 @@ kernels must turn 4.1 on or off *before* turning support for version 4
on or off; rpc.nfsd does this correctly.)
The NFSv4 minorversion 1 (NFSv4.1) implementation in nfsd is based
-on the latest NFSv4.1 Internet Draft:
-http://tools.ietf.org/html/draft-ietf-nfsv4-minorversion1-29
+on RFC 5661.
From the many new features in NFSv4.1 the current implementation
focuses on the mandatory-to-implement NFSv4.1 Sessions, providing
@@ -41,10 +40,10 @@ interoperability problems with future clients. Known issues:
conformant with the spec (for example, we don't use kerberos
on the backchannel correctly).
- no trunking support: no clients currently take advantage of
- trunking, but this is a mandatory failure, and its use is
+ trunking, but this is a mandatory feature, and its use is
recommended to clients in a number of places. (E.g. to ensure
timely renewal in case an existing connection's retry timeouts
- have gotten too long; see section 8.3 of the draft.)
+ have gotten too long; see section 8.3 of the RFC.)
Therefore, lack of this feature may cause future clients to
fail.
- Incomplete backchannel support: incomplete backchannel gss
@@ -213,3 +212,10 @@ The following cases aren't supported yet:
DESTROY_CLIENTID, DESTROY_SESSION, EXCHANGE_ID.
* DESTROY_SESSION MUST be the final operation in the COMPOUND request.
+Nonstandard compound limitations:
+* No support for a sessions fore channel RPC compound that requires both a
+ ca_maxrequestsize request and a ca_maxresponsesize reply, so we may
+ fail to live up to the promise we made in CREATE_SESSION fore channel
+ negotiation.
+* No more than one IO operation (read, write, readdir) allowed per
+ compound.
diff --git a/Documentation/filesystems/nfsroot.txt b/Documentation/filesystems/nfs/nfsroot.txt
index 3ba0b945aaf..3ba0b945aaf 100644
--- a/Documentation/filesystems/nfsroot.txt
+++ b/Documentation/filesystems/nfs/nfsroot.txt
diff --git a/Documentation/filesystems/rpc-cache.txt b/Documentation/filesystems/nfs/rpc-cache.txt
index 8a382bea680..8a382bea680 100644
--- a/Documentation/filesystems/rpc-cache.txt
+++ b/Documentation/filesystems/nfs/rpc-cache.txt
diff --git a/Documentation/filesystems/nilfs2.txt b/Documentation/filesystems/nilfs2.txt
index 01539f41067..cf6d0d85ca8 100644
--- a/Documentation/filesystems/nilfs2.txt
+++ b/Documentation/filesystems/nilfs2.txt
@@ -28,7 +28,7 @@ described in the man pages included in the package.
Project web page: http://www.nilfs.org/en/
Download page: http://www.nilfs.org/en/download.html
Git tree web page: http://www.nilfs.org/git/
-NILFS mailing lists: http://www.nilfs.org/mailman/listinfo/users
+List info: http://vger.kernel.org/vger-lists.html#linux-nilfs
Caveats
=======
@@ -49,8 +49,7 @@ Mount options
NILFS2 supports the following mount options:
(*) == default
-barrier=on(*) This enables/disables barriers. barrier=off disables
- it, barrier=on enables it.
+nobarrier Disables barriers.
errors=continue(*) Keep going on a filesystem error.
errors=remount-ro Remount the filesystem read-only on an error.
errors=panic Panic and halt the machine if an error occurs.
@@ -71,6 +70,13 @@ order=strict Apply strict in-order semantics that preserves sequence
blocks. That means, it is guaranteed that no
overtaking of events occurs in the recovered file
system after a crash.
+norecovery Disable recovery of the filesystem on mount.
+ This disables every write access on the device for
+ read-only mounts or snapshots. This option will fail
+ for r/w mounts on an unclean volume.
+discard Issue discard/TRIM commands to the underlying block
+ device when blocks are freed. This is useful for SSD
+ devices and sparse/thinly-provisioned LUNs.
NILFS2 usage
============
diff --git a/Documentation/filesystems/ocfs2.txt b/Documentation/filesystems/ocfs2.txt
index c2a0871280a..c58b9f5ba00 100644
--- a/Documentation/filesystems/ocfs2.txt
+++ b/Documentation/filesystems/ocfs2.txt
@@ -20,15 +20,16 @@ Lots of code taken from ext3 and other projects.
Authors in alphabetical order:
Joel Becker <joel.becker@oracle.com>
Zach Brown <zach.brown@oracle.com>
-Mark Fasheh <mark.fasheh@oracle.com>
+Mark Fasheh <mfasheh@suse.com>
Kurt Hackel <kurt.hackel@oracle.com>
+Tao Ma <tao.ma@oracle.com>
Sunil Mushran <sunil.mushran@oracle.com>
Manish Singh <manish.singh@oracle.com>
+Tiger Yang <tiger.yang@oracle.com>
Caveats
=======
Features which OCFS2 does not support yet:
- - quotas
- Directory change notification (F_NOTIFY)
- Distributed Caching (F_SETLEASE/F_GETLEASE/break_lease)
@@ -70,7 +71,6 @@ commit=nrsec (*) Ocfs2 can be told to sync all its data and metadata
performance.
localalloc=8(*) Allows custom localalloc size in MB. If the value is too
large, the fs will silently revert it to the default.
- Localalloc is not enabled for local mounts.
localflocks This disables cluster aware flock.
inode64 Indicates that Ocfs2 is allowed to create inodes at
any location in the filesystem, including those which
diff --git a/Documentation/filesystems/porting b/Documentation/filesystems/porting
index 92b888d540a..a7e9746ee7e 100644
--- a/Documentation/filesystems/porting
+++ b/Documentation/filesystems/porting
@@ -140,7 +140,7 @@ Callers of notify_change() need ->i_mutex now.
New super_block field "struct export_operations *s_export_op" for
explicit support for exporting, e.g. via NFS. The structure is fully
documented at its declaration in include/linux/fs.h, and in
-Documentation/filesystems/Exporting.
+Documentation/filesystems/nfs/Exporting.
Briefly it allows for the definition of decode_fh and encode_fh operations
to encode and decode filehandles, and allows the filesystem to use
diff --git a/Documentation/filesystems/proc.txt b/Documentation/filesystems/proc.txt
index 2c48f945546..96a44dd95e0 100644
--- a/Documentation/filesystems/proc.txt
+++ b/Documentation/filesystems/proc.txt
@@ -38,6 +38,7 @@ Table of Contents
3.3 /proc/<pid>/io - Display the IO accounting fields
3.4 /proc/<pid>/coredump_filter - Core dump filtering settings
3.5 /proc/<pid>/mountinfo - Information about mounts
+ 3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm
------------------------------------------------------------------------------
@@ -163,6 +164,7 @@ read the file /proc/PID/status:
VmExe: 68 kB
VmLib: 1412 kB
VmPTE: 20 kb
+ VmSwap: 0 kB
Threads: 1
SigQ: 0/28578
SigPnd: 0000000000000000
@@ -176,7 +178,6 @@ read the file /proc/PID/status:
CapBnd: ffffffffffffffff
voluntary_ctxt_switches: 0
nonvoluntary_ctxt_switches: 1
- Stack usage: 12 kB
This shows you nearly the same information you would get if you viewed it with
the ps command. In fact, ps uses the proc file system to obtain its
@@ -188,6 +189,12 @@ memory usage. Its seven fields are explained in Table 1-3. The stat file
contains details information about the process itself. Its fields are
explained in Table 1-4.
+(for SMP CONFIG users)
+For making accounting scalable, RSS related information are handled in
+asynchronous manner and the vaule may not be very precise. To see a precise
+snapshot of a moment, you can see /proc/<pid>/smaps file and scan page table.
+It's slow but very precise.
+
Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
..............................................................................
Field Content
@@ -213,6 +220,7 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
VmExe size of text segment
VmLib size of shared library code
VmPTE size of page table entries
+ VmSwap size of swap usage (the number of referred swapents)
Threads number of threads
SigQ number of signals queued/max. number for queue
SigPnd bitmap of pending signals for the thread
@@ -230,7 +238,6 @@ Table 1-2: Contents of the statm files (as of 2.6.30-rc7)
Mems_allowed_list Same as previous, but in "list format"
voluntary_ctxt_switches number of voluntary context switches
nonvoluntary_ctxt_switches number of non voluntary context switches
- Stack usage: stack usage high water mark (round up to page size)
..............................................................................
Table 1-3: Contents of the statm files (as of 2.6.8-rc3)
@@ -431,6 +438,7 @@ Table 1-5: Kernel info in /proc
modules List of loaded modules
mounts Mounted filesystems
net Networking info (see text)
+ pagetypeinfo Additional page allocator information (see text) (2.5)
partitions Table of partitions known to the system
pci Deprecated info of PCI bus (new way -> /proc/bus/pci/,
decoupled by lspci (2.4)
@@ -585,7 +593,7 @@ Node 0, zone DMA 0 4 5 4 4 3 ...
Node 0, zone Normal 1 0 0 1 101 8 ...
Node 0, zone HighMem 2 0 0 1 1 0 ...
-Memory fragmentation is a problem under some workloads, and buddyinfo is a
+External fragmentation is a problem under some workloads, and buddyinfo is a
useful tool for helping diagnose these problems. Buddyinfo will give you a
clue as to how big an area you can safely allocate, or why a previous
allocation failed.
@@ -595,6 +603,48 @@ available. In this case, there are 0 chunks of 2^0*PAGE_SIZE available in
ZONE_DMA, 4 chunks of 2^1*PAGE_SIZE in ZONE_DMA, 101 chunks of 2^4*PAGE_SIZE
available in ZONE_NORMAL, etc...
+More information relevant to external fragmentation can be found in
+pagetypeinfo.
+
+> cat /proc/pagetypeinfo
+Page block order: 9
+Pages per block: 512
+
+Free pages count per migrate type at order 0 1 2 3 4 5 6 7 8 9 10
+Node 0, zone DMA, type Unmovable 0 0 0 1 1 1 1 1 1 1 0
+Node 0, zone DMA, type Reclaimable 0 0 0 0 0 0 0 0 0 0 0
+Node 0, zone DMA, type Movable 1 1 2 1 2 1 1 0 1 0 2
+Node 0, zone DMA, type Reserve 0 0 0 0 0 0 0 0 0 1 0
+Node 0, zone DMA, type Isolate 0 0 0 0 0 0 0 0 0 0 0
+Node 0, zone DMA32, type Unmovable 103 54 77 1 1 1 11 8 7 1 9
+Node 0, zone DMA32, type Reclaimable 0 0 2 1 0 0 0 0 1 0 0
+Node 0, zone DMA32, type Movable 169 152 113 91 77 54 39 13 6 1 452
+Node 0, zone DMA32, type Reserve 1 2 2 2 2 0 1 1 1 1 0
+Node 0, zone DMA32, type Isolate 0 0 0 0 0 0 0 0 0 0 0
+
+Number of blocks type Unmovable Reclaimable Movable Reserve Isolate
+Node 0, zone DMA 2 0 5 1 0
+Node 0, zone DMA32 41 6 967 2 0
+
+Fragmentation avoidance in the kernel works by grouping pages of different
+migrate types into the same contiguous regions of memory called page blocks.
+A page block is typically the size of the default hugepage size e.g. 2MB on
+X86-64. By keeping pages grouped based on their ability to move, the kernel
+can reclaim pages within a page block to satisfy a high-order allocation.
+
+The pagetypinfo begins with information on the size of a page block. It
+then gives the same type of information as buddyinfo except broken down
+by migrate-type and finishes with details on how many page blocks of each
+type exist.
+
+If min_free_kbytes has been tuned correctly (recommendations made by hugeadm
+from libhugetlbfs http://sourceforge.net/projects/libhugetlbfs/), one can
+make an estimate of the likely number of huge pages that can be allocated
+at a given point in time. All the "Movable" blocks should be allocatable
+unless memory has been mlock()'d. Some of the Reclaimable blocks should
+also be allocatable although a lot of filesystem metadata may have to be
+reclaimed to achieve this.
+
..............................................................................
meminfo:
@@ -1072,7 +1122,8 @@ second). The meanings of the columns are as follows, from left to right:
- irq: servicing interrupts
- softirq: servicing softirqs
- steal: involuntary wait
-- guest: running a guest
+- guest: running a normal guest
+- guest_nice: running a niced guest
The "intr" line gives counts of interrupts serviced since boot time, for each
of the possible system interrupts. The first column is the total of all
@@ -1088,8 +1139,8 @@ The "processes" line gives the number of processes and threads created, which
includes (but is not limited to) those created by calls to the fork() and
clone() system calls.
-The "procs_running" line gives the number of processes currently running on
-CPUs.
+The "procs_running" line gives the total number of threads that are
+running or ready to run (i.e., the total number of runnable threads).
The "procs_blocked" line gives the number of processes currently blocked,
waiting for I/O to complete.
@@ -1408,3 +1459,11 @@ For more information on mount propagation see:
Documentation/filesystems/sharedsubtree.txt
+
+3.6 /proc/<pid>/comm & /proc/<pid>/task/<tid>/comm
+--------------------------------------------------------
+These files provide a method to access a tasks comm value. It also allows for
+a task to set its own or one of its thread siblings comm value. The comm value
+is limited in size compared to the cmdline value, so writing anything longer
+then the kernel's TASK_COMM_LEN (currently 16 chars) will result in a truncated
+comm value.
diff --git a/Documentation/filesystems/seq_file.txt b/Documentation/filesystems/seq_file.txt
index 0d15ebccf5b..a1e2e0dda90 100644
--- a/Documentation/filesystems/seq_file.txt
+++ b/Documentation/filesystems/seq_file.txt
@@ -248,9 +248,7 @@ code, that is done in the initialization code in the usual way:
{
struct proc_dir_entry *entry;
- entry = create_proc_entry("sequence", 0, NULL);
- if (entry)
- entry->proc_fops = &ct_file_ops;
+ proc_create("sequence", 0, NULL, &ct_file_ops);
return 0;
}
diff --git a/Documentation/filesystems/sharedsubtree.txt b/Documentation/filesystems/sharedsubtree.txt
index 23a181074f9..fc0e39af43c 100644
--- a/Documentation/filesystems/sharedsubtree.txt
+++ b/Documentation/filesystems/sharedsubtree.txt
@@ -837,6 +837,9 @@ replicas continue to be exactly same.
individual lists does not affect propagation or the way propagation
tree is modified by operations.
+ All vfsmounts in a peer group have the same ->mnt_master. If it is
+ non-NULL, they form a contiguous (ordered) segment of slave list.
+
A example propagation tree looks as shown in the figure below.
[ NOTE: Though it looks like a forest, if we consider all the shared
mounts as a conceptual entity called 'pnode', it becomes a tree]
@@ -874,8 +877,19 @@ replicas continue to be exactly same.
NOTE: The propagation tree is orthogonal to the mount tree.
+8B Locking:
+
+ ->mnt_share, ->mnt_slave, ->mnt_slave_list, ->mnt_master are protected
+ by namespace_sem (exclusive for modifications, shared for reading).
+
+ Normally we have ->mnt_flags modifications serialized by vfsmount_lock.
+ There are two exceptions: do_add_mount() and clone_mnt().
+ The former modifies a vfsmount that has not been visible in any shared
+ data structures yet.
+ The latter holds namespace_sem and the only references to vfsmount
+ are in lists that can't be traversed without namespace_sem.
-8B Algorithm:
+8C Algorithm:
The crux of the implementation resides in rbind/move operation.
diff --git a/Documentation/filesystems/sysfs.txt b/Documentation/filesystems/sysfs.txt
index b245d524d56..931c806642c 100644
--- a/Documentation/filesystems/sysfs.txt
+++ b/Documentation/filesystems/sysfs.txt
@@ -91,8 +91,8 @@ struct device_attribute {
const char *buf, size_t count);
};
-int device_create_file(struct device *, struct device_attribute *);
-void device_remove_file(struct device *, struct device_attribute *);
+int device_create_file(struct device *, const struct device_attribute *);
+void device_remove_file(struct device *, const struct device_attribute *);
It also defines this helper for defining device attributes:
@@ -316,8 +316,8 @@ DEVICE_ATTR(_name, _mode, _show, _store);
Creation/Removal:
-int device_create_file(struct device *device, struct device_attribute * attr);
-void device_remove_file(struct device * dev, struct device_attribute * attr);
+int device_create_file(struct device *dev, const struct device_attribute * attr);
+void device_remove_file(struct device *dev, const struct device_attribute * attr);
- bus drivers (include/linux/device.h)
@@ -358,7 +358,7 @@ DRIVER_ATTR(_name, _mode, _show, _store)
Creation/Removal:
-int driver_create_file(struct device_driver *, struct driver_attribute *);
-void driver_remove_file(struct device_driver *, struct driver_attribute *);
+int driver_create_file(struct device_driver *, const struct driver_attribute *);
+void driver_remove_file(struct device_driver *, const struct driver_attribute *);
diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt
index 623f094c9d8..3de2f32edd9 100644
--- a/Documentation/filesystems/vfs.txt
+++ b/Documentation/filesystems/vfs.txt
@@ -472,7 +472,7 @@ __sync_single_inode) to check if ->writepages has been successful in
writing out the whole address_space.
The Writeback tag is used by filemap*wait* and sync_page* functions,
-via wait_on_page_writeback_range, to wait for all writeback to
+via filemap_fdatawait_range, to wait for all writeback to
complete. While waiting ->sync_page (if defined) will be called on
each page that is found to require writeback.