From b842e240f27678aa5d71611cddc8d17a93fb0caf Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Thu, 10 May 2007 19:02:07 -0400
Subject: locks: reverse order of posix_locks_conflict() arguments

The first argument to posix_locks_conflict() is meant to be a lock request,
and the second a lock from an inode's lock request.  It doesn't really
make a difference which order you call them in, since the only
asymmetric test in posix_lock_conflict() is the check whether the second
argument is a posix lock--and every caller already does that check for
some reason.

But may as well fix posix_test_lock() to call posix_locks_conflict()
with the arguments in the same order as everywhere else.

Signed-off-by: "J. Bruce Fields" <bfields@citi.umich.edu>
---
 fs/locks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/locks.c b/fs/locks.c
index c795eaaf6c4..51bae6227c2 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -668,7 +668,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl)
 	for (cfl = filp->f_path.dentry->d_inode->i_flock; cfl; cfl = cfl->fl_next) {
 		if (!IS_POSIX(cfl))
 			continue;
-		if (posix_locks_conflict(cfl, fl))
+		if (posix_locks_conflict(fl, cfl))
 			break;
 	}
 	if (cfl)
-- 
cgit v1.2.3


From 526985b9dd6ef7716b87f5fe6f0e2438ea3a89c7 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@fieldses.org>
Date: Tue, 14 Nov 2006 16:54:36 -0500
Subject: locks: kill redundant local variable

There's no need for another variable local to this loop; we can use the
variable (of the same name!) already declared at the top of the function,
and not used till later (at which point it's initialized, so this is safe).

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/locks.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/locks.c b/fs/locks.c
index 51bae6227c2..efe1affe6be 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -819,7 +819,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str
 	lock_kernel();
 	if (request->fl_type != F_UNLCK) {
 		for_each_lock(inode, before) {
-			struct file_lock *fl = *before;
+			fl = *before;
 			if (!IS_POSIX(fl))
 				continue;
 			if (!posix_locks_conflict(request, fl))
-- 
cgit v1.2.3


From 84d535ade62b6f8ce852745731ad6200c46b977c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Tue, 11 Sep 2007 16:38:13 +0400
Subject: Memory shortage can result in inconsistent flocks state

When the flock_lock_file() is called to change the flock
from F_RDLCK to F_WRLCK or vice versa the existing flock
can be removed without appropriate warning.

Look:
        for_each_lock(inode, before) {
                struct file_lock *fl = *before;
                if (IS_POSIX(fl))
                        break;
                if (IS_LEASE(fl))
                        continue;
                if (filp != fl->fl_file)
                        continue;
                if (request->fl_type == fl->fl_type)
                        goto out;
                found = 1;
                locks_delete_lock(before); <<<<<< !
                break;
        }

if after this point the subsequent locks_alloc_lock() will
fail the return code will be -ENOMEM, but the existing lock
is already removed.

This is a known feature that such "re-locking" is not atomic,
but in the racy case the file should stay locked (although by
some other process), but in this case the file will be unlocked.

The proposal is to prepare the lock in advance keeping no chance
to fail in the future code.

Found during making the flocks pid-namespaces aware.

(Note: Thanks to Reuben Farrelly for finding a bug in an earlier version
of this patch.)

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Reuben Farrelly <reuben-linuxkernel@reub.net>
---
 fs/locks.c | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index efe1affe6be..6e22c8129a8 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -733,6 +733,15 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 	lock_kernel();
 	if (request->fl_flags & FL_ACCESS)
 		goto find_conflict;
+
+	if (request->fl_type != F_UNLCK) {
+		error = -ENOMEM;
+		new_fl = locks_alloc_lock();
+		if (new_fl == NULL)
+			goto out;
+		error = 0;
+	}
+
 	for_each_lock(inode, before) {
 		struct file_lock *fl = *before;
 		if (IS_POSIX(fl))
@@ -754,10 +763,6 @@ static int flock_lock_file(struct file *filp, struct file_lock *request)
 		goto out;
 	}
 
-	error = -ENOMEM;
-	new_fl = locks_alloc_lock();
-	if (new_fl == NULL)
-		goto out;
 	/*
 	 * If a higher-priority process was blocked on the old file lock,
 	 * give it the opportunity to lock the file.
-- 
cgit v1.2.3


From 02888f41e9d7fa95d1f5b2f76e0f0af6ea8198cc Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Wed, 12 Sep 2007 15:45:07 -0400
Subject: locks: fix flock_lock_file() comment

This comment wasn't updated when lease support was added, and it makes
essentially the same mistake that the code made before a recent bugfix.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/locks.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 6e22c8129a8..c7c69d29a57 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -715,8 +715,7 @@ next_task:
 }
 
 /* Try to create a FLOCK lock on filp. We always insert new FLOCK locks
- * at the head of the list, but that's secret knowledge known only to
- * flock_lock_file and posix_lock_file.
+ * after any leases, but before any posix locks.
  *
  * Note that if called with an FL_EXISTS argument, the caller may determine
  * whether or not a lock was successfully freed by testing the return
-- 
cgit v1.2.3


From f0c1cd0eaf0b127356c2c09e40305453bc361b0f Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Wed, 19 Sep 2007 16:44:07 +0400
Subject: Use list_first_entry in locks_wake_up_blocks

This routine deletes all the elements from the list
with the "while (!list_empty())" loop, and we already
have a list_first_entry() macro to help it look nicer :)

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
---
 fs/locks.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/fs/locks.c b/fs/locks.c
index c7c69d29a57..282b6c11670 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -534,7 +534,9 @@ static void locks_insert_block(struct file_lock *blocker,
 static void locks_wake_up_blocks(struct file_lock *blocker)
 {
 	while (!list_empty(&blocker->fl_block)) {
-		struct file_lock *waiter = list_entry(blocker->fl_block.next,
+		struct file_lock *waiter;
+
+		waiter = list_first_entry(&blocker->fl_block,
 				struct file_lock, fl_block);
 		__locks_delete_block(waiter);
 		if (waiter->fl_lmops && waiter->fl_lmops->fl_notify)
-- 
cgit v1.2.3


From 85c59580b30c82aa771aa33b37217a6b6851bc14 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Thu, 20 Sep 2007 12:45:02 +0400
Subject: locks: Fix potential OOPS in generic_setlease()

This code is run under lock_kernel(), which is dropped during
sleeping operations, so the following race is possible:

CPU1:                                CPU2:
  vfs_setlease();                    vfs_setlease();
  lock_kernel();
                                     lock_kernel(); /* spin */
  generic_setlease():
    ...
    for (before = ...)
    /* here we found some lease after
     * which we will insert the new one
     */
    fl = locks_alloc_lock();
    /* go to sleep in this allocation and
     * drop the BKL
     */
                                     generic_setlease():
                                       ...
                                       for (before = ...)
                                       /* here we find the "before" pointing
                                        * at the one we found on CPU1
                                        */
                                      ->fl_change(my_before, arg);
                                              lease_modify();
                                                     locks_free_lock();
                                                     /* and we freed it */
                                     ...
                                     unlock_kernel();
   locks_insert_lock(before, fl);
   /* OOPS! We have just tried to add the lease
    * at the tail of already removed one
    */

The similar races are already handled in other code - all the
allocations are performed before any checks/updates.

Thanks to Kamalesh Babulal for testing and for a bug report on an
earlier version.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Cc: Kamalesh Babulal <kamalesh@linux.vnet.ibm.com>
---
 fs/locks.c | 21 ++++++++++++---------
 1 file changed, 12 insertions(+), 9 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 282b6c11670..43dbc7f566f 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1343,6 +1343,7 @@ int fcntl_getlease(struct file *filp)
 int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 {
 	struct file_lock *fl, **before, **my_before = NULL, *lease;
+	struct file_lock *new_fl = NULL;
 	struct dentry *dentry = filp->f_path.dentry;
 	struct inode *inode = dentry->d_inode;
 	int error, rdlease_count = 0, wrlease_count = 0;
@@ -1369,6 +1370,11 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 		|| (atomic_read(&inode->i_count) > 1)))
 		goto out;
 
+	error = -ENOMEM;
+	new_fl = locks_alloc_lock();
+	if (new_fl == NULL)
+		goto out;
+
 	/*
 	 * At this point, we know that if there is an exclusive
 	 * lease on this file, then we hold it on this filp
@@ -1411,18 +1417,15 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp)
 	if (!leases_enable)
 		goto out;
 
-	error = -ENOMEM;
-	fl = locks_alloc_lock();
-	if (fl == NULL)
-		goto out;
-
-	locks_copy_lock(fl, lease);
+	locks_copy_lock(new_fl, lease);
+	locks_insert_lock(before, new_fl);
 
-	locks_insert_lock(before, fl);
+	*flp = new_fl;
+	return 0;
 
-	*flp = fl;
-	error = 0;
 out:
+	if (new_fl != NULL)
+		locks_free_lock(new_fl);
 	return error;
 }
 EXPORT_SYMBOL(generic_setlease);
-- 
cgit v1.2.3


From 4f3b19ca41fbe572e3d44caf516c215b286fe2a6 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Mon, 24 Sep 2007 18:52:09 -0400
Subject: Documentation: move mandatory locking documentation to filesystems/

Shouldn't this mandatory-locking documentation be in the
Documentation/filesystems directory?

Give it a more descriptive name while we're at it, and update 00-INDEX
with a more inclusive description of Documentation/filesystems (which
has already talked about more than just individual filesystems).

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
Acked-by: Randy Dunlap <randy.dunlap@oracle.com>
---
 Documentation/00-INDEX                          |   4 +-
 Documentation/filesystems/00-INDEX              |   2 +
 Documentation/filesystems/mandatory-locking.txt | 152 ++++++++++++++++++++++++
 Documentation/locks.txt                         |  10 +-
 Documentation/mandatory.txt                     | 152 ------------------------
 5 files changed, 160 insertions(+), 160 deletions(-)
 create mode 100644 Documentation/filesystems/mandatory-locking.txt
 delete mode 100644 Documentation/mandatory.txt

diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 43e89b1537d..910473cb1c1 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -145,7 +145,7 @@ fb/
 feature-removal-schedule.txt
 	- list of files and features that are going to be removed.
 filesystems/
-	- directory with info on the various filesystems that Linux supports.
+	- info on the vfs and the various filesystems that Linux supports.
 firmware_class/
 	- request_firmware() hotplug interface info.
 floppy.txt
@@ -240,8 +240,6 @@ m68k/
 	- directory with info about Linux on Motorola 68k architecture.
 magic-number.txt
 	- list of magic numbers used to mark/protect kernel data structures.
-mandatory.txt
-	- info on the Linux implementation of Sys V mandatory file locking.
 mca.txt
 	- info on supporting Micro Channel Architecture (e.g. PS/2) systems.
 md.txt
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index 59db1bca702..e801076812a 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -52,6 +52,8 @@ isofs.txt
 	- info and mount options for the ISO 9660 (CDROM) filesystem.
 jfs.txt
 	- info and mount options for the JFS filesystem.
+mandatory-locking.txt
+	- info on the Linux implementation of Sys V mandatory file locking.
 ncpfs.txt
 	- info on Novell Netware(tm) filesystem using NCP protocol.
 ntfs.txt
diff --git a/Documentation/filesystems/mandatory-locking.txt b/Documentation/filesystems/mandatory-locking.txt
new file mode 100644
index 00000000000..bc449d49eee
--- /dev/null
+++ b/Documentation/filesystems/mandatory-locking.txt
@@ -0,0 +1,152 @@
+	Mandatory File Locking For The Linux Operating System
+
+		Andy Walker <andy@lysaker.kvaerner.no>
+
+			   15 April 1996
+
+
+1. What is  mandatory locking?
+------------------------------
+
+Mandatory locking is kernel enforced file locking, as opposed to the more usual
+cooperative file locking used to guarantee sequential access to files among
+processes. File locks are applied using the flock() and fcntl() system calls
+(and the lockf() library routine which is a wrapper around fcntl().) It is
+normally a process' responsibility to check for locks on a file it wishes to
+update, before applying its own lock, updating the file and unlocking it again.
+The most commonly used example of this (and in the case of sendmail, the most
+troublesome) is access to a user's mailbox. The mail user agent and the mail
+transfer agent must guard against updating the mailbox at the same time, and
+prevent reading the mailbox while it is being updated.
+
+In a perfect world all processes would use and honour a cooperative, or
+"advisory" locking scheme. However, the world isn't perfect, and there's
+a lot of poorly written code out there.
+
+In trying to address this problem, the designers of System V UNIX came up
+with a "mandatory" locking scheme, whereby the operating system kernel would
+block attempts by a process to write to a file that another process holds a
+"read" -or- "shared" lock on, and block attempts to both read and write to a 
+file that a process holds a "write " -or- "exclusive" lock on.
+
+The System V mandatory locking scheme was intended to have as little impact as
+possible on existing user code. The scheme is based on marking individual files
+as candidates for mandatory locking, and using the existing fcntl()/lockf()
+interface for applying locks just as if they were normal, advisory locks.
+
+Note 1: In saying "file" in the paragraphs above I am actually not telling
+the whole truth. System V locking is based on fcntl(). The granularity of
+fcntl() is such that it allows the locking of byte ranges in files, in addition
+to entire files, so the mandatory locking rules also have byte level
+granularity.
+
+Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite
+borrowing the fcntl() locking scheme from System V. The mandatory locking
+scheme is defined by the System V Interface Definition (SVID) Version 3.
+
+2. Marking a file for mandatory locking
+---------------------------------------
+
+A file is marked as a candidate for mandatory locking by setting the group-id
+bit in its file mode but removing the group-execute bit. This is an otherwise
+meaningless combination, and was chosen by the System V implementors so as not
+to break existing user programs.
+
+Note that the group-id bit is usually automatically cleared by the kernel when
+a setgid file is written to. This is a security measure. The kernel has been
+modified to recognize the special case of a mandatory lock candidate and to
+refrain from clearing this bit. Similarly the kernel has been modified not
+to run mandatory lock candidates with setgid privileges.
+
+3. Available implementations
+----------------------------
+
+I have considered the implementations of mandatory locking available with
+SunOS 4.1.x, Solaris 2.x and HP-UX 9.x.
+
+Generally I have tried to make the most sense out of the behaviour exhibited
+by these three reference systems. There are many anomalies.
+
+All the reference systems reject all calls to open() for a file on which
+another process has outstanding mandatory locks. This is in direct
+contravention of SVID 3, which states that only calls to open() with the
+O_TRUNC flag set should be rejected. The Linux implementation follows the SVID
+definition, which is the "Right Thing", since only calls with O_TRUNC can
+modify the contents of the file.
+
+HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not
+just mandatory locks. That would appear to contravene POSIX.1.
+
+mmap() is another interesting case. All the operating systems mentioned
+prevent mandatory locks from being applied to an mmap()'ed file, but  HP-UX
+also disallows advisory locks for such a file. SVID actually specifies the
+paranoid HP-UX behaviour.
+
+In my opinion only MAP_SHARED mappings should be immune from locking, and then
+only from mandatory locks - that is what is currently implemented.
+
+SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for
+mandatory locks, so reads and writes to locked files always block when they
+should return EAGAIN.
+
+I'm afraid that this is such an esoteric area that the semantics described
+below are just as valid as any others, so long as the main points seem to
+agree. 
+
+4. Semantics
+------------
+
+1. Mandatory locks can only be applied via the fcntl()/lockf() locking
+   interface - in other words the System V/POSIX interface. BSD style
+   locks using flock() never result in a mandatory lock.
+
+2. If a process has locked a region of a file with a mandatory read lock, then
+   other processes are permitted to read from that region. If any of these
+   processes attempts to write to the region it will block until the lock is
+   released, unless the process has opened the file with the O_NONBLOCK
+   flag in which case the system call will return immediately with the error
+   status EAGAIN.
+
+3. If a process has locked a region of a file with a mandatory write lock, all
+   attempts to read or write to that region block until the lock is released,
+   unless a process has opened the file with the O_NONBLOCK flag in which case
+   the system call will return immediately with the error status EAGAIN.
+
+4. Calls to open() with O_TRUNC, or to creat(), on a existing file that has
+   any mandatory locks owned by other processes will be rejected with the
+   error status EAGAIN.
+
+5. Attempts to apply a mandatory lock to a file that is memory mapped and
+   shared (via mmap() with MAP_SHARED) will be rejected with the error status
+   EAGAIN.
+
+6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED)
+   that has any mandatory locks in effect will be rejected with the error status
+   EAGAIN.
+
+5. Which system calls are affected?
+-----------------------------------
+
+Those which modify a file's contents, not just the inode. That gives read(),
+write(), readv(), writev(), open(), creat(), mmap(), truncate() and
+ftruncate(). truncate() and ftruncate() are considered to be "write" actions
+for the purposes of mandatory locking.
+
+The affected region is usually defined as stretching from the current position
+for the total number of bytes read or written. For the truncate calls it is
+defined as the bytes of a file removed or added (we must also consider bytes
+added, as a lock can specify just "the whole file", rather than a specific
+range of bytes.)
+
+Note 3: I may have overlooked some system calls that need mandatory lock
+checking in my eagerness to get this code out the door. Please let me know, or
+better still fix the system calls yourself and submit a patch to me or Linus.
+
+6. Warning!
+-----------
+
+Not even root can override a mandatory lock, so runaway processes can wreak
+havoc if they lock crucial files. The way around it is to change the file
+permissions (remove the setgid bit) before trying to read or write to it.
+Of course, that might be a bit tricky if the system is hung :-(
+
diff --git a/Documentation/locks.txt b/Documentation/locks.txt
index e3b402ef33b..fab857accbd 100644
--- a/Documentation/locks.txt
+++ b/Documentation/locks.txt
@@ -53,11 +53,11 @@ fcntl(), with all the problems that implies.
 1.3 Mandatory Locking As A Mount Option
 ---------------------------------------
 
-Mandatory locking, as described in 'Documentation/mandatory.txt' was prior
-to this release a general configuration option that was valid for all
-mounted filesystems. This had a number of inherent dangers, not the least
-of which was the ability to freeze an NFS server by asking it to read a
-file for which a mandatory lock existed.
+Mandatory locking, as described in 'Documentation/filesystems/mandatory.txt'
+was prior to this release a general configuration option that was valid for
+all mounted filesystems.  This had a number of inherent dangers, not the
+least of which was the ability to freeze an NFS server by asking it to read
+a file for which a mandatory lock existed.
 
 From this release of the kernel, mandatory locking can be turned on and off
 on a per-filesystem basis, using the mount options 'mand' and 'nomand'.
diff --git a/Documentation/mandatory.txt b/Documentation/mandatory.txt
deleted file mode 100644
index bc449d49eee..00000000000
--- a/Documentation/mandatory.txt
+++ /dev/null
@@ -1,152 +0,0 @@
-	Mandatory File Locking For The Linux Operating System
-
-		Andy Walker <andy@lysaker.kvaerner.no>
-
-			   15 April 1996
-
-
-1. What is  mandatory locking?
-------------------------------
-
-Mandatory locking is kernel enforced file locking, as opposed to the more usual
-cooperative file locking used to guarantee sequential access to files among
-processes. File locks are applied using the flock() and fcntl() system calls
-(and the lockf() library routine which is a wrapper around fcntl().) It is
-normally a process' responsibility to check for locks on a file it wishes to
-update, before applying its own lock, updating the file and unlocking it again.
-The most commonly used example of this (and in the case of sendmail, the most
-troublesome) is access to a user's mailbox. The mail user agent and the mail
-transfer agent must guard against updating the mailbox at the same time, and
-prevent reading the mailbox while it is being updated.
-
-In a perfect world all processes would use and honour a cooperative, or
-"advisory" locking scheme. However, the world isn't perfect, and there's
-a lot of poorly written code out there.
-
-In trying to address this problem, the designers of System V UNIX came up
-with a "mandatory" locking scheme, whereby the operating system kernel would
-block attempts by a process to write to a file that another process holds a
-"read" -or- "shared" lock on, and block attempts to both read and write to a 
-file that a process holds a "write " -or- "exclusive" lock on.
-
-The System V mandatory locking scheme was intended to have as little impact as
-possible on existing user code. The scheme is based on marking individual files
-as candidates for mandatory locking, and using the existing fcntl()/lockf()
-interface for applying locks just as if they were normal, advisory locks.
-
-Note 1: In saying "file" in the paragraphs above I am actually not telling
-the whole truth. System V locking is based on fcntl(). The granularity of
-fcntl() is such that it allows the locking of byte ranges in files, in addition
-to entire files, so the mandatory locking rules also have byte level
-granularity.
-
-Note 2: POSIX.1 does not specify any scheme for mandatory locking, despite
-borrowing the fcntl() locking scheme from System V. The mandatory locking
-scheme is defined by the System V Interface Definition (SVID) Version 3.
-
-2. Marking a file for mandatory locking
----------------------------------------
-
-A file is marked as a candidate for mandatory locking by setting the group-id
-bit in its file mode but removing the group-execute bit. This is an otherwise
-meaningless combination, and was chosen by the System V implementors so as not
-to break existing user programs.
-
-Note that the group-id bit is usually automatically cleared by the kernel when
-a setgid file is written to. This is a security measure. The kernel has been
-modified to recognize the special case of a mandatory lock candidate and to
-refrain from clearing this bit. Similarly the kernel has been modified not
-to run mandatory lock candidates with setgid privileges.
-
-3. Available implementations
-----------------------------
-
-I have considered the implementations of mandatory locking available with
-SunOS 4.1.x, Solaris 2.x and HP-UX 9.x.
-
-Generally I have tried to make the most sense out of the behaviour exhibited
-by these three reference systems. There are many anomalies.
-
-All the reference systems reject all calls to open() for a file on which
-another process has outstanding mandatory locks. This is in direct
-contravention of SVID 3, which states that only calls to open() with the
-O_TRUNC flag set should be rejected. The Linux implementation follows the SVID
-definition, which is the "Right Thing", since only calls with O_TRUNC can
-modify the contents of the file.
-
-HP-UX even disallows open() with O_TRUNC for a file with advisory locks, not
-just mandatory locks. That would appear to contravene POSIX.1.
-
-mmap() is another interesting case. All the operating systems mentioned
-prevent mandatory locks from being applied to an mmap()'ed file, but  HP-UX
-also disallows advisory locks for such a file. SVID actually specifies the
-paranoid HP-UX behaviour.
-
-In my opinion only MAP_SHARED mappings should be immune from locking, and then
-only from mandatory locks - that is what is currently implemented.
-
-SunOS is so hopeless that it doesn't even honour the O_NONBLOCK flag for
-mandatory locks, so reads and writes to locked files always block when they
-should return EAGAIN.
-
-I'm afraid that this is such an esoteric area that the semantics described
-below are just as valid as any others, so long as the main points seem to
-agree. 
-
-4. Semantics
-------------
-
-1. Mandatory locks can only be applied via the fcntl()/lockf() locking
-   interface - in other words the System V/POSIX interface. BSD style
-   locks using flock() never result in a mandatory lock.
-
-2. If a process has locked a region of a file with a mandatory read lock, then
-   other processes are permitted to read from that region. If any of these
-   processes attempts to write to the region it will block until the lock is
-   released, unless the process has opened the file with the O_NONBLOCK
-   flag in which case the system call will return immediately with the error
-   status EAGAIN.
-
-3. If a process has locked a region of a file with a mandatory write lock, all
-   attempts to read or write to that region block until the lock is released,
-   unless a process has opened the file with the O_NONBLOCK flag in which case
-   the system call will return immediately with the error status EAGAIN.
-
-4. Calls to open() with O_TRUNC, or to creat(), on a existing file that has
-   any mandatory locks owned by other processes will be rejected with the
-   error status EAGAIN.
-
-5. Attempts to apply a mandatory lock to a file that is memory mapped and
-   shared (via mmap() with MAP_SHARED) will be rejected with the error status
-   EAGAIN.
-
-6. Attempts to create a shared memory map of a file (via mmap() with MAP_SHARED)
-   that has any mandatory locks in effect will be rejected with the error status
-   EAGAIN.
-
-5. Which system calls are affected?
------------------------------------
-
-Those which modify a file's contents, not just the inode. That gives read(),
-write(), readv(), writev(), open(), creat(), mmap(), truncate() and
-ftruncate(). truncate() and ftruncate() are considered to be "write" actions
-for the purposes of mandatory locking.
-
-The affected region is usually defined as stretching from the current position
-for the total number of bytes read or written. For the truncate calls it is
-defined as the bytes of a file removed or added (we must also consider bytes
-added, as a lock can specify just "the whole file", rather than a specific
-range of bytes.)
-
-Note 3: I may have overlooked some system calls that need mandatory lock
-checking in my eagerness to get this code out the door. Please let me know, or
-better still fix the system calls yourself and submit a patch to me or Linus.
-
-6. Warning!
------------
-
-Not even root can override a mandatory lock, so runaway processes can wreak
-havoc if they lock crucial files. The way around it is to change the file
-permissions (remove the setgid bit) before trying to read or write to it.
-Of course, that might be a bit tricky if the system is hung :-(
-
-- 
cgit v1.2.3


From 9efa68ed079af97f4e9477eadef567ffe64f7afc Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 25 Sep 2007 11:57:19 -0400
Subject: locks: add warning about mandatory locking races

The mandatory file locking implementation has long-standing races that
probably render it useless.  I know of no plans to fix them.  Till we
do, we should at least warn people.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 Documentation/filesystems/mandatory-locking.txt | 21 ++++++++++++++++++++-
 1 file changed, 20 insertions(+), 1 deletion(-)

diff --git a/Documentation/filesystems/mandatory-locking.txt b/Documentation/filesystems/mandatory-locking.txt
index bc449d49eee..0979d1d2ca8 100644
--- a/Documentation/filesystems/mandatory-locking.txt
+++ b/Documentation/filesystems/mandatory-locking.txt
@@ -3,7 +3,26 @@
 		Andy Walker <andy@lysaker.kvaerner.no>
 
 			   15 April 1996
-
+		     (Updated September 2007)
+
+0. Why you should avoid mandatory locking
+-----------------------------------------
+
+The Linux implementation is prey to a number of difficult-to-fix race
+conditions which in practice make it not dependable:
+
+	- The write system call checks for a mandatory lock only once
+	  at its start.  It is therefore possible for a lock request to
+	  be granted after this check but before the data is modified.
+	  A process may then see file data change even while a mandatory
+	  lock was held.
+	- Similarly, an exclusive lock may be granted on a file after
+	  the kernel has decided to proceed with a read, but before the
+	  read has actually completed, and the reading process may see
+	  the file data in a state which should not have been visible
+	  to it.
+	- Similar races make the claimed mutual exclusion between lock
+	  and mmap similarly unreliable.
 
 1. What is  mandatory locking?
 ------------------------------
-- 
cgit v1.2.3


From 98257af5a2ad0c5b502ebd07094d9fd8ce87acef Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Sun, 30 Sep 2007 22:18:55 -0400
Subject: Documentation: move locks.txt in filesystems/

This documentation (about file locking) belongs in filesystems/.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 Documentation/00-INDEX              |  2 --
 Documentation/filesystems/00-INDEX  |  2 ++
 Documentation/filesystems/locks.txt | 67 +++++++++++++++++++++++++++++++++++++
 Documentation/locks.txt             | 67 -------------------------------------
 4 files changed, 69 insertions(+), 69 deletions(-)
 create mode 100644 Documentation/filesystems/locks.txt
 delete mode 100644 Documentation/locks.txt

diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX
index 910473cb1c1..cc10ce7dc33 100644
--- a/Documentation/00-INDEX
+++ b/Documentation/00-INDEX
@@ -230,8 +230,6 @@ local_ops.txt
 	- semantics and behavior of local atomic operations.
 lockdep-design.txt
 	- documentation on the runtime locking correctness validator.
-locks.txt
-	- info on file locking implementations, flock() vs. fcntl(), etc.
 logo.gif
 	- full colour GIF image of Linux logo (penguin - Tux).
 logo.txt
diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX
index e801076812a..599593a1706 100644
--- a/Documentation/filesystems/00-INDEX
+++ b/Documentation/filesystems/00-INDEX
@@ -52,6 +52,8 @@ isofs.txt
 	- info and mount options for the ISO 9660 (CDROM) filesystem.
 jfs.txt
 	- info and mount options for the JFS filesystem.
+locks.txt
+	- info on file locking implementations, flock() vs. fcntl(), etc.
 mandatory-locking.txt
 	- info on the Linux implementation of Sys V mandatory file locking.
 ncpfs.txt
diff --git a/Documentation/filesystems/locks.txt b/Documentation/filesystems/locks.txt
new file mode 100644
index 00000000000..fab857accbd
--- /dev/null
+++ b/Documentation/filesystems/locks.txt
@@ -0,0 +1,67 @@
+		      File Locking Release Notes
+
+		Andy Walker <andy@lysaker.kvaerner.no>
+
+			    12 May 1997
+
+
+1. What's New?
+--------------
+
+1.1 Broken Flock Emulation
+--------------------------
+
+The old flock(2) emulation in the kernel was swapped for proper BSD
+compatible flock(2) support in the 1.3.x series of kernels. With the
+release of the 2.1.x kernel series, support for the old emulation has
+been totally removed, so that we don't need to carry this baggage
+forever.
+
+This should not cause problems for anybody, since everybody using a
+2.1.x kernel should have updated their C library to a suitable version
+anyway (see the file "Documentation/Changes".)
+
+1.2 Allow Mixed Locks Again
+---------------------------
+
+1.2.1 Typical Problems - Sendmail
+---------------------------------
+Because sendmail was unable to use the old flock() emulation, many sendmail
+installations use fcntl() instead of flock(). This is true of Slackware 3.0
+for example. This gave rise to some other subtle problems if sendmail was
+configured to rebuild the alias file. Sendmail tried to lock the aliases.dir
+file with fcntl() at the same time as the GDBM routines tried to lock this
+file with flock(). With pre 1.3.96 kernels this could result in deadlocks that,
+over time, or under a very heavy mail load, would eventually cause the kernel
+to lock solid with deadlocked processes.
+
+
+1.2.2 The Solution
+------------------
+The solution I have chosen, after much experimentation and discussion,
+is to make flock() and fcntl() locks oblivious to each other. Both can
+exists, and neither will have any effect on the other.
+
+I wanted the two lock styles to be cooperative, but there were so many
+race and deadlock conditions that the current solution was the only
+practical one. It puts us in the same position as, for example, SunOS
+4.1.x and several other commercial Unices. The only OS's that support
+cooperative flock()/fcntl() are those that emulate flock() using
+fcntl(), with all the problems that implies.
+
+
+1.3 Mandatory Locking As A Mount Option
+---------------------------------------
+
+Mandatory locking, as described in 'Documentation/filesystems/mandatory.txt'
+was prior to this release a general configuration option that was valid for
+all mounted filesystems.  This had a number of inherent dangers, not the
+least of which was the ability to freeze an NFS server by asking it to read
+a file for which a mandatory lock existed.
+
+From this release of the kernel, mandatory locking can be turned on and off
+on a per-filesystem basis, using the mount options 'mand' and 'nomand'.
+The default is to disallow mandatory locking. The intention is that
+mandatory locking only be enabled on a local filesystem as the specific need
+arises.
+
diff --git a/Documentation/locks.txt b/Documentation/locks.txt
deleted file mode 100644
index fab857accbd..00000000000
--- a/Documentation/locks.txt
+++ /dev/null
@@ -1,67 +0,0 @@
-		      File Locking Release Notes
-
-		Andy Walker <andy@lysaker.kvaerner.no>
-
-			    12 May 1997
-
-
-1. What's New?
---------------
-
-1.1 Broken Flock Emulation
---------------------------
-
-The old flock(2) emulation in the kernel was swapped for proper BSD
-compatible flock(2) support in the 1.3.x series of kernels. With the
-release of the 2.1.x kernel series, support for the old emulation has
-been totally removed, so that we don't need to carry this baggage
-forever.
-
-This should not cause problems for anybody, since everybody using a
-2.1.x kernel should have updated their C library to a suitable version
-anyway (see the file "Documentation/Changes".)
-
-1.2 Allow Mixed Locks Again
----------------------------
-
-1.2.1 Typical Problems - Sendmail
----------------------------------
-Because sendmail was unable to use the old flock() emulation, many sendmail
-installations use fcntl() instead of flock(). This is true of Slackware 3.0
-for example. This gave rise to some other subtle problems if sendmail was
-configured to rebuild the alias file. Sendmail tried to lock the aliases.dir
-file with fcntl() at the same time as the GDBM routines tried to lock this
-file with flock(). With pre 1.3.96 kernels this could result in deadlocks that,
-over time, or under a very heavy mail load, would eventually cause the kernel
-to lock solid with deadlocked processes.
-
-
-1.2.2 The Solution
-------------------
-The solution I have chosen, after much experimentation and discussion,
-is to make flock() and fcntl() locks oblivious to each other. Both can
-exists, and neither will have any effect on the other.
-
-I wanted the two lock styles to be cooperative, but there were so many
-race and deadlock conditions that the current solution was the only
-practical one. It puts us in the same position as, for example, SunOS
-4.1.x and several other commercial Unices. The only OS's that support
-cooperative flock()/fcntl() are those that emulate flock() using
-fcntl(), with all the problems that implies.
-
-
-1.3 Mandatory Locking As A Mount Option
----------------------------------------
-
-Mandatory locking, as described in 'Documentation/filesystems/mandatory.txt'
-was prior to this release a general configuration option that was valid for
-all mounted filesystems.  This had a number of inherent dangers, not the
-least of which was the ability to freeze an NFS server by asking it to read
-a file for which a mandatory lock existed.
-
-From this release of the kernel, mandatory locking can be turned on and off
-on a per-filesystem basis, using the mount options 'mand' and 'nomand'.
-The default is to disallow mandatory locking. The intention is that
-mandatory locking only be enabled on a local filesystem as the specific need
-arises.
-
-- 
cgit v1.2.3


From a16877ca9cec211708a161057a7cbfbf2cbc3a53 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 1 Oct 2007 14:41:11 -0700
Subject: Cleanup macros for distinguishing mandatory locks

The combination of S_ISGID bit set and S_IXGRP bit unset is used to mark the
inode as "mandatory lockable" and there's a macro for this check called
MANDATORY_LOCK(inode).  However, fs/locks.c and some filesystems still perform
the explicit i_mode checking.  Besides, Andrew pointed out, that this macro is
buggy itself, as it dereferences the inode arg twice.

Convert this macro into static inline function and switch its users to it,
making the code shorter and more readable.

The __mandatory_lock() helper is to be used in places where the IS_MANDLOCK()
for superblock is already known to be true.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/locks.c          | 14 ++++----------
 fs/nfsd/nfs4state.c |  2 +-
 fs/nfsd/vfs.c       |  2 +-
 fs/read_write.c     |  2 +-
 include/linux/fs.h  | 21 +++++++++++++++++----
 5 files changed, 24 insertions(+), 17 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 43dbc7f566f..9a3fe0d8285 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -1119,7 +1119,7 @@ int locks_mandatory_area(int read_write, struct inode *inode,
 			 * If we've been sleeping someone might have
 			 * changed the permissions behind our back.
 			 */
-			if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+			if (__mandatory_lock(inode))
 				continue;
 		}
 
@@ -1761,9 +1761,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd,
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
 	 */
-	if (IS_MANDLOCK(inode) &&
-	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
-	    mapping_writably_mapped(filp->f_mapping)) {
+	if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
 		error = -EAGAIN;
 		goto out;
 	}
@@ -1887,9 +1885,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd,
 	/* Don't allow mandatory locks on files that may be memory mapped
 	 * and shared.
 	 */
-	if (IS_MANDLOCK(inode) &&
-	    (inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
-	    mapping_writably_mapped(filp->f_mapping)) {
+	if (mandatory_lock(inode) && mapping_writably_mapped(filp->f_mapping)) {
 		error = -EAGAIN;
 		goto out;
 	}
@@ -2083,9 +2079,7 @@ static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx)
 		out += sprintf(out, "%6s %s ",
 			     (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
 			     (inode == NULL) ? "*NOINODE*" :
-			     (IS_MANDLOCK(inode) &&
-			      (inode->i_mode & (S_IXGRP | S_ISGID)) == S_ISGID) ?
-			     "MANDATORY" : "ADVISORY ");
+			     mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
 	} else if (IS_FLOCK(fl)) {
 		if (fl->fl_type & LOCK_MAND) {
 			out += sprintf(out, "FLOCK  MSNFS     ");
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 3f559700788..3c028b9c6e0 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -2044,7 +2044,7 @@ static inline int
 io_during_grace_disallowed(struct inode *inode, int flags)
 {
 	return nfs4_in_grace() && (flags & (RD_STATE | WR_STATE))
-		&& MANDATORY_LOCK(inode);
+		&& mandatory_lock(inode);
 }
 
 /*
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 7867151ebb8..9152f87eea1 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -65,7 +65,7 @@
  * locks on them because there is no way to know if the accesser has
  * the lock.
  */
-#define IS_ISMNDLK(i)	(S_ISREG((i)->i_mode) && MANDATORY_LOCK(i))
+#define IS_ISMNDLK(i)	(S_ISREG((i)->i_mode) && mandatory_lock(i))
 
 /*
  * This is a cache of readahead params that help us choose the proper
diff --git a/fs/read_write.c b/fs/read_write.c
index 507ddff48a9..124693e8d3f 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -205,7 +205,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count
 	if (unlikely((pos < 0) || (loff_t) (pos + count) < 0))
 		goto Einval;
 
-	if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) {
+	if (unlikely(inode->i_flock && mandatory_lock(inode))) {
 		int retval = locks_mandatory_area(
 			read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE,
 			inode, file, pos, count);
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 16421f662a7..f5075e0e730 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1369,12 +1369,25 @@ extern int locks_mandatory_area(int, struct inode *, struct file *, loff_t, size
  * Candidates for mandatory locking have the setgid bit set
  * but no group execute bit -  an otherwise meaningless combination.
  */
-#define MANDATORY_LOCK(inode) \
-	(IS_MANDLOCK(inode) && ((inode)->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+
+static inline int __mandatory_lock(struct inode *ino)
+{
+	return (ino->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID;
+}
+
+/*
+ * ... and these candidates should be on MS_MANDLOCK mounted fs,
+ * otherwise these will be advisory locks
+ */
+
+static inline int mandatory_lock(struct inode *ino)
+{
+	return IS_MANDLOCK(ino) && __mandatory_lock(ino);
+}
 
 static inline int locks_verify_locked(struct inode *inode)
 {
-	if (MANDATORY_LOCK(inode))
+	if (mandatory_lock(inode))
 		return locks_mandatory_locked(inode);
 	return 0;
 }
@@ -1385,7 +1398,7 @@ static inline int locks_verify_truncate(struct inode *inode,
 				    struct file *filp,
 				    loff_t size)
 {
-	if (inode->i_flock && MANDATORY_LOCK(inode))
+	if (inode->i_flock && mandatory_lock(inode))
 		return locks_mandatory_area(
 			FLOCK_VERIFY_WRITE, inode, filp,
 			size < inode->i_size ? size : inode->i_size,
-- 
cgit v1.2.3


From 7afaac6202782ec28f2039503bdaef666834d60c Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 1 Oct 2007 14:41:13 -0700
Subject: GFS2: clean up explicit check for mandatory locks

The __mandatory_lock(inode) function makes the same check, but makes the code
more readable.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Steven Whitehouse <swhiteho@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/gfs2/ops_file.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/fs/gfs2/ops_file.c b/fs/gfs2/ops_file.c
index 94d76ace0b9..28773cab4a3 100644
--- a/fs/gfs2/ops_file.c
+++ b/fs/gfs2/ops_file.c
@@ -535,7 +535,7 @@ static int gfs2_lock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_POSIX))
 		return -ENOLCK;
-	if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+	if (__mandatory_lock(&ip->i_inode))
 		return -ENOLCK;
 
 	if (sdp->sd_args.ar_localflocks) {
@@ -637,7 +637,7 @@ static int gfs2_flock(struct file *file, int cmd, struct file_lock *fl)
 
 	if (!(fl->fl_flags & FL_FLOCK))
 		return -ENOLCK;
-	if ((ip->i_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+	if (__mandatory_lock(&ip->i_inode))
 		return -ENOLCK;
 
 	if (sdp->sd_args.ar_localflocks)
-- 
cgit v1.2.3


From 66abe5f257e719547744fdb8691cf5d20603f051 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 1 Oct 2007 14:41:13 -0700
Subject: 9PFS: clean up explicit check for mandatory locks

The __mandatory_lock(inode) macro makes the same check, but makes the code
more readable.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Eric Van Hensbergen <ericvh@gmail.com>
Cc: Ron Minnich <rminnich@sandia.gov>
Cc: Latchesar Ionkov <lucho@ionkov.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/9p/vfs_file.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 2a40c2946d0..716691689fd 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -105,7 +105,7 @@ static int v9fs_file_lock(struct file *filp, int cmd, struct file_lock *fl)
 	P9_DPRINTK(P9_DEBUG_VFS, "filp: %p lock: %p\n", filp, fl);
 
 	/* No mandatory locks */
-	if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID)
+	if (__mandatory_lock(inode))
 		return -ENOLCK;
 
 	if ((IS_SETLK(cmd) || IS_SETLKW(cmd)) && fl->fl_type != F_UNLCK) {
-- 
cgit v1.2.3


From fc5846e555177c2ae01bcded7fddf60cb10dcfd0 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 1 Oct 2007 14:41:14 -0700
Subject: AFS: clean up explicit check for mandatory locks

The __mandatory_lock(inode) macro makes the same check, but makes the code
more readable.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: David Howells <dhowells@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/afs/flock.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index af6952e39a1..210acafe4a9 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -524,8 +524,7 @@ int afs_lock(struct file *file, int cmd, struct file_lock *fl)
 	       (long long) fl->fl_start, (long long) fl->fl_end);
 
 	/* AFS doesn't support mandatory locks */
-	if ((vnode->vfs_inode.i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
-	    fl->fl_type != F_UNLCK)
+	if (__mandatory_lock(&vnode->vfs_inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if (IS_GETLK(cmd))
-- 
cgit v1.2.3


From dfad9441be82f1eadc3fa3f1bbc93f93d48d1bdf Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 1 Oct 2007 14:41:15 -0700
Subject: NFS: clean up explicit check for mandatory locks

The __mandatory_lock(inode) macro makes the same check, but makes the code
more readable.

Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/nfs/file.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 579cf8a7d4a..9c98ccbf9de 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -522,8 +522,7 @@ static int nfs_lock(struct file *filp, int cmd, struct file_lock *fl)
 	nfs_inc_stats(inode, NFSIOS_VFSLOCK);
 
 	/* No mandatory locks over NFS */
-	if ((inode->i_mode & (S_ISGID | S_IXGRP)) == S_ISGID &&
-	    fl->fl_type != F_UNLCK)
+	if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK)
 		return -ENOLCK;
 
 	if (IS_GETLK(cmd))
-- 
cgit v1.2.3


From 094f2825218fec1b240cb8537d2d0a10edf5ddc9 Mon Sep 17 00:00:00 2001
From: Matthias Kaehlcke <matthias.kaehlcke@gmail.com>
Date: Tue, 2 Oct 2007 11:21:34 -0700
Subject: fs/locks.c: use list_for_each_entry() instead of list_for_each()

fs/locks.c: use list_for_each_entry() instead of list_for_each() in
posix_locks_deadlock() and get_locks_status()

Signed-off-by: Matthias Kaehlcke <matthias.kaehlcke@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/locks.c | 17 +++++++----------
 1 file changed, 7 insertions(+), 10 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index 9a3fe0d8285..c17bc00b1e8 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -700,13 +700,12 @@ EXPORT_SYMBOL(posix_test_lock);
 static int posix_locks_deadlock(struct file_lock *caller_fl,
 				struct file_lock *block_fl)
 {
-	struct list_head *tmp;
+	struct file_lock *fl;
 
 next_task:
 	if (posix_same_owner(caller_fl, block_fl))
 		return 1;
-	list_for_each(tmp, &blocked_list) {
-		struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link);
+	list_for_each_entry(fl, &blocked_list, fl_link) {
 		if (posix_same_owner(fl, block_fl)) {
 			fl = fl->fl_next;
 			block_fl = fl;
@@ -2164,24 +2163,22 @@ static void move_lock_status(char **p, off_t* pos, off_t offset)
 
 int get_locks_status(char *buffer, char **start, off_t offset, int length)
 {
-	struct list_head *tmp;
+	struct file_lock *fl;
 	char *q = buffer;
 	off_t pos = 0;
 	int i = 0;
 
 	lock_kernel();
-	list_for_each(tmp, &file_lock_list) {
-		struct list_head *btmp;
-		struct file_lock *fl = list_entry(tmp, struct file_lock, fl_link);
+	list_for_each_entry(fl, &file_lock_list, fl_link) {
+		struct file_lock *bfl;
+
 		lock_get_status(q, fl, ++i, "");
 		move_lock_status(&q, &pos, offset);
 
 		if(pos >= offset+length)
 			goto done;
 
-		list_for_each(btmp, &fl->fl_block) {
-			struct file_lock *bfl = list_entry(btmp,
-					struct file_lock, fl_block);
+		list_for_each_entry(bfl, &fl->fl_block, fl_block) {
 			lock_get_status(q, bfl, i, " ->");
 			move_lock_status(&q, &pos, offset);
 
-- 
cgit v1.2.3


From 7f8ada98d9edd83d6ebd01e431e15b024a4a3dc4 Mon Sep 17 00:00:00 2001
From: Pavel Emelyanov <xemul@openvz.org>
Date: Mon, 1 Oct 2007 14:41:15 -0700
Subject: Rework /proc/locks via seq_files and seq_list helpers

Currently /proc/locks is shown with a proc_read function, but its behavior
is rather complex as it has to manually handle current offset and buffer
length.  On the other hand, files that show objects from lists can be
easily reimplemented using the sequential files and the seq_list_XXX()
helpers.

This saves (as usually) 16 lines of code and more than 200 from
the .text section.

[akpm@linux-foundation.org: no externs in C]
[akpm@linux-foundation.org: warning fixes]
Signed-off-by: Pavel Emelyanov <xemul@openvz.org>
Cc: "J. Bruce Fields" <bfields@fieldses.org>
Cc: Trond Myklebust <trond.myklebust@fys.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
---
 fs/locks.c          | 122 ++++++++++++++++++++++------------------------------
 fs/proc/proc_misc.c |  19 ++++----
 include/linux/fs.h  |   1 +
 3 files changed, 62 insertions(+), 80 deletions(-)

diff --git a/fs/locks.c b/fs/locks.c
index c17bc00b1e8..7f9a3ea4741 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2066,134 +2066,114 @@ int vfs_cancel_lock(struct file *filp, struct file_lock *fl)
 
 EXPORT_SYMBOL_GPL(vfs_cancel_lock);
 
-static void lock_get_status(char* out, struct file_lock *fl, int id, char *pfx)
+#ifdef CONFIG_PROC_FS
+#include <linux/seq_file.h>
+
+static void lock_get_status(struct seq_file *f, struct file_lock *fl,
+							int id, char *pfx)
 {
 	struct inode *inode = NULL;
 
 	if (fl->fl_file != NULL)
 		inode = fl->fl_file->f_path.dentry->d_inode;
 
-	out += sprintf(out, "%d:%s ", id, pfx);
+	seq_printf(f, "%d:%s ", id, pfx);
 	if (IS_POSIX(fl)) {
-		out += sprintf(out, "%6s %s ",
+		seq_printf(f, "%6s %s ",
 			     (fl->fl_flags & FL_ACCESS) ? "ACCESS" : "POSIX ",
 			     (inode == NULL) ? "*NOINODE*" :
 			     mandatory_lock(inode) ? "MANDATORY" : "ADVISORY ");
 	} else if (IS_FLOCK(fl)) {
 		if (fl->fl_type & LOCK_MAND) {
-			out += sprintf(out, "FLOCK  MSNFS     ");
+			seq_printf(f, "FLOCK  MSNFS     ");
 		} else {
-			out += sprintf(out, "FLOCK  ADVISORY  ");
+			seq_printf(f, "FLOCK  ADVISORY  ");
 		}
 	} else if (IS_LEASE(fl)) {
-		out += sprintf(out, "LEASE  ");
+		seq_printf(f, "LEASE  ");
 		if (fl->fl_type & F_INPROGRESS)
-			out += sprintf(out, "BREAKING  ");
+			seq_printf(f, "BREAKING  ");
 		else if (fl->fl_file)
-			out += sprintf(out, "ACTIVE    ");
+			seq_printf(f, "ACTIVE    ");
 		else
-			out += sprintf(out, "BREAKER   ");
+			seq_printf(f, "BREAKER   ");
 	} else {
-		out += sprintf(out, "UNKNOWN UNKNOWN  ");
+		seq_printf(f, "UNKNOWN UNKNOWN  ");
 	}
 	if (fl->fl_type & LOCK_MAND) {
-		out += sprintf(out, "%s ",
+		seq_printf(f, "%s ",
 			       (fl->fl_type & LOCK_READ)
 			       ? (fl->fl_type & LOCK_WRITE) ? "RW   " : "READ "
 			       : (fl->fl_type & LOCK_WRITE) ? "WRITE" : "NONE ");
 	} else {
-		out += sprintf(out, "%s ",
+		seq_printf(f, "%s ",
 			       (fl->fl_type & F_INPROGRESS)
 			       ? (fl->fl_type & F_UNLCK) ? "UNLCK" : "READ "
 			       : (fl->fl_type & F_WRLCK) ? "WRITE" : "READ ");
 	}
 	if (inode) {
 #ifdef WE_CAN_BREAK_LSLK_NOW
-		out += sprintf(out, "%d %s:%ld ", fl->fl_pid,
+		seq_printf(f, "%d %s:%ld ", fl->fl_pid,
 				inode->i_sb->s_id, inode->i_ino);
 #else
 		/* userspace relies on this representation of dev_t ;-( */
-		out += sprintf(out, "%d %02x:%02x:%ld ", fl->fl_pid,
+		seq_printf(f, "%d %02x:%02x:%ld ", fl->fl_pid,
 				MAJOR(inode->i_sb->s_dev),
 				MINOR(inode->i_sb->s_dev), inode->i_ino);
 #endif
 	} else {
-		out += sprintf(out, "%d <none>:0 ", fl->fl_pid);
+		seq_printf(f, "%d <none>:0 ", fl->fl_pid);
 	}
 	if (IS_POSIX(fl)) {
 		if (fl->fl_end == OFFSET_MAX)
-			out += sprintf(out, "%Ld EOF\n", fl->fl_start);
+			seq_printf(f, "%Ld EOF\n", fl->fl_start);
 		else
-			out += sprintf(out, "%Ld %Ld\n", fl->fl_start,
-					fl->fl_end);
+			seq_printf(f, "%Ld %Ld\n", fl->fl_start, fl->fl_end);
 	} else {
-		out += sprintf(out, "0 EOF\n");
+		seq_printf(f, "0 EOF\n");
 	}
 }
 
-static void move_lock_status(char **p, off_t* pos, off_t offset)
+static int locks_show(struct seq_file *f, void *v)
 {
-	int len;
-	len = strlen(*p);
-	if(*pos >= offset) {
-		/* the complete line is valid */
-		*p += len;
-		*pos += len;
-		return;
-	}
-	if(*pos+len > offset) {
-		/* use the second part of the line */
-		int i = offset-*pos;
-		memmove(*p,*p+i,len-i);
-		*p += len-i;
-		*pos += len;
-		return;
-	}
-	/* discard the complete line */
-	*pos += len;
-}
+	struct file_lock *fl, *bfl;
 
-/**
- *	get_locks_status	-	reports lock usage in /proc/locks
- *	@buffer: address in userspace to write into
- *	@start: ?
- *	@offset: how far we are through the buffer
- *	@length: how much to read
- */
+	fl = list_entry(v, struct file_lock, fl_link);
 
-int get_locks_status(char *buffer, char **start, off_t offset, int length)
-{
-	struct file_lock *fl;
-	char *q = buffer;
-	off_t pos = 0;
-	int i = 0;
+	lock_get_status(f, fl, (long)f->private, "");
 
-	lock_kernel();
-	list_for_each_entry(fl, &file_lock_list, fl_link) {
-		struct file_lock *bfl;
+	list_for_each_entry(bfl, &fl->fl_block, fl_block)
+		lock_get_status(f, bfl, (long)f->private, " ->");
 
-		lock_get_status(q, fl, ++i, "");
-		move_lock_status(&q, &pos, offset);
+	f->private++;
+	return 0;
+}
 
-		if(pos >= offset+length)
-			goto done;
+static void *locks_start(struct seq_file *f, loff_t *pos)
+{
+	lock_kernel();
+	f->private = (void *)1;
+	return seq_list_start(&file_lock_list, *pos);
+}
 
-		list_for_each_entry(bfl, &fl->fl_block, fl_block) {
-			lock_get_status(q, bfl, i, " ->");
-			move_lock_status(&q, &pos, offset);
+static void *locks_next(struct seq_file *f, void *v, loff_t *pos)
+{
+	return seq_list_next(v, &file_lock_list, pos);
+}
 
-			if(pos >= offset+length)
-				goto done;
-		}
-	}
-done:
+static void locks_stop(struct seq_file *f, void *v)
+{
 	unlock_kernel();
-	*start = buffer;
-	if(q-buffer < length)
-		return (q-buffer);
-	return length;
 }
 
+struct seq_operations locks_seq_operations = {
+	.start	= locks_start,
+	.next	= locks_next,
+	.stop	= locks_stop,
+	.show	= locks_show,
+};
+#endif
+
 /**
  *	lock_may_read - checks that the region is free of locks
  *	@inode: the inode that is being read
diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c
index bee251cb87c..c9d6d5f400a 100644
--- a/fs/proc/proc_misc.c
+++ b/fs/proc/proc_misc.c
@@ -66,7 +66,6 @@ extern int get_stram_list(char *);
 extern int get_filesystem_list(char *);
 extern int get_exec_domain_list(char *);
 extern int get_dma_list(char *);
-extern int get_locks_status (char *, char **, off_t, int);
 
 static int proc_calc_metrics(char *page, char **start, off_t off,
 				 int count, int *eof, int len)
@@ -617,16 +616,18 @@ static int cmdline_read_proc(char *page, char **start, off_t off,
 	return proc_calc_metrics(page, start, off, count, eof, len);
 }
 
-static int locks_read_proc(char *page, char **start, off_t off,
-				 int count, int *eof, void *data)
+static int locks_open(struct inode *inode, struct file *filp)
 {
-	int len = get_locks_status(page, start, off, count);
-
-	if (len < count)
-		*eof = 1;
-	return len;
+	return seq_open(filp, &locks_seq_operations);
 }
 
+static const struct file_operations proc_locks_operations = {
+	.open		= locks_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
 static int execdomains_read_proc(char *page, char **start, off_t off,
 				 int count, int *eof, void *data)
 {
@@ -684,7 +685,6 @@ void __init proc_misc_init(void)
 #endif
 		{"filesystems",	filesystems_read_proc},
 		{"cmdline",	cmdline_read_proc},
-		{"locks",	locks_read_proc},
 		{"execdomains",	execdomains_read_proc},
 		{NULL,}
 	};
@@ -702,6 +702,7 @@ void __init proc_misc_init(void)
 			entry->proc_fops = &proc_kmsg_operations;
 	}
 #endif
+	create_seq_entry("locks", 0, &proc_locks_operations);
 	create_seq_entry("devices", 0, &proc_devinfo_operations);
 	create_seq_entry("cpuinfo", 0, &proc_cpuinfo_operations);
 #ifdef CONFIG_BLOCK
diff --git a/include/linux/fs.h b/include/linux/fs.h
index f5075e0e730..4f1e8cebea7 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -883,6 +883,7 @@ extern int vfs_setlease(struct file *, long, struct file_lock **);
 extern int lease_modify(struct file_lock **, int);
 extern int lock_may_read(struct inode *, loff_t start, unsigned long count);
 extern int lock_may_write(struct inode *, loff_t start, unsigned long count);
+extern struct seq_operations locks_seq_operations;
 
 struct fasync_struct {
 	int	magic;
-- 
cgit v1.2.3


From 5e7fc436426b1f9e106f511a049de91c82ec2c53 Mon Sep 17 00:00:00 2001
From: "J. Bruce Fields" <bfields@citi.umich.edu>
Date: Tue, 2 Oct 2007 14:18:12 -0400
Subject: nfsd: remove IS_ISMNDLCK macro

This macro is only used in one place; in this place it seems simpler to
put open-code it and move the comment to where it's used.

Signed-off-by: J. Bruce Fields <bfields@citi.umich.edu>
---
 fs/nfsd/vfs.c | 13 ++++++-------
 1 file changed, 6 insertions(+), 7 deletions(-)

diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 9152f87eea1..085ded6f6d3 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -61,12 +61,6 @@
 #define NFSDDBG_FACILITY		NFSDDBG_FILEOP
 
 
-/* We must ignore files (but only files) which might have mandatory
- * locks on them because there is no way to know if the accesser has
- * the lock.
- */
-#define IS_ISMNDLK(i)	(S_ISREG((i)->i_mode) && mandatory_lock(i))
-
 /*
  * This is a cache of readahead params that help us choose the proper
  * readahead strategy. Initially, we set all readahead parameters to 0
@@ -680,7 +674,12 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
 	err = nfserr_perm;
 	if (IS_APPEND(inode) && (access & MAY_WRITE))
 		goto out;
-	if (IS_ISMNDLK(inode))
+	/*
+	 * We must ignore files (but only files) which might have mandatory
+	 * locks on them because there is no way to know if the accesser has
+	 * the lock.
+	 */
+	if (S_ISREG((inode)->i_mode) && mandatory_lock(inode))
 		goto out;
 
 	if (!inode->i_fop)
-- 
cgit v1.2.3