diff options
Diffstat (limited to 'fs')
141 files changed, 3632 insertions, 1809 deletions
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index b0a0ae509c0..61c599b4a1e 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -127,12 +127,13 @@ static struct super_block *v9fs_get_sb(struct file_system_type if ((newfid = v9fs_session_init(v9ses, dev_name, data)) < 0) { dprintk(DEBUG_ERROR, "problem initiating session\n"); - kfree(v9ses); - return ERR_PTR(newfid); + sb = ERR_PTR(newfid); + goto out_free_session; } sb = sget(fs_type, NULL, v9fs_set_super, v9ses); - + if (IS_ERR(sb)) + goto out_close_session; v9fs_fill_super(sb, v9ses, flags); inode = v9fs_get_inode(sb, S_IFDIR | mode); @@ -185,6 +186,12 @@ static struct super_block *v9fs_get_sb(struct file_system_type return sb; +out_close_session: + v9fs_session_close(v9ses); +out_free_session: + kfree(v9ses); + return sb; + put_back_sb: /* deactivate_super calls v9fs_kill_super which will frees the rest */ up_write(&sb->s_umount); diff --git a/fs/Kconfig b/fs/Kconfig index e207be68d4c..2524629dc83 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -799,6 +799,7 @@ config PROC_KCORE config PROC_VMCORE bool "/proc/vmcore support (EXPERIMENTAL)" depends on PROC_FS && EXPERIMENTAL && CRASH_DUMP + default y help Exports the dump image of crashed kernel in ELF format. @@ -861,7 +862,7 @@ config RAMFS config CONFIGFS_FS tristate "Userspace-driven configuration filesystem (EXPERIMENTAL)" - depends on EXPERIMENTAL + depends on SYSFS && EXPERIMENTAL help configfs is a ram-based filesystem that provides the converse of sysfs's functionality. Where sysfs is a filesystem-based diff --git a/fs/Makefile b/fs/Makefile index 080b3867be4..83bf478e786 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -10,7 +10,7 @@ obj-y := open.o read_write.o file_table.o buffer.o bio.o super.o \ ioctl.o readdir.o select.o fifo.o locks.o dcache.o inode.o \ attr.o bad_inode.o file.o filesystems.o namespace.o aio.o \ seq_file.o xattr.o libfs.o fs-writeback.o mpage.o direct-io.o \ - ioprio.o pnode.o drop_caches.o + ioprio.o pnode.o drop_caches.o splice.o sync.o obj-$(CONFIG_INOTIFY) += inotify.o obj-$(CONFIG_EPOLL) += eventpoll.o diff --git a/fs/char_dev.c b/fs/char_dev.c index 4e1b849f912..f3418f7a6e9 100644 --- a/fs/char_dev.c +++ b/fs/char_dev.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/smp_lock.h> #include <linux/devfs_fs_kernel.h> +#include <linux/seq_file.h> #include <linux/kobject.h> #include <linux/kobj_map.h> @@ -27,8 +28,6 @@ static struct kobj_map *cdev_map; -#define MAX_PROBE_HASH 255 /* random */ - static DEFINE_MUTEX(chrdevs_lock); static struct char_device_struct { @@ -39,93 +38,29 @@ static struct char_device_struct { char name[64]; struct file_operations *fops; struct cdev *cdev; /* will die */ -} *chrdevs[MAX_PROBE_HASH]; +} *chrdevs[CHRDEV_MAJOR_HASH_SIZE]; /* index in the above */ static inline int major_to_index(int major) { - return major % MAX_PROBE_HASH; -} - -struct chrdev_info { - int index; - struct char_device_struct *cd; -}; - -void *get_next_chrdev(void *dev) -{ - struct chrdev_info *info; - - if (dev == NULL) { - info = kmalloc(sizeof(*info), GFP_KERNEL); - if (!info) - goto out; - info->index=0; - info->cd = chrdevs[info->index]; - if (info->cd) - goto out; - } else { - info = dev; - } - - while (info->index < ARRAY_SIZE(chrdevs)) { - if (info->cd) - info->cd = info->cd->next; - if (info->cd) - goto out; - /* - * No devices on this chain, move to the next - */ - info->index++; - info->cd = (info->index < ARRAY_SIZE(chrdevs)) ? - chrdevs[info->index] : NULL; - if (info->cd) - goto out; - } - -out: - return info; -} - -void *acquire_chrdev_list(void) -{ - mutex_lock(&chrdevs_lock); - return get_next_chrdev(NULL); -} - -void release_chrdev_list(void *dev) -{ - mutex_unlock(&chrdevs_lock); - kfree(dev); + return major % CHRDEV_MAJOR_HASH_SIZE; } +#ifdef CONFIG_PROC_FS -int count_chrdev_list(void) +void chrdev_show(struct seq_file *f, off_t offset) { struct char_device_struct *cd; - int i, count; - - count = 0; - for (i = 0; i < ARRAY_SIZE(chrdevs) ; i++) { - for (cd = chrdevs[i]; cd; cd = cd->next) - count++; + if (offset < CHRDEV_MAJOR_HASH_SIZE) { + mutex_lock(&chrdevs_lock); + for (cd = chrdevs[offset]; cd; cd = cd->next) + seq_printf(f, "%3d %s\n", cd->major, cd->name); + mutex_unlock(&chrdevs_lock); } - - return count; } -int get_chrdev_info(void *dev, int *major, char **name) -{ - struct chrdev_info *info = dev; - - if (info->cd == NULL) - return 1; - - *major = info->cd->major; - *name = info->cd->name; - return 0; -} +#endif /* CONFIG_PROC_FS */ /* * Register a single major with a specified minor range. diff --git a/fs/cifs/CHANGES b/fs/cifs/CHANGES index cb68efba35d..8a2de038882 100644 --- a/fs/cifs/CHANGES +++ b/fs/cifs/CHANGES @@ -1,3 +1,21 @@ +Version 1.42 +------------ +Fix slow oplock break when mounted to different servers at the same time and +the tids match and we try to find matching fid on wrong server. + +Version 1.41 +------------ +Fix NTLMv2 security (can be enabled in /proc/fs/cifs) so customers can +configure stronger authentication. Fix sfu symlinks so they can +be followed (not just recognized). Fix wraparound of bcc on +read responses when buffer size over 64K and also fix wrap of +max smb buffer size when CIFSMaxBufSize over 64K. Fix oops in +cifs_user_read and cifs_readpages (when EAGAIN on send of smb +on socket is returned over and over). Add POSIX (advisory) byte range +locking support (requires server with newest CIFS UNIX Extensions +to the protocol implemented). Slow down negprot slightly in port 139 +RFC1001 case to give session_init time on buggy servers. + Version 1.40 ------------ Use fsuid (fsgid) more consistently instead of uid (gid). Improve performance diff --git a/fs/cifs/Makefile b/fs/cifs/Makefile index 7384947a0f9..58c77254a23 100644 --- a/fs/cifs/Makefile +++ b/fs/cifs/Makefile @@ -3,4 +3,4 @@ # obj-$(CONFIG_CIFS) += cifs.o -cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o +cifs-objs := cifsfs.o cifssmb.o cifs_debug.o connect.o dir.o file.o inode.o link.o misc.o netmisc.o smbdes.o smbencrypt.o transport.o asn1.o md4.o md5.o cifs_unicode.o nterr.o xattr.o cifsencrypt.o fcntl.o readdir.o ioctl.o ntlmssp.o diff --git a/fs/cifs/README b/fs/cifs/README index b0070d1b149..b2b4d080376 100644 --- a/fs/cifs/README +++ b/fs/cifs/README @@ -422,6 +422,13 @@ A partial list of the supported mount options follows: nomapchars Do not translate any of these seven characters (default). nocase Request case insensitive path name matching (case sensitive is the default if the server suports it). + posixpaths If CIFS Unix extensions are supported, attempt to + negotiate posix path name support which allows certain + characters forbidden in typical CIFS filenames, without + requiring remapping. (default) + noposixpaths If CIFS Unix extensions are supported, do not request + posix path name support (this may cause servers to + reject creatingfile with certain reserved characters). nobrl Do not send byte range lock requests to the server. This is necessary for certain applications that break with cifs style mandatory byte range locks (and most diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index a2c24858d40..e7d63737e65 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -1,7 +1,7 @@ /* * fs/cifs/cifsencrypt.c * - * Copyright (C) International Business Machines Corp., 2005 + * Copyright (C) International Business Machines Corp., 2005,2006 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -36,7 +36,8 @@ extern void mdfour(unsigned char *out, unsigned char *in, int n); extern void E_md4hash(const unsigned char *passwd, unsigned char *p16); -static int cifs_calculate_signature(const struct smb_hdr * cifs_pdu, const char * key, char * signature) +static int cifs_calculate_signature(const struct smb_hdr * cifs_pdu, + const char * key, char * signature) { struct MD5Context context; @@ -56,9 +57,6 @@ int cifs_sign_smb(struct smb_hdr * cifs_pdu, struct TCP_Server_Info * server, int rc = 0; char smb_signature[20]; - /* BB remember to initialize sequence number elsewhere and initialize mac_signing key elsewhere BB */ - /* BB remember to add code to save expected sequence number in midQ entry BB */ - if((cifs_pdu == NULL) || (server == NULL)) return -EINVAL; @@ -85,20 +83,33 @@ int cifs_sign_smb(struct smb_hdr * cifs_pdu, struct TCP_Server_Info * server, static int cifs_calc_signature2(const struct kvec * iov, int n_vec, const char * key, char * signature) { - struct MD5Context context; - - if((iov == NULL) || (signature == NULL)) - return -EINVAL; + struct MD5Context context; + int i; - MD5Init(&context); - MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16); + if((iov == NULL) || (signature == NULL)) + return -EINVAL; -/* MD5Update(&context,cifs_pdu->Protocol,cifs_pdu->smb_buf_length); */ /* BB FIXME BB */ + MD5Init(&context); + MD5Update(&context,key,CIFS_SESSION_KEY_SIZE+16); + for(i=0;i<n_vec;i++) { + if(iov[i].iov_base == NULL) { + cERROR(1,("null iovec entry")); + return -EIO; + } else if(iov[i].iov_len == 0) + break; /* bail out if we are sent nothing to sign */ + /* The first entry includes a length field (which does not get + signed that occupies the first 4 bytes before the header */ + if(i==0) { + if (iov[0].iov_len <= 8 ) /* cmd field at offset 9 */ + break; /* nothing to sign or corrupt header */ + MD5Update(&context,iov[0].iov_base+4, iov[0].iov_len-4); + } else + MD5Update(&context,iov[i].iov_base, iov[i].iov_len); + } - MD5Final(signature,&context); + MD5Final(signature,&context); - return -EOPNOTSUPP; -/* return 0; */ + return 0; } @@ -259,4 +270,5 @@ void CalcNTLMv2_response(const struct cifsSesInfo * ses,char * v2_session_respon /* hmac_md5_update(v2_session_response+16)client thing,8,&context); */ /* BB fix */ hmac_md5_final(v2_session_response,&context); + cifs_dump_mem("v2_sess_rsp: ", v2_session_response, 32); /* BB removeme BB */ } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 4bbc544857b..d4b713e5aff 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -93,13 +93,10 @@ cifs_read_super(struct super_block *sb, void *data, int rc = 0; sb->s_flags |= MS_NODIRATIME; /* and probably even noatime */ - sb->s_fs_info = kmalloc(sizeof(struct cifs_sb_info),GFP_KERNEL); + sb->s_fs_info = kzalloc(sizeof(struct cifs_sb_info),GFP_KERNEL); cifs_sb = CIFS_SB(sb); if(cifs_sb == NULL) return -ENOMEM; - else - memset(cifs_sb,0,sizeof(struct cifs_sb_info)); - rc = cifs_mount(sb, cifs_sb, data, devname); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 74f405ae4da..4e829dc672a 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -99,5 +99,5 @@ extern ssize_t cifs_getxattr(struct dentry *, const char *, void *, size_t); extern ssize_t cifs_listxattr(struct dentry *, char *, size_t); extern int cifs_ioctl (struct inode * inode, struct file * filep, unsigned int command, unsigned long arg); -#define CIFS_VERSION "1.40" +#define CIFS_VERSION "1.42" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 7bed27601ce..006eb33bff5 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1,7 +1,7 @@ /* * fs/cifs/cifsglob.h * - * Copyright (C) International Business Machines Corp., 2002,2005 + * Copyright (C) International Business Machines Corp., 2002,2006 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -430,6 +430,15 @@ struct dir_notify_req { #define CIFS_LARGE_BUFFER 2 #define CIFS_IOVEC 4 /* array of response buffers */ +/* Type of session setup needed */ +#define CIFS_PLAINTEXT 0 +#define CIFS_LANMAN 1 +#define CIFS_NTLM 2 +#define CIFS_NTLMSSP_NEG 3 +#define CIFS_NTLMSSP_AUTH 4 +#define CIFS_SPNEGO_INIT 5 +#define CIFS_SPNEGO_TARG 6 + /* ***************************************************************** * All constants go here diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index cc2471094ca..b2233ac05bd 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -859,7 +859,10 @@ typedef struct smb_com_lock_req { LOCKING_ANDX_RANGE Locks[1]; } __attribute__((packed)) LOCK_REQ; - +/* lock type */ +#define CIFS_RDLCK 0 +#define CIFS_WRLCK 1 +#define CIFS_UNLCK 2 typedef struct cifs_posix_lock { __le16 lock_type; /* 0 = Read, 1 = Write, 2 = Unlock */ __le16 lock_flags; /* 1 = Wait (only valid for setlock) */ @@ -1786,7 +1789,13 @@ typedef struct { #define CIFS_UNIX_POSIX_ACL_CAP 0x00000002 /* support getfacl/setfacl */ #define CIFS_UNIX_XATTR_CAP 0x00000004 /* support new namespace */ #define CIFS_UNIX_EXTATTR_CAP 0x00000008 /* support chattr/chflag */ -#define CIFS_UNIX_POSIX_PATHNAMES_CAP 0x00000010 /* Use POSIX pathnames on the wire. */ +#define CIFS_UNIX_POSIX_PATHNAMES_CAP 0x00000010 /* Allow POSIX path chars */ +#ifdef CONFIG_CIFS_POSIX +#define CIFS_UNIX_CAP_MASK 0x0000001b +#else +#define CIFS_UNIX_CAP_MASK 0x00000013 +#endif /* CONFIG_CIFS_POSIX */ + #define CIFS_POSIX_EXTENSIONS 0x00000010 /* support for new QFSInfo */ diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 7b25463d3c1..2879ba343ca 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -1,7 +1,7 @@ /* * fs/cifs/cifsproto.h * - * Copyright (c) International Business Machines Corp., 2002,2005 + * Copyright (c) International Business Machines Corp., 2002,2006 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -64,6 +64,14 @@ extern int map_smb_to_linux_error(struct smb_hdr *smb); extern void header_assemble(struct smb_hdr *, char /* command */ , const struct cifsTconInfo *, int /* length of fixed section (word count) in two byte units */); +#ifdef CONFIG_CIFS_EXPERIMENTAL +extern int small_smb_init_no_tc(const int smb_cmd, const int wct, + struct cifsSesInfo *ses, + void ** request_buf); +extern int CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, + const int stage, int * pNTLMv2_flg, + const struct nls_table *nls_cp); +#endif extern __u16 GetNextMid(struct TCP_Server_Info *server); extern struct oplock_q_entry * AllocOplockQEntry(struct inode *, u16, struct cifsTconInfo *); @@ -257,7 +265,10 @@ extern int CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, const __u64 offset, const __u32 numUnlock, const __u32 numLock, const __u8 lockType, const int waitFlag); - +extern int CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, + const __u16 smb_file_id, const int get_flag, + const __u64 len, const __u64 offset, + const __u16 lock_type, const int waitFlag); extern int CIFSSMBTDis(const int xid, struct cifsTconInfo *tcon); extern int CIFSSMBLogoff(const int xid, struct cifsSesInfo *ses); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index a243fe2792d..d705500aa28 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1,7 +1,7 @@ /* * fs/cifs/cifssmb.c * - * Copyright (C) International Business Machines Corp., 2002,2005 + * Copyright (C) International Business Machines Corp., 2002,2006 * Author(s): Steve French (sfrench@us.ibm.com) * * Contains the routines for constructing the SMB PDUs themselves @@ -186,7 +186,35 @@ small_smb_init(int smb_command, int wct, struct cifsTconInfo *tcon, cifs_stats_inc(&tcon->num_smbs_sent); return rc; -} +} + +#ifdef CONFIG_CIFS_EXPERIMENTAL +int +small_smb_init_no_tc(const int smb_command, const int wct, + struct cifsSesInfo *ses, void **request_buf) +{ + int rc; + struct smb_hdr * buffer; + + rc = small_smb_init(smb_command, wct, NULL, request_buf); + if(rc) + return rc; + + buffer = (struct smb_hdr *)*request_buf; + buffer->Mid = GetNextMid(ses->server); + if (ses->capabilities & CAP_UNICODE) + buffer->Flags2 |= SMBFLG2_UNICODE; + if (ses->capabilities & CAP_STATUS32) + buffer->Flags2 |= SMBFLG2_ERR_STATUS; + + /* uid, tid can stay at zero as set in header assemble */ + + /* BB add support for turning on the signing when + this function is used after 1st of session setup requests */ + + return rc; +} +#endif /* CONFIG_CIFS_EXPERIMENTAL */ /* If the return code is zero, this function must fill in request_buf pointer */ static int @@ -1042,7 +1070,7 @@ CIFSSMBRead(const int xid, struct cifsTconInfo *tcon, } } - cifs_small_buf_release(pSMB); +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ if(*buf) { if(resp_buf_type == CIFS_SMALL_BUFFER) cifs_small_buf_release(iov[0].iov_base); @@ -1246,7 +1274,7 @@ CIFSSMBWrite2(const int xid, struct cifsTconInfo *tcon, *nbytes += le16_to_cpu(pSMBr->Count); } - cifs_small_buf_release(pSMB); +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ if(resp_buf_type == CIFS_SMALL_BUFFER) cifs_small_buf_release(iov[0].iov_base); else if(resp_buf_type == CIFS_LARGE_BUFFER) @@ -1325,6 +1353,85 @@ CIFSSMBLock(const int xid, struct cifsTconInfo *tcon, } int +CIFSSMBPosixLock(const int xid, struct cifsTconInfo *tcon, + const __u16 smb_file_id, const int get_flag, const __u64 len, + const __u64 lkoffset, const __u16 lock_type, const int waitFlag) +{ + struct smb_com_transaction2_sfi_req *pSMB = NULL; + struct smb_com_transaction2_sfi_rsp *pSMBr = NULL; + char *data_offset; + struct cifs_posix_lock *parm_data; + int rc = 0; + int bytes_returned = 0; + __u16 params, param_offset, offset, byte_count, count; + + cFYI(1, ("Posix Lock")); + rc = small_smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB); + + if (rc) + return rc; + + pSMBr = (struct smb_com_transaction2_sfi_rsp *)pSMB; + + params = 6; + pSMB->MaxSetupCount = 0; + pSMB->Reserved = 0; + pSMB->Flags = 0; + pSMB->Timeout = 0; + pSMB->Reserved2 = 0; + param_offset = offsetof(struct smb_com_transaction2_sfi_req, Fid) - 4; + offset = param_offset + params; + + data_offset = (char *) (&pSMB->hdr.Protocol) + offset; + + count = sizeof(struct cifs_posix_lock); + pSMB->MaxParameterCount = cpu_to_le16(2); + pSMB->MaxDataCount = cpu_to_le16(1000); /* BB find max SMB PDU from sess */ + pSMB->SetupCount = 1; + pSMB->Reserved3 = 0; + if(get_flag) + pSMB->SubCommand = cpu_to_le16(TRANS2_QUERY_FILE_INFORMATION); + else + pSMB->SubCommand = cpu_to_le16(TRANS2_SET_FILE_INFORMATION); + byte_count = 3 /* pad */ + params + count; + pSMB->DataCount = cpu_to_le16(count); + pSMB->ParameterCount = cpu_to_le16(params); + pSMB->TotalDataCount = pSMB->DataCount; + pSMB->TotalParameterCount = pSMB->ParameterCount; + pSMB->ParameterOffset = cpu_to_le16(param_offset); + parm_data = (struct cifs_posix_lock *) + (((char *) &pSMB->hdr.Protocol) + offset); + + parm_data->lock_type = cpu_to_le16(lock_type); + if(waitFlag) + parm_data->lock_flags = 1; + parm_data->pid = cpu_to_le32(current->tgid); + parm_data->start = lkoffset; + parm_data->length = len; /* normalize negative numbers */ + + pSMB->DataOffset = cpu_to_le16(offset); + pSMB->Fid = smb_file_id; + pSMB->InformationLevel = cpu_to_le16(SMB_SET_POSIX_LOCK); + pSMB->Reserved4 = 0; + pSMB->hdr.smb_buf_length += byte_count; + pSMB->ByteCount = cpu_to_le16(byte_count); + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) { + cFYI(1, ("Send error in Posix Lock = %d", rc)); + } + + if (pSMB) + cifs_small_buf_release(pSMB); + + /* Note: On -EAGAIN error only caller can retry on handle based calls + since file handle passed in no longer valid */ + + return rc; +} + + +int CIFSSMBClose(const int xid, struct cifsTconInfo *tcon, int smb_file_id) { int rc = 0; @@ -2578,7 +2685,7 @@ qsec_out: cifs_small_buf_release(iov[0].iov_base); else if(buf_type == CIFS_LARGE_BUFFER) cifs_buf_release(iov[0].iov_base); - cifs_small_buf_release(pSMB); +/* cifs_small_buf_release(pSMB); */ /* Freed earlier now in SendReceive2 */ return rc; } @@ -2954,7 +3061,8 @@ findFirstRetry: pSMB->TotalParameterCount = cpu_to_le16(params); pSMB->ParameterCount = pSMB->TotalParameterCount; pSMB->ParameterOffset = cpu_to_le16( - offsetof(struct smb_com_transaction2_ffirst_req, SearchAttributes) - 4); + offsetof(struct smb_com_transaction2_ffirst_req, SearchAttributes) + - 4); pSMB->DataCount = 0; pSMB->DataOffset = 0; pSMB->SetupCount = 1; /* one byte, no need to make endian neutral */ @@ -2977,12 +3085,12 @@ findFirstRetry: (struct smb_hdr *) pSMBr, &bytes_returned, 0); cifs_stats_inc(&tcon->num_ffirst); - if (rc) {/* BB add logic to retry regular search if Unix search rejected unexpectedly by server */ + if (rc) {/* BB add logic to retry regular search if Unix search + rejected unexpectedly by server */ /* BB Add code to handle unsupported level rc */ cFYI(1, ("Error in FindFirst = %d", rc)); - if (pSMB) - cifs_buf_release(pSMB); + cifs_buf_release(pSMB); /* BB eventually could optimize out free and realloc of buf */ /* for this case */ @@ -2998,6 +3106,7 @@ findFirstRetry: psrch_inf->unicode = FALSE; psrch_inf->ntwrk_buf_start = (char *)pSMBr; + psrch_inf->smallBuf = 0; psrch_inf->srch_entries_start = (char *) &pSMBr->hdr.Protocol + le16_to_cpu(pSMBr->t2.DataOffset); @@ -3118,9 +3227,14 @@ int CIFSFindNext(const int xid, struct cifsTconInfo *tcon, parms = (T2_FNEXT_RSP_PARMS *)response_data; response_data = (char *)&pSMBr->hdr.Protocol + le16_to_cpu(pSMBr->t2.DataOffset); - cifs_buf_release(psrch_inf->ntwrk_buf_start); + if(psrch_inf->smallBuf) + cifs_small_buf_release( + psrch_inf->ntwrk_buf_start); + else + cifs_buf_release(psrch_inf->ntwrk_buf_start); psrch_inf->srch_entries_start = response_data; psrch_inf->ntwrk_buf_start = (char *)pSMB; + psrch_inf->smallBuf = 0; if(parms->EndofSearch) psrch_inf->endOfSearch = TRUE; else @@ -3834,6 +3948,7 @@ CIFSSMBSetFSUnixInfo(const int xid, struct cifsTconInfo *tcon, __u64 cap) cFYI(1, ("In SETFSUnixInfo")); SETFSUnixRetry: + /* BB switch to small buf init to save memory */ rc = smb_init(SMB_COM_TRANSACTION2, 15, tcon, (void **) &pSMB, (void **) &pSMBr); if (rc) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 2a0c1f4ca0a..0b86d5ca901 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -1,7 +1,7 @@ /* * fs/cifs/connect.c * - * Copyright (C) International Business Machines Corp., 2002,2005 + * Copyright (C) International Business Machines Corp., 2002,2006 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify @@ -564,7 +564,7 @@ cifs_demultiplex_thread(struct TCP_Server_Info *server) dump_smb(smb_buffer, length); - if (checkSMB (smb_buffer, smb_buffer->Mid, total_read+4)) { + if (checkSMB(smb_buffer, smb_buffer->Mid, total_read+4)) { cifs_dump_mem("Bad SMB: ", smb_buffer, 48); continue; } @@ -1476,6 +1476,14 @@ ipv4_connect(struct sockaddr_in *psin_server, struct socket **csocket, rc = smb_send(*csocket, smb_buf, 0x44, (struct sockaddr *)psin_server); kfree(ses_init_buf); + msleep(1); /* RFC1001 layer in at least one server + requires very short break before negprot + presumably because not expecting negprot + to follow so fast. This is a simple + solution that works without + complicating the code and causes no + significant slowing down on mount + for everyone else */ } /* else the negprot may still work without this even though malloc failed */ @@ -1920,27 +1928,34 @@ cifs_mount(struct super_block *sb, struct cifs_sb_info *cifs_sb, cifs_sb->tcon = tcon; tcon->ses = pSesInfo; - /* do not care if following two calls succeed - informational only */ + /* do not care if following two calls succeed - informational */ CIFSSMBQFSDeviceInfo(xid, tcon); CIFSSMBQFSAttributeInfo(xid, tcon); + if (tcon->ses->capabilities & CAP_UNIX) { if(!CIFSSMBQFSUnixInfo(xid, tcon)) { - if(!volume_info.no_psx_acl) { - if(CIFS_UNIX_POSIX_ACL_CAP & - le64_to_cpu(tcon->fsUnixInfo.Capability)) - cFYI(1,("server negotiated posix acl support")); - sb->s_flags |= MS_POSIXACL; + __u64 cap = + le64_to_cpu(tcon->fsUnixInfo.Capability); + cap &= CIFS_UNIX_CAP_MASK; + if(volume_info.no_psx_acl) + cap &= ~CIFS_UNIX_POSIX_ACL_CAP; + else if(CIFS_UNIX_POSIX_ACL_CAP & cap) { + cFYI(1,("negotiated posix acl support")); + sb->s_flags |= MS_POSIXACL; } - /* Try and negotiate POSIX pathnames if we can. */ - if (volume_info.posix_paths && (CIFS_UNIX_POSIX_PATHNAMES_CAP & - le64_to_cpu(tcon->fsUnixInfo.Capability))) { - if (!CIFSSMBSetFSUnixInfo(xid, tcon, CIFS_UNIX_POSIX_PATHNAMES_CAP)) { - cFYI(1,("negotiated posix pathnames support")); - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_POSIX_PATHS; - } else { - cFYI(1,("posix pathnames support requested but not supported")); - } + if(volume_info.posix_paths == 0) + cap &= ~CIFS_UNIX_POSIX_PATHNAMES_CAP; + else if(cap & CIFS_UNIX_POSIX_PATHNAMES_CAP) { + cFYI(1,("negotiate posix pathnames")); + cifs_sb->mnt_cifs_flags |= + CIFS_MOUNT_POSIX_PATHS; + } + + cFYI(1,("Negotiate caps 0x%x",(int)cap)); + + if (CIFSSMBSetFSUnixInfo(xid, tcon, cap)) { + cFYI(1,("setting capabilities failed")); } } } @@ -2278,6 +2293,8 @@ CIFSSpnegoSessSetup(unsigned int xid, struct cifsSesInfo *ses, smb_buffer->Mid = GetNextMid(ses->server); pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; pSMB->req.AndXCommand = 0xFF; + if(ses->server->maxBuf > 64*1024) + ses->server->maxBuf = (64*1023); pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); @@ -2525,7 +2542,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, __u32 negotiate_flags, capabilities; __u16 count; - cFYI(1, ("In NTLMSSP sesssetup (negotiate) ")); + cFYI(1, ("In NTLMSSP sesssetup (negotiate)")); if(ses == NULL) return -EINVAL; domain = ses->domainName; @@ -2575,7 +2592,8 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, SecurityBlob->MessageType = NtLmNegotiate; negotiate_flags = NTLMSSP_NEGOTIATE_UNICODE | NTLMSSP_NEGOTIATE_OEM | - NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_NTLM | 0x80000000 | + NTLMSSP_REQUEST_TARGET | NTLMSSP_NEGOTIATE_NTLM | + NTLMSSP_NEGOTIATE_56 | /* NTLMSSP_NEGOTIATE_ALWAYS_SIGN | */ NTLMSSP_NEGOTIATE_128; if(sign_CIFS_PDUs) negotiate_flags |= NTLMSSP_NEGOTIATE_SIGN; @@ -2588,26 +2606,11 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, SecurityBlob->WorkstationName.Length = 0; SecurityBlob->WorkstationName.MaximumLength = 0; - if (domain == NULL) { - SecurityBlob->DomainName.Buffer = 0; - SecurityBlob->DomainName.Length = 0; - SecurityBlob->DomainName.MaximumLength = 0; - } else { - __u16 len; - negotiate_flags |= NTLMSSP_NEGOTIATE_DOMAIN_SUPPLIED; - strncpy(bcc_ptr, domain, 63); - len = strnlen(domain, 64); - SecurityBlob->DomainName.MaximumLength = - cpu_to_le16(len); - SecurityBlob->DomainName.Buffer = - cpu_to_le32((long) &SecurityBlob-> - DomainString - - (long) &SecurityBlob->Signature); - bcc_ptr += len; - SecurityBlobLength += len; - SecurityBlob->DomainName.Length = - cpu_to_le16(len); - } + /* Domain not sent on first Sesssetup in NTLMSSP, instead it is sent + along with username on auth request (ie the response to challenge) */ + SecurityBlob->DomainName.Buffer = 0; + SecurityBlob->DomainName.Length = 0; + SecurityBlob->DomainName.MaximumLength = 0; if (ses->capabilities & CAP_UNICODE) { if ((long) bcc_ptr % 2) { *bcc_ptr = 0; @@ -2677,7 +2680,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, SecurityBlob2->MessageType)); } else if (ses) { ses->Suid = smb_buffer_response->Uid; /* UID left in le format */ - cFYI(1, ("UID = %d ", ses->Suid)); + cFYI(1, ("UID = %d", ses->Suid)); if ((pSMBr->resp.hdr.WordCount == 3) || ((pSMBr->resp.hdr.WordCount == 4) && (blob_len < @@ -2685,17 +2688,17 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, if (pSMBr->resp.hdr.WordCount == 4) { bcc_ptr += blob_len; - cFYI(1, - ("Security Blob Length %d ", + cFYI(1, ("Security Blob Length %d", blob_len)); } - cFYI(1, ("NTLMSSP Challenge rcvd ")); + cFYI(1, ("NTLMSSP Challenge rcvd")); memcpy(ses->server->cryptKey, SecurityBlob2->Challenge, CIFS_CRYPTO_KEY_SIZE); - if(SecurityBlob2->NegotiateFlags & cpu_to_le32(NTLMSSP_NEGOTIATE_NTLMV2)) + if(SecurityBlob2->NegotiateFlags & + cpu_to_le32(NTLMSSP_NEGOTIATE_NTLMV2)) *pNTLMv2_flag = TRUE; if((SecurityBlob2->NegotiateFlags & @@ -2818,7 +2821,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, bcc_ptr++; } else cFYI(1, - ("Variable field of length %d extends beyond end of smb ", + ("Variable field of length %d extends beyond end of smb", len)); } } else { @@ -2830,7 +2833,7 @@ CIFSNTLMSSPNegotiateSessSetup(unsigned int xid, } } else { cERROR(1, - (" Invalid Word count %d: ", + (" Invalid Word count %d:", smb_buffer_response->WordCount)); rc = -EIO; } @@ -3447,7 +3450,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == NTLMSSP)) { - cFYI(1, ("New style sesssetup ")); + cFYI(1, ("New style sesssetup")); rc = CIFSSpnegoSessSetup(xid, pSesInfo, NULL /* security blob */, 0 /* blob length */, @@ -3455,7 +3458,7 @@ int cifs_setup_session(unsigned int xid, struct cifsSesInfo *pSesInfo, } else if (extended_security && (pSesInfo->capabilities & CAP_EXTENDED_SECURITY) && (pSesInfo->server->secType == RawNTLMSSP)) { - cFYI(1, ("NTLMSSP sesssetup ")); + cFYI(1, ("NTLMSSP sesssetup")); rc = CIFSNTLMSSPNegotiateSessSetup(xid, pSesInfo, &ntlmv2_flag, diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 632561dd9c5..1d0ca3eaaca 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -48,13 +48,14 @@ build_path_from_dentry(struct dentry *direntry) struct dentry *temp; int namelen = 0; char *full_path; - char dirsep = CIFS_DIR_SEP(CIFS_SB(direntry->d_sb)); + char dirsep; if(direntry == NULL) return NULL; /* not much we can do if dentry is freed and we need to reopen the file after it was closed implicitly when the server crashed */ + dirsep = CIFS_DIR_SEP(CIFS_SB(direntry->d_sb)); cifs_bp_rename_retry: for (temp = direntry; !IS_ROOT(temp);) { namelen += (1 + temp->d_name.len); @@ -255,12 +256,10 @@ cifs_create(struct inode *inode, struct dentry *direntry, int mode, CIFSSMBClose(xid, pTcon, fileHandle); } else if(newinode) { pCifsFile = - kmalloc(sizeof (struct cifsFileInfo), GFP_KERNEL); + kzalloc(sizeof (struct cifsFileInfo), GFP_KERNEL); if(pCifsFile == NULL) goto cifs_create_out; - memset((char *)pCifsFile, 0, - sizeof (struct cifsFileInfo)); pCifsFile->netfid = fileHandle; pCifsFile->pid = current->tgid; pCifsFile->pInode = newinode; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index fb49aef1f2e..5c497c52977 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -555,7 +555,10 @@ int cifs_closedir(struct inode *inode, struct file *file) if (ptmp) { cFYI(1, ("closedir free smb buf in srch struct")); pCFileStruct->srch_inf.ntwrk_buf_start = NULL; - cifs_buf_release(ptmp); + if(pCFileStruct->srch_inf.smallBuf) + cifs_small_buf_release(ptmp); + else + cifs_buf_release(ptmp); } ptmp = pCFileStruct->search_resume_name; if (ptmp) { @@ -574,13 +577,14 @@ int cifs_closedir(struct inode *inode, struct file *file) int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) { int rc, xid; - __u32 lockType = LOCKING_ANDX_LARGE_FILES; __u32 numLock = 0; __u32 numUnlock = 0; __u64 length; int wait_flag = FALSE; struct cifs_sb_info *cifs_sb; struct cifsTconInfo *pTcon; + __u16 netfid; + __u8 lockType = LOCKING_ANDX_LARGE_FILES; length = 1 + pfLock->fl_end - pfLock->fl_start; rc = -EACCES; @@ -592,11 +596,11 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) pfLock->fl_end)); if (pfLock->fl_flags & FL_POSIX) - cFYI(1, ("Posix ")); + cFYI(1, ("Posix")); if (pfLock->fl_flags & FL_FLOCK) - cFYI(1, ("Flock ")); + cFYI(1, ("Flock")); if (pfLock->fl_flags & FL_SLEEP) { - cFYI(1, ("Blocking lock ")); + cFYI(1, ("Blocking lock")); wait_flag = TRUE; } if (pfLock->fl_flags & FL_ACCESS) @@ -612,21 +616,23 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) cFYI(1, ("F_WRLCK ")); numLock = 1; } else if (pfLock->fl_type == F_UNLCK) { - cFYI(1, ("F_UNLCK ")); + cFYI(1, ("F_UNLCK")); numUnlock = 1; + /* Check if unlock includes more than + one lock range */ } else if (pfLock->fl_type == F_RDLCK) { - cFYI(1, ("F_RDLCK ")); + cFYI(1, ("F_RDLCK")); lockType |= LOCKING_ANDX_SHARED_LOCK; numLock = 1; } else if (pfLock->fl_type == F_EXLCK) { - cFYI(1, ("F_EXLCK ")); + cFYI(1, ("F_EXLCK")); numLock = 1; } else if (pfLock->fl_type == F_SHLCK) { - cFYI(1, ("F_SHLCK ")); + cFYI(1, ("F_SHLCK")); lockType |= LOCKING_ANDX_SHARED_LOCK; numLock = 1; } else - cFYI(1, ("Unknown type of lock ")); + cFYI(1, ("Unknown type of lock")); cifs_sb = CIFS_SB(file->f_dentry->d_sb); pTcon = cifs_sb->tcon; @@ -635,27 +641,41 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) FreeXid(xid); return -EBADF; } + netfid = ((struct cifsFileInfo *)file->private_data)->netfid; + + /* BB add code here to normalize offset and length to + account for negative length which we can not accept over the + wire */ if (IS_GETLK(cmd)) { - rc = CIFSSMBLock(xid, pTcon, - ((struct cifsFileInfo *)file-> - private_data)->netfid, - length, - pfLock->fl_start, 0, 1, lockType, - 0 /* wait flag */ ); + if(experimEnabled && + (cifs_sb->tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & + le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability))) { + int posix_lock_type; + if(lockType & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + rc = CIFSSMBPosixLock(xid, pTcon, netfid, 1 /* get */, + length, pfLock->fl_start, + posix_lock_type, wait_flag); + FreeXid(xid); + return rc; + } + + /* BB we could chain these into one lock request BB */ + rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start, + 0, 1, lockType, 0 /* wait flag */ ); if (rc == 0) { - rc = CIFSSMBLock(xid, pTcon, - ((struct cifsFileInfo *) file-> - private_data)->netfid, - length, + rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start, 1 /* numUnlock */ , 0 /* numLock */ , lockType, 0 /* wait flag */ ); pfLock->fl_type = F_UNLCK; if (rc != 0) cERROR(1, ("Error unlocking previously locked " - "range %d during test of lock ", - rc)); + "range %d during test of lock", rc)); rc = 0; } else { @@ -667,12 +687,30 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *pfLock) FreeXid(xid); return rc; } - - rc = CIFSSMBLock(xid, pTcon, - ((struct cifsFileInfo *) file->private_data)-> - netfid, length, - pfLock->fl_start, numUnlock, numLock, lockType, - wait_flag); + if (experimEnabled && + (cifs_sb->tcon->ses->capabilities & CAP_UNIX) && + (CIFS_UNIX_FCNTL_CAP & + le64_to_cpu(cifs_sb->tcon->fsUnixInfo.Capability))) { + int posix_lock_type; + if(lockType & LOCKING_ANDX_SHARED_LOCK) + posix_lock_type = CIFS_RDLCK; + else + posix_lock_type = CIFS_WRLCK; + + if(numUnlock == 1) + posix_lock_type = CIFS_UNLCK; + else if(numLock == 0) { + /* if no lock or unlock then nothing + to do since we do not know what it is */ + FreeXid(xid); + return -EOPNOTSUPP; + } + rc = CIFSSMBPosixLock(xid, pTcon, netfid, 0 /* set */, + length, pfLock->fl_start, + posix_lock_type, wait_flag); + } else + rc = CIFSSMBLock(xid, pTcon, netfid, length, pfLock->fl_start, + numUnlock, numLock, lockType, wait_flag); if (pfLock->fl_flags & FL_POSIX) posix_lock_file_wait(file, pfLock); FreeXid(xid); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 598eec9778f..957ddd1571c 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -565,11 +565,14 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry) struct cifsInodeInfo *cifsInode; FILE_BASIC_INFO *pinfo_buf; - cFYI(1, ("cifs_unlink, inode = 0x%p with ", inode)); + cFYI(1, ("cifs_unlink, inode = 0x%p", inode)); xid = GetXid(); - cifs_sb = CIFS_SB(inode->i_sb); + if(inode) + cifs_sb = CIFS_SB(inode->i_sb); + else + cifs_sb = CIFS_SB(direntry->d_sb); pTcon = cifs_sb->tcon; /* Unlink can be called from rename so we can not grab the sem here @@ -609,9 +612,8 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry) } } else if (rc == -EACCES) { /* try only if r/o attribute set in local lookup data? */ - pinfo_buf = kmalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL); + pinfo_buf = kzalloc(sizeof(FILE_BASIC_INFO), GFP_KERNEL); if (pinfo_buf) { - memset(pinfo_buf, 0, sizeof(FILE_BASIC_INFO)); /* ATTRS set to normal clears r/o bit */ pinfo_buf->Attributes = cpu_to_le32(ATTR_NORMAL); if (!(pTcon->ses->flags & CIFS_SES_NT4)) @@ -693,9 +695,11 @@ int cifs_unlink(struct inode *inode, struct dentry *direntry) when needed */ direntry->d_inode->i_ctime = current_fs_time(inode->i_sb); } - inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); - cifsInode = CIFS_I(inode); - cifsInode->time = 0; /* force revalidate of dir as well */ + if(inode) { + inode->i_ctime = inode->i_mtime = current_fs_time(inode->i_sb); + cifsInode = CIFS_I(inode); + cifsInode->time = 0; /* force revalidate of dir as well */ + } kfree(full_path); FreeXid(xid); @@ -1167,7 +1171,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) nfid, npid, FALSE); atomic_dec(&open_file->wrtPending); cFYI(1,("SetFSize for attrs rc = %d", rc)); - if(rc == -EINVAL) { + if((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { int bytes_written; rc = CIFSSMBWrite(xid, pTcon, nfid, 0, attrs->ia_size, @@ -1189,7 +1193,7 @@ int cifs_setattr(struct dentry *direntry, struct iattr *attrs) cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); cFYI(1, ("SetEOF by path (setattrs) rc = %d", rc)); - if(rc == -EINVAL) { + if((rc == -EINVAL) || (rc == -EOPNOTSUPP)) { __u16 netfid; int oplock = FALSE; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 8d0da7c87c7..9562f5bba65 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -67,7 +67,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, cifs_sb_target->local_nls, cifs_sb_target->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); - if(rc == -EIO) + if((rc == -EIO) || (rc == -EINVAL)) rc = -EOPNOTSUPP; } diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 432ba15e2c2..fafd056426e 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -72,10 +72,9 @@ sesInfoAlloc(void) struct cifsSesInfo *ret_buf; ret_buf = - (struct cifsSesInfo *) kmalloc(sizeof (struct cifsSesInfo), + (struct cifsSesInfo *) kzalloc(sizeof (struct cifsSesInfo), GFP_KERNEL); if (ret_buf) { - memset(ret_buf, 0, sizeof (struct cifsSesInfo)); write_lock(&GlobalSMBSeslock); atomic_inc(&sesInfoAllocCount); ret_buf->status = CifsNew; @@ -110,10 +109,9 @@ tconInfoAlloc(void) { struct cifsTconInfo *ret_buf; ret_buf = - (struct cifsTconInfo *) kmalloc(sizeof (struct cifsTconInfo), + (struct cifsTconInfo *) kzalloc(sizeof (struct cifsTconInfo), GFP_KERNEL); if (ret_buf) { - memset(ret_buf, 0, sizeof (struct cifsTconInfo)); write_lock(&GlobalSMBSeslock); atomic_inc(&tconInfoAllocCount); list_add(&ret_buf->cifsConnectionList, @@ -423,9 +421,7 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length) { __u32 len = smb->smb_buf_length; __u32 clc_len; /* calculated length */ - cFYI(0, - ("Entering checkSMB with Length: %x, smb_buf_length: %x", - length, len)); + cFYI(0, ("checkSMB Length: 0x%x, smb_buf_length: 0x%x", length, len)); if (((unsigned int)length < 2 + sizeof (struct smb_hdr)) || (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4)) { if ((unsigned int)length < 2 + sizeof (struct smb_hdr)) { @@ -433,29 +429,36 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length) sizeof (struct smb_hdr) - 1) && (smb->Status.CifsError != 0)) { smb->WordCount = 0; - return 0; /* some error cases do not return wct and bcc */ + /* some error cases do not return wct and bcc */ + return 0; } else { cERROR(1, ("Length less than smb header size")); } - } if (len > CIFSMaxBufSize + MAX_CIFS_HDR_SIZE - 4) - cERROR(1, - ("smb_buf_length greater than MaxBufSize")); - cERROR(1, - ("bad smb detected. Illegal length. mid=%d", - smb->Mid)); + cERROR(1, ("smb length greater than MaxBufSize, mid=%d", + smb->Mid)); return 1; } if (checkSMBhdr(smb, mid)) return 1; clc_len = smbCalcSize_LE(smb); - if ((4 + len != clc_len) - || (4 + len != (unsigned int)length)) { - cERROR(1, ("Calculated size 0x%x vs actual length 0x%x", - clc_len, 4 + len)); - cERROR(1, ("bad smb size detected for Mid=%d", smb->Mid)); + + if(4 + len != (unsigned int)length) { + cERROR(1, ("Length read does not match RFC1001 length %d",len)); + return 1; + } + + if (4 + len != clc_len) { + /* check if bcc wrapped around for large read responses */ + if((len > 64 * 1024) && (len > clc_len)) { + /* check if lengths match mod 64K */ + if(((4 + len) & 0xFFFF) == (clc_len & 0xFFFF)) + return 0; /* bcc wrapped */ + } + cFYI(1, ("Calculated size %d vs length %d mismatch for mid %d", + clc_len, 4 + len, smb->Mid)); /* Windows XP can return a few bytes too much, presumably an illegal pad, at the end of byte range lock responses so we allow for that three byte pad, as long as actual @@ -469,8 +472,11 @@ checkSMB(struct smb_hdr *smb, __u16 mid, int length) wct and bcc to minimum size and drop the t2 parms and data */ if((4+len > clc_len) && (len <= clc_len + 512)) return 0; - else + else { + cERROR(1, ("RFC1001 size %d bigger than SMB for Mid=%d", + len, smb->Mid)); return 1; + } } return 0; } diff --git a/fs/cifs/ntlmssp.c b/fs/cifs/ntlmssp.c new file mode 100644 index 00000000000..78866f92574 --- /dev/null +++ b/fs/cifs/ntlmssp.c @@ -0,0 +1,129 @@ +/* + * fs/cifs/ntlmssp.h + * + * Copyright (c) International Business Machines Corp., 2006 + * Author(s): Steve French (sfrench@us.ibm.com) + * + * This library is free software; you can redistribute it and/or modify + * it under the terms of the GNU Lesser General Public License as published + * by the Free Software Foundation; either version 2.1 of the License, or + * (at your option) any later version. + * + * This library is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See + * the GNU Lesser General Public License for more details. + * + * You should have received a copy of the GNU Lesser General Public License + * along with this library; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA + */ + +#include "cifspdu.h" +#include "cifsglob.h" +#include "cifsproto.h" +#include "cifs_unicode.h" +#include "cifs_debug.h" +#include "ntlmssp.h" +#include "nterr.h" + +#ifdef CONFIG_CIFS_EXPERIMENTAL +static __u32 cifs_ssetup_hdr(struct cifsSesInfo *ses, SESSION_SETUP_ANDX *pSMB) +{ + __u32 capabilities = 0; + + /* init fields common to all four types of SessSetup */ + /* note that header is initialized to zero in header_assemble */ + pSMB->req.AndXCommand = 0xFF; + pSMB->req.MaxBufferSize = cpu_to_le16(ses->server->maxBuf); + pSMB->req.MaxMpxCount = cpu_to_le16(ses->server->maxReq); + + /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ + + /* BB verify whether signing required on neg or just on auth frame + (and NTLM case) */ + + capabilities = CAP_LARGE_FILES | CAP_NT_SMBS | CAP_LEVEL_II_OPLOCKS | + CAP_LARGE_WRITE_X | CAP_LARGE_READ_X; + + if(ses->server->secMode & (SECMODE_SIGN_REQUIRED | SECMODE_SIGN_ENABLED)) + pSMB->req.hdr.Flags2 |= SMBFLG2_SECURITY_SIGNATURE; + + if (ses->capabilities & CAP_UNICODE) { + pSMB->req.hdr.Flags2 |= SMBFLG2_UNICODE; + capabilities |= CAP_UNICODE; + } + if (ses->capabilities & CAP_STATUS32) { + pSMB->req.hdr.Flags2 |= SMBFLG2_ERR_STATUS; + capabilities |= CAP_STATUS32; + } + if (ses->capabilities & CAP_DFS) { + pSMB->req.hdr.Flags2 |= SMBFLG2_DFS; + capabilities |= CAP_DFS; + } + + /* BB check whether to init vcnum BB */ + return capabilities; +} +int +CIFS_SessSetup(unsigned int xid, struct cifsSesInfo *ses, const int type, + int * pNTLMv2_flg, const struct nls_table *nls_cp) +{ + int rc = 0; + int wct; + struct smb_hdr *smb_buffer; + char *bcc_ptr; + SESSION_SETUP_ANDX *pSMB; + __u32 capabilities; + + if(ses == NULL) + return -EINVAL; + + cFYI(1,("SStp type: %d",type)); + if(type < CIFS_NTLM) { +#ifndef CONFIG_CIFS_WEAK_PW_HASH + /* LANMAN and plaintext are less secure and off by default. + So we make this explicitly be turned on in kconfig (in the + build) and turned on at runtime (changed from the default) + in proc/fs/cifs or via mount parm. Unfortunately this is + needed for old Win (e.g. Win95), some obscure NAS and OS/2 */ + return -EOPNOTSUPP; +#endif + wct = 10; /* lanman 2 style sessionsetup */ + } else if(type < CIFS_NTLMSSP_NEG) + wct = 13; /* old style NTLM sessionsetup */ + else /* same size for negotiate or auth, NTLMSSP or extended security */ + wct = 12; + + rc = small_smb_init_no_tc(SMB_COM_SESSION_SETUP_ANDX, wct, ses, + (void **)&smb_buffer); + if(rc) + return rc; + + pSMB = (SESSION_SETUP_ANDX *)smb_buffer; + + capabilities = cifs_ssetup_hdr(ses, pSMB); + bcc_ptr = pByteArea(smb_buffer); + if(type > CIFS_NTLM) { + pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; + capabilities |= CAP_EXTENDED_SECURITY; + pSMB->req.Capabilities = cpu_to_le32(capabilities); + /* BB set password lengths */ + } else if(type < CIFS_NTLM) /* lanman */ { + /* no capabilities flags in old lanman negotiation */ + /* pSMB->old_req.PasswordLength = */ /* BB fixme BB */ + } else /* type CIFS_NTLM */ { + pSMB->req_no_secext.Capabilities = cpu_to_le32(capabilities); + pSMB->req_no_secext.CaseInsensitivePasswordLength = + cpu_to_le16(CIFS_SESSION_KEY_SIZE); + pSMB->req_no_secext.CaseSensitivePasswordLength = + cpu_to_le16(CIFS_SESSION_KEY_SIZE); + } + + +/* rc = SendReceive2(xid, ses, iov, num_iovecs, &resp_buf_type, 0); */ + /* SMB request buf freed in SendReceive2 */ + + return rc; +} +#endif /* CONFIG_CIFS_EXPERIMENTAL */ diff --git a/fs/cifs/ntlmssp.h b/fs/cifs/ntlmssp.h index 803389b64a2..d39b712a11c 100644 --- a/fs/cifs/ntlmssp.h +++ b/fs/cifs/ntlmssp.h @@ -1,7 +1,7 @@ /* * fs/cifs/ntlmssp.h * - * Copyright (c) International Business Machines Corp., 2002 + * Copyright (c) International Business Machines Corp., 2002,2006 * Author(s): Steve French (sfrench@us.ibm.com) * * This library is free software; you can redistribute it and/or modify diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 488bd0d81dc..2f6e2825571 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -604,7 +604,12 @@ static int find_cifs_entry(const int xid, struct cifsTconInfo *pTcon, cifsFile->search_resume_name = NULL; if(cifsFile->srch_inf.ntwrk_buf_start) { cFYI(1,("freeing SMB ff cache buf on search rewind")); - cifs_buf_release(cifsFile->srch_inf.ntwrk_buf_start); + if(cifsFile->srch_inf.smallBuf) + cifs_small_buf_release(cifsFile->srch_inf. + ntwrk_buf_start); + else + cifs_buf_release(cifsFile->srch_inf. + ntwrk_buf_start); } rc = initiate_cifs_search(xid,file); if(rc) { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index b12cb8a7da7..3da80409466 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -309,17 +309,16 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, *pRespBufType = CIFS_NO_BUFFER; /* no response buf yet */ - if (ses == NULL) { - cERROR(1,("Null smb session")); - return -EIO; - } - if(ses->server == NULL) { - cERROR(1,("Null tcp session")); + if ((ses == NULL) || (ses->server == NULL)) { + cifs_small_buf_release(in_buf); + cERROR(1,("Null session")); return -EIO; } - if(ses->server->tcpStatus == CifsExiting) + if(ses->server->tcpStatus == CifsExiting) { + cifs_small_buf_release(in_buf); return -ENOENT; + } /* Ensure that we do not send more than 50 overlapping requests to the same server. We may make this configurable later or @@ -346,6 +345,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, } else { if(ses->server->tcpStatus == CifsExiting) { spin_unlock(&GlobalMid_Lock); + cifs_small_buf_release(in_buf); return -ENOENT; } @@ -385,6 +385,7 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, midQ = AllocMidQEntry(in_buf, ses); if (midQ == NULL) { up(&ses->server->tcpSem); + cifs_small_buf_release(in_buf); /* If not lock req, update # of requests on wire to server */ if(long_op < 3) { atomic_dec(&ses->server->inFlight); @@ -408,14 +409,18 @@ SendReceive2(const unsigned int xid, struct cifsSesInfo *ses, if(rc < 0) { DeleteMidQEntry(midQ); up(&ses->server->tcpSem); + cifs_small_buf_release(in_buf); /* If not lock req, update # of requests on wire to server */ if(long_op < 3) { atomic_dec(&ses->server->inFlight); wake_up(&ses->server->request_q); } return rc; - } else + } else { up(&ses->server->tcpSem); + cifs_small_buf_release(in_buf); + } + if (long_op == -1) goto cifs_no_response_exit2; else if (long_op == 2) /* writes past end of file can take loong time */ @@ -543,6 +548,7 @@ cifs_no_response_exit2: out_unlock2: up(&ses->server->tcpSem); + cifs_small_buf_release(in_buf); /* If not lock req, update # of requests on wire to server */ if(long_op < 3) { atomic_dec(&ses->server->inFlight); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 8ed9b06a982..5638c8f9362 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -504,7 +504,7 @@ static int populate_groups(struct config_group *group) int ret = 0; int i; - if (group && group->default_groups) { + if (group->default_groups) { /* FYI, we're faking mkdir here * I'm not sure we need this semaphore, as we're called * from our parent's mkdir. That holds our parent's diff --git a/fs/dcache.c b/fs/dcache.c index 19458d39950..940d188e5d1 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1101,6 +1101,32 @@ next: } /** + * d_hash_and_lookup - hash the qstr then search for a dentry + * @dir: Directory to search in + * @name: qstr of name we wish to find + * + * On hash failure or on lookup failure NULL is returned. + */ +struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) +{ + struct dentry *dentry = NULL; + + /* + * Check for a fs-specific hash function. Note that we must + * calculate the standard hash first, as the d_op->d_hash() + * routine may choose to leave the hash value unchanged. + */ + name->hash = full_name_hash(name->name, name->len); + if (dir->d_op && dir->d_op->d_hash) { + if (dir->d_op->d_hash(dir, name) < 0) + goto out; + } + dentry = d_lookup(dir, name); +out: + return dentry; +} + +/** * d_validate - verify dentry provided from insecure source * @dentry: The dentry alleged to be valid child of @dparent * @dparent: The parent dentry (known to be valid) @@ -1172,11 +1198,11 @@ void d_delete(struct dentry * dentry) spin_lock(&dentry->d_lock); isdir = S_ISDIR(dentry->d_inode->i_mode); if (atomic_read(&dentry->d_count) == 1) { - /* remove this and other inotify debug checks after 2.6.18 */ - dentry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED; - dentry_iput(dentry); fsnotify_nameremove(dentry, isdir); + + /* remove this and other inotify debug checks after 2.6.18 */ + dentry->d_flags &= ~DCACHE_INOTIFY_PARENT_WATCHED; return; } @@ -1616,26 +1642,12 @@ ino_t find_inode_number(struct dentry *dir, struct qstr *name) struct dentry * dentry; ino_t ino = 0; - /* - * Check for a fs-specific hash function. Note that we must - * calculate the standard hash first, as the d_op->d_hash() - * routine may choose to leave the hash value unchanged. - */ - name->hash = full_name_hash(name->name, name->len); - if (dir->d_op && dir->d_op->d_hash) - { - if (dir->d_op->d_hash(dir, name) != 0) - goto out; - } - - dentry = d_lookup(dir, name); - if (dentry) - { + dentry = d_hash_and_lookup(dir, name); + if (dentry) { if (dentry->d_inode) ino = dentry->d_inode->i_ino; dput(dentry); } -out: return ino; } diff --git a/fs/direct-io.c b/fs/direct-io.c index 9d1d2aa73e4..b05d1b21877 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -524,8 +524,6 @@ static int get_more_blocks(struct dio *dio) */ ret = dio->page_errors; if (ret == 0) { - map_bh->b_state = 0; - map_bh->b_size = 0; BUG_ON(dio->block_in_file >= dio->final_block_in_request); fs_startblk = dio->block_in_file >> dio->blkfactor; dio_count = dio->final_block_in_request - dio->block_in_file; @@ -534,6 +532,9 @@ static int get_more_blocks(struct dio *dio) if (dio_count & blkmask) fs_count++; + map_bh->b_state = 0; + map_bh->b_size = fs_count << dio->inode->i_blkbits; + create = dio->rw == WRITE; if (dio->lock_type == DIO_LOCKING) { if (dio->block_in_file < (i_size_read(dio->inode) >> @@ -542,13 +543,13 @@ static int get_more_blocks(struct dio *dio) } else if (dio->lock_type == DIO_NO_LOCKING) { create = 0; } + /* * For writes inside i_size we forbid block creations: only * overwrites are permitted. We fall back to buffered writes * at a higher level for inside-i_size block-instantiating * writes. */ - map_bh->b_size = fs_count << dio->blkbits; ret = (*dio->get_block)(dio->inode, fs_startblk, map_bh, create); } @@ -928,8 +929,7 @@ do_holes: block_in_page += this_chunk_blocks; dio->blocks_available -= this_chunk_blocks; next_block: - if (dio->block_in_file > dio->final_block_in_request) - BUG(); + BUG_ON(dio->block_in_file > dio->final_block_in_request); if (dio->block_in_file == dio->final_block_in_request) break; } diff --git a/fs/dquot.c b/fs/dquot.c index 6b388692093..81d87a413c6 100644 --- a/fs/dquot.c +++ b/fs/dquot.c @@ -590,8 +590,7 @@ we_slept: atomic_dec(&dquot->dq_count); #ifdef __DQUOT_PARANOIA /* sanity check */ - if (!list_empty(&dquot->dq_free)) - BUG(); + BUG_ON(!list_empty(&dquot->dq_free)); #endif put_dquot_last(dquot); spin_unlock(&dq_list_lock); @@ -666,8 +665,7 @@ we_slept: return NODQUOT; } #ifdef __DQUOT_PARANOIA - if (!dquot->dq_sb) /* Has somebody invalidated entry under us? */ - BUG(); + BUG_ON(!dquot->dq_sb); /* Has somebody invalidated entry under us? */ #endif return dquot; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 242fe1a66ce..1b4491cdd11 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -599,7 +599,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) switch (op) { case EPOLL_CTL_ADD: if (!epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_insert(ep, &epds, tfile, fd); } else @@ -613,7 +613,7 @@ sys_epoll_ctl(int epfd, int op, int fd, struct epoll_event __user *event) break; case EPOLL_CTL_MOD: if (epi) { - epds.events |= POLLERR | POLLHUP | POLLRDHUP; + epds.events |= POLLERR | POLLHUP; error = ep_modify(ep, epi, &epds); } else error = -ENOENT; diff --git a/fs/exec.c b/fs/exec.c index 950ebd43cdc..4121bb55973 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -561,7 +561,7 @@ static int exec_mmap(struct mm_struct *mm) arch_pick_mmap_layout(mm); if (old_mm) { up_read(&old_mm->mmap_sem); - if (active_mm != old_mm) BUG(); + BUG_ON(active_mm != old_mm); mmput(old_mm); return 0; } @@ -665,9 +665,7 @@ static int de_thread(struct task_struct *tsk) * and to assume its PID: */ if (!thread_group_leader(current)) { - struct task_struct *parent; struct dentry *proc_dentry1, *proc_dentry2; - unsigned long ptrace; /* * Wait for the thread group leader to be a zombie. @@ -678,6 +676,18 @@ static int de_thread(struct task_struct *tsk) while (leader->exit_state != EXIT_ZOMBIE) yield(); + /* + * The only record we have of the real-time age of a + * process, regardless of execs it's done, is start_time. + * All the past CPU time is accumulated in signal_struct + * from sister threads now dead. But in this non-leader + * exec, nothing survives from the original leader thread, + * whose birth marks the true age of this process now. + * When we take on its identity by switching to its PID, we + * also take its birthdate (always earlier than our own). + */ + current->start_time = leader->start_time; + spin_lock(&leader->proc_lock); spin_lock(¤t->proc_lock); proc_dentry1 = proc_pid_unhash(current); @@ -692,22 +702,6 @@ static int de_thread(struct task_struct *tsk) * two threads with a switched PID, and release * the former thread group leader: */ - ptrace = leader->ptrace; - parent = leader->parent; - if (unlikely(ptrace) && unlikely(parent == current)) { - /* - * Joker was ptracing his own group leader, - * and now he wants to be his own parent! - * We can't have that. - */ - ptrace = 0; - } - - ptrace_unlink(current); - ptrace_unlink(leader); - remove_parent(current); - remove_parent(leader); - /* Become a process group leader with the old leader's pid. * Note: The old leader also uses thispid until release_task @@ -720,17 +714,13 @@ static int de_thread(struct task_struct *tsk) attach_pid(current, PIDTYPE_SID, current->signal->session); list_add_tail(¤t->tasks, &init_task.tasks); - current->parent = current->real_parent = leader->real_parent; - leader->parent = leader->real_parent = child_reaper; current->group_leader = current; - leader->group_leader = leader; + leader->group_leader = current; - add_parent(current); - add_parent(leader); - if (ptrace) { - current->ptrace = ptrace; - __ptrace_link(current, parent); - } + /* Reduce leader to a thread */ + detach_pid(leader, PIDTYPE_PGID); + detach_pid(leader, PIDTYPE_SID); + list_del_init(&leader->tasks); current->exit_signal = SIGCHLD; diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 509cceca04d..23e2c7ccec1 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -53,6 +53,8 @@ const struct file_operations ext2_file_operations = { .readv = generic_file_readv, .writev = generic_file_writev, .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; #ifdef CONFIG_EXT2_FS_XIP diff --git a/fs/ext3/file.c b/fs/ext3/file.c index 783a796220b..1efefb630ea 100644 --- a/fs/ext3/file.c +++ b/fs/ext3/file.c @@ -119,6 +119,8 @@ const struct file_operations ext3_file_operations = { .release = ext3_release_file, .fsync = ext3_sync_file, .sendfile = generic_file_sendfile, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; struct inode_operations ext3_file_inode_operations = { diff --git a/fs/ext3/resize.c b/fs/ext3/resize.c index 1041dab6de2..c5ffa852396 100644 --- a/fs/ext3/resize.c +++ b/fs/ext3/resize.c @@ -767,6 +767,7 @@ int ext3_group_add(struct super_block *sb, struct ext3_new_group_data *input) if (input->group != sbi->s_groups_count) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_journal; } @@ -974,6 +975,7 @@ int ext3_group_extend(struct super_block *sb, struct ext3_super_block *es, if (o_blocks_count != le32_to_cpu(es->s_blocks_count)) { ext3_warning(sb, __FUNCTION__, "multiple resizers run on filesystem!"); + unlock_super(sb); err = -EBUSY; goto exit_put; } diff --git a/fs/fcntl.c b/fs/fcntl.c index 2a2479196f9..d35cbc6bc11 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -453,8 +453,7 @@ static void send_sigio_to_task(struct task_struct *p, /* Make sure we are called with one of the POLL_* reasons, otherwise we could leak kernel stack into userspace. */ - if ((reason & __SI_MASK) != __SI_POLL) - BUG(); + BUG_ON((reason & __SI_MASK) != __SI_POLL); if (reason - POLL_IN >= NSIGPOLL) si.si_band = ~0L; else diff --git a/fs/fifo.c b/fs/fifo.c index 889f722ee36..49035b174b4 100644 --- a/fs/fifo.c +++ b/fs/fifo.c @@ -15,30 +15,35 @@ #include <linux/fs.h> #include <linux/pipe_fs_i.h> -static void wait_for_partner(struct inode* inode, unsigned int* cnt) +static void wait_for_partner(struct inode* inode, unsigned int *cnt) { int cur = *cnt; - while(cur == *cnt) { - pipe_wait(inode); - if(signal_pending(current)) + + while (cur == *cnt) { + pipe_wait(inode->i_pipe); + if (signal_pending(current)) break; } } static void wake_up_partner(struct inode* inode) { - wake_up_interruptible(PIPE_WAIT(*inode)); + wake_up_interruptible(&inode->i_pipe->wait); } static int fifo_open(struct inode *inode, struct file *filp) { + struct pipe_inode_info *pipe; int ret; - mutex_lock(PIPE_MUTEX(*inode)); - if (!inode->i_pipe) { + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + if (!pipe) { ret = -ENOMEM; - if(!pipe_new(inode)) + pipe = alloc_pipe_info(inode); + if (!pipe) goto err_nocleanup; + inode->i_pipe = pipe; } filp->f_version = 0; @@ -53,18 +58,18 @@ static int fifo_open(struct inode *inode, struct file *filp) * opened, even when there is no process writing the FIFO. */ filp->f_op = &read_fifo_fops; - PIPE_RCOUNTER(*inode)++; - if (PIPE_READERS(*inode)++ == 0) + pipe->r_counter++; + if (pipe->readers++ == 0) wake_up_partner(inode); - if (!PIPE_WRITERS(*inode)) { + if (!pipe->writers) { if ((filp->f_flags & O_NONBLOCK)) { /* suppress POLLHUP until we have * seen a writer */ - filp->f_version = PIPE_WCOUNTER(*inode); + filp->f_version = pipe->w_counter; } else { - wait_for_partner(inode, &PIPE_WCOUNTER(*inode)); + wait_for_partner(inode, &pipe->w_counter); if(signal_pending(current)) goto err_rd; } @@ -78,16 +83,16 @@ static int fifo_open(struct inode *inode, struct file *filp) * errno=ENXIO when there is no process reading the FIFO. */ ret = -ENXIO; - if ((filp->f_flags & O_NONBLOCK) && !PIPE_READERS(*inode)) + if ((filp->f_flags & O_NONBLOCK) && !pipe->readers) goto err; filp->f_op = &write_fifo_fops; - PIPE_WCOUNTER(*inode)++; - if (!PIPE_WRITERS(*inode)++) + pipe->w_counter++; + if (!pipe->writers++) wake_up_partner(inode); - if (!PIPE_READERS(*inode)) { - wait_for_partner(inode, &PIPE_RCOUNTER(*inode)); + if (!pipe->readers) { + wait_for_partner(inode, &pipe->r_counter); if (signal_pending(current)) goto err_wr; } @@ -102,11 +107,11 @@ static int fifo_open(struct inode *inode, struct file *filp) */ filp->f_op = &rdwr_fifo_fops; - PIPE_READERS(*inode)++; - PIPE_WRITERS(*inode)++; - PIPE_RCOUNTER(*inode)++; - PIPE_WCOUNTER(*inode)++; - if (PIPE_READERS(*inode) == 1 || PIPE_WRITERS(*inode) == 1) + pipe->readers++; + pipe->writers++; + pipe->r_counter++; + pipe->w_counter++; + if (pipe->readers == 1 || pipe->writers == 1) wake_up_partner(inode); break; @@ -116,27 +121,27 @@ static int fifo_open(struct inode *inode, struct file *filp) } /* Ok! */ - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; err_rd: - if (!--PIPE_READERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--pipe->readers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err_wr: - if (!--PIPE_WRITERS(*inode)) - wake_up_interruptible(PIPE_WAIT(*inode)); + if (!--pipe->writers) + wake_up_interruptible(&pipe->wait); ret = -ERESTARTSYS; goto err; err: - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) + if (!pipe->readers && !pipe->writers) free_pipe_info(inode); err_nocleanup: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return ret; } diff --git a/fs/freevxfs/vxfs_olt.c b/fs/freevxfs/vxfs_olt.c index 76a0708ae97..04950084790 100644 --- a/fs/freevxfs/vxfs_olt.c +++ b/fs/freevxfs/vxfs_olt.c @@ -42,24 +42,21 @@ static inline void vxfs_get_fshead(struct vxfs_oltfshead *fshp, struct vxfs_sb_info *infp) { - if (infp->vsi_fshino) - BUG(); + BUG_ON(infp->vsi_fshino); infp->vsi_fshino = fshp->olt_fsino[0]; } static inline void vxfs_get_ilist(struct vxfs_oltilist *ilistp, struct vxfs_sb_info *infp) { - if (infp->vsi_iext) - BUG(); + BUG_ON(infp->vsi_iext); infp->vsi_iext = ilistp->olt_iext[0]; } static inline u_long vxfs_oblock(struct super_block *sbp, daddr_t block, u_long bsize) { - if (sbp->s_blocksize % bsize) - BUG(); + BUG_ON(sbp->s_blocksize % bsize); return (block * (sbp->s_blocksize / bsize)); } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 23d1f52eb1b..cc750c68fe7 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -23,13 +23,11 @@ static kmem_cache_t *fuse_req_cachep; static struct fuse_conn *fuse_get_conn(struct file *file) { - struct fuse_conn *fc; - spin_lock(&fuse_lock); - fc = file->private_data; - if (fc && !fc->connected) - fc = NULL; - spin_unlock(&fuse_lock); - return fc; + /* + * Lockless access is OK, because file->private data is set + * once during mount and is valid until the file is released. + */ + return file->private_data; } static void fuse_request_init(struct fuse_req *req) @@ -74,10 +72,8 @@ static void restore_sigs(sigset_t *oldset) */ void fuse_reset_request(struct fuse_req *req) { - int preallocated = req->preallocated; BUG_ON(atomic_read(&req->count) != 1); fuse_request_init(req); - req->preallocated = preallocated; } static void __fuse_get_request(struct fuse_req *req) @@ -92,80 +88,54 @@ static void __fuse_put_request(struct fuse_req *req) atomic_dec(&req->count); } -static struct fuse_req *do_get_request(struct fuse_conn *fc) +struct fuse_req *fuse_get_req(struct fuse_conn *fc) { struct fuse_req *req; - - spin_lock(&fuse_lock); - BUG_ON(list_empty(&fc->unused_list)); - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del_init(&req->list); - spin_unlock(&fuse_lock); - fuse_request_init(req); - req->preallocated = 1; - req->in.h.uid = current->fsuid; - req->in.h.gid = current->fsgid; - req->in.h.pid = current->pid; - return req; -} - -/* This can return NULL, but only in case it's interrupted by a SIGKILL */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc) -{ - int intr; sigset_t oldset; + int intr; + int err; atomic_inc(&fc->num_waiting); block_sigs(&oldset); - intr = down_interruptible(&fc->outstanding_sem); + intr = wait_event_interruptible(fc->blocked_waitq, !fc->blocked); restore_sigs(&oldset); - if (intr) { - atomic_dec(&fc->num_waiting); - return NULL; - } - return do_get_request(fc); -} + err = -EINTR; + if (intr) + goto out; -/* Must be called with fuse_lock held */ -static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req) -{ - if (req->preallocated) { - atomic_dec(&fc->num_waiting); - list_add(&req->list, &fc->unused_list); - } else - fuse_request_free(req); + req = fuse_request_alloc(); + err = -ENOMEM; + if (!req) + goto out; - /* If we are in debt decrease that first */ - if (fc->outstanding_debt) - fc->outstanding_debt--; - else - up(&fc->outstanding_sem); + req->in.h.uid = current->fsuid; + req->in.h.gid = current->fsgid; + req->in.h.pid = current->pid; + req->waiting = 1; + return req; + + out: + atomic_dec(&fc->num_waiting); + return ERR_PTR(err); } void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req) { if (atomic_dec_and_test(&req->count)) { - spin_lock(&fuse_lock); - fuse_putback_request(fc, req); - spin_unlock(&fuse_lock); + if (req->waiting) + atomic_dec(&fc->num_waiting); + fuse_request_free(req); } } -static void fuse_put_request_locked(struct fuse_conn *fc, struct fuse_req *req) -{ - if (atomic_dec_and_test(&req->count)) - fuse_putback_request(fc, req); -} - -void fuse_release_background(struct fuse_req *req) +void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req) { - iput(req->inode); - iput(req->inode2); - if (req->file) - fput(req->file); - spin_lock(&fuse_lock); - list_del(&req->bg_entry); - spin_unlock(&fuse_lock); + list_del_init(&req->bg_entry); + if (fc->num_background == FUSE_MAX_BACKGROUND) { + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); + } + fc->num_background--; } /* @@ -184,28 +154,38 @@ void fuse_release_background(struct fuse_req *req) * interrupted and put in the background, it will return with an error * and hence never be reset and reused. * - * Called with fuse_lock, unlocks it + * Called with fc->lock, unlocks it */ static void request_end(struct fuse_conn *fc, struct fuse_req *req) { list_del(&req->list); req->state = FUSE_REQ_FINISHED; if (!req->background) { + spin_unlock(&fc->lock); wake_up(&req->waitq); - fuse_put_request_locked(fc, req); - spin_unlock(&fuse_lock); + fuse_put_request(fc, req); } else { + struct inode *inode = req->inode; + struct inode *inode2 = req->inode2; + struct file *file = req->file; void (*end) (struct fuse_conn *, struct fuse_req *) = req->end; req->end = NULL; - spin_unlock(&fuse_lock); - down_read(&fc->sbput_sem); - if (fc->mounted) - fuse_release_background(req); - up_read(&fc->sbput_sem); + req->inode = NULL; + req->inode2 = NULL; + req->file = NULL; + if (!list_empty(&req->bg_entry)) + fuse_remove_background(fc, req); + spin_unlock(&fc->lock); + if (end) end(fc, req); else fuse_put_request(fc, req); + + if (file) + fput(file); + iput(inode); + iput(inode2); } } @@ -242,6 +222,9 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) { req->background = 1; list_add(&req->bg_entry, &fc->background); + fc->num_background++; + if (fc->num_background == FUSE_MAX_BACKGROUND) + fc->blocked = 1; if (req->inode) req->inode = igrab(req->inode); if (req->inode2) @@ -250,16 +233,16 @@ static void background_request(struct fuse_conn *fc, struct fuse_req *req) get_file(req->file); } -/* Called with fuse_lock held. Releases, and then reacquires it. */ +/* Called with fc->lock held. Releases, and then reacquires it. */ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) { sigset_t oldset; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); block_sigs(&oldset); wait_event_interruptible(req->waitq, req->state == FUSE_REQ_FINISHED); restore_sigs(&oldset); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->state == FUSE_REQ_FINISHED && !req->interrupted) return; @@ -273,9 +256,9 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req) locked state, there mustn't be any filesystem operation (e.g. page fault), since that could lead to deadlock */ - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } if (req->state == FUSE_REQ_PENDING) { list_del(&req->list); @@ -304,19 +287,14 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) req->in.h.unique = fc->reqctr; req->in.h.len = sizeof(struct fuse_in_header) + len_args(req->in.numargs, (struct fuse_arg *) req->in.args); - if (!req->preallocated) { - /* If request is not preallocated (either FORGET or - RELEASE), then still decrease outstanding_sem, so - user can't open infinite number of files while not - processing the RELEASE requests. However for - efficiency do it without blocking, so if down() - would block, just increase the debt instead */ - if (down_trylock(&fc->outstanding_sem)) - fc->outstanding_debt++; - } list_add_tail(&req->list, &fc->pending); req->state = FUSE_REQ_PENDING; + if (!req->waiting) { + req->waiting = 1; + atomic_inc(&fc->num_waiting); + } wake_up(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } /* @@ -325,7 +303,7 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req) void request_send(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (!fc->connected) req->out.h.error = -ENOTCONN; else if (fc->conn_error) @@ -338,15 +316,16 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req) request_wait_answer(fc, req); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static void request_send_nowait(struct fuse_conn *fc, struct fuse_req *req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); + background_request(fc, req); if (fc->connected) { queue_request(fc, req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } else { req->out.h.error = -ENOTCONN; request_end(fc, req); @@ -362,9 +341,6 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req) void request_send_background(struct fuse_conn *fc, struct fuse_req *req) { req->isreply = 1; - spin_lock(&fuse_lock); - background_request(fc, req); - spin_unlock(&fuse_lock); request_send_nowait(fc, req); } @@ -373,16 +349,16 @@ void request_send_background(struct fuse_conn *fc, struct fuse_req *req) * anything that could cause a page-fault. If the request was already * interrupted bail out. */ -static int lock_request(struct fuse_req *req) +static int lock_request(struct fuse_conn *fc, struct fuse_req *req) { int err = 0; if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (req->interrupted) err = -ENOENT; else req->locked = 1; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return err; } @@ -392,18 +368,19 @@ static int lock_request(struct fuse_req *req) * requester thread is currently waiting for it to be unlocked, so * wake it up. */ -static void unlock_request(struct fuse_req *req) +static void unlock_request(struct fuse_conn *fc, struct fuse_req *req) { if (req) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (req->interrupted) wake_up(&req->waitq); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } } struct fuse_copy_state { + struct fuse_conn *fc; int write; struct fuse_req *req; const struct iovec *iov; @@ -416,11 +393,12 @@ struct fuse_copy_state { unsigned len; }; -static void fuse_copy_init(struct fuse_copy_state *cs, int write, - struct fuse_req *req, const struct iovec *iov, - unsigned long nr_segs) +static void fuse_copy_init(struct fuse_copy_state *cs, struct fuse_conn *fc, + int write, struct fuse_req *req, + const struct iovec *iov, unsigned long nr_segs) { memset(cs, 0, sizeof(*cs)); + cs->fc = fc; cs->write = write; cs->req = req; cs->iov = iov; @@ -450,7 +428,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) unsigned long offset; int err; - unlock_request(cs->req); + unlock_request(cs->fc, cs->req); fuse_copy_finish(cs); if (!cs->seglen) { BUG_ON(!cs->nr_segs); @@ -473,7 +451,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs) cs->seglen -= cs->len; cs->addr += cs->len; - return lock_request(cs->req); + return lock_request(cs->fc, cs->req); } /* Do as much copy to/from userspace buffer as we can */ @@ -585,9 +563,9 @@ static void request_wait(struct fuse_conn *fc) if (signal_pending(current)) break; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); schedule(); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } set_current_state(TASK_RUNNING); remove_wait_queue(&fc->waitq, &wait); @@ -606,18 +584,21 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, unsigned long nr_segs, loff_t *off) { int err; - struct fuse_conn *fc; struct fuse_req *req; struct fuse_in *in; struct fuse_copy_state cs; unsigned reqsize; + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; restart: - spin_lock(&fuse_lock); - fc = file->private_data; - err = -EPERM; - if (!fc) + spin_lock(&fc->lock); + err = -EAGAIN; + if ((file->f_flags & O_NONBLOCK) && fc->connected && + list_empty(&fc->pending)) goto err_unlock; + request_wait(fc); err = -ENODEV; if (!fc->connected) @@ -641,14 +622,14 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, request_end(fc, req); goto restart; } - spin_unlock(&fuse_lock); - fuse_copy_init(&cs, 1, req, iov, nr_segs); + spin_unlock(&fc->lock); + fuse_copy_init(&cs, fc, 1, req, iov, nr_segs); err = fuse_copy_one(&cs, &in->h, sizeof(in->h)); if (!err) err = fuse_copy_args(&cs, in->numargs, in->argpages, (struct fuse_arg *) in->args, 0); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err && req->interrupted) err = -ENOENT; @@ -663,12 +644,12 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov, else { req->state = FUSE_REQ_SENT; list_move_tail(&req->list, &fc->processing); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } return reqsize; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); return err; } @@ -735,9 +716,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, struct fuse_copy_state cs; struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return -EPERM; - fuse_copy_init(&cs, 0, NULL, iov, nr_segs); + fuse_copy_init(&cs, fc, 0, NULL, iov, nr_segs); if (nbytes < sizeof(struct fuse_out_header)) return -EINVAL; @@ -749,7 +730,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, oh.len != nbytes) goto err_finish; - spin_lock(&fuse_lock); + spin_lock(&fc->lock); err = -ENOENT; if (!fc->connected) goto err_unlock; @@ -760,9 +741,9 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, goto err_unlock; if (req->interrupted) { - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); request_end(fc, req); return -ENOENT; } @@ -770,12 +751,12 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, req->out.h = oh; req->locked = 1; cs.req = req; - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err = copy_out_args(&cs, &req->out, nbytes); fuse_copy_finish(&cs); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); req->locked = 0; if (!err) { if (req->interrupted) @@ -787,7 +768,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov, return err ? err : nbytes; err_unlock: - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); err_finish: fuse_copy_finish(&cs); return err; @@ -804,18 +785,19 @@ static ssize_t fuse_dev_write(struct file *file, const char __user *buf, static unsigned fuse_dev_poll(struct file *file, poll_table *wait) { - struct fuse_conn *fc = fuse_get_conn(file); unsigned mask = POLLOUT | POLLWRNORM; - + struct fuse_conn *fc = fuse_get_conn(file); if (!fc) - return -ENODEV; + return POLLERR; poll_wait(file, &fc->waitq, wait); - spin_lock(&fuse_lock); - if (!list_empty(&fc->pending)) - mask |= POLLIN | POLLRDNORM; - spin_unlock(&fuse_lock); + spin_lock(&fc->lock); + if (!fc->connected) + mask = POLLERR; + else if (!list_empty(&fc->pending)) + mask |= POLLIN | POLLRDNORM; + spin_unlock(&fc->lock); return mask; } @@ -823,7 +805,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait) /* * Abort all requests on the given list (pending or processing) * - * This function releases and reacquires fuse_lock + * This function releases and reacquires fc->lock */ static void end_requests(struct fuse_conn *fc, struct list_head *head) { @@ -832,7 +814,7 @@ static void end_requests(struct fuse_conn *fc, struct list_head *head) req = list_entry(head->next, struct fuse_req, list); req->out.h.error = -ECONNABORTED; request_end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } @@ -863,10 +845,10 @@ static void end_io_requests(struct fuse_conn *fc) req->end = NULL; /* The end function will consume this reference */ __fuse_get_request(req); - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); wait_event(req->waitq, !req->locked); end(fc, req); - spin_lock(&fuse_lock); + spin_lock(&fc->lock); } } } @@ -893,35 +875,44 @@ static void end_io_requests(struct fuse_conn *fc) */ void fuse_abort_conn(struct fuse_conn *fc) { - spin_lock(&fuse_lock); + spin_lock(&fc->lock); if (fc->connected) { fc->connected = 0; end_io_requests(fc); end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); wake_up_all(&fc->waitq); + kill_fasync(&fc->fasync, SIGIO, POLL_IN); } - spin_unlock(&fuse_lock); + spin_unlock(&fc->lock); } static int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_conn *fc; - - spin_lock(&fuse_lock); - fc = file->private_data; + struct fuse_conn *fc = fuse_get_conn(file); if (fc) { + spin_lock(&fc->lock); fc->connected = 0; end_requests(fc, &fc->pending); end_requests(fc, &fc->processing); - } - spin_unlock(&fuse_lock); - if (fc) + spin_unlock(&fc->lock); + fasync_helper(-1, file, 0, &fc->fasync); kobject_put(&fc->kobj); + } return 0; } +static int fuse_dev_fasync(int fd, struct file *file, int on) +{ + struct fuse_conn *fc = fuse_get_conn(file); + if (!fc) + return -EPERM; + + /* No locking - fasync_helper does its own locking */ + return fasync_helper(fd, file, on, &fc->fasync); +} + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .llseek = no_llseek, @@ -931,6 +922,7 @@ const struct file_operations fuse_dev_operations = { .writev = fuse_dev_writev, .poll = fuse_dev_poll, .release = fuse_dev_release, + .fasync = fuse_dev_fasync, }; static struct miscdevice fuse_miscdevice = { diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 256355b8025..8d7546e832e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -117,8 +117,8 @@ static int fuse_dentry_revalidate(struct dentry *entry, struct nameidata *nd) return 0; fc = get_fuse_conn(inode); - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + if (IS_ERR(req)) return 0; fuse_lookup_init(req, entry->d_parent->d_inode, entry, &outarg); @@ -188,9 +188,9 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, if (entry->d_name.len > FUSE_NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - req = fuse_get_request(fc); - if (!req) - return ERR_PTR(-EINTR); + req = fuse_get_req(fc); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); fuse_lookup_init(req, dir, entry, &outarg); request_send(fc, req); @@ -244,15 +244,14 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, struct file *file; int flags = nd->intent.open.flags - 1; - err = -ENOSYS; if (fc->no_create) - goto out; + return -ENOSYS; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) - goto out; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); + err = -ENOMEM; ff = fuse_file_alloc(); if (!ff) goto out_put_request; @@ -314,7 +313,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, int mode, fuse_file_free(ff); out_put_request: fuse_put_request(fc, req); - out: return err; } @@ -375,9 +373,9 @@ static int fuse_mknod(struct inode *dir, struct dentry *entry, int mode, { struct fuse_mknod_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -407,9 +405,9 @@ static int fuse_mkdir(struct inode *dir, struct dentry *entry, int mode) { struct fuse_mkdir_in inarg; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mode = mode; @@ -427,9 +425,9 @@ static int fuse_symlink(struct inode *dir, struct dentry *entry, { struct fuse_conn *fc = get_fuse_conn(dir); unsigned len = strlen(link) + 1; - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_SYMLINK; req->in.numargs = 2; @@ -444,9 +442,9 @@ static int fuse_unlink(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_UNLINK; req->in.h.nodeid = get_node_id(dir); @@ -476,9 +474,9 @@ static int fuse_rmdir(struct inode *dir, struct dentry *entry) { int err; struct fuse_conn *fc = get_fuse_conn(dir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_RMDIR; req->in.h.nodeid = get_node_id(dir); @@ -504,9 +502,9 @@ static int fuse_rename(struct inode *olddir, struct dentry *oldent, int err; struct fuse_rename_in inarg; struct fuse_conn *fc = get_fuse_conn(olddir); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.newdir = get_node_id(newdir); @@ -553,9 +551,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_link_in inarg; struct inode *inode = entry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); @@ -583,9 +581,9 @@ int fuse_do_getattr(struct inode *inode) int err; struct fuse_attr_out arg; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); - if (!req) - return -EINTR; + struct fuse_req *req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_GETATTR; req->in.h.nodeid = get_node_id(inode); @@ -673,9 +671,9 @@ static int fuse_access(struct inode *inode, int mask) if (fc->no_access) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.mask = mask; @@ -780,9 +778,9 @@ static int fuse_readdir(struct file *file, void *dstbuf, filldir_t filldir) if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); page = alloc_page(GFP_KERNEL); if (!page) { @@ -809,11 +807,11 @@ static char *read_link(struct dentry *dentry) { struct inode *inode = dentry->d_inode; struct fuse_conn *fc = get_fuse_conn(inode); - struct fuse_req *req = fuse_get_request(fc); + struct fuse_req *req = fuse_get_req(fc); char *link; - if (!req) - return ERR_PTR(-EINTR); + if (IS_ERR(req)) + return ERR_PTR(PTR_ERR(req)); link = (char *) __get_free_page(GFP_KERNEL); if (!link) { @@ -933,9 +931,9 @@ static int fuse_setattr(struct dentry *entry, struct iattr *attr) } } - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); iattr_to_fattr(attr, &inarg); @@ -995,9 +993,9 @@ static int fuse_setxattr(struct dentry *entry, const char *name, if (fc->no_setxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1035,9 +1033,9 @@ static ssize_t fuse_getxattr(struct dentry *entry, const char *name, if (fc->no_getxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1085,9 +1083,9 @@ static ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size) if (fc->no_listxattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.size = size; @@ -1131,9 +1129,9 @@ static int fuse_removexattr(struct dentry *entry, const char *name) if (fc->no_removexattr) return -EOPNOTSUPP; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->in.h.opcode = FUSE_REMOVEXATTR; req->in.h.nodeid = get_node_id(inode); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 975f2697e86..fc342cf7c2c 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -22,9 +22,9 @@ static int fuse_send_open(struct inode *inode, struct file *file, int isdir, struct fuse_req *req; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.flags = file->f_flags & ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC); @@ -184,9 +184,9 @@ static int fuse_flush(struct file *file) if (fc->no_flush) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -223,9 +223,9 @@ int fuse_fsync_common(struct file *file, struct dentry *de, int datasync, if ((!isdir && fc->no_fsync) || (isdir && fc->no_fsyncdir)) return 0; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&inarg, 0, sizeof(inarg)); inarg.fh = ff->fh; @@ -297,9 +297,9 @@ static int fuse_readpage(struct file *file, struct page *page) if (is_bad_inode(inode)) goto out; - err = -EINTR; - req = fuse_get_request(fc); - if (!req) + req = fuse_get_req(fc); + err = PTR_ERR(req); + if (IS_ERR(req)) goto out; req->out.page_zeroing = 1; @@ -368,10 +368,10 @@ static int fuse_readpages_fill(void *_data, struct page *page) (req->num_pages + 1) * PAGE_CACHE_SIZE > fc->max_read || req->pages[req->num_pages - 1]->index + 1 != page->index)) { fuse_send_readpages(req, data->file, inode); - data->req = req = fuse_get_request(fc); - if (!req) { + data->req = req = fuse_get_req(fc); + if (IS_ERR(req)) { unlock_page(page); - return -EINTR; + return PTR_ERR(req); } } req->pages[req->num_pages] = page; @@ -392,13 +392,17 @@ static int fuse_readpages(struct file *file, struct address_space *mapping, data.file = file; data.inode = inode; - data.req = fuse_get_request(fc); - if (!data.req) - return -EINTR; + data.req = fuse_get_req(fc); + if (IS_ERR(data.req)) + return PTR_ERR(data.req); err = read_cache_pages(mapping, pages, fuse_readpages_fill, &data); - if (!err) - fuse_send_readpages(data.req, file, inode); + if (!err) { + if (data.req->num_pages) + fuse_send_readpages(data.req, file, inode); + else + fuse_put_request(fc, data.req); + } return err; } @@ -451,9 +455,9 @@ static int fuse_commit_write(struct file *file, struct page *page, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); req->num_pages = 1; req->pages[0] = page; @@ -528,9 +532,9 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, if (is_bad_inode(inode)) return -EIO; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); while (count) { size_t nres; @@ -561,8 +565,12 @@ static ssize_t fuse_direct_io(struct file *file, const char __user *buf, buf += nres; if (nres != nbytes) break; - if (count) - fuse_reset_request(req); + if (count) { + fuse_put_request(fc, req); + req = fuse_get_req(fc); + if (IS_ERR(req)) + break; + } } fuse_put_request(fc, req); if (res > 0) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index a16a04fcf41..59661c481d9 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -18,8 +18,8 @@ /** Max number of pages that can be used in a single read request */ #define FUSE_MAX_PAGES_PER_REQ 32 -/** If more requests are outstanding, then the operation will block */ -#define FUSE_MAX_OUTSTANDING 10 +/** Maximum number of outstanding background requests */ +#define FUSE_MAX_BACKGROUND 10 /** It could be as large as PATH_MAX, but would that have any uses? */ #define FUSE_NAME_MAX 1024 @@ -131,8 +131,8 @@ struct fuse_conn; * A request to the client */ struct fuse_req { - /** This can be on either unused_list, pending processing or - io lists in fuse_conn */ + /** This can be on either pending processing or io lists in + fuse_conn */ struct list_head list; /** Entry on the background list */ @@ -144,15 +144,12 @@ struct fuse_req { /* * The following bitfields are either set once before the * request is queued or setting/clearing them is protected by - * fuse_lock + * fuse_conn->lock */ /** True if the request has reply */ unsigned isreply:1; - /** The request is preallocated */ - unsigned preallocated:1; - /** The request was interrupted */ unsigned interrupted:1; @@ -162,6 +159,9 @@ struct fuse_req { /** Data is being copied to/from the request */ unsigned locked:1; + /** Request is counted as "waiting" */ + unsigned waiting:1; + /** State of the request */ enum fuse_req_state state; @@ -213,6 +213,9 @@ struct fuse_req { * unmounted. */ struct fuse_conn { + /** Lock protecting accessess to members of this structure */ + spinlock_t lock; + /** The user id for this mount */ uid_t user_id; @@ -244,25 +247,20 @@ struct fuse_conn { interrupted request) */ struct list_head background; - /** Controls the maximum number of outstanding requests */ - struct semaphore outstanding_sem; + /** Number of requests currently in the background */ + unsigned num_background; - /** This counts the number of outstanding requests if - outstanding_sem would go negative */ - unsigned outstanding_debt; + /** Flag indicating if connection is blocked. This will be + the case before the INIT reply is received, and if there + are too many outstading backgrounds requests */ + int blocked; - /** RW semaphore for exclusion with fuse_put_super() */ - struct rw_semaphore sbput_sem; - - /** The list of unused requests */ - struct list_head unused_list; + /** waitq for blocked connection */ + wait_queue_head_t blocked_waitq; /** The next unique request id */ u64 reqctr; - /** Mount is active */ - unsigned mounted; - /** Connection established, cleared on umount, connection abort and device release */ unsigned connected; @@ -318,6 +316,9 @@ struct fuse_conn { /** kobject */ struct kobject kobj; + + /** O_ASYNC requests */ + struct fasync_struct *fasync; }; static inline struct fuse_conn *get_fuse_conn_super(struct super_block *sb) @@ -349,21 +350,6 @@ static inline u64 get_node_id(struct inode *inode) extern const struct file_operations fuse_dev_operations; /** - * This is the single global spinlock which protects FUSE's structures - * - * The following data is protected by this lock: - * - * - the private_data field of the device file - * - the s_fs_info field of the super block - * - unused_list, pending, processing lists in fuse_conn - * - background list in fuse_conn - * - the unique request ID counter reqctr in fuse_conn - * - the sb (super_block) field in fuse_conn - * - the file (device file) field in fuse_conn - */ -extern spinlock_t fuse_lock; - -/** * Get a filled in inode */ struct inode *fuse_iget(struct super_block *sb, unsigned long nodeid, @@ -461,11 +447,11 @@ void fuse_reset_request(struct fuse_req *req); /** * Reserve a preallocated request */ -struct fuse_req *fuse_get_request(struct fuse_conn *fc); +struct fuse_req *fuse_get_req(struct fuse_conn *fc); /** - * Decrement reference count of a request. If count goes to zero put - * on unused list (preallocated) or free request (not preallocated). + * Decrement reference count of a request. If count goes to zero free + * the request. */ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req); @@ -485,11 +471,11 @@ void request_send_noreply(struct fuse_conn *fc, struct fuse_req *req); void request_send_background(struct fuse_conn *fc, struct fuse_req *req); /** - * Release inodes and file associated with background request + * Remove request from the the background list */ -void fuse_release_background(struct fuse_req *req); +void fuse_remove_background(struct fuse_conn *fc, struct fuse_req *req); -/* Abort all requests */ +/** Abort all requests */ void fuse_abort_conn(struct fuse_conn *fc); /** diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 879e6fba948..43a6fc0db8a 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1,6 +1,6 @@ /* FUSE: Filesystem in Userspace - Copyright (C) 2001-2005 Miklos Szeredi <miklos@szeredi.hu> + Copyright (C) 2001-2006 Miklos Szeredi <miklos@szeredi.hu> This program can be distributed under the terms of the GNU GPL. See the file COPYING. @@ -22,7 +22,6 @@ MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); MODULE_DESCRIPTION("Filesystem in Userspace"); MODULE_LICENSE("GPL"); -spinlock_t fuse_lock; static kmem_cache_t *fuse_inode_cachep; static struct subsystem connections_subsys; @@ -205,17 +204,28 @@ static void fuse_put_super(struct super_block *sb) { struct fuse_conn *fc = get_fuse_conn_super(sb); - down_write(&fc->sbput_sem); - while (!list_empty(&fc->background)) - fuse_release_background(list_entry(fc->background.next, - struct fuse_req, bg_entry)); - - spin_lock(&fuse_lock); - fc->mounted = 0; + spin_lock(&fc->lock); fc->connected = 0; - spin_unlock(&fuse_lock); - up_write(&fc->sbput_sem); + while (!list_empty(&fc->background)) { + struct fuse_req *req = list_entry(fc->background.next, + struct fuse_req, bg_entry); + struct inode *inode = req->inode; + struct inode *inode2 = req->inode2; + + /* File would hold a reference to vfsmount */ + BUG_ON(req->file); + req->inode = NULL; + req->inode2 = NULL; + fuse_remove_background(fc, req); + + spin_unlock(&fc->lock); + iput(inode); + iput(inode2); + spin_lock(&fc->lock); + } + spin_unlock(&fc->lock); /* Flush all readers on this fs */ + kill_fasync(&fc->fasync, SIGIO, POLL_IN); wake_up_all(&fc->waitq); kobject_del(&fc->kobj); kobject_put(&fc->kobj); @@ -242,9 +252,9 @@ static int fuse_statfs(struct super_block *sb, struct kstatfs *buf) struct fuse_statfs_out outarg; int err; - req = fuse_get_request(fc); - if (!req) - return -EINTR; + req = fuse_get_req(fc); + if (IS_ERR(req)) + return PTR_ERR(req); memset(&outarg, 0, sizeof(outarg)); req->in.numargs = 0; @@ -369,15 +379,7 @@ static int fuse_show_options(struct seq_file *m, struct vfsmount *mnt) static void fuse_conn_release(struct kobject *kobj) { - struct fuse_conn *fc = get_fuse_conn_kobj(kobj); - - while (!list_empty(&fc->unused_list)) { - struct fuse_req *req; - req = list_entry(fc->unused_list.next, struct fuse_req, list); - list_del(&req->list); - fuse_request_free(req); - } - kfree(fc); + kfree(get_fuse_conn_kobj(kobj)); } static struct fuse_conn *new_conn(void) @@ -386,64 +388,24 @@ static struct fuse_conn *new_conn(void) fc = kzalloc(sizeof(*fc), GFP_KERNEL); if (fc) { - int i; + spin_lock_init(&fc->lock); init_waitqueue_head(&fc->waitq); + init_waitqueue_head(&fc->blocked_waitq); INIT_LIST_HEAD(&fc->pending); INIT_LIST_HEAD(&fc->processing); INIT_LIST_HEAD(&fc->io); - INIT_LIST_HEAD(&fc->unused_list); INIT_LIST_HEAD(&fc->background); - sema_init(&fc->outstanding_sem, 1); /* One for INIT */ - init_rwsem(&fc->sbput_sem); kobj_set_kset_s(fc, connections_subsys); kobject_init(&fc->kobj); atomic_set(&fc->num_waiting, 0); - for (i = 0; i < FUSE_MAX_OUTSTANDING; i++) { - struct fuse_req *req = fuse_request_alloc(); - if (!req) { - kobject_put(&fc->kobj); - return NULL; - } - list_add(&req->list, &fc->unused_list); - } fc->bdi.ra_pages = (VM_MAX_READAHEAD * 1024) / PAGE_CACHE_SIZE; fc->bdi.unplug_io_fn = default_unplug_io_fn; fc->reqctr = 0; + fc->blocked = 1; } return fc; } -static struct fuse_conn *get_conn(struct file *file, struct super_block *sb) -{ - struct fuse_conn *fc; - int err; - - err = -EINVAL; - if (file->f_op != &fuse_dev_operations) - goto out_err; - - err = -ENOMEM; - fc = new_conn(); - if (!fc) - goto out_err; - - spin_lock(&fuse_lock); - err = -EINVAL; - if (file->private_data) - goto out_unlock; - - kobject_get(&fc->kobj); - file->private_data = fc; - spin_unlock(&fuse_lock); - return fc; - - out_unlock: - spin_unlock(&fuse_lock); - kobject_put(&fc->kobj); - out_err: - return ERR_PTR(err); -} - static struct inode *get_root_inode(struct super_block *sb, unsigned mode) { struct fuse_attr attr; @@ -467,7 +429,6 @@ static struct super_operations fuse_super_operations = { static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) { - int i; struct fuse_init_out *arg = &req->misc.init_out; if (req->out.h.error || arg->major != FUSE_KERNEL_VERSION) @@ -486,22 +447,13 @@ static void process_init_reply(struct fuse_conn *fc, struct fuse_req *req) fc->minor = arg->minor; fc->max_write = arg->minor < 5 ? 4096 : arg->max_write; } - - /* After INIT reply is received other requests can go - out. So do (FUSE_MAX_OUTSTANDING - 1) number of - up()s on outstanding_sem. The last up() is done in - fuse_putback_request() */ - for (i = 1; i < FUSE_MAX_OUTSTANDING; i++) - up(&fc->outstanding_sem); - fuse_put_request(fc, req); + fc->blocked = 0; + wake_up_all(&fc->blocked_waitq); } -static void fuse_send_init(struct fuse_conn *fc) +static void fuse_send_init(struct fuse_conn *fc, struct fuse_req *req) { - /* This is called from fuse_read_super() so there's guaranteed - to be exactly one request available */ - struct fuse_req *req = fuse_get_request(fc); struct fuse_init_in *arg = &req->misc.init_in; arg->major = FUSE_KERNEL_VERSION; @@ -525,12 +477,9 @@ static void fuse_send_init(struct fuse_conn *fc) static unsigned long long conn_id(void) { + /* BKL is held for ->get_sb() */ static unsigned long long ctr = 1; - unsigned long long val; - spin_lock(&fuse_lock); - val = ctr++; - spin_unlock(&fuse_lock); - return val; + return ctr++; } static int fuse_fill_super(struct super_block *sb, void *data, int silent) @@ -540,6 +489,7 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) struct fuse_mount_data d; struct file *file; struct dentry *root_dentry; + struct fuse_req *init_req; int err; if (!parse_fuse_opt((char *) data, &d)) @@ -555,10 +505,17 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) if (!file) return -EINVAL; - fc = get_conn(file, sb); - fput(file); - if (IS_ERR(fc)) - return PTR_ERR(fc); + if (file->f_op != &fuse_dev_operations) + return -EINVAL; + + /* Setting file->private_data can't race with other mount() + instances, since BKL is held for ->get_sb() */ + if (file->private_data) + return -EINVAL; + + fc = new_conn(); + if (!fc) + return -ENOMEM; fc->flags = d.flags; fc->user_id = d.user_id; @@ -579,27 +536,39 @@ static int fuse_fill_super(struct super_block *sb, void *data, int silent) goto err; } + init_req = fuse_request_alloc(); + if (!init_req) + goto err_put_root; + err = kobject_set_name(&fc->kobj, "%llu", conn_id()); if (err) - goto err_put_root; + goto err_free_req; err = kobject_add(&fc->kobj); if (err) - goto err_put_root; + goto err_free_req; sb->s_root = root_dentry; - spin_lock(&fuse_lock); - fc->mounted = 1; fc->connected = 1; - spin_unlock(&fuse_lock); + kobject_get(&fc->kobj); + file->private_data = fc; + /* + * atomic_dec_and_test() in fput() provides the necessary + * memory barrier for file->private_data to be visible on all + * CPUs after this + */ + fput(file); - fuse_send_init(fc); + fuse_send_init(fc, init_req); return 0; + err_free_req: + fuse_request_free(init_req); err_put_root: dput(root_dentry); err: + fput(file); kobject_put(&fc->kobj); return err; } @@ -753,7 +722,6 @@ static int __init fuse_init(void) printk("fuse init (API version %i.%i)\n", FUSE_KERNEL_VERSION, FUSE_KERNEL_MINOR_VERSION); - spin_lock_init(&fuse_lock); res = fuse_fs_init(); if (res) goto err; diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 8f07e8fbd03..746abc9ecf7 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -466,8 +466,7 @@ void hfs_bnode_unhash(struct hfs_bnode *node) for (p = &node->tree->node_hash[hfs_bnode_hash(node->this)]; *p && *p != node; p = &(*p)->next_hash) ; - if (!*p) - BUG(); + BUG_ON(!*p); *p = node->next_hash; node->tree->node_hash_cnt--; } @@ -622,8 +621,7 @@ void hfs_bnode_put(struct hfs_bnode *node) dprint(DBG_BNODE_REFS, "put_node(%d:%d): %d\n", node->tree->cnid, node->this, atomic_read(&node->refcnt)); - if (!atomic_read(&node->refcnt)) - BUG(); + BUG_ON(!atomic_read(&node->refcnt)); if (!atomic_dec_and_lock(&node->refcnt, &tree->hash_lock)) return; for (i = 0; i < tree->pages_per_bnode; i++) { diff --git a/fs/hfsplus/btree.c b/fs/hfsplus/btree.c index a67edfa34e9..effa8991999 100644 --- a/fs/hfsplus/btree.c +++ b/fs/hfsplus/btree.c @@ -269,8 +269,7 @@ void hfs_bmap_free(struct hfs_bnode *node) u8 *data, byte, m; dprint(DBG_BNODE_MOD, "btree_free_node: %u\n", node->this); - if (!node->this) - BUG(); + BUG_ON(!node->this); tree = node->tree; nidx = node->this; node = hfs_bnode_find(tree, 0); diff --git a/fs/hppfs/hppfs_kern.c b/fs/hppfs/hppfs_kern.c index 2ba20cdb5ba..5e6363be246 100644 --- a/fs/hppfs/hppfs_kern.c +++ b/fs/hppfs/hppfs_kern.c @@ -216,10 +216,10 @@ static struct dentry *hppfs_lookup(struct inode *ino, struct dentry *dentry, static struct inode_operations hppfs_file_iops = { }; -static ssize_t read_proc(struct file *file, char *buf, ssize_t count, +static ssize_t read_proc(struct file *file, char __user *buf, ssize_t count, loff_t *ppos, int is_user) { - ssize_t (*read)(struct file *, char *, size_t, loff_t *); + ssize_t (*read)(struct file *, char __user *, size_t, loff_t *); ssize_t n; read = file->f_dentry->d_inode->i_fop->read; @@ -236,7 +236,7 @@ static ssize_t read_proc(struct file *file, char *buf, ssize_t count, return n; } -static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count) +static ssize_t hppfs_read_file(int fd, char __user *buf, ssize_t count) { ssize_t n; int cur, err; @@ -274,7 +274,7 @@ static ssize_t hppfs_read_file(int fd, char *buf, ssize_t count) return n; } -static ssize_t hppfs_read(struct file *file, char *buf, size_t count, +static ssize_t hppfs_read(struct file *file, char __user *buf, size_t count, loff_t *ppos) { struct hppfs_private *hppfs = file->private_data; @@ -313,12 +313,12 @@ static ssize_t hppfs_read(struct file *file, char *buf, size_t count, return(count); } -static ssize_t hppfs_write(struct file *file, const char *buf, size_t len, +static ssize_t hppfs_write(struct file *file, const char __user *buf, size_t len, loff_t *ppos) { struct hppfs_private *data = file->private_data; struct file *proc_file = data->proc_file; - ssize_t (*write)(struct file *, const char *, size_t, loff_t *); + ssize_t (*write)(struct file *, const char __user *, size_t, loff_t *); int err; write = proc_file->f_dentry->d_inode->i_fop->write; @@ -658,7 +658,7 @@ static struct super_operations hppfs_sbops = { .statfs = hppfs_statfs, }; -static int hppfs_readlink(struct dentry *dentry, char *buffer, int buflen) +static int hppfs_readlink(struct dentry *dentry, char __user *buffer, int buflen) { struct file *proc_file; struct dentry *proc_dentry; diff --git a/fs/inode.c b/fs/inode.c index 32b7c337502..3a2446a27d2 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -172,8 +172,7 @@ static struct inode *alloc_inode(struct super_block *sb) void destroy_inode(struct inode *inode) { - if (inode_has_buffers(inode)) - BUG(); + BUG_ON(inode_has_buffers(inode)); security_inode_free(inode); if (inode->i_sb->s_op->destroy_inode) inode->i_sb->s_op->destroy_inode(inode); @@ -249,12 +248,9 @@ void clear_inode(struct inode *inode) might_sleep(); invalidate_inode_buffers(inode); - if (inode->i_data.nrpages) - BUG(); - if (!(inode->i_state & I_FREEING)) - BUG(); - if (inode->i_state & I_CLEAR) - BUG(); + BUG_ON(inode->i_data.nrpages); + BUG_ON(!(inode->i_state & I_FREEING)); + BUG_ON(inode->i_state & I_CLEAR); wait_on_inode(inode); DQUOT_DROP(inode); if (inode->i_sb && inode->i_sb->s_op->clear_inode) @@ -1054,8 +1050,7 @@ void generic_delete_inode(struct inode *inode) hlist_del_init(&inode->i_hash); spin_unlock(&inode_lock); wake_up_inode(inode); - if (inode->i_state != I_CLEAR) - BUG(); + BUG_ON(inode->i_state != I_CLEAR); destroy_inode(inode); } diff --git a/fs/inotify.c b/fs/inotify.c index 367c487c014..1f50302849c 100644 --- a/fs/inotify.c +++ b/fs/inotify.c @@ -538,7 +538,7 @@ void inotify_d_instantiate(struct dentry *entry, struct inode *inode) WARN_ON(entry->d_flags & DCACHE_INOTIFY_PARENT_WATCHED); spin_lock(&entry->d_lock); parent = entry->d_parent; - if (inotify_inode_watched(parent->d_inode)) + if (parent->d_inode && inotify_inode_watched(parent->d_inode)) entry->d_flags |= DCACHE_INOTIFY_PARENT_WATCHED; spin_unlock(&entry->d_lock); } diff --git a/fs/jffs2/background.c b/fs/jffs2/background.c index 7b77a954112..ff2a872e80e 100644 --- a/fs/jffs2/background.c +++ b/fs/jffs2/background.c @@ -35,8 +35,7 @@ int jffs2_start_garbage_collect_thread(struct jffs2_sb_info *c) pid_t pid; int ret = 0; - if (c->gc_task) - BUG(); + BUG_ON(c->gc_task); init_completion(&c->gc_thread_start); init_completion(&c->gc_thread_exit); diff --git a/fs/locks.c b/fs/locks.c index 4d9e71d43e7..dda83d6cd48 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -168,18 +168,9 @@ static void locks_release_private(struct file_lock *fl) /* Free a lock which is not in use. */ static void locks_free_lock(struct file_lock *fl) { - if (fl == NULL) { - BUG(); - return; - } - if (waitqueue_active(&fl->fl_wait)) - panic("Attempting to free lock with active wait queue"); - - if (!list_empty(&fl->fl_block)) - panic("Attempting to free lock with active block list"); - - if (!list_empty(&fl->fl_link)) - panic("Attempting to free lock on active lock list"); + BUG_ON(waitqueue_active(&fl->fl_wait)); + BUG_ON(!list_empty(&fl->fl_block)); + BUG_ON(!list_empty(&fl->fl_link)); locks_release_private(fl); kmem_cache_free(filelock_cache, fl); @@ -735,8 +726,9 @@ EXPORT_SYMBOL(posix_locks_deadlock); * at the head of the list, but that's secret knowledge known only to * flock_lock_file and posix_lock_file. */ -static int flock_lock_file(struct file *filp, struct file_lock *new_fl) +static int flock_lock_file(struct file *filp, struct file_lock *request) { + struct file_lock *new_fl = NULL; struct file_lock **before; struct inode * inode = filp->f_dentry->d_inode; int error = 0; @@ -751,17 +743,19 @@ static int flock_lock_file(struct file *filp, struct file_lock *new_fl) continue; if (filp != fl->fl_file) continue; - if (new_fl->fl_type == fl->fl_type) + if (request->fl_type == fl->fl_type) goto out; found = 1; locks_delete_lock(before); break; } - unlock_kernel(); - if (new_fl->fl_type == F_UNLCK) - return 0; + if (request->fl_type == F_UNLCK) + goto out; + new_fl = locks_alloc_lock(); + if (new_fl == NULL) + goto out; /* * If a higher-priority process was blocked on the old file lock, * give it the opportunity to lock the file. @@ -769,26 +763,27 @@ static int flock_lock_file(struct file *filp, struct file_lock *new_fl) if (found) cond_resched(); - lock_kernel(); for_each_lock(inode, before) { struct file_lock *fl = *before; if (IS_POSIX(fl)) break; if (IS_LEASE(fl)) continue; - if (!flock_locks_conflict(new_fl, fl)) + if (!flock_locks_conflict(request, fl)) continue; error = -EAGAIN; - if (new_fl->fl_flags & FL_SLEEP) { - locks_insert_block(fl, new_fl); - } + if (request->fl_flags & FL_SLEEP) + locks_insert_block(fl, request); goto out; } + locks_copy_lock(new_fl, request); locks_insert_lock(&inode->i_flock, new_fl); - error = 0; + new_fl = NULL; out: unlock_kernel(); + if (new_fl) + locks_free_lock(new_fl); return error; } @@ -1569,9 +1564,7 @@ asmlinkage long sys_flock(unsigned int fd, unsigned int cmd) error = flock_lock_file_wait(filp, lock); out_free: - if (list_empty(&lock->fl_link)) { - locks_free_lock(lock); - } + locks_free_lock(lock); out_putf: fput(filp); diff --git a/fs/msdos/namei.c b/fs/msdos/namei.c index 626a367bcd8..5b76ccd19e3 100644 --- a/fs/msdos/namei.c +++ b/fs/msdos/namei.c @@ -12,14 +12,6 @@ #include <linux/msdos_fs.h> #include <linux/smp_lock.h> -/* MS-DOS "device special files" */ -static const unsigned char *reserved_names[] = { - "CON ", "PRN ", "NUL ", "AUX ", - "LPT1 ", "LPT2 ", "LPT3 ", "LPT4 ", - "COM1 ", "COM2 ", "COM3 ", "COM4 ", - NULL -}; - /* Characters that are undesirable in an MS-DOS file name */ static unsigned char bad_chars[] = "*?<>|\""; static unsigned char bad_if_strict_pc[] = "+=,; "; @@ -40,7 +32,6 @@ static int msdos_format_name(const unsigned char *name, int len, */ { unsigned char *walk; - const unsigned char **reserved; unsigned char c; int space; @@ -127,11 +118,7 @@ static int msdos_format_name(const unsigned char *name, int len, } while (walk - res < MSDOS_NAME) *walk++ = ' '; - if (!opts->atari) - /* GEMDOS is less stupid and has no reserved names */ - for (reserved = reserved_names; *reserved; reserved++) - if (!strncmp(res, *reserved, 8)) - return -EINVAL; + return 0; } diff --git a/fs/namei.c b/fs/namei.c index 22f6e8d16aa..96723ae83c8 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1254,7 +1254,7 @@ out: return dentry; } -struct dentry * lookup_hash(struct nameidata *nd) +static struct dentry *lookup_hash(struct nameidata *nd) { return __lookup_hash(&nd->last, nd->dentry, nd); } @@ -2697,7 +2697,6 @@ EXPORT_SYMBOL(follow_up); EXPORT_SYMBOL(get_write_access); /* binfmt_aout */ EXPORT_SYMBOL(getname); EXPORT_SYMBOL(lock_rename); -EXPORT_SYMBOL(lookup_hash); EXPORT_SYMBOL(lookup_one_len); EXPORT_SYMBOL(page_follow_link_light); EXPORT_SYMBOL(page_put_link); diff --git a/fs/namespace.c b/fs/namespace.c index bf478addb85..2c5f1f80bdc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -899,11 +899,13 @@ static int do_change_type(struct nameidata *nd, int flag) /* * do loopback mount. */ -static int do_loopback(struct nameidata *nd, char *old_name, int recurse) +static int do_loopback(struct nameidata *nd, char *old_name, unsigned long flags, int mnt_flags) { struct nameidata old_nd; struct vfsmount *mnt = NULL; + int recurse = flags & MS_REC; int err = mount_is_safe(nd); + if (err) return err; if (!old_name || !*old_name) @@ -937,6 +939,7 @@ static int do_loopback(struct nameidata *nd, char *old_name, int recurse) spin_unlock(&vfsmount_lock); release_mounts(&umount_list); } + mnt->mnt_flags = mnt_flags; out: up_write(&namespace_sem); @@ -1350,7 +1353,7 @@ long do_mount(char *dev_name, char *dir_name, char *type_page, retval = do_remount(&nd, flags & ~MS_REMOUNT, mnt_flags, data_page); else if (flags & MS_BIND) - retval = do_loopback(&nd, dev_name, flags & MS_REC); + retval = do_loopback(&nd, dev_name, flags, mnt_flags); else if (flags & (MS_SHARED | MS_PRIVATE | MS_SLAVE | MS_UNBINDABLE)) retval = do_change_type(&nd, flags); else if (flags & MS_MOVE) diff --git a/fs/nfsd/auth.c b/fs/nfsd/auth.c index cfe9ce88161..6e92b0fe532 100644 --- a/fs/nfsd/auth.c +++ b/fs/nfsd/auth.c @@ -14,46 +14,46 @@ int nfsd_setuser(struct svc_rqst *rqstp, struct svc_export *exp) { - struct svc_cred *cred = &rqstp->rq_cred; + struct svc_cred cred = rqstp->rq_cred; int i; int ret; if (exp->ex_flags & NFSEXP_ALLSQUASH) { - cred->cr_uid = exp->ex_anon_uid; - cred->cr_gid = exp->ex_anon_gid; - put_group_info(cred->cr_group_info); - cred->cr_group_info = groups_alloc(0); + cred.cr_uid = exp->ex_anon_uid; + cred.cr_gid = exp->ex_anon_gid; + cred.cr_group_info = groups_alloc(0); } else if (exp->ex_flags & NFSEXP_ROOTSQUASH) { struct group_info *gi; - if (!cred->cr_uid) - cred->cr_uid = exp->ex_anon_uid; - if (!cred->cr_gid) - cred->cr_gid = exp->ex_anon_gid; - gi = groups_alloc(cred->cr_group_info->ngroups); + if (!cred.cr_uid) + cred.cr_uid = exp->ex_anon_uid; + if (!cred.cr_gid) + cred.cr_gid = exp->ex_anon_gid; + gi = groups_alloc(cred.cr_group_info->ngroups); if (gi) - for (i = 0; i < cred->cr_group_info->ngroups; i++) { - if (!GROUP_AT(cred->cr_group_info, i)) + for (i = 0; i < cred.cr_group_info->ngroups; i++) { + if (!GROUP_AT(cred.cr_group_info, i)) GROUP_AT(gi, i) = exp->ex_anon_gid; else - GROUP_AT(gi, i) = GROUP_AT(cred->cr_group_info, i); + GROUP_AT(gi, i) = GROUP_AT(cred.cr_group_info, i); } - put_group_info(cred->cr_group_info); - cred->cr_group_info = gi; - } + cred.cr_group_info = gi; + } else + get_group_info(cred.cr_group_info); - if (cred->cr_uid != (uid_t) -1) - current->fsuid = cred->cr_uid; + if (cred.cr_uid != (uid_t) -1) + current->fsuid = cred.cr_uid; else current->fsuid = exp->ex_anon_uid; - if (cred->cr_gid != (gid_t) -1) - current->fsgid = cred->cr_gid; + if (cred.cr_gid != (gid_t) -1) + current->fsgid = cred.cr_gid; else current->fsgid = exp->ex_anon_gid; - if (!cred->cr_group_info) + if (!cred.cr_group_info) return -ENOMEM; - ret = set_current_groups(cred->cr_group_info); - if ((cred->cr_uid)) { + ret = set_current_groups(cred.cr_group_info); + put_group_info(cred.cr_group_info); + if ((cred.cr_uid)) { cap_t(current->cap_effective) &= ~CAP_NFSD_MASK; } else { cap_t(current->cap_effective) |= (CAP_NFSD_MASK & diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index c340be0a3f5..4e0578121d9 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -422,7 +422,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) if ((len=qword_get(&mesg, buf, PAGE_SIZE)) <= 0) goto out; err = path_lookup(buf, 0, &nd); - if (err) goto out; + if (err) goto out_no_path; exp.h.flags = 0; exp.ex_client = dom; @@ -475,6 +475,7 @@ static int svc_export_parse(struct cache_detail *cd, char *mesg, int mlen) out: if (nd.dentry) path_release(&nd); + out_no_path: if (dom) auth_domain_put(dom); kfree(buf); diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 6d2dfed1de0..f61142afea4 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -682,7 +682,7 @@ static struct svc_procedure nfsd_procedures3[22] = { PROC(lookup, dirop, dirop, fhandle2, RC_NOCACHE, ST+FH+pAT+pAT), PROC(access, access, access, fhandle, RC_NOCACHE, ST+pAT+1), PROC(readlink, readlink, readlink, fhandle, RC_NOCACHE, ST+pAT+1+NFS3_MAXPATHLEN/4), - PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE), + PROC(read, read, read, fhandle, RC_NOCACHE, ST+pAT+4+NFSSVC_MAXBLKSIZE/4), PROC(write, write, write, fhandle, RC_REPLBUFF, ST+WC+4), PROC(create, create, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), PROC(mkdir, mkdir, create, fhandle2, RC_REPLBUFF, ST+(1+FH+pAT)+WC), diff --git a/fs/nfsd/nfs4acl.c b/fs/nfsd/nfs4acl.c index 7391f4aabed..edb107e61b9 100644 --- a/fs/nfsd/nfs4acl.c +++ b/fs/nfsd/nfs4acl.c @@ -710,9 +710,9 @@ calculate_posix_ace_count(struct nfs4_acl *n4acl) /* Also, the remaining entries are for named users and * groups, and come in threes (mask, allow, deny): */ if (n4acl->naces < 7) - return -1; + return -EINVAL; if ((n4acl->naces - 7) % 3) - return -1; + return -EINVAL; return 4 + (n4acl->naces - 7)/3; } } @@ -790,7 +790,7 @@ nfs4_acl_split(struct nfs4_acl *acl, struct nfs4_acl *dacl) continue; error = nfs4_acl_add_ace(dacl, ace->type, ace->flag, - ace->access_mask, ace->whotype, ace->who) == -1; + ace->access_mask, ace->whotype, ace->who); if (error < 0) goto out; @@ -866,7 +866,7 @@ nfs4_acl_add_ace(struct nfs4_acl *acl, u32 type, u32 flag, u32 access_mask, struct nfs4_ace *ace; if ((ace = kmalloc(sizeof(*ace), GFP_KERNEL)) == NULL) - return -1; + return -ENOMEM; ace->type = type; ace->flag = flag; diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index c872bd07fc1..dbaf3f93f32 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -441,8 +441,9 @@ nfsd4_probe_callback(struct nfs4_client *clp) goto out_clnt; } - /* the task holds a reference to the nfs4_client struct */ cb->cb_client = clnt; + + /* the task holds a reference to the nfs4_client struct */ atomic_inc(&clp->cl_count); msg.rpc_cred = nfsd4_lookupcred(clp,0); @@ -460,13 +461,12 @@ nfsd4_probe_callback(struct nfs4_client *clp) out_rpciod: atomic_dec(&clp->cl_count); rpciod_down(); + cb->cb_client = NULL; out_clnt: rpc_shutdown_client(clnt); - goto out_err; out_err: dprintk("NFSD: warning: no callback path to client %.*s\n", (int)clp->cl_name.len, clp->cl_name.data); - cb->cb_client = NULL; } static void diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 6d63f1d9e5f..b0e095ea0c0 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -288,8 +288,6 @@ nfsd4_putrootfh(struct svc_rqst *rqstp, struct svc_fh *current_fh) fh_put(current_fh); status = exp_pseudoroot(rqstp->rq_client, current_fh, &rqstp->rq_chandle); - if (!status) - status = nfserrno(nfsd_setuser(rqstp, current_fh->fh_export)); return status; } @@ -975,7 +973,7 @@ struct nfsd4_voidargs { int dummy; }; */ static struct svc_procedure nfsd_procedures4[2] = { PROC(null, void, void, void, RC_NOCACHE, 1), - PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE) + PROC(compound, compound, compound, compound, RC_NOCACHE, NFSD_BUFSIZE/4) }; struct svc_version nfsd_version4 = { diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 47ec112b266..96c7578cbe1 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -147,6 +147,42 @@ get_nfs4_file(struct nfs4_file *fi) kref_get(&fi->fi_ref); } +static int num_delegations; + +/* + * Open owner state (share locks) + */ + +/* hash tables for nfs4_stateowner */ +#define OWNER_HASH_BITS 8 +#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) +#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) + +#define ownerid_hashval(id) \ + ((id) & OWNER_HASH_MASK) +#define ownerstr_hashval(clientid, ownername) \ + (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) + +static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; +static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; + +/* hash table for nfs4_file */ +#define FILE_HASH_BITS 8 +#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) +#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) +/* hash table for (open)nfs4_stateid */ +#define STATEID_HASH_BITS 10 +#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) +#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) + +#define file_hashval(x) \ + hash_ptr(x, FILE_HASH_BITS) +#define stateid_hashval(owner_id, file_id) \ + (((owner_id) + (file_id)) & STATEID_HASH_MASK) + +static struct list_head file_hashtbl[FILE_HASH_SIZE]; +static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; + static struct nfs4_delegation * alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_fh *current_fh, u32 type) { @@ -155,9 +191,12 @@ alloc_init_deleg(struct nfs4_client *clp, struct nfs4_stateid *stp, struct svc_f struct nfs4_callback *cb = &stp->st_stateowner->so_client->cl_callback; dprintk("NFSD alloc_init_deleg\n"); + if (num_delegations > STATEID_HASH_SIZE * 4) + return NULL; dp = kmem_cache_alloc(deleg_slab, GFP_KERNEL); if (dp == NULL) return dp; + num_delegations++; INIT_LIST_HEAD(&dp->dl_perfile); INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); @@ -192,6 +231,7 @@ nfs4_put_delegation(struct nfs4_delegation *dp) dprintk("NFSD: freeing dp %p\n",dp); put_nfs4_file(dp->dl_file); kmem_cache_free(deleg_slab, dp); + num_delegations--; } } @@ -330,22 +370,29 @@ put_nfs4_client(struct nfs4_client *clp) } static void +shutdown_callback_client(struct nfs4_client *clp) +{ + struct rpc_clnt *clnt = clp->cl_callback.cb_client; + + /* shutdown rpc client, ending any outstanding recall rpcs */ + if (clnt) { + clp->cl_callback.cb_client = NULL; + rpc_shutdown_client(clnt); + rpciod_down(); + } +} + +static void expire_client(struct nfs4_client *clp) { struct nfs4_stateowner *sop; struct nfs4_delegation *dp; - struct nfs4_callback *cb = &clp->cl_callback; - struct rpc_clnt *clnt = clp->cl_callback.cb_client; struct list_head reaplist; dprintk("NFSD: expire_client cl_count %d\n", atomic_read(&clp->cl_count)); - /* shutdown rpc client, ending any outstanding recall rpcs */ - if (atomic_read(&cb->cb_set) == 1 && clnt) { - rpc_shutdown_client(clnt); - clnt = clp->cl_callback.cb_client = NULL; - } + shutdown_callback_client(clp); INIT_LIST_HEAD(&reaplist); spin_lock(&recall_lock); @@ -936,40 +983,6 @@ out: return status; } -/* - * Open owner state (share locks) - */ - -/* hash tables for nfs4_stateowner */ -#define OWNER_HASH_BITS 8 -#define OWNER_HASH_SIZE (1 << OWNER_HASH_BITS) -#define OWNER_HASH_MASK (OWNER_HASH_SIZE - 1) - -#define ownerid_hashval(id) \ - ((id) & OWNER_HASH_MASK) -#define ownerstr_hashval(clientid, ownername) \ - (((clientid) + opaque_hashval((ownername.data), (ownername.len))) & OWNER_HASH_MASK) - -static struct list_head ownerid_hashtbl[OWNER_HASH_SIZE]; -static struct list_head ownerstr_hashtbl[OWNER_HASH_SIZE]; - -/* hash table for nfs4_file */ -#define FILE_HASH_BITS 8 -#define FILE_HASH_SIZE (1 << FILE_HASH_BITS) -#define FILE_HASH_MASK (FILE_HASH_SIZE - 1) -/* hash table for (open)nfs4_stateid */ -#define STATEID_HASH_BITS 10 -#define STATEID_HASH_SIZE (1 << STATEID_HASH_BITS) -#define STATEID_HASH_MASK (STATEID_HASH_SIZE - 1) - -#define file_hashval(x) \ - hash_ptr(x, FILE_HASH_BITS) -#define stateid_hashval(owner_id, file_id) \ - (((owner_id) + (file_id)) & STATEID_HASH_MASK) - -static struct list_head file_hashtbl[FILE_HASH_SIZE]; -static struct list_head stateid_hashtbl[STATEID_HASH_SIZE]; - /* OPEN Share state helper functions */ static inline struct nfs4_file * alloc_init_file(struct inode *ino) @@ -1186,8 +1199,7 @@ move_to_close_lru(struct nfs4_stateowner *sop) { dprintk("NFSD: move_to_close_lru nfs4_stateowner %p\n", sop); - unhash_stateowner(sop); - list_add_tail(&sop->so_close_lru, &close_lru); + list_move_tail(&sop->so_close_lru, &close_lru); sop->so_time = get_seconds(); } @@ -1916,8 +1928,7 @@ nfs4_laundromat(void) } dprintk("NFSD: purging unused open stateowner (so_id %d)\n", sop->so_id); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); + release_stateowner(sop); } if (clientid_val < NFSD_LAUNDROMAT_MINTIMEOUT) clientid_val = NFSD_LAUNDROMAT_MINTIMEOUT; @@ -2495,36 +2506,27 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -static int -nfs4_verify_lock_stateowner(struct nfs4_stateowner *sop, unsigned int hashval) -{ - struct nfs4_stateowner *local = NULL; - int status = 0; - - if (hashval >= LOCK_HASH_SIZE) - goto out; - list_for_each_entry(local, &lock_ownerid_hashtbl[hashval], so_idhash) { - if (local == sop) { - status = 1; - goto out; - } - } -out: - return status; -} - +/* Hack!: For now, we're defining this just so we can use a pointer to it + * as a unique cookie to identify our (NFSv4's) posix locks. */ +static struct lock_manager_operations nfsd_posix_mng_ops = { +}; static inline void nfs4_set_lock_denied(struct file_lock *fl, struct nfsd4_lock_denied *deny) { - struct nfs4_stateowner *sop = (struct nfs4_stateowner *) fl->fl_owner; - unsigned int hval = lockownerid_hashval(sop->so_id); + struct nfs4_stateowner *sop; + unsigned int hval; - deny->ld_sop = NULL; - if (nfs4_verify_lock_stateowner(sop, hval)) { + if (fl->fl_lmops == &nfsd_posix_mng_ops) { + sop = (struct nfs4_stateowner *) fl->fl_owner; + hval = lockownerid_hashval(sop->so_id); kref_get(&sop->so_ref); deny->ld_sop = sop; deny->ld_clientid = sop->so_client->cl_clientid; + } else { + deny->ld_sop = NULL; + deny->ld_clientid.cl_boot = 0; + deny->ld_clientid.cl_id = 0; } deny->ld_start = fl->fl_start; deny->ld_length = ~(u64)0; @@ -2736,6 +2738,7 @@ nfsd4_lock(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lock->lk_offset; if ((lock->lk_length == ~(u64)0) || @@ -2841,6 +2844,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_owner = (fl_owner_t)lockt->lt_stateowner; file_lock.fl_pid = current->tgid; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = lockt->lt_offset; if ((lockt->lt_length == ~(u64)0) || LOFF_OVERFLOW(lockt->lt_offset, lockt->lt_length)) @@ -2900,6 +2904,7 @@ nfsd4_locku(struct svc_rqst *rqstp, struct svc_fh *current_fh, struct nfsd4_lock file_lock.fl_pid = current->tgid; file_lock.fl_file = filp; file_lock.fl_flags = FL_POSIX; + file_lock.fl_lmops = &nfsd_posix_mng_ops; file_lock.fl_start = locku->lu_offset; if ((locku->lu_length == ~(u64)0) || LOFF_OVERFLOW(locku->lu_offset, locku->lu_length)) @@ -3211,15 +3216,8 @@ __nfs4_state_shutdown(void) int i; struct nfs4_client *clp = NULL; struct nfs4_delegation *dp = NULL; - struct nfs4_stateowner *sop = NULL; struct list_head *pos, *next, reaplist; - list_for_each_safe(pos, next, &close_lru) { - sop = list_entry(pos, struct nfs4_stateowner, so_close_lru); - list_del(&sop->so_close_lru); - nfs4_put_stateowner(sop); - } - for (i = 0; i < CLIENT_HASH_SIZE; i++) { while (!list_empty(&conf_id_hashtbl[i])) { clp = list_entry(conf_id_hashtbl[i].next, struct nfs4_client, cl_idhash); @@ -3244,8 +3242,6 @@ __nfs4_state_shutdown(void) } cancel_delayed_work(&laundromat_work); - flush_workqueue(laundry_wq); - destroy_workqueue(laundry_wq); nfsd4_shutdown_recdir(); nfs4_init = 0; } @@ -3253,6 +3249,8 @@ __nfs4_state_shutdown(void) void nfs4_state_shutdown(void) { + cancel_rearming_delayed_workqueue(laundry_wq, &laundromat_work); + destroy_workqueue(laundry_wq); nfs4_lock_state(); nfs4_release_reclaim(); __nfs4_state_shutdown(); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 03857fd8112..de3998f15f1 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -299,11 +299,10 @@ nfsd4_decode_fattr(struct nfsd4_compoundargs *argp, u32 *bmval, struct iattr *ia buf, dummy32, &ace.who); if (status) goto out_nfserr; - if (nfs4_acl_add_ace(*acl, ace.type, ace.flag, - ace.access_mask, ace.whotype, ace.who) != 0) { - status = -ENOMEM; + status = nfs4_acl_add_ace(*acl, ace.type, ace.flag, + ace.access_mask, ace.whotype, ace.who); + if (status) goto out_nfserr; - } } } else *acl = NULL; @@ -2085,27 +2084,20 @@ nfsd4_encode_read(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_read WRITE32(eof); WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; - + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; resp->xbuf->page_len = maxcount; - /* read zero bytes -> don't set up tail */ - if(!maxcount) - return 0; - - /* set up page for remaining responses */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2142,21 +2134,20 @@ nfsd4_encode_readlink(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_r WRITE32(maxcount); ADJUST_ARGS(); - resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->head[0].iov_len = (char*)p + - (char*)resp->xbuf->head[0].iov_base; + resp->xbuf->page_len = maxcount; - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = p; resp->xbuf->tail[0].iov_len = 0; - resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; - - resp->xbuf->page_len = maxcount; if (maxcount&3) { - *(resp->p)++ = 0; + RESERVE_SPACE(4); + WRITE32(0); resp->xbuf->tail[0].iov_base += maxcount&3; resp->xbuf->tail[0].iov_len = 4 - (maxcount&3); + ADJUST_ARGS(); } return 0; } @@ -2166,7 +2157,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re { int maxcount; loff_t offset; - u32 *page, *savep; + u32 *page, *savep, *tailbase; ENCODE_HEAD; if (nfserr) @@ -2182,6 +2173,7 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re WRITE32(0); ADJUST_ARGS(); resp->xbuf->head[0].iov_len = ((char*)resp->p) - (char*)resp->xbuf->head[0].iov_base; + tailbase = p; maxcount = PAGE_SIZE; if (maxcount > readdir->rd_maxcount) @@ -2226,14 +2218,12 @@ nfsd4_encode_readdir(struct nfsd4_compoundres *resp, int nfserr, struct nfsd4_re *p++ = htonl(readdir->common.err == nfserr_eof); resp->xbuf->page_len = ((char*)p) - (char*)page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - /* allocate a page for the tail */ - svc_take_page(resp->rqstp); - resp->xbuf->tail[0].iov_base = - page_address(resp->rqstp->rq_respages[resp->rqstp->rq_resused-1]); - resp->rqstp->rq_restailpage = resp->rqstp->rq_resused-1; + /* Use rest of head for padding and remaining ops: */ + resp->rqstp->rq_restailpage = 0; + resp->xbuf->tail[0].iov_base = tailbase; resp->xbuf->tail[0].iov_len = 0; resp->p = resp->xbuf->tail[0].iov_base; - resp->end = resp->p + PAGE_SIZE/4; + resp->end = resp->p + (PAGE_SIZE - resp->xbuf->head[0].iov_len)/4; return 0; err_no_verf: diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 3e6b75cd90f..06cd0db0f32 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -553,7 +553,7 @@ static struct svc_procedure nfsd_procedures2[18] = { PROC(none, void, void, none, RC_NOCACHE, ST), PROC(lookup, diropargs, diropres, fhandle, RC_NOCACHE, ST+FH+AT), PROC(readlink, readlinkargs, readlinkres, none, RC_NOCACHE, ST+1+NFS_MAXPATHLEN/4), - PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE), + PROC(read, readargs, readres, fhandle, RC_NOCACHE, ST+AT+1+NFSSVC_MAXBLKSIZE/4), PROC(none, void, void, none, RC_NOCACHE, ST), PROC(write, writeargs, attrstat, fhandle, RC_REPLBUFF, ST+AT), PROC(create, createargs, diropres, fhandle, RC_REPLBUFF, ST+FH+AT), diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 31018333dc3..6aa92d0e687 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -371,7 +371,6 @@ out_nfserr: static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) { ssize_t buflen; - int error; buflen = vfs_getxattr(dentry, key, NULL, 0); if (buflen <= 0) @@ -381,10 +380,7 @@ static ssize_t nfsd_getxattr(struct dentry *dentry, char *key, void **buf) if (!*buf) return -ENOMEM; - error = vfs_getxattr(dentry, key, *buf, buflen); - if (error < 0) - return error; - return buflen; + return vfs_getxattr(dentry, key, *buf, buflen); } #endif diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index bff0f0d0686..21f38accd03 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -153,6 +153,7 @@ struct o2hb_region { struct o2hb_bio_wait_ctxt { atomic_t wc_num_reqs; struct completion wc_io_complete; + int wc_error; }; static void o2hb_write_timeout(void *arg) @@ -186,6 +187,7 @@ static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc, { atomic_set(&wc->wc_num_reqs, num_ios); init_completion(&wc->wc_io_complete); + wc->wc_error = 0; } /* Used in error paths too */ @@ -218,8 +220,10 @@ static int o2hb_bio_end_io(struct bio *bio, { struct o2hb_bio_wait_ctxt *wc = bio->bi_private; - if (error) + if (error) { mlog(ML_ERROR, "IO Error %d\n", error); + wc->wc_error = error; + } if (bio->bi_size) return 1; @@ -390,6 +394,8 @@ static int o2hb_read_slots(struct o2hb_region *reg, bail_and_wait: o2hb_wait_on_io(reg, &wc); + if (wc.wc_error && !status) + status = wc.wc_error; if (bios) { for(i = 0; i < num_bios; i++) @@ -790,20 +796,24 @@ static int o2hb_highest_node(unsigned long *nodes, return highest; } -static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) +static int o2hb_do_disk_heartbeat(struct o2hb_region *reg) { int i, ret, highest_node, change = 0; unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)]; struct bio *write_bio; struct o2hb_bio_wait_ctxt write_wc; - if (o2nm_configured_node_map(configured_nodes, sizeof(configured_nodes))) - return; + ret = o2nm_configured_node_map(configured_nodes, + sizeof(configured_nodes)); + if (ret) { + mlog_errno(ret); + return ret; + } highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES); if (highest_node >= O2NM_MAX_NODES) { mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n"); - return; + return -EINVAL; } /* No sense in reading the slots of nodes that don't exist @@ -813,7 +823,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_read_slots(reg, highest_node + 1); if (ret < 0) { mlog_errno(ret); - return; + return ret; } /* With an up to date view of the slots, we can check that no @@ -831,7 +841,7 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) ret = o2hb_issue_node_write(reg, &write_bio, &write_wc); if (ret < 0) { mlog_errno(ret); - return; + return ret; } i = -1; @@ -847,6 +857,15 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) */ o2hb_wait_on_io(reg, &write_wc); bio_put(write_bio); + if (write_wc.wc_error) { + /* Do not re-arm the write timeout on I/O error - we + * can't be sure that the new block ever made it to + * disk */ + mlog(ML_ERROR, "Write error %d on device \"%s\"\n", + write_wc.wc_error, reg->hr_dev_name); + return write_wc.wc_error; + } + o2hb_arm_write_timeout(reg); /* let the person who launched us know when things are steady */ @@ -854,6 +873,8 @@ static void o2hb_do_disk_heartbeat(struct o2hb_region *reg) if (atomic_dec_and_test(®->hr_steady_iterations)) wake_up(&o2hb_steady_queue); } + + return 0; } /* Subtract b from a, storing the result in a. a *must* have a larger @@ -913,7 +934,10 @@ static int o2hb_thread(void *data) * likely to time itself out. */ do_gettimeofday(&before_hb); - o2hb_do_disk_heartbeat(reg); + i = 0; + do { + ret = o2hb_do_disk_heartbeat(reg); + } while (ret && ++i < 2); do_gettimeofday(&after_hb); elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb); diff --git a/fs/ocfs2/dlm/userdlm.c b/fs/ocfs2/dlm/userdlm.c index c3764f4744e..74ca4e5f976 100644 --- a/fs/ocfs2/dlm/userdlm.c +++ b/fs/ocfs2/dlm/userdlm.c @@ -139,6 +139,10 @@ static void user_ast(void *opaque) return; } + mlog_bug_on_msg(lockres->l_requested == LKM_IVMODE, + "Lockres %s, requested ivmode. flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* we're downconverting. */ if (lockres->l_requested < lockres->l_level) { if (lockres->l_requested <= @@ -229,23 +233,42 @@ static void user_unlock_ast(void *opaque, enum dlm_status status) mlog(0, "UNLOCK AST called on lock %s\n", lockres->l_name); - if (status != DLM_NORMAL) + if (status != DLM_NORMAL && status != DLM_CANCELGRANT) mlog(ML_ERROR, "Dlm returns status %d\n", status); spin_lock(&lockres->l_lock); - if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) + /* The teardown flag gets set early during the unlock process, + * so test the cancel flag to make sure that this ast isn't + * for a concurrent cancel. */ + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN + && !(lockres->l_flags & USER_LOCK_IN_CANCEL)) { lockres->l_level = LKM_IVMODE; - else { + } else if (status == DLM_CANCELGRANT) { + mlog(0, "Lock %s, cancel fails, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + /* We tried to cancel a convert request, but it was + * already granted. Don't clear the busy flag - the + * ast should've done this already. */ + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + lockres->l_flags &= ~USER_LOCK_IN_CANCEL; + goto out_noclear; + } else { + BUG_ON(!(lockres->l_flags & USER_LOCK_IN_CANCEL)); + /* Cancel succeeded, we want to re-queue */ + mlog(0, "Lock %s, cancel succeeds, flags 0x%x\n", + lockres->l_name, lockres->l_flags); lockres->l_requested = LKM_IVMODE; /* cancel an * upconvert * request. */ lockres->l_flags &= ~USER_LOCK_IN_CANCEL; /* we want the unblock thread to look at it again * now. */ - __user_dlm_queue_lockres(lockres); + if (lockres->l_flags & USER_LOCK_BLOCKED) + __user_dlm_queue_lockres(lockres); } lockres->l_flags &= ~USER_LOCK_BUSY; +out_noclear: spin_unlock(&lockres->l_lock); wake_up(&lockres->l_event); @@ -268,13 +291,26 @@ static void user_dlm_unblock_lock(void *opaque) spin_lock(&lockres->l_lock); - BUG_ON(!(lockres->l_flags & USER_LOCK_BLOCKED)); - BUG_ON(!(lockres->l_flags & USER_LOCK_QUEUED)); + mlog_bug_on_msg(!(lockres->l_flags & USER_LOCK_QUEUED), + "Lockres %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); - /* notice that we don't clear USER_LOCK_BLOCKED here. That's - * for user_ast to do. */ + /* notice that we don't clear USER_LOCK_BLOCKED here. If it's + * set, we want user_ast clear it. */ lockres->l_flags &= ~USER_LOCK_QUEUED; + /* It's valid to get here and no longer be blocked - if we get + * several basts in a row, we might be queued by the first + * one, the unblock thread might run and clear the queued + * flag, and finally we might get another bast which re-queues + * us before our ast for the downconvert is called. */ + if (!(lockres->l_flags & USER_LOCK_BLOCKED)) { + mlog(0, "Lockres %s, flags 0x%x: queued but not blocking\n", + lockres->l_name, lockres->l_flags); + spin_unlock(&lockres->l_lock); + goto drop_ref; + } + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { mlog(0, "lock is in teardown so we do nothing\n"); spin_unlock(&lockres->l_lock); @@ -282,7 +318,9 @@ static void user_dlm_unblock_lock(void *opaque) } if (lockres->l_flags & USER_LOCK_BUSY) { - mlog(0, "BUSY flag detected...\n"); + mlog(0, "Cancel lock %s, flags 0x%x\n", + lockres->l_name, lockres->l_flags); + if (lockres->l_flags & USER_LOCK_IN_CANCEL) { spin_unlock(&lockres->l_lock); goto drop_ref; @@ -296,14 +334,7 @@ static void user_dlm_unblock_lock(void *opaque) LKM_CANCEL, user_unlock_ast, lockres); - if (status == DLM_CANCELGRANT) { - /* If we got this, then the ast was fired - * before we could cancel. We cleanup our - * state, and restart the function. */ - spin_lock(&lockres->l_lock); - lockres->l_flags &= ~USER_LOCK_IN_CANCEL; - spin_unlock(&lockres->l_lock); - } else if (status != DLM_NORMAL) + if (status != DLM_NORMAL) user_log_dlm_error("dlmunlock", status, lockres); goto drop_ref; } @@ -581,6 +612,14 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) mlog(0, "asked to destroy %s\n", lockres->l_name); spin_lock(&lockres->l_lock); + if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) { + mlog(0, "Lock is already torn down\n"); + spin_unlock(&lockres->l_lock); + return 0; + } + + lockres->l_flags |= USER_LOCK_IN_TEARDOWN; + while (lockres->l_flags & USER_LOCK_BUSY) { spin_unlock(&lockres->l_lock); @@ -606,7 +645,6 @@ int user_dlm_destroy_lock(struct user_lock_res *lockres) lockres->l_flags &= ~USER_LOCK_ATTACHED; lockres->l_flags |= USER_LOCK_BUSY; - lockres->l_flags |= USER_LOCK_IN_TEARDOWN; spin_unlock(&lockres->l_lock); mlog(0, "unlocking lockres %s\n", lockres->l_name); diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 34e903a6a46..581eb451a41 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -260,6 +260,17 @@ static int ocfs2_truncate_file(struct inode *inode, if (new_i_size == le64_to_cpu(fe->i_size)) goto bail; + /* This forces other nodes to sync and drop their pages. Do + * this even if we have a truncate without allocation change - + * ocfs2 cluster sizes can be much greater than page size, so + * we have to truncate them anyway. */ + status = ocfs2_data_lock(inode, 1); + if (status < 0) { + mlog_errno(status); + goto bail; + } + ocfs2_data_unlock(inode, 1); + if (le32_to_cpu(fe->i_clusters) == ocfs2_clusters_for_bytes(osb->sb, new_i_size)) { mlog(0, "fe->i_clusters = %u, so we do a simple truncate\n", @@ -272,14 +283,6 @@ static int ocfs2_truncate_file(struct inode *inode, goto bail; } - /* This forces other nodes to sync and drop their pages */ - status = ocfs2_data_lock(inode, 1); - if (status < 0) { - mlog_errno(status); - goto bail; - } - ocfs2_data_unlock(inode, 1); - /* alright, we're going to need to do a full blown alloc size * change. Orphan the inode so that recovery can complete the * truncate if necessary. This does the task of marking diff --git a/fs/open.c b/fs/open.c index c32c89d6d8d..53ec28c3677 100644 --- a/fs/open.c +++ b/fs/open.c @@ -331,7 +331,10 @@ out: asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length) { - return do_sys_ftruncate(fd, length, 1); + long ret = do_sys_ftruncate(fd, length, 1); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } /* LFS versions of truncate are only needed on 32 bit machines */ @@ -343,7 +346,10 @@ asmlinkage long sys_truncate64(const char __user * path, loff_t length) asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length) { - return do_sys_ftruncate(fd, length, 0); + long ret = do_sys_ftruncate(fd, length, 0); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } #endif @@ -1093,20 +1099,30 @@ long do_sys_open(int dfd, const char __user *filename, int flags, int mode) asmlinkage long sys_open(const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(AT_FDCWD, filename, flags, mode); + ret = do_sys_open(AT_FDCWD, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_open); asmlinkage long sys_openat(int dfd, const char __user *filename, int flags, int mode) { + long ret; + if (force_o_largefile()) flags |= O_LARGEFILE; - return do_sys_open(dfd, filename, flags, mode); + ret = do_sys_open(dfd, filename, flags, mode); + /* avoid REGPARM breakage on x86: */ + prevent_tail_call(ret); + return ret; } EXPORT_SYMBOL_GPL(sys_openat); diff --git a/fs/partitions/check.c b/fs/partitions/check.c index af0cb4b9e78..45ae7dd3c65 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -331,7 +331,9 @@ void delete_partition(struct gendisk *disk, int part) devfs_remove("%s/part%d", disk->devfs_name, part); if (p->holder_dir) kobject_unregister(p->holder_dir); - kobject_unregister(&p->kobj); + kobject_uevent(&p->kobj, KOBJ_REMOVE); + kobject_del(&p->kobj); + kobject_put(&p->kobj); } void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) @@ -357,7 +359,10 @@ void add_partition(struct gendisk *disk, int part, sector_t start, sector_t len) snprintf(p->kobj.name,KOBJ_NAME_LEN,"%s%d",disk->kobj.name,part); p->kobj.parent = &disk->kobj; p->kobj.ktype = &ktype_part; - kobject_register(&p->kobj); + kobject_init(&p->kobj); + kobject_add(&p->kobj); + if (!disk->part_uevent_suppress) + kobject_uevent(&p->kobj, KOBJ_ADD); partition_sysfs_add_subdir(p); disk->part[part-1] = p; } @@ -367,6 +372,7 @@ static char *make_block_name(struct gendisk *disk) char *name; static char *block_str = "block:"; int size; + char *s; size = strlen(block_str) + strlen(disk->disk_name) + 1; name = kmalloc(size, GFP_KERNEL); @@ -374,6 +380,10 @@ static char *make_block_name(struct gendisk *disk) return NULL; strcpy(name, block_str); strcat(name, disk->disk_name); + /* ewww... some of these buggers have / in name... */ + s = strchr(name, '/'); + if (s) + *s = '!'; return name; } @@ -395,6 +405,8 @@ void register_disk(struct gendisk *disk) { struct block_device *bdev; char *s; + int i; + struct hd_struct *p; int err; strlcpy(disk->kobj.name,disk->disk_name,KOBJ_NAME_LEN); @@ -406,13 +418,12 @@ void register_disk(struct gendisk *disk) return; disk_sysfs_symlinks(disk); disk_sysfs_add_subdirs(disk); - kobject_uevent(&disk->kobj, KOBJ_ADD); /* No minors to use for partitions */ if (disk->minors == 1) { if (disk->devfs_name[0] != '\0') devfs_add_disk(disk); - return; + goto exit; } /* always add handle for the whole disk */ @@ -420,16 +431,32 @@ void register_disk(struct gendisk *disk) /* No such device (e.g., media were just removed) */ if (!get_capacity(disk)) - return; + goto exit; bdev = bdget_disk(disk, 0); if (!bdev) - return; + goto exit; + /* scan partition table, but suppress uevents */ bdev->bd_invalidated = 1; - if (blkdev_get(bdev, FMODE_READ, 0) < 0) - return; + disk->part_uevent_suppress = 1; + err = blkdev_get(bdev, FMODE_READ, 0); + disk->part_uevent_suppress = 0; + if (err < 0) + goto exit; blkdev_put(bdev); + +exit: + /* announce disk after possible partitions are already created */ + kobject_uevent(&disk->kobj, KOBJ_ADD); + + /* announce possible partitions */ + for (i = 1; i < disk->minors; i++) { + p = disk->part[i-1]; + if (!p || !p->nr_sects) + continue; + kobject_uevent(&p->kobj, KOBJ_ADD); + } } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) diff --git a/fs/partitions/mac.c b/fs/partitions/mac.c index bb22cdd0cb1..813292f2121 100644 --- a/fs/partitions/mac.c +++ b/fs/partitions/mac.c @@ -12,6 +12,7 @@ #include "mac.h" #ifdef CONFIG_PPC_PMAC +#include <asm/machdep.h> extern void note_bootable_part(dev_t dev, int part, int goodness); #endif @@ -79,7 +80,7 @@ int mac_partition(struct parsed_partitions *state, struct block_device *bdev) * If this is the first bootable partition, tell the * setup code, in case it wants to make this the root. */ - if (_machine == _MACH_Pmac) { + if (machine_is(powermac)) { int goodness = 0; mac_fix_string(part->processor, 16); diff --git a/fs/pipe.c b/fs/pipe.c index e2f4f1d9ffc..7fefb10db8d 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -15,6 +15,7 @@ #include <linux/pipe_fs_i.h> #include <linux/uio.h> #include <linux/highmem.h> +#include <linux/pagemap.h> #include <asm/uaccess.h> #include <asm/ioctls.h> @@ -35,7 +36,7 @@ */ /* Drop the inode semaphore and wait for a pipe event, atomically */ -void pipe_wait(struct inode * inode) +void pipe_wait(struct pipe_inode_info *pipe) { DEFINE_WAIT(wait); @@ -43,11 +44,14 @@ void pipe_wait(struct inode * inode) * Pipes are system-local resources, so sleeping on them * is considered a noninteractive wait: */ - prepare_to_wait(PIPE_WAIT(*inode), &wait, TASK_INTERRUPTIBLE|TASK_NONINTERACTIVE); - mutex_unlock(PIPE_MUTEX(*inode)); + prepare_to_wait(&pipe->wait, &wait, + TASK_INTERRUPTIBLE | TASK_NONINTERACTIVE); + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); schedule(); - finish_wait(PIPE_WAIT(*inode), &wait); - mutex_lock(PIPE_MUTEX(*inode)); + finish_wait(&pipe->wait, &wait); + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); } static int @@ -90,32 +94,56 @@ pipe_iov_copy_to_user(struct iovec *iov, const void *from, unsigned long len) return 0; } -static void anon_pipe_buf_release(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_release(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { struct page *page = buf->page; - if (info->tmp_page) { - __free_page(page); - return; - } - info->tmp_page = page; + buf->flags &= ~PIPE_BUF_FLAG_STOLEN; + + /* + * If nobody else uses this page, and we don't already have a + * temporary page, let's keep track of it as a one-deep + * allocation cache. (Otherwise just release our reference to it) + */ + if (page_count(page) == 1 && !pipe->tmp_page) + pipe->tmp_page = page; + else + page_cache_release(page); } -static void *anon_pipe_buf_map(struct file *file, struct pipe_inode_info *info, struct pipe_buffer *buf) +static void * anon_pipe_buf_map(struct file *file, struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { return kmap(buf->page); } -static void anon_pipe_buf_unmap(struct pipe_inode_info *info, struct pipe_buffer *buf) +static void anon_pipe_buf_unmap(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) { kunmap(buf->page); } +static int anon_pipe_buf_steal(struct pipe_inode_info *pipe, + struct pipe_buffer *buf) +{ + buf->flags |= PIPE_BUF_FLAG_STOLEN; + return 0; +} + +static void anon_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + static struct pipe_buf_operations anon_pipe_buf_ops = { .can_merge = 1, .map = anon_pipe_buf_map, .unmap = anon_pipe_buf_unmap, .release = anon_pipe_buf_release, + .steal = anon_pipe_buf_steal, + .get = anon_pipe_buf_get, }; static ssize_t @@ -123,7 +151,7 @@ pipe_readv(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int do_wakeup; ssize_t ret; struct iovec *iov = (struct iovec *)_iov; @@ -136,13 +164,13 @@ pipe_readv(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; for (;;) { - int bufs = info->nrbufs; + int bufs = pipe->nrbufs; if (bufs) { - int curbuf = info->curbuf; - struct pipe_buffer *buf = info->bufs + curbuf; + int curbuf = pipe->curbuf; + struct pipe_buffer *buf = pipe->bufs + curbuf; struct pipe_buf_operations *ops = buf->ops; void *addr; size_t chars = buf->len; @@ -151,11 +179,17 @@ pipe_readv(struct file *filp, const struct iovec *_iov, if (chars > total_len) chars = total_len; - addr = ops->map(filp, info, buf); + addr = ops->map(filp, pipe, buf); + if (IS_ERR(addr)) { + if (!ret) + ret = PTR_ERR(addr); + break; + } error = pipe_iov_copy_to_user(iov, addr + buf->offset, chars); - ops->unmap(info, buf); + ops->unmap(pipe, buf); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -163,10 +197,10 @@ pipe_readv(struct file *filp, const struct iovec *_iov, buf->len -= chars; if (!buf->len) { buf->ops = NULL; - ops->release(info, buf); + ops->release(pipe, buf); curbuf = (curbuf + 1) & (PIPE_BUFFERS-1); - info->curbuf = curbuf; - info->nrbufs = --bufs; + pipe->curbuf = curbuf; + pipe->nrbufs = --bufs; do_wakeup = 1; } total_len -= chars; @@ -175,9 +209,9 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } if (bufs) /* More to do? */ continue; - if (!PIPE_WRITERS(*inode)) + if (!pipe->writers) break; - if (!PIPE_WAITING_WRITERS(*inode)) { + if (!pipe->waiting_writers) { /* syscall merging: Usually we must not sleep * if O_NONBLOCK is set, or if we got some data. * But if a writer sleeps in kernel space, then @@ -191,20 +225,22 @@ pipe_readv(struct file *filp, const struct iovec *_iov, } } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - pipe_wait(inode); + pipe_wait(pipe); } - mutex_unlock(PIPE_MUTEX(*inode)); - /* Signal writers asynchronously that there is more room. */ + mutex_unlock(&inode->i_mutex); + + /* Signal writers asynchronously that there is more room. */ if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } if (ret > 0) file_accessed(filp); @@ -215,6 +251,7 @@ static ssize_t pipe_read(struct file *filp, char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = buf, .iov_len = count }; + return pipe_readv(filp, &iov, 1, ppos); } @@ -223,7 +260,7 @@ pipe_writev(struct file *filp, const struct iovec *_iov, unsigned long nr_segs, loff_t *ppos) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; ssize_t ret; int do_wakeup; struct iovec *iov = (struct iovec *)_iov; @@ -237,10 +274,10 @@ pipe_writev(struct file *filp, const struct iovec *_iov, do_wakeup = 0; ret = 0; - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; - if (!PIPE_READERS(*inode)) { + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; @@ -248,15 +285,25 @@ pipe_writev(struct file *filp, const struct iovec *_iov, /* We try to merge small writes */ chars = total_len & (PAGE_SIZE-1); /* size of the last buffer */ - if (info->nrbufs && chars != 0) { - int lastbuf = (info->curbuf + info->nrbufs - 1) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + lastbuf; + if (pipe->nrbufs && chars != 0) { + int lastbuf = (pipe->curbuf + pipe->nrbufs - 1) & + (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + lastbuf; struct pipe_buf_operations *ops = buf->ops; int offset = buf->offset + buf->len; + if (ops->can_merge && offset + chars <= PAGE_SIZE) { - void *addr = ops->map(filp, info, buf); - int error = pipe_iov_copy_from_user(offset + addr, iov, chars); - ops->unmap(info, buf); + void *addr; + int error; + + addr = ops->map(filp, pipe, buf); + if (IS_ERR(addr)) { + error = PTR_ERR(addr); + goto out; + } + error = pipe_iov_copy_from_user(offset + addr, iov, + chars); + ops->unmap(pipe, buf); ret = error; do_wakeup = 1; if (error) @@ -271,16 +318,18 @@ pipe_writev(struct file *filp, const struct iovec *_iov, for (;;) { int bufs; - if (!PIPE_READERS(*inode)) { + + if (!pipe->readers) { send_sig(SIGPIPE, current, 0); - if (!ret) ret = -EPIPE; + if (!ret) + ret = -EPIPE; break; } - bufs = info->nrbufs; + bufs = pipe->nrbufs; if (bufs < PIPE_BUFFERS) { - int newbuf = (info->curbuf + bufs) & (PIPE_BUFFERS-1); - struct pipe_buffer *buf = info->bufs + newbuf; - struct page *page = info->tmp_page; + int newbuf = (pipe->curbuf + bufs) & (PIPE_BUFFERS-1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pipe->tmp_page; int error; if (!page) { @@ -289,9 +338,9 @@ pipe_writev(struct file *filp, const struct iovec *_iov, ret = ret ? : -ENOMEM; break; } - info->tmp_page = page; + pipe->tmp_page = page; } - /* Always wakeup, even if the copy fails. Otherwise + /* Always wake up, even if the copy fails. Otherwise * we lock up (O_NONBLOCK-)readers that sleep due to * syscall merging. * FIXME! Is this really true? @@ -304,7 +353,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, error = pipe_iov_copy_from_user(kmap(page), iov, chars); kunmap(page); if (unlikely(error)) { - if (!ret) ret = -EFAULT; + if (!ret) + ret = -EFAULT; break; } ret += chars; @@ -314,8 +364,8 @@ pipe_writev(struct file *filp, const struct iovec *_iov, buf->ops = &anon_pipe_buf_ops; buf->offset = 0; buf->len = chars; - info->nrbufs = ++bufs; - info->tmp_page = NULL; + pipe->nrbufs = ++bufs; + pipe->tmp_page = NULL; total_len -= chars; if (!total_len) @@ -324,27 +374,29 @@ pipe_writev(struct file *filp, const struct iovec *_iov, if (bufs < PIPE_BUFFERS) continue; if (filp->f_flags & O_NONBLOCK) { - if (!ret) ret = -EAGAIN; + if (!ret) + ret = -EAGAIN; break; } if (signal_pending(current)) { - if (!ret) ret = -ERESTARTSYS; + if (!ret) + ret = -ERESTARTSYS; break; } if (do_wakeup) { - wake_up_interruptible_sync(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); do_wakeup = 0; } - PIPE_WAITING_WRITERS(*inode)++; - pipe_wait(inode); - PIPE_WAITING_WRITERS(*inode)--; + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; } out: - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (do_wakeup) { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); } if (ret > 0) file_update_time(filp); @@ -356,6 +408,7 @@ pipe_write(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) { struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = count }; + return pipe_writev(filp, &iov, 1, ppos); } @@ -366,7 +419,8 @@ bad_pipe_r(struct file *filp, char __user *buf, size_t count, loff_t *ppos) } static ssize_t -bad_pipe_w(struct file *filp, const char __user *buf, size_t count, loff_t *ppos) +bad_pipe_w(struct file *filp, const char __user *buf, size_t count, + loff_t *ppos) { return -EBADF; } @@ -376,21 +430,22 @@ pipe_ioctl(struct inode *pino, struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info; + struct pipe_inode_info *pipe; int count, buf, nrbufs; switch (cmd) { case FIONREAD: - mutex_lock(PIPE_MUTEX(*inode)); - info = inode->i_pipe; + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; count = 0; - buf = info->curbuf; - nrbufs = info->nrbufs; + buf = pipe->curbuf; + nrbufs = pipe->nrbufs; while (--nrbufs >= 0) { - count += info->bufs[buf].len; + count += pipe->bufs[buf].len; buf = (buf+1) & (PIPE_BUFFERS-1); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); + return put_user(count, (int __user *)arg); default: return -EINVAL; @@ -403,17 +458,17 @@ pipe_poll(struct file *filp, poll_table *wait) { unsigned int mask; struct inode *inode = filp->f_dentry->d_inode; - struct pipe_inode_info *info = inode->i_pipe; + struct pipe_inode_info *pipe = inode->i_pipe; int nrbufs; - poll_wait(filp, PIPE_WAIT(*inode), wait); + poll_wait(filp, &pipe->wait, wait); /* Reading only -- no need for acquiring the semaphore. */ - nrbufs = info->nrbufs; + nrbufs = pipe->nrbufs; mask = 0; if (filp->f_mode & FMODE_READ) { mask = (nrbufs > 0) ? POLLIN | POLLRDNORM : 0; - if (!PIPE_WRITERS(*inode) && filp->f_version != PIPE_WCOUNTER(*inode)) + if (!pipe->writers && filp->f_version != pipe->w_counter) mask |= POLLHUP; } @@ -423,7 +478,7 @@ pipe_poll(struct file *filp, poll_table *wait) * Most Unices do not set POLLERR for FIFOs but on Linux they * behave exactly like pipes for poll(). */ - if (!PIPE_READERS(*inode)) + if (!pipe->readers) mask |= POLLERR; } @@ -433,17 +488,21 @@ pipe_poll(struct file *filp, poll_table *wait) static int pipe_release(struct inode *inode, int decr, int decw) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode) -= decr; - PIPE_WRITERS(*inode) -= decw; - if (!PIPE_READERS(*inode) && !PIPE_WRITERS(*inode)) { + struct pipe_inode_info *pipe; + + mutex_lock(&inode->i_mutex); + pipe = inode->i_pipe; + pipe->readers -= decr; + pipe->writers -= decw; + + if (!pipe->readers && !pipe->writers) { free_pipe_info(inode); } else { - wake_up_interruptible(PIPE_WAIT(*inode)); - kill_fasync(PIPE_FASYNC_READERS(*inode), SIGIO, POLL_IN); - kill_fasync(PIPE_FASYNC_WRITERS(*inode), SIGIO, POLL_OUT); + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); } - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); return 0; } @@ -454,9 +513,9 @@ pipe_read_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_readers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -471,9 +530,9 @@ pipe_write_fasync(int fd, struct file *filp, int on) struct inode *inode = filp->f_dentry->d_inode; int retval; - mutex_lock(PIPE_MUTEX(*inode)); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + retval = fasync_helper(fd, filp, on, &inode->i_pipe->fasync_writers); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -486,16 +545,17 @@ static int pipe_rdwr_fasync(int fd, struct file *filp, int on) { struct inode *inode = filp->f_dentry->d_inode; + struct pipe_inode_info *pipe = inode->i_pipe; int retval; - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_READERS(*inode)); + retval = fasync_helper(fd, filp, on, &pipe->fasync_readers); if (retval >= 0) - retval = fasync_helper(fd, filp, on, PIPE_FASYNC_WRITERS(*inode)); + retval = fasync_helper(fd, filp, on, &pipe->fasync_writers); - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_unlock(&inode->i_mutex); if (retval < 0) return retval; @@ -534,9 +594,9 @@ pipe_read_open(struct inode *inode, struct file *filp) { /* We could have perhaps used atomic_t, but this and friends below are the only places. So it doesn't seem worthwhile. */ - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_READERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->readers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -544,9 +604,9 @@ pipe_read_open(struct inode *inode, struct file *filp) static int pipe_write_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -554,12 +614,12 @@ pipe_write_open(struct inode *inode, struct file *filp) static int pipe_rdwr_open(struct inode *inode, struct file *filp) { - mutex_lock(PIPE_MUTEX(*inode)); + mutex_lock(&inode->i_mutex); if (filp->f_mode & FMODE_READ) - PIPE_READERS(*inode)++; + inode->i_pipe->readers++; if (filp->f_mode & FMODE_WRITE) - PIPE_WRITERS(*inode)++; - mutex_unlock(PIPE_MUTEX(*inode)); + inode->i_pipe->writers++; + mutex_unlock(&inode->i_mutex); return 0; } @@ -642,37 +702,38 @@ static struct file_operations rdwr_pipe_fops = { .fasync = pipe_rdwr_fasync, }; -void free_pipe_info(struct inode *inode) +struct pipe_inode_info * alloc_pipe_info(struct inode *inode) +{ + struct pipe_inode_info *pipe; + + pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); + if (pipe) { + init_waitqueue_head(&pipe->wait); + pipe->r_counter = pipe->w_counter = 1; + pipe->inode = inode; + } + + return pipe; +} + +void __free_pipe_info(struct pipe_inode_info *pipe) { int i; - struct pipe_inode_info *info = inode->i_pipe; - inode->i_pipe = NULL; for (i = 0; i < PIPE_BUFFERS; i++) { - struct pipe_buffer *buf = info->bufs + i; + struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) - buf->ops->release(info, buf); + buf->ops->release(pipe, buf); } - if (info->tmp_page) - __free_page(info->tmp_page); - kfree(info); + if (pipe->tmp_page) + __free_page(pipe->tmp_page); + kfree(pipe); } -struct inode* pipe_new(struct inode* inode) +void free_pipe_info(struct inode *inode) { - struct pipe_inode_info *info; - - info = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); - if (!info) - goto fail_page; - inode->i_pipe = info; - - init_waitqueue_head(PIPE_WAIT(*inode)); - PIPE_RCOUNTER(*inode) = PIPE_WCOUNTER(*inode) = 1; - - return inode; -fail_page: - return NULL; + __free_pipe_info(inode->i_pipe); + inode->i_pipe = NULL; } static struct vfsmount *pipe_mnt __read_mostly; @@ -680,6 +741,7 @@ static int pipefs_delete_dentry(struct dentry *dentry) { return 1; } + static struct dentry_operations pipefs_dentry_operations = { .d_delete = pipefs_delete_dentry, }; @@ -687,13 +749,17 @@ static struct dentry_operations pipefs_dentry_operations = { static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode(pipe_mnt->mnt_sb); + struct pipe_inode_info *pipe; if (!inode) goto fail_inode; - if(!pipe_new(inode)) + pipe = alloc_pipe_info(inode); + if (!pipe) goto fail_iput; - PIPE_READERS(*inode) = PIPE_WRITERS(*inode) = 1; + inode->i_pipe = pipe; + + pipe->readers = pipe->writers = 1; inode->i_fop = &rdwr_pipe_fops; /* @@ -708,10 +774,12 @@ static struct inode * get_pipe_inode(void) inode->i_gid = current->fsgid; inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; inode->i_blksize = PAGE_SIZE; + return inode; fail_iput: iput(inode); + fail_inode: return NULL; } @@ -724,7 +792,7 @@ int do_pipe(int *fd) struct inode * inode; struct file *f1, *f2; int error; - int i,j; + int i, j; error = -ENFILE; f1 = get_empty_filp(); @@ -757,6 +825,7 @@ int do_pipe(int *fd) dentry = d_alloc(pipe_mnt->mnt_sb->s_root, &this); if (!dentry) goto close_f12_inode_i_j; + dentry->d_op = &pipefs_dentry_operations; d_add(dentry, inode); f1->f_vfsmnt = f2->f_vfsmnt = mntget(mntget(pipe_mnt)); @@ -780,6 +849,7 @@ int do_pipe(int *fd) fd_install(j, f2); fd[0] = i; fd[1] = j; + return 0; close_f12_inode_i_j: @@ -804,8 +874,9 @@ no_files: * d_name - pipe: will go nicely and kill the special-casing in procfs. */ -static struct super_block *pipefs_get_sb(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static struct super_block * +pipefs_get_sb(struct file_system_type *fs_type, int flags, + const char *dev_name, void *data) { return get_sb_pseudo(fs_type, "pipe:", NULL, PIPEFS_MAGIC); } @@ -819,6 +890,7 @@ static struct file_system_type pipe_fs_type = { static int __init init_pipe_fs(void) { int err = register_filesystem(&pipe_fs_type); + if (!err) { pipe_mnt = kern_mount(&pipe_fs_type); if (IS_ERR(pipe_mnt)) { diff --git a/fs/proc/base.c b/fs/proc/base.c index 8f1f49ceebe..a3a3eecef68 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -534,12 +534,15 @@ static int proc_oom_score(struct task_struct *task, char *buffer) /* If the process being read is separated by chroot from the reading process, * don't let the reader access the threads. + * + * note: this does dput(root) and mntput(vfsmnt) on exit. */ static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt) { struct dentry *de, *base; struct vfsmount *our_vfsmnt, *mnt; int res = 0; + read_lock(¤t->fs->lock); our_vfsmnt = mntget(current->fs->rootmnt); base = dget(current->fs->root); @@ -549,11 +552,11 @@ static int proc_check_chroot(struct dentry *root, struct vfsmount *vfsmnt) de = root; mnt = vfsmnt; - while (vfsmnt != our_vfsmnt) { - if (vfsmnt == vfsmnt->mnt_parent) + while (mnt != our_vfsmnt) { + if (mnt == mnt->mnt_parent) goto out; - de = vfsmnt->mnt_mountpoint; - vfsmnt = vfsmnt->mnt_parent; + de = mnt->mnt_mountpoint; + mnt = mnt->mnt_parent; } if (!is_subdir(de, base)) @@ -564,7 +567,7 @@ exit: dput(base); mntput(our_vfsmnt); dput(root); - mntput(mnt); + mntput(vfsmnt); return res; out: spin_unlock(&vfsmount_lock); diff --git a/fs/proc/proc_devtree.c b/fs/proc/proc_devtree.c index 596b4b4f1cc..abdf068bc27 100644 --- a/fs/proc/proc_devtree.c +++ b/fs/proc/proc_devtree.c @@ -52,7 +52,8 @@ static int property_read_proc(char *page, char **start, off_t off, * Add a property to a node */ static struct proc_dir_entry * -__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp) +__proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp, + const char *name) { struct proc_dir_entry *ent; @@ -60,14 +61,14 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp) * Unfortunately proc_register puts each new entry * at the beginning of the list. So we rearrange them. */ - ent = create_proc_read_entry(pp->name, - strncmp(pp->name, "security-", 9) + ent = create_proc_read_entry(name, + strncmp(name, "security-", 9) ? S_IRUGO : S_IRUSR, de, property_read_proc, pp); if (ent == NULL) return NULL; - if (!strncmp(pp->name, "security-", 9)) + if (!strncmp(name, "security-", 9)) ent->size = 0; /* don't leak number of password chars */ else ent->size = pp->length; @@ -78,7 +79,7 @@ __proc_device_tree_add_prop(struct proc_dir_entry *de, struct property *pp) void proc_device_tree_add_prop(struct proc_dir_entry *pde, struct property *prop) { - __proc_device_tree_add_prop(pde, prop); + __proc_device_tree_add_prop(pde, prop, prop->name); } void proc_device_tree_remove_prop(struct proc_dir_entry *pde, @@ -106,6 +107,69 @@ void proc_device_tree_update_prop(struct proc_dir_entry *pde, } /* + * Various dodgy firmware might give us nodes and/or properties with + * conflicting names. That's generally ok, except for exporting via /proc, + * so munge names here to ensure they're unique. + */ + +static int duplicate_name(struct proc_dir_entry *de, const char *name) +{ + struct proc_dir_entry *ent; + int found = 0; + + spin_lock(&proc_subdir_lock); + + for (ent = de->subdir; ent != NULL; ent = ent->next) { + if (strcmp(ent->name, name) == 0) { + found = 1; + break; + } + } + + spin_unlock(&proc_subdir_lock); + + return found; +} + +static const char *fixup_name(struct device_node *np, struct proc_dir_entry *de, + const char *name) +{ + char *fixed_name; + int fixup_len = strlen(name) + 2 + 1; /* name + #x + \0 */ + int i = 1, size; + +realloc: + fixed_name = kmalloc(fixup_len, GFP_KERNEL); + if (fixed_name == NULL) { + printk(KERN_ERR "device-tree: Out of memory trying to fixup " + "name \"%s\"\n", name); + return name; + } + +retry: + size = snprintf(fixed_name, fixup_len, "%s#%d", name, i); + size++; /* account for NULL */ + + if (size > fixup_len) { + /* We ran out of space, free and reallocate. */ + kfree(fixed_name); + fixup_len = size; + goto realloc; + } + + if (duplicate_name(de, fixed_name)) { + /* Multiple duplicates. Retry with a different offset. */ + i++; + goto retry; + } + + printk(KERN_WARNING "device-tree: Duplicate name in %s, " + "renamed to \"%s\"\n", np->full_name, fixed_name); + + return fixed_name; +} + +/* * Process a node, adding entries for its children and its properties. */ void proc_device_tree_add_node(struct device_node *np, @@ -118,37 +182,30 @@ void proc_device_tree_add_node(struct device_node *np, set_node_proc_entry(np, de); for (child = NULL; (child = of_get_next_child(np, child));) { + /* Use everything after the last slash, or the full name */ p = strrchr(child->full_name, '/'); if (!p) p = child->full_name; else ++p; + + if (duplicate_name(de, p)) + p = fixup_name(np, de, p); + ent = proc_mkdir(p, de); if (ent == 0) break; proc_device_tree_add_node(child, ent); } of_node_put(child); + for (pp = np->properties; pp != 0; pp = pp->next) { - /* - * Yet another Apple device-tree bogosity: on some machines, - * they have properties & nodes with the same name. Those - * properties are quite unimportant for us though, thus we - * simply "skip" them here, but we do have to check. - */ - spin_lock(&proc_subdir_lock); - for (ent = de->subdir; ent != NULL; ent = ent->next) - if (!strcmp(ent->name, pp->name)) - break; - spin_unlock(&proc_subdir_lock); - if (ent != NULL) { - printk(KERN_WARNING "device-tree: property \"%s\" name" - " conflicts with node in %s\n", pp->name, - np->full_name); - continue; - } + p = pp->name; + + if (duplicate_name(de, p)) + p = fixup_name(np, de, p); - ent = __proc_device_tree_add_prop(de, pp); + ent = __proc_device_tree_add_prop(de, pp, p); if (ent == 0) break; } diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index ef5a3323f4b..5c10ea15742 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -249,144 +249,60 @@ static int cpuinfo_open(struct inode *inode, struct file *file) return seq_open(file, &cpuinfo_op); } -enum devinfo_states { - CHR_HDR, - CHR_LIST, - BLK_HDR, - BLK_LIST, - DEVINFO_DONE -}; - -struct devinfo_state { - void *chrdev; - void *blkdev; - unsigned int num_records; - unsigned int cur_record; - enum devinfo_states state; +static struct file_operations proc_cpuinfo_operations = { + .open = cpuinfo_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, }; -static void *devinfo_start(struct seq_file *f, loff_t *pos) +static int devinfo_show(struct seq_file *f, void *v) { - struct devinfo_state *info = f->private; + int i = *(loff_t *) v; - if (*pos) { - if ((info) && (*pos <= info->num_records)) - return info; - return NULL; + if (i < CHRDEV_MAJOR_HASH_SIZE) { + if (i == 0) + seq_printf(f, "Character devices:\n"); + chrdev_show(f, i); + } else { + i -= CHRDEV_MAJOR_HASH_SIZE; + if (i == 0) + seq_printf(f, "\nBlock devices:\n"); + blkdev_show(f, i); } - info = kmalloc(sizeof(*info), GFP_KERNEL); - f->private = info; - info->chrdev = acquire_chrdev_list(); - info->blkdev = acquire_blkdev_list(); - info->state = CHR_HDR; - info->num_records = count_chrdev_list(); - info->num_records += count_blkdev_list(); - info->num_records += 2; /* Character and Block headers */ - *pos = 1; - info->cur_record = *pos; - return info; + return 0; } -static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) +static void *devinfo_start(struct seq_file *f, loff_t *pos) { - int idummy; - char *ndummy; - struct devinfo_state *info = f->private; - - switch (info->state) { - case CHR_HDR: - info->state = CHR_LIST; - (*pos)++; - /*fallthrough*/ - case CHR_LIST: - if (get_chrdev_info(info->chrdev,&idummy,&ndummy)) { - /* - * The character dev list is complete - */ - info->state = BLK_HDR; - } else { - info->chrdev = get_next_chrdev(info->chrdev); - } - (*pos)++; - break; - case BLK_HDR: - info->state = BLK_LIST; - (*pos)++; - /*fallthrough*/ - case BLK_LIST: - if (get_blkdev_info(info->blkdev,&idummy,&ndummy)) { - /* - * The block dev list is complete - */ - info->state = DEVINFO_DONE; - } else { - info->blkdev = get_next_blkdev(info->blkdev); - } - (*pos)++; - break; - case DEVINFO_DONE: - (*pos)++; - info->cur_record = *pos; - info = NULL; - break; - default: - break; - } - if (info) - info->cur_record = *pos; - return info; + if (*pos < (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return pos; + return NULL; } -static void devinfo_stop(struct seq_file *f, void *v) +static void *devinfo_next(struct seq_file *f, void *v, loff_t *pos) { - struct devinfo_state *info = f->private; - - if (info) { - release_chrdev_list(info->chrdev); - release_blkdev_list(info->blkdev); - f->private = NULL; - kfree(info); - } + (*pos)++; + if (*pos >= (BLKDEV_MAJOR_HASH_SIZE + CHRDEV_MAJOR_HASH_SIZE)) + return NULL; + return pos; } -static int devinfo_show(struct seq_file *f, void *arg) -{ - int major; - char *name; - struct devinfo_state *info = f->private; - - switch(info->state) { - case CHR_HDR: - seq_printf(f,"Character devices:\n"); - /* fallthrough */ - case CHR_LIST: - if (!get_chrdev_info(info->chrdev,&major,&name)) - seq_printf(f,"%3d %s\n",major,name); - break; - case BLK_HDR: - seq_printf(f,"\nBlock devices:\n"); - /* fallthrough */ - case BLK_LIST: - if (!get_blkdev_info(info->blkdev,&major,&name)) - seq_printf(f,"%3d %s\n",major,name); - break; - default: - break; - } - - return 0; +static void devinfo_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ } -static struct seq_operations devinfo_op = { - .start = devinfo_start, - .next = devinfo_next, - .stop = devinfo_stop, - .show = devinfo_show, +static struct seq_operations devinfo_ops = { + .start = devinfo_start, + .next = devinfo_next, + .stop = devinfo_stop, + .show = devinfo_show }; -static int devinfo_open(struct inode *inode, struct file *file) +static int devinfo_open(struct inode *inode, struct file *filp) { - return seq_open(file, &devinfo_op); + return seq_open(filp, &devinfo_ops); } static struct file_operations proc_devinfo_operations = { @@ -396,13 +312,6 @@ static struct file_operations proc_devinfo_operations = { .release = seq_release, }; -static struct file_operations proc_cpuinfo_operations = { - .open = cpuinfo_open, - .read = seq_read, - .llseek = seq_lseek, - .release = seq_release, -}; - extern struct seq_operations vmstat_op; static int vmstat_open(struct inode *inode, struct file *file) { diff --git a/fs/proc/vmcore.c b/fs/proc/vmcore.c index 7efa73d44c9..20d4b2237fc 100644 --- a/fs/proc/vmcore.c +++ b/fs/proc/vmcore.c @@ -103,8 +103,8 @@ static ssize_t read_vmcore(struct file *file, char __user *buffer, size_t buflen, loff_t *fpos) { ssize_t acc = 0, tmp; - size_t tsz, nr_bytes; - u64 start; + size_t tsz; + u64 start, nr_bytes; struct vmcore *curr_m = NULL; if (buflen == 0 || *fpos >= vmcore_size) diff --git a/fs/read_write.c b/fs/read_write.c index 6256ca81a71..5bc0e9234f9 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -202,7 +202,7 @@ int rw_verify_area(int read_write, struct file *file, loff_t *ppos, size_t count goto Einval; inode = file->f_dentry->d_inode; - if (inode->i_flock && MANDATORY_LOCK(inode)) { + if (unlikely(inode->i_flock && MANDATORY_LOCK(inode))) { int retval = locks_mandatory_area( read_write == READ ? FLOCK_VERIFY_READ : FLOCK_VERIFY_WRITE, inode, file, pos, count); diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 010094d14da..cf6e1cf4035 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -1576,6 +1576,8 @@ const struct file_operations reiserfs_file_operations = { .sendfile = generic_file_sendfile, .aio_read = generic_file_aio_read, .aio_write = reiserfs_aio_write, + .splice_read = generic_file_splice_read, + .splice_write = generic_file_splice_write, }; struct inode_operations reiserfs_file_inode_operations = { diff --git a/fs/select.c b/fs/select.c index b3a3a1326af..a8109baa5e4 100644 --- a/fs/select.c +++ b/fs/select.c @@ -310,11 +310,12 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, fd_set __user *exp, s64 *timeout) { fd_set_bits fds; - char *bits; - int ret, size, max_fdset; + void *bits; + int ret, max_fdset; + unsigned int size; struct fdtable *fdt; /* Allocate small arguments on the stack to save memory and be faster */ - char stack_fds[SELECT_STACK_ALLOC]; + long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; if (n < 0) @@ -333,20 +334,21 @@ static int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, * since we used fdset we need to allocate memory in units of * long-words. */ - ret = -ENOMEM; size = FDS_BYTES(n); - if (6*size < SELECT_STACK_ALLOC) - bits = stack_fds; - else + bits = stack_fds; + if (size > sizeof(stack_fds) / 6) { + /* Not enough space in on-stack array; must use kmalloc */ + ret = -ENOMEM; bits = kmalloc(6 * size, GFP_KERNEL); - if (!bits) - goto out_nofds; - fds.in = (unsigned long *) bits; - fds.out = (unsigned long *) (bits + size); - fds.ex = (unsigned long *) (bits + 2*size); - fds.res_in = (unsigned long *) (bits + 3*size); - fds.res_out = (unsigned long *) (bits + 4*size); - fds.res_ex = (unsigned long *) (bits + 5*size); + if (!bits) + goto out_nofds; + } + fds.in = bits; + fds.out = bits + size; + fds.ex = bits + 2*size; + fds.res_in = bits + 3*size; + fds.res_out = bits + 4*size; + fds.res_ex = bits + 5*size; if ((ret = get_fd_set(n, inp, fds.in)) || (ret = get_fd_set(n, outp, fds.out)) || @@ -639,8 +641,10 @@ int do_sys_poll(struct pollfd __user *ufds, unsigned int nfds, s64 *timeout) struct poll_list *walk; struct fdtable *fdt; int max_fdset; - /* Allocate small arguments on the stack to save memory and be faster */ - char stack_pps[POLL_STACK_ALLOC]; + /* Allocate small arguments on the stack to save memory and be + faster - use long to make sure the buffer is aligned properly + on 64 bit archs to avoid unaligned access */ + long stack_pps[POLL_STACK_ALLOC/sizeof(long)]; struct poll_list *stack_pp = NULL; /* Do a sanity check on nfds ... */ diff --git a/fs/smbfs/file.c b/fs/smbfs/file.c index c56bd99a970..ed9a24d19d7 100644 --- a/fs/smbfs/file.c +++ b/fs/smbfs/file.c @@ -178,11 +178,9 @@ smb_writepage(struct page *page, struct writeback_control *wbc) unsigned offset = PAGE_CACHE_SIZE; int err; - if (!mapping) - BUG(); + BUG_ON(!mapping); inode = mapping->host; - if (!inode) - BUG(); + BUG_ON(!inode); end_index = inode->i_size >> PAGE_CACHE_SHIFT; diff --git a/fs/splice.c b/fs/splice.c new file mode 100644 index 00000000000..8d57e89924a --- /dev/null +++ b/fs/splice.c @@ -0,0 +1,1151 @@ +/* + * "splice": joining two ropes together by interweaving their strands. + * + * This is the "extended pipe" functionality, where a pipe is used as + * an arbitrary in-memory buffer. Think of a pipe as a small kernel + * buffer that you can use to transfer data from one end to the other. + * + * The traditional unix read/write is extended with a "splice()" operation + * that transfers data buffers to or from a pipe buffer. + * + * Named by Larry McVoy, original implementation from Linus, extended by + * Jens to support splicing to files, network, direct splicing, etc and + * fixing lots of bugs. + * + * Copyright (C) 2005-2006 Jens Axboe <axboe@suse.de> + * Copyright (C) 2005-2006 Linus Torvalds <torvalds@osdl.org> + * Copyright (C) 2006 Ingo Molnar <mingo@elte.hu> + * + */ +#include <linux/fs.h> +#include <linux/file.h> +#include <linux/pagemap.h> +#include <linux/pipe_fs_i.h> +#include <linux/mm_inline.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/buffer_head.h> +#include <linux/module.h> +#include <linux/syscalls.h> + +/* + * Passed to the actors + */ +struct splice_desc { + unsigned int len, total_len; /* current and remaining length */ + unsigned int flags; /* splice flags */ + struct file *file; /* file to read/write */ + loff_t pos; /* file position */ +}; + +/* + * Attempt to steal a page from a pipe buffer. This should perhaps go into + * a vm helper function, it's already simplified quite a bit by the + * addition of remove_mapping(). If success is returned, the caller may + * attempt to reuse this page for another destination. + */ +static int page_cache_pipe_buf_steal(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + struct address_space *mapping = page_mapping(page); + + WARN_ON(!PageLocked(page)); + WARN_ON(!PageUptodate(page)); + + /* + * At least for ext2 with nobh option, we need to wait on writeback + * completing on this page, since we'll remove it from the pagecache. + * Otherwise truncate wont wait on the page, allowing the disk + * blocks to be reused by someone else before we actually wrote our + * data to them. fs corruption ensues. + */ + wait_on_page_writeback(page); + + if (PagePrivate(page)) + try_to_release_page(page, mapping_gfp_mask(mapping)); + + if (!remove_mapping(mapping, page)) + return 1; + + buf->flags |= PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU; + return 0; +} + +static void page_cache_pipe_buf_release(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_release(buf->page); + buf->page = NULL; + buf->flags &= ~(PIPE_BUF_FLAG_STOLEN | PIPE_BUF_FLAG_LRU); +} + +static void *page_cache_pipe_buf_map(struct file *file, + struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + struct page *page = buf->page; + int err; + + if (!PageUptodate(page)) { + lock_page(page); + + /* + * Page got truncated/unhashed. This will cause a 0-byte + * splice, if this is the first page. + */ + if (!page->mapping) { + err = -ENODATA; + goto error; + } + + /* + * Uh oh, read-error from disk. + */ + if (!PageUptodate(page)) { + err = -EIO; + goto error; + } + + /* + * Page is ok afterall, fall through to mapping. + */ + unlock_page(page); + } + + return kmap(page); +error: + unlock_page(page); + return ERR_PTR(err); +} + +static void page_cache_pipe_buf_unmap(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + kunmap(buf->page); +} + +static void page_cache_pipe_buf_get(struct pipe_inode_info *info, + struct pipe_buffer *buf) +{ + page_cache_get(buf->page); +} + +static struct pipe_buf_operations page_cache_pipe_buf_ops = { + .can_merge = 0, + .map = page_cache_pipe_buf_map, + .unmap = page_cache_pipe_buf_unmap, + .release = page_cache_pipe_buf_release, + .steal = page_cache_pipe_buf_steal, + .get = page_cache_pipe_buf_get, +}; + +/* + * Pipe output worker. This sets up our pipe format with the page cache + * pipe buffer operations. Otherwise very similar to the regular pipe_writev(). + */ +static ssize_t move_to_pipe(struct pipe_inode_info *pipe, struct page **pages, + int nr_pages, unsigned long offset, + unsigned long len, unsigned int flags) +{ + int ret, do_wakeup, i; + + ret = 0; + do_wakeup = 0; + i = 0; + + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); + + for (;;) { + if (!pipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + + if (pipe->nrbufs < PIPE_BUFFERS) { + int newbuf = (pipe->curbuf + pipe->nrbufs) & (PIPE_BUFFERS - 1); + struct pipe_buffer *buf = pipe->bufs + newbuf; + struct page *page = pages[i++]; + unsigned long this_len; + + this_len = PAGE_CACHE_SIZE - offset; + if (this_len > len) + this_len = len; + + buf->page = page; + buf->offset = offset; + buf->len = this_len; + buf->ops = &page_cache_pipe_buf_ops; + pipe->nrbufs++; + if (pipe->inode) + do_wakeup = 1; + + ret += this_len; + len -= this_len; + offset = 0; + if (!--nr_pages) + break; + if (!len) + break; + if (pipe->nrbufs < PIPE_BUFFERS) + continue; + + break; + } + + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + pipe->waiting_writers++; + pipe_wait(pipe); + pipe->waiting_writers--; + } + + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); + } + + while (i < nr_pages) + page_cache_release(pages[i++]); + + return ret; +} + +static int +__generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + struct address_space *mapping = in->f_mapping; + unsigned int offset, nr_pages; + struct page *pages[PIPE_BUFFERS]; + struct page *page; + pgoff_t index; + int i, error; + + index = *ppos >> PAGE_CACHE_SHIFT; + offset = *ppos & ~PAGE_CACHE_MASK; + nr_pages = (len + offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + + if (nr_pages > PIPE_BUFFERS) + nr_pages = PIPE_BUFFERS; + + /* + * Initiate read-ahead on this page range. however, don't call into + * read-ahead if this is a non-zero offset (we are likely doing small + * chunk splice and the page is already there) for a single page. + */ + if (!offset || nr_pages > 1) + do_page_cache_readahead(mapping, in, index, nr_pages); + + /* + * Now fill in the holes: + */ + error = 0; + for (i = 0; i < nr_pages; i++, index++) { +find_page: + /* + * lookup the page for this index + */ + page = find_get_page(mapping, index); + if (!page) { + /* + * If in nonblock mode then dont block on + * readpage (we've kicked readahead so there + * will be asynchronous progress): + */ + if (flags & SPLICE_F_NONBLOCK) + break; + + /* + * page didn't exist, allocate one + */ + page = page_cache_alloc_cold(mapping); + if (!page) + break; + + error = add_to_page_cache_lru(page, mapping, index, + mapping_gfp_mask(mapping)); + if (unlikely(error)) { + page_cache_release(page); + break; + } + + goto readpage; + } + + /* + * If the page isn't uptodate, we may need to start io on it + */ + if (!PageUptodate(page)) { + lock_page(page); + + /* + * page was truncated, stop here. if this isn't the + * first page, we'll just complete what we already + * added + */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + break; + } + /* + * page was already under io and is now done, great + */ + if (PageUptodate(page)) { + unlock_page(page); + goto fill_it; + } + +readpage: + /* + * need to read in the page + */ + error = mapping->a_ops->readpage(in, page); + + if (unlikely(error)) { + page_cache_release(page); + if (error == AOP_TRUNCATED_PAGE) + goto find_page; + break; + } + } +fill_it: + pages[i] = page; + } + + if (i) + return move_to_pipe(pipe, pages, i, offset, len, flags); + + return error; +} + +/** + * generic_file_splice_read - splice data from file to a pipe + * @in: file to splice from + * @pipe: pipe to splice to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will read pages from given file and fill them into a pipe. + */ +ssize_t generic_file_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + ssize_t spliced; + int ret; + + ret = 0; + spliced = 0; + + while (len) { + ret = __generic_file_splice_read(in, ppos, pipe, len, flags); + + if (ret <= 0) + break; + + *ppos += ret; + len -= ret; + spliced += ret; + + if (!(flags & SPLICE_F_NONBLOCK)) + continue; + ret = -EAGAIN; + break; + } + + if (spliced) + return spliced; + + return ret; +} + +EXPORT_SYMBOL(generic_file_splice_read); + +/* + * Send 'sd->len' bytes to socket from 'sd->file' at position 'sd->pos' + * using sendpage(). + */ +static int pipe_to_sendpage(struct pipe_inode_info *info, + struct pipe_buffer *buf, struct splice_desc *sd) +{ + struct file *file = sd->file; + loff_t pos = sd->pos; + unsigned int offset; + ssize_t ret; + void *ptr; + int more; + + /* + * Sub-optimal, but we are limited by the pipe ->map. We don't + * need a kmap'ed buffer here, we just want to make sure we + * have the page pinned if the pipe page originates from the + * page cache. + */ + ptr = buf->ops->map(file, info, buf); + if (IS_ERR(ptr)) + return PTR_ERR(ptr); + + offset = pos & ~PAGE_CACHE_MASK; + more = (sd->flags & SPLICE_F_MORE) || sd->len < sd->total_len; + + ret = file->f_op->sendpage(file, buf->page, offset, sd->len, &pos,more); + + buf->ops->unmap(info, buf); + if (ret == sd->len) + return 0; + + return -EIO; +} + +/* + * This is a little more tricky than the file -> pipe splicing. There are + * basically three cases: + * + * - Destination page already exists in the address space and there + * are users of it. For that case we have no other option that + * copying the data. Tough luck. + * - Destination page already exists in the address space, but there + * are no users of it. Make sure it's uptodate, then drop it. Fall + * through to last case. + * - Destination page does not exist, we can add the pipe page to + * the page cache and avoid the copy. + * + * If asked to move pages to the output file (SPLICE_F_MOVE is set in + * sd->flags), we attempt to migrate pages from the pipe to the output + * file address space page cache. This is possible if no one else has + * the pipe page referenced outside of the pipe and page cache. If + * SPLICE_F_MOVE isn't set, or we cannot move the page, we simply create + * a new page in the output file page cache and fill/dirty that. + */ +static int pipe_to_file(struct pipe_inode_info *info, struct pipe_buffer *buf, + struct splice_desc *sd) +{ + struct file *file = sd->file; + struct address_space *mapping = file->f_mapping; + gfp_t gfp_mask = mapping_gfp_mask(mapping); + unsigned int offset; + struct page *page; + pgoff_t index; + char *src; + int ret; + + /* + * make sure the data in this buffer is uptodate + */ + src = buf->ops->map(file, info, buf); + if (IS_ERR(src)) + return PTR_ERR(src); + + index = sd->pos >> PAGE_CACHE_SHIFT; + offset = sd->pos & ~PAGE_CACHE_MASK; + + /* + * Reuse buf page, if SPLICE_F_MOVE is set. + */ + if (sd->flags & SPLICE_F_MOVE) { + /* + * If steal succeeds, buf->page is now pruned from the vm + * side (LRU and page cache) and we can reuse it. + */ + if (buf->ops->steal(info, buf)) + goto find_page; + + /* + * this will also set the page locked + */ + page = buf->page; + if (add_to_page_cache(page, mapping, index, gfp_mask)) + goto find_page; + + if (!(buf->flags & PIPE_BUF_FLAG_LRU)) + lru_cache_add(page); + } else { +find_page: + ret = -ENOMEM; + page = find_or_create_page(mapping, index, gfp_mask); + if (!page) + goto out_nomem; + + /* + * If the page is uptodate, it is also locked. If it isn't + * uptodate, we can mark it uptodate if we are filling the + * full page. Otherwise we need to read it in first... + */ + if (!PageUptodate(page)) { + if (sd->len < PAGE_CACHE_SIZE) { + ret = mapping->a_ops->readpage(file, page); + if (unlikely(ret)) + goto out; + + lock_page(page); + + if (!PageUptodate(page)) { + /* + * Page got invalidated, repeat. + */ + if (!page->mapping) { + unlock_page(page); + page_cache_release(page); + goto find_page; + } + ret = -EIO; + goto out; + } + } else { + WARN_ON(!PageLocked(page)); + SetPageUptodate(page); + } + } + } + + ret = mapping->a_ops->prepare_write(file, page, 0, sd->len); + if (ret == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } else if (ret) + goto out; + + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + char *dst = kmap_atomic(page, KM_USER0); + + memcpy(dst + offset, src + buf->offset, sd->len); + flush_dcache_page(page); + kunmap_atomic(dst, KM_USER0); + } + + ret = mapping->a_ops->commit_write(file, page, 0, sd->len); + if (ret == AOP_TRUNCATED_PAGE) { + page_cache_release(page); + goto find_page; + } else if (ret) + goto out; + + mark_page_accessed(page); + balance_dirty_pages_ratelimited(mapping); +out: + if (!(buf->flags & PIPE_BUF_FLAG_STOLEN)) { + page_cache_release(page); + unlock_page(page); + } +out_nomem: + buf->ops->unmap(info, buf); + return ret; +} + +typedef int (splice_actor)(struct pipe_inode_info *, struct pipe_buffer *, + struct splice_desc *); + +/* + * Pipe input worker. Most of this logic works like a regular pipe, the + * key here is the 'actor' worker passed in that actually moves the data + * to the wanted destination. See pipe_to_file/pipe_to_sendpage above. + */ +static ssize_t move_from_pipe(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags, + splice_actor *actor) +{ + int ret, do_wakeup, err; + struct splice_desc sd; + + ret = 0; + do_wakeup = 0; + + sd.total_len = len; + sd.flags = flags; + sd.file = out; + sd.pos = *ppos; + + if (pipe->inode) + mutex_lock(&pipe->inode->i_mutex); + + for (;;) { + if (pipe->nrbufs) { + struct pipe_buffer *buf = pipe->bufs + pipe->curbuf; + struct pipe_buf_operations *ops = buf->ops; + + sd.len = buf->len; + if (sd.len > sd.total_len) + sd.len = sd.total_len; + + err = actor(pipe, buf, &sd); + if (err) { + if (!ret && err != -ENODATA) + ret = err; + + break; + } + + ret += sd.len; + buf->offset += sd.len; + buf->len -= sd.len; + + if (!buf->len) { + buf->ops = NULL; + ops->release(pipe, buf); + pipe->curbuf = (pipe->curbuf + 1) & (PIPE_BUFFERS - 1); + pipe->nrbufs--; + if (pipe->inode) + do_wakeup = 1; + } + + sd.pos += sd.len; + sd.total_len -= sd.len; + if (!sd.total_len) + break; + } + + if (pipe->nrbufs) + continue; + if (!pipe->writers) + break; + if (!pipe->waiting_writers) { + if (ret) + break; + } + + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible_sync(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + do_wakeup = 0; + } + + pipe_wait(pipe); + } + + if (pipe->inode) + mutex_unlock(&pipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&pipe->wait)) + wake_up_interruptible(&pipe->wait); + kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + } + + return ret; +} + +/** + * generic_file_splice_write - splice data from a pipe to a file + * @pipe: pipe info + * @out: file to write to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will either move or copy pages (determined by @flags options) from + * the given pipe inode to the given file. + * + */ +ssize_t +generic_file_splice_write(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + struct address_space *mapping = out->f_mapping; + ssize_t ret; + + ret = move_from_pipe(pipe, out, ppos, len, flags, pipe_to_file); + + /* + * If file or inode is SYNC and we actually wrote some data, sync it. + */ + if (unlikely((out->f_flags & O_SYNC) || IS_SYNC(mapping->host)) + && ret > 0) { + struct inode *inode = mapping->host; + int err; + + mutex_lock(&inode->i_mutex); + err = generic_osync_inode(mapping->host, mapping, + OSYNC_METADATA|OSYNC_DATA); + mutex_unlock(&inode->i_mutex); + + if (err) + ret = err; + } + + return ret; +} + +EXPORT_SYMBOL(generic_file_splice_write); + +/** + * generic_splice_sendpage - splice data from a pipe to a socket + * @inode: pipe inode + * @out: socket to write to + * @len: number of bytes to splice + * @flags: splice modifier flags + * + * Will send @len bytes from the pipe to a network socket. No data copying + * is involved. + * + */ +ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + return move_from_pipe(pipe, out, ppos, len, flags, pipe_to_sendpage); +} + +EXPORT_SYMBOL(generic_splice_sendpage); + +/* + * Attempt to initiate a splice from pipe to file. + */ +static long do_splice_from(struct pipe_inode_info *pipe, struct file *out, + loff_t *ppos, size_t len, unsigned int flags) +{ + int ret; + + if (unlikely(!out->f_op || !out->f_op->splice_write)) + return -EINVAL; + + if (unlikely(!(out->f_mode & FMODE_WRITE))) + return -EBADF; + + ret = rw_verify_area(WRITE, out, ppos, len); + if (unlikely(ret < 0)) + return ret; + + return out->f_op->splice_write(pipe, out, ppos, len, flags); +} + +/* + * Attempt to initiate a splice from a file to a pipe. + */ +static long do_splice_to(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, + unsigned int flags) +{ + loff_t isize, left; + int ret; + + if (unlikely(!in->f_op || !in->f_op->splice_read)) + return -EINVAL; + + if (unlikely(!(in->f_mode & FMODE_READ))) + return -EBADF; + + ret = rw_verify_area(READ, in, ppos, len); + if (unlikely(ret < 0)) + return ret; + + isize = i_size_read(in->f_mapping->host); + if (unlikely(*ppos >= isize)) + return 0; + + left = isize - *ppos; + if (unlikely(left < len)) + len = left; + + return in->f_op->splice_read(in, ppos, pipe, len, flags); +} + +long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, + size_t len, unsigned int flags) +{ + struct pipe_inode_info *pipe; + long ret, bytes; + loff_t out_off; + umode_t i_mode; + int i; + + /* + * We require the input being a regular file, as we don't want to + * randomly drop data for eg socket -> socket splicing. Use the + * piped splicing for that! + */ + i_mode = in->f_dentry->d_inode->i_mode; + if (unlikely(!S_ISREG(i_mode) && !S_ISBLK(i_mode))) + return -EINVAL; + + /* + * neither in nor out is a pipe, setup an internal pipe attached to + * 'out' and transfer the wanted data from 'in' to 'out' through that + */ + pipe = current->splice_pipe; + if (unlikely(!pipe)) { + pipe = alloc_pipe_info(NULL); + if (!pipe) + return -ENOMEM; + + /* + * We don't have an immediate reader, but we'll read the stuff + * out of the pipe right after the move_to_pipe(). So set + * PIPE_READERS appropriately. + */ + pipe->readers = 1; + + current->splice_pipe = pipe; + } + + /* + * Do the splice. + */ + ret = 0; + bytes = 0; + out_off = 0; + + while (len) { + size_t read_len, max_read_len; + + /* + * Do at most PIPE_BUFFERS pages worth of transfer: + */ + max_read_len = min(len, (size_t)(PIPE_BUFFERS*PAGE_SIZE)); + + ret = do_splice_to(in, ppos, pipe, max_read_len, flags); + if (unlikely(ret < 0)) + goto out_release; + + read_len = ret; + + /* + * NOTE: nonblocking mode only applies to the input. We + * must not do the output in nonblocking mode as then we + * could get stuck data in the internal pipe: + */ + ret = do_splice_from(pipe, out, &out_off, read_len, + flags & ~SPLICE_F_NONBLOCK); + if (unlikely(ret < 0)) + goto out_release; + + bytes += ret; + len -= ret; + + /* + * In nonblocking mode, if we got back a short read then + * that was due to either an IO error or due to the + * pagecache entry not being there. In the IO error case + * the _next_ splice attempt will produce a clean IO error + * return value (not a short read), so in both cases it's + * correct to break out of the loop here: + */ + if ((flags & SPLICE_F_NONBLOCK) && (read_len < max_read_len)) + break; + } + + pipe->nrbufs = pipe->curbuf = 0; + + return bytes; + +out_release: + /* + * If we did an incomplete transfer we must release + * the pipe buffers in question: + */ + for (i = 0; i < PIPE_BUFFERS; i++) { + struct pipe_buffer *buf = pipe->bufs + i; + + if (buf->ops) { + buf->ops->release(pipe, buf); + buf->ops = NULL; + } + } + pipe->nrbufs = pipe->curbuf = 0; + + /* + * If we transferred some data, return the number of bytes: + */ + if (bytes > 0) + return bytes; + + return ret; +} + +EXPORT_SYMBOL(do_splice_direct); + +/* + * Determine where to splice to/from. + */ +static long do_splice(struct file *in, loff_t __user *off_in, + struct file *out, loff_t __user *off_out, + size_t len, unsigned int flags) +{ + struct pipe_inode_info *pipe; + loff_t offset, *off; + + pipe = in->f_dentry->d_inode->i_pipe; + if (pipe) { + if (off_in) + return -ESPIPE; + if (off_out) { + if (out->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&offset, off_out, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &out->f_pos; + + return do_splice_from(pipe, out, off, len, flags); + } + + pipe = out->f_dentry->d_inode->i_pipe; + if (pipe) { + if (off_out) + return -ESPIPE; + if (off_in) { + if (in->f_op->llseek == no_llseek) + return -EINVAL; + if (copy_from_user(&offset, off_in, sizeof(loff_t))) + return -EFAULT; + off = &offset; + } else + off = &in->f_pos; + + return do_splice_to(in, off, pipe, len, flags); + } + + return -EINVAL; +} + +asmlinkage long sys_splice(int fd_in, loff_t __user *off_in, + int fd_out, loff_t __user *off_out, + size_t len, unsigned int flags) +{ + long error; + struct file *in, *out; + int fput_in, fput_out; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fd_in, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + out = fget_light(fd_out, &fput_out); + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_splice(in, off_in, + out, off_out, + len, flags); + fput_light(out, fput_out); + } + } + + fput_light(in, fput_in); + } + + return error; +} + +/* + * Link contents of ipipe to opipe. + */ +static int link_pipe(struct pipe_inode_info *ipipe, + struct pipe_inode_info *opipe, + size_t len, unsigned int flags) +{ + struct pipe_buffer *ibuf, *obuf; + int ret = 0, do_wakeup = 0, i; + + /* + * Potential ABBA deadlock, work around it by ordering lock + * grabbing by inode address. Otherwise two different processes + * could deadlock (one doing tee from A -> B, the other from B -> A). + */ + if (ipipe->inode < opipe->inode) { + mutex_lock(&ipipe->inode->i_mutex); + mutex_lock(&opipe->inode->i_mutex); + } else { + mutex_lock(&opipe->inode->i_mutex); + mutex_lock(&ipipe->inode->i_mutex); + } + + for (i = 0;; i++) { + if (!opipe->readers) { + send_sig(SIGPIPE, current, 0); + if (!ret) + ret = -EPIPE; + break; + } + if (ipipe->nrbufs - i) { + ibuf = ipipe->bufs + ((ipipe->curbuf + i) & (PIPE_BUFFERS - 1)); + + /* + * If we have room, fill this buffer + */ + if (opipe->nrbufs < PIPE_BUFFERS) { + int nbuf = (opipe->curbuf + opipe->nrbufs) & (PIPE_BUFFERS - 1); + + /* + * Get a reference to this pipe buffer, + * so we can copy the contents over. + */ + ibuf->ops->get(ipipe, ibuf); + + obuf = opipe->bufs + nbuf; + *obuf = *ibuf; + + if (obuf->len > len) + obuf->len = len; + + opipe->nrbufs++; + do_wakeup = 1; + ret += obuf->len; + len -= obuf->len; + + if (!len) + break; + if (opipe->nrbufs < PIPE_BUFFERS) + continue; + } + + /* + * We have input available, but no output room. + * If we already copied data, return that. + */ + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + do_wakeup = 0; + } + + opipe->waiting_writers++; + pipe_wait(opipe); + opipe->waiting_writers--; + continue; + } + + /* + * No input buffers, do the usual checks for available + * writers and blocking and wait if necessary + */ + if (!ipipe->writers) + break; + if (!ipipe->waiting_writers) { + if (ret) + break; + } + if (flags & SPLICE_F_NONBLOCK) { + if (!ret) + ret = -EAGAIN; + break; + } + if (signal_pending(current)) { + if (!ret) + ret = -ERESTARTSYS; + break; + } + + if (waitqueue_active(&ipipe->wait)) + wake_up_interruptible_sync(&ipipe->wait); + kill_fasync(&ipipe->fasync_writers, SIGIO, POLL_OUT); + + pipe_wait(ipipe); + } + + mutex_unlock(&ipipe->inode->i_mutex); + mutex_unlock(&opipe->inode->i_mutex); + + if (do_wakeup) { + smp_mb(); + if (waitqueue_active(&opipe->wait)) + wake_up_interruptible(&opipe->wait); + kill_fasync(&opipe->fasync_readers, SIGIO, POLL_IN); + } + + return ret; +} + +/* + * This is a tee(1) implementation that works on pipes. It doesn't copy + * any data, it simply references the 'in' pages on the 'out' pipe. + * The 'flags' used are the SPLICE_F_* variants, currently the only + * applicable one is SPLICE_F_NONBLOCK. + */ +static long do_tee(struct file *in, struct file *out, size_t len, + unsigned int flags) +{ + struct pipe_inode_info *ipipe = in->f_dentry->d_inode->i_pipe; + struct pipe_inode_info *opipe = out->f_dentry->d_inode->i_pipe; + + /* + * Link ipipe to the two output pipes, consuming as we go along. + */ + if (ipipe && opipe) + return link_pipe(ipipe, opipe, len, flags); + + return -EINVAL; +} + +asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags) +{ + struct file *in; + int error, fput_in; + + if (unlikely(!len)) + return 0; + + error = -EBADF; + in = fget_light(fdin, &fput_in); + if (in) { + if (in->f_mode & FMODE_READ) { + int fput_out; + struct file *out = fget_light(fdout, &fput_out); + + if (out) { + if (out->f_mode & FMODE_WRITE) + error = do_tee(in, out, len, flags); + fput_light(out, fput_out); + } + } + fput_light(in, fput_in); + } + + return error; +} diff --git a/fs/sync.c b/fs/sync.c new file mode 100644 index 00000000000..aab5ffe77e9 --- /dev/null +++ b/fs/sync.c @@ -0,0 +1,164 @@ +/* + * High-level sync()-related operations + */ + +#include <linux/kernel.h> +#include <linux/file.h> +#include <linux/fs.h> +#include <linux/module.h> +#include <linux/writeback.h> +#include <linux/syscalls.h> +#include <linux/linkage.h> +#include <linux/pagemap.h> + +#define VALID_FLAGS (SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE| \ + SYNC_FILE_RANGE_WAIT_AFTER) + +/* + * sys_sync_file_range() permits finely controlled syncing over a segment of + * a file in the range offset .. (offset+nbytes-1) inclusive. If nbytes is + * zero then sys_sync_file_range() will operate from offset out to EOF. + * + * The flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE: wait upon writeout of all pages in the range + * before performing the write. + * + * SYNC_FILE_RANGE_WRITE: initiate writeout of all those dirty pages in the + * range which are not presently under writeback. + * + * SYNC_FILE_RANGE_WAIT_AFTER: wait upon writeout of all pages in the range + * after performing the write. + * + * Useful combinations of the flag bits are: + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE: ensures that all pages + * in the range which were dirty on entry to sys_sync_file_range() are placed + * under writeout. This is a start-write-for-data-integrity operation. + * + * SYNC_FILE_RANGE_WRITE: start writeout of all dirty pages in the range which + * are not presently under writeout. This is an asynchronous flush-to-disk + * operation. Not suitable for data integrity operations. + * + * SYNC_FILE_RANGE_WAIT_BEFORE (or SYNC_FILE_RANGE_WAIT_AFTER): wait for + * completion of writeout of all pages in the range. This will be used after an + * earlier SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE operation to wait + * for that operation to complete and to return the result. + * + * SYNC_FILE_RANGE_WAIT_BEFORE|SYNC_FILE_RANGE_WRITE|SYNC_FILE_RANGE_WAIT_AFTER: + * a traditional sync() operation. This is a write-for-data-integrity operation + * which will ensure that all pages in the range which were dirty on entry to + * sys_sync_file_range() are committed to disk. + * + * + * SYNC_FILE_RANGE_WAIT_BEFORE and SYNC_FILE_RANGE_WAIT_AFTER will detect any + * I/O errors or ENOSPC conditions and will return those to the caller, after + * clearing the EIO and ENOSPC flags in the address_space. + * + * It should be noted that none of these operations write out the file's + * metadata. So unless the application is strictly performing overwrites of + * already-instantiated disk blocks, there are no guarantees here that the data + * will be available after a crash. + */ +asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes, + unsigned int flags) +{ + int ret; + struct file *file; + loff_t endbyte; /* inclusive */ + int fput_needed; + umode_t i_mode; + + ret = -EINVAL; + if (flags & ~VALID_FLAGS) + goto out; + + endbyte = offset + nbytes; + + if ((s64)offset < 0) + goto out; + if ((s64)endbyte < 0) + goto out; + if (endbyte < offset) + goto out; + + if (sizeof(pgoff_t) == 4) { + if (offset >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + /* + * The range starts outside a 32 bit machine's + * pagecache addressing capabilities. Let it "succeed" + */ + ret = 0; + goto out; + } + if (endbyte >= (0x100000000ULL << PAGE_CACHE_SHIFT)) { + /* + * Out to EOF + */ + nbytes = 0; + } + } + + if (nbytes == 0) + endbyte = -1; + else + endbyte--; /* inclusive */ + + ret = -EBADF; + file = fget_light(fd, &fput_needed); + if (!file) + goto out; + + i_mode = file->f_dentry->d_inode->i_mode; + ret = -ESPIPE; + if (!S_ISREG(i_mode) && !S_ISBLK(i_mode) && !S_ISDIR(i_mode) && + !S_ISLNK(i_mode)) + goto out_put; + + ret = do_sync_file_range(file, offset, endbyte, flags); +out_put: + fput_light(file, fput_needed); +out: + return ret; +} + +/* + * `endbyte' is inclusive + */ +int do_sync_file_range(struct file *file, loff_t offset, loff_t endbyte, + unsigned int flags) +{ + int ret; + struct address_space *mapping; + + mapping = file->f_mapping; + if (!mapping) { + ret = -EINVAL; + goto out; + } + + ret = 0; + if (flags & SYNC_FILE_RANGE_WAIT_BEFORE) { + ret = wait_on_page_writeback_range(mapping, + offset >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + if (ret < 0) + goto out; + } + + if (flags & SYNC_FILE_RANGE_WRITE) { + ret = __filemap_fdatawrite_range(mapping, offset, endbyte, + WB_SYNC_NONE); + if (ret < 0) + goto out; + } + + if (flags & SYNC_FILE_RANGE_WAIT_AFTER) { + ret = wait_on_page_writeback_range(mapping, + offset >> PAGE_CACHE_SHIFT, + endbyte >> PAGE_CACHE_SHIFT); + } +out: + return ret; +} +EXPORT_SYMBOL_GPL(do_sync_file_range); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index f26880a4785..610b5bdbe75 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -43,6 +43,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, memset(sd, 0, sizeof(*sd)); atomic_set(&sd->s_count, 1); + atomic_set(&sd->s_event, 0); INIT_LIST_HEAD(&sd->s_children); list_add(&sd->s_sibling, &parent_sd->s_children); sd->s_element = element; @@ -50,7 +51,7 @@ static struct sysfs_dirent * sysfs_new_dirent(struct sysfs_dirent * parent_sd, return sd; } -/** +/* * * Return -EEXIST if there is already a sysfs element with the same name for * the same parent. diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 830f76fa098..cf3786625bf 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -6,6 +6,7 @@ #include <linux/fsnotify.h> #include <linux/kobject.h> #include <linux/namei.h> +#include <linux/poll.h> #include <asm/uaccess.h> #include <asm/semaphore.h> @@ -57,6 +58,7 @@ struct sysfs_buffer { struct sysfs_ops * ops; struct semaphore sem; int needs_read_fill; + int event; }; @@ -72,6 +74,7 @@ struct sysfs_buffer { */ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer) { + struct sysfs_dirent * sd = dentry->d_fsdata; struct attribute * attr = to_attr(dentry); struct kobject * kobj = to_kobj(dentry->d_parent); struct sysfs_ops * ops = buffer->ops; @@ -83,6 +86,7 @@ static int fill_read_buffer(struct dentry * dentry, struct sysfs_buffer * buffer if (!buffer->page) return -ENOMEM; + buffer->event = atomic_read(&sd->s_event); count = ops->show(kobj,attr,buffer->page); buffer->needs_read_fill = 0; BUG_ON(count > (ssize_t)PAGE_SIZE); @@ -183,7 +187,7 @@ fill_write_buffer(struct sysfs_buffer * buffer, const char __user * buf, size_t return -ENOMEM; if (count >= PAGE_SIZE) - count = PAGE_SIZE; + count = PAGE_SIZE - 1; error = copy_from_user(buffer->page,buf,count); buffer->needs_read_fill = 1; return error ? -EFAULT : count; @@ -348,12 +352,84 @@ static int sysfs_release(struct inode * inode, struct file * filp) return 0; } +/* Sysfs attribute files are pollable. The idea is that you read + * the content and then you use 'poll' or 'select' to wait for + * the content to change. When the content changes (assuming the + * manager for the kobject supports notification), poll will + * return POLLERR|POLLPRI, and select will return the fd whether + * it is waiting for read, write, or exceptions. + * Once poll/select indicates that the value has changed, you + * need to close and re-open the file, as simply seeking and reading + * again will not get new data, or reset the state of 'poll'. + * Reminder: this only works for attributes which actively support + * it, and it is not possible to test an attribute from userspace + * to see if it supports poll (Nether 'poll' or 'select' return + * an appropriate error code). When in doubt, set a suitable timeout value. + */ +static unsigned int sysfs_poll(struct file *filp, poll_table *wait) +{ + struct sysfs_buffer * buffer = filp->private_data; + struct kobject * kobj = to_kobj(filp->f_dentry->d_parent); + struct sysfs_dirent * sd = filp->f_dentry->d_fsdata; + int res = 0; + + poll_wait(filp, &kobj->poll, wait); + + if (buffer->event != atomic_read(&sd->s_event)) { + res = POLLERR|POLLPRI; + buffer->needs_read_fill = 1; + } + + return res; +} + + +static struct dentry *step_down(struct dentry *dir, const char * name) +{ + struct dentry * de; + + if (dir == NULL || dir->d_inode == NULL) + return NULL; + + mutex_lock(&dir->d_inode->i_mutex); + de = lookup_one_len(name, dir, strlen(name)); + mutex_unlock(&dir->d_inode->i_mutex); + dput(dir); + if (IS_ERR(de)) + return NULL; + if (de->d_inode == NULL) { + dput(de); + return NULL; + } + return de; +} + +void sysfs_notify(struct kobject * k, char *dir, char *attr) +{ + struct dentry *de = k->dentry; + if (de) + dget(de); + if (de && dir) + de = step_down(de, dir); + if (de && attr) + de = step_down(de, attr); + if (de) { + struct sysfs_dirent * sd = de->d_fsdata; + if (sd) + atomic_inc(&sd->s_event); + wake_up_interruptible(&k->poll); + dput(de); + } +} +EXPORT_SYMBOL_GPL(sysfs_notify); + const struct file_operations sysfs_file_operations = { .read = sysfs_read_file, .write = sysfs_write_file, .llseek = generic_file_llseek, .open = sysfs_open_file, .release = sysfs_release, + .poll = sysfs_poll, }; diff --git a/fs/sysfs/inode.c b/fs/sysfs/inode.c index 4c29ac41ac3..f0b347bd12c 100644 --- a/fs/sysfs/inode.c +++ b/fs/sysfs/inode.c @@ -175,8 +175,7 @@ const unsigned char * sysfs_get_name(struct sysfs_dirent *sd) struct bin_attribute * bin_attr; struct sysfs_symlink * sl; - if (!sd || !sd->s_element) - BUG(); + BUG_ON(!sd || !sd->s_element); switch (sd->s_type) { case SYSFS_DIR: diff --git a/fs/sysfs/sysfs.h b/fs/sysfs/sysfs.h index 32958a7c50e..3651ffb5ec0 100644 --- a/fs/sysfs/sysfs.h +++ b/fs/sysfs/sysfs.h @@ -11,6 +11,7 @@ extern int sysfs_make_dirent(struct sysfs_dirent *, struct dentry *, void *, extern int sysfs_add_file(struct dentry *, const struct attribute *, int); extern void sysfs_hash_and_remove(struct dentry * dir, const char * name); +extern struct sysfs_dirent *sysfs_find(struct sysfs_dirent *dir, const char * name); extern int sysfs_create_subdir(struct kobject *, const char *, struct dentry **); extern void sysfs_remove_subdir(struct dentry *); diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c index 8c66e9270dd..d7074341ee8 100644 --- a/fs/sysv/dir.c +++ b/fs/sysv/dir.c @@ -253,8 +253,7 @@ int sysv_delete_entry(struct sysv_dir_entry *de, struct page *page) lock_page(page); err = mapping->a_ops->prepare_write(NULL, page, from, to); - if (err) - BUG(); + BUG_ON(err); de->inode = 0; err = dir_commit_chunk(page, from, to); dir_put_page(page); @@ -353,8 +352,7 @@ void sysv_set_link(struct sysv_dir_entry *de, struct page *page, lock_page(page); err = page->mapping->a_ops->prepare_write(NULL, page, from, to); - if (err) - BUG(); + BUG_ON(err); de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); err = dir_commit_chunk(page, from, to); dir_put_page(page); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 81e0e8459af..2983afd5e7f 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -312,12 +312,10 @@ static int udf_get_block(struct inode *inode, sector_t block, struct buffer_head err = 0; bh = inode_getblk(inode, block, &err, &phys, &new); - if (bh) - BUG(); + BUG_ON(bh); if (err) goto abort; - if (!phys) - BUG(); + BUG_ON(!phys); if (new) set_buffer_new(bh_result); diff --git a/fs/vfat/namei.c b/fs/vfat/namei.c index ef46939c0c1..a56cec3be5f 100644 --- a/fs/vfat/namei.c +++ b/fs/vfat/namei.c @@ -185,24 +185,6 @@ static int vfat_valid_longname(const unsigned char *name, unsigned int len) return -EINVAL; if (len >= 256) return -ENAMETOOLONG; - - /* MS-DOS "device special files" */ - if (len == 3 || (len > 3 && name[3] == '.')) { /* basename == 3 */ - if (!strnicmp(name, "aux", 3) || - !strnicmp(name, "con", 3) || - !strnicmp(name, "nul", 3) || - !strnicmp(name, "prn", 3)) - return -EINVAL; - } - if (len == 4 || (len > 4 && name[4] == '.')) { /* basename == 4 */ - /* "com1", "com2", ... */ - if ('1' <= name[3] && name[3] <= '9') { - if (!strnicmp(name, "com", 3) || - !strnicmp(name, "lpt", 3)) - return -EINVAL; - } - } - return 0; } diff --git a/fs/xfs/linux-2.6/mrlock.h b/fs/xfs/linux-2.6/mrlock.h index 16b44c3c236..1b262b790d9 100644 --- a/fs/xfs/linux-2.6/mrlock.h +++ b/fs/xfs/linux-2.6/mrlock.h @@ -79,7 +79,7 @@ static inline void mrdemote(mrlock_t *mrp) * Debug-only routine, without some platform-specific asm code, we can * now only answer requests regarding whether we hold the lock for write * (reader state is outside our visibility, we only track writer state). - * Note: means !ismrlocked would give false positivies, so don't do that. + * Note: means !ismrlocked would give false positives, so don't do that. */ static inline int ismrlocked(mrlock_t *mrp, int type) { diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c index c02f7c5b746..4d191ef39b6 100644 --- a/fs/xfs/linux-2.6/xfs_aops.c +++ b/fs/xfs/linux-2.6/xfs_aops.c @@ -372,7 +372,7 @@ static inline int bio_add_buffer(struct bio *bio, struct buffer_head *bh) * assumes that all buffers on the page are started at the same time. * * The fix is two passes across the ioend list - one to start writeback on the - * bufferheads, and then the second one submit them for I/O. + * buffer_heads, and then submit them for I/O on the second pass. */ STATIC void xfs_submit_ioend( @@ -699,7 +699,7 @@ xfs_convert_page( /* * page_dirty is initially a count of buffers on the page before - * EOF and is decrememted as we move each into a cleanable state. + * EOF and is decremented as we move each into a cleanable state. * * Derivation: * @@ -842,7 +842,7 @@ xfs_cluster_write( * page if possible. * The bh->b_state's cannot know if any of the blocks or which block for * that matter are dirty due to mmap writes, and therefore bh uptodate is - * only vaild if the page itself isn't completely uptodate. Some layers + * only valid if the page itself isn't completely uptodate. Some layers * may clear the page dirty flag prior to calling write page, under the * assumption the entire page will be written out; by not writing out the * whole page the page can be reused before all valid dirty data is @@ -870,12 +870,14 @@ xfs_page_state_convert( pgoff_t end_index, last_index, tlast; ssize_t size, len; int flags, err, iomap_valid = 0, uptodate = 1; - int page_dirty, count = 0, trylock_flag = 0; + int page_dirty, count = 0; + int trylock = 0; int all_bh = unmapped; - /* wait for other IO threads? */ - if (startio && (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking)) - trylock_flag |= BMAPI_TRYLOCK; + if (startio) { + if (wbc->sync_mode == WB_SYNC_NONE && wbc->nonblocking) + trylock |= BMAPI_TRYLOCK; + } /* Is this page beyond the end of the file? */ offset = i_size_read(inode); @@ -892,7 +894,7 @@ xfs_page_state_convert( /* * page_dirty is initially a count of buffers on the page before - * EOF and is decrememted as we move each into a cleanable state. + * EOF and is decremented as we move each into a cleanable state. * * Derivation: * @@ -956,15 +958,13 @@ xfs_page_state_convert( if (buffer_unwritten(bh)) { type = IOMAP_UNWRITTEN; - flags = BMAPI_WRITE|BMAPI_IGNSTATE; + flags = BMAPI_WRITE | BMAPI_IGNSTATE; } else if (buffer_delay(bh)) { type = IOMAP_DELAY; - flags = BMAPI_ALLOCATE; - if (!startio) - flags |= trylock_flag; + flags = BMAPI_ALLOCATE | trylock; } else { type = IOMAP_NEW; - flags = BMAPI_WRITE|BMAPI_MMAP; + flags = BMAPI_WRITE | BMAPI_MMAP; } if (!iomap_valid) { @@ -1223,10 +1223,9 @@ free_buffers: } STATIC int -__xfs_get_block( +__xfs_get_blocks( struct inode *inode, sector_t iblock, - unsigned long blocks, struct buffer_head *bh_result, int create, int direct, @@ -1236,22 +1235,17 @@ __xfs_get_block( xfs_iomap_t iomap; xfs_off_t offset; ssize_t size; - int retpbbm = 1; + int niomap = 1; int error; offset = (xfs_off_t)iblock << inode->i_blkbits; - if (blocks) - size = (ssize_t) min_t(xfs_off_t, LONG_MAX, - (xfs_off_t)blocks << inode->i_blkbits); - else - size = 1 << inode->i_blkbits; - + ASSERT(bh_result->b_size >= (1 << inode->i_blkbits)); + size = bh_result->b_size; VOP_BMAP(vp, offset, size, - create ? flags : BMAPI_READ, &iomap, &retpbbm, error); + create ? flags : BMAPI_READ, &iomap, &niomap, error); if (error) return -error; - - if (retpbbm == 0) + if (niomap == 0) return 0; if (iomap.iomap_bn != IOMAP_DADDR_NULL) { @@ -1271,12 +1265,16 @@ __xfs_get_block( } } - /* If this is a realtime file, data might be on a new device */ + /* + * If this is a realtime file, data may be on a different device. + * to that pointed to from the buffer_head b_bdev currently. + */ bh_result->b_bdev = iomap.iomap_target->bt_bdev; - /* If we previously allocated a block out beyond eof and - * we are now coming back to use it then we will need to - * flag it as new even if it has a disk address. + /* + * If we previously allocated a block out beyond eof and we are + * now coming back to use it then we will need to flag it as new + * even if it has a disk address. */ if (create && ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) || @@ -1292,26 +1290,24 @@ __xfs_get_block( } } - if (blocks) { + if (direct || size > (1 << inode->i_blkbits)) { ASSERT(iomap.iomap_bsize - iomap.iomap_delta > 0); offset = min_t(xfs_off_t, - iomap.iomap_bsize - iomap.iomap_delta, - (xfs_off_t)blocks << inode->i_blkbits); - bh_result->b_size = (u32) min_t(xfs_off_t, UINT_MAX, offset); + iomap.iomap_bsize - iomap.iomap_delta, size); + bh_result->b_size = (ssize_t)min_t(xfs_off_t, LONG_MAX, offset); } return 0; } int -xfs_get_block( +xfs_get_blocks( struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - return __xfs_get_block(inode, iblock, - bh_result->b_size >> inode->i_blkbits, + return __xfs_get_blocks(inode, iblock, bh_result, create, 0, BMAPI_WRITE); } @@ -1322,8 +1318,7 @@ xfs_get_blocks_direct( struct buffer_head *bh_result, int create) { - return __xfs_get_block(inode, iblock, - bh_result->b_size >> inode->i_blkbits, + return __xfs_get_blocks(inode, iblock, bh_result, create, 1, BMAPI_WRITE|BMAPI_DIRECT); } @@ -1339,9 +1334,9 @@ xfs_end_io_direct( /* * Non-NULL private data means we need to issue a transaction to * convert a range from unwritten to written extents. This needs - * to happen from process contect but aio+dio I/O completion + * to happen from process context but aio+dio I/O completion * happens from irq context so we need to defer it to a workqueue. - * This is not nessecary for synchronous direct I/O, but we do + * This is not necessary for synchronous direct I/O, but we do * it anyway to keep the code uniform and simpler. * * The core direct I/O code might be changed to always call the @@ -1358,7 +1353,7 @@ xfs_end_io_direct( } /* - * blockdev_direct_IO can return an error even afer the I/O + * blockdev_direct_IO can return an error even after the I/O * completion handler was called. Thus we need to protect * against double-freeing. */ @@ -1405,7 +1400,7 @@ xfs_vm_prepare_write( unsigned int from, unsigned int to) { - return block_prepare_write(page, from, to, xfs_get_block); + return block_prepare_write(page, from, to, xfs_get_blocks); } STATIC sector_t @@ -1422,7 +1417,7 @@ xfs_vm_bmap( VOP_RWLOCK(vp, VRWLOCK_READ); VOP_FLUSH_PAGES(vp, (xfs_off_t)0, -1, 0, FI_REMAPF, error); VOP_RWUNLOCK(vp, VRWLOCK_READ); - return generic_block_bmap(mapping, block, xfs_get_block); + return generic_block_bmap(mapping, block, xfs_get_blocks); } STATIC int @@ -1430,7 +1425,7 @@ xfs_vm_readpage( struct file *unused, struct page *page) { - return mpage_readpage(page, xfs_get_block); + return mpage_readpage(page, xfs_get_blocks); } STATIC int @@ -1440,7 +1435,7 @@ xfs_vm_readpages( struct list_head *pages, unsigned nr_pages) { - return mpage_readpages(mapping, pages, nr_pages, xfs_get_block); + return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks); } STATIC void diff --git a/fs/xfs/linux-2.6/xfs_aops.h b/fs/xfs/linux-2.6/xfs_aops.h index 795699f121d..60716543c68 100644 --- a/fs/xfs/linux-2.6/xfs_aops.h +++ b/fs/xfs/linux-2.6/xfs_aops.h @@ -41,6 +41,6 @@ typedef struct xfs_ioend { } xfs_ioend_t; extern struct address_space_operations xfs_address_space_operations; -extern int xfs_get_block(struct inode *, sector_t, struct buffer_head *, int); +extern int xfs_get_blocks(struct inode *, sector_t, struct buffer_head *, int); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_buf.c b/fs/xfs/linux-2.6/xfs_buf.c index 9fb0312665c..26fed0756f0 100644 --- a/fs/xfs/linux-2.6/xfs_buf.c +++ b/fs/xfs/linux-2.6/xfs_buf.c @@ -182,7 +182,7 @@ free_address( { a_list_t *aentry; - aentry = kmalloc(sizeof(a_list_t), GFP_ATOMIC & ~__GFP_HIGH); + aentry = kmalloc(sizeof(a_list_t), GFP_NOWAIT); if (likely(aentry)) { spin_lock(&as_lock); aentry->next = as_free_head; diff --git a/fs/xfs/linux-2.6/xfs_export.h b/fs/xfs/linux-2.6/xfs_export.h index e5b0559700a..e794ca4efc7 100644 --- a/fs/xfs/linux-2.6/xfs_export.h +++ b/fs/xfs/linux-2.6/xfs_export.h @@ -54,7 +54,7 @@ * Note, the NFS filehandle also includes an fsid portion which * may have an inode number in it. That number is hardcoded to * 32bits and there is no way for XFS to intercept it. In - * practice this means when exporting an XFS filesytem with 64bit + * practice this means when exporting an XFS filesystem with 64bit * inodes you should either export the mountpoint (rather than * a subdirectory) or use the "fsid" export option. */ diff --git a/fs/xfs/linux-2.6/xfs_file.c b/fs/xfs/linux-2.6/xfs_file.c index 85997b1205f..c847416f6d1 100644 --- a/fs/xfs/linux-2.6/xfs_file.c +++ b/fs/xfs/linux-2.6/xfs_file.c @@ -69,7 +69,6 @@ __xfs_file_read( return rval; } - STATIC ssize_t xfs_file_aio_read( struct kiocb *iocb, @@ -90,7 +89,6 @@ xfs_file_aio_read_invis( return __xfs_file_read(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); } - STATIC inline ssize_t __xfs_file_write( struct kiocb *iocb, @@ -113,7 +111,6 @@ __xfs_file_write( return rval; } - STATIC ssize_t xfs_file_aio_write( struct kiocb *iocb, @@ -134,7 +131,6 @@ xfs_file_aio_write_invis( return __xfs_file_write(iocb, buf, IO_ISAIO|IO_INVIS, count, pos); } - STATIC inline ssize_t __xfs_file_readv( struct file *file, @@ -179,7 +175,6 @@ xfs_file_readv_invis( return __xfs_file_readv(file, iov, IO_INVIS, nr_segs, ppos); } - STATIC inline ssize_t __xfs_file_writev( struct file *file, @@ -204,7 +199,6 @@ __xfs_file_writev( return rval; } - STATIC ssize_t xfs_file_writev( struct file *file, @@ -228,7 +222,7 @@ xfs_file_writev_invis( STATIC ssize_t xfs_file_sendfile( struct file *filp, - loff_t *ppos, + loff_t *pos, size_t count, read_actor_t actor, void *target) @@ -236,10 +230,84 @@ xfs_file_sendfile( vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); ssize_t rval; - VOP_SENDFILE(vp, filp, ppos, 0, count, actor, target, NULL, rval); + VOP_SENDFILE(vp, filp, pos, 0, count, actor, target, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_sendfile_invis( + struct file *filp, + loff_t *pos, + size_t count, + read_actor_t actor, + void *target) +{ + vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); + ssize_t rval; + + VOP_SENDFILE(vp, filp, pos, IO_INVIS, count, actor, target, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_read( + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, 0, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_read_invis( + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(infilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_READ(vp, infilp, ppos, pipe, len, flags, IO_INVIS, NULL, rval); return rval; } +STATIC ssize_t +xfs_file_splice_write( + struct pipe_inode_info *pipe, + struct file *outfilp, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, 0, NULL, rval); + return rval; +} + +STATIC ssize_t +xfs_file_splice_write_invis( + struct pipe_inode_info *pipe, + struct file *outfilp, + loff_t *ppos, + size_t len, + unsigned int flags) +{ + vnode_t *vp = vn_from_inode(outfilp->f_dentry->d_inode); + ssize_t rval; + + VOP_SPLICE_WRITE(vp, pipe, outfilp, ppos, len, flags, IO_INVIS, NULL, rval); + return rval; +} STATIC int xfs_file_open( @@ -251,13 +319,10 @@ xfs_file_open( if (!(filp->f_flags & O_LARGEFILE) && i_size_read(inode) > MAX_NON_LFS) return -EFBIG; - - ASSERT(vp); VOP_OPEN(vp, NULL, error); return -error; } - STATIC int xfs_file_release( struct inode *inode, @@ -271,7 +336,6 @@ xfs_file_release( return -error; } - STATIC int xfs_file_fsync( struct file *filp, @@ -285,21 +349,11 @@ xfs_file_fsync( if (datasync) flags |= FSYNC_DATA; - - ASSERT(vp); VOP_FSYNC(vp, flags, NULL, (xfs_off_t)0, (xfs_off_t)-1, error); return -error; } -/* - * xfs_file_readdir maps to VOP_READDIR(). - * We need to build a uio, cred, ... - */ - -#define nextdp(dp) ((struct xfs_dirent *)((char *)(dp) + (dp)->d_reclen)) - #ifdef CONFIG_XFS_DMAPI - STATIC struct page * xfs_vm_nopage( struct vm_area_struct *area, @@ -319,10 +373,8 @@ xfs_vm_nopage( return filemap_nopage(area, address, type); } - #endif /* CONFIG_XFS_DMAPI */ - STATIC int xfs_file_readdir( struct file *filp, @@ -330,7 +382,7 @@ xfs_file_readdir( filldir_t filldir) { int error = 0; - vnode_t *vp; + vnode_t *vp = vn_from_inode(filp->f_dentry->d_inode); uio_t uio; iovec_t iov; int eof = 0; @@ -340,9 +392,6 @@ xfs_file_readdir( xfs_off_t start_offset, curr_offset; xfs_dirent_t *dbp = NULL; - vp = vn_from_inode(filp->f_dentry->d_inode); - ASSERT(vp); - /* Try fairly hard to get memory */ do { if ((read_buf = (caddr_t)kmalloc(rlen, GFP_KERNEL))) @@ -387,7 +436,7 @@ xfs_file_readdir( } size -= dbp->d_reclen; curr_offset = (loff_t)dbp->d_off /* & 0x7fffffff */; - dbp = nextdp(dbp); + dbp = (xfs_dirent_t *)((char *)dbp + dbp->d_reclen); } } done: @@ -402,7 +451,6 @@ done: return -error; } - STATIC int xfs_file_mmap( struct file *filp, @@ -457,11 +505,10 @@ xfs_file_ioctl_invis( unsigned int cmd, unsigned long arg) { - int error; struct inode *inode = filp->f_dentry->d_inode; vnode_t *vp = vn_from_inode(inode); + int error; - ASSERT(vp); VOP_IOCTL(vp, inode, filp, IO_INVIS, cmd, (void __user *)arg, error); VMODIFY(vp); @@ -537,6 +584,8 @@ const struct file_operations xfs_file_operations = { .aio_read = xfs_file_aio_read, .aio_write = xfs_file_aio_write, .sendfile = xfs_file_sendfile, + .splice_read = xfs_file_splice_read, + .splice_write = xfs_file_splice_write, .unlocked_ioctl = xfs_file_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_ioctl, @@ -558,7 +607,9 @@ const struct file_operations xfs_invis_file_operations = { .writev = xfs_file_writev_invis, .aio_read = xfs_file_aio_read_invis, .aio_write = xfs_file_aio_write_invis, - .sendfile = xfs_file_sendfile, + .sendfile = xfs_file_sendfile_invis, + .splice_read = xfs_file_splice_read_invis, + .splice_write = xfs_file_splice_write_invis, .unlocked_ioctl = xfs_file_ioctl_invis, #ifdef CONFIG_COMPAT .compat_ioctl = xfs_file_compat_invis_ioctl, diff --git a/fs/xfs/linux-2.6/xfs_ioctl32.c b/fs/xfs/linux-2.6/xfs_ioctl32.c index b6321abd9a8..251bfe451a3 100644 --- a/fs/xfs/linux-2.6/xfs_ioctl32.c +++ b/fs/xfs/linux-2.6/xfs_ioctl32.c @@ -72,7 +72,7 @@ xfs_ioctl32_flock( copy_in_user(&p->l_pid, &p32->l_pid, sizeof(u32)) || copy_in_user(&p->l_pad, &p32->l_pad, 4*sizeof(u32))) return -EFAULT; - + return (unsigned long)p; } @@ -107,11 +107,15 @@ xfs_ioctl32_bulkstat( #endif STATIC long -xfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg) +xfs_compat_ioctl( + int mode, + struct file *file, + unsigned cmd, + unsigned long arg) { + struct inode *inode = file->f_dentry->d_inode; + vnode_t *vp = vn_from_inode(inode); int error; - struct inode *inode = f->f_dentry->d_inode; - vnode_t *vp = vn_to_inode(inode); switch (cmd) { case XFS_IOC_DIOINFO: @@ -189,7 +193,7 @@ xfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg) return -ENOIOCTLCMD; } - VOP_IOCTL(vp, inode, f, mode, cmd, (void __user *)arg, error); + VOP_IOCTL(vp, inode, file, mode, cmd, (void __user *)arg, error); VMODIFY(vp); return error; @@ -197,18 +201,18 @@ xfs_compat_ioctl(int mode, struct file *f, unsigned cmd, unsigned long arg) long xfs_file_compat_ioctl( - struct file *f, + struct file *file, unsigned cmd, unsigned long arg) { - return xfs_compat_ioctl(0, f, cmd, arg); + return xfs_compat_ioctl(0, file, cmd, arg); } long xfs_file_compat_invis_ioctl( - struct file *f, + struct file *file, unsigned cmd, unsigned long arg) { - return xfs_compat_ioctl(IO_INVIS, f, cmd, arg); + return xfs_compat_ioctl(IO_INVIS, file, cmd, arg); } diff --git a/fs/xfs/linux-2.6/xfs_iops.c b/fs/xfs/linux-2.6/xfs_iops.c index af487437bd7..2e2e275c786 100644 --- a/fs/xfs/linux-2.6/xfs_iops.c +++ b/fs/xfs/linux-2.6/xfs_iops.c @@ -673,8 +673,7 @@ xfs_vn_setattr( if (ia_valid & ATTR_ATIME) { vattr.va_mask |= XFS_AT_ATIME; vattr.va_atime = attr->ia_atime; - if (ia_valid & ATTR_ATIME_SET) - inode->i_atime = attr->ia_atime; + inode->i_atime = attr->ia_atime; } if (ia_valid & ATTR_MTIME) { vattr.va_mask |= XFS_AT_MTIME; @@ -708,7 +707,7 @@ STATIC void xfs_vn_truncate( struct inode *inode) { - block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_block); + block_truncate_page(inode->i_mapping, inode->i_size, xfs_get_blocks); } STATIC int diff --git a/fs/xfs/linux-2.6/xfs_linux.h b/fs/xfs/linux-2.6/xfs_linux.h index 1fe09f2d651..e9fe43d7476 100644 --- a/fs/xfs/linux-2.6/xfs_linux.h +++ b/fs/xfs/linux-2.6/xfs_linux.h @@ -103,6 +103,7 @@ */ #undef HAVE_REFCACHE /* reference cache not needed for NFS in 2.6 */ #define HAVE_SENDFILE /* sendfile(2) exists in 2.6, but not in 2.4 */ +#define HAVE_SPLICE /* a splice(2) exists in 2.6, but not in 2.4 */ #ifdef CONFIG_SMP #define HAVE_PERCPU_SB /* per cpu superblock counters are a 2.6 feature */ #else diff --git a/fs/xfs/linux-2.6/xfs_lrw.c b/fs/xfs/linux-2.6/xfs_lrw.c index 0169360475c..67efe330898 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.c +++ b/fs/xfs/linux-2.6/xfs_lrw.c @@ -301,36 +301,23 @@ xfs_sendfile( void *target, cred_t *credp) { + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_mount_t *mp = ip->i_mount; ssize_t ret; - xfs_fsize_t n; - xfs_inode_t *ip; - xfs_mount_t *mp; - vnode_t *vp; - - ip = XFS_BHVTOI(bdp); - vp = BHV_TO_VNODE(bdp); - mp = ip->i_mount; XFS_STATS_INC(xs_read_calls); - - n = XFS_MAXIOFFSET(mp) - *offset; - if ((n <= 0) || (count == 0)) - return 0; - - if (n < count) - count = n; - - if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; xfs_ilock(ip, XFS_IOLOCK_SHARED); - if (DM_EVENT_ENABLED(vp->v_vfsp, ip, DM_EVENT_READ) && + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && (!(ioflags & IO_INVIS))) { vrwlock_t locktype = VRWLOCK_READ; int error; - error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), *offset, count, + error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), + *offset, count, FILP_DELAY_FLAG(filp), &locktype); if (error) { xfs_iunlock(ip, XFS_IOLOCK_SHARED); @@ -340,12 +327,98 @@ xfs_sendfile( xfs_rw_enter_trace(XFS_SENDFILE_ENTER, &ip->i_iocore, (void *)(unsigned long)target, count, *offset, ioflags); ret = generic_file_sendfile(filp, offset, count, actor, target); + if (ret > 0) + XFS_STATS_ADD(xs_read_bytes, ret); xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} +ssize_t +xfs_splice_read( + bhv_desc_t *bdp, + struct file *infilp, + loff_t *ppos, + struct pipe_inode_info *pipe, + size_t count, + int flags, + int ioflags, + cred_t *credp) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_mount_t *mp = ip->i_mount; + ssize_t ret; + + XFS_STATS_INC(xs_read_calls); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_SHARED); + + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_READ) && + (!(ioflags & IO_INVIS))) { + vrwlock_t locktype = VRWLOCK_READ; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_READ, BHV_TO_VNODE(bdp), + *ppos, count, + FILP_DELAY_FLAG(infilp), &locktype); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return -error; + } + } + xfs_rw_enter_trace(XFS_SPLICE_READ_ENTER, &ip->i_iocore, + pipe, count, *ppos, ioflags); + ret = generic_file_splice_read(infilp, ppos, pipe, count, flags); if (ret > 0) XFS_STATS_ADD(xs_read_bytes, ret); + xfs_iunlock(ip, XFS_IOLOCK_SHARED); + return ret; +} + +ssize_t +xfs_splice_write( + bhv_desc_t *bdp, + struct pipe_inode_info *pipe, + struct file *outfilp, + loff_t *ppos, + size_t count, + int flags, + int ioflags, + cred_t *credp) +{ + xfs_inode_t *ip = XFS_BHVTOI(bdp); + xfs_mount_t *mp = ip->i_mount; + ssize_t ret; + + XFS_STATS_INC(xs_write_calls); + if (XFS_FORCED_SHUTDOWN(ip->i_mount)) + return -EIO; + + xfs_ilock(ip, XFS_IOLOCK_EXCL); + + if (DM_EVENT_ENABLED(BHV_TO_VNODE(bdp)->v_vfsp, ip, DM_EVENT_WRITE) && + (!(ioflags & IO_INVIS))) { + vrwlock_t locktype = VRWLOCK_WRITE; + int error; + + error = XFS_SEND_DATA(mp, DM_EVENT_WRITE, BHV_TO_VNODE(bdp), + *ppos, count, + FILP_DELAY_FLAG(outfilp), &locktype); + if (error) { + xfs_iunlock(ip, XFS_IOLOCK_EXCL); + return -error; + } + } + xfs_rw_enter_trace(XFS_SPLICE_WRITE_ENTER, &ip->i_iocore, + pipe, count, *ppos, ioflags); + ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags); + if (ret > 0) + XFS_STATS_ADD(xs_write_bytes, ret); + + xfs_iunlock(ip, XFS_IOLOCK_EXCL); return ret; } @@ -363,7 +436,7 @@ xfs_zero_last_block( xfs_fsize_t end_size) { xfs_fileoff_t last_fsb; - xfs_mount_t *mp; + xfs_mount_t *mp = io->io_mount; int nimaps; int zero_offset; int zero_len; @@ -373,8 +446,6 @@ xfs_zero_last_block( ASSERT(ismrlocked(io->io_lock, MR_UPDATE) != 0); - mp = io->io_mount; - zero_offset = XFS_B_FSB_OFFSET(mp, isize); if (zero_offset == 0) { /* @@ -405,10 +476,9 @@ xfs_zero_last_block( * don't deadlock when the buffer cache calls back to us. */ XFS_IUNLOCK(mp, io, XFS_ILOCK_EXCL| XFS_EXTSIZE_RD); - loff = XFS_FSB_TO_B(mp, last_fsb); + loff = XFS_FSB_TO_B(mp, last_fsb); zero_len = mp->m_sb.sb_blocksize - zero_offset; - error = xfs_iozero(ip, loff + zero_offset, zero_len, end_size); XFS_ILOCK(mp, io, XFS_ILOCK_EXCL|XFS_EXTSIZE_RD); @@ -441,7 +511,7 @@ xfs_zero_eof( xfs_fileoff_t zero_count_fsb; xfs_fileoff_t last_fsb; xfs_extlen_t buf_len_fsb; - xfs_mount_t *mp; + xfs_mount_t *mp = io->io_mount; int nimaps; int error = 0; xfs_bmbt_irec_t imap; @@ -450,8 +520,6 @@ xfs_zero_eof( ASSERT(ismrlocked(io->io_iolock, MR_UPDATE)); ASSERT(offset > isize); - mp = io->io_mount; - /* * First handle zeroing the block on which isize resides. * We only zero a part of that block so it is handled specially. @@ -681,7 +749,7 @@ start: eventsent = 1; /* - * The iolock was dropped and reaquired in XFS_SEND_DATA + * The iolock was dropped and reacquired in XFS_SEND_DATA * so we have to recheck the size when appending. * We will only "goto start;" once, since having sent the * event prevents another call to XFS_SEND_DATA, which is diff --git a/fs/xfs/linux-2.6/xfs_lrw.h b/fs/xfs/linux-2.6/xfs_lrw.h index 38864a88d42..8f453995235 100644 --- a/fs/xfs/linux-2.6/xfs_lrw.h +++ b/fs/xfs/linux-2.6/xfs_lrw.h @@ -60,6 +60,8 @@ struct xfs_iomap; #define XFS_IOMAP_ALLOC_ENTER 25 #define XFS_IOMAP_ALLOC_MAP 26 #define XFS_IOMAP_UNWRITTEN 27 +#define XFS_SPLICE_READ_ENTER 28 +#define XFS_SPLICE_WRITE_ENTER 29 extern void xfs_rw_enter_trace(int, struct xfs_iocore *, void *, size_t, loff_t, int); extern void xfs_inval_cached_trace(struct xfs_iocore *, @@ -78,6 +80,7 @@ extern int xfs_bmap(struct bhv_desc *, xfs_off_t, ssize_t, int, struct xfs_iomap *, int *); extern int xfsbdstrat(struct xfs_mount *, struct xfs_buf *); extern int xfs_bdstrat_cb(struct xfs_buf *); +extern int xfs_dev_is_read_only(struct xfs_mount *, char *); extern int xfs_zero_eof(struct vnode *, struct xfs_iocore *, xfs_off_t, xfs_fsize_t, xfs_fsize_t); @@ -90,7 +93,11 @@ extern ssize_t xfs_write(struct bhv_desc *, struct kiocb *, extern ssize_t xfs_sendfile(struct bhv_desc *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); - -extern int xfs_dev_is_read_only(struct xfs_mount *, char *); +extern ssize_t xfs_splice_read(struct bhv_desc *, struct file *, loff_t *, + struct pipe_inode_info *, size_t, int, int, + struct cred *); +extern ssize_t xfs_splice_write(struct bhv_desc *, struct pipe_inode_info *, + struct file *, loff_t *, size_t, int, int, + struct cred *); #endif /* __XFS_LRW_H__ */ diff --git a/fs/xfs/linux-2.6/xfs_super.c b/fs/xfs/linux-2.6/xfs_super.c index 1884300417e..68f4793e8a1 100644 --- a/fs/xfs/linux-2.6/xfs_super.c +++ b/fs/xfs/linux-2.6/xfs_super.c @@ -67,7 +67,8 @@ mempool_t *xfs_ioend_pool; STATIC struct xfs_mount_args * xfs_args_allocate( - struct super_block *sb) + struct super_block *sb, + int silent) { struct xfs_mount_args *args; @@ -80,8 +81,8 @@ xfs_args_allocate( args->flags |= XFSMNT_DIRSYNC; if (sb->s_flags & MS_SYNCHRONOUS) args->flags |= XFSMNT_WSYNC; - - /* Default to 32 bit inodes on Linux all the time */ + if (silent) + args->flags |= XFSMNT_QUIET; args->flags |= XFSMNT_32BITINODES; return args; @@ -719,7 +720,7 @@ xfs_fs_remount( char *options) { vfs_t *vfsp = vfs_from_sb(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb); + struct xfs_mount_args *args = xfs_args_allocate(sb, 0); int error; VFS_PARSEARGS(vfsp, options, args, 1, error); @@ -825,7 +826,7 @@ xfs_fs_fill_super( { vnode_t *rootvp; struct vfs *vfsp = vfs_allocate(sb); - struct xfs_mount_args *args = xfs_args_allocate(sb); + struct xfs_mount_args *args = xfs_args_allocate(sb, silent); struct kstatfs statvfs; int error, error2; diff --git a/fs/xfs/linux-2.6/xfs_vfs.h b/fs/xfs/linux-2.6/xfs_vfs.h index 8fed356db05..841200c0309 100644 --- a/fs/xfs/linux-2.6/xfs_vfs.h +++ b/fs/xfs/linux-2.6/xfs_vfs.h @@ -92,7 +92,7 @@ typedef enum { #define SYNC_FSDATA 0x0020 /* flush fs data (e.g. superblocks) */ #define SYNC_REFCACHE 0x0040 /* prune some of the nfs ref cache */ #define SYNC_REMOUNT 0x0080 /* remount readonly, no dummy LRs */ -#define SYNC_QUIESCE 0x0100 /* quiesce fileystem for a snapshot */ +#define SYNC_QUIESCE 0x0100 /* quiesce filesystem for a snapshot */ typedef int (*vfs_mount_t)(bhv_desc_t *, struct xfs_mount_args *, struct cred *); diff --git a/fs/xfs/linux-2.6/xfs_vnode.h b/fs/xfs/linux-2.6/xfs_vnode.h index 06f5845e956..2a8e16c2235 100644 --- a/fs/xfs/linux-2.6/xfs_vnode.h +++ b/fs/xfs/linux-2.6/xfs_vnode.h @@ -173,6 +173,12 @@ typedef ssize_t (*vop_write_t)(bhv_desc_t *, struct kiocb *, typedef ssize_t (*vop_sendfile_t)(bhv_desc_t *, struct file *, loff_t *, int, size_t, read_actor_t, void *, struct cred *); +typedef ssize_t (*vop_splice_read_t)(bhv_desc_t *, struct file *, loff_t *, + struct pipe_inode_info *, size_t, int, int, + struct cred *); +typedef ssize_t (*vop_splice_write_t)(bhv_desc_t *, struct pipe_inode_info *, + struct file *, loff_t *, size_t, int, int, + struct cred *); typedef int (*vop_ioctl_t)(bhv_desc_t *, struct inode *, struct file *, int, unsigned int, void __user *); typedef int (*vop_getattr_t)(bhv_desc_t *, struct vattr *, int, @@ -231,6 +237,8 @@ typedef struct vnodeops { vop_read_t vop_read; vop_write_t vop_write; vop_sendfile_t vop_sendfile; + vop_splice_read_t vop_splice_read; + vop_splice_write_t vop_splice_write; vop_ioctl_t vop_ioctl; vop_getattr_t vop_getattr; vop_setattr_t vop_setattr; @@ -276,6 +284,10 @@ typedef struct vnodeops { rv = _VOP_(vop_write, vp)((vp)->v_fbhv,file,iov,segs,offset,ioflags,cr) #define VOP_SENDFILE(vp,f,off,ioflags,cnt,act,targ,cr,rv) \ rv = _VOP_(vop_sendfile, vp)((vp)->v_fbhv,f,off,ioflags,cnt,act,targ,cr) +#define VOP_SPLICE_READ(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_read, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) +#define VOP_SPLICE_WRITE(vp,f,o,pipe,cnt,fl,iofl,cr,rv) \ + rv = _VOP_(vop_splice_write, vp)((vp)->v_fbhv,f,o,pipe,cnt,fl,iofl,cr) #define VOP_BMAP(vp,of,sz,rw,b,n,rv) \ rv = _VOP_(vop_bmap, vp)((vp)->v_fbhv,of,sz,rw,b,n) #define VOP_OPEN(vp, cr, rv) \ diff --git a/fs/xfs/quota/xfs_dquot_item.c b/fs/xfs/quota/xfs_dquot_item.c index e4e5f05b841..546f48af882 100644 --- a/fs/xfs/quota/xfs_dquot_item.c +++ b/fs/xfs/quota/xfs_dquot_item.c @@ -221,7 +221,7 @@ xfs_qm_dqunpin_wait( * as possible. * * We must not be holding the AIL_LOCK at this point. Calling incore() to - * search the buffercache can be a time consuming thing, and AIL_LOCK is a + * search the buffer cache can be a time consuming thing, and AIL_LOCK is a * spinlock. */ STATIC void diff --git a/fs/xfs/quota/xfs_qm.c b/fs/xfs/quota/xfs_qm.c index 1fb757ef3f4..7fb5eca9bd5 100644 --- a/fs/xfs/quota/xfs_qm.c +++ b/fs/xfs/quota/xfs_qm.c @@ -289,7 +289,7 @@ xfs_qm_rele_quotafs_ref( /* * This is called at mount time from xfs_mountfs to initialize the quotainfo - * structure and start the global quotamanager (xfs_Gqm) if it hasn't done + * structure and start the global quota manager (xfs_Gqm) if it hasn't done * so already. Note that the superblock has not been read in yet. */ void @@ -807,7 +807,7 @@ xfs_qm_dqattach_one( * Given a udquot and gdquot, attach a ptr to the group dquot in the * udquot as a hint for future lookups. The idea sounds simple, but the * execution isn't, because the udquot might have a group dquot attached - * already and getting rid of that gets us into lock ordering contraints. + * already and getting rid of that gets us into lock ordering constraints. * The process is complicated more by the fact that the dquots may or may not * be locked on entry. */ @@ -1094,10 +1094,10 @@ xfs_qm_sync( } /* * If we can't grab the flush lock then if the caller - * really wanted us to give this our best shot, + * really wanted us to give this our best shot, so * see if we can give a push to the buffer before we wait * on the flush lock. At this point, we know that - * eventhough the dquot is being flushed, + * even though the dquot is being flushed, * it has (new) dirty data. */ xfs_qm_dqflock_pushbuf_wait(dqp); @@ -1491,7 +1491,7 @@ xfs_qm_reset_dqcounts( /* * Do a sanity check, and if needed, repair the dqblk. Don't * output any warnings because it's perfectly possible to - * find unitialized dquot blks. See comment in xfs_qm_dqcheck. + * find uninitialised dquot blks. See comment in xfs_qm_dqcheck. */ (void) xfs_qm_dqcheck(ddq, id+j, type, XFS_QMOPT_DQREPAIR, "xfs_quotacheck"); @@ -1580,7 +1580,7 @@ xfs_qm_dqiterate( error = 0; /* - * This looks racey, but we can't keep an inode lock across a + * This looks racy, but we can't keep an inode lock across a * trans_reserve. But, this gets called during quotacheck, and that * happens only at mount time which is single threaded. */ @@ -1824,7 +1824,7 @@ xfs_qm_dqusage_adjust( * we have to start from the beginning anyway. * Once we're done, we'll log all the dquot bufs. * - * The *QUOTA_ON checks below may look pretty racey, but quotachecks + * The *QUOTA_ON checks below may look pretty racy, but quotachecks * and quotaoffs don't race. (Quotachecks happen at mount time only). */ if (XFS_IS_UQUOTA_ON(mp)) { @@ -2624,7 +2624,7 @@ xfs_qm_vop_chown_reserve( { int error; xfs_mount_t *mp; - uint delblks, blkflags; + uint delblks, blkflags, prjflags = 0; xfs_dquot_t *unresudq, *unresgdq, *delblksudq, *delblksgdq; ASSERT(XFS_ISLOCKED_INODE(ip)); @@ -2650,10 +2650,13 @@ xfs_qm_vop_chown_reserve( } } if (XFS_IS_OQUOTA_ON(ip->i_mount) && gdqp) { - if ((XFS_IS_GQUOTA_ON(ip->i_mount) && - ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id)) || - (XFS_IS_PQUOTA_ON(ip->i_mount) && - ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id))) { + if (XFS_IS_PQUOTA_ON(ip->i_mount) && + ip->i_d.di_projid != be32_to_cpu(gdqp->q_core.d_id)) + prjflags = XFS_QMOPT_ENOSPC; + + if (prjflags || + (XFS_IS_GQUOTA_ON(ip->i_mount) && + ip->i_d.di_gid != be32_to_cpu(gdqp->q_core.d_id))) { delblksgdq = gdqp; if (delblks) { ASSERT(ip->i_gdquot); @@ -2664,7 +2667,7 @@ xfs_qm_vop_chown_reserve( if ((error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, delblksudq, delblksgdq, ip->i_d.di_nblocks, 1, - flags | blkflags))) + flags | blkflags | prjflags))) return (error); /* @@ -2681,7 +2684,7 @@ xfs_qm_vop_chown_reserve( ASSERT(unresudq || unresgdq); if ((error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, delblksudq, delblksgdq, (xfs_qcnt_t)delblks, 0, - flags | blkflags))) + flags | blkflags | prjflags))) return (error); xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount, unresudq, unresgdq, -((xfs_qcnt_t)delblks), 0, diff --git a/fs/xfs/quota/xfs_qm_syscalls.c b/fs/xfs/quota/xfs_qm_syscalls.c index 676884394aa..c55db463bbf 100644 --- a/fs/xfs/quota/xfs_qm_syscalls.c +++ b/fs/xfs/quota/xfs_qm_syscalls.c @@ -912,7 +912,7 @@ xfs_qm_export_dquot( /* * Internally, we don't reset all the timers when quota enforcement - * gets turned off. No need to confuse the userlevel code, + * gets turned off. No need to confuse the user level code, * so return zeroes in that case. */ if (! XFS_IS_QUOTA_ENFORCED(mp)) { diff --git a/fs/xfs/quota/xfs_trans_dquot.c b/fs/xfs/quota/xfs_trans_dquot.c index 3290975d31f..9168918db25 100644 --- a/fs/xfs/quota/xfs_trans_dquot.c +++ b/fs/xfs/quota/xfs_trans_dquot.c @@ -595,12 +595,19 @@ xfs_trans_unreserve_and_mod_dquots( } } +STATIC int +xfs_quota_error(uint flags) +{ + if (flags & XFS_QMOPT_ENOSPC) + return ENOSPC; + return EDQUOT; +} + /* * This reserves disk blocks and inodes against a dquot. * Flags indicate if the dquot is to be locked here and also * if the blk reservation is for RT or regular blocks. * Sending in XFS_QMOPT_FORCE_RES flag skips the quota check. - * Returns EDQUOT if quota is exceeded. */ STATIC int xfs_trans_dqresv( @@ -666,19 +673,15 @@ xfs_trans_dqresv( */ if (hardlimit > 0ULL && (hardlimit <= nblks + *resbcountp)) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } if (softlimit > 0ULL && (softlimit <= nblks + *resbcountp)) { - /* - * If timer or warnings has expired, - * return EDQUOT - */ if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } } @@ -695,16 +698,12 @@ xfs_trans_dqresv( if (!softlimit) softlimit = q->qi_isoftlimit; if (hardlimit > 0ULL && count >= hardlimit) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } else if (softlimit > 0ULL && count >= softlimit) { - /* - * If timer or warnings has expired, - * return EDQUOT - */ if ((timer != 0 && get_seconds() > timer) || (warns != 0 && warns >= warnlimit)) { - error = EDQUOT; + error = xfs_quota_error(flags); goto error_return; } } @@ -751,13 +750,14 @@ error_return: /* - * Given a dquot(s), make disk block and/or inode reservations against them. + * Given dquot(s), make disk block and/or inode reservations against them. * The fact that this does the reservation against both the usr and - * grp quotas is important, because this follows a both-or-nothing + * grp/prj quotas is important, because this follows a both-or-nothing * approach. * * flags = XFS_QMOPT_DQLOCK indicate if dquot(s) need to be locked. * XFS_QMOPT_FORCE_RES evades limit enforcement. Used by chown. + * XFS_QMOPT_ENOSPC returns ENOSPC not EDQUOT. Used by pquota. * XFS_TRANS_DQ_RES_BLKS reserves regular disk blocks * XFS_TRANS_DQ_RES_RTBLKS reserves realtime disk blocks * dquots are unlocked on return, if they were not locked by caller. @@ -772,25 +772,27 @@ xfs_trans_reserve_quota_bydquots( long ninos, uint flags) { - int resvd; + int resvd = 0, error; - if (! XFS_IS_QUOTA_ON(mp)) - return (0); + if (!XFS_IS_QUOTA_ON(mp)) + return 0; if (tp && tp->t_dqinfo == NULL) xfs_trans_alloc_dqinfo(tp); ASSERT(flags & XFS_QMOPT_RESBLK_MASK); - resvd = 0; if (udqp) { - if (xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, flags)) - return (EDQUOT); + error = xfs_trans_dqresv(tp, mp, udqp, nblks, ninos, + (flags & ~XFS_QMOPT_ENOSPC)); + if (error) + return error; resvd = 1; } if (gdqp) { - if (xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags)) { + error = xfs_trans_dqresv(tp, mp, gdqp, nblks, ninos, flags); + if (error) { /* * can't do it, so backout previous reservation */ @@ -799,14 +801,14 @@ xfs_trans_reserve_quota_bydquots( xfs_trans_dqresv(tp, mp, udqp, -nblks, -ninos, flags); } - return (EDQUOT); + return error; } } /* - * Didnt change anything critical, so, no need to log + * Didn't change anything critical, so, no need to log */ - return (0); + return 0; } @@ -814,8 +816,6 @@ xfs_trans_reserve_quota_bydquots( * Lock the dquot and change the reservation if we can. * This doesn't change the actual usage, just the reservation. * The inode sent in is locked. - * - * Returns 0 on success, EDQUOT or other errors otherwise */ STATIC int xfs_trans_reserve_quota_nblks( @@ -824,20 +824,24 @@ xfs_trans_reserve_quota_nblks( xfs_inode_t *ip, long nblks, long ninos, - uint type) + uint flags) { int error; if (!XFS_IS_QUOTA_ON(mp)) - return (0); + return 0; + if (XFS_IS_PQUOTA_ON(mp)) + flags |= XFS_QMOPT_ENOSPC; ASSERT(ip->i_ino != mp->m_sb.sb_uquotino); ASSERT(ip->i_ino != mp->m_sb.sb_gquotino); ASSERT(XFS_ISLOCKED_INODE_EXCL(ip)); ASSERT(XFS_IS_QUOTA_RUNNING(ip->i_mount)); - ASSERT((type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_RTBLKS || - (type & ~XFS_QMOPT_FORCE_RES) == XFS_TRANS_DQ_RES_BLKS); + ASSERT((flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_RTBLKS || + (flags & ~(XFS_QMOPT_FORCE_RES | XFS_QMOPT_ENOSPC)) == + XFS_TRANS_DQ_RES_BLKS); /* * Reserve nblks against these dquots, with trans as the mediator. @@ -845,8 +849,8 @@ xfs_trans_reserve_quota_nblks( error = xfs_trans_reserve_quota_bydquots(tp, mp, ip->i_udquot, ip->i_gdquot, nblks, ninos, - type); - return (error); + flags); + return error; } /* diff --git a/fs/xfs/xfs_acl.c b/fs/xfs/xfs_acl.c index 4ff0f4e41c6..2539af34eb6 100644 --- a/fs/xfs/xfs_acl.c +++ b/fs/xfs/xfs_acl.c @@ -395,7 +395,7 @@ xfs_acl_allow_set( * The access control process to determine the access permission: * if uid == file owner id, use the file owner bits. * if gid == file owner group id, use the file group bits. - * scan ACL for a maching user or group, and use matched entry + * scan ACL for a matching user or group, and use matched entry * permission. Use total permissions of all matching group entries, * until all acl entries are exhausted. The final permission produced * by matching acl entry or entries needs to be & with group permission. diff --git a/fs/xfs/xfs_ag.h b/fs/xfs/xfs_ag.h index a96e2ffce0c..dc2361dd740 100644 --- a/fs/xfs/xfs_ag.h +++ b/fs/xfs/xfs_ag.h @@ -179,7 +179,7 @@ typedef struct xfs_perag { char pagf_init; /* this agf's entry is initialized */ char pagi_init; /* this agi's entry is initialized */ - char pagf_metadata; /* the agf is prefered to be metadata */ + char pagf_metadata; /* the agf is preferred to be metadata */ char pagi_inodeok; /* The agi is ok for inodes */ __uint8_t pagf_levels[XFS_BTNUM_AGF]; /* # of levels in bno & cnt btree */ diff --git a/fs/xfs/xfs_alloc.c b/fs/xfs/xfs_alloc.c index f4328e1e2a7..64ee07db0d5 100644 --- a/fs/xfs/xfs_alloc.c +++ b/fs/xfs/xfs_alloc.c @@ -511,7 +511,7 @@ STATIC void xfs_alloc_trace_busy( char *name, /* function tag string */ char *str, /* additional string */ - xfs_mount_t *mp, /* file system mount poing */ + xfs_mount_t *mp, /* file system mount point */ xfs_agnumber_t agno, /* allocation group number */ xfs_agblock_t agbno, /* a.g. relative block number */ xfs_extlen_t len, /* length of extent */ @@ -1843,7 +1843,7 @@ xfs_alloc_fix_freelist( } else agbp = NULL; - /* If this is a metadata prefered pag and we are user data + /* If this is a metadata preferred pag and we are user data * then try somewhere else if we are not being asked to * try harder at this point */ @@ -2458,7 +2458,7 @@ error0: /* * AG Busy list management * The busy list contains block ranges that have been freed but whose - * transacations have not yet hit disk. If any block listed in a busy + * transactions have not yet hit disk. If any block listed in a busy * list is reused, the transaction that freed it must be forced to disk * before continuing to use the block. * diff --git a/fs/xfs/xfs_alloc.h b/fs/xfs/xfs_alloc.h index 3546dea27b7..2d1f8928b26 100644 --- a/fs/xfs/xfs_alloc.h +++ b/fs/xfs/xfs_alloc.h @@ -68,7 +68,7 @@ typedef struct xfs_alloc_arg { xfs_alloctype_t otype; /* original allocation type */ char wasdel; /* set if allocation was prev delayed */ char wasfromfl; /* set if allocation is from freelist */ - char isfl; /* set if is freelist blocks - !actg */ + char isfl; /* set if is freelist blocks - !acctg */ char userdata; /* set if this is user data */ } xfs_alloc_arg_t; diff --git a/fs/xfs/xfs_attr.c b/fs/xfs/xfs_attr.c index 093fac476bd..b6e1e02bbb2 100644 --- a/fs/xfs/xfs_attr.c +++ b/fs/xfs/xfs_attr.c @@ -294,7 +294,7 @@ xfs_attr_set_int(xfs_inode_t *dp, const char *name, int namelen, xfs_trans_ihold(args.trans, dp); /* - * If the attribute list is non-existant or a shortform list, + * If the attribute list is non-existent or a shortform list, * upgrade it to a single-leaf-block attribute list. */ if ((dp->i_d.di_aformat == XFS_DINODE_FMT_LOCAL) || @@ -1584,7 +1584,7 @@ out: * Fill in the disk block numbers in the state structure for the buffers * that are attached to the state structure. * This is done so that we can quickly reattach ourselves to those buffers - * after some set of transaction commit's has released these buffers. + * after some set of transaction commits have released these buffers. */ STATIC int xfs_attr_fillstate(xfs_da_state_t *state) @@ -1631,7 +1631,7 @@ xfs_attr_fillstate(xfs_da_state_t *state) /* * Reattach the buffers to the state structure based on the disk block * numbers stored in the state structure. - * This is done after some set of transaction commit's has released those + * This is done after some set of transaction commits have released those * buffers from our grip. */ STATIC int diff --git a/fs/xfs/xfs_attr_leaf.c b/fs/xfs/xfs_attr_leaf.c index 717682747bd..9462be86aa1 100644 --- a/fs/xfs/xfs_attr_leaf.c +++ b/fs/xfs/xfs_attr_leaf.c @@ -524,7 +524,7 @@ xfs_attr_shortform_compare(const void *a, const void *b) /* * Copy out entries of shortform attribute lists for attr_list(). - * Shortform atrtribute lists are not stored in hashval sorted order. + * Shortform attribute lists are not stored in hashval sorted order. * If the output buffer is not large enough to hold them all, then we * we have to calculate each entries' hashvalue and sort them before * we can begin returning them to the user. @@ -1541,7 +1541,7 @@ xfs_attr_leaf_toosmall(xfs_da_state_t *state, int *action) /* * Check for the degenerate case of the block being empty. * If the block is empty, we'll simply delete it, no need to - * coalesce it with a sibling block. We choose (aribtrarily) + * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ if (count == 0) { diff --git a/fs/xfs/xfs_behavior.c b/fs/xfs/xfs_behavior.c index 9880adae393..f4fe3715a80 100644 --- a/fs/xfs/xfs_behavior.c +++ b/fs/xfs/xfs_behavior.c @@ -31,7 +31,7 @@ * The behavior chain is ordered based on the 'position' number which * lives in the first field of the ops vector (higher numbers first). * - * Attemps to insert duplicate ops result in an EINVAL return code. + * Attempts to insert duplicate ops result in an EINVAL return code. * Otherwise, return 0 to indicate success. */ int @@ -84,7 +84,7 @@ bhv_insert(bhv_head_t *bhp, bhv_desc_t *bdp) /* * Remove a behavior descriptor from a position in a behavior chain; - * the postition is guaranteed not to be the first position. + * the position is guaranteed not to be the first position. * Should only be called by the bhv_remove() macro. */ void diff --git a/fs/xfs/xfs_behavior.h b/fs/xfs/xfs_behavior.h index 2cd89bb5ab1..1d8ff103201 100644 --- a/fs/xfs/xfs_behavior.h +++ b/fs/xfs/xfs_behavior.h @@ -39,7 +39,7 @@ * behaviors is synchronized with operations-in-progress (oip's) so that * the oip's always see a consistent view of the chain. * - * The term "interpostion" is used to refer to the act of inserting + * The term "interposition" is used to refer to the act of inserting * a behavior such that it interposes on (i.e., is inserted in front * of) a particular other behavior. A key example of this is when a * system implementing distributed single system image wishes to @@ -51,7 +51,7 @@ * * Behavior synchronization is logic which is necessary under certain * circumstances that there is no conflict between ongoing operations - * traversing the behavior chain and those dunamically modifying the + * traversing the behavior chain and those dynamically modifying the * behavior chain. Because behavior synchronization adds extra overhead * to virtual operation invocation, we want to restrict, as much as * we can, the requirement for this extra code, to those situations diff --git a/fs/xfs/xfs_bmap.c b/fs/xfs/xfs_bmap.c index 2d702e4a74a..26939d364bc 100644 --- a/fs/xfs/xfs_bmap.c +++ b/fs/xfs/xfs_bmap.c @@ -3467,113 +3467,6 @@ done: return error; } -xfs_bmbt_rec_t * /* pointer to found extent entry */ -xfs_bmap_do_search_extents( - xfs_bmbt_rec_t *base, /* base of extent list */ - xfs_extnum_t lastx, /* last extent index used */ - xfs_extnum_t nextents, /* number of file extents */ - xfs_fileoff_t bno, /* block number searched for */ - int *eofp, /* out: end of file found */ - xfs_extnum_t *lastxp, /* out: last extent index */ - xfs_bmbt_irec_t *gotp, /* out: extent entry found */ - xfs_bmbt_irec_t *prevp) /* out: previous extent entry found */ -{ - xfs_bmbt_rec_t *ep; /* extent list entry pointer */ - xfs_bmbt_irec_t got; /* extent list entry, decoded */ - int high; /* high index of binary search */ - int low; /* low index of binary search */ - - /* - * Initialize the extent entry structure to catch access to - * uninitialized br_startblock field. - */ - got.br_startoff = 0xffa5a5a5a5a5a5a5LL; - got.br_blockcount = 0xa55a5a5a5a5a5a5aLL; - got.br_state = XFS_EXT_INVALID; - -#if XFS_BIG_BLKNOS - got.br_startblock = 0xffffa5a5a5a5a5a5LL; -#else - got.br_startblock = 0xffffa5a5; -#endif - - if (lastx != NULLEXTNUM && lastx < nextents) - ep = base + lastx; - else - ep = NULL; - prevp->br_startoff = NULLFILEOFF; - if (ep && bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep)) && - bno < got.br_startoff + - (got.br_blockcount = xfs_bmbt_get_blockcount(ep))) - *eofp = 0; - else if (ep && lastx < nextents - 1 && - bno >= (got.br_startoff = xfs_bmbt_get_startoff(ep + 1)) && - bno < got.br_startoff + - (got.br_blockcount = xfs_bmbt_get_blockcount(ep + 1))) { - lastx++; - ep++; - *eofp = 0; - } else if (nextents == 0) - *eofp = 1; - else if (bno == 0 && - (got.br_startoff = xfs_bmbt_get_startoff(base)) == 0) { - ep = base; - lastx = 0; - got.br_blockcount = xfs_bmbt_get_blockcount(ep); - *eofp = 0; - } else { - low = 0; - high = nextents - 1; - /* binary search the extents array */ - while (low <= high) { - XFS_STATS_INC(xs_cmp_exlist); - lastx = (low + high) >> 1; - ep = base + lastx; - got.br_startoff = xfs_bmbt_get_startoff(ep); - got.br_blockcount = xfs_bmbt_get_blockcount(ep); - if (bno < got.br_startoff) - high = lastx - 1; - else if (bno >= got.br_startoff + got.br_blockcount) - low = lastx + 1; - else { - got.br_startblock = xfs_bmbt_get_startblock(ep); - got.br_state = xfs_bmbt_get_state(ep); - *eofp = 0; - *lastxp = lastx; - *gotp = got; - return ep; - } - } - if (bno >= got.br_startoff + got.br_blockcount) { - lastx++; - if (lastx == nextents) { - *eofp = 1; - got.br_startblock = xfs_bmbt_get_startblock(ep); - got.br_state = xfs_bmbt_get_state(ep); - *prevp = got; - ep = NULL; - } else { - *eofp = 0; - xfs_bmbt_get_all(ep, prevp); - ep++; - got.br_startoff = xfs_bmbt_get_startoff(ep); - got.br_blockcount = xfs_bmbt_get_blockcount(ep); - } - } else { - *eofp = 0; - if (ep > base) - xfs_bmbt_get_all(ep - 1, prevp); - } - } - if (ep) { - got.br_startblock = xfs_bmbt_get_startblock(ep); - got.br_state = xfs_bmbt_get_state(ep); - } - *lastxp = lastx; - *gotp = got; - return ep; -} - /* * Search the extent records for the entry containing block bno. * If bno lies in a hole, point to the next entry. If bno lies @@ -4826,18 +4719,17 @@ xfs_bmapi( /* * Make a transaction-less quota reservation for * delayed allocation blocks. This number gets - * adjusted later. - * We return EDQUOT if we haven't allocated - * blks already inside this loop; + * adjusted later. We return if we haven't + * allocated blocks already inside this loop. */ - if (XFS_TRANS_RESERVE_QUOTA_NBLKS( + if ((error = XFS_TRANS_RESERVE_QUOTA_NBLKS( mp, NULL, ip, (long)alen, 0, rt ? XFS_QMOPT_RES_RTBLKS : - XFS_QMOPT_RES_REGBLKS)) { + XFS_QMOPT_RES_REGBLKS))) { if (n == 0) { *nmap = 0; ASSERT(cur == NULL); - return XFS_ERROR(EDQUOT); + return error; } break; } diff --git a/fs/xfs/xfs_bmap.h b/fs/xfs/xfs_bmap.h index 011ccaa9a1c..8e0d73d9ccc 100644 --- a/fs/xfs/xfs_bmap.h +++ b/fs/xfs/xfs_bmap.h @@ -353,23 +353,16 @@ xfs_check_nostate_extents( xfs_extnum_t num); /* - * Call xfs_bmap_do_search_extents() to search for the extent - * record containing block bno. If in multi-level in-core extent - * allocation mode, find and extract the target extent buffer, - * otherwise just use the direct extent list. + * Search the extent records for the entry containing block bno. + * If bno lies in a hole, point to the next entry. If bno lies + * past eof, *eofp will be set, and *prevp will contain the last + * entry (null if none). Else, *lastxp will be set to the index + * of the found entry; *gotp will contain the entry. */ xfs_bmbt_rec_t * xfs_bmap_search_multi_extents(struct xfs_ifork *, xfs_fileoff_t, int *, xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *); -/* - * Search an extent list for the extent which includes block - * bno. - */ -xfs_bmbt_rec_t *xfs_bmap_do_search_extents(xfs_bmbt_rec_t *, - xfs_extnum_t, xfs_extnum_t, xfs_fileoff_t, int *, - xfs_extnum_t *, xfs_bmbt_irec_t *, xfs_bmbt_irec_t *); - #endif /* __KERNEL__ */ #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 07e2324152b..5fed15682dd 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -98,12 +98,12 @@ xfs_buf_item_flush_log_debug( } /* - * This function is called to verify that our caller's have logged + * This function is called to verify that our callers have logged * all the bytes that they changed. * * It does this by comparing the original copy of the buffer stored in * the buf log item's bli_orig array to the current copy of the buffer - * and ensuring that all bytes which miscompare are set in the bli_logged + * and ensuring that all bytes which mismatch are set in the bli_logged * array of the buf log item. */ STATIC void diff --git a/fs/xfs/xfs_cap.h b/fs/xfs/xfs_cap.h index 433ec537f9b..d0035c6e951 100644 --- a/fs/xfs/xfs_cap.h +++ b/fs/xfs/xfs_cap.h @@ -38,7 +38,7 @@ typedef struct xfs_cap_set { /* * For Linux, we take the bitfields directly from capability.h * and no longer attempt to keep this attribute ondisk compatible - * with IRIX. Since this attribute is only set on exectuables, + * with IRIX. Since this attribute is only set on executables, * it just doesn't make much sense to try. We do use a different * named attribute though, to avoid confusion. */ diff --git a/fs/xfs/xfs_clnt.h b/fs/xfs/xfs_clnt.h index 022fff62085..5b7eb81453b 100644 --- a/fs/xfs/xfs_clnt.h +++ b/fs/xfs/xfs_clnt.h @@ -68,6 +68,7 @@ struct xfs_mount_args { * enforcement */ #define XFSMNT_PQUOTAENF 0x00000040 /* IRIX project quota limit * enforcement */ +#define XFSMNT_QUIET 0x00000080 /* don't report mount errors */ #define XFSMNT_NOALIGN 0x00000200 /* don't allocate at * stripe boundaries*/ #define XFSMNT_RETERR 0x00000400 /* return error to user */ diff --git a/fs/xfs/xfs_da_btree.c b/fs/xfs/xfs_da_btree.c index 4bae3a76c67..8988b905117 100644 --- a/fs/xfs/xfs_da_btree.c +++ b/fs/xfs/xfs_da_btree.c @@ -840,7 +840,7 @@ xfs_da_node_toosmall(xfs_da_state_t *state, int *action) /* * Check for the degenerate case of the block being empty. * If the block is empty, we'll simply delete it, no need to - * coalesce it with a sibling block. We choose (aribtrarily) + * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ if (count == 0) { diff --git a/fs/xfs/xfs_dir2_block.c b/fs/xfs/xfs_dir2_block.c index bd5cee6aa51..972ded59547 100644 --- a/fs/xfs/xfs_dir2_block.c +++ b/fs/xfs/xfs_dir2_block.c @@ -533,7 +533,7 @@ xfs_dir2_block_getdents( /* * Reached the end of the block. - * Set the offset to a nonexistent block 1 and return. + * Set the offset to a non-existent block 1 and return. */ *eofp = 1; diff --git a/fs/xfs/xfs_dir2_leaf.c b/fs/xfs/xfs_dir2_leaf.c index 08648b18265..0f5e2f2ce6e 100644 --- a/fs/xfs/xfs_dir2_leaf.c +++ b/fs/xfs/xfs_dir2_leaf.c @@ -515,7 +515,7 @@ xfs_dir2_leaf_addname( ASSERT(be32_to_cpu(leaf->ents[highstale].address) == XFS_DIR2_NULL_DATAPTR); /* - * Copy entries down to copver the stale entry + * Copy entries down to cover the stale entry * and make room for the new entry. */ if (highstale - index > 0) diff --git a/fs/xfs/xfs_dir2_node.c b/fs/xfs/xfs_dir2_node.c index af556f16a0c..ac511ab9c52 100644 --- a/fs/xfs/xfs_dir2_node.c +++ b/fs/xfs/xfs_dir2_node.c @@ -830,7 +830,7 @@ xfs_dir2_leafn_rebalance( state->inleaf = 1; blk2->index = 0; cmn_err(CE_ALERT, - "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting orignal leaf: " + "xfs_dir2_leafn_rebalance: picked the wrong leaf? reverting original leaf: " "blk1->index %d\n", blk1->index); } diff --git a/fs/xfs/xfs_dir_leaf.c b/fs/xfs/xfs_dir_leaf.c index ee88751c3be..6d711869262 100644 --- a/fs/xfs/xfs_dir_leaf.c +++ b/fs/xfs/xfs_dir_leaf.c @@ -1341,7 +1341,7 @@ xfs_dir_leaf_toosmall(xfs_da_state_t *state, int *action) /* * Check for the degenerate case of the block being empty. * If the block is empty, we'll simply delete it, no need to - * coalesce it with a sibling block. We choose (aribtrarily) + * coalesce it with a sibling block. We choose (arbitrarily) * to merge with the forward block unless it is NULL. */ if (count == 0) { diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h index 26b8e709a56..bc43163456e 100644 --- a/fs/xfs/xfs_error.h +++ b/fs/xfs/xfs_error.h @@ -186,4 +186,7 @@ extern void xfs_fs_cmn_err(int level, struct xfs_mount *mp, char *fmt, ...); #define xfs_fs_repair_cmn_err(level, mp, fmt, args...) \ xfs_fs_cmn_err(level, mp, fmt " Unmount and run xfs_repair.", ## args) +#define xfs_fs_mount_cmn_err(f, fmt, args...) \ + ((f & XFS_MFSI_QUIET)? cmn_err(CE_WARN, "XFS: " fmt, ## args) : (void)0) + #endif /* __XFS_ERROR_H__ */ diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 56caa88713a..dfa3527b20a 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -477,7 +477,7 @@ xfs_fs_counts( * * xfs_reserve_blocks is called to set m_resblks * in the in-core mount table. The number of unused reserved blocks - * is kept in m_resbls_avail. + * is kept in m_resblks_avail. * * Reserve the requested number of blocks if available. Otherwise return * as many as possible to satisfy the request. The actual number diff --git a/fs/xfs/xfs_ialloc.c b/fs/xfs/xfs_ialloc.c index 0024892841a..deddbd03c16 100644 --- a/fs/xfs/xfs_ialloc.c +++ b/fs/xfs/xfs_ialloc.c @@ -136,7 +136,7 @@ xfs_ialloc_ag_alloc( int ninodes; /* num inodes per buf */ xfs_agino_t thisino; /* current inode number, for loop */ int version; /* inode version number to use */ - int isaligned; /* inode allocation at stripe unit */ + int isaligned = 0; /* inode allocation at stripe unit */ /* boundary */ args.tp = tp; @@ -152,46 +152,75 @@ xfs_ialloc_ag_alloc( return XFS_ERROR(ENOSPC); args.minlen = args.maxlen = XFS_IALLOC_BLOCKS(args.mp); /* - * Set the alignment for the allocation. - * If stripe alignment is turned on then align at stripe unit - * boundary. - * If the cluster size is smaller than a filesystem block - * then we're doing I/O for inodes in filesystem block size pieces, - * so don't need alignment anyway. - */ - isaligned = 0; - if (args.mp->m_sinoalign) { - ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); - args.alignment = args.mp->m_dalign; - isaligned = 1; - } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && - args.mp->m_sb.sb_inoalignmt >= - XFS_B_TO_FSBT(args.mp, XFS_INODE_CLUSTER_SIZE(args.mp))) - args.alignment = args.mp->m_sb.sb_inoalignmt; - else - args.alignment = 1; + * First try to allocate inodes contiguous with the last-allocated + * chunk of inodes. If the filesystem is striped, this will fill + * an entire stripe unit with inodes. + */ agi = XFS_BUF_TO_AGI(agbp); - /* - * Need to figure out where to allocate the inode blocks. - * Ideally they should be spaced out through the a.g. - * For now, just allocate blocks up front. - */ - args.agbno = be32_to_cpu(agi->agi_root); - args.fsbno = XFS_AGB_TO_FSB(args.mp, be32_to_cpu(agi->agi_seqno), - args.agbno); - /* - * Allocate a fixed-size extent of inodes. - */ - args.type = XFS_ALLOCTYPE_NEAR_BNO; - args.mod = args.total = args.wasdel = args.isfl = args.userdata = - args.minalignslop = 0; - args.prod = 1; - /* - * Allow space for the inode btree to split. - */ - args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; - if ((error = xfs_alloc_vextent(&args))) - return error; + newino = be32_to_cpu(agi->agi_newino); + args.agbno = XFS_AGINO_TO_AGBNO(args.mp, newino) + + XFS_IALLOC_BLOCKS(args.mp); + if (likely(newino != NULLAGINO && + (args.agbno < be32_to_cpu(agi->agi_length)))) { + args.fsbno = XFS_AGB_TO_FSB(args.mp, + be32_to_cpu(agi->agi_seqno), args.agbno); + args.type = XFS_ALLOCTYPE_THIS_BNO; + args.mod = args.total = args.wasdel = args.isfl = + args.userdata = args.minalignslop = 0; + args.prod = 1; + args.alignment = 1; + /* + * Allow space for the inode btree to split. + */ + args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } else + args.fsbno = NULLFSBLOCK; + + if (unlikely(args.fsbno == NULLFSBLOCK)) { + /* + * Set the alignment for the allocation. + * If stripe alignment is turned on then align at stripe unit + * boundary. + * If the cluster size is smaller than a filesystem block + * then we're doing I/O for inodes in filesystem block size + * pieces, so don't need alignment anyway. + */ + isaligned = 0; + if (args.mp->m_sinoalign) { + ASSERT(!(args.mp->m_flags & XFS_MOUNT_NOALIGN)); + args.alignment = args.mp->m_dalign; + isaligned = 1; + } else if (XFS_SB_VERSION_HASALIGN(&args.mp->m_sb) && + args.mp->m_sb.sb_inoalignmt >= + XFS_B_TO_FSBT(args.mp, + XFS_INODE_CLUSTER_SIZE(args.mp))) + args.alignment = args.mp->m_sb.sb_inoalignmt; + else + args.alignment = 1; + /* + * Need to figure out where to allocate the inode blocks. + * Ideally they should be spaced out through the a.g. + * For now, just allocate blocks up front. + */ + args.agbno = be32_to_cpu(agi->agi_root); + args.fsbno = XFS_AGB_TO_FSB(args.mp, + be32_to_cpu(agi->agi_seqno), args.agbno); + /* + * Allocate a fixed-size extent of inodes. + */ + args.type = XFS_ALLOCTYPE_NEAR_BNO; + args.mod = args.total = args.wasdel = args.isfl = + args.userdata = args.minalignslop = 0; + args.prod = 1; + /* + * Allow space for the inode btree to split. + */ + args.minleft = XFS_IN_MAXLEVELS(args.mp) - 1; + if ((error = xfs_alloc_vextent(&args))) + return error; + } /* * If stripe alignment is turned on, then try again with cluster @@ -1023,7 +1052,7 @@ xfs_difree( rec.ir_freecount++; /* - * When an inode cluster is free, it becomes elgible for removal + * When an inode cluster is free, it becomes eligible for removal */ if ((mp->m_flags & XFS_MOUNT_IDELETE) && (rec.ir_freecount == XFS_IALLOC_INODES(mp))) { diff --git a/fs/xfs/xfs_iget.c b/fs/xfs/xfs_iget.c index 3ce35a6f700..b5385432526 100644 --- a/fs/xfs/xfs_iget.c +++ b/fs/xfs/xfs_iget.c @@ -421,7 +421,10 @@ finish_inode: ip->i_chash = chlnew; chlnew->chl_ip = ip; chlnew->chl_blkno = ip->i_blkno; + if (ch->ch_list) + ch->ch_list->chl_prev = chlnew; chlnew->chl_next = ch->ch_list; + chlnew->chl_prev = NULL; ch->ch_list = chlnew; chlnew = NULL; } @@ -509,7 +512,7 @@ retry: } else { /* * If the inode is not fully constructed due to - * filehandle mistmatches wait for the inode to go + * filehandle mismatches wait for the inode to go * away and try again. * * iget_locked will call __wait_on_freeing_inode @@ -723,23 +726,15 @@ xfs_iextract( ASSERT(ip->i_cnext == ip && ip->i_cprev == ip); ASSERT(ip->i_chash != NULL); chm=NULL; - for (chl = ch->ch_list; chl != NULL; chl = chl->chl_next) { - if (chl->chl_blkno == ip->i_blkno) { - if (chm == NULL) { - /* first item on the list */ - ch->ch_list = chl->chl_next; - } else { - chm->chl_next = chl->chl_next; - } - kmem_zone_free(xfs_chashlist_zone, chl); - break; - } else { - ASSERT(chl->chl_ip != ip); - chm = chl; - } - } - ASSERT_ALWAYS(chl != NULL); - } else { + chl = ip->i_chash; + if (chl->chl_prev) + chl->chl_prev->chl_next = chl->chl_next; + else + ch->ch_list = chl->chl_next; + if (chl->chl_next) + chl->chl_next->chl_prev = chl->chl_prev; + kmem_zone_free(xfs_chashlist_zone, chl); + } else { /* delete one inode from a non-empty list */ iq = ip->i_cnext; iq->i_cprev = ip->i_cprev; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index 88a517fad07..94b60dd0380 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -160,7 +160,7 @@ xfs_inotobp( xfs_dinode_t *dip; /* - * Call the space managment code to find the location of the + * Call the space management code to find the location of the * inode on disk. */ imap.im_blkno = 0; @@ -837,7 +837,7 @@ xfs_dic2xflags( /* * Given a mount structure and an inode number, return a pointer - * to a newly allocated in-core inode coresponding to the given + * to a newly allocated in-core inode corresponding to the given * inode number. * * Initialize the inode's attributes and extent pointers if it @@ -2723,7 +2723,7 @@ xfs_ipin( /* * Decrement the pin count of the given inode, and wake up * anyone in xfs_iwait_unpin() if the count goes to 0. The - * inode must have been previoulsy pinned with a call to xfs_ipin(). + * inode must have been previously pinned with a call to xfs_ipin(). */ void xfs_iunpin( @@ -2732,16 +2732,29 @@ xfs_iunpin( ASSERT(atomic_read(&ip->i_pincount) > 0); if (atomic_dec_and_test(&ip->i_pincount)) { - vnode_t *vp = XFS_ITOV_NULL(ip); + /* + * If the inode is currently being reclaimed, the + * linux inode _and_ the xfs vnode may have been + * freed so we cannot reference either of them safely. + * Hence we should not try to do anything to them + * if the xfs inode is currently in the reclaim + * path. + * + * However, we still need to issue the unpin wakeup + * call as the inode reclaim may be blocked waiting for + * the inode to become unpinned. + */ + if (!(ip->i_flags & (XFS_IRECLAIM|XFS_IRECLAIMABLE))) { + vnode_t *vp = XFS_ITOV_NULL(ip); - /* make sync come back and flush this inode */ - if (vp) { - struct inode *inode = vn_to_inode(vp); + /* make sync come back and flush this inode */ + if (vp) { + struct inode *inode = vn_to_inode(vp); - if (!(inode->i_state & I_NEW)) - mark_inode_dirty_sync(inode); + if (!(inode->i_state & I_NEW)) + mark_inode_dirty_sync(inode); + } } - wake_up(&ip->i_ipin_wait); } } @@ -3690,7 +3703,7 @@ void xfs_iext_add( xfs_ifork_t *ifp, /* inode fork pointer */ xfs_extnum_t idx, /* index to begin adding exts */ - int ext_diff) /* nubmer of extents to add */ + int ext_diff) /* number of extents to add */ { int byte_diff; /* new bytes being added */ int new_size; /* size of extents after adding */ @@ -4038,7 +4051,7 @@ xfs_iext_remove_indirect( xfs_extnum_t ext_diff; /* extents to remove in current list */ xfs_extnum_t nex1; /* number of extents before idx */ xfs_extnum_t nex2; /* extents after idx + count */ - int nlists; /* entries in indirecton array */ + int nlists; /* entries in indirection array */ int page_idx = idx; /* index in target extent list */ ASSERT(ifp->if_flags & XFS_IFEXTIREC); @@ -4291,9 +4304,9 @@ xfs_iext_bno_to_ext( xfs_filblks_t blockcount = 0; /* number of blocks in extent */ xfs_bmbt_rec_t *ep = NULL; /* pointer to target extent */ xfs_ext_irec_t *erp = NULL; /* indirection array pointer */ - int high; /* upper boundry in search */ + int high; /* upper boundary in search */ xfs_extnum_t idx = 0; /* index of target extent */ - int low; /* lower boundry in search */ + int low; /* lower boundary in search */ xfs_extnum_t nextents; /* number of file extents */ xfs_fileoff_t startoff = 0; /* start offset of extent */ diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index 39ef9c36ea5..3b544db1790 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -189,6 +189,7 @@ typedef struct xfs_ihash { */ typedef struct xfs_chashlist { struct xfs_chashlist *chl_next; + struct xfs_chashlist *chl_prev; struct xfs_inode *chl_ip; xfs_daddr_t chl_blkno; /* starting block number of * the cluster */ diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 36aa1fcb90a..7497a481b2f 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -580,7 +580,7 @@ xfs_inode_item_unpin_remove( * been or is in the process of being flushed, then (ideally) we'd like to * see if the inode's buffer is still incore, and if so give it a nudge. * We delay doing so until the pushbuf routine, though, to avoid holding - * the AIL lock across a call to the blackhole which is the buffercache. + * the AIL lock across a call to the blackhole which is the buffer cache. * Also we don't want to sleep in any device strategy routines, which can happen * if we do the subsequent bawrite in here. */ diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 32247b6bfee..94068d014f2 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -272,7 +272,7 @@ xfs_bulkstat( size_t statstruct_size, /* sizeof struct filling */ char __user *ubuffer, /* buffer with inode stats */ int flags, /* defined in xfs_itable.h */ - int *done) /* 1 if there're more stats to get */ + int *done) /* 1 if there are more stats to get */ { xfs_agblock_t agbno=0;/* allocation group block number */ xfs_buf_t *agbp; /* agi header buffer */ @@ -676,7 +676,7 @@ xfs_bulkstat_single( xfs_mount_t *mp, /* mount point for filesystem */ xfs_ino_t *lastinop, /* inode to return */ char __user *buffer, /* buffer with inode stats */ - int *done) /* 1 if there're more stats to get */ + int *done) /* 1 if there are more stats to get */ { int count; /* count value for bulkstat call */ int error; /* return value */ diff --git a/fs/xfs/xfs_itable.h b/fs/xfs/xfs_itable.h index 047d834ed21..11eb4e1b18c 100644 --- a/fs/xfs/xfs_itable.h +++ b/fs/xfs/xfs_itable.h @@ -60,7 +60,7 @@ xfs_bulkstat( size_t statstruct_size,/* sizeof struct that we're filling */ char __user *ubuffer,/* buffer with inode stats */ int flags, /* flag to control access method */ - int *done); /* 1 if there're more stats to get */ + int *done); /* 1 if there are more stats to get */ int xfs_bulkstat_single( diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index 9176995160e..32e841d2f26 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -59,7 +59,7 @@ STATIC xlog_t * xlog_alloc_log(xfs_mount_t *mp, int num_bblks); STATIC int xlog_space_left(xlog_t *log, int cycle, int bytes); STATIC int xlog_sync(xlog_t *log, xlog_in_core_t *iclog); -STATIC void xlog_unalloc_log(xlog_t *log); +STATIC void xlog_dealloc_log(xlog_t *log); STATIC int xlog_write(xfs_mount_t *mp, xfs_log_iovec_t region[], int nentries, xfs_log_ticket_t tic, xfs_lsn_t *start_lsn, @@ -304,7 +304,7 @@ xfs_log_done(xfs_mount_t *mp, if ((ticket->t_flags & XLOG_TIC_PERM_RESERV) == 0 || (flags & XFS_LOG_REL_PERM_RESERV)) { /* - * Release ticket if not permanent reservation or a specifc + * Release ticket if not permanent reservation or a specific * request has been made to release a permanent reservation. */ xlog_trace_loggrant(log, ticket, "xfs_log_done: (non-permanent)"); @@ -511,7 +511,7 @@ xfs_log_mount(xfs_mount_t *mp, vfsp->vfs_flag |= VFS_RDONLY; if (error) { cmn_err(CE_WARN, "XFS: log mount/recovery failed: error %d", error); - xlog_unalloc_log(mp->m_log); + xlog_dealloc_log(mp->m_log); return error; } } @@ -667,7 +667,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) * * Go through the motions of sync'ing and releasing * the iclog, even though no I/O will actually happen, - * we need to wait for other log I/O's that may already + * we need to wait for other log I/Os that may already * be in progress. Do this as a separate section of * code so we'll know if we ever get stuck here that * we're in this odd situation of trying to unmount @@ -704,7 +704,7 @@ xfs_log_unmount_write(xfs_mount_t *mp) void xfs_log_unmount_dealloc(xfs_mount_t *mp) { - xlog_unalloc_log(mp->m_log); + xlog_dealloc_log(mp->m_log); } /* @@ -1492,7 +1492,7 @@ xlog_sync(xlog_t *log, ASSERT(XFS_BUF_ADDR(bp) <= log->l_logBBsize-1); ASSERT(XFS_BUF_ADDR(bp) + BTOBB(count) <= log->l_logBBsize); - /* account for internal log which does't start at block #0 */ + /* account for internal log which doesn't start at block #0 */ XFS_BUF_SET_ADDR(bp, XFS_BUF_ADDR(bp) + log->l_logBBstart); XFS_BUF_WRITE(bp); if ((error = XFS_bwrite(bp))) { @@ -1506,10 +1506,10 @@ xlog_sync(xlog_t *log, /* - * Unallocate a log structure + * Deallocate a log structure */ void -xlog_unalloc_log(xlog_t *log) +xlog_dealloc_log(xlog_t *log) { xlog_in_core_t *iclog, *next_iclog; xlog_ticket_t *tic, *next_tic; @@ -1539,7 +1539,7 @@ xlog_unalloc_log(xlog_t *log) if ((log->l_ticket_cnt != log->l_ticket_tcnt) && !XLOG_FORCED_SHUTDOWN(log)) { xfs_fs_cmn_err(CE_WARN, log->l_mp, - "xlog_unalloc_log: (cnt: %d, total: %d)", + "xlog_dealloc_log: (cnt: %d, total: %d)", log->l_ticket_cnt, log->l_ticket_tcnt); /* ASSERT(log->l_ticket_cnt == log->l_ticket_tcnt); */ @@ -1562,7 +1562,7 @@ xlog_unalloc_log(xlog_t *log) #endif log->l_mp->m_log = NULL; kmem_free(log, sizeof(xlog_t)); -} /* xlog_unalloc_log */ +} /* xlog_dealloc_log */ /* * Update counters atomically now that memcpy is done. @@ -2829,7 +2829,7 @@ xlog_state_release_iclog(xlog_t *log, /* * We let the log lock go, so it's possible that we hit a log I/O - * error or someother SHUTDOWN condition that marks the iclog + * error or some other SHUTDOWN condition that marks the iclog * as XLOG_STATE_IOERROR before the bwrite. However, we know that * this iclog has consistent data, so we ignore IOERROR * flags after this point. diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h index 4b2ac88dbb8..eacb3d4987f 100644 --- a/fs/xfs/xfs_log.h +++ b/fs/xfs/xfs_log.h @@ -27,7 +27,7 @@ #ifdef __KERNEL__ /* - * By comparing each compnent, we don't have to worry about extra + * By comparing each component, we don't have to worry about extra * endian issues in treating two 32 bit numbers as one 64 bit number */ static inline xfs_lsn_t _lsn_cmp(xfs_lsn_t lsn1, xfs_lsn_t lsn2) diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index add13f507ed..1f0016b0b4e 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -583,7 +583,7 @@ xlog_find_head( * x | x ... | x - 1 | x * Another case that fits this picture would be * x | x + 1 | x ... | x - * In this case the head really is somwhere at the end of the + * In this case the head really is somewhere at the end of the * log, as one of the latest writes at the beginning was * incomplete. * One more case is @@ -2799,7 +2799,7 @@ xlog_recover_do_trans( * we don't need to worry about the block number being * truncated in > 1 TB buffers because in user-land, * we're now n32 or 64-bit so xfs_daddr_t is 64-bits so - * the blkno's will get through the user-mode buffer + * the blknos will get through the user-mode buffer * cache properly. The only bad case is o32 kernels * where xfs_daddr_t is 32-bits but mount will warn us * off a > 1 TB filesystem before we get here. diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 20e8abc16d1..c0b1c290688 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -213,7 +213,8 @@ xfs_mount_free( STATIC int xfs_mount_validate_sb( xfs_mount_t *mp, - xfs_sb_t *sbp) + xfs_sb_t *sbp, + int flags) { /* * If the log device and data device have the @@ -223,33 +224,29 @@ xfs_mount_validate_sb( * a volume filesystem in a non-volume manner. */ if (sbp->sb_magicnum != XFS_SB_MAGIC) { - cmn_err(CE_WARN, "XFS: bad magic number"); + xfs_fs_mount_cmn_err(flags, "bad magic number"); return XFS_ERROR(EWRONGFS); } if (!XFS_SB_GOOD_VERSION(sbp)) { - cmn_err(CE_WARN, "XFS: bad version"); + xfs_fs_mount_cmn_err(flags, "bad version"); return XFS_ERROR(EWRONGFS); } if (unlikely( sbp->sb_logstart == 0 && mp->m_logdev_targp == mp->m_ddev_targp)) { - cmn_err(CE_WARN, - "XFS: filesystem is marked as having an external log; " - "specify logdev on the\nmount command line."); - XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(1)", - XFS_ERRLEVEL_HIGH, mp, sbp); - return XFS_ERROR(EFSCORRUPTED); + xfs_fs_mount_cmn_err(flags, + "filesystem is marked as having an external log; " + "specify logdev on the\nmount command line."); + return XFS_ERROR(EINVAL); } if (unlikely( sbp->sb_logstart != 0 && mp->m_logdev_targp != mp->m_ddev_targp)) { - cmn_err(CE_WARN, - "XFS: filesystem is marked as having an internal log; " - "don't specify logdev on\nthe mount command line."); - XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(2)", - XFS_ERRLEVEL_HIGH, mp, sbp); - return XFS_ERROR(EFSCORRUPTED); + xfs_fs_mount_cmn_err(flags, + "filesystem is marked as having an internal log; " + "do not specify logdev on\nthe mount command line."); + return XFS_ERROR(EINVAL); } /* @@ -273,10 +270,8 @@ xfs_mount_validate_sb( (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || (sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) || - (sbp->sb_imax_pct > 100 || sbp->sb_imax_pct < 1))) { - cmn_err(CE_WARN, "XFS: SB sanity check 1 failed"); - XFS_CORRUPTION_ERROR("xfs_mount_validate_sb(3)", - XFS_ERRLEVEL_LOW, mp, sbp); + (sbp->sb_imax_pct > 100 /* zero sb_imax_pct is valid */))) { + xfs_fs_mount_cmn_err(flags, "SB sanity check 1 failed"); return XFS_ERROR(EFSCORRUPTED); } @@ -289,9 +284,7 @@ xfs_mount_validate_sb( (xfs_drfsbno_t)sbp->sb_agcount * sbp->sb_agblocks || sbp->sb_dblocks < (xfs_drfsbno_t)(sbp->sb_agcount - 1) * sbp->sb_agblocks + XFS_MIN_AG_BLOCKS)) { - cmn_err(CE_WARN, "XFS: SB sanity check 2 failed"); - XFS_ERROR_REPORT("xfs_mount_validate_sb(4)", - XFS_ERRLEVEL_LOW, mp); + xfs_fs_mount_cmn_err(flags, "SB sanity check 2 failed"); return XFS_ERROR(EFSCORRUPTED); } @@ -307,15 +300,13 @@ xfs_mount_validate_sb( (sbp->sb_dblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX || (sbp->sb_rblocks << (sbp->sb_blocklog - BBSHIFT)) > UINT_MAX)) { #endif - cmn_err(CE_WARN, - "XFS: File system is too large to be mounted on this system."); + xfs_fs_mount_cmn_err(flags, + "file system too large to be mounted on this system."); return XFS_ERROR(E2BIG); } if (unlikely(sbp->sb_inprogress)) { - cmn_err(CE_WARN, "XFS: file system busy"); - XFS_ERROR_REPORT("xfs_mount_validate_sb(5)", - XFS_ERRLEVEL_LOW, mp); + xfs_fs_mount_cmn_err(flags, "file system busy"); return XFS_ERROR(EFSCORRUPTED); } @@ -323,8 +314,8 @@ xfs_mount_validate_sb( * Version 1 directory format has never worked on Linux. */ if (unlikely(!XFS_SB_VERSION_HASDIRV2(sbp))) { - cmn_err(CE_WARN, - "XFS: Attempted to mount file system using version 1 directory format"); + xfs_fs_mount_cmn_err(flags, + "file system using version 1 directory format"); return XFS_ERROR(ENOSYS); } @@ -332,11 +323,11 @@ xfs_mount_validate_sb( * Until this is fixed only page-sized or smaller data blocks work. */ if (unlikely(sbp->sb_blocksize > PAGE_SIZE)) { - cmn_err(CE_WARN, - "XFS: Attempted to mount file system with blocksize %d bytes", + xfs_fs_mount_cmn_err(flags, + "file system with blocksize %d bytes", sbp->sb_blocksize); - cmn_err(CE_WARN, - "XFS: Only page-sized (%ld) or less blocksizes currently work.", + xfs_fs_mount_cmn_err(flags, + "only pagesize (%ld) or less will currently work.", PAGE_SIZE); return XFS_ERROR(ENOSYS); } @@ -393,7 +384,7 @@ xfs_initialize_perag( break; } - /* This ag is prefered for inodes */ + /* This ag is preferred for inodes */ pag = &mp->m_perag[index]; pag->pagi_inodeok = 1; if (index < max_metadata) @@ -484,7 +475,7 @@ xfs_xlatesb( * Does the initial read of the superblock. */ int -xfs_readsb(xfs_mount_t *mp) +xfs_readsb(xfs_mount_t *mp, int flags) { unsigned int sector_size; unsigned int extra_flags; @@ -506,7 +497,7 @@ xfs_readsb(xfs_mount_t *mp) bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { - cmn_err(CE_WARN, "XFS: SB read failed"); + xfs_fs_mount_cmn_err(flags, "SB read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; goto fail; } @@ -520,9 +511,9 @@ xfs_readsb(xfs_mount_t *mp) sbp = XFS_BUF_TO_SBP(bp); xfs_xlatesb(XFS_BUF_PTR(bp), &(mp->m_sb), 1, XFS_SB_ALL_BITS); - error = xfs_mount_validate_sb(mp, &(mp->m_sb)); + error = xfs_mount_validate_sb(mp, &(mp->m_sb), flags); if (error) { - cmn_err(CE_WARN, "XFS: SB validate failed"); + xfs_fs_mount_cmn_err(flags, "SB validate failed"); goto fail; } @@ -530,8 +521,8 @@ xfs_readsb(xfs_mount_t *mp) * We must be able to do sector-sized and sector-aligned IO. */ if (sector_size > mp->m_sb.sb_sectsize) { - cmn_err(CE_WARN, - "XFS: device supports only %u byte sectors (not %u)", + xfs_fs_mount_cmn_err(flags, + "device supports only %u byte sectors (not %u)", sector_size, mp->m_sb.sb_sectsize); error = ENOSYS; goto fail; @@ -548,7 +539,7 @@ xfs_readsb(xfs_mount_t *mp) bp = xfs_buf_read_flags(mp->m_ddev_targp, XFS_SB_DADDR, BTOBB(sector_size), extra_flags); if (!bp || XFS_BUF_ISERROR(bp)) { - cmn_err(CE_WARN, "XFS: SB re-read failed"); + xfs_fs_mount_cmn_err(flags, "SB re-read failed"); error = bp ? XFS_BUF_GETERROR(bp) : ENOMEM; goto fail; } @@ -678,7 +669,7 @@ xfs_mountfs( int error = 0; if (mp->m_sb_bp == NULL) { - if ((error = xfs_readsb(mp))) { + if ((error = xfs_readsb(mp, mfsi_flags))) { return error; } } @@ -1728,7 +1719,7 @@ xfs_mount_log_sbunit( * We cannot use the hotcpu_register() function because it does * not allow notifier instances. We need a notifier per filesystem * as we need to be able to identify the filesystem to balance - * the counters out. This is acheived by having a notifier block + * the counters out. This is achieved by having a notifier block * embedded in the xfs_mount_t and doing pointer magic to get the * mount pointer from the notifier block address. */ diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index ebd73960e9d..668ad23fd37 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -379,7 +379,7 @@ typedef struct xfs_mount { #endif int m_dalign; /* stripe unit */ int m_swidth; /* stripe width */ - int m_sinoalign; /* stripe unit inode alignmnt */ + int m_sinoalign; /* stripe unit inode alignment */ int m_attr_magicpct;/* 37% of the blocksize */ int m_dir_magicpct; /* 37% of the dir blocksize */ __uint8_t m_mk_sharedro; /* mark shared ro on unmount */ @@ -510,9 +510,12 @@ xfs_preferred_iosize(xfs_mount_t *mp) */ #define XFS_MFSI_SECOND 0x01 /* Secondary mount -- skip stuff */ #define XFS_MFSI_CLIENT 0x02 /* Is a client -- skip lots of stuff */ +/* XFS_MFSI_RRINODES */ #define XFS_MFSI_NOUNLINK 0x08 /* Skip unlinked inode processing in */ /* log recovery */ #define XFS_MFSI_NO_QUOTACHECK 0x10 /* Skip quotacheck processing */ +/* XFS_MFSI_CONVERT_SUNIT */ +#define XFS_MFSI_QUIET 0x40 /* Be silent if mount errors found */ /* * Macros for getting from mount to vfs and back. @@ -581,7 +584,7 @@ extern int xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t, extern int xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *, uint, int); extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int); -extern int xfs_readsb(xfs_mount_t *mp); +extern int xfs_readsb(xfs_mount_t *, int); extern void xfs_freesb(xfs_mount_t *); extern void xfs_do_force_shutdown(bhv_desc_t *, int, char *, int); extern int xfs_syncsub(xfs_mount_t *, int, int, int *); diff --git a/fs/xfs/xfs_quota.h b/fs/xfs/xfs_quota.h index 82a08baf437..7fbef974bce 100644 --- a/fs/xfs/xfs_quota.h +++ b/fs/xfs/xfs_quota.h @@ -31,7 +31,7 @@ typedef __uint32_t xfs_dqid_t; /* - * Eventhough users may not have quota limits occupying all 64-bits, + * Even though users may not have quota limits occupying all 64-bits, * they may need 64-bit accounting. Hence, 64-bit quota-counters, * and quota-limits. This is a waste in the common case, but hey ... */ @@ -196,10 +196,11 @@ typedef struct xfs_qoff_logformat { #define XFS_QMOPT_QUOTAOFF 0x0000080 /* quotas are being turned off */ #define XFS_QMOPT_UMOUNTING 0x0000100 /* filesys is being unmounted */ #define XFS_QMOPT_DOLOG 0x0000200 /* log buf changes (in quotacheck) */ -#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if necessary */ +#define XFS_QMOPT_DOWARN 0x0000400 /* increase warning cnt if needed */ #define XFS_QMOPT_ILOCKED 0x0000800 /* inode is already locked (excl) */ -#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot, if damaged. */ +#define XFS_QMOPT_DQREPAIR 0x0001000 /* repair dquot if damaged */ #define XFS_QMOPT_GQUOTA 0x0002000 /* group dquot requested */ +#define XFS_QMOPT_ENOSPC 0x0004000 /* enospc instead of edquot (prj) */ /* * flags to xfs_trans_mod_dquot to indicate which field needs to be @@ -246,7 +247,7 @@ typedef struct xfs_qoff_logformat { #ifdef __KERNEL__ /* * This check is done typically without holding the inode lock; - * that may seem racey, but it is harmless in the context that it is used. + * that may seem racy, but it is harmless in the context that it is used. * The inode cannot go inactive as long a reference is kept, and * therefore if dquot(s) were attached, they'll stay consistent. * If, for example, the ownership of the inode changes while diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c index 2918956553a..8d056cef5d1 100644 --- a/fs/xfs/xfs_trans.c +++ b/fs/xfs/xfs_trans.c @@ -490,7 +490,7 @@ xfs_trans_mod_sb( case XFS_TRANS_SB_RES_FREXTENTS: /* * The allocation has already been applied to the - * in-core superblocks's counter. This should only + * in-core superblock's counter. This should only * be applied to the on-disk superblock. */ ASSERT(delta < 0); @@ -611,7 +611,7 @@ xfs_trans_apply_sb_deltas( if (whole) /* - * Log the whole thing, the fields are discontiguous. + * Log the whole thing, the fields are noncontiguous. */ xfs_trans_log_buf(tp, bp, 0, sizeof(xfs_sb_t) - 1); else @@ -669,7 +669,7 @@ xfs_trans_unreserve_and_mod_sb( /* * Apply any superblock modifications to the in-core version. * The t_res_fdblocks_delta and t_res_frextents_delta fields are - * explicity NOT applied to the in-core superblock. + * explicitly NOT applied to the in-core superblock. * The idea is that that has already been done. */ if (tp->t_flags & XFS_TRANS_SB_DIRTY) { diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h index e48befa4e33..100d9a4b38e 100644 --- a/fs/xfs/xfs_trans.h +++ b/fs/xfs/xfs_trans.h @@ -354,7 +354,7 @@ typedef struct xfs_trans { xfs_lsn_t t_commit_lsn; /* log seq num of end of * transaction. */ struct xfs_mount *t_mountp; /* ptr to fs mount struct */ - struct xfs_dquot_acct *t_dqinfo; /* accting info for dquots */ + struct xfs_dquot_acct *t_dqinfo; /* acctg info for dquots */ xfs_trans_callback_t t_callback; /* transaction callback */ void *t_callarg; /* callback arg */ unsigned int t_flags; /* misc flags */ diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index e341409172d..7c5894d59f8 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -272,7 +272,7 @@ xfs_trans_log_inode( * This is to coordinate with the xfs_iflush() and xfs_iflush_done() * routines in the eventual clearing of the ilf_fields bits. * See the big comment in xfs_iflush() for an explanation of - * this coorination mechanism. + * this coordination mechanism. */ flags |= ip->i_itemp->ili_last_fields; ip->i_itemp->ili_format.ilf_fields |= flags; diff --git a/fs/xfs/xfs_vfsops.c b/fs/xfs/xfs_vfsops.c index d4ec4dfaf19..f0e09ca1413 100644 --- a/fs/xfs/xfs_vfsops.c +++ b/fs/xfs/xfs_vfsops.c @@ -442,6 +442,9 @@ xfs_mount( p = vfs_bhv_lookup(vfsp, VFS_POSITION_IO); mp->m_io_ops = p ? *(xfs_ioops_t *) vfs_bhv_custom(p) : xfs_iocore_xfs; + if (args->flags & XFSMNT_QUIET) + flags |= XFS_MFSI_QUIET; + /* * Open real time and log devices - order is important. */ @@ -492,7 +495,7 @@ xfs_mount( error = xfs_start_flags(vfsp, args, mp); if (error) goto error1; - error = xfs_readsb(mp); + error = xfs_readsb(mp, flags); if (error) goto error1; error = xfs_finish_flags(vfsp, args, mp); @@ -880,10 +883,10 @@ xfs_statvfs( * determine if they should be flushed sync, async, or * delwri. * SYNC_CLOSE - This flag is passed when the system is being - * unmounted. We should sync and invalidate everthing. + * unmounted. We should sync and invalidate everything. * SYNC_FSDATA - This indicates that the caller would like to make * sure the superblock is safe on disk. We can ensure - * this by simply makeing sure the log gets flushed + * this by simply making sure the log gets flushed * if SYNC_BDFLUSH is set, and by actually writing it * out otherwise. * @@ -908,7 +911,7 @@ xfs_sync( * * This routine supports all of the flags defined for the generic VFS_SYNC * interface as explained above under xfs_sync. In the interests of not - * changing interfaces within the 6.5 family, additional internallly- + * changing interfaces within the 6.5 family, additional internally- * required functions are specified within a separate xflags parameter, * only available by calling this routine. * @@ -1090,7 +1093,7 @@ xfs_sync_inodes( * If this is just vfs_sync() or pflushd() calling * then we can skip inodes for which it looks like * there is nothing to do. Since we don't have the - * inode locked this is racey, but these are periodic + * inode locked this is racy, but these are periodic * calls so it doesn't matter. For the others we want * to know for sure, so we at least try to lock them. */ @@ -1429,7 +1432,7 @@ xfs_sync_inodes( * * This routine supports all of the flags defined for the generic VFS_SYNC * interface as explained above under xfs_sync. In the interests of not - * changing interfaces within the 6.5 family, additional internallly- + * changing interfaces within the 6.5 family, additional internally- * required functions are specified within a separate xflags parameter, * only available by calling this routine. * @@ -1697,8 +1700,9 @@ xfs_parseargs( int dsunit, dswidth, vol_dsunit, vol_dswidth; int iosize; - args->flags2 |= XFSMNT2_COMPAT_IOSIZE; args->flags |= XFSMNT_IDELETE; + args->flags |= XFSMNT_BARRIER; + args->flags2 |= XFSMNT2_COMPAT_IOSIZE; if (!options) goto done; @@ -1947,8 +1951,6 @@ xfs_showargs( seq_printf(m, "," MNTOPT_IKEEP); if (!(mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)) seq_printf(m, "," MNTOPT_LARGEIO); - if (mp->m_flags & XFS_MOUNT_BARRIER) - seq_printf(m, "," MNTOPT_BARRIER); if (!(vfsp->vfs_flag & VFS_32BITINODES)) seq_printf(m, "," MNTOPT_64BITINODE); diff --git a/fs/xfs/xfs_vnodeops.c b/fs/xfs/xfs_vnodeops.c index 0f0a64e81db..fa71b305ba5 100644 --- a/fs/xfs/xfs_vnodeops.c +++ b/fs/xfs/xfs_vnodeops.c @@ -848,7 +848,7 @@ xfs_setattr( * If this is a synchronous mount, make sure that the * transaction goes to disk before returning to the user. * This is slightly sub-optimal in that truncates require - * two sync transactions instead of one for wsync filesytems. + * two sync transactions instead of one for wsync filesystems. * One for the truncate and one for the timestamps since we * don't want to change the timestamps unless we're sure the * truncate worked. Truncates are less than 1% of the laddis @@ -1170,7 +1170,7 @@ xfs_fsync( /* * If this inode is on the RT dev we need to flush that - * cache aswell. + * cache as well. */ if (ip->i_d.di_flags & XFS_DIFLAG_REALTIME) xfs_blkdev_issue_flush(ip->i_mount->m_rtdev_targp); @@ -1380,7 +1380,7 @@ xfs_inactive_symlink_rmt( */ ntp = xfs_trans_dup(tp); /* - * Commit the transaction containing extent freeing and EFD's. + * Commit the transaction containing extent freeing and EFDs. * If we get an error on the commit here or on the reserve below, * we need to unlock the inode since the new transaction doesn't * have the inode attached. @@ -2023,7 +2023,7 @@ xfs_create( XFS_QM_DQRELE(mp, gdqp); /* - * Propogate the fact that the vnode changed after the + * Propagate the fact that the vnode changed after the * xfs_inode locks have been released. */ VOP_VNODE_CHANGE(vp, VCHANGE_FLAGS_TRUNCATED, 3); @@ -2370,7 +2370,7 @@ xfs_remove( * for a log reservation. Since we'll have to wait for the * inactive code to complete before returning from xfs_iget, * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked referece + * when we call xfs_iget. Instead we get an unlocked reference * to the inode before getting our log reservation. */ error = xfs_get_dir_entry(dentry, &ip); @@ -3020,7 +3020,7 @@ xfs_rmdir( * for a log reservation. Since we'll have to wait for the * inactive code to complete before returning from xfs_iget, * we need to make sure that we don't have log space reserved - * when we call xfs_iget. Instead we get an unlocked referece + * when we call xfs_iget. Instead we get an unlocked reference * to the inode before getting our log reservation. */ error = xfs_get_dir_entry(dentry, &cdp); @@ -4649,6 +4649,10 @@ vnodeops_t xfs_vnodeops = { #ifdef HAVE_SENDFILE .vop_sendfile = xfs_sendfile, #endif +#ifdef HAVE_SPLICE + .vop_splice_read = xfs_splice_read, + .vop_splice_write = xfs_splice_write, +#endif .vop_write = xfs_write, .vop_ioctl = xfs_ioctl, .vop_getattr = xfs_getattr, |