diff options
57 files changed, 1855 insertions, 973 deletions
diff --git a/Documentation/cpusets.txt b/Documentation/cpusets.txt index 159e2a0c3e8..76b44290c15 100644 --- a/Documentation/cpusets.txt +++ b/Documentation/cpusets.txt @@ -217,6 +217,12 @@ exclusive cpuset. Also, the use of a Linux virtual file system (vfs) to represent the cpuset hierarchy provides for a familiar permission and name space for cpusets, with a minimum of additional kernel code. +The cpus file in the root (top_cpuset) cpuset is read-only. +It automatically tracks the value of cpu_online_map, using a CPU +hotplug notifier. If and when memory nodes can be hotplugged, +we expect to make the mems file in the root cpuset read-only +as well, and have it track the value of node_online_map. + 1.4 What are exclusive cpusets ? -------------------------------- diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index 66fdc0744fe..16dec61d767 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -62,8 +62,8 @@ ramfs-rootfs-initramfs.txt - info on the 'in memory' filesystems ramfs, rootfs and initramfs. reiser4.txt - info on the Reiser4 filesystem based on dancing tree algorithms. -relayfs.txt - - info on relayfs, for efficient streaming from kernel to user space. +relay.txt + - info on relay, for efficient streaming from kernel to user space. romfs.txt - description of the ROMFS filesystem. smbfs.txt diff --git a/Documentation/filesystems/relay.txt b/Documentation/filesystems/relay.txt new file mode 100644 index 00000000000..d6788dae034 --- /dev/null +++ b/Documentation/filesystems/relay.txt @@ -0,0 +1,479 @@ +relay interface (formerly relayfs) +================================== + +The relay interface provides a means for kernel applications to +efficiently log and transfer large quantities of data from the kernel +to userspace via user-defined 'relay channels'. + +A 'relay channel' is a kernel->user data relay mechanism implemented +as a set of per-cpu kernel buffers ('channel buffers'), each +represented as a regular file ('relay file') in user space. Kernel +clients write into the channel buffers using efficient write +functions; these automatically log into the current cpu's channel +buffer. User space applications mmap() or read() from the relay files +and retrieve the data as it becomes available. The relay files +themselves are files created in a host filesystem, e.g. debugfs, and +are associated with the channel buffers using the API described below. + +The format of the data logged into the channel buffers is completely +up to the kernel client; the relay interface does however provide +hooks which allow kernel clients to impose some structure on the +buffer data. The relay interface doesn't implement any form of data +filtering - this also is left to the kernel client. The purpose is to +keep things as simple as possible. + +This document provides an overview of the relay interface API. The +details of the function parameters are documented along with the +functions in the relay interface code - please see that for details. + +Semantics +========= + +Each relay channel has one buffer per CPU, each buffer has one or more +sub-buffers. Messages are written to the first sub-buffer until it is +too full to contain a new message, in which case it it is written to +the next (if available). Messages are never split across sub-buffers. +At this point, userspace can be notified so it empties the first +sub-buffer, while the kernel continues writing to the next. + +When notified that a sub-buffer is full, the kernel knows how many +bytes of it are padding i.e. unused space occurring because a complete +message couldn't fit into a sub-buffer. Userspace can use this +knowledge to copy only valid data. + +After copying it, userspace can notify the kernel that a sub-buffer +has been consumed. + +A relay channel can operate in a mode where it will overwrite data not +yet collected by userspace, and not wait for it to be consumed. + +The relay channel itself does not provide for communication of such +data between userspace and kernel, allowing the kernel side to remain +simple and not impose a single interface on userspace. It does +provide a set of examples and a separate helper though, described +below. + +The read() interface both removes padding and internally consumes the +read sub-buffers; thus in cases where read(2) is being used to drain +the channel buffers, special-purpose communication between kernel and +user isn't necessary for basic operation. + +One of the major goals of the relay interface is to provide a low +overhead mechanism for conveying kernel data to userspace. While the +read() interface is easy to use, it's not as efficient as the mmap() +approach; the example code attempts to make the tradeoff between the +two approaches as small as possible. + +klog and relay-apps example code +================================ + +The relay interface itself is ready to use, but to make things easier, +a couple simple utility functions and a set of examples are provided. + +The relay-apps example tarball, available on the relay sourceforge +site, contains a set of self-contained examples, each consisting of a +pair of .c files containing boilerplate code for each of the user and +kernel sides of a relay application. When combined these two sets of +boilerplate code provide glue to easily stream data to disk, without +having to bother with mundane housekeeping chores. + +The 'klog debugging functions' patch (klog.patch in the relay-apps +tarball) provides a couple of high-level logging functions to the +kernel which allow writing formatted text or raw data to a channel, +regardless of whether a channel to write into exists or not, or even +whether the relay interface is compiled into the kernel or not. These +functions allow you to put unconditional 'trace' statements anywhere +in the kernel or kernel modules; only when there is a 'klog handler' +registered will data actually be logged (see the klog and kleak +examples for details). + +It is of course possible to use the relay interface from scratch, +i.e. without using any of the relay-apps example code or klog, but +you'll have to implement communication between userspace and kernel, +allowing both to convey the state of buffers (full, empty, amount of +padding). The read() interface both removes padding and internally +consumes the read sub-buffers; thus in cases where read(2) is being +used to drain the channel buffers, special-purpose communication +between kernel and user isn't necessary for basic operation. Things +such as buffer-full conditions would still need to be communicated via +some channel though. + +klog and the relay-apps examples can be found in the relay-apps +tarball on http://relayfs.sourceforge.net + +The relay interface user space API +================================== + +The relay interface implements basic file operations for user space +access to relay channel buffer data. Here are the file operations +that are available and some comments regarding their behavior: + +open() enables user to open an _existing_ channel buffer. + +mmap() results in channel buffer being mapped into the caller's + memory space. Note that you can't do a partial mmap - you + must map the entire file, which is NRBUF * SUBBUFSIZE. + +read() read the contents of a channel buffer. The bytes read are + 'consumed' by the reader, i.e. they won't be available + again to subsequent reads. If the channel is being used + in no-overwrite mode (the default), it can be read at any + time even if there's an active kernel writer. If the + channel is being used in overwrite mode and there are + active channel writers, results may be unpredictable - + users should make sure that all logging to the channel has + ended before using read() with overwrite mode. Sub-buffer + padding is automatically removed and will not be seen by + the reader. + +sendfile() transfer data from a channel buffer to an output file + descriptor. Sub-buffer padding is automatically removed + and will not be seen by the reader. + +poll() POLLIN/POLLRDNORM/POLLERR supported. User applications are + notified when sub-buffer boundaries are crossed. + +close() decrements the channel buffer's refcount. When the refcount + reaches 0, i.e. when no process or kernel client has the + buffer open, the channel buffer is freed. + +In order for a user application to make use of relay files, the +host filesystem must be mounted. For example, + + mount -t debugfs debugfs /debug + +NOTE: the host filesystem doesn't need to be mounted for kernel + clients to create or use channels - it only needs to be + mounted when user space applications need access to the buffer + data. + + +The relay interface kernel API +============================== + +Here's a summary of the API the relay interface provides to in-kernel clients: + +TBD(curr. line MT:/API/) + channel management functions: + + relay_open(base_filename, parent, subbuf_size, n_subbufs, + callbacks) + relay_close(chan) + relay_flush(chan) + relay_reset(chan) + + channel management typically called on instigation of userspace: + + relay_subbufs_consumed(chan, cpu, subbufs_consumed) + + write functions: + + relay_write(chan, data, length) + __relay_write(chan, data, length) + relay_reserve(chan, length) + + callbacks: + + subbuf_start(buf, subbuf, prev_subbuf, prev_padding) + buf_mapped(buf, filp) + buf_unmapped(buf, filp) + create_buf_file(filename, parent, mode, buf, is_global) + remove_buf_file(dentry) + + helper functions: + + relay_buf_full(buf) + subbuf_start_reserve(buf, length) + + +Creating a channel +------------------ + +relay_open() is used to create a channel, along with its per-cpu +channel buffers. Each channel buffer will have an associated file +created for it in the host filesystem, which can be and mmapped or +read from in user space. The files are named basename0...basenameN-1 +where N is the number of online cpus, and by default will be created +in the root of the filesystem (if the parent param is NULL). If you +want a directory structure to contain your relay files, you should +create it using the host filesystem's directory creation function, +e.g. debugfs_create_dir(), and pass the parent directory to +relay_open(). Users are responsible for cleaning up any directory +structure they create, when the channel is closed - again the host +filesystem's directory removal functions should be used for that, +e.g. debugfs_remove(). + +In order for a channel to be created and the host filesystem's files +associated with its channel buffers, the user must provide definitions +for two callback functions, create_buf_file() and remove_buf_file(). +create_buf_file() is called once for each per-cpu buffer from +relay_open() and allows the user to create the file which will be used +to represent the corresponding channel buffer. The callback should +return the dentry of the file created to represent the channel buffer. +remove_buf_file() must also be defined; it's responsible for deleting +the file(s) created in create_buf_file() and is called during +relay_close(). + +Here are some typical definitions for these callbacks, in this case +using debugfs: + +/* + * create_buf_file() callback. Creates relay file in debugfs. + */ +static struct dentry *create_buf_file_handler(const char *filename, + struct dentry *parent, + int mode, + struct rchan_buf *buf, + int *is_global) +{ + return debugfs_create_file(filename, mode, parent, buf, + &relay_file_operations); +} + +/* + * remove_buf_file() callback. Removes relay file from debugfs. + */ +static int remove_buf_file_handler(struct dentry *dentry) +{ + debugfs_remove(dentry); + + return 0; +} + +/* + * relay interface callbacks + */ +static struct rchan_callbacks relay_callbacks = +{ + .create_buf_file = create_buf_file_handler, + .remove_buf_file = remove_buf_file_handler, +}; + +And an example relay_open() invocation using them: + + chan = relay_open("cpu", NULL, SUBBUF_SIZE, N_SUBBUFS, &relay_callbacks); + +If the create_buf_file() callback fails, or isn't defined, channel +creation and thus relay_open() will fail. + +The total size of each per-cpu buffer is calculated by multiplying the +number of sub-buffers by the sub-buffer size passed into relay_open(). +The idea behind sub-buffers is that they're basically an extension of +double-buffering to N buffers, and they also allow applications to +easily implement random-access-on-buffer-boundary schemes, which can +be important for some high-volume applications. The number and size +of sub-buffers is completely dependent on the application and even for +the same application, different conditions will warrant different +values for these parameters at different times. Typically, the right +values to use are best decided after some experimentation; in general, +though, it's safe to assume that having only 1 sub-buffer is a bad +idea - you're guaranteed to either overwrite data or lose events +depending on the channel mode being used. + +The create_buf_file() implementation can also be defined in such a way +as to allow the creation of a single 'global' buffer instead of the +default per-cpu set. This can be useful for applications interested +mainly in seeing the relative ordering of system-wide events without +the need to bother with saving explicit timestamps for the purpose of +merging/sorting per-cpu files in a postprocessing step. + +To have relay_open() create a global buffer, the create_buf_file() +implementation should set the value of the is_global outparam to a +non-zero value in addition to creating the file that will be used to +represent the single buffer. In the case of a global buffer, +create_buf_file() and remove_buf_file() will be called only once. The +normal channel-writing functions, e.g. relay_write(), can still be +used - writes from any cpu will transparently end up in the global +buffer - but since it is a global buffer, callers should make sure +they use the proper locking for such a buffer, either by wrapping +writes in a spinlock, or by copying a write function from relay.h and +creating a local version that internally does the proper locking. + +Channel 'modes' +--------------- + +relay channels can be used in either of two modes - 'overwrite' or +'no-overwrite'. The mode is entirely determined by the implementation +of the subbuf_start() callback, as described below. The default if no +subbuf_start() callback is defined is 'no-overwrite' mode. If the +default mode suits your needs, and you plan to use the read() +interface to retrieve channel data, you can ignore the details of this +section, as it pertains mainly to mmap() implementations. + +In 'overwrite' mode, also known as 'flight recorder' mode, writes +continuously cycle around the buffer and will never fail, but will +unconditionally overwrite old data regardless of whether it's actually +been consumed. In no-overwrite mode, writes will fail, i.e. data will +be lost, if the number of unconsumed sub-buffers equals the total +number of sub-buffers in the channel. It should be clear that if +there is no consumer or if the consumer can't consume sub-buffers fast +enough, data will be lost in either case; the only difference is +whether data is lost from the beginning or the end of a buffer. + +As explained above, a relay channel is made of up one or more +per-cpu channel buffers, each implemented as a circular buffer +subdivided into one or more sub-buffers. Messages are written into +the current sub-buffer of the channel's current per-cpu buffer via the +write functions described below. Whenever a message can't fit into +the current sub-buffer, because there's no room left for it, the +client is notified via the subbuf_start() callback that a switch to a +new sub-buffer is about to occur. The client uses this callback to 1) +initialize the next sub-buffer if appropriate 2) finalize the previous +sub-buffer if appropriate and 3) return a boolean value indicating +whether or not to actually move on to the next sub-buffer. + +To implement 'no-overwrite' mode, the userspace client would provide +an implementation of the subbuf_start() callback something like the +following: + +static int subbuf_start(struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + unsigned int prev_padding) +{ + if (prev_subbuf) + *((unsigned *)prev_subbuf) = prev_padding; + + if (relay_buf_full(buf)) + return 0; + + subbuf_start_reserve(buf, sizeof(unsigned int)); + + return 1; +} + +If the current buffer is full, i.e. all sub-buffers remain unconsumed, +the callback returns 0 to indicate that the buffer switch should not +occur yet, i.e. until the consumer has had a chance to read the +current set of ready sub-buffers. For the relay_buf_full() function +to make sense, the consumer is reponsible for notifying the relay +interface when sub-buffers have been consumed via +relay_subbufs_consumed(). Any subsequent attempts to write into the +buffer will again invoke the subbuf_start() callback with the same +parameters; only when the consumer has consumed one or more of the +ready sub-buffers will relay_buf_full() return 0, in which case the +buffer switch can continue. + +The implementation of the subbuf_start() callback for 'overwrite' mode +would be very similar: + +static int subbuf_start(struct rchan_buf *buf, + void *subbuf, + void *prev_subbuf, + unsigned int prev_padding) +{ + if (prev_subbuf) + *((unsigned *)prev_subbuf) = prev_padding; + + subbuf_start_reserve(buf, sizeof(unsigned int)); + + return 1; +} + +In this case, the relay_buf_full() check is meaningless and the +callback always returns 1, causing the buffer switch to occur +unconditionally. It's also meaningless for the client to use the +relay_subbufs_consumed() function in this mode, as it's never +consulted. + +The default subbuf_start() implementation, used if the client doesn't +define any callbacks, or doesn't define the subbuf_start() callback, +implements the simplest possible 'no-overwrite' mode, i.e. it does +nothing but return 0. + +Header information can be reserved at the beginning of each sub-buffer +by calling the subbuf_start_reserve() helper function from within the +subbuf_start() callback. This reserved area can be used to store +whatever information the client wants. In the example above, room is +reserved in each sub-buffer to store the padding count for that +sub-buffer. This is filled in for the previous sub-buffer in the +subbuf_start() implementation; the padding value for the previous +sub-buffer is passed into the subbuf_start() callback along with a +pointer to the previous sub-buffer, since the padding value isn't +known until a sub-buffer is filled. The subbuf_start() callback is +also called for the first sub-buffer when the channel is opened, to +give the client a chance to reserve space in it. In this case the +previous sub-buffer pointer passed into the callback will be NULL, so +the client should check the value of the prev_subbuf pointer before +writing into the previous sub-buffer. + +Writing to a channel +-------------------- + +Kernel clients write data into the current cpu's channel buffer using +relay_write() or __relay_write(). relay_write() is the main logging +function - it uses local_irqsave() to protect the buffer and should be +used if you might be logging from interrupt context. If you know +you'll never be logging from interrupt context, you can use +__relay_write(), which only disables preemption. These functions +don't return a value, so you can't determine whether or not they +failed - the assumption is that you wouldn't want to check a return +value in the fast logging path anyway, and that they'll always succeed +unless the buffer is full and no-overwrite mode is being used, in +which case you can detect a failed write in the subbuf_start() +callback by calling the relay_buf_full() helper function. + +relay_reserve() is used to reserve a slot in a channel buffer which +can be written to later. This would typically be used in applications +that need to write directly into a channel buffer without having to +stage data in a temporary buffer beforehand. Because the actual write +may not happen immediately after the slot is reserved, applications +using relay_reserve() can keep a count of the number of bytes actually +written, either in space reserved in the sub-buffers themselves or as +a separate array. See the 'reserve' example in the relay-apps tarball +at http://relayfs.sourceforge.net for an example of how this can be +done. Because the write is under control of the client and is +separated from the reserve, relay_reserve() doesn't protect the buffer +at all - it's up to the client to provide the appropriate +synchronization when using relay_reserve(). + +Closing a channel +----------------- + +The client calls relay_close() when it's finished using the channel. +The channel and its associated buffers are destroyed when there are no +longer any references to any of the channel buffers. relay_flush() +forces a sub-buffer switch on all the channel buffers, and can be used +to finalize and process the last sub-buffers before the channel is +closed. + +Misc +---- + +Some applications may want to keep a channel around and re-use it +rather than open and close a new channel for each use. relay_reset() +can be used for this purpose - it resets a channel to its initial +state without reallocating channel buffer memory or destroying +existing mappings. It should however only be called when it's safe to +do so, i.e. when the channel isn't currently being written to. + +Finally, there are a couple of utility callbacks that can be used for +different purposes. buf_mapped() is called whenever a channel buffer +is mmapped from user space and buf_unmapped() is called when it's +unmapped. The client can use this notification to trigger actions +within the kernel application, such as enabling/disabling logging to +the channel. + + +Resources +========= + +For news, example code, mailing list, etc. see the relay interface homepage: + + http://relayfs.sourceforge.net + + +Credits +======= + +The ideas and specs for the relay interface came about as a result of +discussions on tracing involving the following: + +Michel Dagenais <michel.dagenais@polymtl.ca> +Richard Moore <richardj_moore@uk.ibm.com> +Bob Wisniewski <bob@watson.ibm.com> +Karim Yaghmour <karim@opersys.com> +Tom Zanussi <zanussi@us.ibm.com> + +Also thanks to Hubertus Franke for a lot of useful suggestions and bug +reports. diff --git a/Documentation/filesystems/relayfs.txt b/Documentation/filesystems/relayfs.txt deleted file mode 100644 index 5832377b734..00000000000 --- a/Documentation/filesystems/relayfs.txt +++ /dev/null @@ -1,442 +0,0 @@ - -relayfs - a high-speed data relay filesystem -============================================ - -relayfs is a filesystem designed to provide an efficient mechanism for -tools and facilities to relay large and potentially sustained streams -of data from kernel space to user space. - -The main abstraction of relayfs is the 'channel'. A channel consists -of a set of per-cpu kernel buffers each represented by a file in the -relayfs filesystem. Kernel clients write into a channel using -efficient write functions which automatically log to the current cpu's -channel buffer. User space applications mmap() the per-cpu files and -retrieve the data as it becomes available. - -The format of the data logged into the channel buffers is completely -up to the relayfs client; relayfs does however provide hooks which -allow clients to impose some structure on the buffer data. Nor does -relayfs implement any form of data filtering - this also is left to -the client. The purpose is to keep relayfs as simple as possible. - -This document provides an overview of the relayfs API. The details of -the function parameters are documented along with the functions in the -filesystem code - please see that for details. - -Semantics -========= - -Each relayfs channel has one buffer per CPU, each buffer has one or -more sub-buffers. Messages are written to the first sub-buffer until -it is too full to contain a new message, in which case it it is -written to the next (if available). Messages are never split across -sub-buffers. At this point, userspace can be notified so it empties -the first sub-buffer, while the kernel continues writing to the next. - -When notified that a sub-buffer is full, the kernel knows how many -bytes of it are padding i.e. unused. Userspace can use this knowledge -to copy only valid data. - -After copying it, userspace can notify the kernel that a sub-buffer -has been consumed. - -relayfs can operate in a mode where it will overwrite data not yet -collected by userspace, and not wait for it to consume it. - -relayfs itself does not provide for communication of such data between -userspace and kernel, allowing the kernel side to remain simple and -not impose a single interface on userspace. It does provide a set of -examples and a separate helper though, described below. - -klog and relay-apps example code -================================ - -relayfs itself is ready to use, but to make things easier, a couple -simple utility functions and a set of examples are provided. - -The relay-apps example tarball, available on the relayfs sourceforge -site, contains a set of self-contained examples, each consisting of a -pair of .c files containing boilerplate code for each of the user and -kernel sides of a relayfs application; combined these two sets of -boilerplate code provide glue to easily stream data to disk, without -having to bother with mundane housekeeping chores. - -The 'klog debugging functions' patch (klog.patch in the relay-apps -tarball) provides a couple of high-level logging functions to the -kernel which allow writing formatted text or raw data to a channel, -regardless of whether a channel to write into exists or not, or -whether relayfs is compiled into the kernel or is configured as a -module. These functions allow you to put unconditional 'trace' -statements anywhere in the kernel or kernel modules; only when there -is a 'klog handler' registered will data actually be logged (see the -klog and kleak examples for details). - -It is of course possible to use relayfs from scratch i.e. without -using any of the relay-apps example code or klog, but you'll have to -implement communication between userspace and kernel, allowing both to -convey the state of buffers (full, empty, amount of padding). - -klog and the relay-apps examples can be found in the relay-apps -tarball on http://relayfs.sourceforge.net - - -The relayfs user space API -========================== - -relayfs implements basic file operations for user space access to -relayfs channel buffer data. Here are the file operations that are -available and some comments regarding their behavior: - -open() enables user to open an _existing_ buffer. - -mmap() results in channel buffer being mapped into the caller's - memory space. Note that you can't do a partial mmap - you must - map the entire file, which is NRBUF * SUBBUFSIZE. - -read() read the contents of a channel buffer. The bytes read are - 'consumed' by the reader i.e. they won't be available again - to subsequent reads. If the channel is being used in - no-overwrite mode (the default), it can be read at any time - even if there's an active kernel writer. If the channel is - being used in overwrite mode and there are active channel - writers, results may be unpredictable - users should make - sure that all logging to the channel has ended before using - read() with overwrite mode. - -poll() POLLIN/POLLRDNORM/POLLERR supported. User applications are - notified when sub-buffer boundaries are crossed. - -close() decrements the channel buffer's refcount. When the refcount - reaches 0 i.e. when no process or kernel client has the buffer - open, the channel buffer is freed. - - -In order for a user application to make use of relayfs files, the -relayfs filesystem must be mounted. For example, - - mount -t relayfs relayfs /mnt/relay - -NOTE: relayfs doesn't need to be mounted for kernel clients to create - or use channels - it only needs to be mounted when user space - applications need access to the buffer data. - - -The relayfs kernel API -====================== - -Here's a summary of the API relayfs provides to in-kernel clients: - - - channel management functions: - - relay_open(base_filename, parent, subbuf_size, n_subbufs, - callbacks) - relay_close(chan) - relay_flush(chan) - relay_reset(chan) - relayfs_create_dir(name, parent) - relayfs_remove_dir(dentry) - relayfs_create_file(name, parent, mode, fops, data) - relayfs_remove_file(dentry) - - channel management typically called on instigation of userspace: - - relay_subbufs_consumed(chan, cpu, subbufs_consumed) - - write functions: - - relay_write(chan, data, length) - __relay_write(chan, data, length) - relay_reserve(chan, length) - - callbacks: - - subbuf_start(buf, subbuf, prev_subbuf, prev_padding) - buf_mapped(buf, filp) - buf_unmapped(buf, filp) - create_buf_file(filename, parent, mode, buf, is_global) - remove_buf_file(dentry) - - helper functions: - - relay_buf_full(buf) - subbuf_start_reserve(buf, length) - - -Creating a channel ------------------- - -relay_open() is used to create a channel, along with its per-cpu -channel buffers. Each channel buffer will have an associated file -created for it in the relayfs filesystem, which can be opened and -mmapped from user space if desired. The files are named -basename0...basenameN-1 where N is the number of online cpus, and by -default will be created in the root of the filesystem. If you want a -directory structure to contain your relayfs files, you can create it -with relayfs_create_dir() and pass the parent directory to -relay_open(). Clients are responsible for cleaning up any directory -structure they create when the channel is closed - use -relayfs_remove_dir() for that. - -The total size of each per-cpu buffer is calculated by multiplying the -number of sub-buffers by the sub-buffer size passed into relay_open(). -The idea behind sub-buffers is that they're basically an extension of -double-buffering to N buffers, and they also allow applications to -easily implement random-access-on-buffer-boundary schemes, which can -be important for some high-volume applications. The number and size -of sub-buffers is completely dependent on the application and even for -the same application, different conditions will warrant different -values for these parameters at different times. Typically, the right -values to use are best decided after some experimentation; in general, -though, it's safe to assume that having only 1 sub-buffer is a bad -idea - you're guaranteed to either overwrite data or lose events -depending on the channel mode being used. - -Channel 'modes' ---------------- - -relayfs channels can be used in either of two modes - 'overwrite' or -'no-overwrite'. The mode is entirely determined by the implementation -of the subbuf_start() callback, as described below. In 'overwrite' -mode, also known as 'flight recorder' mode, writes continuously cycle -around the buffer and will never fail, but will unconditionally -overwrite old data regardless of whether it's actually been consumed. -In no-overwrite mode, writes will fail i.e. data will be lost, if the -number of unconsumed sub-buffers equals the total number of -sub-buffers in the channel. It should be clear that if there is no -consumer or if the consumer can't consume sub-buffers fast enought, -data will be lost in either case; the only difference is whether data -is lost from the beginning or the end of a buffer. - -As explained above, a relayfs channel is made of up one or more -per-cpu channel buffers, each implemented as a circular buffer -subdivided into one or more sub-buffers. Messages are written into -the current sub-buffer of the channel's current per-cpu buffer via the -write functions described below. Whenever a message can't fit into -the current sub-buffer, because there's no room left for it, the -client is notified via the subbuf_start() callback that a switch to a -new sub-buffer is about to occur. The client uses this callback to 1) -initialize the next sub-buffer if appropriate 2) finalize the previous -sub-buffer if appropriate and 3) return a boolean value indicating -whether or not to actually go ahead with the sub-buffer switch. - -To implement 'no-overwrite' mode, the userspace client would provide -an implementation of the subbuf_start() callback something like the -following: - -static int subbuf_start(struct rchan_buf *buf, - void *subbuf, - void *prev_subbuf, - unsigned int prev_padding) -{ - if (prev_subbuf) - *((unsigned *)prev_subbuf) = prev_padding; - - if (relay_buf_full(buf)) - return 0; - - subbuf_start_reserve(buf, sizeof(unsigned int)); - - return 1; -} - -If the current buffer is full i.e. all sub-buffers remain unconsumed, -the callback returns 0 to indicate that the buffer switch should not -occur yet i.e. until the consumer has had a chance to read the current -set of ready sub-buffers. For the relay_buf_full() function to make -sense, the consumer is reponsible for notifying relayfs when -sub-buffers have been consumed via relay_subbufs_consumed(). Any -subsequent attempts to write into the buffer will again invoke the -subbuf_start() callback with the same parameters; only when the -consumer has consumed one or more of the ready sub-buffers will -relay_buf_full() return 0, in which case the buffer switch can -continue. - -The implementation of the subbuf_start() callback for 'overwrite' mode -would be very similar: - -static int subbuf_start(struct rchan_buf *buf, - void *subbuf, - void *prev_subbuf, - unsigned int prev_padding) -{ - if (prev_subbuf) - *((unsigned *)prev_subbuf) = prev_padding; - - subbuf_start_reserve(buf, sizeof(unsigned int)); - - return 1; -} - -In this case, the relay_buf_full() check is meaningless and the -callback always returns 1, causing the buffer switch to occur -unconditionally. It's also meaningless for the client to use the -relay_subbufs_consumed() function in this mode, as it's never -consulted. - -The default subbuf_start() implementation, used if the client doesn't -define any callbacks, or doesn't define the subbuf_start() callback, -implements the simplest possible 'no-overwrite' mode i.e. it does -nothing but return 0. - -Header information can be reserved at the beginning of each sub-buffer -by calling the subbuf_start_reserve() helper function from within the -subbuf_start() callback. This reserved area can be used to store -whatever information the client wants. In the example above, room is -reserved in each sub-buffer to store the padding count for that -sub-buffer. This is filled in for the previous sub-buffer in the -subbuf_start() implementation; the padding value for the previous -sub-buffer is passed into the subbuf_start() callback along with a -pointer to the previous sub-buffer, since the padding value isn't -known until a sub-buffer is filled. The subbuf_start() callback is -also called for the first sub-buffer when the channel is opened, to -give the client a chance to reserve space in it. In this case the -previous sub-buffer pointer passed into the callback will be NULL, so -the client should check the value of the prev_subbuf pointer before -writing into the previous sub-buffer. - -Writing to a channel --------------------- - -kernel clients write data into the current cpu's channel buffer using -relay_write() or __relay_write(). relay_write() is the main logging -function - it uses local_irqsave() to protect the buffer and should be -used if you might be logging from interrupt context. If you know -you'll never be logging from interrupt context, you can use -__relay_write(), which only disables preemption. These functions -don't return a value, so you can't determine whether or not they -failed - the assumption is that you wouldn't want to check a return -value in the fast logging path anyway, and that they'll always succeed -unless the buffer is full and no-overwrite mode is being used, in -which case you can detect a failed write in the subbuf_start() -callback by calling the relay_buf_full() helper function. - -relay_reserve() is used to reserve a slot in a channel buffer which -can be written to later. This would typically be used in applications -that need to write directly into a channel buffer without having to -stage data in a temporary buffer beforehand. Because the actual write -may not happen immediately after the slot is reserved, applications -using relay_reserve() can keep a count of the number of bytes actually -written, either in space reserved in the sub-buffers themselves or as -a separate array. See the 'reserve' example in the relay-apps tarball -at http://relayfs.sourceforge.net for an example of how this can be -done. Because the write is under control of the client and is -separated from the reserve, relay_reserve() doesn't protect the buffer -at all - it's up to the client to provide the appropriate -synchronization when using relay_reserve(). - -Closing a channel ------------------ - -The client calls relay_close() when it's finished using the channel. -The channel and its associated buffers are destroyed when there are no -longer any references to any of the channel buffers. relay_flush() -forces a sub-buffer switch on all the channel buffers, and can be used -to finalize and process the last sub-buffers before the channel is -closed. - -Creating non-relay files ------------------------- - -relay_open() automatically creates files in the relayfs filesystem to -represent the per-cpu kernel buffers; it's often useful for -applications to be able to create their own files alongside the relay -files in the relayfs filesystem as well e.g. 'control' files much like -those created in /proc or debugfs for similar purposes, used to -communicate control information between the kernel and user sides of a -relayfs application. For this purpose the relayfs_create_file() and -relayfs_remove_file() API functions exist. For relayfs_create_file(), -the caller passes in a set of user-defined file operations to be used -for the file and an optional void * to a user-specified data item, -which will be accessible via inode->u.generic_ip (see the relay-apps -tarball for examples). The file_operations are a required parameter -to relayfs_create_file() and thus the semantics of these files are -completely defined by the caller. - -See the relay-apps tarball at http://relayfs.sourceforge.net for -examples of how these non-relay files are meant to be used. - -Creating relay files in other filesystems ------------------------------------------ - -By default of course, relay_open() creates relay files in the relayfs -filesystem. Because relay_file_operations is exported, however, it's -also possible to create and use relay files in other pseudo-filesytems -such as debugfs. - -For this purpose, two callback functions are provided, -create_buf_file() and remove_buf_file(). create_buf_file() is called -once for each per-cpu buffer from relay_open() to allow the client to -create a file to be used to represent the corresponding buffer; if -this callback is not defined, the default implementation will create -and return a file in the relayfs filesystem to represent the buffer. -The callback should return the dentry of the file created to represent -the relay buffer. Note that the parent directory passed to -relay_open() (and passed along to the callback), if specified, must -exist in the same filesystem the new relay file is created in. If -create_buf_file() is defined, remove_buf_file() must also be defined; -it's responsible for deleting the file(s) created in create_buf_file() -and is called during relay_close(). - -The create_buf_file() implementation can also be defined in such a way -as to allow the creation of a single 'global' buffer instead of the -default per-cpu set. This can be useful for applications interested -mainly in seeing the relative ordering of system-wide events without -the need to bother with saving explicit timestamps for the purpose of -merging/sorting per-cpu files in a postprocessing step. - -To have relay_open() create a global buffer, the create_buf_file() -implementation should set the value of the is_global outparam to a -non-zero value in addition to creating the file that will be used to -represent the single buffer. In the case of a global buffer, -create_buf_file() and remove_buf_file() will be called only once. The -normal channel-writing functions e.g. relay_write() can still be used -- writes from any cpu will transparently end up in the global buffer - -but since it is a global buffer, callers should make sure they use the -proper locking for such a buffer, either by wrapping writes in a -spinlock, or by copying a write function from relayfs_fs.h and -creating a local version that internally does the proper locking. - -See the 'exported-relayfile' examples in the relay-apps tarball for -examples of creating and using relay files in debugfs. - -Misc ----- - -Some applications may want to keep a channel around and re-use it -rather than open and close a new channel for each use. relay_reset() -can be used for this purpose - it resets a channel to its initial -state without reallocating channel buffer memory or destroying -existing mappings. It should however only be called when it's safe to -do so i.e. when the channel isn't currently being written to. - -Finally, there are a couple of utility callbacks that can be used for -different purposes. buf_mapped() is called whenever a channel buffer -is mmapped from user space and buf_unmapped() is called when it's -unmapped. The client can use this notification to trigger actions -within the kernel application, such as enabling/disabling logging to -the channel. - - -Resources -========= - -For news, example code, mailing list, etc. see the relayfs homepage: - - http://relayfs.sourceforge.net - - -Credits -======= - -The ideas and specs for relayfs came about as a result of discussions -on tracing involving the following: - -Michel Dagenais <michel.dagenais@polymtl.ca> -Richard Moore <richardj_moore@uk.ibm.com> -Bob Wisniewski <bob@watson.ibm.com> -Karim Yaghmour <karim@opersys.com> -Tom Zanussi <zanussi@us.ibm.com> - -Also thanks to Hubertus Franke for a lot of useful suggestions and bug -reports. diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt index 0b62c62142c..5c3a5190596 100644 --- a/Documentation/sysctl/fs.txt +++ b/Documentation/sysctl/fs.txt @@ -25,6 +25,7 @@ Currently, these files are in /proc/sys/fs: - inode-state - overflowuid - overflowgid +- suid_dumpable - super-max - super-nr @@ -131,6 +132,25 @@ The default is 65534. ============================================================== +suid_dumpable: + +This value can be used to query and set the core dump mode for setuid +or otherwise protected/tainted binaries. The modes are + +0 - (default) - traditional behaviour. Any process which has changed + privilege levels or is execute only will not be dumped +1 - (debug) - all processes dump core when possible. The core dump is + owned by the current user and no security is applied. This is + intended for system debugging situations only. Ptrace is unchecked. +2 - (suidsafe) - any binary which normally would not be dumped is dumped + readable by root only. This allows the end user to remove + such a dump but not access it directly. For security reasons + core dumps in this mode will not overwrite one another or + other files. This mode is appropriate when adminstrators are + attempting to debug problems in a normal environment. + +============================================================== + super-max & super-nr: These numbers control the maximum number of superblocks, and diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt index 7345c338080..89bf8c20a58 100644 --- a/Documentation/sysctl/kernel.txt +++ b/Documentation/sysctl/kernel.txt @@ -50,7 +50,6 @@ show up in /proc/sys/kernel: - shmmax [ sysv ipc ] - shmmni - stop-a [ SPARC only ] -- suid_dumpable - sysrq ==> Documentation/sysrq.txt - tainted - threads-max @@ -310,25 +309,6 @@ kernel. This value defaults to SHMMAX. ============================================================== -suid_dumpable: - -This value can be used to query and set the core dump mode for setuid -or otherwise protected/tainted binaries. The modes are - -0 - (default) - traditional behaviour. Any process which has changed - privilege levels or is execute only will not be dumped -1 - (debug) - all processes dump core when possible. The core dump is - owned by the current user and no security is applied. This is - intended for system debugging situations only. Ptrace is unchecked. -2 - (suidsafe) - any binary which normally would not be dumped is dumped - readable by root only. This allows the end user to remove - such a dump but not access it directly. For security reasons - core dumps in this mode will not overwrite one another or - other files. This mode is appropriate when adminstrators are - attempting to debug problems in a normal environment. - -============================================================== - tainted: Non-zero if the kernel has been tainted. Numeric values, which @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 18 -EXTRAVERSION = -rc4 +EXTRAVERSION = -rc5 NAME=Crazed Snow-Weasel # *DOCUMENTATION* diff --git a/arch/i386/Kconfig b/arch/i386/Kconfig index f71fb4a029c..b2751eadbc5 100644 --- a/arch/i386/Kconfig +++ b/arch/i386/Kconfig @@ -142,6 +142,7 @@ config X86_SUMMIT In particular, it is needed for the x440. If you don't have one of these computers, you should say N here. + If you want to build a NUMA kernel, you must select ACPI. config X86_BIGSMP bool "Support for other sub-arch SMP systems with more than 8 CPUs" @@ -169,6 +170,7 @@ config X86_GENERICARCH help This option compiles in the Summit, bigsmp, ES7000, default subarchitectures. It is intended for a generic binary kernel. + If you want a NUMA kernel, select ACPI. We need SRAT for NUMA. config X86_ES7000 bool "Support for Unisys ES7000 IA32 series" @@ -542,7 +544,7 @@ config X86_PAE # Common NUMA Features config NUMA bool "Numa Memory Allocation and Scheduler Support" - depends on SMP && HIGHMEM64G && (X86_NUMAQ || X86_GENERICARCH || (X86_SUMMIT && ACPI)) + depends on SMP && HIGHMEM64G && (X86_NUMAQ || (X86_SUMMIT || X86_GENERICARCH) && ACPI) default n if X86_PC default y if (X86_NUMAQ || X86_SUMMIT) diff --git a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c index efb41e81351..e6ea00edcb5 100644 --- a/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c +++ b/arch/i386/kernel/cpu/cpufreq/acpi-cpufreq.c @@ -567,16 +567,11 @@ static struct cpufreq_driver acpi_cpufreq_driver = { static int __init acpi_cpufreq_init (void) { - int result = 0; - dprintk("acpi_cpufreq_init\n"); - result = acpi_cpufreq_early_init_acpi(); + acpi_cpufreq_early_init_acpi(); - if (!result) - result = cpufreq_register_driver(&acpi_cpufreq_driver); - - return (result); + return cpufreq_register_driver(&acpi_cpufreq_driver); } diff --git a/drivers/base/node.c b/drivers/base/node.c index d7de1753e09..e9b0957f15d 100644 --- a/drivers/base/node.c +++ b/drivers/base/node.c @@ -64,7 +64,7 @@ static ssize_t node_read_meminfo(struct sys_device * dev, char * buf) "Node %d Mapped: %8lu kB\n" "Node %d AnonPages: %8lu kB\n" "Node %d PageTables: %8lu kB\n" - "Node %d NFS Unstable: %8lu kB\n" + "Node %d NFS_Unstable: %8lu kB\n" "Node %d Bounce: %8lu kB\n" "Node %d Slab: %8lu kB\n", nid, K(i.totalram), diff --git a/drivers/cdrom/gscd.c b/drivers/cdrom/gscd.c index b6ee50a2916..fa708248976 100644 --- a/drivers/cdrom/gscd.c +++ b/drivers/cdrom/gscd.c @@ -266,7 +266,7 @@ repeat: goto out; if (req->cmd != READ) { - printk("GSCD: bad cmd %lu\n", rq_data_dir(req)); + printk("GSCD: bad cmd %u\n", rq_data_dir(req)); end_request(req, 0); goto repeat; } diff --git a/drivers/char/moxa.c b/drivers/char/moxa.c index 4ea7bd5f4f5..a369dd6877d 100644 --- a/drivers/char/moxa.c +++ b/drivers/char/moxa.c @@ -142,6 +142,7 @@ typedef struct _moxa_board_conf { static moxa_board_conf moxa_boards[MAX_BOARDS]; static void __iomem *moxaBaseAddr[MAX_BOARDS]; +static int loadstat[MAX_BOARDS]; struct moxa_str { int type; @@ -1688,6 +1689,8 @@ int MoxaDriverPoll(void) if (moxaCard == 0) return (-1); for (card = 0; card < MAX_BOARDS; card++) { + if (loadstat[card] == 0) + continue; if ((ports = moxa_boards[card].numPorts) == 0) continue; if (readb(moxaIntPend[card]) == 0xff) { @@ -2903,6 +2906,7 @@ static int moxaloadcode(int cardno, unsigned char __user *tmp, int len) } break; } + loadstat[cardno] = 1; return (0); } @@ -2920,7 +2924,7 @@ static int moxaloadc218(int cardno, void __iomem *baseAddr, int len) len1 = len >> 1; ptr = (ushort *) moxaBuff; for (i = 0; i < len1; i++) - usum += *(ptr + i); + usum += le16_to_cpu(*(ptr + i)); retry = 0; do { len1 = len >> 1; @@ -2992,7 +2996,7 @@ static int moxaloadc320(int cardno, void __iomem *baseAddr, int len, int *numPor wlen = len >> 1; uptr = (ushort *) moxaBuff; for (i = 0; i < wlen; i++) - usum += uptr[i]; + usum += le16_to_cpu(uptr[i]); retry = 0; j = 0; do { diff --git a/drivers/char/tty_io.c b/drivers/char/tty_io.c index bfdb90242a9..bb0d9199e99 100644 --- a/drivers/char/tty_io.c +++ b/drivers/char/tty_io.c @@ -153,6 +153,15 @@ int tty_ioctl(struct inode * inode, struct file * file, static int tty_fasync(int fd, struct file * filp, int on); static void release_mem(struct tty_struct *tty, int idx); +/** + * alloc_tty_struct - allocate a tty object + * + * Return a new empty tty structure. The data fields have not + * been initialized in any way but has been zeroed + * + * Locking: none + * FIXME: use kzalloc + */ static struct tty_struct *alloc_tty_struct(void) { @@ -166,6 +175,15 @@ static struct tty_struct *alloc_tty_struct(void) static void tty_buffer_free_all(struct tty_struct *); +/** + * free_tty_struct - free a disused tty + * @tty: tty struct to free + * + * Free the write buffers, tty queue and tty memory itself. + * + * Locking: none. Must be called after tty is definitely unused + */ + static inline void free_tty_struct(struct tty_struct *tty) { kfree(tty->write_buf); @@ -175,6 +193,17 @@ static inline void free_tty_struct(struct tty_struct *tty) #define TTY_NUMBER(tty) ((tty)->index + (tty)->driver->name_base) +/** + * tty_name - return tty naming + * @tty: tty structure + * @buf: buffer for output + * + * Convert a tty structure into a name. The name reflects the kernel + * naming policy and if udev is in use may not reflect user space + * + * Locking: none + */ + char *tty_name(struct tty_struct *tty, char *buf) { if (!tty) /* Hmm. NULL pointer. That's fun. */ @@ -235,6 +264,28 @@ static int check_tty_count(struct tty_struct *tty, const char *routine) * Tty buffer allocation management */ + +/** + * tty_buffer_free_all - free buffers used by a tty + * @tty: tty to free from + * + * Remove all the buffers pending on a tty whether queued with data + * or in the free ring. Must be called when the tty is no longer in use + * + * Locking: none + */ + + +/** + * tty_buffer_free_all - free buffers used by a tty + * @tty: tty to free from + * + * Remove all the buffers pending on a tty whether queued with data + * or in the free ring. Must be called when the tty is no longer in use + * + * Locking: none + */ + static void tty_buffer_free_all(struct tty_struct *tty) { struct tty_buffer *thead; @@ -247,19 +298,47 @@ static void tty_buffer_free_all(struct tty_struct *tty) kfree(thead); } tty->buf.tail = NULL; + tty->buf.memory_used = 0; } +/** + * tty_buffer_init - prepare a tty buffer structure + * @tty: tty to initialise + * + * Set up the initial state of the buffer management for a tty device. + * Must be called before the other tty buffer functions are used. + * + * Locking: none + */ + static void tty_buffer_init(struct tty_struct *tty) { spin_lock_init(&tty->buf.lock); tty->buf.head = NULL; tty->buf.tail = NULL; tty->buf.free = NULL; + tty->buf.memory_used = 0; } -static struct tty_buffer *tty_buffer_alloc(size_t size) +/** + * tty_buffer_alloc - allocate a tty buffer + * @tty: tty device + * @size: desired size (characters) + * + * Allocate a new tty buffer to hold the desired number of characters. + * Return NULL if out of memory or the allocation would exceed the + * per device queue + * + * Locking: Caller must hold tty->buf.lock + */ + +static struct tty_buffer *tty_buffer_alloc(struct tty_struct *tty, size_t size) { - struct tty_buffer *p = kmalloc(sizeof(struct tty_buffer) + 2 * size, GFP_ATOMIC); + struct tty_buffer *p; + + if (tty->buf.memory_used + size > 65536) + return NULL; + p = kmalloc(sizeof(struct tty_buffer) + 2 * size, GFP_ATOMIC); if(p == NULL) return NULL; p->used = 0; @@ -269,17 +348,27 @@ static struct tty_buffer *tty_buffer_alloc(size_t size) p->read = 0; p->char_buf_ptr = (char *)(p->data); p->flag_buf_ptr = (unsigned char *)p->char_buf_ptr + size; -/* printk("Flip create %p\n", p); */ + tty->buf.memory_used += size; return p; } -/* Must be called with the tty_read lock held. This needs to acquire strategy - code to decide if we should kfree or relink a given expired buffer */ +/** + * tty_buffer_free - free a tty buffer + * @tty: tty owning the buffer + * @b: the buffer to free + * + * Free a tty buffer, or add it to the free list according to our + * internal strategy + * + * Locking: Caller must hold tty->buf.lock + */ static void tty_buffer_free(struct tty_struct *tty, struct tty_buffer *b) { /* Dumb strategy for now - should keep some stats */ -/* printk("Flip dispose %p\n", b); */ + tty->buf.memory_used -= b->size; + WARN_ON(tty->buf.memory_used < 0); + if(b->size >= 512) kfree(b); else { @@ -288,6 +377,18 @@ static void tty_buffer_free(struct tty_struct *tty, struct tty_buffer *b) } } +/** + * tty_buffer_find - find a free tty buffer + * @tty: tty owning the buffer + * @size: characters wanted + * + * Locate an existing suitable tty buffer or if we are lacking one then + * allocate a new one. We round our buffers off in 256 character chunks + * to get better allocation behaviour. + * + * Locking: Caller must hold tty->buf.lock + */ + static struct tty_buffer *tty_buffer_find(struct tty_struct *tty, size_t size) { struct tty_buffer **tbh = &tty->buf.free; @@ -299,20 +400,28 @@ static struct tty_buffer *tty_buffer_find(struct tty_struct *tty, size_t size) t->used = 0; t->commit = 0; t->read = 0; - /* DEBUG ONLY */ -/* memset(t->data, '*', size); */ -/* printk("Flip recycle %p\n", t); */ + tty->buf.memory_used += t->size; return t; } tbh = &((*tbh)->next); } /* Round the buffer size out */ size = (size + 0xFF) & ~ 0xFF; - return tty_buffer_alloc(size); + return tty_buffer_alloc(tty, size); /* Should possibly check if this fails for the largest buffer we have queued and recycle that ? */ } +/** + * tty_buffer_request_room - grow tty buffer if needed + * @tty: tty structure + * @size: size desired + * + * Make at least size bytes of linear space available for the tty + * buffer. If we fail return the size we managed to find. + * + * Locking: Takes tty->buf.lock + */ int tty_buffer_request_room(struct tty_struct *tty, size_t size) { struct tty_buffer *b, *n; @@ -347,6 +456,18 @@ int tty_buffer_request_room(struct tty_struct *tty, size_t size) } EXPORT_SYMBOL_GPL(tty_buffer_request_room); +/** + * tty_insert_flip_string - Add characters to the tty buffer + * @tty: tty structure + * @chars: characters + * @size: size + * + * Queue a series of bytes to the tty buffering. All the characters + * passed are marked as without error. Returns the number added. + * + * Locking: Called functions may take tty->buf.lock + */ + int tty_insert_flip_string(struct tty_struct *tty, const unsigned char *chars, size_t size) { @@ -370,6 +491,20 @@ int tty_insert_flip_string(struct tty_struct *tty, const unsigned char *chars, } EXPORT_SYMBOL(tty_insert_flip_string); +/** + * tty_insert_flip_string_flags - Add characters to the tty buffer + * @tty: tty structure + * @chars: characters + * @flags: flag bytes + * @size: size + * + * Queue a series of bytes to the tty buffering. For each character + * the flags array indicates the status of the character. Returns the + * number added. + * + * Locking: Called functions may take tty->buf.lock + */ + int tty_insert_flip_string_flags(struct tty_struct *tty, const unsigned char *chars, const char *flags, size_t size) { @@ -394,6 +529,17 @@ int tty_insert_flip_string_flags(struct tty_struct *tty, } EXPORT_SYMBOL(tty_insert_flip_string_flags); +/** + * tty_schedule_flip - push characters to ldisc + * @tty: tty to push from + * + * Takes any pending buffers and transfers their ownership to the + * ldisc side of the queue. It then schedules those characters for + * processing by the line discipline. + * + * Locking: Takes tty->buf.lock + */ + void tty_schedule_flip(struct tty_struct *tty) { unsigned long flags; @@ -405,12 +551,19 @@ void tty_schedule_flip(struct tty_struct *tty) } EXPORT_SYMBOL(tty_schedule_flip); -/* +/** + * tty_prepare_flip_string - make room for characters + * @tty: tty + * @chars: return pointer for character write area + * @size: desired size + * * Prepare a block of space in the buffer for data. Returns the length * available and buffer pointer to the space which is now allocated and * accounted for as ready for normal characters. This is used for drivers * that need their own block copy routines into the buffer. There is no * guarantee the buffer is a DMA target! + * + * Locking: May call functions taking tty->buf.lock */ int tty_prepare_flip_string(struct tty_struct *tty, unsigned char **chars, size_t size) @@ -427,12 +580,20 @@ int tty_prepare_flip_string(struct tty_struct *tty, unsigned char **chars, size_ EXPORT_SYMBOL_GPL(tty_prepare_flip_string); -/* +/** + * tty_prepare_flip_string_flags - make room for characters + * @tty: tty + * @chars: return pointer for character write area + * @flags: return pointer for status flag write area + * @size: desired size + * * Prepare a block of space in the buffer for data. Returns the length * available and buffer pointer to the space which is now allocated and * accounted for as ready for characters. This is used for drivers * that need their own block copy routines into the buffer. There is no * guarantee the buffer is a DMA target! + * + * Locking: May call functions taking tty->buf.lock */ int tty_prepare_flip_string_flags(struct tty_struct *tty, unsigned char **chars, char **flags, size_t size) @@ -451,10 +612,16 @@ EXPORT_SYMBOL_GPL(tty_prepare_flip_string_flags); -/* +/** + * tty_set_termios_ldisc - set ldisc field + * @tty: tty structure + * @num: line discipline number + * * This is probably overkill for real world processors but * they are not on hot paths so a little discipline won't do * any harm. + * + * Locking: takes termios_sem */ static void tty_set_termios_ldisc(struct tty_struct *tty, int num) @@ -474,6 +641,19 @@ static DEFINE_SPINLOCK(tty_ldisc_lock); static DECLARE_WAIT_QUEUE_HEAD(tty_ldisc_wait); static struct tty_ldisc tty_ldiscs[NR_LDISCS]; /* line disc dispatch table */ +/** + * tty_register_ldisc - install a line discipline + * @disc: ldisc number + * @new_ldisc: pointer to the ldisc object + * + * Installs a new line discipline into the kernel. The discipline + * is set up as unreferenced and then made available to the kernel + * from this point onwards. + * + * Locking: + * takes tty_ldisc_lock to guard against ldisc races + */ + int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) { unsigned long flags; @@ -493,6 +673,18 @@ int tty_register_ldisc(int disc, struct tty_ldisc *new_ldisc) } EXPORT_SYMBOL(tty_register_ldisc); +/** + * tty_unregister_ldisc - unload a line discipline + * @disc: ldisc number + * @new_ldisc: pointer to the ldisc object + * + * Remove a line discipline from the kernel providing it is not + * currently in use. + * + * Locking: + * takes tty_ldisc_lock to guard against ldisc races + */ + int tty_unregister_ldisc(int disc) { unsigned long flags; @@ -512,6 +704,19 @@ int tty_unregister_ldisc(int disc) } EXPORT_SYMBOL(tty_unregister_ldisc); +/** + * tty_ldisc_get - take a reference to an ldisc + * @disc: ldisc number + * + * Takes a reference to a line discipline. Deals with refcounts and + * module locking counts. Returns NULL if the discipline is not available. + * Returns a pointer to the discipline and bumps the ref count if it is + * available + * + * Locking: + * takes tty_ldisc_lock to guard against ldisc races + */ + struct tty_ldisc *tty_ldisc_get(int disc) { unsigned long flags; @@ -540,6 +745,17 @@ struct tty_ldisc *tty_ldisc_get(int disc) EXPORT_SYMBOL_GPL(tty_ldisc_get); +/** + * tty_ldisc_put - drop ldisc reference + * @disc: ldisc number + * + * Drop a reference to a line discipline. Manage refcounts and + * module usage counts + * + * Locking: + * takes tty_ldisc_lock to guard against ldisc races + */ + void tty_ldisc_put(int disc) { struct tty_ldisc *ld; @@ -557,6 +773,19 @@ void tty_ldisc_put(int disc) EXPORT_SYMBOL_GPL(tty_ldisc_put); +/** + * tty_ldisc_assign - set ldisc on a tty + * @tty: tty to assign + * @ld: line discipline + * + * Install an instance of a line discipline into a tty structure. The + * ldisc must have a reference count above zero to ensure it remains/ + * The tty instance refcount starts at zero. + * + * Locking: + * Caller must hold references + */ + static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld) { tty->ldisc = *ld; @@ -571,6 +800,8 @@ static void tty_ldisc_assign(struct tty_struct *tty, struct tty_ldisc *ld) * the tty ldisc. Return 0 on failure or 1 on success. This is * used to implement both the waiting and non waiting versions * of tty_ldisc_ref + * + * Locking: takes tty_ldisc_lock */ static int tty_ldisc_try(struct tty_struct *tty) @@ -602,6 +833,8 @@ static int tty_ldisc_try(struct tty_struct *tty) * must also be careful not to hold other locks that will deadlock * against a discipline change, such as an existing ldisc reference * (which we check for) + * + * Locking: call functions take tty_ldisc_lock */ struct tty_ldisc *tty_ldisc_ref_wait(struct tty_struct *tty) @@ -622,6 +855,8 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref_wait); * Dereference the line discipline for the terminal and take a * reference to it. If the line discipline is in flux then * return NULL. Can be called from IRQ and timer functions. + * + * Locking: called functions take tty_ldisc_lock */ struct tty_ldisc *tty_ldisc_ref(struct tty_struct *tty) @@ -639,6 +874,8 @@ EXPORT_SYMBOL_GPL(tty_ldisc_ref); * * Undoes the effect of tty_ldisc_ref or tty_ldisc_ref_wait. May * be called in IRQ context. + * + * Locking: takes tty_ldisc_lock */ void tty_ldisc_deref(struct tty_ldisc *ld) @@ -683,6 +920,9 @@ static void tty_ldisc_enable(struct tty_struct *tty) * * Set the discipline of a tty line. Must be called from a process * context. + * + * Locking: takes tty_ldisc_lock. + * called functions take termios_sem */ static int tty_set_ldisc(struct tty_struct *tty, int ldisc) @@ -846,9 +1086,17 @@ restart: return retval; } -/* - * This routine returns a tty driver structure, given a device number +/** + * get_tty_driver - find device of a tty + * @dev_t: device identifier + * @index: returns the index of the tty + * + * This routine returns a tty driver structure, given a device number + * and also passes back the index number. + * + * Locking: caller must hold tty_mutex */ + static struct tty_driver *get_tty_driver(dev_t device, int *index) { struct tty_driver *p; @@ -863,11 +1111,17 @@ static struct tty_driver *get_tty_driver(dev_t device, int *index) return NULL; } -/* - * If we try to write to, or set the state of, a terminal and we're - * not in the foreground, send a SIGTTOU. If the signal is blocked or - * ignored, go ahead and perform the operation. (POSIX 7.2) +/** + * tty_check_change - check for POSIX terminal changes + * @tty: tty to check + * + * If we try to write to, or set the state of, a terminal and we're + * not in the foreground, send a SIGTTOU. If the signal is blocked or + * ignored, go ahead and perform the operation. (POSIX 7.2) + * + * Locking: none */ + int tty_check_change(struct tty_struct * tty) { if (current->signal->tty != tty) @@ -1005,10 +1259,27 @@ void tty_ldisc_flush(struct tty_struct *tty) EXPORT_SYMBOL_GPL(tty_ldisc_flush); -/* - * This can be called by the "eventd" kernel thread. That is process synchronous, - * but doesn't hold any locks, so we need to make sure we have the appropriate - * locks for what we're doing.. +/** + * do_tty_hangup - actual handler for hangup events + * @data: tty device + * + * This can be called by the "eventd" kernel thread. That is process + * synchronous but doesn't hold any locks, so we need to make sure we + * have the appropriate locks for what we're doing. + * + * The hangup event clears any pending redirections onto the hung up + * device. It ensures future writes will error and it does the needed + * line discipline hangup and signal delivery. The tty object itself + * remains intact. + * + * Locking: + * BKL + * redirect lock for undoing redirection + * file list lock for manipulating list of ttys + * tty_ldisc_lock from called functions + * termios_sem resetting termios data + * tasklist_lock to walk task list for hangup event + * */ static void do_tty_hangup(void *data) { @@ -1133,6 +1404,14 @@ static void do_tty_hangup(void *data) fput(f); } +/** + * tty_hangup - trigger a hangup event + * @tty: tty to hangup + * + * A carrier loss (virtual or otherwise) has occurred on this like + * schedule a hangup sequence to run after this event. + */ + void tty_hangup(struct tty_struct * tty) { #ifdef TTY_DEBUG_HANGUP @@ -1145,6 +1424,15 @@ void tty_hangup(struct tty_struct * tty) EXPORT_SYMBOL(tty_hangup); +/** + * tty_vhangup - process vhangup + * @tty: tty to hangup + * + * The user has asked via system call for the terminal to be hung up. + * We do this synchronously so that when the syscall returns the process + * is complete. That guarantee is neccessary for security reasons. + */ + void tty_vhangup(struct tty_struct * tty) { #ifdef TTY_DEBUG_HANGUP @@ -1156,6 +1444,14 @@ void tty_vhangup(struct tty_struct * tty) } EXPORT_SYMBOL(tty_vhangup); +/** + * tty_hung_up_p - was tty hung up + * @filp: file pointer of tty + * + * Return true if the tty has been subject to a vhangup or a carrier + * loss + */ + int tty_hung_up_p(struct file * filp) { return (filp->f_op == &hung_up_tty_fops); @@ -1163,19 +1459,28 @@ int tty_hung_up_p(struct file * filp) EXPORT_SYMBOL(tty_hung_up_p); -/* - * This function is typically called only by the session leader, when - * it wants to disassociate itself from its controlling tty. +/** + * disassociate_ctty - disconnect controlling tty + * @on_exit: true if exiting so need to "hang up" the session + * + * This function is typically called only by the session leader, when + * it wants to disassociate itself from its controlling tty. * - * It performs the following functions: + * It performs the following functions: * (1) Sends a SIGHUP and SIGCONT to the foreground process group * (2) Clears the tty from being controlling the session * (3) Clears the controlling tty for all processes in the * session group. * - * The argument on_exit is set to 1 if called when a process is - * exiting; it is 0 if called by the ioctl TIOCNOTTY. + * The argument on_exit is set to 1 if called when a process is + * exiting; it is 0 if called by the ioctl TIOCNOTTY. + * + * Locking: tty_mutex is taken to protect current->signal->tty + * BKL is taken for hysterical raisins + * Tasklist lock is taken (under tty_mutex) to walk process + * lists for the session. */ + void disassociate_ctty(int on_exit) { struct tty_struct *tty; @@ -1222,6 +1527,25 @@ void disassociate_ctty(int on_exit) unlock_kernel(); } + +/** + * stop_tty - propogate flow control + * @tty: tty to stop + * + * Perform flow control to the driver. For PTY/TTY pairs we + * must also propogate the TIOCKPKT status. May be called + * on an already stopped device and will not re-call the driver + * method. + * + * This functionality is used by both the line disciplines for + * halting incoming flow and by the driver. It may therefore be + * called from any context, may be under the tty atomic_write_lock + * but not always. + * + * Locking: + * Broken. Relies on BKL which is unsafe here. + */ + void stop_tty(struct tty_struct *tty) { if (tty->stopped) @@ -1238,6 +1562,19 @@ void stop_tty(struct tty_struct *tty) EXPORT_SYMBOL(stop_tty); +/** + * start_tty - propogate flow control + * @tty: tty to start + * + * Start a tty that has been stopped if at all possible. Perform + * any neccessary wakeups and propogate the TIOCPKT status. If this + * is the tty was previous stopped and is being started then the + * driver start method is invoked and the line discipline woken. + * + * Locking: + * Broken. Relies on BKL which is unsafe here. + */ + void start_tty(struct tty_struct *tty) { if (!tty->stopped || tty->flow_stopped) @@ -1258,6 +1595,23 @@ void start_tty(struct tty_struct *tty) EXPORT_SYMBOL(start_tty); +/** + * tty_read - read method for tty device files + * @file: pointer to tty file + * @buf: user buffer + * @count: size of user buffer + * @ppos: unused + * + * Perform the read system call function on this terminal device. Checks + * for hung up devices before calling the line discipline method. + * + * Locking: + * Locks the line discipline internally while needed + * For historical reasons the line discipline read method is + * invoked under the BKL. This will go away in time so do not rely on it + * in new code. Multiple read calls may be outstanding in parallel. + */ + static ssize_t tty_read(struct file * file, char __user * buf, size_t count, loff_t *ppos) { @@ -1302,6 +1656,7 @@ static inline ssize_t do_tty_write( ssize_t ret = 0, written = 0; unsigned int chunk; + /* FIXME: O_NDELAY ... */ if (mutex_lock_interruptible(&tty->atomic_write_lock)) { return -ERESTARTSYS; } @@ -1318,6 +1673,9 @@ static inline ssize_t do_tty_write( * layer has problems with bigger chunks. It will * claim to be able to handle more characters than * it actually does. + * + * FIXME: This can probably go away now except that 64K chunks + * are too likely to fail unless switched to vmalloc... */ chunk = 2048; if (test_bit(TTY_NO_WRITE_SPLIT, &tty->flags)) @@ -1375,6 +1733,24 @@ static inline ssize_t do_tty_write( } +/** + * tty_write - write method for tty device file + * @file: tty file pointer + * @buf: user data to write + * @count: bytes to write + * @ppos: unused + * + * Write data to a tty device via the line discipline. + * + * Locking: + * Locks the line discipline as required + * Writes to the tty driver are serialized by the atomic_write_lock + * and are then processed in chunks to the device. The line discipline + * write method will not be involked in parallel for each device + * The line discipline write method is called under the big + * kernel lock for historical reasons. New code should not rely on this. + */ + static ssize_t tty_write(struct file * file, const char __user * buf, size_t count, loff_t *ppos) { @@ -1422,7 +1798,18 @@ ssize_t redirected_tty_write(struct file * file, const char __user * buf, size_t static char ptychar[] = "pqrstuvwxyzabcde"; -static inline void pty_line_name(struct tty_driver *driver, int index, char *p) +/** + * pty_line_name - generate name for a pty + * @driver: the tty driver in use + * @index: the minor number + * @p: output buffer of at least 6 bytes + * + * Generate a name from a driver reference and write it to the output + * buffer. + * + * Locking: None + */ +static void pty_line_name(struct tty_driver *driver, int index, char *p) { int i = index + driver->name_base; /* ->name is initialized to "ttyp", but "tty" is expected */ @@ -1431,24 +1818,53 @@ static inline void pty_line_name(struct tty_driver *driver, int index, char *p) ptychar[i >> 4 & 0xf], i & 0xf); } -static inline void tty_line_name(struct tty_driver *driver, int index, char *p) +/** + * pty_line_name - generate name for a tty + * @driver: the tty driver in use + * @index: the minor number + * @p: output buffer of at least 7 bytes + * + * Generate a name from a driver reference and write it to the output + * buffer. + * + * Locking: None + */ +static void tty_line_name(struct tty_driver *driver, int index, char *p) { sprintf(p, "%s%d", driver->name, index + driver->name_base); } -/* +/** + * init_dev - initialise a tty device + * @driver: tty driver we are opening a device on + * @idx: device index + * @tty: returned tty structure + * + * Prepare a tty device. This may not be a "new" clean device but + * could also be an active device. The pty drivers require special + * handling because of this. + * + * Locking: + * The function is called under the tty_mutex, which + * protects us from the tty struct or driver itself going away. + * + * On exit the tty device has the line discipline attached and + * a reference count of 1. If a pair was created for pty/tty use + * and the other was a pty master then it too has a reference count of 1. + * * WSH 06/09/97: Rewritten to remove races and properly clean up after a * failed open. The new code protects the open with a mutex, so it's * really quite straightforward. The mutex locking can probably be * relaxed for the (most common) case of reopening a tty. */ + static int init_dev(struct tty_driver *driver, int idx, struct tty_struct **ret_tty) { struct tty_struct *tty, *o_tty; struct termios *tp, **tp_loc, *o_tp, **o_tp_loc; struct termios *ltp, **ltp_loc, *o_ltp, **o_ltp_loc; - int retval=0; + int retval = 0; /* check whether we're reopening an existing tty */ if (driver->flags & TTY_DRIVER_DEVPTS_MEM) { @@ -1662,10 +2078,20 @@ release_mem_out: goto end_init; } -/* - * Releases memory associated with a tty structure, and clears out the - * driver table slots. +/** + * release_mem - release tty structure memory + * + * Releases memory associated with a tty structure, and clears out the + * driver table slots. This function is called when a device is no longer + * in use. It also gets called when setup of a device fails. + * + * Locking: + * tty_mutex - sometimes only + * takes the file list lock internally when working on the list + * of ttys that the driver keeps. + * FIXME: should we require tty_mutex is held here ?? */ + static void release_mem(struct tty_struct *tty, int idx) { struct tty_struct *o_tty; @@ -2006,18 +2432,27 @@ static void release_dev(struct file * filp) } -/* - * tty_open and tty_release keep up the tty count that contains the - * number of opens done on a tty. We cannot use the inode-count, as - * different inodes might point to the same tty. +/** + * tty_open - open a tty device + * @inode: inode of device file + * @filp: file pointer to tty + * + * tty_open and tty_release keep up the tty count that contains the + * number of opens done on a tty. We cannot use the inode-count, as + * different inodes might point to the same tty. * - * Open-counting is needed for pty masters, as well as for keeping - * track of serial lines: DTR is dropped when the last close happens. - * (This is not done solely through tty->count, now. - Ted 1/27/92) + * Open-counting is needed for pty masters, as well as for keeping + * track of serial lines: DTR is dropped when the last close happens. + * (This is not done solely through tty->count, now. - Ted 1/27/92) * - * The termios state of a pty is reset on first open so that - * settings don't persist across reuse. + * The termios state of a pty is reset on first open so that + * settings don't persist across reuse. + * + * Locking: tty_mutex protects current->signal->tty, get_tty_driver and + * init_dev work. tty->count should protect the rest. + * task_lock is held to update task details for sessions */ + static int tty_open(struct inode * inode, struct file * filp) { struct tty_struct *tty; @@ -2132,6 +2567,18 @@ got_driver: } #ifdef CONFIG_UNIX98_PTYS +/** + * ptmx_open - open a unix 98 pty master + * @inode: inode of device file + * @filp: file pointer to tty + * + * Allocate a unix98 pty master device from the ptmx driver. + * + * Locking: tty_mutex protects theinit_dev work. tty->count should + protect the rest. + * allocated_ptys_lock handles the list of free pty numbers + */ + static int ptmx_open(struct inode * inode, struct file * filp) { struct tty_struct *tty; @@ -2191,6 +2638,18 @@ out: } #endif +/** + * tty_release - vfs callback for close + * @inode: inode of tty + * @filp: file pointer for handle to tty + * + * Called the last time each file handle is closed that references + * this tty. There may however be several such references. + * + * Locking: + * Takes bkl. See release_dev + */ + static int tty_release(struct inode * inode, struct file * filp) { lock_kernel(); @@ -2199,7 +2658,18 @@ static int tty_release(struct inode * inode, struct file * filp) return 0; } -/* No kernel lock held - fine */ +/** + * tty_poll - check tty status + * @filp: file being polled + * @wait: poll wait structures to update + * + * Call the line discipline polling method to obtain the poll + * status of the device. + * + * Locking: locks called line discipline but ldisc poll method + * may be re-entered freely by other callers. + */ + static unsigned int tty_poll(struct file * filp, poll_table * wait) { struct tty_struct * tty; @@ -2243,6 +2713,21 @@ static int tty_fasync(int fd, struct file * filp, int on) return 0; } +/** + * tiocsti - fake input character + * @tty: tty to fake input into + * @p: pointer to character + * + * Fake input to a tty device. Does the neccessary locking and + * input management. + * + * FIXME: does not honour flow control ?? + * + * Locking: + * Called functions take tty_ldisc_lock + * current->signal->tty check is safe without locks + */ + static int tiocsti(struct tty_struct *tty, char __user *p) { char ch, mbz = 0; @@ -2258,6 +2743,18 @@ static int tiocsti(struct tty_struct *tty, char __user *p) return 0; } +/** + * tiocgwinsz - implement window query ioctl + * @tty; tty + * @arg: user buffer for result + * + * Copies the kernel idea of the window size into the user buffer. No + * locking is done. + * + * FIXME: Returning random values racing a window size set is wrong + * should lock here against that + */ + static int tiocgwinsz(struct tty_struct *tty, struct winsize __user * arg) { if (copy_to_user(arg, &tty->winsize, sizeof(*arg))) @@ -2265,6 +2762,24 @@ static int tiocgwinsz(struct tty_struct *tty, struct winsize __user * arg) return 0; } +/** + * tiocswinsz - implement window size set ioctl + * @tty; tty + * @arg: user buffer for result + * + * Copies the user idea of the window size to the kernel. Traditionally + * this is just advisory information but for the Linux console it + * actually has driver level meaning and triggers a VC resize. + * + * Locking: + * The console_sem is used to ensure we do not try and resize + * the console twice at once. + * FIXME: Two racing size sets may leave the console and kernel + * parameters disagreeing. Is this exploitable ? + * FIXME: Random values racing a window size get is wrong + * should lock here against that + */ + static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty, struct winsize __user * arg) { @@ -2294,6 +2809,15 @@ static int tiocswinsz(struct tty_struct *tty, struct tty_struct *real_tty, return 0; } +/** + * tioccons - allow admin to move logical console + * @file: the file to become console + * + * Allow the adminstrator to move the redirected console device + * + * Locking: uses redirect_lock to guard the redirect information + */ + static int tioccons(struct file *file) { if (!capable(CAP_SYS_ADMIN)) @@ -2319,6 +2843,17 @@ static int tioccons(struct file *file) return 0; } +/** + * fionbio - non blocking ioctl + * @file: file to set blocking value + * @p: user parameter + * + * Historical tty interfaces had a blocking control ioctl before + * the generic functionality existed. This piece of history is preserved + * in the expected tty API of posix OS's. + * + * Locking: none, the open fle handle ensures it won't go away. + */ static int fionbio(struct file *file, int __user *p) { @@ -2334,6 +2869,23 @@ static int fionbio(struct file *file, int __user *p) return 0; } +/** + * tiocsctty - set controlling tty + * @tty: tty structure + * @arg: user argument + * + * This ioctl is used to manage job control. It permits a session + * leader to set this tty as the controlling tty for the session. + * + * Locking: + * Takes tasklist lock internally to walk sessions + * Takes task_lock() when updating signal->tty + * + * FIXME: tty_mutex is needed to protect signal->tty references. + * FIXME: why task_lock on the signal->tty reference ?? + * + */ + static int tiocsctty(struct tty_struct *tty, int arg) { struct task_struct *p; @@ -2374,6 +2926,18 @@ static int tiocsctty(struct tty_struct *tty, int arg) return 0; } +/** + * tiocgpgrp - get process group + * @tty: tty passed by user + * @real_tty: tty side of the tty pased by the user if a pty else the tty + * @p: returned pid + * + * Obtain the process group of the tty. If there is no process group + * return an error. + * + * Locking: none. Reference to ->signal->tty is safe. + */ + static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { /* @@ -2385,6 +2949,20 @@ static int tiocgpgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t return put_user(real_tty->pgrp, p); } +/** + * tiocspgrp - attempt to set process group + * @tty: tty passed by user + * @real_tty: tty side device matching tty passed by user + * @p: pid pointer + * + * Set the process group of the tty to the session passed. Only + * permitted where the tty session is our session. + * + * Locking: None + * + * FIXME: current->signal->tty referencing is unsafe. + */ + static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { pid_t pgrp; @@ -2408,6 +2986,18 @@ static int tiocspgrp(struct tty_struct *tty, struct tty_struct *real_tty, pid_t return 0; } +/** + * tiocgsid - get session id + * @tty: tty passed by user + * @real_tty: tty side of the tty pased by the user if a pty else the tty + * @p: pointer to returned session id + * + * Obtain the session id of the tty. If there is no session + * return an error. + * + * Locking: none. Reference to ->signal->tty is safe. + */ + static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t __user *p) { /* @@ -2421,6 +3011,16 @@ static int tiocgsid(struct tty_struct *tty, struct tty_struct *real_tty, pid_t _ return put_user(real_tty->session, p); } +/** + * tiocsetd - set line discipline + * @tty: tty device + * @p: pointer to user data + * + * Set the line discipline according to user request. + * + * Locking: see tty_set_ldisc, this function is just a helper + */ + static int tiocsetd(struct tty_struct *tty, int __user *p) { int ldisc; @@ -2430,6 +3030,21 @@ static int tiocsetd(struct tty_struct *tty, int __user *p) return tty_set_ldisc(tty, ldisc); } +/** + * send_break - performed time break + * @tty: device to break on + * @duration: timeout in mS + * + * Perform a timed break on hardware that lacks its own driver level + * timed break functionality. + * + * Locking: + * None + * + * FIXME: + * What if two overlap + */ + static int send_break(struct tty_struct *tty, unsigned int duration) { tty->driver->break_ctl(tty, -1); @@ -2442,8 +3057,19 @@ static int send_break(struct tty_struct *tty, unsigned int duration) return 0; } -static int -tty_tiocmget(struct tty_struct *tty, struct file *file, int __user *p) +/** + * tiocmget - get modem status + * @tty: tty device + * @file: user file pointer + * @p: pointer to result + * + * Obtain the modem status bits from the tty driver if the feature + * is supported. Return -EINVAL if it is not available. + * + * Locking: none (up to the driver) + */ + +static int tty_tiocmget(struct tty_struct *tty, struct file *file, int __user *p) { int retval = -EINVAL; @@ -2456,8 +3082,20 @@ tty_tiocmget(struct tty_struct *tty, struct file *file, int __user *p) return retval; } -static int -tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int cmd, +/** + * tiocmset - set modem status + * @tty: tty device + * @file: user file pointer + * @cmd: command - clear bits, set bits or set all + * @p: pointer to desired bits + * + * Set the modem status bits from the tty driver if the feature + * is supported. Return -EINVAL if it is not available. + * + * Locking: none (up to the driver) + */ + +static int tty_tiocmset(struct tty_struct *tty, struct file *file, unsigned int cmd, unsigned __user *p) { int retval = -EINVAL; @@ -2573,6 +3211,7 @@ int tty_ioctl(struct inode * inode, struct file * file, clear_bit(TTY_EXCLUSIVE, &tty->flags); return 0; case TIOCNOTTY: + /* FIXME: taks lock or tty_mutex ? */ if (current->signal->tty != tty) return -ENOTTY; if (current->signal->leader) @@ -2753,9 +3392,16 @@ void do_SAK(struct tty_struct *tty) EXPORT_SYMBOL(do_SAK); -/* - * This routine is called out of the software interrupt to flush data - * from the buffer chain to the line discipline. +/** + * flush_to_ldisc + * @private_: tty structure passed from work queue. + * + * This routine is called out of the software interrupt to flush data + * from the buffer chain to the line discipline. + * + * Locking: holds tty->buf.lock to guard buffer list. Drops the lock + * while invoking the line discipline receive_buf method. The + * receive_buf method is single threaded for each tty instance. */ static void flush_to_ldisc(void *private_) @@ -2831,6 +3477,8 @@ static int n_baud_table = ARRAY_SIZE(baud_table); * Convert termios baud rate data into a speed. This should be called * with the termios lock held if this termios is a terminal termios * structure. May change the termios data. + * + * Locking: none */ int tty_termios_baud_rate(struct termios *termios) @@ -2859,6 +3507,8 @@ EXPORT_SYMBOL(tty_termios_baud_rate); * Returns the baud rate as an integer for this terminal. The * termios lock must be held by the caller and the terminal bit * flags may be updated. + * + * Locking: none */ int tty_get_baud_rate(struct tty_struct *tty) @@ -2888,6 +3538,8 @@ EXPORT_SYMBOL(tty_get_baud_rate); * * In the event of the queue being busy for flipping the work will be * held off and retried later. + * + * Locking: tty buffer lock. Driver locks in low latency mode. */ void tty_flip_buffer_push(struct tty_struct *tty) @@ -2907,9 +3559,16 @@ void tty_flip_buffer_push(struct tty_struct *tty) EXPORT_SYMBOL(tty_flip_buffer_push); -/* - * This subroutine initializes a tty structure. +/** + * initialize_tty_struct + * @tty: tty to initialize + * + * This subroutine initializes a tty structure that has been newly + * allocated. + * + * Locking: none - tty in question must not be exposed at this point */ + static void initialize_tty_struct(struct tty_struct *tty) { memset(tty, 0, sizeof(struct tty_struct)); @@ -2935,6 +3594,7 @@ static void initialize_tty_struct(struct tty_struct *tty) /* * The default put_char routine if the driver did not define one. */ + static void tty_default_put_char(struct tty_struct *tty, unsigned char ch) { tty->driver->write(tty, &ch, 1); @@ -2943,19 +3603,23 @@ static void tty_default_put_char(struct tty_struct *tty, unsigned char ch) static struct class *tty_class; /** - * tty_register_device - register a tty device - * @driver: the tty driver that describes the tty device - * @index: the index in the tty driver for this tty device - * @device: a struct device that is associated with this tty device. - * This field is optional, if there is no known struct device for this - * tty device it can be set to NULL safely. + * tty_register_device - register a tty device + * @driver: the tty driver that describes the tty device + * @index: the index in the tty driver for this tty device + * @device: a struct device that is associated with this tty device. + * This field is optional, if there is no known struct device + * for this tty device it can be set to NULL safely. * - * Returns a pointer to the class device (or ERR_PTR(-EFOO) on error). + * Returns a pointer to the class device (or ERR_PTR(-EFOO) on error). * - * This call is required to be made to register an individual tty device if - * the tty driver's flags have the TTY_DRIVER_DYNAMIC_DEV bit set. If that - * bit is not set, this function should not be called by a tty driver. + * This call is required to be made to register an individual tty device + * if the tty driver's flags have the TTY_DRIVER_DYNAMIC_DEV bit set. If + * that bit is not set, this function should not be called by a tty + * driver. + * + * Locking: ?? */ + struct class_device *tty_register_device(struct tty_driver *driver, unsigned index, struct device *device) { @@ -2977,13 +3641,16 @@ struct class_device *tty_register_device(struct tty_driver *driver, } /** - * tty_unregister_device - unregister a tty device - * @driver: the tty driver that describes the tty device - * @index: the index in the tty driver for this tty device + * tty_unregister_device - unregister a tty device + * @driver: the tty driver that describes the tty device + * @index: the index in the tty driver for this tty device * - * If a tty device is registered with a call to tty_register_device() then - * this function must be made when the tty device is gone. + * If a tty device is registered with a call to tty_register_device() then + * this function must be called when the tty device is gone. + * + * Locking: ?? */ + void tty_unregister_device(struct tty_driver *driver, unsigned index) { class_device_destroy(tty_class, MKDEV(driver->major, driver->minor_start) + index); @@ -3094,7 +3761,6 @@ int tty_register_driver(struct tty_driver *driver) driver->cdev.owner = driver->owner; error = cdev_add(&driver->cdev, dev, driver->num); if (error) { - cdev_del(&driver->cdev); unregister_chrdev_region(dev, driver->num); driver->ttys = NULL; driver->termios = driver->termios_locked = NULL; diff --git a/drivers/char/tty_ioctl.c b/drivers/char/tty_ioctl.c index f19cf9d7792..4ad47d321bd 100644 --- a/drivers/char/tty_ioctl.c +++ b/drivers/char/tty_ioctl.c @@ -36,6 +36,18 @@ #define TERMIOS_WAIT 2 #define TERMIOS_TERMIO 4 + +/** + * tty_wait_until_sent - wait for I/O to finish + * @tty: tty we are waiting for + * @timeout: how long we will wait + * + * Wait for characters pending in a tty driver to hit the wire, or + * for a timeout to occur (eg due to flow control) + * + * Locking: none + */ + void tty_wait_until_sent(struct tty_struct * tty, long timeout) { DECLARE_WAITQUEUE(wait, current); @@ -94,6 +106,18 @@ static void unset_locked_termios(struct termios *termios, old->c_cc[i] : termios->c_cc[i]; } +/** + * change_termios - update termios values + * @tty: tty to update + * @new_termios: desired new value + * + * Perform updates to the termios values set on this terminal. There + * is a bit of layering violation here with n_tty in terms of the + * internal knowledge of this function. + * + * Locking: termios_sem + */ + static void change_termios(struct tty_struct * tty, struct termios * new_termios) { int canon_change; @@ -155,6 +179,19 @@ static void change_termios(struct tty_struct * tty, struct termios * new_termios up(&tty->termios_sem); } +/** + * set_termios - set termios values for a tty + * @tty: terminal device + * @arg: user data + * @opt: option information + * + * Helper function to prepare termios data and run neccessary other + * functions before using change_termios to do the actual changes. + * + * Locking: + * Called functions take ldisc and termios_sem locks + */ + static int set_termios(struct tty_struct * tty, void __user *arg, int opt) { struct termios tmp_termios; @@ -284,6 +321,17 @@ static void set_sgflags(struct termios * termios, int flags) } } +/** + * set_sgttyb - set legacy terminal values + * @tty: tty structure + * @sgttyb: pointer to old style terminal structure + * + * Updates a terminal from the legacy BSD style terminal information + * structure. + * + * Locking: termios_sem + */ + static int set_sgttyb(struct tty_struct * tty, struct sgttyb __user * sgttyb) { int retval; @@ -369,9 +417,16 @@ static int set_ltchars(struct tty_struct * tty, struct ltchars __user * ltchars) } #endif -/* - * Send a high priority character to the tty. +/** + * send_prio_char - send priority character + * + * Send a high priority character to the tty even if stopped + * + * Locking: none + * + * FIXME: overlapping calls with start/stop tty lose state of tty */ + static void send_prio_char(struct tty_struct *tty, char ch) { int was_stopped = tty->stopped; diff --git a/drivers/char/vt_ioctl.c b/drivers/char/vt_ioctl.c index eccffaf26fa..a5628a8b662 100644 --- a/drivers/char/vt_ioctl.c +++ b/drivers/char/vt_ioctl.c @@ -1011,6 +1011,8 @@ int vt_ioctl(struct tty_struct *tty, struct file * file, return -EPERM; vt_dont_switch = 0; return 0; + case VT_GETHIFONTMASK: + return put_user(vc->vc_hi_font_mask, (unsigned short __user *)arg); default: return -ENOIOCTLCMD; } diff --git a/drivers/ieee1394/ohci1394.c b/drivers/ieee1394/ohci1394.c index d4bad6704bb..448df277337 100644 --- a/drivers/ieee1394/ohci1394.c +++ b/drivers/ieee1394/ohci1394.c @@ -3552,6 +3552,8 @@ static int ohci1394_pci_resume (struct pci_dev *pdev) static int ohci1394_pci_suspend (struct pci_dev *pdev, pm_message_t state) { + pci_save_state(pdev); + #ifdef CONFIG_PPC_PMAC if (machine_is(powermac)) { struct device_node *of_node; @@ -3563,8 +3565,6 @@ static int ohci1394_pci_suspend (struct pci_dev *pdev, pm_message_t state) } #endif - pci_save_state(pdev); - return 0; } diff --git a/drivers/md/dm-raid1.c b/drivers/md/dm-raid1.c index be48cedf986..c54de989eb0 100644 --- a/drivers/md/dm-raid1.c +++ b/drivers/md/dm-raid1.c @@ -255,7 +255,9 @@ static struct region *__rh_alloc(struct region_hash *rh, region_t region) struct region *reg, *nreg; read_unlock(&rh->hash_lock); - nreg = mempool_alloc(rh->region_pool, GFP_NOIO); + nreg = mempool_alloc(rh->region_pool, GFP_ATOMIC); + if (unlikely(!nreg)) + nreg = kmalloc(sizeof(struct region), GFP_NOIO); nreg->state = rh->log->type->in_sync(rh->log, region, 1) ? RH_CLEAN : RH_NOSYNC; nreg->rh = rh; diff --git a/drivers/md/md.c b/drivers/md/md.c index b6d16022a53..8dbab2ef388 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -1597,6 +1597,19 @@ void md_update_sb(mddev_t * mddev) repeat: spin_lock_irq(&mddev->write_lock); + + if (mddev->degraded && mddev->sb_dirty == 3) + /* If the array is degraded, then skipping spares is both + * dangerous and fairly pointless. + * Dangerous because a device that was removed from the array + * might have a event_count that still looks up-to-date, + * so it can be re-added without a resync. + * Pointless because if there are any spares to skip, + * then a recovery will happen and soon that array won't + * be degraded any more and the spare can go back to sleep then. + */ + mddev->sb_dirty = 1; + sync_req = mddev->in_sync; mddev->utime = get_seconds(); if (mddev->sb_dirty == 3) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1efe22a2d04..87bfe9e7d8c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1625,15 +1625,16 @@ static sector_t sync_request(mddev_t *mddev, sector_t sector_nr, int *skipped, i return 0; } - /* before building a request, check if we can skip these blocks.. - * This call the bitmap_start_sync doesn't actually record anything - */ if (mddev->bitmap == NULL && mddev->recovery_cp == MaxSector && + !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery) && conf->fullsync == 0) { *skipped = 1; return max_sector - sector_nr; } + /* before building a request, check if we can skip these blocks.. + * This call the bitmap_start_sync doesn't actually record anything + */ if (!bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, 1) && !conf->fullsync && !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) { /* We can skip this block, and probably several more */ diff --git a/drivers/mtd/nand/ams-delta.c b/drivers/mtd/nand/ams-delta.c index d7897dc6b3c..a0ba07c36ee 100644 --- a/drivers/mtd/nand/ams-delta.c +++ b/drivers/mtd/nand/ams-delta.c @@ -130,11 +130,13 @@ static void ams_delta_hwcontrol(struct mtd_info *mtd, int cmd, if (ctrl & NAND_CTRL_CHANGE) { unsigned long bits; - bits = (~ctrl & NAND_NCE) << 2; - bits |= (ctrl & NAND_CLE) << 7; - bits |= (ctrl & NAND_ALE) << 6; + bits = (~ctrl & NAND_NCE) ? AMS_DELTA_LATCH2_NAND_NCE : 0; + bits |= (ctrl & NAND_CLE) ? AMS_DELTA_LATCH2_NAND_CLE : 0; + bits |= (ctrl & NAND_ALE) ? AMS_DELTA_LATCH2_NAND_ALE : 0; - ams_delta_latch2_write(0xC2, bits); + ams_delta_latch2_write(AMS_DELTA_LATCH2_NAND_CLE | + AMS_DELTA_LATCH2_NAND_ALE | + AMS_DELTA_LATCH2_NAND_NCE, bits); } if (cmd != NAND_CMD_NONE) diff --git a/drivers/mtd/nand/nand_base.c b/drivers/mtd/nand/nand_base.c index 62b861304e0..c8cbc00243f 100644 --- a/drivers/mtd/nand/nand_base.c +++ b/drivers/mtd/nand/nand_base.c @@ -1093,9 +1093,10 @@ static int nand_read(struct mtd_info *mtd, loff_t from, size_t len, ret = nand_do_read_ops(mtd, from, &chip->ops); + *retlen = chip->ops.retlen; + nand_release_device(mtd); - *retlen = chip->ops.retlen; return ret; } @@ -1691,9 +1692,10 @@ static int nand_write(struct mtd_info *mtd, loff_t to, size_t len, ret = nand_do_write_ops(mtd, to, &chip->ops); + *retlen = chip->ops.retlen; + nand_release_device(mtd); - *retlen = chip->ops.retlen; return ret; } diff --git a/drivers/rtc/rtc-s3c.c b/drivers/rtc/rtc-s3c.c index d6d1bff52b8..2c7de79c83b 100644 --- a/drivers/rtc/rtc-s3c.c +++ b/drivers/rtc/rtc-s3c.c @@ -69,12 +69,12 @@ static void s3c_rtc_setaie(int to) pr_debug("%s: aie=%d\n", __FUNCTION__, to); - tmp = readb(S3C2410_RTCALM) & ~S3C2410_RTCALM_ALMEN; + tmp = readb(s3c_rtc_base + S3C2410_RTCALM) & ~S3C2410_RTCALM_ALMEN; if (to) tmp |= S3C2410_RTCALM_ALMEN; - writeb(tmp, S3C2410_RTCALM); + writeb(tmp, s3c_rtc_base + S3C2410_RTCALM); } static void s3c_rtc_setpie(int to) @@ -84,12 +84,12 @@ static void s3c_rtc_setpie(int to) pr_debug("%s: pie=%d\n", __FUNCTION__, to); spin_lock_irq(&s3c_rtc_pie_lock); - tmp = readb(S3C2410_TICNT) & ~S3C2410_TICNT_ENABLE; + tmp = readb(s3c_rtc_base + S3C2410_TICNT) & ~S3C2410_TICNT_ENABLE; if (to) tmp |= S3C2410_TICNT_ENABLE; - writeb(tmp, S3C2410_TICNT); + writeb(tmp, s3c_rtc_base + S3C2410_TICNT); spin_unlock_irq(&s3c_rtc_pie_lock); } @@ -98,13 +98,13 @@ static void s3c_rtc_setfreq(int freq) unsigned int tmp; spin_lock_irq(&s3c_rtc_pie_lock); - tmp = readb(S3C2410_TICNT) & S3C2410_TICNT_ENABLE; + tmp = readb(s3c_rtc_base + S3C2410_TICNT) & S3C2410_TICNT_ENABLE; s3c_rtc_freq = freq; tmp |= (128 / freq)-1; - writeb(tmp, S3C2410_TICNT); + writeb(tmp, s3c_rtc_base + S3C2410_TICNT); spin_unlock_irq(&s3c_rtc_pie_lock); } @@ -113,14 +113,15 @@ static void s3c_rtc_setfreq(int freq) static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm) { unsigned int have_retried = 0; + void __iomem *base = s3c_rtc_base; retry_get_time: - rtc_tm->tm_min = readb(S3C2410_RTCMIN); - rtc_tm->tm_hour = readb(S3C2410_RTCHOUR); - rtc_tm->tm_mday = readb(S3C2410_RTCDATE); - rtc_tm->tm_mon = readb(S3C2410_RTCMON); - rtc_tm->tm_year = readb(S3C2410_RTCYEAR); - rtc_tm->tm_sec = readb(S3C2410_RTCSEC); + rtc_tm->tm_min = readb(base + S3C2410_RTCMIN); + rtc_tm->tm_hour = readb(base + S3C2410_RTCHOUR); + rtc_tm->tm_mday = readb(base + S3C2410_RTCDATE); + rtc_tm->tm_mon = readb(base + S3C2410_RTCMON); + rtc_tm->tm_year = readb(base + S3C2410_RTCYEAR); + rtc_tm->tm_sec = readb(base + S3C2410_RTCSEC); /* the only way to work out wether the system was mid-update * when we read it is to check the second counter, and if it @@ -151,17 +152,26 @@ static int s3c_rtc_gettime(struct device *dev, struct rtc_time *rtc_tm) static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm) { - /* the rtc gets round the y2k problem by just not supporting it */ + void __iomem *base = s3c_rtc_base; + int year = tm->tm_year - 100; - if (tm->tm_year < 100) + pr_debug("set time %02d.%02d.%02d %02d/%02d/%02d\n", + tm->tm_year, tm->tm_mon, tm->tm_mday, + tm->tm_hour, tm->tm_min, tm->tm_sec); + + /* we get around y2k by simply not supporting it */ + + if (year < 0 || year >= 100) { + dev_err(dev, "rtc only supports 100 years\n"); return -EINVAL; + } - writeb(BIN2BCD(tm->tm_sec), S3C2410_RTCSEC); - writeb(BIN2BCD(tm->tm_min), S3C2410_RTCMIN); - writeb(BIN2BCD(tm->tm_hour), S3C2410_RTCHOUR); - writeb(BIN2BCD(tm->tm_mday), S3C2410_RTCDATE); - writeb(BIN2BCD(tm->tm_mon + 1), S3C2410_RTCMON); - writeb(BIN2BCD(tm->tm_year - 100), S3C2410_RTCYEAR); + writeb(BIN2BCD(tm->tm_sec), base + S3C2410_RTCSEC); + writeb(BIN2BCD(tm->tm_min), base + S3C2410_RTCMIN); + writeb(BIN2BCD(tm->tm_hour), base + S3C2410_RTCHOUR); + writeb(BIN2BCD(tm->tm_mday), base + S3C2410_RTCDATE); + writeb(BIN2BCD(tm->tm_mon + 1), base + S3C2410_RTCMON); + writeb(BIN2BCD(year), base + S3C2410_RTCYEAR); return 0; } @@ -169,16 +179,17 @@ static int s3c_rtc_settime(struct device *dev, struct rtc_time *tm) static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm) { struct rtc_time *alm_tm = &alrm->time; + void __iomem *base = s3c_rtc_base; unsigned int alm_en; - alm_tm->tm_sec = readb(S3C2410_ALMSEC); - alm_tm->tm_min = readb(S3C2410_ALMMIN); - alm_tm->tm_hour = readb(S3C2410_ALMHOUR); - alm_tm->tm_mon = readb(S3C2410_ALMMON); - alm_tm->tm_mday = readb(S3C2410_ALMDATE); - alm_tm->tm_year = readb(S3C2410_ALMYEAR); + alm_tm->tm_sec = readb(base + S3C2410_ALMSEC); + alm_tm->tm_min = readb(base + S3C2410_ALMMIN); + alm_tm->tm_hour = readb(base + S3C2410_ALMHOUR); + alm_tm->tm_mon = readb(base + S3C2410_ALMMON); + alm_tm->tm_mday = readb(base + S3C2410_ALMDATE); + alm_tm->tm_year = readb(base + S3C2410_ALMYEAR); - alm_en = readb(S3C2410_RTCALM); + alm_en = readb(base + S3C2410_RTCALM); pr_debug("read alarm %02x %02x.%02x.%02x %02x/%02x/%02x\n", alm_en, @@ -226,6 +237,7 @@ static int s3c_rtc_getalarm(struct device *dev, struct rtc_wkalrm *alrm) static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) { struct rtc_time *tm = &alrm->time; + void __iomem *base = s3c_rtc_base; unsigned int alrm_en; pr_debug("s3c_rtc_setalarm: %d, %02x/%02x/%02x %02x.%02x.%02x\n", @@ -234,32 +246,32 @@ static int s3c_rtc_setalarm(struct device *dev, struct rtc_wkalrm *alrm) tm->tm_hour & 0xff, tm->tm_min & 0xff, tm->tm_sec); - alrm_en = readb(S3C2410_RTCALM) & S3C2410_RTCALM_ALMEN; - writeb(0x00, S3C2410_RTCALM); + alrm_en = readb(base + S3C2410_RTCALM) & S3C2410_RTCALM_ALMEN; + writeb(0x00, base + S3C2410_RTCALM); if (tm->tm_sec < 60 && tm->tm_sec >= 0) { alrm_en |= S3C2410_RTCALM_SECEN; - writeb(BIN2BCD(tm->tm_sec), S3C2410_ALMSEC); + writeb(BIN2BCD(tm->tm_sec), base + S3C2410_ALMSEC); } if (tm->tm_min < 60 && tm->tm_min >= 0) { alrm_en |= S3C2410_RTCALM_MINEN; - writeb(BIN2BCD(tm->tm_min), S3C2410_ALMMIN); + writeb(BIN2BCD(tm->tm_min), base + S3C2410_ALMMIN); } if (tm->tm_hour < 24 && tm->tm_hour >= 0) { alrm_en |= S3C2410_RTCALM_HOUREN; - writeb(BIN2BCD(tm->tm_hour), S3C2410_ALMHOUR); + writeb(BIN2BCD(tm->tm_hour), base + S3C2410_ALMHOUR); } pr_debug("setting S3C2410_RTCALM to %08x\n", alrm_en); - writeb(alrm_en, S3C2410_RTCALM); + writeb(alrm_en, base + S3C2410_RTCALM); if (0) { - alrm_en = readb(S3C2410_RTCALM); + alrm_en = readb(base + S3C2410_RTCALM); alrm_en &= ~S3C2410_RTCALM_ALMEN; - writeb(alrm_en, S3C2410_RTCALM); + writeb(alrm_en, base + S3C2410_RTCALM); disable_irq_wake(s3c_rtc_alarmno); } @@ -319,8 +331,8 @@ static int s3c_rtc_ioctl(struct device *dev, static int s3c_rtc_proc(struct device *dev, struct seq_file *seq) { - unsigned int rtcalm = readb(S3C2410_RTCALM); - unsigned int ticnt = readb (S3C2410_TICNT); + unsigned int rtcalm = readb(s3c_rtc_base + S3C2410_RTCALM); + unsigned int ticnt = readb(s3c_rtc_base + S3C2410_TICNT); seq_printf(seq, "alarm_IRQ\t: %s\n", (rtcalm & S3C2410_RTCALM_ALMEN) ? "yes" : "no" ); @@ -387,39 +399,40 @@ static struct rtc_class_ops s3c_rtcops = { static void s3c_rtc_enable(struct platform_device *pdev, int en) { + void __iomem *base = s3c_rtc_base; unsigned int tmp; if (s3c_rtc_base == NULL) return; if (!en) { - tmp = readb(S3C2410_RTCCON); - writeb(tmp & ~S3C2410_RTCCON_RTCEN, S3C2410_RTCCON); + tmp = readb(base + S3C2410_RTCCON); + writeb(tmp & ~S3C2410_RTCCON_RTCEN, base + S3C2410_RTCCON); - tmp = readb(S3C2410_TICNT); - writeb(tmp & ~S3C2410_TICNT_ENABLE, S3C2410_TICNT); + tmp = readb(base + S3C2410_TICNT); + writeb(tmp & ~S3C2410_TICNT_ENABLE, base + S3C2410_TICNT); } else { /* re-enable the device, and check it is ok */ - if ((readb(S3C2410_RTCCON) & S3C2410_RTCCON_RTCEN) == 0){ + if ((readb(base+S3C2410_RTCCON) & S3C2410_RTCCON_RTCEN) == 0){ dev_info(&pdev->dev, "rtc disabled, re-enabling\n"); - tmp = readb(S3C2410_RTCCON); - writeb(tmp | S3C2410_RTCCON_RTCEN , S3C2410_RTCCON); + tmp = readb(base + S3C2410_RTCCON); + writeb(tmp|S3C2410_RTCCON_RTCEN, base+S3C2410_RTCCON); } - if ((readb(S3C2410_RTCCON) & S3C2410_RTCCON_CNTSEL)){ + if ((readb(base + S3C2410_RTCCON) & S3C2410_RTCCON_CNTSEL)){ dev_info(&pdev->dev, "removing RTCCON_CNTSEL\n"); - tmp = readb(S3C2410_RTCCON); - writeb(tmp& ~S3C2410_RTCCON_CNTSEL , S3C2410_RTCCON); + tmp = readb(base + S3C2410_RTCCON); + writeb(tmp& ~S3C2410_RTCCON_CNTSEL, base+S3C2410_RTCCON); } - if ((readb(S3C2410_RTCCON) & S3C2410_RTCCON_CLKRST)){ + if ((readb(base + S3C2410_RTCCON) & S3C2410_RTCCON_CLKRST)){ dev_info(&pdev->dev, "removing RTCCON_CLKRST\n"); - tmp = readb(S3C2410_RTCCON); - writeb(tmp & ~S3C2410_RTCCON_CLKRST, S3C2410_RTCCON); + tmp = readb(base + S3C2410_RTCCON); + writeb(tmp & ~S3C2410_RTCCON_CLKRST, base+S3C2410_RTCCON); } } } @@ -475,8 +488,8 @@ static int s3c_rtc_probe(struct platform_device *pdev) } s3c_rtc_mem = request_mem_region(res->start, - res->end-res->start+1, - pdev->name); + res->end-res->start+1, + pdev->name); if (s3c_rtc_mem == NULL) { dev_err(&pdev->dev, "failed to reserve memory region\n"); @@ -495,7 +508,8 @@ static int s3c_rtc_probe(struct platform_device *pdev) s3c_rtc_enable(pdev, 1); - pr_debug("s3c2410_rtc: RTCCON=%02x\n", readb(S3C2410_RTCCON)); + pr_debug("s3c2410_rtc: RTCCON=%02x\n", + readb(s3c_rtc_base + S3C2410_RTCCON)); s3c_rtc_setfreq(s3c_rtc_freq); @@ -543,7 +557,7 @@ static int s3c_rtc_suspend(struct platform_device *pdev, pm_message_t state) /* save TICNT for anyone using periodic interrupts */ - ticnt_save = readb(S3C2410_TICNT); + ticnt_save = readb(s3c_rtc_base + S3C2410_TICNT); /* calculate time delta for suspend */ @@ -567,7 +581,7 @@ static int s3c_rtc_resume(struct platform_device *pdev) rtc_tm_to_time(&tm, &time.tv_sec); restore_time_delta(&s3c_rtc_delta, &time); - writeb(ticnt_save, S3C2410_TICNT); + writeb(ticnt_save, s3c_rtc_base + S3C2410_TICNT); return 0; } #else diff --git a/drivers/scsi/ide-scsi.c b/drivers/scsi/ide-scsi.c index f7b5d7372d2..94d1de55607 100644 --- a/drivers/scsi/ide-scsi.c +++ b/drivers/scsi/ide-scsi.c @@ -517,7 +517,7 @@ static ide_startstop_t idescsi_pc_intr (ide_drive_t *drive) /* No more interrupts */ if (test_bit(IDESCSI_LOG_CMD, &scsi->log)) printk (KERN_INFO "Packet command completed, %d bytes transferred\n", pc->actually_transferred); - local_irq_enable(); + local_irq_enable_in_hardirq(); if (status.b.check) rq->errors++; idescsi_end_request (drive, 1, 0); diff --git a/drivers/video/imacfb.c b/drivers/video/imacfb.c index b485bece5fc..18ea4a54910 100644 --- a/drivers/video/imacfb.c +++ b/drivers/video/imacfb.c @@ -71,10 +71,10 @@ static int set_system(struct dmi_system_id *id) static struct dmi_system_id __initdata dmi_system_table[] = { { set_system, "iMac4,1", { DMI_MATCH(DMI_BIOS_VENDOR,"Apple Computer, Inc."), - DMI_MATCH(DMI_BIOS_VERSION,"iMac4,1") }, (void*)M_I17}, + DMI_MATCH(DMI_PRODUCT_NAME,"iMac4,1") }, (void*)M_I17}, { set_system, "MacBookPro1,1", { DMI_MATCH(DMI_BIOS_VENDOR,"Apple Computer, Inc."), - DMI_MATCH(DMI_BIOS_VERSION,"MacBookPro1,1") }, (void*)M_I17}, + DMI_MATCH(DMI_PRODUCT_NAME,"MacBookPro1,1") }, (void*)M_I17}, { set_system, "MacBook1,1", { DMI_MATCH(DMI_BIOS_VENDOR,"Apple Computer, Inc."), DMI_MATCH(DMI_PRODUCT_NAME,"MacBook1,1")}, (void *)M_MACBOOK}, diff --git a/drivers/video/matrox/g450_pll.c b/drivers/video/matrox/g450_pll.c index 440272ad10e..7c76e079ca7 100644 --- a/drivers/video/matrox/g450_pll.c +++ b/drivers/video/matrox/g450_pll.c @@ -331,7 +331,15 @@ static int __g450_setclk(WPMINFO unsigned int fout, unsigned int pll, tmp |= M1064_XPIXCLKCTRL_PLL_UP; } matroxfb_DAC_out(PMINFO M1064_XPIXCLKCTRL, tmp); +#ifdef __powerpc__ + /* This is necessary to avoid jitter on PowerPC + * (OpenFirmware) systems, but apparently + * introduces jitter, at least on a x86-64 + * using DVI. + * A simple workaround is disable for non-PPC. + */ matroxfb_DAC_out(PMINFO M1064_XDVICLKCTRL, 0); +#endif /* __powerpc__ */ matroxfb_DAC_out(PMINFO M1064_XPWRCTRL, xpwrctrl); matroxfb_DAC_unlock_irqrestore(flags); diff --git a/fs/block_dev.c b/fs/block_dev.c index 37534573960..045f98854f1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -884,6 +884,61 @@ void bd_set_size(struct block_device *bdev, loff_t size) } EXPORT_SYMBOL(bd_set_size); +static int __blkdev_put(struct block_device *bdev, unsigned int subclass) +{ + int ret = 0; + struct inode *bd_inode = bdev->bd_inode; + struct gendisk *disk = bdev->bd_disk; + + mutex_lock_nested(&bdev->bd_mutex, subclass); + lock_kernel(); + if (!--bdev->bd_openers) { + sync_blockdev(bdev); + kill_bdev(bdev); + } + if (bdev->bd_contains == bdev) { + if (disk->fops->release) + ret = disk->fops->release(bd_inode, NULL); + } else { + mutex_lock_nested(&bdev->bd_contains->bd_mutex, + subclass + 1); + bdev->bd_contains->bd_part_count--; + mutex_unlock(&bdev->bd_contains->bd_mutex); + } + if (!bdev->bd_openers) { + struct module *owner = disk->fops->owner; + + put_disk(disk); + module_put(owner); + + if (bdev->bd_contains != bdev) { + kobject_put(&bdev->bd_part->kobj); + bdev->bd_part = NULL; + } + bdev->bd_disk = NULL; + bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; + if (bdev != bdev->bd_contains) + __blkdev_put(bdev->bd_contains, subclass + 1); + bdev->bd_contains = NULL; + } + unlock_kernel(); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); + return ret; +} + +int blkdev_put(struct block_device *bdev) +{ + return __blkdev_put(bdev, BD_MUTEX_NORMAL); +} +EXPORT_SYMBOL(blkdev_put); + +int blkdev_put_partition(struct block_device *bdev) +{ + return __blkdev_put(bdev, BD_MUTEX_PARTITION); +} +EXPORT_SYMBOL(blkdev_put_partition); + static int blkdev_get_whole(struct block_device *bdev, mode_t mode, unsigned flags); @@ -980,7 +1035,7 @@ out_first: bdev->bd_disk = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) - blkdev_put(bdev->bd_contains); + __blkdev_put(bdev->bd_contains, BD_MUTEX_WHOLE); bdev->bd_contains = NULL; put_disk(disk); module_put(owner); @@ -1079,63 +1134,6 @@ static int blkdev_open(struct inode * inode, struct file * filp) return res; } -static int __blkdev_put(struct block_device *bdev, unsigned int subclass) -{ - int ret = 0; - struct inode *bd_inode = bdev->bd_inode; - struct gendisk *disk = bdev->bd_disk; - - mutex_lock_nested(&bdev->bd_mutex, subclass); - lock_kernel(); - if (!--bdev->bd_openers) { - sync_blockdev(bdev); - kill_bdev(bdev); - } - if (bdev->bd_contains == bdev) { - if (disk->fops->release) - ret = disk->fops->release(bd_inode, NULL); - } else { - mutex_lock_nested(&bdev->bd_contains->bd_mutex, - subclass + 1); - bdev->bd_contains->bd_part_count--; - mutex_unlock(&bdev->bd_contains->bd_mutex); - } - if (!bdev->bd_openers) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - - if (bdev->bd_contains != bdev) { - kobject_put(&bdev->bd_part->kobj); - bdev->bd_part = NULL; - } - bdev->bd_disk = NULL; - bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; - if (bdev != bdev->bd_contains) - __blkdev_put(bdev->bd_contains, subclass + 1); - bdev->bd_contains = NULL; - } - unlock_kernel(); - mutex_unlock(&bdev->bd_mutex); - bdput(bdev); - return ret; -} - -int blkdev_put(struct block_device *bdev) -{ - return __blkdev_put(bdev, BD_MUTEX_NORMAL); -} - -EXPORT_SYMBOL(blkdev_put); - -int blkdev_put_partition(struct block_device *bdev) -{ - return __blkdev_put(bdev, BD_MUTEX_PARTITION); -} - -EXPORT_SYMBOL(blkdev_put_partition); - static int blkdev_close(struct inode * inode, struct file * filp) { struct block_device *bdev = I_BDEV(filp->f_mapping->host); diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 19ffb043abb..3a3567433b9 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1168,7 +1168,7 @@ static int ep_unlink(struct eventpoll *ep, struct epitem *epi) eexit_1: DNPRINTK(3, (KERN_INFO "[%p] eventpoll: ep_unlink(%p, %p) = %d\n", - current, ep, epi->file, error)); + current, ep, epi->ffd.file, error)); return error; } @@ -1236,7 +1236,7 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct eventpoll *ep = epi->ep; DNPRINTK(3, (KERN_INFO "[%p] eventpoll: poll_callback(%p) epi=%p ep=%p\n", - current, epi->file, epi, ep)); + current, epi->ffd.file, epi, ep)); write_lock_irqsave(&ep->lock, flags); diff --git a/fs/exec.c b/fs/exec.c index f7aabfeca03..54135df2a96 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -751,7 +751,7 @@ no_thread_group: write_lock_irq(&tasklist_lock); spin_lock(&oldsighand->siglock); - spin_lock(&newsighand->siglock); + spin_lock_nested(&newsighand->siglock, SINGLE_DEPTH_NESTING); rcu_assign_pointer(current->sighand, newsighand); recalc_sigpending(); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index f2702cda977..681dea8f953 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -775,7 +775,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (EXT2_INODE_SIZE(sb) == 0) goto cantfind_ext2; sbi->s_inodes_per_block = sb->s_blocksize / EXT2_INODE_SIZE(sb); - if (sbi->s_inodes_per_block == 0) + if (sbi->s_inodes_per_block == 0 || sbi->s_inodes_per_group == 0) goto cantfind_ext2; sbi->s_itb_per_group = sbi->s_inodes_per_group / sbi->s_inodes_per_block; diff --git a/fs/ext3/balloc.c b/fs/ext3/balloc.c index a504a40d6d2..063d994bda0 100644 --- a/fs/ext3/balloc.c +++ b/fs/ext3/balloc.c @@ -1269,12 +1269,12 @@ ext3_fsblk_t ext3_new_blocks(handle_t *handle, struct inode *inode, goal = le32_to_cpu(es->s_first_data_block); group_no = (goal - le32_to_cpu(es->s_first_data_block)) / EXT3_BLOCKS_PER_GROUP(sb); + goal_group = group_no; +retry_alloc: gdp = ext3_get_group_desc(sb, group_no, &gdp_bh); if (!gdp) goto io_error; - goal_group = group_no; -retry: free_blocks = le16_to_cpu(gdp->bg_free_blocks_count); /* * if there is not enough free blocks to make a new resevation @@ -1349,7 +1349,7 @@ retry: if (my_rsv) { my_rsv = NULL; group_no = goal_group; - goto retry; + goto retry_alloc; } /* No space left on the device */ *errp = -ENOSPC; diff --git a/fs/jbd/commit.c b/fs/jbd/commit.c index 0971814c38b..42da6078431 100644 --- a/fs/jbd/commit.c +++ b/fs/jbd/commit.c @@ -261,7 +261,7 @@ void journal_commit_transaction(journal_t *journal) struct buffer_head *bh = jh2bh(jh); jbd_lock_bh_state(bh); - kfree(jh->b_committed_data); + jbd_slab_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; jbd_unlock_bh_state(bh); } @@ -745,14 +745,14 @@ restart_loop: * Otherwise, we can just throw away the frozen data now. */ if (jh->b_committed_data) { - kfree(jh->b_committed_data); + jbd_slab_free(jh->b_committed_data, bh->b_size); jh->b_committed_data = NULL; if (jh->b_frozen_data) { jh->b_committed_data = jh->b_frozen_data; jh->b_frozen_data = NULL; } } else if (jh->b_frozen_data) { - kfree(jh->b_frozen_data); + jbd_slab_free(jh->b_frozen_data, bh->b_size); jh->b_frozen_data = NULL; } diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 8c9b28dff11..f66724ce443 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -84,6 +84,7 @@ EXPORT_SYMBOL(journal_force_commit); static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *); static void __journal_abort_soft (journal_t *journal, int errno); +static int journal_create_jbd_slab(size_t slab_size); /* * Helper function used to manage commit timeouts @@ -328,10 +329,10 @@ repeat: char *tmp; jbd_unlock_bh_state(bh_in); - tmp = jbd_rep_kmalloc(bh_in->b_size, GFP_NOFS); + tmp = jbd_slab_alloc(bh_in->b_size, GFP_NOFS); jbd_lock_bh_state(bh_in); if (jh_in->b_frozen_data) { - kfree(tmp); + jbd_slab_free(tmp, bh_in->b_size); goto repeat; } @@ -1069,17 +1070,17 @@ static int load_superblock(journal_t *journal) int journal_load(journal_t *journal) { int err; + journal_superblock_t *sb; err = load_superblock(journal); if (err) return err; + sb = journal->j_superblock; /* If this is a V2 superblock, then we have to check the * features flags on it. */ if (journal->j_format_version >= 2) { - journal_superblock_t *sb = journal->j_superblock; - if ((sb->s_feature_ro_compat & ~cpu_to_be32(JFS_KNOWN_ROCOMPAT_FEATURES)) || (sb->s_feature_incompat & @@ -1090,6 +1091,13 @@ int journal_load(journal_t *journal) } } + /* + * Create a slab for this blocksize + */ + err = journal_create_jbd_slab(cpu_to_be32(sb->s_blocksize)); + if (err) + return err; + /* Let the recovery code check whether it needs to recover any * data from the journal. */ if (journal_recover(journal)) @@ -1612,6 +1620,77 @@ void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry) } /* + * jbd slab management: create 1k, 2k, 4k, 8k slabs as needed + * and allocate frozen and commit buffers from these slabs. + * + * Reason for doing this is to avoid, SLAB_DEBUG - since it could + * cause bh to cross page boundary. + */ + +#define JBD_MAX_SLABS 5 +#define JBD_SLAB_INDEX(size) (size >> 11) + +static kmem_cache_t *jbd_slab[JBD_MAX_SLABS]; +static const char *jbd_slab_names[JBD_MAX_SLABS] = { + "jbd_1k", "jbd_2k", "jbd_4k", NULL, "jbd_8k" +}; + +static void journal_destroy_jbd_slabs(void) +{ + int i; + + for (i = 0; i < JBD_MAX_SLABS; i++) { + if (jbd_slab[i]) + kmem_cache_destroy(jbd_slab[i]); + jbd_slab[i] = NULL; + } +} + +static int journal_create_jbd_slab(size_t slab_size) +{ + int i = JBD_SLAB_INDEX(slab_size); + + BUG_ON(i >= JBD_MAX_SLABS); + + /* + * Check if we already have a slab created for this size + */ + if (jbd_slab[i]) + return 0; + + /* + * Create a slab and force alignment to be same as slabsize - + * this will make sure that allocations won't cross the page + * boundary. + */ + jbd_slab[i] = kmem_cache_create(jbd_slab_names[i], + slab_size, slab_size, 0, NULL, NULL); + if (!jbd_slab[i]) { + printk(KERN_EMERG "JBD: no memory for jbd_slab cache\n"); + return -ENOMEM; + } + return 0; +} + +void * jbd_slab_alloc(size_t size, gfp_t flags) +{ + int idx; + + idx = JBD_SLAB_INDEX(size); + BUG_ON(jbd_slab[idx] == NULL); + return kmem_cache_alloc(jbd_slab[idx], flags | __GFP_NOFAIL); +} + +void jbd_slab_free(void *ptr, size_t size) +{ + int idx; + + idx = JBD_SLAB_INDEX(size); + BUG_ON(jbd_slab[idx] == NULL); + kmem_cache_free(jbd_slab[idx], ptr); +} + +/* * Journal_head storage management */ static kmem_cache_t *journal_head_cache; @@ -1799,13 +1878,13 @@ static void __journal_remove_journal_head(struct buffer_head *bh) printk(KERN_WARNING "%s: freeing " "b_frozen_data\n", __FUNCTION__); - kfree(jh->b_frozen_data); + jbd_slab_free(jh->b_frozen_data, bh->b_size); } if (jh->b_committed_data) { printk(KERN_WARNING "%s: freeing " "b_committed_data\n", __FUNCTION__); - kfree(jh->b_committed_data); + jbd_slab_free(jh->b_committed_data, bh->b_size); } bh->b_private = NULL; jh->b_bh = NULL; /* debug, really */ @@ -1961,6 +2040,7 @@ static void journal_destroy_caches(void) journal_destroy_revoke_caches(); journal_destroy_journal_head_cache(); journal_destroy_handle_cache(); + journal_destroy_jbd_slabs(); } static int __init journal_init(void) diff --git a/fs/jbd/transaction.c b/fs/jbd/transaction.c index 508b2ea91f4..de2e4cbbf79 100644 --- a/fs/jbd/transaction.c +++ b/fs/jbd/transaction.c @@ -666,8 +666,9 @@ repeat: if (!frozen_buffer) { JBUFFER_TRACE(jh, "allocate memory for buffer"); jbd_unlock_bh_state(bh); - frozen_buffer = jbd_kmalloc(jh2bh(jh)->b_size, - GFP_NOFS); + frozen_buffer = + jbd_slab_alloc(jh2bh(jh)->b_size, + GFP_NOFS); if (!frozen_buffer) { printk(KERN_EMERG "%s: OOM for frozen_buffer\n", @@ -879,7 +880,7 @@ int journal_get_undo_access(handle_t *handle, struct buffer_head *bh) repeat: if (!jh->b_committed_data) { - committed_data = jbd_kmalloc(jh2bh(jh)->b_size, GFP_NOFS); + committed_data = jbd_slab_alloc(jh2bh(jh)->b_size, GFP_NOFS); if (!committed_data) { printk(KERN_EMERG "%s: No memory for committed data\n", __FUNCTION__); @@ -906,7 +907,7 @@ repeat: out: journal_put_journal_head(jh); if (unlikely(committed_data)) - kfree(committed_data); + jbd_slab_free(committed_data, bh->b_size); return err; } diff --git a/fs/minix/inode.c b/fs/minix/inode.c index 9ea91c5eeb7..330ff9fc7cf 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -204,6 +204,8 @@ static int minix_fill_super(struct super_block *s, void *data, int silent) /* * Allocate the buffer map to keep the superblock small. */ + if (sbi->s_imap_blocks == 0 || sbi->s_zmap_blocks == 0) + goto out_illegal_sb; i = (sbi->s_imap_blocks + sbi->s_zmap_blocks) * sizeof(bh); map = kmalloc(i, GFP_KERNEL); if (!map) @@ -263,7 +265,7 @@ out_no_root: out_no_bitmap: printk("MINIX-fs: bad superblock or unable to read bitmaps\n"); - out_freemap: +out_freemap: for (i = 0; i < sbi->s_imap_blocks; i++) brelse(sbi->s_imap[i]); for (i = 0; i < sbi->s_zmap_blocks; i++) @@ -276,11 +278,16 @@ out_no_map: printk("MINIX-fs: can't allocate map\n"); goto out_release; +out_illegal_sb: + if (!silent) + printk("MINIX-fs: bad superblock\n"); + goto out_release; + out_no_fs: if (!silent) printk("VFS: Can't find a Minix or Minix V2 filesystem " "on device %s\n", s->s_id); - out_release: +out_release: brelse(bh); goto out; @@ -290,7 +297,7 @@ out_bad_hblock: out_bad_sb: printk("MINIX-fs: unable to read superblock\n"); - out: +out: s->s_fs_info = NULL; kfree(sbi); return -EINVAL; diff --git a/fs/proc/proc_misc.c b/fs/proc/proc_misc.c index 9f2cfc30f9c..94215622544 100644 --- a/fs/proc/proc_misc.c +++ b/fs/proc/proc_misc.c @@ -169,7 +169,7 @@ static int meminfo_read_proc(char *page, char **start, off_t off, "Mapped: %8lu kB\n" "Slab: %8lu kB\n" "PageTables: %8lu kB\n" - "NFS Unstable: %8lu kB\n" + "NFS_Unstable: %8lu kB\n" "Bounce: %8lu kB\n" "CommitLimit: %8lu kB\n" "Committed_AS: %8lu kB\n" diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index 39fedaa88a0..d935fb9394e 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -424,7 +424,7 @@ int xattr_readdir(struct file *file, filldir_t filler, void *buf) int res = -ENOTDIR; if (!file->f_op || !file->f_op->readdir) goto out; - mutex_lock(&inode->i_mutex); + mutex_lock_nested(&inode->i_mutex, I_MUTEX_XATTR); // down(&inode->i_zombie); res = -ENOENT; if (!IS_DEADDIR(inode)) { diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e7c8615beb6..30c6e8a9446 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -169,18 +169,20 @@ static void ufs_clear_frag(struct inode *inode, struct buffer_head *bh) static struct buffer_head * ufs_clear_frags(struct inode *inode, sector_t beg, - unsigned int n) + unsigned int n, sector_t want) { - struct buffer_head *res, *bh; + struct buffer_head *res = NULL, *bh; sector_t end = beg + n; - res = sb_getblk(inode->i_sb, beg); - ufs_clear_frag(inode, res); - for (++beg; beg < end; ++beg) { + for (; beg < end; ++beg) { bh = sb_getblk(inode->i_sb, beg); ufs_clear_frag(inode, bh); - brelse(bh); + if (want != beg) + brelse(bh); + else + res = bh; } + BUG_ON(!res); return res; } @@ -265,7 +267,9 @@ repeat: lastfrag = ufsi->i_lastfrag; } - goal = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]) + uspi->s_fpb; + tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock]); + if (tmp) + goal = tmp + uspi->s_fpb; tmp = ufs_new_fragments (inode, p, fragment - blockoff, goal, required + blockoff, err, locked_page); @@ -277,13 +281,15 @@ repeat: tmp = ufs_new_fragments(inode, p, fragment - (blockoff - lastblockoff), fs32_to_cpu(sb, *p), required + (blockoff - lastblockoff), err, locked_page); - } + } else /* (lastblock > block) */ { /* * We will allocate new block before last allocated block */ - else /* (lastblock > block) */ { - if (lastblock && (tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[lastblock-1]))) - goal = tmp + uspi->s_fpb; + if (block) { + tmp = fs32_to_cpu(sb, ufsi->i_u1.i_data[block-1]); + if (tmp) + goal = tmp + uspi->s_fpb; + } tmp = ufs_new_fragments(inode, p, fragment - blockoff, goal, uspi->s_fpb, err, locked_page); } @@ -296,7 +302,7 @@ repeat: } if (!phys) { - result = ufs_clear_frags(inode, tmp + blockoff, required); + result = ufs_clear_frags(inode, tmp, required, tmp + blockoff); } else { *phys = tmp + blockoff; result = NULL; @@ -383,7 +389,7 @@ repeat: } } - if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1]) + uspi->s_fpb)) + if (block && (tmp = fs32_to_cpu(sb, ((__fs32*)bh->b_data)[block-1]))) goal = tmp + uspi->s_fpb; else goal = bh->b_blocknr + uspi->s_fpb; @@ -397,7 +403,8 @@ repeat: if (!phys) { - result = ufs_clear_frags(inode, tmp + blockoff, uspi->s_fpb); + result = ufs_clear_frags(inode, tmp, uspi->s_fpb, + tmp + blockoff); } else { *phys = tmp + blockoff; *new = 1; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index c9b55872079..ea11d04c41a 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -375,17 +375,15 @@ static int ufs_alloc_lastblock(struct inode *inode) int err = 0; struct address_space *mapping = inode->i_mapping; struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; - struct ufs_inode_info *ufsi = UFS_I(inode); unsigned lastfrag, i, end; struct page *lastpage; struct buffer_head *bh; lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; - if (!lastfrag) { - ufsi->i_lastfrag = 0; + if (!lastfrag) goto out; - } + lastfrag--; lastpage = ufs_get_locked_page(mapping, lastfrag >> @@ -400,25 +398,25 @@ static int ufs_alloc_lastblock(struct inode *inode) for (i = 0; i < end; ++i) bh = bh->b_this_page; - if (!buffer_mapped(bh)) { - err = ufs_getfrag_block(inode, lastfrag, bh, 1); - - if (unlikely(err)) - goto out_unlock; - - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - /* - * we do not zeroize fragment, because of - * if it maped to hole, it already contains zeroes - */ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - set_page_dirty(lastpage); - } + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); } + out_unlock: ufs_put_locked_page(lastpage); out: @@ -440,23 +438,11 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; - if (inode->i_size > old_i_size) { - /* - * if we expand file we should care about - * allocation of block for last byte first of all - */ - err = ufs_alloc_lastblock(inode); + err = ufs_alloc_lastblock(inode); - if (err) { - i_size_write(inode, old_i_size); - goto out; - } - /* - * go away, because of we expand file, and we do not - * need free blocks, and zeroizes page - */ - lock_kernel(); - goto almost_end; + if (err) { + i_size_write(inode, old_i_size); + goto out; } block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); @@ -477,21 +463,8 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) yield(); } - if (inode->i_size < old_i_size) { - /* - * now we should have enough space - * to allocate block for last byte - */ - err = ufs_alloc_lastblock(inode); - if (err) - /* - * looks like all the same - we have no space, - * but we truncate file already - */ - inode->i_size = (ufsi->i_lastfrag - 1) * uspi->s_fsize; - } -almost_end: inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + ufsi->i_lastfrag = DIRECT_FRAGMENT; unlock_kernel(); mark_inode_dirty(inode); out: diff --git a/include/asm-arm/arch-s3c2410/regs-rtc.h b/include/asm-arm/arch-s3c2410/regs-rtc.h index 228983f89bc..0fbec07bb6b 100644 --- a/include/asm-arm/arch-s3c2410/regs-rtc.h +++ b/include/asm-arm/arch-s3c2410/regs-rtc.h @@ -18,7 +18,7 @@ #ifndef __ASM_ARCH_REGS_RTC_H #define __ASM_ARCH_REGS_RTC_H __FILE__ -#define S3C2410_RTCREG(x) ((x) + S3C24XX_VA_RTC) +#define S3C2410_RTCREG(x) (x) #define S3C2410_RTCCON S3C2410_RTCREG(0x40) #define S3C2410_RTCCON_RTCEN (1<<0) diff --git a/include/asm-i386/mmzone.h b/include/asm-i386/mmzone.h index e33e9f9e4c6..22cb07cc8f3 100644 --- a/include/asm-i386/mmzone.h +++ b/include/asm-i386/mmzone.h @@ -14,7 +14,7 @@ extern struct pglist_data *node_data[]; #ifdef CONFIG_X86_NUMAQ #include <asm/numaq.h> -#else /* summit or generic arch */ +#elif defined(CONFIG_ACPI_SRAT)/* summit or generic arch */ #include <asm/srat.h> #endif diff --git a/include/linux/compat_ioctl.h b/include/linux/compat_ioctl.h index 269d000bb2a..bea0255196c 100644 --- a/include/linux/compat_ioctl.h +++ b/include/linux/compat_ioctl.h @@ -216,6 +216,7 @@ COMPATIBLE_IOCTL(VT_RESIZE) COMPATIBLE_IOCTL(VT_RESIZEX) COMPATIBLE_IOCTL(VT_LOCKSWITCH) COMPATIBLE_IOCTL(VT_UNLOCKSWITCH) +COMPATIBLE_IOCTL(VT_GETHIFONTMASK) /* Little p (/dev/rtc, /dev/envctrl, etc.) */ COMPATIBLE_IOCTL(RTC_AIE_ON) COMPATIBLE_IOCTL(RTC_AIE_OFF) diff --git a/include/linux/fs.h b/include/linux/fs.h index 25610205c90..555bc195c42 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -570,13 +570,14 @@ struct inode { * 3: quota file * * The locking order between these classes is - * parent -> child -> normal -> quota + * parent -> child -> normal -> xattr -> quota */ enum inode_i_mutex_lock_class { I_MUTEX_NORMAL, I_MUTEX_PARENT, I_MUTEX_CHILD, + I_MUTEX_XATTR, I_MUTEX_QUOTA }; diff --git a/include/linux/jbd.h b/include/linux/jbd.h index 20eb34403d0..a04c154c520 100644 --- a/include/linux/jbd.h +++ b/include/linux/jbd.h @@ -72,6 +72,9 @@ extern int journal_enable_debug; #endif extern void * __jbd_kmalloc (const char *where, size_t size, gfp_t flags, int retry); +extern void * jbd_slab_alloc(size_t size, gfp_t flags); +extern void jbd_slab_free(void *ptr, size_t size); + #define jbd_kmalloc(size, flags) \ __jbd_kmalloc(__FUNCTION__, (size), (flags), journal_oom_retry) #define jbd_rep_kmalloc(size, flags) \ diff --git a/include/linux/node.h b/include/linux/node.h index 81dcec84cd8..bc001bc225c 100644 --- a/include/linux/node.h +++ b/include/linux/node.h @@ -30,12 +30,20 @@ extern struct node node_devices[]; extern int register_node(struct node *, int, struct node *); extern void unregister_node(struct node *node); +#ifdef CONFIG_NUMA extern int register_one_node(int nid); extern void unregister_one_node(int nid); -#ifdef CONFIG_NUMA extern int register_cpu_under_node(unsigned int cpu, unsigned int nid); extern int unregister_cpu_under_node(unsigned int cpu, unsigned int nid); #else +static inline int register_one_node(int nid) +{ + return 0; +} +static inline int unregister_one_node(int nid) +{ + return 0; +} static inline int register_cpu_under_node(unsigned int cpu, unsigned int nid) { return 0; diff --git a/include/linux/tty.h b/include/linux/tty.h index e421d5e3481..04827ca6578 100644 --- a/include/linux/tty.h +++ b/include/linux/tty.h @@ -59,6 +59,7 @@ struct tty_bufhead { struct tty_buffer *head; /* Queue head */ struct tty_buffer *tail; /* Active buffer */ struct tty_buffer *free; /* Free queue head */ + int memory_used; /* Buffer space used excluding free queue */ }; /* * The pty uses char_buf and flag_buf as a contiguous buffer diff --git a/include/linux/vt.h b/include/linux/vt.h index 8ab334a4822..ba806e8711b 100644 --- a/include/linux/vt.h +++ b/include/linux/vt.h @@ -60,5 +60,6 @@ struct vt_consize { #define VT_RESIZEX 0x560A /* set kernel's idea of screensize + more */ #define VT_LOCKSWITCH 0x560B /* disallow vt switching */ #define VT_UNLOCKSWITCH 0x560C /* allow vt switching */ +#define VT_GETHIFONTMASK 0x560D /* return hi font mask */ #endif /* _LINUX_VT_H */ diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 1a649f2bb9b..4ea6f0dc2fc 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -816,6 +816,10 @@ static int update_cpumask(struct cpuset *cs, char *buf) struct cpuset trialcs; int retval, cpus_unchanged; + /* top_cpuset.cpus_allowed tracks cpu_online_map; it's read-only */ + if (cs == &top_cpuset) + return -EACCES; + trialcs = *cs; retval = cpulist_parse(buf, trialcs.cpus_allowed); if (retval < 0) @@ -2033,6 +2037,33 @@ out: return err; } +/* + * The top_cpuset tracks what CPUs and Memory Nodes are online, + * period. This is necessary in order to make cpusets transparent + * (of no affect) on systems that are actively using CPU hotplug + * but making no active use of cpusets. + * + * This handles CPU hotplug (cpuhp) events. If someday Memory + * Nodes can be hotplugged (dynamically changing node_online_map) + * then we should handle that too, perhaps in a similar way. + */ + +#ifdef CONFIG_HOTPLUG_CPU +static int cpuset_handle_cpuhp(struct notifier_block *nb, + unsigned long phase, void *cpu) +{ + mutex_lock(&manage_mutex); + mutex_lock(&callback_mutex); + + top_cpuset.cpus_allowed = cpu_online_map; + + mutex_unlock(&callback_mutex); + mutex_unlock(&manage_mutex); + + return 0; +} +#endif + /** * cpuset_init_smp - initialize cpus_allowed * @@ -2043,6 +2074,8 @@ void __init cpuset_init_smp(void) { top_cpuset.cpus_allowed = cpu_online_map; top_cpuset.mems_allowed = node_online_map; + + hotcpu_notifier(cpuset_handle_cpuhp, 0); } /** @@ -2387,7 +2420,7 @@ EXPORT_SYMBOL_GPL(cpuset_mem_spread_node); int cpuset_excl_nodes_overlap(const struct task_struct *p) { const struct cpuset *cs1, *cs2; /* my and p's cpuset ancestors */ - int overlap = 0; /* do cpusets overlap? */ + int overlap = 1; /* do cpusets overlap? */ task_lock(current); if (current->flags & PF_EXITING) { diff --git a/kernel/futex.c b/kernel/futex.c index d4633c588f3..b9b8aea5389 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -397,7 +397,7 @@ static struct task_struct * futex_find_get_task(pid_t pid) p = NULL; goto out_unlock; } - if (p->state == EXIT_ZOMBIE || p->exit_state == EXIT_ZOMBIE) { + if (p->exit_state != 0) { p = NULL; goto out_unlock; } diff --git a/kernel/sched.c b/kernel/sched.c index a2be2d05529..a234fbee123 100644 --- a/kernel/sched.c +++ b/kernel/sched.c @@ -4162,10 +4162,8 @@ do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) read_unlock_irq(&tasklist_lock); return -ESRCH; } - get_task_struct(p); - read_unlock_irq(&tasklist_lock); retval = sched_setscheduler(p, policy, &lparam); - put_task_struct(p); + read_unlock_irq(&tasklist_lock); return retval; } diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dcfb5d73146..51cacd111db 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -111,7 +111,6 @@ static int stop_machine(void) /* If some failed, kill them all. */ if (ret < 0) { stopmachine_set_state(STOPMACHINE_EXIT); - up(&stopmachine_mutex); return ret; } diff --git a/mm/swapfile.c b/mm/swapfile.c index e70d6c6d6fe..f1f5ec78378 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -442,11 +442,12 @@ int swap_type_of(dev_t device) if (!(swap_info[i].flags & SWP_WRITEOK)) continue; + if (!device) { spin_unlock(&swap_lock); return i; } - inode = swap_info->swap_file->f_dentry->d_inode; + inode = swap_info[i].swap_file->f_dentry->d_inode; if (S_ISBLK(inode->i_mode) && device == MKDEV(imajor(inode), iminor(inode))) { spin_unlock(&swap_lock); diff --git a/net/dccp/ccids/ccid3.c b/net/dccp/ccids/ccid3.c index 0f85970ee6d..090bc39e819 100644 --- a/net/dccp/ccids/ccid3.c +++ b/net/dccp/ccids/ccid3.c @@ -342,6 +342,8 @@ static int ccid3_hc_tx_send_packet(struct sock *sk, new_packet->dccphtx_ccval = DCCP_SKB_CB(skb)->dccpd_ccval = hctx->ccid3hctx_last_win_count; + timeval_add_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); } out: return rc; @@ -413,7 +415,8 @@ static void ccid3_hc_tx_packet_sent(struct sock *sk, int more, int len) case TFRC_SSTATE_NO_FBACK: case TFRC_SSTATE_FBACK: if (len > 0) { - hctx->ccid3hctx_t_nom = now; + timeval_sub_usecs(&hctx->ccid3hctx_t_nom, + hctx->ccid3hctx_t_ipi); ccid3_calc_new_t_ipi(hctx); ccid3_calc_new_delta(hctx); timeval_add_usecs(&hctx->ccid3hctx_t_nom, @@ -757,8 +760,7 @@ static void ccid3_hc_rx_send_feedback(struct sock *sk) } hcrx->ccid3hcrx_tstamp_last_feedback = now; - hcrx->ccid3hcrx_last_counter = packet->dccphrx_ccval; - hcrx->ccid3hcrx_seqno_last_counter = packet->dccphrx_seqno; + hcrx->ccid3hcrx_ccval_last_counter = packet->dccphrx_ccval; hcrx->ccid3hcrx_bytes_recv = 0; /* Convert to multiples of 10us */ @@ -782,7 +784,7 @@ static int ccid3_hc_rx_insert_options(struct sock *sk, struct sk_buff *skb) if (!(sk->sk_state == DCCP_OPEN || sk->sk_state == DCCP_PARTOPEN)) return 0; - DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_last_counter; + DCCP_SKB_CB(skb)->dccpd_ccval = hcrx->ccid3hcrx_ccval_last_counter; if (dccp_packet_without_ack(skb)) return 0; @@ -854,6 +856,11 @@ static u32 ccid3_hc_rx_calc_first_li(struct sock *sk) interval = 1; } found: + if (!tail) { + LIMIT_NETDEBUG(KERN_WARNING "%s: tail is null\n", + __FUNCTION__); + return ~0; + } rtt = timeval_delta(&tstamp, &tail->dccphrx_tstamp) * 4 / interval; ccid3_pr_debug("%s, sk=%p, approximated RTT to %uus\n", dccp_role(sk), sk, rtt); @@ -864,9 +871,20 @@ found: delta = timeval_delta(&tstamp, &hcrx->ccid3hcrx_tstamp_last_feedback); x_recv = usecs_div(hcrx->ccid3hcrx_bytes_recv, delta); + if (x_recv == 0) + x_recv = hcrx->ccid3hcrx_x_recv; + tmp1 = (u64)x_recv * (u64)rtt; do_div(tmp1,10000000); tmp2 = (u32)tmp1; + + if (!tmp2) { + LIMIT_NETDEBUG(KERN_WARNING "tmp2 = 0 " + "%s: x_recv = %u, rtt =%u\n", + __FUNCTION__, x_recv, rtt); + return ~0; + } + fval = (hcrx->ccid3hcrx_s * 100000) / tmp2; /* do not alter order above or you will get overflow on 32 bit */ p = tfrc_calc_x_reverse_lookup(fval); @@ -882,31 +900,101 @@ found: static void ccid3_hc_rx_update_li(struct sock *sk, u64 seq_loss, u8 win_loss) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); + struct dccp_li_hist_entry *next, *head; + u64 seq_temp; - if (seq_loss != DCCP_MAX_SEQNO + 1 && - list_empty(&hcrx->ccid3hcrx_li_hist)) { - struct dccp_li_hist_entry *li_tail; + if (list_empty(&hcrx->ccid3hcrx_li_hist)) { + if (!dccp_li_hist_interval_new(ccid3_li_hist, + &hcrx->ccid3hcrx_li_hist, seq_loss, win_loss)) + return; - li_tail = dccp_li_hist_interval_new(ccid3_li_hist, - &hcrx->ccid3hcrx_li_hist, - seq_loss, win_loss); - if (li_tail == NULL) + next = (struct dccp_li_hist_entry *) + hcrx->ccid3hcrx_li_hist.next; + next->dccplih_interval = ccid3_hc_rx_calc_first_li(sk); + } else { + struct dccp_li_hist_entry *entry; + struct list_head *tail; + + head = (struct dccp_li_hist_entry *) + hcrx->ccid3hcrx_li_hist.next; + /* FIXME win count check removed as was wrong */ + /* should make this check with receive history */ + /* and compare there as per section 10.2 of RFC4342 */ + + /* new loss event detected */ + /* calculate last interval length */ + seq_temp = dccp_delta_seqno(head->dccplih_seqno, seq_loss); + entry = dccp_li_hist_entry_new(ccid3_li_hist, SLAB_ATOMIC); + + if (entry == NULL) { + printk(KERN_CRIT "%s: out of memory\n",__FUNCTION__); + dump_stack(); return; - li_tail->dccplih_interval = ccid3_hc_rx_calc_first_li(sk); - } else - LIMIT_NETDEBUG(KERN_WARNING "%s: FIXME: find end of " - "interval\n", __FUNCTION__); + } + + list_add(&entry->dccplih_node, &hcrx->ccid3hcrx_li_hist); + + tail = hcrx->ccid3hcrx_li_hist.prev; + list_del(tail); + kmem_cache_free(ccid3_li_hist->dccplih_slab, tail); + + /* Create the newest interval */ + entry->dccplih_seqno = seq_loss; + entry->dccplih_interval = seq_temp; + entry->dccplih_win_count = win_loss; + } } -static void ccid3_hc_rx_detect_loss(struct sock *sk) +static int ccid3_hc_rx_detect_loss(struct sock *sk, + struct dccp_rx_hist_entry *packet) { struct ccid3_hc_rx_sock *hcrx = ccid3_hc_rx_sk(sk); - u8 win_loss; - const u64 seq_loss = dccp_rx_hist_detect_loss(&hcrx->ccid3hcrx_hist, - &hcrx->ccid3hcrx_li_hist, - &win_loss); + struct dccp_rx_hist_entry *rx_hist = dccp_rx_hist_head(&hcrx->ccid3hcrx_hist); + u64 seqno = packet->dccphrx_seqno; + u64 tmp_seqno; + int loss = 0; + u8 ccval; + + + tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss; + + if (!rx_hist || + follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) { + hcrx->ccid3hcrx_seqno_nonloss = seqno; + hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval; + goto detect_out; + } + - ccid3_hc_rx_update_li(sk, seq_loss, win_loss); + while (dccp_delta_seqno(hcrx->ccid3hcrx_seqno_nonloss, seqno) + > TFRC_RECV_NUM_LATE_LOSS) { + loss = 1; + ccid3_hc_rx_update_li(sk, hcrx->ccid3hcrx_seqno_nonloss, + hcrx->ccid3hcrx_ccval_nonloss); + tmp_seqno = hcrx->ccid3hcrx_seqno_nonloss; + dccp_inc_seqno(&tmp_seqno); + hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno; + dccp_inc_seqno(&tmp_seqno); + while (dccp_rx_hist_find_entry(&hcrx->ccid3hcrx_hist, + tmp_seqno, &ccval)) { + hcrx->ccid3hcrx_seqno_nonloss = tmp_seqno; + hcrx->ccid3hcrx_ccval_nonloss = ccval; + dccp_inc_seqno(&tmp_seqno); + } + } + + /* FIXME - this code could be simplified with above while */ + /* but works at moment */ + if (follows48(packet->dccphrx_seqno, hcrx->ccid3hcrx_seqno_nonloss)) { + hcrx->ccid3hcrx_seqno_nonloss = seqno; + hcrx->ccid3hcrx_ccval_nonloss = packet->dccphrx_ccval; + } + +detect_out: + dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist, + &hcrx->ccid3hcrx_li_hist, packet, + hcrx->ccid3hcrx_seqno_nonloss); + return loss; } static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) @@ -916,8 +1004,8 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) struct dccp_rx_hist_entry *packet; struct timeval now; u8 win_count; - u32 p_prev, r_sample, t_elapsed; - int ins; + u32 p_prev, rtt_prev, r_sample, t_elapsed; + int loss; BUG_ON(hcrx == NULL || !(hcrx->ccid3hcrx_state == TFRC_RSTATE_NO_DATA || @@ -932,7 +1020,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) case DCCP_PKT_DATAACK: if (opt_recv->dccpor_timestamp_echo == 0) break; - p_prev = hcrx->ccid3hcrx_rtt; + rtt_prev = hcrx->ccid3hcrx_rtt; dccp_timestamp(sk, &now); timeval_sub_usecs(&now, opt_recv->dccpor_timestamp_echo * 10); r_sample = timeval_usecs(&now); @@ -951,8 +1039,8 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) hcrx->ccid3hcrx_rtt = (hcrx->ccid3hcrx_rtt * 9) / 10 + r_sample / 10; - if (p_prev != hcrx->ccid3hcrx_rtt) - ccid3_pr_debug("%s, New RTT=%luus, elapsed time=%u\n", + if (rtt_prev != hcrx->ccid3hcrx_rtt) + ccid3_pr_debug("%s, New RTT=%uus, elapsed time=%u\n", dccp_role(sk), hcrx->ccid3hcrx_rtt, opt_recv->dccpor_elapsed_time); break; @@ -973,8 +1061,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) win_count = packet->dccphrx_ccval; - ins = dccp_rx_hist_add_packet(ccid3_rx_hist, &hcrx->ccid3hcrx_hist, - &hcrx->ccid3hcrx_li_hist, packet); + loss = ccid3_hc_rx_detect_loss(sk, packet); if (DCCP_SKB_CB(skb)->dccpd_type == DCCP_PKT_ACK) return; @@ -991,7 +1078,7 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) case TFRC_RSTATE_DATA: hcrx->ccid3hcrx_bytes_recv += skb->len - dccp_hdr(skb)->dccph_doff * 4; - if (ins != 0) + if (loss) break; dccp_timestamp(sk, &now); @@ -1012,7 +1099,6 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) ccid3_pr_debug("%s, sk=%p(%s), data loss! Reacting...\n", dccp_role(sk), sk, dccp_state_name(sk->sk_state)); - ccid3_hc_rx_detect_loss(sk); p_prev = hcrx->ccid3hcrx_p; /* Calculate loss event rate */ @@ -1022,6 +1108,9 @@ static void ccid3_hc_rx_packet_recv(struct sock *sk, struct sk_buff *skb) /* Scaling up by 1000000 as fixed decimal */ if (i_mean != 0) hcrx->ccid3hcrx_p = 1000000 / i_mean; + } else { + printk(KERN_CRIT "%s: empty loss hist\n",__FUNCTION__); + dump_stack(); } if (hcrx->ccid3hcrx_p > p_prev) { diff --git a/net/dccp/ccids/ccid3.h b/net/dccp/ccids/ccid3.h index 22cb9f80a09..0a2cb7536d2 100644 --- a/net/dccp/ccids/ccid3.h +++ b/net/dccp/ccids/ccid3.h @@ -120,9 +120,10 @@ struct ccid3_hc_rx_sock { #define ccid3hcrx_x_recv ccid3hcrx_tfrc.tfrcrx_x_recv #define ccid3hcrx_rtt ccid3hcrx_tfrc.tfrcrx_rtt #define ccid3hcrx_p ccid3hcrx_tfrc.tfrcrx_p - u64 ccid3hcrx_seqno_last_counter:48, + u64 ccid3hcrx_seqno_nonloss:48, + ccid3hcrx_ccval_nonloss:4, ccid3hcrx_state:8, - ccid3hcrx_last_counter:4; + ccid3hcrx_ccval_last_counter:4; u32 ccid3hcrx_bytes_recv; struct timeval ccid3hcrx_tstamp_last_feedback; struct timeval ccid3hcrx_tstamp_last_ack; diff --git a/net/dccp/ccids/lib/loss_interval.c b/net/dccp/ccids/lib/loss_interval.c index b93d9fc98cb..906c81ab9d4 100644 --- a/net/dccp/ccids/lib/loss_interval.c +++ b/net/dccp/ccids/lib/loss_interval.c @@ -12,6 +12,7 @@ */ #include <linux/module.h> +#include <net/sock.h> #include "loss_interval.h" @@ -90,13 +91,13 @@ u32 dccp_li_hist_calc_i_mean(struct list_head *list) u32 w_tot = 0; list_for_each_entry_safe(li_entry, li_next, list, dccplih_node) { - if (i < DCCP_LI_HIST_IVAL_F_LENGTH) { + if (li_entry->dccplih_interval != ~0) { i_tot0 += li_entry->dccplih_interval * dccp_li_hist_w[i]; w_tot += dccp_li_hist_w[i]; + if (i != 0) + i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1]; } - if (i != 0) - i_tot1 += li_entry->dccplih_interval * dccp_li_hist_w[i - 1]; if (++i > DCCP_LI_HIST_IVAL_F_LENGTH) break; @@ -107,37 +108,36 @@ u32 dccp_li_hist_calc_i_mean(struct list_head *list) i_tot = max(i_tot0, i_tot1); - /* FIXME: Why do we do this? -Ian McDonald */ - if (i_tot * 4 < w_tot) - i_tot = w_tot * 4; + if (!w_tot) { + LIMIT_NETDEBUG(KERN_WARNING "%s: w_tot = 0\n", __FUNCTION__); + return 1; + } - return i_tot * 4 / w_tot; + return i_tot / w_tot; } EXPORT_SYMBOL_GPL(dccp_li_hist_calc_i_mean); -struct dccp_li_hist_entry *dccp_li_hist_interval_new(struct dccp_li_hist *hist, - struct list_head *list, - const u64 seq_loss, - const u8 win_loss) +int dccp_li_hist_interval_new(struct dccp_li_hist *hist, + struct list_head *list, const u64 seq_loss, const u8 win_loss) { - struct dccp_li_hist_entry *tail = NULL, *entry; + struct dccp_li_hist_entry *entry; int i; - for (i = 0; i <= DCCP_LI_HIST_IVAL_F_LENGTH; ++i) { + for (i = 0; i < DCCP_LI_HIST_IVAL_F_LENGTH; i++) { entry = dccp_li_hist_entry_new(hist, SLAB_ATOMIC); if (entry == NULL) { dccp_li_hist_purge(hist, list); - return NULL; + dump_stack(); + return 0; } - if (tail == NULL) - tail = entry; + entry->dccplih_interval = ~0; list_add(&entry->dccplih_node, list); } entry->dccplih_seqno = seq_loss; entry->dccplih_win_count = win_loss; - return tail; + return 1; } EXPORT_SYMBOL_GPL(dccp_li_hist_interval_new); diff --git a/net/dccp/ccids/lib/loss_interval.h b/net/dccp/ccids/lib/loss_interval.h index dcb370a53f5..0ae85f0340b 100644 --- a/net/dccp/ccids/lib/loss_interval.h +++ b/net/dccp/ccids/lib/loss_interval.h @@ -52,9 +52,6 @@ extern void dccp_li_hist_purge(struct dccp_li_hist *hist, extern u32 dccp_li_hist_calc_i_mean(struct list_head *list); -extern struct dccp_li_hist_entry * - dccp_li_hist_interval_new(struct dccp_li_hist *hist, - struct list_head *list, - const u64 seq_loss, - const u8 win_loss); +extern int dccp_li_hist_interval_new(struct dccp_li_hist *hist, + struct list_head *list, const u64 seq_loss, const u8 win_loss); #endif /* _DCCP_LI_HIST_ */ diff --git a/net/dccp/ccids/lib/packet_history.c b/net/dccp/ccids/lib/packet_history.c index 420c60f8604..b876c9c81c6 100644 --- a/net/dccp/ccids/lib/packet_history.c +++ b/net/dccp/ccids/lib/packet_history.c @@ -112,64 +112,27 @@ struct dccp_rx_hist_entry * EXPORT_SYMBOL_GPL(dccp_rx_hist_find_data_packet); -int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, +void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, struct list_head *rx_list, struct list_head *li_list, - struct dccp_rx_hist_entry *packet) + struct dccp_rx_hist_entry *packet, + u64 nonloss_seqno) { - struct dccp_rx_hist_entry *entry, *next, *iter; + struct dccp_rx_hist_entry *entry, *next; u8 num_later = 0; - iter = dccp_rx_hist_head(rx_list); - if (iter == NULL) - dccp_rx_hist_add_entry(rx_list, packet); - else { - const u64 seqno = packet->dccphrx_seqno; - - if (after48(seqno, iter->dccphrx_seqno)) - dccp_rx_hist_add_entry(rx_list, packet); - else { - if (dccp_rx_hist_entry_data_packet(iter)) - num_later = 1; - - list_for_each_entry_continue(iter, rx_list, - dccphrx_node) { - if (after48(seqno, iter->dccphrx_seqno)) { - dccp_rx_hist_add_entry(&iter->dccphrx_node, - packet); - goto trim_history; - } - - if (dccp_rx_hist_entry_data_packet(iter)) - num_later++; - - if (num_later == TFRC_RECV_NUM_LATE_LOSS) { - dccp_rx_hist_entry_delete(hist, packet); - return 1; - } - } - - if (num_later < TFRC_RECV_NUM_LATE_LOSS) - dccp_rx_hist_add_entry(rx_list, packet); - /* - * FIXME: else what? should we destroy the packet - * like above? - */ - } - } + list_add(&packet->dccphrx_node, rx_list); -trim_history: - /* - * Trim history (remove all packets after the NUM_LATE_LOSS + 1 - * data packets) - */ num_later = TFRC_RECV_NUM_LATE_LOSS + 1; if (!list_empty(li_list)) { list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { if (num_later == 0) { - list_del_init(&entry->dccphrx_node); - dccp_rx_hist_entry_delete(hist, entry); + if (after48(nonloss_seqno, + entry->dccphrx_seqno)) { + list_del_init(&entry->dccphrx_node); + dccp_rx_hist_entry_delete(hist, entry); + } } else if (dccp_rx_hist_entry_data_packet(entry)) --num_later; } @@ -217,94 +180,10 @@ trim_history: --num_later; } } - - return 0; } EXPORT_SYMBOL_GPL(dccp_rx_hist_add_packet); -u64 dccp_rx_hist_detect_loss(struct list_head *rx_list, - struct list_head *li_list, u8 *win_loss) -{ - struct dccp_rx_hist_entry *entry, *next, *packet; - struct dccp_rx_hist_entry *a_loss = NULL; - struct dccp_rx_hist_entry *b_loss = NULL; - u64 seq_loss = DCCP_MAX_SEQNO + 1; - u8 num_later = TFRC_RECV_NUM_LATE_LOSS; - - list_for_each_entry_safe(entry, next, rx_list, dccphrx_node) { - if (num_later == 0) { - b_loss = entry; - break; - } else if (dccp_rx_hist_entry_data_packet(entry)) - --num_later; - } - - if (b_loss == NULL) - goto out; - - num_later = 1; - list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) { - if (num_later == 0) { - a_loss = entry; - break; - } else if (dccp_rx_hist_entry_data_packet(entry)) - --num_later; - } - - if (a_loss == NULL) { - if (list_empty(li_list)) { - /* no loss event have occured yet */ - LIMIT_NETDEBUG("%s: TODO: find a lost data packet by " - "comparing to initial seqno\n", - __FUNCTION__); - goto out; - } else { - LIMIT_NETDEBUG("%s: Less than 4 data pkts in history!", - __FUNCTION__); - goto out; - } - } - - /* Locate a lost data packet */ - entry = packet = b_loss; - list_for_each_entry_safe_continue(entry, next, rx_list, dccphrx_node) { - u64 delta = dccp_delta_seqno(entry->dccphrx_seqno, - packet->dccphrx_seqno); - - if (delta != 0) { - if (dccp_rx_hist_entry_data_packet(packet)) - --delta; - /* - * FIXME: check this, probably this % usage is because - * in earlier drafts the ndp count was just 8 bits - * long, but now it cam be up to 24 bits long. - */ -#if 0 - if (delta % DCCP_NDP_LIMIT != - (packet->dccphrx_ndp - - entry->dccphrx_ndp) % DCCP_NDP_LIMIT) -#endif - if (delta != packet->dccphrx_ndp - entry->dccphrx_ndp) { - seq_loss = entry->dccphrx_seqno; - dccp_inc_seqno(&seq_loss); - } - } - packet = entry; - if (packet == a_loss) - break; - } -out: - if (seq_loss != DCCP_MAX_SEQNO + 1) - *win_loss = a_loss->dccphrx_ccval; - else - *win_loss = 0; /* Paranoia */ - - return seq_loss; -} - -EXPORT_SYMBOL_GPL(dccp_rx_hist_detect_loss); - struct dccp_tx_hist *dccp_tx_hist_new(const char *name) { struct dccp_tx_hist *hist = kmalloc(sizeof(*hist), GFP_ATOMIC); diff --git a/net/dccp/ccids/lib/packet_history.h b/net/dccp/ccids/lib/packet_history.h index aea9c5d7091..067cf1c85a3 100644 --- a/net/dccp/ccids/lib/packet_history.h +++ b/net/dccp/ccids/lib/packet_history.h @@ -166,12 +166,6 @@ static inline void dccp_rx_hist_entry_delete(struct dccp_rx_hist *hist, extern void dccp_rx_hist_purge(struct dccp_rx_hist *hist, struct list_head *list); -static inline void dccp_rx_hist_add_entry(struct list_head *list, - struct dccp_rx_hist_entry *entry) -{ - list_add(&entry->dccphrx_node, list); -} - static inline struct dccp_rx_hist_entry * dccp_rx_hist_head(struct list_head *list) { @@ -190,10 +184,11 @@ static inline int entry->dccphrx_type == DCCP_PKT_DATAACK; } -extern int dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, +extern void dccp_rx_hist_add_packet(struct dccp_rx_hist *hist, struct list_head *rx_list, struct list_head *li_list, - struct dccp_rx_hist_entry *packet); + struct dccp_rx_hist_entry *packet, + u64 nonloss_seqno); extern u64 dccp_rx_hist_detect_loss(struct list_head *rx_list, struct list_head *li_list, u8 *win_loss); |