From 1122a26f2abe4245ccdaed95ec23f63fe086b332 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Sep 2009 13:52:12 +0200 Subject: block: use normal I/O path for discard requests prepare_discard_fn() was being called in a place where memory allocation was effectively impossible. This makes it inappropriate for all but the most trivial translations of Linux's DISCARD operation to the block command set. Additionally adding a payload there makes the ownership of the bio backing unclear as it's now allocated by the device driver and not the submitter as usual. It is replaced with QUEUE_FLAG_DISCARD which is used to indicate whether the queue supports discard operations or not. blkdev_issue_discard now allocates a one-page, sector-length payload which is the right thing for the common ATA and SCSI implementations. The mtd implementation of prepare_discard_fn() is replaced with simply checking for the request being a discard. Largely based on a previous patch from Matthew Wilcox which did the prepare_discard_fn but not the different payload allocation yet. 
Signed-off-by: Christoph Hellwig --- include/linux/blkdev.h | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e23a86cae5a..f62d45e8761 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -82,7 +82,6 @@ enum rq_cmd_type_bits { enum { REQ_LB_OP_EJECT = 0x40, /* eject request */ REQ_LB_OP_FLUSH = 0x41, /* flush request */ - REQ_LB_OP_DISCARD = 0x42, /* discard sectors */ }; /* @@ -261,7 +260,6 @@ typedef void (request_fn_proc) (struct request_queue *q); typedef int (make_request_fn) (struct request_queue *q, struct bio *bio); typedef int (prep_rq_fn) (struct request_queue *, struct request *); typedef void (unplug_fn) (struct request_queue *); -typedef int (prepare_discard_fn) (struct request_queue *, struct request *); struct bio_vec; struct bvec_merge_data { @@ -340,7 +338,6 @@ struct request_queue make_request_fn *make_request_fn; prep_rq_fn *prep_rq_fn; unplug_fn *unplug_fn; - prepare_discard_fn *prepare_discard_fn; merge_bvec_fn *merge_bvec_fn; prepare_flush_fn *prepare_flush_fn; softirq_done_fn *softirq_done_fn; @@ -460,6 +457,7 @@ struct request_queue #define QUEUE_FLAG_VIRT QUEUE_FLAG_NONROT /* paravirt device */ #define QUEUE_FLAG_IO_STAT 15 /* do IO stats */ #define QUEUE_FLAG_CQ 16 /* hardware does queuing */ +#define QUEUE_FLAG_DISCARD 17 /* supports DISCARD */ #define QUEUE_FLAG_DEFAULT ((1 << QUEUE_FLAG_IO_STAT) | \ (1 << QUEUE_FLAG_CLUSTER) | \ @@ -591,6 +589,7 @@ enum { #define blk_queue_flushing(q) ((q)->ordseq) #define blk_queue_stackable(q) \ test_bit(QUEUE_FLAG_STACKABLE, &(q)->queue_flags) +#define blk_queue_discard(q) test_bit(QUEUE_FLAG_DISCARD, &(q)->queue_flags) #define blk_fs_request(rq) ((rq)->cmd_type == REQ_TYPE_FS) #define blk_pc_request(rq) ((rq)->cmd_type == REQ_TYPE_BLOCK_PC) @@ -955,7 +954,6 @@ extern void blk_queue_merge_bvec(struct request_queue *, merge_bvec_fn *); extern void blk_queue_dma_alignment(struct 
request_queue *, int); extern void blk_queue_update_dma_alignment(struct request_queue *, int); extern void blk_queue_softirq_done(struct request_queue *, softirq_done_fn *); -extern void blk_queue_set_discard(struct request_queue *, prepare_discard_fn *); extern void blk_queue_rq_timed_out(struct request_queue *, rq_timed_out_fn *); extern void blk_queue_rq_timeout(struct request_queue *, unsigned int); extern struct backing_dev_info *blk_get_backing_dev_info(struct block_device *bdev); -- cgit v1.2.3 From ca80650cfbde5b17a5fa957a261c7973f84599a7 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 30 Sep 2009 13:54:20 +0200 Subject: block: allow large discard requests Currently we set the bio size to the byte equivalent of the blocks to be trimmed when submitting the initial DISCARD ioctl. That means it is subject to the max_hw_sectors limitation of the HBA which is much lower than the size of a DISCARD request we can support. Add a separate max_discard_sectors tunable to limit the size for discard requests. We limit the max discard request size in bytes to 32bit as that is the limit for bio->bi_size. This could be much larger if we had a way to pass that information through the block layer. 
Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index f62d45e8761..1a03b715dfa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -311,6 +311,7 @@ struct queue_limits { unsigned int alignment_offset; unsigned int io_min; unsigned int io_opt; + unsigned int max_discard_sectors; unsigned short logical_block_size; unsigned short max_hw_segments; @@ -928,6 +929,8 @@ extern void blk_queue_max_hw_sectors(struct request_queue *, unsigned int); extern void blk_queue_max_phys_segments(struct request_queue *, unsigned short); extern void blk_queue_max_hw_segments(struct request_queue *, unsigned short); extern void blk_queue_max_segment_size(struct request_queue *, unsigned int); +extern void blk_queue_max_discard_sectors(struct request_queue *q, + unsigned int max_discard_sectors); extern void blk_queue_logical_block_size(struct request_queue *, unsigned short); extern void blk_queue_physical_block_size(struct request_queue *, unsigned short); extern void blk_queue_alignment_offset(struct request_queue *q, -- cgit v1.2.3 From 1a35e0f6443f4266dad4c569c55c57a9032596fa Mon Sep 17 00:00:00 2001 From: Jun'ichi Nomura Date: Thu, 1 Oct 2009 21:16:13 +0200 Subject: Add a tracepoint for block request remapping Since 2.6.31 now has request-based device-mapper, it's useful to have a tracepoint for request-remapping as well as bio-remapping. This patch adds a tracepoint for request-remapping, trace_block_rq_remap(). 
Signed-off-by: Kiyoshi Ueda Signed-off-by: Jun'ichi Nomura Cc: Alasdair G Kergon Cc: Li Zefan Signed-off-by: Jens Axboe --- include/linux/blktrace_api.h | 2 +- include/trace/events/block.h | 33 +++++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/blktrace_api.h b/include/linux/blktrace_api.h index 622939a2329..3b73b9992b2 100644 --- a/include/linux/blktrace_api.h +++ b/include/linux/blktrace_api.h @@ -212,7 +212,7 @@ extern struct attribute_group blk_trace_attr_group; # define blk_trace_startstop(q, start) (-ENOTTY) # define blk_trace_remove(q) (-ENOTTY) # define blk_add_trace_msg(q, fmt, ...) do { } while (0) -# define blk_trace_remove_sysfs(struct device *dev) do { } while (0) +# define blk_trace_remove_sysfs(dev) do { } while (0) static inline int blk_trace_init_sysfs(struct device *dev) { return 0; diff --git a/include/trace/events/block.h b/include/trace/events/block.h index d86af94691c..00405b5f624 100644 --- a/include/trace/events/block.h +++ b/include/trace/events/block.h @@ -488,6 +488,39 @@ TRACE_EVENT(block_remap, (unsigned long long)__entry->old_sector) ); +TRACE_EVENT(block_rq_remap, + + TP_PROTO(struct request_queue *q, struct request *rq, dev_t dev, + sector_t from), + + TP_ARGS(q, rq, dev, from), + + TP_STRUCT__entry( + __field( dev_t, dev ) + __field( sector_t, sector ) + __field( unsigned int, nr_sector ) + __field( dev_t, old_dev ) + __field( sector_t, old_sector ) + __array( char, rwbs, 6 ) + ), + + TP_fast_assign( + __entry->dev = disk_devt(rq->rq_disk); + __entry->sector = blk_rq_pos(rq); + __entry->nr_sector = blk_rq_sectors(rq); + __entry->old_dev = dev; + __entry->old_sector = from; + blk_fill_rwbs_rq(__entry->rwbs, rq); + ), + + TP_printk("%d,%d %s %llu + %u <- (%d,%d) %llu", + MAJOR(__entry->dev), MINOR(__entry->dev), __entry->rwbs, + (unsigned long long)__entry->sector, + __entry->nr_sector, + MAJOR(__entry->old_dev), MINOR(__entry->old_dev), + (unsigned long 
long)__entry->old_sector) +); + #endif /* _TRACE_BLOCK_H */ /* This part must be outside protection */ -- cgit v1.2.3 From b411b3637fa71fce9cf2acf0639009500f5892fe Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Fri, 25 Sep 2009 16:07:19 -0700 Subject: The DRBD driver Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd.h | 349 +++++++++++++++++++++++++++++++++++++++++ include/linux/drbd_limits.h | 137 ++++++++++++++++ include/linux/drbd_nl.h | 137 ++++++++++++++++ include/linux/drbd_tag_magic.h | 83 ++++++++++ include/linux/lru_cache.h | 294 ++++++++++++++++++++++++++++++++++ 5 files changed, 1000 insertions(+) create mode 100644 include/linux/drbd.h create mode 100644 include/linux/drbd_limits.h create mode 100644 include/linux/drbd_nl.h create mode 100644 include/linux/drbd_tag_magic.h create mode 100644 include/linux/lru_cache.h (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h new file mode 100644 index 00000000000..69dc711f37b --- /dev/null +++ b/include/linux/drbd.h @@ -0,0 +1,349 @@ +/* + drbd.h + Kernel module for 2.6.x Kernels + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. + Copyright (C) 2001-2008, Philipp Reisner . + Copyright (C) 2001-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. 
If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + +*/ +#ifndef DRBD_H +#define DRBD_H +#include <linux/connector.h> +#include <asm/types.h> + +#ifdef __KERNEL__ +#include <linux/types.h> +#include <asm/byteorder.h> +#else +#include <sys/types.h> +#include <sys/wait.h> +#include <limits.h> + +/* Although the Linux source code makes a difference between + generic endianness and the bitfields' endianness, there is no + architecture as of Linux-2.6.24-rc4 where the bitfields' endianness + does not match the generic endianness. */ + +#if __BYTE_ORDER == __LITTLE_ENDIAN +#define __LITTLE_ENDIAN_BITFIELD +#elif __BYTE_ORDER == __BIG_ENDIAN +#define __BIG_ENDIAN_BITFIELD +#else +# error "sorry, weird endianness on this box" +#endif + +#endif + + +extern const char *drbd_buildtag(void); +#define REL_VERSION "8.3.3rc2" +#define API_VERSION 88 +#define PRO_VERSION_MIN 86 +#define PRO_VERSION_MAX 91 + + +enum drbd_io_error_p { + EP_PASS_ON, /* FIXME should this better be named "Ignore"? */ + EP_CALL_HELPER, + EP_DETACH +}; + +enum drbd_fencing_p { + FP_DONT_CARE, + FP_RESOURCE, + FP_STONITH +}; + +enum drbd_disconnect_p { + DP_RECONNECT, + DP_DROP_NET_CONF, + DP_FREEZE_IO +}; + +enum drbd_after_sb_p { + ASB_DISCONNECT, + ASB_DISCARD_YOUNGER_PRI, + ASB_DISCARD_OLDER_PRI, + ASB_DISCARD_ZERO_CHG, + ASB_DISCARD_LEAST_CHG, + ASB_DISCARD_LOCAL, + ASB_DISCARD_REMOTE, + ASB_CONSENSUS, + ASB_DISCARD_SECONDARY, + ASB_CALL_HELPER, + ASB_VIOLENTLY +}; + +/* KEEP the order, do not delete or insert. Only append. 
*/ +enum drbd_ret_codes { + ERR_CODE_BASE = 100, + NO_ERROR = 101, + ERR_LOCAL_ADDR = 102, + ERR_PEER_ADDR = 103, + ERR_OPEN_DISK = 104, + ERR_OPEN_MD_DISK = 105, + ERR_DISK_NOT_BDEV = 107, + ERR_MD_NOT_BDEV = 108, + ERR_DISK_TO_SMALL = 111, + ERR_MD_DISK_TO_SMALL = 112, + ERR_BDCLAIM_DISK = 114, + ERR_BDCLAIM_MD_DISK = 115, + ERR_MD_IDX_INVALID = 116, + ERR_IO_MD_DISK = 118, + ERR_MD_INVALID = 119, + ERR_AUTH_ALG = 120, + ERR_AUTH_ALG_ND = 121, + ERR_NOMEM = 122, + ERR_DISCARD = 123, + ERR_DISK_CONFIGURED = 124, + ERR_NET_CONFIGURED = 125, + ERR_MANDATORY_TAG = 126, + ERR_MINOR_INVALID = 127, + ERR_INTR = 129, /* EINTR */ + ERR_RESIZE_RESYNC = 130, + ERR_NO_PRIMARY = 131, + ERR_SYNC_AFTER = 132, + ERR_SYNC_AFTER_CYCLE = 133, + ERR_PAUSE_IS_SET = 134, + ERR_PAUSE_IS_CLEAR = 135, + ERR_PACKET_NR = 137, + ERR_NO_DISK = 138, + ERR_NOT_PROTO_C = 139, + ERR_NOMEM_BITMAP = 140, + ERR_INTEGRITY_ALG = 141, /* DRBD 8.2 only */ + ERR_INTEGRITY_ALG_ND = 142, /* DRBD 8.2 only */ + ERR_CPU_MASK_PARSE = 143, /* DRBD 8.2 only */ + ERR_CSUMS_ALG = 144, /* DRBD 8.2 only */ + ERR_CSUMS_ALG_ND = 145, /* DRBD 8.2 only */ + ERR_VERIFY_ALG = 146, /* DRBD 8.2 only */ + ERR_VERIFY_ALG_ND = 147, /* DRBD 8.2 only */ + ERR_CSUMS_RESYNC_RUNNING= 148, /* DRBD 8.2 only */ + ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */ + ERR_DATA_NOT_CURRENT = 150, + ERR_CONNECTED = 151, /* DRBD 8.3 only */ + + /* insert new ones above this line */ + AFTER_LAST_ERR_CODE +}; + +#define DRBD_PROT_A 1 +#define DRBD_PROT_B 2 +#define DRBD_PROT_C 3 + +enum drbd_role { + R_UNKNOWN = 0, + R_PRIMARY = 1, /* role */ + R_SECONDARY = 2, /* role */ + R_MASK = 3, +}; + +/* The order of these constants is important. + * The lower ones (<C_WF_REPORT_PARAMS) indicate + * that there is no socket! + * >=C_WF_REPORT_PARAMS ==> There is a socket + */ +enum drbd_conns { + C_STANDALONE, + C_DISCONNECTING, /* Temporary state on the way to StandAlone. 
*/ + C_UNCONNECTED, /* >= C_UNCONNECTED -> inc_net() succeeds */ + + /* These temporary states are all used on the way + * from >= C_CONNECTED to Unconnected. + * The 'disconnect reason' states + * I do not allow to change between them. */ + C_TIMEOUT, + C_BROKEN_PIPE, + C_NETWORK_FAILURE, + C_PROTOCOL_ERROR, + C_TEAR_DOWN, + + C_WF_CONNECTION, + C_WF_REPORT_PARAMS, /* we have a socket */ + C_CONNECTED, /* we have introduced each other */ + C_STARTING_SYNC_S, /* starting full sync by admin request. */ + C_STARTING_SYNC_T, /* starting full sync by admin request. */ + C_WF_BITMAP_S, + C_WF_BITMAP_T, + C_WF_SYNC_UUID, + + /* All SyncStates are tested with this comparison + * xx >= C_SYNC_SOURCE && xx <= C_PAUSED_SYNC_T */ + C_SYNC_SOURCE, + C_SYNC_TARGET, + C_VERIFY_S, + C_VERIFY_T, + C_PAUSED_SYNC_S, + C_PAUSED_SYNC_T, + C_MASK = 31 +}; + +enum drbd_disk_state { + D_DISKLESS, + D_ATTACHING, /* In the process of reading the meta-data */ + D_FAILED, /* Becomes D_DISKLESS as soon as we told it to the peer */ + /* when >= D_FAILED it is legal to access mdev->bc */ + D_NEGOTIATING, /* Late attaching state, we need to talk to the peer */ + D_INCONSISTENT, + D_OUTDATED, + D_UNKNOWN, /* Only used for the peer, never for myself */ + D_CONSISTENT, /* Might be D_OUTDATED, might be D_UP_TO_DATE ... */ + D_UP_TO_DATE, /* Only this disk state allows applications' IO ! */ + D_MASK = 15 +}; + +union drbd_state { +/* According to gcc's docs is the ... + * The order of allocation of bit-fields within a unit (C90 6.5.2.1, C99 6.7.2.1). + * Determined by ABI. + * pointed out by Maxim Uvarov q + * even though we transmit as "cpu_to_be32(state)", + * the offsets of the bitfields still need to be swapped + * on different endianness. 
+ */ + struct { +#if defined(__LITTLE_ENDIAN_BITFIELD) + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned susp:1 ; /* 2/2 IO suspended no/yes */ + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ + unsigned peer_isp:1 ; + unsigned user_isp:1 ; + unsigned _pad:11; /* 0 unused */ +#elif defined(__BIG_ENDIAN_BITFIELD) + unsigned _pad:11; /* 0 unused */ + unsigned user_isp:1 ; + unsigned peer_isp:1 ; + unsigned aftr_isp:1 ; /* isp .. imposed sync pause */ + unsigned susp:1 ; /* 2/2 IO suspended no/yes */ + unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */ + unsigned conn:5 ; /* 17/32 cstates */ + unsigned peer:2 ; /* 3/4 primary/secondary/unknown */ + unsigned role:2 ; /* 3/4 primary/secondary/unknown */ +#else +# error "this endianess is not supported" +#endif + }; + unsigned int i; +}; + +enum drbd_state_ret_codes { + SS_CW_NO_NEED = 4, + SS_CW_SUCCESS = 3, + SS_NOTHING_TO_DO = 2, + SS_SUCCESS = 1, + SS_UNKNOWN_ERROR = 0, /* Used to sleep longer in _drbd_request_state */ + SS_TWO_PRIMARIES = -1, + SS_NO_UP_TO_DATE_DISK = -2, + SS_NO_LOCAL_DISK = -4, + SS_NO_REMOTE_DISK = -5, + SS_CONNECTED_OUTDATES = -6, + SS_PRIMARY_NOP = -7, + SS_RESYNC_RUNNING = -8, + SS_ALREADY_STANDALONE = -9, + SS_CW_FAILED_BY_PEER = -10, + SS_IS_DISKLESS = -11, + SS_DEVICE_IN_USE = -12, + SS_NO_NET_CONFIG = -13, + SS_NO_VERIFY_ALG = -14, /* drbd-8.2 only */ + SS_NEED_CONNECTION = -15, /* drbd-8.2 only */ + SS_LOWER_THAN_OUTDATED = -16, + SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */ + SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */ + SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! 
*/ + SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */ +}; + +/* from drbd_strings.c */ +extern const char *drbd_conn_str(enum drbd_conns); +extern const char *drbd_role_str(enum drbd_role); +extern const char *drbd_disk_str(enum drbd_disk_state); +extern const char *drbd_set_st_err_str(enum drbd_state_ret_codes); + +#define SHARED_SECRET_MAX 64 + +#define MDF_CONSISTENT (1 << 0) +#define MDF_PRIMARY_IND (1 << 1) +#define MDF_CONNECTED_IND (1 << 2) +#define MDF_FULL_SYNC (1 << 3) +#define MDF_WAS_UP_TO_DATE (1 << 4) +#define MDF_PEER_OUT_DATED (1 << 5) +#define MDF_CRASHED_PRIMARY (1 << 6) + +enum drbd_uuid_index { + UI_CURRENT, + UI_BITMAP, + UI_HISTORY_START, + UI_HISTORY_END, + UI_SIZE, /* nl-packet: number of dirty bits */ + UI_FLAGS, /* nl-packet: flags */ + UI_EXTENDED_SIZE /* Everything. */ +}; + +enum drbd_timeout_flag { + UT_DEFAULT = 0, + UT_DEGRADED = 1, + UT_PEER_OUTDATED = 2, +}; + +#define UUID_JUST_CREATED ((__u64)4) + +#define DRBD_MAGIC 0x83740267 +#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC) + +/* these are of type "int" */ +#define DRBD_MD_INDEX_INTERNAL -1 +#define DRBD_MD_INDEX_FLEX_EXT -2 +#define DRBD_MD_INDEX_FLEX_INT -3 + +/* Start of the new netlink/connector stuff */ + +#define DRBD_NL_CREATE_DEVICE 0x01 +#define DRBD_NL_SET_DEFAULTS 0x02 + +/* The following line should be moved over to linux/connector.h + * when the time comes */ +#ifndef CN_IDX_DRBD +# define CN_IDX_DRBD 0x4 +/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */ +#endif +#define CN_VAL_DRBD 0x1 + +/* For searching a vacant cn_idx value */ +#define CN_IDX_STEP 6977 + +struct drbd_nl_cfg_req { + int packet_type; + unsigned int drbd_minor; + int flags; + unsigned short tag_list[]; +}; + +struct drbd_nl_cfg_reply { + int packet_type; + unsigned int minor; + int ret_code; /* enum ret_code or set_st_err_t */ + unsigned short tag_list[]; /* only used with get_* calls */ +}; + +#endif diff --git a/include/linux/drbd_limits.h 
b/include/linux/drbd_limits.h new file mode 100644 index 00000000000..9d067ce4696 --- /dev/null +++ b/include/linux/drbd_limits.h @@ -0,0 +1,137 @@ +/* + drbd_limits.h + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. +*/ + +/* + * Our current limitations. + * Some of them are hard limits, + * some of them are arbitrary range limits, that make it easier to provide + * feedback about nonsense settings for certain configurable values. + */ + +#ifndef DRBD_LIMITS_H +#define DRBD_LIMITS_H 1 + +#define DEBUG_RANGE_CHECK 0 + +#define DRBD_MINOR_COUNT_MIN 1 +#define DRBD_MINOR_COUNT_MAX 255 + +#define DRBD_DIALOG_REFRESH_MIN 0 +#define DRBD_DIALOG_REFRESH_MAX 600 + +/* valid port number */ +#define DRBD_PORT_MIN 1 +#define DRBD_PORT_MAX 0xffff + +/* startup { */ + /* if you want more than 3.4 days, disable */ +#define DRBD_WFC_TIMEOUT_MIN 0 +#define DRBD_WFC_TIMEOUT_MAX 300000 +#define DRBD_WFC_TIMEOUT_DEF 0 + +#define DRBD_DEGR_WFC_TIMEOUT_MIN 0 +#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000 +#define DRBD_DEGR_WFC_TIMEOUT_DEF 0 + +#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0 +#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000 +#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0 +/* }*/ + +/* net { */ + /* timeout, unit deciseconds (0.1 s) + * more than one minute timeout is not useful */ +#define DRBD_TIMEOUT_MIN 1 +#define DRBD_TIMEOUT_MAX 600 +#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */ + + /* active connection retries when C_WF_CONNECTION */ +#define DRBD_CONNECT_INT_MIN 1 +#define DRBD_CONNECT_INT_MAX 120 +#define DRBD_CONNECT_INT_DEF 10 /* seconds */ + + /* keep-alive probes when idle */ +#define DRBD_PING_INT_MIN 1 +#define DRBD_PING_INT_MAX 120 +#define DRBD_PING_INT_DEF 10 + + /* timeout for the ping packets.*/ +#define DRBD_PING_TIMEO_MIN 1 +#define DRBD_PING_TIMEO_MAX 100 +#define DRBD_PING_TIMEO_DEF 5 + + /* max number of write requests between write barriers */ +#define DRBD_MAX_EPOCH_SIZE_MIN 1 +#define DRBD_MAX_EPOCH_SIZE_MAX 20000 +#define DRBD_MAX_EPOCH_SIZE_DEF 2048 + + 
/* I don't think that a tcp send buffer of more than 10M is useful */ +#define DRBD_SNDBUF_SIZE_MIN 0 +#define DRBD_SNDBUF_SIZE_MAX (10<<20) +#define DRBD_SNDBUF_SIZE_DEF (2*65535) + +#define DRBD_RCVBUF_SIZE_MIN 0 +#define DRBD_RCVBUF_SIZE_MAX (10<<20) +#define DRBD_RCVBUF_SIZE_DEF (2*65535) + + /* @4k PageSize -> 128kB - 512MB */ +#define DRBD_MAX_BUFFERS_MIN 32 +#define DRBD_MAX_BUFFERS_MAX 131072 +#define DRBD_MAX_BUFFERS_DEF 2048 + + /* @4k PageSize -> 4kB - 512MB */ +#define DRBD_UNPLUG_WATERMARK_MIN 1 +#define DRBD_UNPLUG_WATERMARK_MAX 131072 +#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16) + + /* 0 is disabled. + * 200 should be more than enough even for very short timeouts */ +#define DRBD_KO_COUNT_MIN 0 +#define DRBD_KO_COUNT_MAX 200 +#define DRBD_KO_COUNT_DEF 0 +/* } */ + +/* syncer { */ + /* FIXME allow rate to be zero? */ +#define DRBD_RATE_MIN 1 +/* channel bonding 10 GbE, or other hardware */ +#define DRBD_RATE_MAX (4 << 20) +#define DRBD_RATE_DEF 250 /* kb/second */ + + /* less than 7 would hit performance unnecessarily. + * 3833 is the largest prime that still does fit + * into 64 sectors of activity log */ +#define DRBD_AL_EXTENTS_MIN 7 +#define DRBD_AL_EXTENTS_MAX 3833 +#define DRBD_AL_EXTENTS_DEF 127 + +#define DRBD_AFTER_MIN -1 +#define DRBD_AFTER_MAX 255 +#define DRBD_AFTER_DEF -1 + +/* } */ + +/* drbdsetup XY resize -d Z + * you are free to reduce the device size to nothing, if you want to. + * the upper limit with 64bit kernel, enough ram and flexible meta data + * is 16 TB, currently. */ +/* DRBD_MAX_SECTORS */ +#define DRBD_DISK_SIZE_SECT_MIN 0 +#define DRBD_DISK_SIZE_SECT_MAX (16 * (2LLU << 30)) +#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... 
*/ + +#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON +#define DRBD_FENCING_DEF FP_DONT_CARE +#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT +#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT +#define DRBD_AFTER_SB_2P_DEF ASB_DISCONNECT +#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT + +#define DRBD_MAX_BIO_BVECS_MIN 0 +#define DRBD_MAX_BIO_BVECS_MAX 128 +#define DRBD_MAX_BIO_BVECS_DEF 0 + +#undef RANGE +#endif diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h new file mode 100644 index 00000000000..db5721ad50d --- /dev/null +++ b/include/linux/drbd_nl.h @@ -0,0 +1,137 @@ +/* + PAKET( name, + TYPE ( pn, pr, member ) + ... + ) + + You may never reissue one of the pn arguments +*/ + +#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64) +#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined" +#endif + +NL_PACKET(primary, 1, + NL_BIT( 1, T_MAY_IGNORE, overwrite_peer) +) + +NL_PACKET(secondary, 2, ) + +NL_PACKET(disk_conf, 3, + NL_INT64( 2, T_MAY_IGNORE, disk_size) + NL_STRING( 3, T_MANDATORY, backing_dev, 128) + NL_STRING( 4, T_MANDATORY, meta_dev, 128) + NL_INTEGER( 5, T_MANDATORY, meta_dev_idx) + NL_INTEGER( 6, T_MAY_IGNORE, on_io_error) + NL_INTEGER( 7, T_MAY_IGNORE, fencing) + NL_BIT( 37, T_MAY_IGNORE, use_bmbv) + NL_BIT( 53, T_MAY_IGNORE, no_disk_flush) + NL_BIT( 54, T_MAY_IGNORE, no_md_flush) + /* 55 max_bio_size was available in 8.2.6rc2 */ + NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs) + NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier) + NL_BIT( 58, T_MAY_IGNORE, no_disk_drain) +) + +NL_PACKET(detach, 4, ) + +NL_PACKET(net_conf, 5, + NL_STRING( 8, T_MANDATORY, my_addr, 128) + NL_STRING( 9, T_MANDATORY, peer_addr, 128) + NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX) + NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX) + NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX) + NL_INTEGER( 14, T_MAY_IGNORE, timeout) + NL_INTEGER( 15, 
T_MANDATORY, wire_protocol) + NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int) + NL_INTEGER( 17, T_MAY_IGNORE, ping_int) + NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size) + NL_INTEGER( 19, T_MAY_IGNORE, max_buffers) + NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark) + NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size) + NL_INTEGER( 22, T_MAY_IGNORE, ko_count) + NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p) + NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p) + NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p) + NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict) + NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo) + NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size) + /* 59 addr_family was available in GIT, never released */ + NL_BIT( 60, T_MANDATORY, mind_af) + NL_BIT( 27, T_MAY_IGNORE, want_lose) + NL_BIT( 28, T_MAY_IGNORE, two_primaries) + NL_BIT( 41, T_MAY_IGNORE, always_asbp) + NL_BIT( 61, T_MAY_IGNORE, no_cork) + NL_BIT( 62, T_MANDATORY, auto_sndbuf_size) +) + +NL_PACKET(disconnect, 6, ) + +NL_PACKET(resize, 7, + NL_INT64( 29, T_MAY_IGNORE, resize_size) +) + +NL_PACKET(syncer_conf, 8, + NL_INTEGER( 30, T_MAY_IGNORE, rate) + NL_INTEGER( 31, T_MAY_IGNORE, after) + NL_INTEGER( 32, T_MAY_IGNORE, al_extents) + NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX) + NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32) + NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX) + NL_BIT( 65, T_MAY_IGNORE, use_rle) +) + +NL_PACKET(invalidate, 9, ) +NL_PACKET(invalidate_peer, 10, ) +NL_PACKET(pause_sync, 11, ) +NL_PACKET(resume_sync, 12, ) +NL_PACKET(suspend_io, 13, ) +NL_PACKET(resume_io, 14, ) +NL_PACKET(outdate, 15, ) +NL_PACKET(get_config, 16, ) +NL_PACKET(get_state, 17, + NL_INTEGER( 33, T_MAY_IGNORE, state_i) +) + +NL_PACKET(get_uuids, 18, + NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64))) + NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags) +) + +NL_PACKET(get_timeout_flag, 19, + NL_BIT( 36, T_MAY_IGNORE, use_degraded) +) + +NL_PACKET(call_helper, 20, + NL_STRING( 38, T_MAY_IGNORE, helper, 32) +) + +/* Tag nr 42 
already allocated in drbd-8.1 development. */ + +NL_PACKET(sync_progress, 23, + NL_INTEGER( 43, T_MAY_IGNORE, sync_progress) +) + +NL_PACKET(dump_ee, 24, + NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32) + NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX) + NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX) + NL_INT64( 48, T_MAY_IGNORE, ee_sector) + NL_INT64( 49, T_MAY_IGNORE, ee_block_id) + NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10) +) + +NL_PACKET(start_ov, 25, + NL_INT64( 66, T_MAY_IGNORE, start_sector) +) + +NL_PACKET(new_c_uuid, 26, + NL_BIT( 63, T_MANDATORY, clear_bm) +) + +#undef NL_PACKET +#undef NL_INTEGER +#undef NL_INT64 +#undef NL_BIT +#undef NL_STRING + diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h new file mode 100644 index 00000000000..fcdff8410e9 --- /dev/null +++ b/include/linux/drbd_tag_magic.h @@ -0,0 +1,83 @@ +#ifndef DRBD_TAG_MAGIC_H +#define DRBD_TAG_MAGIC_H + +#define TT_END 0 +#define TT_REMOVED 0xE000 + +/* declare packet_type enums */ +enum packet_types { +#define NL_PACKET(name, number, fields) P_ ## name = number, +#define NL_INTEGER(pn, pr, member) +#define NL_INT64(pn, pr, member) +#define NL_BIT(pn, pr, member) +#define NL_STRING(pn, pr, member, len) +#include "drbd_nl.h" + P_nl_after_last_packet, +}; + +/* These structs are used to deduce the size of the tag lists: */ +#define NL_PACKET(name, number, fields) \ + struct name ## _tag_len_struct { fields }; +#define NL_INTEGER(pn, pr, member) \ + int member; int tag_and_len ## member; +#define NL_INT64(pn, pr, member) \ + __u64 member; int tag_and_len ## member; +#define NL_BIT(pn, pr, member) \ + unsigned char member:1; int tag_and_len ## member; +#define NL_STRING(pn, pr, member, len) \ + unsigned char member[len]; int member ## _len; \ + int tag_and_len ## member; +#include "linux/drbd_nl.h" + +/* declare tag-list-sizes */ +static const int tag_list_sizes[] = { +#define NL_PACKET(name, number, fields) 2 fields , +#define 
NL_INTEGER(pn, pr, member) + 4 + 4 +#define NL_INT64(pn, pr, member) + 4 + 8 +#define NL_BIT(pn, pr, member) + 4 + 1 +#define NL_STRING(pn, pr, member, len) + 4 + (len) +#include "drbd_nl.h" +}; + +/* The two highest bits are used for the tag type */ +#define TT_MASK 0xC000 +#define TT_INTEGER 0x0000 +#define TT_INT64 0x4000 +#define TT_BIT 0x8000 +#define TT_STRING 0xC000 +/* The next bit indicates if processing of the tag is mandatory */ +#define T_MANDATORY 0x2000 +#define T_MAY_IGNORE 0x0000 +#define TN_MASK 0x1fff +/* The remaining 13 bits are used to enumerate the tags */ + +#define tag_type(T) ((T) & TT_MASK) +#define tag_number(T) ((T) & TN_MASK) + +/* declare tag enums */ +#define NL_PACKET(name, number, fields) fields +enum drbd_tags { +#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr , +#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr , +#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr , +#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr , +#include "drbd_nl.h" +}; + +struct tag { + const char *name; + int type_n_flags; + int max_len; +}; + +/* declare tag names */ +#define NL_PACKET(name, number, fields) fields +static const struct tag tag_descriptions[] = { +#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) }, +#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) }, +#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) }, +#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) }, +#include "drbd_nl.h" +}; + +#endif diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h new file mode 100644 index 00000000000..3a2b2d9b047 --- /dev/null +++ b/include/linux/lru_cache.h @@ -0,0 +1,294 @@ +/* + lru_cache.c + + This file is part of DRBD by Philipp Reisner and Lars Ellenberg. + + Copyright (C) 2003-2008, LINBIT Information Technologies GmbH. 
+ Copyright (C) 2003-2008, Philipp Reisner . + Copyright (C) 2003-2008, Lars Ellenberg . + + drbd is free software; you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation; either version 2, or (at your option) + any later version. + + drbd is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with drbd; see the file COPYING. If not, write to + the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. + + */ + +#ifndef LRU_CACHE_H +#define LRU_CACHE_H + +#include <linux/list.h> +#include <linux/slab.h> +#include <linux/bitops.h> +#include <linux/string.h> /* for memset */ +#include <linux/seq_file.h> + +/* +This header file (and its .c file; kernel-doc of functions see there) + define a helper framework to easily keep track of index:label associations, + and changes to an "active set" of objects, as well as pending transactions, + to persistently record those changes. + + We use an LRU policy if it is necessary to "cool down" a region currently in + the active set before we can "heat" a previously unused region. + + Because of this latter property, it is called "lru_cache". + As it actually Tracks Objects in an Active SeT, we could also call it + toast (incidentally that is what may happen to the data on the + backend storage upon next resync, if we don't get it right). + +What for? + +We replicate IO (more or less synchronously) to local and remote disk. + +For crash recovery after replication node failure, + we need to resync all regions that have been target of in-flight WRITE IO + (in use, or "hot", regions), as we don't know whether or not those WRITEs have + made it to stable storage. + + To avoid a "full resync", we need to persistently track these regions. 
+ + This is known as "write intent log", and can be implemented as on-disk + (coarse or fine grained) bitmap, or other meta data. + + To avoid the overhead of frequent extra writes to this meta data area, + usually the condition is softened to regions that _may_ have been target of + in-flight WRITE IO, e.g. by only lazily clearing the on-disk write-intent + bitmap, trading frequency of meta data transactions against amount of + (possibly unnecessary) resync traffic. + + If we set a hard limit on the area that may be "hot" at any given time, we + limit the amount of resync traffic needed for crash recovery. + +For recovery after replication link failure, + we need to resync all blocks that have been changed on the other replica + in the mean time, or, if both replicas have been changed independently [*], + all blocks that have been changed on either replica in the mean time. + [*] usually as a result of a cluster split-brain and insufficient protection, + but there are valid use cases to do this on purpose. + + Tracking those blocks can be implemented as "dirty bitmap". + Having it fine-grained reduces the amount of resync traffic. + It should also be persistent, to allow for reboots (or crashes) + while the replication link is down. + +There are various possible implementations for persistently storing +write intent log information, three of which are mentioned here. + +"Chunk dirtying" + The on-disk "dirty bitmap" may be re-used as "write-intent" bitmap as well. + To reduce the frequency of bitmap updates for write-intent log purposes, + one could dirty "chunks" (of some size) at a time of the (fine grained) + on-disk bitmap, while keeping the in-memory "dirty" bitmap as clean as + possible, flushing it to disk again when a previously "hot" (and on-disk + dirtied as full chunk) area "cools down" again (no IO in flight anymore, + and none expected in the near future either). 
+ +"Explicit (coarse) write intent bitmap" + Another implementation could choose a (probably coarse) explicit bitmap, + for write-intent log purposes, in addition to the fine grained dirty bitmap. + +"Activity log" + Yet another implementation may keep track of the hot regions, by starting + with an empty set, and writing down a journal of region numbers that have + become "hot", or have "cooled down" again. + + To be able to use a ring buffer for this journal of changes to the active + set, we not only record the actual changes to that set, but also record the + unchanged members of the set in a round robin fashion. To do so, we use a + fixed (but configurable) number of slots which we can identify by index, and + associate region numbers (labels) with these indices. + For each transaction recording a change to the active set, we record the + change itself (index: -old_label, +new_label), and which index is associated + with which label (index: current_label) within a certain sliding window that + is moved further over the available indices with each such transaction. + + Thus, for crash recovery, if the ringbuffer is sufficiently large, we can + accurately reconstruct the active set. + + Sufficiently large depends only on maximum number of active objects, and the + size of the sliding window recording "index: current_label" associations within + each transaction. + + This is what we call the "activity log". + + Currently we need one activity log transaction per single label change, which + does not give much benefit over the "dirty chunks of bitmap" approach, other + than potentially fewer seeks. + + We plan to change the transaction format to support multiple changes per + transaction, which then would reduce several (disjoint, "random") updates to + the bitmap into one transaction to the activity log ring buffer. +*/ + +/* this defines an element in a tracked set + * .colision is for hash table lookup. 
+ * When we process a new IO request, we know its sector, thus can deduce the + * region number (label) easily. To do the label -> object lookup without a + * full list walk, we use a simple hash table. + * + * .list is on one of three lists: + * in_use: currently in use (refcnt > 0, lc_number != LC_FREE) + * lru: unused but ready to be reused or recycled + * (refcnt == 0, lc_number != LC_FREE), + * free: unused but ready to be recycled + * (refcnt == 0, lc_number == LC_FREE), + * + * an element is said to be "in the active set", + * if either on "in_use" or "lru", i.e. lc_number != LC_FREE. + * + * DRBD currently (May 2009) only uses 61 elements on the resync lru_cache + * (total memory usage 2 pages), and up to 3833 elements on the act_log + * lru_cache, totalling ~215 kB for 64bit architecture, ~53 pages. + * + * We usually do not actually free these objects again, but only "recycle" + * them, as the change "index: -old_label, +LC_FREE" would need a transaction + * as well. Which also means that using a kmem_cache to allocate the objects + * from wastes some resources. + * But it avoids high order page allocations in kmalloc. 
+ */ +struct lc_element { + struct hlist_node colision; + struct list_head list; /* LRU list or free list */ + unsigned refcnt; + /* back "pointer" into ts_cache->element[index], + * for paranoia, and for "ts_element_to_index" */ + unsigned lc_index; + /* if we want to track a larger set of objects, + * it needs to become arch independend u64 */ + unsigned lc_number; + + /* special label when on free list */ +#define LC_FREE (~0U) +}; + +struct lru_cache { + /* the least recently used item is kept at lru->prev */ + struct list_head lru; + struct list_head free; + struct list_head in_use; + + /* the pre-created kmem cache to allocate the objects from */ + struct kmem_cache *lc_cache; + + /* size of tracked objects, used to memset(,0,) them in lc_reset */ + size_t element_size; + /* offset of struct lc_element member in the tracked object */ + size_t element_off; + + /* number of elements (indices) */ + unsigned int nr_elements; + /* Arbitrary limit on maximum tracked objects. Practical limit is much + * lower due to allocation failures, probably. For typical use cases, + * nr_elements should be a few thousand at most. + * This also limits the maximum value of ts_element.ts_index, allowing the + * 8 high bits of .ts_index to be overloaded with flags in the future. */ +#define LC_MAX_ACTIVE (1<<24) + + /* statistics */ + unsigned used; /* number of lelements currently on in_use list */ + unsigned long hits, misses, starving, dirty, changed; + + /* see below: flag-bits for lru_cache */ + unsigned long flags; + + /* when changing the label of an index element */ + unsigned int new_number; + + /* for paranoia when changing the label of an index element */ + struct lc_element *changing_element; + + void *lc_private; + const char *name; + + /* nr_elements there */ + struct hlist_head *lc_slot; + struct lc_element **lc_element; +}; + + +/* flag-bits for lru_cache */ +enum { + /* debugging aid, to catch concurrent access early. 
+ * user needs to guarantee exclusive access by proper locking! */ + __LC_PARANOIA, + /* if we need to change the set, but currently there is a changing + * transaction pending, we are "dirty", and must deferr further + * changing requests */ + __LC_DIRTY, + /* if we need to change the set, but currently there is no free nor + * unused element available, we are "starving", and must not give out + * further references, to guarantee that eventually some refcnt will + * drop to zero and we will be able to make progress again, changing + * the set, writing the transaction. + * if the statistics say we are frequently starving, + * nr_elements is too small. */ + __LC_STARVING, +}; +#define LC_PARANOIA (1<<__LC_PARANOIA) +#define LC_DIRTY (1<<__LC_DIRTY) +#define LC_STARVING (1<<__LC_STARVING) + +extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache, + unsigned e_count, size_t e_size, size_t e_off); +extern void lc_reset(struct lru_cache *lc); +extern void lc_destroy(struct lru_cache *lc); +extern void lc_set(struct lru_cache *lc, unsigned int enr, int index); +extern void lc_del(struct lru_cache *lc, struct lc_element *element); + +extern struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr); +extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr); +extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr); +extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); +extern void lc_changed(struct lru_cache *lc, struct lc_element *e); + +struct seq_file; +extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); + +extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, + void (*detail) (struct seq_file *, struct lc_element *)); + +/** + * lc_try_lock - can be used to stop lc_get() from changing the tracked set + * @lc: the lru cache to operate on + * + * Note that the reference counts and order on the active and lru lists may + * 
still change. Returns true if we aquired the lock. + */ +static inline int lc_try_lock(struct lru_cache *lc) +{ + return !test_and_set_bit(__LC_DIRTY, &lc->flags); +} + +/** + * lc_unlock - unlock @lc, allow lc_get() to change the set again + * @lc: the lru cache to operate on + */ +static inline void lc_unlock(struct lru_cache *lc) +{ + clear_bit(__LC_DIRTY, &lc->flags); + smp_mb__after_clear_bit(); +} + +static inline int lc_is_used(struct lru_cache *lc, unsigned int enr) +{ + struct lc_element *e = lc_find(lc, enr); + return e && e->refcnt; +} + +#define lc_entry(ptr, type, member) \ + container_of(ptr, type, member) + +extern struct lc_element *lc_element_by_index(struct lru_cache *lc, unsigned i); +extern unsigned int lc_index_of(struct lru_cache *lc, struct lc_element *e); + +#endif -- cgit v1.2.3 From 9f5180e5c331d7b3ccc35e1a78072235d38f9f34 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 6 Oct 2009 09:30:14 +0200 Subject: drbd: Work on permission enforcement Now we have the capabilities of the sending process available, use them to enforce CAP_SYS_ADMIN. Signed-off-by: Philipp Reisner Signed-off-by: Jens Axboe --- include/linux/drbd.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 69dc711f37b..233db5c18b8 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -138,6 +138,7 @@ enum drbd_ret_codes { ERR_VERIFY_RUNNING = 149, /* DRBD 8.2 only */ ERR_DATA_NOT_CURRENT = 150, ERR_CONNECTED = 151, /* DRBD 8.3 only */ + ERR_PERM = 152, /* insert new ones above this line */ AFTER_LAST_ERR_CODE -- cgit v1.2.3 From b2c18e1e08a5a9663094d57bb4be2f02226ee61c Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 23 Oct 2009 17:14:49 -0400 Subject: cfq: calculate the seek_mean per cfq_queue not per cfq_io_context async cfq_queue's are already shared between processes within the same priority, and forthcoming patches will change the mapping of cic to sync cfq_queue from 1:1 to 1:N. 
So, calculate the seekiness of a process based on the cfq_queue instead of the cfq_io_context. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 5 ----- 1 file changed, 5 deletions(-) (limited to 'include') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index 4da4a75c3f1..eb73632440f 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -40,16 +40,11 @@ struct cfq_io_context { struct io_context *ioc; unsigned long last_end_request; - sector_t last_request_pos; unsigned long ttime_total; unsigned long ttime_samples; unsigned long ttime_mean; - unsigned int seek_samples; - u64 seek_total; - sector_t seek_mean; - struct list_head queue_list; struct hlist_node cic_list; -- cgit v1.2.3 From 1af60fbd759d31f565552fea315c2033947cfbe6 Mon Sep 17 00:00:00 2001 From: Jeff Moyer Date: Fri, 2 Oct 2009 18:56:53 -0400 Subject: block: get rid of the WRITE_ODIRECT flag Hi, The WRITE_ODIRECT flag is only used in one place, and that code path happens to also call blk_run_address_space. The introduction of this flag, then, could result in the device being unplugged twice for every I/O. Further, with the batching changes in the next patch, we don't want an O_DIRECT write to imply a queue unplug. Signed-off-by: Jeff Moyer Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2620a8c6357..2f5fca4147c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -129,7 +129,6 @@ struct inodes_stat_t { * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device * immediately after submission. The write equivalent * of READ_SYNC. - * WRITE_ODIRECT Special case write for O_DIRECT only. * SWRITE_SYNC * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. * See SWRITE. 
@@ -151,7 +150,6 @@ struct inodes_stat_t { #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -#define WRITE_ODIRECT (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_UNPLUG)) #define SWRITE_SYNC_PLUG \ (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -- cgit v1.2.3 From b9d128f1088ea5245109dfc9bbceb128b6371a77 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 29 Oct 2009 13:59:26 +0100 Subject: block: move bdi/address_space unplug functions to backing-dev.h There's nothing block related about them, the backing device is used by things like NFS etc as well. This gets rid of the need to protect such calls by CONFIG_BLOCK. Signed-off-by: Jens Axboe --- include/linux/backing-dev.h | 13 +++++++++++++ include/linux/blkdev.h | 13 ------------- 2 files changed, 13 insertions(+), 13 deletions(-) (limited to 'include') diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index b449e738533..fcbc26af00e 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -331,4 +331,17 @@ static inline int bdi_sched_wait(void *word) return 0; } +static inline void blk_run_backing_dev(struct backing_dev_info *bdi, + struct page *page) +{ + if (bdi && bdi->unplug_io_fn) + bdi->unplug_io_fn(bdi, page); +} + +static inline void blk_run_address_space(struct address_space *mapping) +{ + if (mapping) + blk_run_backing_dev(mapping->backing_dev_info, NULL); +} + #endif /* _LINUX_BACKING_DEV_H */ diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 221cecd86bd..39c601f783a 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -823,19 +823,6 @@ static inline struct request_queue *bdev_get_queue(struct block_device *bdev) return bdev->bd_disk->queue; } -static inline void blk_run_backing_dev(struct backing_dev_info *bdi, - struct page 
*page) -{ - if (bdi && bdi->unplug_io_fn) - bdi->unplug_io_fn(bdi, page); -} - -static inline void blk_run_address_space(struct address_space *mapping) -{ - if (mapping) - blk_run_backing_dev(mapping->backing_dev_info, NULL); -} - /* * blk_rq_pos() : the current sector * blk_rq_bytes() : bytes left in the entire request -- cgit v1.2.3 From 4f570f995f68ef77aae7e5a441222f59232f2d0e Mon Sep 17 00:00:00 2001 From: Alberto Bertogli Date: Mon, 2 Nov 2009 11:40:16 +0100 Subject: Do not __always_inline bvec_kmap_irq() and bvec_kunmap_irq() So remove both the comment and the inline requirement, going back to the inline hint. Signed-off-by: Alberto Bertogli Signed-off-by: Jens Axboe --- include/linux/bio.h | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) (limited to 'include') diff --git a/include/linux/bio.h b/include/linux/bio.h index 5be93f18d84..474792b825d 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -450,11 +450,8 @@ extern struct biovec_slab bvec_slabs[BIOVEC_NR_POOLS] __read_mostly; /* * remember never ever reenable interrupts between a bvec_kmap_irq and * bvec_kunmap_irq! - * - * This function MUST be inlined - it plays with the CPU interrupt flags. 
*/ -static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec, - unsigned long *flags) +static inline char *bvec_kmap_irq(struct bio_vec *bvec, unsigned long *flags) { unsigned long addr; @@ -470,8 +467,7 @@ static __always_inline char *bvec_kmap_irq(struct bio_vec *bvec, return (char *) addr + bvec->bv_offset; } -static __always_inline void bvec_kunmap_irq(char *buffer, - unsigned long *flags) +static inline void bvec_kunmap_irq(char *buffer, unsigned long *flags) { unsigned long ptr = (unsigned long) buffer & PAGE_MASK; -- cgit v1.2.3 From 89e1838f5f2c2af80268a096b9a687643b0d0846 Mon Sep 17 00:00:00 2001 From: Lars Ellenberg Date: Mon, 21 Sep 2009 10:46:22 +0200 Subject: change default: by default, use socket buffer auto tuning Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd_limits.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h index 9d067ce4696..51f47a586ad 100644 --- a/include/linux/drbd_limits.h +++ b/include/linux/drbd_limits.h @@ -70,11 +70,11 @@ /* I don't think that a tcp send buffer of more than 10M is usefull */ #define DRBD_SNDBUF_SIZE_MIN 0 #define DRBD_SNDBUF_SIZE_MAX (10<<20) -#define DRBD_SNDBUF_SIZE_DEF (2*65535) +#define DRBD_SNDBUF_SIZE_DEF 0 #define DRBD_RCVBUF_SIZE_MIN 0 #define DRBD_RCVBUF_SIZE_MAX (10<<20) -#define DRBD_RCVBUF_SIZE_DEF (2*65535) +#define DRBD_RCVBUF_SIZE_DEF 0 /* @4k PageSize -> 128kB - 512MB */ #define DRBD_MAX_BUFFERS_MIN 32 -- cgit v1.2.3 From ed814525f2e45188964c270fc3a5a0b644f7e4a9 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Tue, 27 Oct 2009 12:37:14 +0100 Subject: Now it is equal to DRBD release 8.3.5 without compat crap Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 233db5c18b8..18942ad115d 
100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.3rc2" +#define REL_VERSION "8.3.5" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 91 -- cgit v1.2.3 From 86b37281411cf1e9bc0a6b5406c45edb7bd9ea5d Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Tue, 10 Nov 2009 11:50:21 +0100 Subject: block: Expose discard granularity While SSDs track block usage on a per-sector basis, RAID arrays often have allocation blocks that are bigger. Allow the discard granularity and alignment to be set and teach the topology stacking logic how to handle them. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 18 ++++++++++++++++++ include/linux/genhd.h | 1 + 2 files changed, 19 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 39c601f783a..1cc02972fbe 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -312,12 +312,15 @@ struct queue_limits { unsigned int io_min; unsigned int io_opt; unsigned int max_discard_sectors; + unsigned int discard_granularity; + unsigned int discard_alignment; unsigned short logical_block_size; unsigned short max_hw_segments; unsigned short max_phys_segments; unsigned char misaligned; + unsigned char discard_misaligned; unsigned char no_cluster; }; @@ -1121,6 +1124,21 @@ static inline int bdev_alignment_offset(struct block_device *bdev) return q->limits.alignment_offset; } +static inline int queue_discard_alignment(struct request_queue *q) +{ + if (q->limits.discard_misaligned) + return -1; + + return q->limits.discard_alignment; +} + +static inline int queue_sector_discard_alignment(struct request_queue *q, + sector_t sector) +{ + return ((sector << 9) - q->limits.discard_alignment) + & (q->limits.discard_granularity - 1); +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? 
q->dma_alignment : 511; diff --git a/include/linux/genhd.h b/include/linux/genhd.h index 297df45ffd0..c6c0c41af35 100644 --- a/include/linux/genhd.h +++ b/include/linux/genhd.h @@ -91,6 +91,7 @@ struct hd_struct { sector_t start_sect; sector_t nr_sects; sector_t alignment_offset; + unsigned int discard_alignment; struct device __dev; struct kobject *holder_dir; int policy, partno; -- cgit v1.2.3 From ad85dfe67bbf13d5fa20764e4ce801a1e6e526d8 Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 18 Nov 2009 15:52:51 +0100 Subject: DRBD: Now the code is 8.3.6 + 3 fixes (without compat crap) Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/drbd.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include') diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 18942ad115d..99a4d76694e 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -53,7 +53,7 @@ extern const char *drbd_buildtag(void); -#define REL_VERSION "8.3.5" +#define REL_VERSION "8.3.6" #define API_VERSION 88 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 91 -- cgit v1.2.3 From 35a8a3fdcd4f973a5430e868f2f2a5c363803a5b Mon Sep 17 00:00:00 2001 From: Philipp Reisner Date: Wed, 25 Nov 2009 17:50:00 +0100 Subject: drbd: moved CN_IDX_DRBD and CN_VAL_DRBD to the right file Signed-off-by: Philipp Reisner Signed-off-by: Lars Ellenberg --- include/linux/connector.h | 2 ++ include/linux/drbd.h | 7 ------- 2 files changed, 2 insertions(+), 7 deletions(-) (limited to 'include') diff --git a/include/linux/connector.h b/include/linux/connector.h index 3a14615fd35..72ba63eb83c 100644 --- a/include/linux/connector.h +++ b/include/linux/connector.h @@ -43,6 +43,8 @@ #define CN_DST_VAL 0x1 #define CN_IDX_DM 0x7 /* Device Mapper */ #define CN_VAL_DM_USERSPACE_LOG 0x1 +#define CN_IDX_DRBD 0x8 +#define CN_VAL_DRBD 0x1 #define CN_NETLINK_USERS 8 diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 99a4d76694e..e84f4733cb5 100644 --- 
a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -322,13 +322,6 @@ enum drbd_timeout_flag { #define DRBD_NL_CREATE_DEVICE 0x01 #define DRBD_NL_SET_DEFAULTS 0x02 -/* The following line should be moved over to linux/connector.h - * when the time comes */ -#ifndef CN_IDX_DRBD -# define CN_IDX_DRBD 0x4 -/* Ubuntu "intrepid ibex" release defined CN_IDX_DRBD as 0x6 */ -#endif -#define CN_VAL_DRBD 0x1 /* For searching a vacant cn_idx value */ #define CN_IDX_STEP 6977 -- cgit v1.2.3 From 2d4dc890b5c8fabd818a8586607e6843c4375e62 Mon Sep 17 00:00:00 2001 From: Ilya Loginov Date: Thu, 26 Nov 2009 09:16:19 +0100 Subject: block: add helpers to run flush_dcache_page() against a bio and a request's pages Mtdblock driver doesn't call flush_dcache_page for pages in request. So, this causes problems on architectures where the icache doesn't fill from the dcache or with dcache aliases. The patch fixes this. The ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE symbol was introduced to avoid pointless empty cache-thrashing loops on architectures for which flush_dcache_page() is a no-op. Every architecture was provided with this flush pages on architectires where ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE is equal 1 or do nothing otherwise. See "fix mtd_blkdevs problem with caches on some architectures" discussion on LKML for more information. Signed-off-by: Ilya Loginov Cc: Ingo Molnar Cc: David Woodhouse Cc: Peter Horton Cc: "Ed L. 
Cashin" Signed-off-by: Jens Axboe --- include/asm-generic/cacheflush.h | 1 + include/linux/bio.h | 12 ++++++++++++ include/linux/blkdev.h | 11 +++++++++++ 3 files changed, 24 insertions(+) (limited to 'include') diff --git a/include/asm-generic/cacheflush.h b/include/asm-generic/cacheflush.h index ba4ec39a113..57b5c3c82e8 100644 --- a/include/asm-generic/cacheflush.h +++ b/include/asm-generic/cacheflush.h @@ -13,6 +13,7 @@ #define flush_cache_dup_mm(mm) do { } while (0) #define flush_cache_range(vma, start, end) do { } while (0) #define flush_cache_page(vma, vmaddr, pfn) do { } while (0) +#define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE 0 #define flush_dcache_page(page) do { } while (0) #define flush_dcache_mmap_lock(mapping) do { } while (0) #define flush_dcache_mmap_unlock(mapping) do { } while (0) diff --git a/include/linux/bio.h b/include/linux/bio.h index 474792b825d..7fc5606e6ea 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -391,6 +391,18 @@ extern struct bio *bio_copy_kern(struct request_queue *, void *, unsigned int, gfp_t, int); extern void bio_set_pages_dirty(struct bio *bio); extern void bio_check_pages_dirty(struct bio *bio); + +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +# error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" +#endif +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +extern void bio_flush_dcache_pages(struct bio *bi); +#else +static inline void bio_flush_dcache_pages(struct bio *bi) +{ +} +#endif + extern struct bio *bio_copy_user(struct request_queue *, struct rq_map_data *, unsigned long, unsigned int, int, gfp_t); extern struct bio *bio_copy_user_iov(struct request_queue *, diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 1cc02972fbe..e727f6c44c4 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -752,6 +752,17 @@ struct req_iterator { #define rq_iter_last(rq, _iter) \ (_iter.bio->bi_next == NULL && _iter.i == _iter.bio->bi_vcnt-1) +#ifndef ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +# 
error "You should define ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE for your platform" +#endif +#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE +extern void rq_flush_dcache_pages(struct request *rq); +#else +static inline void rq_flush_dcache_pages(struct request *rq) +{ +} +#endif + extern int blk_register_queue(struct gendisk *disk); extern void blk_unregister_queue(struct gendisk *disk); extern void register_disk(struct gendisk *dev); -- cgit v1.2.3 From d9449ce35a1e8fb58dd2d419f9215562a14ecca0 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 26 Nov 2009 09:45:40 +0100 Subject: Fix regression in direct writes performance due to WRITE_ODIRECT flag removal There seems to be a regression in direct write path due to following commit in for-2.6.33 branch of block tree. commit 1af60fbd759d31f565552fea315c2033947cfbe6 Author: Jeff Moyer Date: Fri Oct 2 18:56:53 2009 -0400 block: get rid of the WRITE_ODIRECT flag Marking direct writes as WRITE_SYNC_PLUG instead of WRITE_ODIRECT, sets the NOIDLE flag in bio and hence in request. This tells CFQ to not expect more request from the queue and not idle on it (despite the fact that queue's think time is less and it is not seeky). So direct writers lose big time when competing with sequential readers. Using fio, I have run one direct writer and two sequential readers and following are the results with 2.6.32-rc7 kernel and with for-2.6.33 branch. Test ==== 1 direct writer and 2 sequential reader running simultaneously. [global] directory=/mnt/sdc/fio/ runtime=10 [seqwrite] rw=write size=4G direct=1 [seqread] rw=read size=2G numjobs=2 2.6.32-rc7 ========== direct writes: aggrb=2,968KB/s readers : aggrb=101MB/s for-2.6.33 branch ================= direct write: aggrb=19KB/s readers aggrb=137MB/s This patch brings back the WRITE_ODIRECT flag, with the difference that we don't set the BIO_RW_UNPLUG flag so that device is not unplugged after submission of request and an explicit unplug from submitter is required. 
That way we fix the jeff's issue of not enough merging taking place in aio path as well as make sure direct writes get their fair share. After the fix ============= for-2.6.33 + fix ---------------- direct writes: aggrb=2,728KB/s reads: aggrb=103MB/s Thanks Vivek Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/fs.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include') diff --git a/include/linux/fs.h b/include/linux/fs.h index 2f5fca4147c..79cea805173 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -129,6 +129,7 @@ struct inodes_stat_t { * WRITE_SYNC Like WRITE_SYNC_PLUG, but also unplugs the device * immediately after submission. The write equivalent * of READ_SYNC. + * WRITE_ODIRECT_PLUG Special case write for O_DIRECT only. * SWRITE_SYNC * SWRITE_SYNC_PLUG Like WRITE_SYNC/WRITE_SYNC_PLUG, but locks the buffer. * See SWRITE. @@ -150,6 +151,7 @@ struct inodes_stat_t { #define READ_META (READ | (1 << BIO_RW_META)) #define WRITE_SYNC_PLUG (WRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define WRITE_SYNC (WRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) +#define WRITE_ODIRECT_PLUG (WRITE | (1 << BIO_RW_SYNCIO)) #define SWRITE_SYNC_PLUG \ (SWRITE | (1 << BIO_RW_SYNCIO) | (1 << BIO_RW_NOIDLE)) #define SWRITE_SYNC (SWRITE_SYNC_PLUG | (1 << BIO_RW_UNPLUG)) -- cgit v1.2.3 From 98262f2762f0067375f83824d81ea929e37e6bfe Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Thu, 3 Dec 2009 09:24:48 +0100 Subject: block: Allow devices to indicate whether discarded blocks are zeroed The discard ioctl is used by mkfs utilities to clear a block device prior to putting metadata down. However, not all devices return zeroed blocks after a discard. Some drives return stale data, potentially containing old superblocks. It is therefore important to know whether discarded blocks are properly zeroed. Both ATA and SCSI drives have configuration bits that indicate whether zeroes are returned after a discard operation. 
Implement a block level interface that allows this information to be bubbled up the stack and queried via a new block device ioctl. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- include/linux/blkdev.h | 14 ++++++++++++++ include/linux/fs.h | 1 + 2 files changed, 15 insertions(+) (limited to 'include') diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index e727f6c44c4..784a919aa0d 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -322,6 +322,7 @@ struct queue_limits { unsigned char misaligned; unsigned char discard_misaligned; unsigned char no_cluster; + signed char discard_zeroes_data; }; struct request_queue @@ -1150,6 +1151,19 @@ static inline int queue_sector_discard_alignment(struct request_queue *q, & (q->limits.discard_granularity - 1); } +static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) +{ + if (q->limits.discard_zeroes_data == 1) + return 1; + + return 0; +} + +static inline unsigned int bdev_discard_zeroes_data(struct block_device *bdev) +{ + return queue_discard_zeroes_data(bdev_get_queue(bdev)); +} + static inline int queue_dma_alignment(struct request_queue *q) { return q ? q->dma_alignment : 511; diff --git a/include/linux/fs.h b/include/linux/fs.h index 79cea805173..891f7d642e5 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -304,6 +304,7 @@ struct inodes_stat_t { #define BLKIOOPT _IO(0x12,121) #define BLKALIGNOFF _IO(0x12,122) #define BLKPBSZGET _IO(0x12,123) +#define BLKDISCARDZEROES _IO(0x12,124) #define BMAP_IOCTL 1 /* obsolete - kept for compatibility */ #define FIBMAP _IO(0x00,1) /* bmap access */ -- cgit v1.2.3 From b17621fed6aa039387e35f9b4d34d98f213e5673 Mon Sep 17 00:00:00 2001 From: Wu Fengguang Date: Thu, 3 Dec 2009 13:54:25 +0100 Subject: writeback: introduce wbc.for_background It will lower the flush priority for NFS, and maybe more in future. 
Signed-off-by: Wu Fengguang Cc: Trond Myklebust Cc: Jens Axboe Cc: Christoph Hellwig Signed-off-by: Jens Axboe --- include/linux/writeback.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include') diff --git a/include/linux/writeback.h b/include/linux/writeback.h index 66ebddcff66..705f01fe413 100644 --- a/include/linux/writeback.h +++ b/include/linux/writeback.h @@ -49,6 +49,7 @@ struct writeback_control { unsigned nonblocking:1; /* Don't get stuck on request queues */ unsigned encountered_congestion:1; /* An output: a queue is full */ unsigned for_kupdate:1; /* A kupdate writeback */ + unsigned for_background:1; /* A background writeback */ unsigned for_reclaim:1; /* Invoked from the page allocator */ unsigned range_cyclic:1; /* range_start is cyclic */ unsigned more_io:1; /* more io to be dispatched */ -- cgit v1.2.3 From 31e4c28d95e64f2d5d3c497a3ecf37c62de635b4 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Thu, 3 Dec 2009 12:59:42 -0500 Subject: blkio: Introduce blkio controller cgroup interface o This is basic implementation of blkio controller cgroup interface. This is the common interface visible to user space and should be used by different IO control policies as we implement those. 
Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- include/linux/cgroup_subsys.h | 6 ++++++ include/linux/iocontext.h | 4 ++++ 2 files changed, 10 insertions(+) (limited to 'include') diff --git a/include/linux/cgroup_subsys.h b/include/linux/cgroup_subsys.h index 9c8d31bacf4..ccefff02b6c 100644 --- a/include/linux/cgroup_subsys.h +++ b/include/linux/cgroup_subsys.h @@ -60,3 +60,9 @@ SUBSYS(net_cls) #endif /* */ + +#ifdef CONFIG_BLK_CGROUP +SUBSYS(blkio) +#endif + +/* */ diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index eb73632440f..d61b0b8b5cd 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -68,6 +68,10 @@ struct io_context { unsigned short ioprio; unsigned short ioprio_changed; +#ifdef CONFIG_BLK_CGROUP + unsigned short cgroup_changed; +#endif + /* * For request batching */ -- cgit v1.2.3 From b69f2292063d2caf37ca9aec7d63ded203701bf3 Mon Sep 17 00:00:00 2001 From: Louis Rilling Date: Fri, 4 Dec 2009 14:52:42 +0100 Subject: block: Fix io_context leak after failure of clone with CLONE_IO With CLONE_IO, the parent's io_context->nr_tasks is incremented, but it is never decremented if copy_process() fails afterwards, which prevents exit_io_context() from calling the IO schedulers' exit functions. Give a task_struct to exit_io_context(), and call exit_io_context() instead of put_io_context() in the copy_process() cleanup path. 
Signed-off-by: Louis Rilling Signed-off-by: Jens Axboe --- include/linux/iocontext.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'include') diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index d61b0b8b5cd..a6323599630 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -98,14 +98,15 @@ static inline struct io_context *ioc_task_link(struct io_context *ioc) return NULL; } +struct task_struct; #ifdef CONFIG_BLOCK int put_io_context(struct io_context *ioc); -void exit_io_context(void); +void exit_io_context(struct task_struct *task); struct io_context *get_io_context(gfp_t gfp_flags, int node); struct io_context *alloc_io_context(gfp_t gfp_flags, int node); void copy_io_context(struct io_context **pdst, struct io_context **psrc); #else -static inline void exit_io_context(void) +static inline void exit_io_context(struct task_struct *task) { } -- cgit v1.2.3