From 8c540a96c175bdf55bda8707db04cec78b816454 Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 5 Aug 2008 18:05:46 +0100 Subject: Let the block device know when sectors can be discarded [hirofumi@mail.parknet.co.jp: discard _after_ checking for corrupt chains] Signed-off-by: David Woodhouse Acked-by: OGAWA Hirofumi Signed-off-by: Jens Axboe --- fs/fat/fatent.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) (limited to 'fs') diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index 302e95c4af7..fb98b3d847e 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -6,6 +6,7 @@ #include #include #include +#include struct fatent_operations { void (*ent_blocknr)(struct super_block *, int, int *, sector_t *); @@ -535,6 +536,7 @@ int fat_free_clusters(struct inode *inode, int cluster) struct fat_entry fatent; struct buffer_head *bhs[MAX_BUF_PER_PAGE]; int i, err, nr_bhs; + int first_cl = cluster; nr_bhs = 0; fatent_init(&fatent); @@ -551,6 +553,18 @@ int fat_free_clusters(struct inode *inode, int cluster) goto error; } + /* + * Issue discard for the sectors we no longer care about, + * batching contiguous clusters into one request + */ + if (cluster != fatent.entry + 1) { + int nr_clus = fatent.entry - first_cl + 1; + + sb_issue_discard(sb, fat_clus_to_blknr(sbi, first_cl), + nr_clus * sbi->sec_per_clus); + first_cl = cluster; + } + ops->ent_put(&fatent, FAT_ENT_FREE); if (sbi->free_clusters != -1) { sbi->free_clusters++; -- cgit v1.2.3 From b8b3e16cfe6435d961f6aaebcfd52a1ff2a988c5 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 15 Aug 2008 10:15:19 +0200 Subject: block: drop virtual merging accounting Remove virtual merge accounting. 
Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- fs/bio.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 3cba7ae34d7..4ac7c59d1c6 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -350,8 +350,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page */ while (bio->bi_phys_segments >= q->max_phys_segments - || bio->bi_hw_segments >= q->max_hw_segments - || BIOVEC_VIRT_OVERSIZE(bio->bi_size)) { + || bio->bi_hw_segments >= q->max_hw_segments) { if (retried_segments) return 0; @@ -395,8 +394,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page } /* If we may be able to merge these biovecs, force a recount */ - if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec) || - BIOVEC_VIRT_MERGEABLE(bvec-1, bvec))) + if (bio->bi_vcnt && (BIOVEC_PHYS_MERGEABLE(bvec-1, bvec))) bio->bi_flags &= ~(1 << BIO_SEG_VALID); bio->bi_vcnt++; -- cgit v1.2.3 From 5df97b91b5d7ed426034fcc84cb6e7cf682b8838 Mon Sep 17 00:00:00 2001 From: Mikulas Patocka Date: Fri, 15 Aug 2008 10:20:02 +0200 Subject: drop vmerge accounting Remove hw_segments field from struct bio and struct request. Without virtual merge accounting they have no purpose. 
Signed-off-by: Mikulas Patocka Signed-off-by: Jens Axboe --- fs/bio.c | 12 +----------- 1 file changed, 1 insertion(+), 11 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 4ac7c59d1c6..bee4deca774 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -208,14 +208,6 @@ inline int bio_phys_segments(struct request_queue *q, struct bio *bio) return bio->bi_phys_segments; } -inline int bio_hw_segments(struct request_queue *q, struct bio *bio) -{ - if (unlikely(!bio_flagged(bio, BIO_SEG_VALID))) - blk_recount_segments(q, bio); - - return bio->bi_hw_segments; -} - /** * __bio_clone - clone a bio * @bio: destination bio @@ -350,7 +342,7 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page */ while (bio->bi_phys_segments >= q->max_phys_segments - || bio->bi_hw_segments >= q->max_hw_segments) { + || bio->bi_phys_segments >= q->max_hw_segments) { if (retried_segments) return 0; @@ -399,7 +391,6 @@ static int __bio_add_page(struct request_queue *q, struct bio *bio, struct page bio->bi_vcnt++; bio->bi_phys_segments++; - bio->bi_hw_segments++; done: bio->bi_size += len; return len; @@ -1381,7 +1372,6 @@ EXPORT_SYMBOL(bio_init); EXPORT_SYMBOL(__bio_clone); EXPORT_SYMBOL(bio_clone); EXPORT_SYMBOL(bio_phys_segments); -EXPORT_SYMBOL(bio_hw_segments); EXPORT_SYMBOL(bio_add_page); EXPORT_SYMBOL(bio_add_pc_page); EXPORT_SYMBOL(bio_get_nr_vecs); -- cgit v1.2.3 From ec2cdedf798385a9397ac50dd0405dd658f8529c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:30:15 +0900 Subject: block: allow deleting zero length partition delete_partition() was noop for zero length partition. As the addition code allows creating zero length partition and deletion is assumed to always succeed, this causes memory leak for zero length partitions. Allow zero length partitions to end their meaningless lives. While at it, allow deleting zero length partition via BLKPG_DEL_PARTITION ioctl too.
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.c | 2 -- 1 file changed, 2 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index ecc3330972e..68f3e41ae66 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -325,8 +325,6 @@ void delete_partition(struct gendisk *disk, int part) if (!p) return; - if (!p->nr_sects) - return; disk->part[part-1] = NULL; p->start_sect = 0; p->nr_sects = 0; -- cgit v1.2.3 From 88e341261ca4d39eec21b212961c77eff51105f7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:30:16 +0900 Subject: block: update add_partition() error handling d805dda4 tried to fix error case handling in add_partition() but had a few problems. * disk->part[] entry is set early and left dangling if operation fails. * Once device initialized, the last put_device() is responsible for freeing all the resources. The failure path freed part_stats and p regardless of put_device() causing double free. * holders subdir holds reference to the disk device, so failure path should remove it to release resources properly which was missing. This patch fixes the above problems and while at it move partition slot busy check into add_partition() for completeness and inlines holders subdirectory creation. Using separate function for it just obfuscates the code. 
Signed-off-by: Tejun Heo Cc: Abdel Benamrouche Signed-off-by: Jens Axboe --- fs/partitions/check.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 68f3e41ae66..16f98d82460 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -300,15 +300,6 @@ struct device_type part_type = { .release = part_release, }; -static inline void partition_sysfs_add_subdir(struct hd_struct *p) -{ - struct kobject *k; - - k = kobject_get(&p->dev.kobj); - p->holder_dir = kobject_create_and_add("holders", k); - kobject_put(k); -} - static inline void disk_sysfs_add_subdirs(struct gendisk *disk) { struct kobject *k; @@ -347,13 +338,16 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, struct hd_struct *p; int err; + if (disk->part[part - 1]) + return -EBUSY; + p = kzalloc(sizeof(*p), GFP_KERNEL); if (!p) return -ENOMEM; if (!init_part_stats(p)) { err = -ENOMEM; - goto out0; + goto out_free; } p->start_sect = start; p->nr_sects = len; @@ -372,34 +366,42 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, p->dev.class = &block_class; p->dev.type = &part_type; p->dev.parent = &disk->dev; - disk->part[part-1] = p; /* delay uevent until 'holders' subdir is created */ p->dev.uevent_suppress = 1; err = device_add(&p->dev); if (err) - goto out1; - partition_sysfs_add_subdir(p); + goto out_put; + + err = -ENOMEM; + p->holder_dir = kobject_create_and_add("holders", &p->dev.kobj); + if (!p->holder_dir) + goto out_del; + p->dev.uevent_suppress = 0; if (flags & ADDPART_FLAG_WHOLEDISK) { err = device_create_file(&p->dev, &dev_attr_whole_disk); if (err) - goto out2; + goto out_del; } + /* everything is up and running, commence */ + disk->part[part - 1] = p; + /* suppress uevent if the disk supresses it */ if (!disk->dev.uevent_suppress) kobject_uevent(&p->dev.kobj, KOBJ_ADD); return 0; -out2: +out_free: + 
kfree(p); + return err; +out_del: + kobject_put(p->holder_dir); device_del(&p->dev); -out1: +out_put: put_device(&p->dev); - free_part_stats(p); -out0: - kfree(p); return err; } -- cgit v1.2.3 From cf771cb5a7b716f3f9e532fd42a1e3a0a75adec5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:01:09 +0200 Subject: block: make variable and argument names more consistent In hd_struct, @partno is used to denote partition number and a number of other places use @part to denote hd_struct. Functions use @part and @index instead. This causes confusion and makes it difficult to use consistent variable names for hd_struct. Always use @partno if a variable represents partition number. Also, print out functions use @f or @part for seq_file argument. Use @seqf uniformly instead. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 8 ++++---- fs/partitions/check.c | 33 +++++++++++++++++---------------- 2 files changed, 21 insertions(+), 20 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index aff54219e04..de0776cd721 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -930,7 +930,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) struct module *owner = NULL; struct gendisk *disk; int ret; - int part; + int partno; int perm = 0; if (file->f_mode & FMODE_READ) @@ -949,7 +949,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) ret = -ENXIO; file->f_mapping = bdev->bd_inode->i_mapping; lock_kernel(); - disk = get_gendisk(bdev->bd_dev, &part); + disk = get_gendisk(bdev->bd_dev, &partno); if (!disk) { unlock_kernel(); bdput(bdev); @@ -961,7 +961,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) if (!bdev->bd_openers) { bdev->bd_disk = disk; bdev->bd_contains = bdev; - if (!part) { + if (!partno) { struct backing_dev_info *bdi; if (disk->fops->open) { ret = disk->fops->open(bdev->bd_inode, file); @@ -989,7 +989,7 @@ static int 
do_open(struct block_device *bdev, struct file *file, int for_part) if (ret) goto out_first; bdev->bd_contains = whole; - p = disk->part[part - 1]; + p = disk->part[partno - 1]; bdev->bd_inode->i_data.backing_dev_info = whole->bd_inode->i_data.backing_dev_info; if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 16f98d82460..b86aab1b0df 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -120,22 +120,22 @@ static int (*check_part[])(struct parsed_partitions *, struct block_device *) = * a pointer to that same buffer (for convenience). */ -char *disk_name(struct gendisk *hd, int part, char *buf) +char *disk_name(struct gendisk *hd, int partno, char *buf) { - if (!part) + if (!partno) snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, part); + snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, part); + snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); return buf; } const char *bdevname(struct block_device *bdev, char *buf) { - int part = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor; - return disk_name(bdev->bd_disk, part, buf); + int partno = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor; + return disk_name(bdev->bd_disk, partno, buf); } EXPORT_SYMBOL(bdevname); @@ -310,13 +310,13 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk) kobject_put(k); } -void delete_partition(struct gendisk *disk, int part) +void delete_partition(struct gendisk *disk, int partno) { - struct hd_struct *p = disk->part[part-1]; + struct hd_struct *p = disk->part[partno - 1]; if (!p) return; - disk->part[part-1] = NULL; + disk->part[partno - 1] = NULL; p->start_sect = 0; p->nr_sects = 0; part_stat_set_all(p, 0); @@ -333,12 +333,13 @@ static ssize_t whole_disk_show(struct device *dev, static 
DEVICE_ATTR(whole_disk, S_IRUSR | S_IRGRP | S_IROTH, whole_disk_show, NULL); -int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, int flags) +int add_partition(struct gendisk *disk, int partno, + sector_t start, sector_t len, int flags) { struct hd_struct *p; int err; - if (disk->part[part - 1]) + if (disk->part[partno - 1]) return -EBUSY; p = kzalloc(sizeof(*p), GFP_KERNEL); @@ -351,18 +352,18 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, } p->start_sect = start; p->nr_sects = len; - p->partno = part; + p->partno = partno; p->policy = disk->policy; if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1])) snprintf(p->dev.bus_id, BUS_ID_SIZE, - "%sp%d", disk->dev.bus_id, part); + "%sp%d", disk->dev.bus_id, partno); else snprintf(p->dev.bus_id, BUS_ID_SIZE, - "%s%d", disk->dev.bus_id, part); + "%s%d", disk->dev.bus_id, partno); device_initialize(&p->dev); - p->dev.devt = MKDEV(disk->major, disk->first_minor + part); + p->dev.devt = MKDEV(disk->major, disk->first_minor + partno); p->dev.class = &block_class; p->dev.type = &part_type; p->dev.parent = &disk->dev; @@ -386,7 +387,7 @@ int add_partition(struct gendisk *disk, int part, sector_t start, sector_t len, } /* everything is up and running, commence */ - disk->part[part - 1] = p; + disk->part[partno - 1] = p; /* suppress uevent if the disk supresses it */ if (!disk->dev.uevent_suppress) -- cgit v1.2.3 From f331c0296f2a9fee0d396a70598b954062603015 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:01:48 +0200 Subject: block: don't depend on consecutive minor space * Implement disk_devt() and part_devt() and use them to directly access devt instead of computing it from ->major and ->first_minor. 
Note that all references to ->major and ->first_minor outside of block layer are used to determine devt of the disk (the part0) and as ->major and ->first_minor will continue to represent devt for the disk, converting these users isn't strictly necessary. However, convert them for consistency. * Implement disk_max_parts() to avoid directly dereferencing genhd->minors. * Update bdget_disk() such that it doesn't assume consecutive minor space. * Move devt computation from register_disk() to add_disk() and make it the only one (all other usages use the initially determined value). These changes clean up the code and will help disk->part dereference fix and extended block device numbers. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 +- fs/partitions/check.c | 19 +++++++++++-------- 2 files changed, 12 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index de0776cd721..72e0a2887cb 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -892,7 +892,7 @@ int check_disk_change(struct block_device *bdev) if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); - if (bdev->bd_disk->minors > 1) + if (disk_max_parts(bdev->bd_disk)) bdev->bd_invalidated = 1; return 1; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index b86aab1b0df..e77fa144a07 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -134,7 +134,11 @@ char *disk_name(struct gendisk *hd, int partno, char *buf) const char *bdevname(struct block_device *bdev, char *buf) { - int partno = MINOR(bdev->bd_dev) - bdev->bd_disk->first_minor; + int partno = 0; + + if (bdev->bd_part) + partno = bdev->bd_part->partno; + return disk_name(bdev->bd_disk, partno, buf); } @@ -169,7 +173,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); - state->limit = hd->minors; + state->limit = disk_max_parts(hd) + 1; i = res = err = 0; while (!res && 
check_part[i]) { memset(&state->parts, 0, sizeof(state->parts)); @@ -416,7 +420,6 @@ void register_disk(struct gendisk *disk) int err; disk->dev.parent = disk->driverfs_dev; - disk->dev.devt = MKDEV(disk->major, disk->first_minor); strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE); /* ewww... some of these buggers have / in the name... */ @@ -440,7 +443,7 @@ void register_disk(struct gendisk *disk) disk_sysfs_add_subdirs(disk); /* No minors to use for partitions */ - if (disk->minors == 1) + if (!disk_max_parts(disk)) goto exit; /* No such device (e.g., media were just removed) */ @@ -463,8 +466,8 @@ exit: kobject_uevent(&disk->dev.kobj, KOBJ_ADD); /* announce possible partitions */ - for (i = 1; i < disk->minors; i++) { - p = disk->part[i-1]; + for (i = 0; i < disk_max_parts(disk); i++) { + p = disk->part[i]; if (!p || !p->nr_sects) continue; kobject_uevent(&p->dev.kobj, KOBJ_ADD); @@ -482,7 +485,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) if (res) return res; bdev->bd_invalidated = 0; - for (p = 1; p < disk->minors; p++) + for (p = 1; p <= disk_max_parts(disk); p++) delete_partition(disk, p); if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); @@ -545,7 +548,7 @@ void del_gendisk(struct gendisk *disk) int p; /* invalidate stuff */ - for (p = disk->minors - 1; p > 0; p--) { + for (p = disk_max_parts(disk); p > 0; p--) { invalidate_partition(disk, p); delete_partition(disk, p); } -- cgit v1.2.3 From e71bf0d0ee89e51b92776391c5634938236977d5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:03:02 +0200 Subject: block: fix disk->part[] dereferencing race disk->part[] is protected by its matching bdev's lock. However, non-critical accesses like collecting stats and printing out sysfs and proc information used to be performed without any locking. As partitions can come and go dynamically, partitions can go away underneath those non-critical accesses. 
As some of those accesses are writes, this theoretically can lead to silent corruption. This patch fixes the race by using RCU for the partition array and dev reference counter to hold partitions. * Rename disk->part[] to disk->__part[] to make sure no one outside genhd layer proper accesses it directly. * Use RCU for disk->__part[] dereferencing. * Implement disk_{get|put}_part() which can be used to get and put partitions from gendisk respectively. * Iterators are implemented to help iterate through all partitions safely. * Functions which require RCU readlock are marked with _rcu suffix. * Use disk_put_part() in __blkdev_put() instead of directly putting the contained kobject. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 15 +++++------ fs/partitions/check.c | 70 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 53 insertions(+), 32 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 72e0a2887cb..2f2873b9a04 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -929,6 +929,7 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) { struct module *owner = NULL; struct gendisk *disk; + struct hd_struct *part = NULL; int ret; int partno; int perm = 0; @@ -978,7 +979,6 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) if (bdev->bd_invalidated) rescan_partitions(disk, bdev); } else { - struct hd_struct *p; struct block_device *whole; whole = bdget_disk(disk, 0); ret = -ENOMEM; @@ -989,16 +989,16 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) if (ret) goto out_first; bdev->bd_contains = whole; - p = disk->part[partno - 1]; + part = disk_get_part(disk, partno); bdev->bd_inode->i_data.backing_dev_info = whole->bd_inode->i_data.backing_dev_info; - if (!(disk->flags & GENHD_FL_UP) || !p || !p->nr_sects) { + if (!(disk->flags & GENHD_FL_UP) || + !part || !part->nr_sects) { ret = -ENXIO; goto out_first; } - 
kobject_get(&p->dev.kobj); - bdev->bd_part = p; - bd_set_size(bdev, (loff_t) p->nr_sects << 9); + bdev->bd_part = part; + bd_set_size(bdev, (loff_t)part->nr_sects << 9); } } else { put_disk(disk); @@ -1027,6 +1027,7 @@ out_first: __blkdev_put(bdev->bd_contains, 1); bdev->bd_contains = NULL; put_disk(disk); + disk_put_part(part); module_put(owner); out: mutex_unlock(&bdev->bd_mutex); @@ -1119,7 +1120,7 @@ static int __blkdev_put(struct block_device *bdev, int for_part) module_put(owner); if (bdev->bd_contains != bdev) { - kobject_put(&bdev->bd_part->dev.kobj); + disk_put_part(bdev->bd_part); bdev->bd_part = NULL; } bdev->bd_disk = NULL; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index e77fa144a07..96c8bf41e45 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -314,19 +314,29 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk) kobject_put(k); } +static void delete_partition_rcu_cb(struct rcu_head *head) +{ + struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); + + part->start_sect = 0; + part->nr_sects = 0; + part_stat_set_all(part, 0); + put_device(&part->dev); +} + void delete_partition(struct gendisk *disk, int partno) { - struct hd_struct *p = disk->part[partno - 1]; + struct hd_struct *part; - if (!p) + part = disk->__part[partno-1]; + if (!part) return; - disk->part[partno - 1] = NULL; - p->start_sect = 0; - p->nr_sects = 0; - part_stat_set_all(p, 0); - kobject_put(p->holder_dir); - device_del(&p->dev); - put_device(&p->dev); + + rcu_assign_pointer(disk->__part[partno-1], NULL); + kobject_put(part->holder_dir); + device_del(&part->dev); + + call_rcu(&part->rcu_head, delete_partition_rcu_cb); } static ssize_t whole_disk_show(struct device *dev, @@ -343,7 +353,7 @@ int add_partition(struct gendisk *disk, int partno, struct hd_struct *p; int err; - if (disk->part[partno - 1]) + if (disk->__part[partno - 1]) return -EBUSY; p = kzalloc(sizeof(*p), GFP_KERNEL); @@ -391,7 +401,8 @@ int 
add_partition(struct gendisk *disk, int partno, } /* everything is up and running, commence */ - disk->part[partno - 1] = p; + INIT_RCU_HEAD(&p->rcu_head); + rcu_assign_pointer(disk->__part[partno - 1], p); /* suppress uevent if the disk supresses it */ if (!disk->dev.uevent_suppress) @@ -414,9 +425,9 @@ out_put: void register_disk(struct gendisk *disk) { struct block_device *bdev; + struct disk_part_iter piter; + struct hd_struct *part; char *s; - int i; - struct hd_struct *p; int err; disk->dev.parent = disk->driverfs_dev; @@ -466,16 +477,16 @@ exit: kobject_uevent(&disk->dev.kobj, KOBJ_ADD); /* announce possible partitions */ - for (i = 0; i < disk_max_parts(disk); i++) { - p = disk->part[i]; - if (!p || !p->nr_sects) - continue; - kobject_uevent(&p->dev.kobj, KOBJ_ADD); - } + disk_part_iter_init(&piter, disk, 0); + while ((part = disk_part_iter_next(&piter))) + kobject_uevent(&part->dev.kobj, KOBJ_ADD); + disk_part_iter_exit(&piter); } int rescan_partitions(struct gendisk *disk, struct block_device *bdev) { + struct disk_part_iter piter; + struct hd_struct *part; struct parsed_partitions *state; int p, res; @@ -485,8 +496,12 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) if (res) return res; bdev->bd_invalidated = 0; - for (p = 1; p <= disk_max_parts(disk); p++) - delete_partition(disk, p); + + disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); + while ((part = disk_part_iter_next(&piter))) + delete_partition(disk, part->partno); + disk_part_iter_exit(&piter); + if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) @@ -545,13 +560,18 @@ EXPORT_SYMBOL(read_dev_sector); void del_gendisk(struct gendisk *disk) { - int p; + struct disk_part_iter piter; + struct hd_struct *part; /* invalidate stuff */ - for (p = disk_max_parts(disk); p > 0; p--) { - invalidate_partition(disk, p); - delete_partition(disk, p); + disk_part_iter_init(&piter, disk, + 
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); + while ((part = disk_part_iter_next(&piter))) { + invalidate_partition(disk, part->partno); + delete_partition(disk, part->partno); } + disk_part_iter_exit(&piter); + invalidate_partition(disk, 0); disk->capacity = 0; disk->flags &= ~GENHD_FL_UP; -- cgit v1.2.3 From c9959059161ddd7bf4670cf47367033d6b2f79c4 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:21 +0900 Subject: block: fix diskstats access There are two variants of stat functions - ones prefixed with double underbars which don't care about preemption and ones without which disable preemption before manipulating per-cpu counters. It's unclear whether the underbarred ones assume that preemption is disabled on entry as some callers don't do that. This patch unifies diskstats access by implementing disk_stat_lock() and disk_stat_unlock() which take care of both RCU (for partition access) and preemption (for per-cpu counter access). diskstats access should always be enclosed between the two functions. As such, there's no need for the versions which disable preemption. They're removed and double underbars ones are renamed to drop the underbars. As an extra argument is added, there's no danger of using the old version unconverted. disk_stat_lock() uses get_cpu() and returns the cpu index and all diskstat functions which access per-cpu counters now have a @cpu argument to help RT. This change adds RCU or preemption operations at some places but also collapses several preemption ops into one at others. Overall, the performance difference should be negligible as all involved ops are very lightweight per-cpu ones.
Signed-off-by: Tejun Heo Cc: Peter Zijlstra Signed-off-by: Jens Axboe --- fs/partitions/check.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 96c8bf41e45..c442f0aadac 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -219,10 +219,11 @@ static ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); + int cpu; - preempt_disable(); - part_round_stats(p); - preempt_enable(); + cpu = disk_stat_lock(); + part_round_stats(cpu, p); + disk_stat_unlock(); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " -- cgit v1.2.3 From bcce3de1be61e424deef35d1e86e86a35c4b6e65 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:47:22 +0900 Subject: block: implement extended dev numbers Implement extended device numbers. A block driver can tell block layer that it wants to use extended device numbers. After the usual minor space is used up, block layer automatically allocates devt's from EXT_BLOCK_MAJOR. Currently only one major number is allocated for this but as the allocation is strictly on-demand, ~1mil minor space under it should suffice unless the system actually has more than ~1mil partitions and if that ever happens adding more majors to the extended devt area is easy. Due to internal implementation issues, the first partition can't be allocated on the extended area. In other words, genhd->minors should at least be 1. This limitation will be lifted by later changes. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index c442f0aadac..0d4b7f28f13 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -333,6 +333,7 @@ void delete_partition(struct gendisk *disk, int partno) if (!part) return; + blk_free_devt(part_devt(part)); rcu_assign_pointer(disk->__part[partno-1], NULL); kobject_put(part->holder_dir); device_del(&part->dev); @@ -352,6 +353,7 @@ int add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags) { struct hd_struct *p; + dev_t devt = MKDEV(0, 0); int err; if (disk->__part[partno - 1]) @@ -378,11 +380,15 @@ int add_partition(struct gendisk *disk, int partno, "%s%d", disk->dev.bus_id, partno); device_initialize(&p->dev); - p->dev.devt = MKDEV(disk->major, disk->first_minor + partno); p->dev.class = &block_class; p->dev.type = &part_type; p->dev.parent = &disk->dev; + err = blk_alloc_devt(p, &devt); + if (err) + goto out_put; + p->dev.devt = devt; + /* delay uevent until 'holders' subdir is created */ p->dev.uevent_suppress = 1; err = device_add(&p->dev); @@ -419,6 +425,7 @@ out_del: device_del(&p->dev); out_put: put_device(&p->dev); + blk_free_devt(devt); return err; } -- cgit v1.2.3 From ed9e1982347b36573cd622ee5f4e2a7ccd79b3fd Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:05 +0900 Subject: block: implement and use {disk|part}_to_dev() Implement {disk|part}_to_dev() and use them to access generic device instead of directly dereferencing {disk|part}->dev. To make sure no user is left behind, rename generic devices fields to __dev. This is in preparation of unifying partition 0 handling with other partitions. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 4 +-- fs/partitions/check.c | 79 +++++++++++++++++++++++++++------------------------ 2 files changed, 44 insertions(+), 39 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 2f2873b9a04..a02df22f37c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -543,9 +543,9 @@ EXPORT_SYMBOL(bd_release); static struct kobject *bdev_get_kobj(struct block_device *bdev) { if (bdev->bd_contains != bdev) - return kobject_get(&bdev->bd_part->dev.kobj); + return kobject_get(&part_to_dev(bdev->bd_part)->kobj); else - return kobject_get(&bdev->bd_disk->dev.kobj); + return kobject_get(&disk_to_dev(bdev->bd_disk)->kobj); } static struct kobject *bdev_get_holder(struct block_device *bdev) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 0d4b7f28f13..ac0df3acdcd 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -309,7 +309,7 @@ static inline void disk_sysfs_add_subdirs(struct gendisk *disk) { struct kobject *k; - k = kobject_get(&disk->dev.kobj); + k = kobject_get(&disk_to_dev(disk)->kobj); disk->holder_dir = kobject_create_and_add("holders", k); disk->slave_dir = kobject_create_and_add("slaves", k); kobject_put(k); @@ -322,7 +322,7 @@ static void delete_partition_rcu_cb(struct rcu_head *head) part->start_sect = 0; part->nr_sects = 0; part_stat_set_all(part, 0); - put_device(&part->dev); + put_device(part_to_dev(part)); } void delete_partition(struct gendisk *disk, int partno) @@ -336,7 +336,7 @@ void delete_partition(struct gendisk *disk, int partno) blk_free_devt(part_devt(part)); rcu_assign_pointer(disk->__part[partno-1], NULL); kobject_put(part->holder_dir); - device_del(&part->dev); + device_del(part_to_dev(part)); call_rcu(&part->rcu_head, delete_partition_rcu_cb); } @@ -354,6 +354,9 @@ int add_partition(struct gendisk *disk, int partno, { struct hd_struct *p; dev_t devt = MKDEV(0, 0); + struct device *ddev = disk_to_dev(disk); + struct device *pdev; 
+ const char *dname; int err; if (disk->__part[partno - 1]) @@ -367,42 +370,43 @@ int add_partition(struct gendisk *disk, int partno, err = -ENOMEM; goto out_free; } + pdev = part_to_dev(p); + p->start_sect = start; p->nr_sects = len; p->partno = partno; p->policy = disk->policy; - if (isdigit(disk->dev.bus_id[strlen(disk->dev.bus_id)-1])) - snprintf(p->dev.bus_id, BUS_ID_SIZE, - "%sp%d", disk->dev.bus_id, partno); + dname = dev_name(ddev); + if (isdigit(dname[strlen(dname) - 1])) + snprintf(pdev->bus_id, BUS_ID_SIZE, "%sp%d", dname, partno); else - snprintf(p->dev.bus_id, BUS_ID_SIZE, - "%s%d", disk->dev.bus_id, partno); + snprintf(pdev->bus_id, BUS_ID_SIZE, "%s%d", dname, partno); - device_initialize(&p->dev); - p->dev.class = &block_class; - p->dev.type = &part_type; - p->dev.parent = &disk->dev; + device_initialize(pdev); + pdev->class = &block_class; + pdev->type = &part_type; + pdev->parent = ddev; err = blk_alloc_devt(p, &devt); if (err) - goto out_put; - p->dev.devt = devt; + goto out_free; + pdev->devt = devt; /* delay uevent until 'holders' subdir is created */ - p->dev.uevent_suppress = 1; - err = device_add(&p->dev); + pdev->uevent_suppress = 1; + err = device_add(pdev); if (err) goto out_put; err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &p->dev.kobj); + p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); if (!p->holder_dir) goto out_del; - p->dev.uevent_suppress = 0; + pdev->uevent_suppress = 0; if (flags & ADDPART_FLAG_WHOLEDISK) { - err = device_create_file(&p->dev, &dev_attr_whole_disk); + err = device_create_file(pdev, &dev_attr_whole_disk); if (err) goto out_del; } @@ -412,8 +416,8 @@ int add_partition(struct gendisk *disk, int partno, rcu_assign_pointer(disk->__part[partno - 1], p); /* suppress uevent if the disk supresses it */ - if (!disk->dev.uevent_suppress) - kobject_uevent(&p->dev.kobj, KOBJ_ADD); + if (!ddev->uevent_suppress) + kobject_uevent(&pdev->kobj, KOBJ_ADD); return 0; @@ -422,9 +426,9 @@ out_free: 
return err; out_del: kobject_put(p->holder_dir); - device_del(&p->dev); + device_del(pdev); out_put: - put_device(&p->dev); + put_device(pdev); blk_free_devt(devt); return err; } @@ -432,30 +436,31 @@ out_put: /* Not exported, helper to add_disk(). */ void register_disk(struct gendisk *disk) { + struct device *ddev = disk_to_dev(disk); struct block_device *bdev; struct disk_part_iter piter; struct hd_struct *part; char *s; int err; - disk->dev.parent = disk->driverfs_dev; + ddev->parent = disk->driverfs_dev; - strlcpy(disk->dev.bus_id, disk->disk_name, BUS_ID_SIZE); + strlcpy(ddev->bus_id, disk->disk_name, BUS_ID_SIZE); /* ewww... some of these buggers have / in the name... */ - s = strchr(disk->dev.bus_id, '/'); + s = strchr(ddev->bus_id, '/'); if (s) *s = '!'; /* delay uevents, until we scanned partition table */ - disk->dev.uevent_suppress = 1; + ddev->uevent_suppress = 1; - if (device_add(&disk->dev)) + if (device_add(ddev)) return; #ifndef CONFIG_SYSFS_DEPRECATED - err = sysfs_create_link(block_depr, &disk->dev.kobj, - kobject_name(&disk->dev.kobj)); + err = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); if (err) { - device_del(&disk->dev); + device_del(ddev); return; } #endif @@ -481,13 +486,13 @@ void register_disk(struct gendisk *disk) exit: /* announce disk after possible partitions are created */ - disk->dev.uevent_suppress = 0; - kobject_uevent(&disk->dev.kobj, KOBJ_ADD); + ddev->uevent_suppress = 0; + kobject_uevent(&ddev->kobj, KOBJ_ADD); /* announce possible partitions */ disk_part_iter_init(&piter, disk, 0); while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part->dev.kobj, KOBJ_ADD); + kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); disk_part_iter_exit(&piter); } @@ -518,7 +523,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) return -EIO; /* tell userspace that the media / partition table may have changed */ - kobject_uevent(&disk->dev.kobj, KOBJ_CHANGE); + 
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); for (p = 1; p < state->limit; p++) { sector_t size = state->parts[p].size; @@ -591,7 +596,7 @@ void del_gendisk(struct gendisk *disk) kobject_put(disk->slave_dir); disk->driverfs_dev = NULL; #ifndef CONFIG_SYSFS_DEPRECATED - sysfs_remove_link(block_depr, disk->dev.bus_id); + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); #endif - device_del(&disk->dev); + device_del(disk_to_dev(disk)); } -- cgit v1.2.3 From b5d0b9df0ba5d9a044f3a21e7544f53d90bd1465 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Wed, 3 Sep 2008 09:06:42 +0200 Subject: block: introduce partition 0 genhd and partition code handled disk and partitions separately. All information about the whole disk was in struct genhd and partitions in struct hd_struct. However, the whole disk (part0) and other partitions have a lot in common and the data structures end up having good number of common fields and thus separate code paths doing the same thing. Also, the partition array was indexed by partno - 1 which gets pretty confusing at times. This patch introduces partition 0 and makes the partition array indexed by partno. Following patches will unify the handling of disk and parts piece-by-piece. This patch also implements disk_partitionable() which tests whether a disk is partitionable. With coming dynamic partition array change, the most common usage of disk_max_parts() will be testing whether a disk is partitionable and the number of max partitions will become much less important. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 2 +- fs/partitions/check.c | 12 ++++++------ 2 files changed, 7 insertions(+), 7 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index a02df22f37c..c982a910797 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -892,7 +892,7 @@ int check_disk_change(struct block_device *bdev) if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); - if (disk_max_parts(bdev->bd_disk)) + if (disk_partitionable(bdev->bd_disk)) bdev->bd_invalidated = 1; return 1; } diff --git a/fs/partitions/check.c b/fs/partitions/check.c index ac0df3acdcd..b60699c271a 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -173,7 +173,7 @@ check_partition(struct gendisk *hd, struct block_device *bdev) if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); - state->limit = disk_max_parts(hd) + 1; + state->limit = disk_max_parts(hd); i = res = err = 0; while (!res && check_part[i]) { memset(&state->parts, 0, sizeof(state->parts)); @@ -329,12 +329,12 @@ void delete_partition(struct gendisk *disk, int partno) { struct hd_struct *part; - part = disk->__part[partno-1]; + part = disk->__part[partno]; if (!part) return; blk_free_devt(part_devt(part)); - rcu_assign_pointer(disk->__part[partno-1], NULL); + rcu_assign_pointer(disk->__part[partno], NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); @@ -359,7 +359,7 @@ int add_partition(struct gendisk *disk, int partno, const char *dname; int err; - if (disk->__part[partno - 1]) + if (disk->__part[partno]) return -EBUSY; p = kzalloc(sizeof(*p), GFP_KERNEL); @@ -413,7 +413,7 @@ int add_partition(struct gendisk *disk, int partno, /* everything is up and running, commence */ INIT_RCU_HEAD(&p->rcu_head); - rcu_assign_pointer(disk->__part[partno - 1], p); + rcu_assign_pointer(disk->__part[partno], p); /* suppress uevent if the disk supresses it */ if (!ddev->uevent_suppress) @@ -467,7 +467,7 @@ void 
register_disk(struct gendisk *disk) disk_sysfs_add_subdirs(disk); /* No minors to use for partitions */ - if (!disk_max_parts(disk)) + if (!disk_partitionable(disk)) goto exit; /* No such device (e.g., media were just removed) */ -- cgit v1.2.3 From 80795aefb76d10c5d698e60c7e7750b5330787da Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:07 +0900 Subject: block: move capacity from disk to part0 Move disk->capacity to part0->nr_sects and convert all users who directly accessed the field to use {get|set}_capacity(). This is done early to allow the __dev field to be moved. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index b60699c271a..902b95f1f9d 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -586,7 +586,7 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_exit(&piter); invalidate_partition(disk, 0); - disk->capacity = 0; + set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; unlink_gendisk(disk); disk_stat_set_all(disk, 0); -- cgit v1.2.3 From e56105214943ce5f0901d20e972a7cfd0d1d0656 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:09 +0900 Subject: block: unify sysfs size node handling Now that capacity and __dev are moved to part0, part0 and others can share the same method. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 902b95f1f9d..24d2c56d7d2 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -208,8 +208,8 @@ static ssize_t part_start_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); } -static ssize_t part_size_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_size_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); -- cgit v1.2.3 From b7db9956e57c8151b930d5e5fe5c766e6aad3ff7 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:10 +0900 Subject: block: move policy from disk to part0 Move disk->policy to part0->policy. Implement and use get_disk_ro(). Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 24d2c56d7d2..ace6d03602c 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -375,7 +375,7 @@ int add_partition(struct gendisk *disk, int partno, p->start_sect = start; p->nr_sects = len; p->partno = partno; - p->policy = disk->policy; + p->policy = get_disk_ro(disk); dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) -- cgit v1.2.3 From 4c46501d1659475dc6c89554af6ce7fe6ecf615c Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:11 +0900 Subject: block: move holder_dir from disk to part0 Move disk->holder_dir to part0->holder_dir. Kill now mostly superfluous bdev_get_holder(). While at it, kill superfluous kobject_get/put() around holder_dir, slave_dir and cmd_filter creation and collapse disk_sysfs_add_subdirs() into register_disk().
These serve no purpose but obfuscating the code. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 10 +--------- fs/partitions/check.c | 15 +++------------ 2 files changed, 4 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index c982a910797..57d57264285 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -548,14 +548,6 @@ static struct kobject *bdev_get_kobj(struct block_device *bdev) return kobject_get(&disk_to_dev(bdev->bd_disk)->kobj); } -static struct kobject *bdev_get_holder(struct block_device *bdev) -{ - if (bdev->bd_contains != bdev) - return kobject_get(bdev->bd_part->holder_dir); - else - return kobject_get(bdev->bd_disk->holder_dir); -} - static int add_symlink(struct kobject *from, struct kobject *to) { if (!from || !to) @@ -608,7 +600,7 @@ static int bd_holder_grab_dirs(struct block_device *bdev, if (!bo->sdev) goto fail_put_hdev; - bo->hdir = bdev_get_holder(bdev); + bo->hdir = kobject_get(bdev->bd_part->holder_dir); if (!bo->hdir) goto fail_put_sdev; diff --git a/fs/partitions/check.c b/fs/partitions/check.c index ace6d03602c..f0f604950ff 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -305,16 +305,6 @@ struct device_type part_type = { .release = part_release, }; -static inline void disk_sysfs_add_subdirs(struct gendisk *disk) -{ - struct kobject *k; - - k = kobject_get(&disk_to_dev(disk)->kobj); - disk->holder_dir = kobject_create_and_add("holders", k); - disk->slave_dir = kobject_create_and_add("slaves", k); - kobject_put(k); -} - static void delete_partition_rcu_cb(struct rcu_head *head) { struct hd_struct *part = container_of(head, struct hd_struct, rcu_head); @@ -464,7 +454,8 @@ void register_disk(struct gendisk *disk) return; } #endif - disk_sysfs_add_subdirs(disk); + disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj); /* No minors to use for partitions */ if 
(!disk_partitionable(disk)) @@ -592,7 +583,7 @@ void del_gendisk(struct gendisk *disk) disk_stat_set_all(disk, 0); disk->stamp = 0; - kobject_put(disk->holder_dir); + kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); disk->driverfs_dev = NULL; #ifndef CONFIG_SYSFS_DEPRECATED -- cgit v1.2.3 From 0762b8bde9729f10f8e6249809660ff2ec3ad735 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:12 +0900 Subject: block: always set bdev->bd_part Till now, bdev->bd_part is set only if the bdev was for parts other than part0. This patch makes bdev->bd_part always set so that code paths don't have to differentiate common handling. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/block_dev.c | 67 ++++++++++++++++++++++++--------------------------- fs/partitions/check.c | 7 +----- 2 files changed, 32 insertions(+), 42 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 57d57264285..c3fa19bd64d 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -540,14 +540,6 @@ EXPORT_SYMBOL(bd_release); * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 */ -static struct kobject *bdev_get_kobj(struct block_device *bdev) -{ - if (bdev->bd_contains != bdev) - return kobject_get(&part_to_dev(bdev->bd_part)->kobj); - else - return kobject_get(&disk_to_dev(bdev->bd_disk)->kobj); -} - static int add_symlink(struct kobject *from, struct kobject *to) { if (!from || !to) @@ -596,7 +588,7 @@ static int bd_holder_grab_dirs(struct block_device *bdev, if (!bo->hdev) goto fail_put_sdir; - bo->sdev = bdev_get_kobj(bdev); + bo->sdev = kobject_get(&part_to_dev(bdev->bd_part)->kobj); if (!bo->sdev) goto fail_put_hdev; @@ -919,7 +911,6 @@ static int __blkdev_put(struct block_device *bdev, int for_part); static int do_open(struct block_device *bdev, struct file *file, int for_part) { - struct module *owner = NULL; struct gendisk *disk; struct hd_struct *part = NULL; int ret; @@ -941,25 +932,27 @@ static int do_open(struct block_device *bdev,
struct file *file, int for_part) ret = -ENXIO; file->f_mapping = bdev->bd_inode->i_mapping; + lock_kernel(); + disk = get_gendisk(bdev->bd_dev, &partno); - if (!disk) { - unlock_kernel(); - bdput(bdev); - return ret; - } - owner = disk->fops->owner; + if (!disk) + goto out_unlock_kernel; + part = disk_get_part(disk, partno); + if (!part) + goto out_unlock_kernel; mutex_lock_nested(&bdev->bd_mutex, for_part); if (!bdev->bd_openers) { bdev->bd_disk = disk; + bdev->bd_part = part; bdev->bd_contains = bdev; if (!partno) { struct backing_dev_info *bdi; if (disk->fops->open) { ret = disk->fops->open(bdev->bd_inode, file); if (ret) - goto out_first; + goto out_clear; } if (!bdev->bd_openers) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); @@ -975,31 +968,32 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) whole = bdget_disk(disk, 0); ret = -ENOMEM; if (!whole) - goto out_first; + goto out_clear; BUG_ON(for_part); ret = __blkdev_get(whole, file->f_mode, file->f_flags, 1); if (ret) - goto out_first; + goto out_clear; bdev->bd_contains = whole; - part = disk_get_part(disk, partno); bdev->bd_inode->i_data.backing_dev_info = whole->bd_inode->i_data.backing_dev_info; if (!(disk->flags & GENHD_FL_UP) || !part || !part->nr_sects) { ret = -ENXIO; - goto out_first; + goto out_clear; } - bdev->bd_part = part; bd_set_size(bdev, (loff_t)part->nr_sects << 9); } } else { + disk_put_part(part); put_disk(disk); - module_put(owner); + module_put(disk->fops->owner); + part = NULL; + disk = NULL; if (bdev->bd_contains == bdev) { if (bdev->bd_disk->fops->open) { ret = bdev->bd_disk->fops->open(bdev->bd_inode, file); if (ret) - goto out; + goto out_unlock_bdev; } if (bdev->bd_invalidated) rescan_partitions(bdev->bd_disk, bdev); @@ -1012,20 +1006,24 @@ static int do_open(struct block_device *bdev, struct file *file, int for_part) unlock_kernel(); return 0; -out_first: + out_clear: bdev->bd_disk = NULL; + bdev->bd_part = NULL; 
bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) __blkdev_put(bdev->bd_contains, 1); bdev->bd_contains = NULL; - put_disk(disk); - disk_put_part(part); - module_put(owner); -out: + out_unlock_bdev: mutex_unlock(&bdev->bd_mutex); + out_unlock_kernel: unlock_kernel(); - if (ret) - bdput(bdev); + + disk_put_part(part); + if (disk) + module_put(disk->fops->owner); + put_disk(disk); + bdput(bdev); + return ret; } @@ -1110,11 +1108,8 @@ static int __blkdev_put(struct block_device *bdev, int for_part) put_disk(disk); module_put(owner); - - if (bdev->bd_contains != bdev) { - disk_put_part(bdev->bd_part); - bdev->bd_part = NULL; - } + disk_put_part(bdev->bd_part); + bdev->bd_part = NULL; bdev->bd_disk = NULL; bdev->bd_inode->i_data.backing_dev_info = &default_backing_dev_info; if (bdev != bdev->bd_contains) diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f0f604950ff..87298c0fc8c 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -134,12 +134,7 @@ char *disk_name(struct gendisk *hd, int partno, char *buf) const char *bdevname(struct block_device *bdev, char *buf) { - int partno = 0; - - if (bdev->bd_part) - partno = bdev->bd_part->partno; - - return disk_name(bdev->bd_disk, partno, buf); + return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); } EXPORT_SYMBOL(bdevname); -- cgit v1.2.3 From eddb2e26b5ee3c5da68ba4bf1921ba20e2097bff Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:13 +0900 Subject: block: kill GENHD_FL_FAIL and use part0->make_it_fail GENHD_FL_FAIL for disk is what make_it_fail is for parts. Kill it and use part0->make_it_fail. Sysfs node handling is unified too. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 87298c0fc8c..60592d9f43b 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -238,17 +238,17 @@ static ssize_t part_stat_show(struct device *dev, } #ifdef CONFIG_FAIL_MAKE_REQUEST -static ssize_t part_fail_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_fail_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); return sprintf(buf, "%d\n", p->make_it_fail); } -static ssize_t part_fail_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) +ssize_t part_fail_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) { struct hd_struct *p = dev_to_part(dev); int i; -- cgit v1.2.3 From 074a7aca7afa6f230104e8e65eba3420263714a5 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:14 +0900 Subject: block: move stats from disk to part0 Move stats related fields - stamp, in_flight, dkstats - from disk to part0 and unify stat handling such that... * part_stat_*() now updates part0 together if the specified partition is not part0. i.e. part_stat_*() are now essentially all_stat_*(). * {disk|all}_stat_*() are gone. * part_round_stats() is updated similarly. It handles part0 stats automatically and disk_round_stats() is killed. * part_{inc|dec}_in_flight() is implemented which automatically updates part0 stats for parts other than part0. * disk_map_sector_rcu() is updated to return part0 if no part matches. Combined with the above changes, this makes NULL special case handling in callers unnecessary. * Separate stats show code paths for disk are collapsed into part stats show code paths.
* Rename disk_stat_lock/unlock() to part_stat_lock/unlock() While at it, reposition stat handling macros a bit and add missing parentheses around macro parameters. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 60592d9f43b..f517869e8d1 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -210,15 +210,15 @@ ssize_t part_size_show(struct device *dev, return sprintf(buf, "%llu\n",(unsigned long long)p->nr_sects); } -static ssize_t part_stat_show(struct device *dev, - struct device_attribute *attr, char *buf) +ssize_t part_stat_show(struct device *dev, + struct device_attribute *attr, char *buf) { struct hd_struct *p = dev_to_part(dev); int cpu; - cpu = disk_stat_lock(); + cpu = part_stat_lock(); part_round_stats(cpu, p); - disk_stat_unlock(); + part_stat_unlock(); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u " @@ -575,8 +575,8 @@ void del_gendisk(struct gendisk *disk) set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; unlink_gendisk(disk); - disk_stat_set_all(disk, 0); - disk->stamp = 0; + part_stat_set_all(&disk->part0, 0); + disk->part0.stamp = 0; kobject_put(disk->part0.holder_dir); kobject_put(disk->slave_dir); -- cgit v1.2.3 From 540eed5637b766bb1e881ef744c42617760b4815 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:15 +0900 Subject: block: make partition array dynamic disk->__part used to be statically allocated to the maximum possible number of partitions. This patch makes partition array allocation dynamic. The added overhead is minimal as only real change is one memory dereference changed to RCU one. This saves both a bit of memory and cpu cycles iterating through unoccupied slots and makes increasing partition limit easier. 
Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.c | 31 ++++++++++++++++++++++++++----- 1 file changed, 26 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index f517869e8d1..772b2ed8d23 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -312,14 +312,18 @@ static void delete_partition_rcu_cb(struct rcu_head *head) void delete_partition(struct gendisk *disk, int partno) { + struct disk_part_tbl *ptbl = disk->part_tbl; struct hd_struct *part; - part = disk->__part[partno]; + if (partno >= ptbl->len) + return; + + part = ptbl->part[partno]; if (!part) return; blk_free_devt(part_devt(part)); - rcu_assign_pointer(disk->__part[partno], NULL); + rcu_assign_pointer(ptbl->part[partno], NULL); kobject_put(part->holder_dir); device_del(part_to_dev(part)); @@ -341,10 +345,16 @@ int add_partition(struct gendisk *disk, int partno, dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; + struct disk_part_tbl *ptbl; const char *dname; int err; - if (disk->__part[partno]) + err = disk_expand_part_tbl(disk, partno); + if (err) + return err; + ptbl = disk->part_tbl; + + if (ptbl->part[partno]) return -EBUSY; p = kzalloc(sizeof(*p), GFP_KERNEL); @@ -398,7 +408,7 @@ int add_partition(struct gendisk *disk, int partno, /* everything is up and running, commence */ INIT_RCU_HEAD(&p->rcu_head); - rcu_assign_pointer(disk->__part[partno], p); + rcu_assign_pointer(ptbl->part[partno], p); /* suppress uevent if the disk supresses it */ if (!ddev->uevent_suppress) @@ -487,7 +497,7 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) struct disk_part_iter piter; struct hd_struct *part; struct parsed_partitions *state; - int p, res; + int p, highest, res; if (bdev->bd_part_count) return -EBUSY; @@ -511,6 +521,17 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) /* tell userspace that the media / partition table may have 
changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + /* Detect the highest partition number and preallocate + * disk->part_tbl. This is an optimization and not strictly + * necessary. + */ + for (p = 1, highest = 0; p < state->limit; p++) + if (state->parts[p].size) + highest = p; + + disk_expand_part_tbl(disk, highest); + + /* add partitions */ for (p = 1; p < state->limit; p++) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; -- cgit v1.2.3 From 689d6fac40b41c7bf154f362deaf442548e4dc81 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:16 +0900 Subject: block: replace @ext_minors with GENHD_FL_EXT_DEVT With previous changes, it's meaningless to limit the number of partitions. Replace @ext_minors with GENHD_FL_EXT_DEVT such that setting the flag allows the disk to have maximum number of allowed partitions (only limited by the number of entries in parsed_partitions as determined by MAX_PART constant). This kills not-too-pretty alloc_disk_ext[_node]() functions and makes @minors parameter to alloc_disk[_node]() unnecessary. The parameter is left alone to avoid disturbing the users. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/partitions/check.h b/fs/partitions/check.h index 17ae8ecd9e8..98dbe1a8452 100644 --- a/fs/partitions/check.h +++ b/fs/partitions/check.h @@ -5,15 +5,13 @@ * add_gd_partition adds a partitions details to the devices partition * description. 
*/ -enum { MAX_PART = 256 }; - struct parsed_partitions { char name[BDEVNAME_SIZE]; struct { sector_t from; sector_t size; int flags; - } parts[MAX_PART]; + } parts[DISK_MAX_PARTS]; int next; int limit; }; -- cgit v1.2.3 From 3e1a7ff8a0a7b948f2684930166954f9e8e776fe Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 25 Aug 2008 19:56:17 +0900 Subject: block: allow disk to have extended device number Now that disk and partition handlings are mostly unified, it's easy to allow disk to have extended device number. This patch makes add_disk() use extended device number if disk->minors is zero. Both sd and ide-disk are updated to use this. * sd_format_disk_name() is implemented which can generically determine the drive name. This removes disk number restriction stemming from limited device names. * If sd index goes over SD_MAX_DISKS (which can be increased now BTW), sd simply doesn't initialize minors letting block layer choose extended device number. * If CONFIG_DEBUG_EXT_DEVT is set, both sd and ide-disk always set minors to 0 and use extended device numbers. Signed-off-by: Tejun Heo Signed-off-by: Jens Axboe --- fs/partitions/check.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 772b2ed8d23..0e411603fdf 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -593,6 +593,7 @@ void del_gendisk(struct gendisk *disk) disk_part_iter_exit(&piter); invalidate_partition(disk, 0); + blk_free_devt(disk_to_dev(disk)->devt); set_capacity(disk, 0); disk->flags &= ~GENHD_FL_UP; unlink_gendisk(disk); -- cgit v1.2.3 From c7c22e4d5c1fdebfac4dba76de7d0338c2b0d832 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Sat, 13 Sep 2008 20:26:01 +0200 Subject: block: add support for IO CPU affinity This patch adds support for controlling the IO completion CPU of either all requests on a queue, or on a per-request basis. 
We export a sysfs variable (rq_affinity) which, if set, migrates completions of requests to the CPU that originally submitted them. A bio helper (bio_set_completion_cpu()) is also added, so that queuers can ask for completion on that specific CPU. In testing, this has been shown to cut the system time by as much as 20-40% on synthetic workloads where CPU affinity is desired. This requires a little help from the architecture, so it'll only work as designed for archs that are using the new generic smp helper infrastructure. Signed-off-by: Jens Axboe --- fs/bio.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index bee4deca774..6a637b5c24b 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -111,6 +111,7 @@ void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); bio->bi_flags = 1 << BIO_UPTODATE; + bio->bi_comp_cpu = -1; atomic_set(&bio->bi_cnt, 1); } -- cgit v1.2.3 From a3bce90edd8f6cafe3f63b1a943800792e830178 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 28 Aug 2008 16:17:05 +0900 Subject: block: add gfp_mask argument to blk_rq_map_user and blk_rq_map_user_iov Currently, blk_rq_map_user and blk_rq_map_user_iov always do GFP_KERNEL allocation. This adds gfp_mask argument to blk_rq_map_user and blk_rq_map_user_iov so sg can use it (sg always does GFP_ATOMIC allocation). Signed-off-by: FUJITA Tomonori Signed-off-by: Douglas Gilbert Cc: Mike Christie Cc: James Bottomley Signed-off-by: Jens Axboe --- fs/bio.c | 33 +++++++++++++++++++-------------- 1 file changed, 19 insertions(+), 14 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 6a637b5c24b..3d2e9ad2472 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -558,13 +558,14 @@ int bio_uncopy_user(struct bio *bio) * @iov: the iovec.
* @iov_count: number of elements in the iovec * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. */ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, - int iov_count, int write_to_vm) + int iov_count, int write_to_vm, gfp_t gfp_mask) { struct bio_map_data *bmd; struct bio_vec *bvec; @@ -587,12 +588,12 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, len += iov[i].iov_len; } - bmd = bio_alloc_map_data(nr_pages, iov_count, GFP_KERNEL); + bmd = bio_alloc_map_data(nr_pages, iov_count, gfp_mask); if (!bmd) return ERR_PTR(-ENOMEM); ret = -ENOMEM; - bio = bio_alloc(GFP_KERNEL, nr_pages); + bio = bio_alloc(gfp_mask, nr_pages); if (!bio) goto out_bmd; @@ -605,7 +606,7 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | GFP_KERNEL); + page = alloc_page(q->bounce_gfp | gfp_mask); if (!page) { ret = -ENOMEM; break; @@ -647,26 +648,27 @@ out_bmd: * @uaddr: start of user address * @len: length in bytes * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Prepares and returns a bio for indirect user io, bouncing data * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. 
*/ struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, - unsigned int len, int write_to_vm) + unsigned int len, int write_to_vm, gfp_t gfp_mask) { struct sg_iovec iov; iov.iov_base = (void __user *)uaddr; iov.iov_len = len; - return bio_copy_user_iov(q, &iov, 1, write_to_vm); + return bio_copy_user_iov(q, &iov, 1, write_to_vm, gfp_mask); } static struct bio *__bio_map_user_iov(struct request_queue *q, struct block_device *bdev, struct sg_iovec *iov, int iov_count, - int write_to_vm) + int write_to_vm, gfp_t gfp_mask) { int i, j; int nr_pages = 0; @@ -692,12 +694,12 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, if (!nr_pages) return ERR_PTR(-EINVAL); - bio = bio_alloc(GFP_KERNEL, nr_pages); + bio = bio_alloc(gfp_mask, nr_pages); if (!bio) return ERR_PTR(-ENOMEM); ret = -ENOMEM; - pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); + pages = kcalloc(nr_pages, sizeof(struct page *), gfp_mask); if (!pages) goto out; @@ -776,19 +778,21 @@ static struct bio *__bio_map_user_iov(struct request_queue *q, * @uaddr: start of user address * @len: length in bytes * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Map the user space address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, - unsigned long uaddr, unsigned int len, int write_to_vm) + unsigned long uaddr, unsigned int len, int write_to_vm, + gfp_t gfp_mask) { struct sg_iovec iov; iov.iov_base = (void __user *)uaddr; iov.iov_len = len; - return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm); + return bio_map_user_iov(q, bdev, &iov, 1, write_to_vm, gfp_mask); } /** @@ -798,18 +802,19 @@ struct bio *bio_map_user(struct request_queue *q, struct block_device *bdev, * @iov: the iovec. 
* @iov_count: number of elements in the iovec * @write_to_vm: bool indicating writing to pages or not + * @gfp_mask: memory allocation flags * * Map the user space address into a bio suitable for io to a block * device. Returns an error pointer in case of error. */ struct bio *bio_map_user_iov(struct request_queue *q, struct block_device *bdev, struct sg_iovec *iov, int iov_count, - int write_to_vm) + int write_to_vm, gfp_t gfp_mask) { struct bio *bio; - bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm); - + bio = __bio_map_user_iov(q, bdev, iov, iov_count, write_to_vm, + gfp_mask); if (IS_ERR(bio)) return bio; -- cgit v1.2.3 From 152e283fdfea0cd11e297d982378b55937842dde Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 28 Aug 2008 16:17:06 +0900 Subject: block: introduce struct rq_map_data to use reserved pages This patch introduces struct rq_map_data to enable bio_copy_user_iov() to use reserved pages. Currently, bio_copy_user_iov allocates bounce pages but drivers/scsi/sg.c wants to allocate pages by itself and use them. struct rq_map_data can be used to pass allocated pages to bio_copy_user_iov. The current users of bio_copy_user_iov simply pass NULL (they don't want to use pre-allocated pages).
Signed-off-by: FUJITA Tomonori Cc: Jens Axboe Cc: Douglas Gilbert Cc: Mike Christie Cc: James Bottomley Signed-off-by: Jens Axboe --- fs/bio.c | 58 +++++++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 17 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 3d2e9ad2472..a2f072647cd 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -439,16 +439,19 @@ int bio_add_page(struct bio *bio, struct page *page, unsigned int len, struct bio_map_data { struct bio_vec *iovecs; - int nr_sgvecs; struct sg_iovec *sgvecs; + int nr_sgvecs; + int is_our_pages; }; static void bio_set_map_data(struct bio_map_data *bmd, struct bio *bio, - struct sg_iovec *iov, int iov_count) + struct sg_iovec *iov, int iov_count, + int is_our_pages) { memcpy(bmd->iovecs, bio->bi_io_vec, sizeof(struct bio_vec) * bio->bi_vcnt); memcpy(bmd->sgvecs, iov, sizeof(struct sg_iovec) * iov_count); bmd->nr_sgvecs = iov_count; + bmd->is_our_pages = is_our_pages; bio->bi_private = bmd; } @@ -483,7 +486,8 @@ static struct bio_map_data *bio_alloc_map_data(int nr_segs, int iov_count, } static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, - struct sg_iovec *iov, int iov_count, int uncopy) + struct sg_iovec *iov, int iov_count, int uncopy, + int do_free_page) { int ret = 0, i; struct bio_vec *bvec; @@ -526,7 +530,7 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, } } - if (uncopy) + if (do_free_page) __free_page(bvec->bv_page); } @@ -545,7 +549,8 @@ int bio_uncopy_user(struct bio *bio) struct bio_map_data *bmd = bio->bi_private; int ret; - ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1); + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1, + bmd->is_our_pages); bio_free_map_data(bmd); bio_put(bio); @@ -555,6 +560,7 @@ int bio_uncopy_user(struct bio *bio) /** * bio_copy_user_iov - copy user data to bio * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if 
necessary) * @iov: the iovec. * @iov_count: number of elements in the iovec * @write_to_vm: bool indicating writing to pages or not @@ -564,8 +570,10 @@ int bio_uncopy_user(struct bio *bio) * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. */ -struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, - int iov_count, int write_to_vm, gfp_t gfp_mask) +struct bio *bio_copy_user_iov(struct request_queue *q, + struct rq_map_data *map_data, + struct sg_iovec *iov, int iov_count, + int write_to_vm, gfp_t gfp_mask) { struct bio_map_data *bmd; struct bio_vec *bvec; @@ -600,13 +608,26 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, bio->bi_rw |= (!write_to_vm << BIO_RW); ret = 0; + i = 0; while (len) { - unsigned int bytes = PAGE_SIZE; + unsigned int bytes; + + if (map_data) + bytes = 1U << (PAGE_SHIFT + map_data->page_order); + else + bytes = PAGE_SIZE; if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | gfp_mask); + if (map_data) { + if (i == map_data->nr_entries) { + ret = -ENOMEM; + break; + } + page = map_data->pages[i++]; + } else + page = alloc_page(q->bounce_gfp | gfp_mask); if (!page) { ret = -ENOMEM; break; @@ -625,16 +646,17 @@ struct bio *bio_copy_user_iov(struct request_queue *q, struct sg_iovec *iov, * success */ if (!write_to_vm) { - ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0); + ret = __bio_copy_iov(bio, bio->bi_io_vec, iov, iov_count, 0, 0); if (ret) goto cleanup; } - bio_set_map_data(bmd, bio, iov, iov_count); + bio_set_map_data(bmd, bio, iov, iov_count, map_data ? 
0 : 1); return bio; cleanup: - bio_for_each_segment(bvec, bio, i) - __free_page(bvec->bv_page); + if (!map_data) + bio_for_each_segment(bvec, bio, i) + __free_page(bvec->bv_page); bio_put(bio); out_bmd: @@ -645,6 +667,7 @@ out_bmd: /** * bio_copy_user - copy user data to bio * @q: destination block queue + * @map_data: pointer to the rq_map_data holding pages (if necessary) * @uaddr: start of user address * @len: length in bytes * @write_to_vm: bool indicating writing to pages or not @@ -654,15 +677,16 @@ out_bmd: * to/from kernel pages as necessary. Must be paired with * call bio_uncopy_user() on io completion. */ -struct bio *bio_copy_user(struct request_queue *q, unsigned long uaddr, - unsigned int len, int write_to_vm, gfp_t gfp_mask) +struct bio *bio_copy_user(struct request_queue *q, struct rq_map_data *map_data, + unsigned long uaddr, unsigned int len, + int write_to_vm, gfp_t gfp_mask) { struct sg_iovec iov; iov.iov_base = (void __user *)uaddr; iov.iov_len = len; - return bio_copy_user_iov(q, &iov, 1, write_to_vm, gfp_mask); + return bio_copy_user_iov(q, map_data, &iov, 1, write_to_vm, gfp_mask); } static struct bio *__bio_map_user_iov(struct request_queue *q, @@ -1028,7 +1052,7 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, bio->bi_private = bmd; bio->bi_end_io = bio_copy_kern_endio; - bio_set_map_data(bmd, bio, &iov, 1); + bio_set_map_data(bmd, bio, &iov, 1, 1); return bio; cleanup: bio_for_each_segment(bvec, bio, i) -- cgit v1.2.3 From 4d8ab62e087d9300883b82c2662e73e6eef803a3 Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: Thu, 28 Aug 2008 15:05:57 +0900 Subject: bio: convert bio_copy_kern to use bio_copy_user bio_copy_kern and bio_copy_user are very similar. This converts bio_copy_kern to use bio_copy_user. 
Signed-off-by: FUJITA Tomonori Cc: Jens Axboe Signed-off-by: Jens Axboe --- fs/bio.c | 54 ++++-------------------------------------------------- 1 file changed, 4 insertions(+), 50 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index a2f072647cd..9d68ddb89b7 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -995,48 +995,13 @@ static void bio_copy_kern_endio(struct bio *bio, int err) struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, gfp_t gfp_mask, int reading) { - unsigned long kaddr = (unsigned long)data; - unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT; - unsigned long start = kaddr >> PAGE_SHIFT; - const int nr_pages = end - start; struct bio *bio; struct bio_vec *bvec; - struct bio_map_data *bmd; - int i, ret; - struct sg_iovec iov; - - iov.iov_base = data; - iov.iov_len = len; - - bmd = bio_alloc_map_data(nr_pages, 1, gfp_mask); - if (!bmd) - return ERR_PTR(-ENOMEM); - - ret = -ENOMEM; - bio = bio_alloc(gfp_mask, nr_pages); - if (!bio) - goto out_bmd; - - while (len) { - struct page *page; - unsigned int bytes = PAGE_SIZE; - - if (bytes > len) - bytes = len; - - page = alloc_page(q->bounce_gfp | gfp_mask); - if (!page) { - ret = -ENOMEM; - goto cleanup; - } - - if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes) { - ret = -EINVAL; - goto cleanup; - } + int i; - len -= bytes; - } + bio = bio_copy_user(q, NULL, (unsigned long)data, len, 1, gfp_mask); + if (IS_ERR(bio)) + return bio; if (!reading) { void *p = data; @@ -1049,20 +1014,9 @@ struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len, } } - bio->bi_private = bmd; bio->bi_end_io = bio_copy_kern_endio; - bio_set_map_data(bmd, bio, &iov, 1, 1); return bio; -cleanup: - bio_for_each_segment(bvec, bio, i) - __free_page(bvec->bv_page); - - bio_put(bio); -out_bmd: - bio_free_map_data(bmd); - - return ERR_PTR(ret); } /* -- cgit v1.2.3 From 818827669d85b84241696ffef2de485db46b0b5e Mon Sep 17 00:00:00 2001 From: FUJITA Tomonori Date: 
Tue, 2 Sep 2008 16:20:19 +0900 Subject: block: make blk_rq_map_user take a NULL user-space buffer This patch changes blk_rq_map_user to accept a NULL user-space buffer with a READ command if rq_map_data is not NULL. Thus a caller can pass page frames to blk_rq_map_user to just set up a request and bios with page frames properly. bio_uncopy_user (called via blk_rq_unmap_user) doesn't copy data to user space with such a request. Signed-off-by: FUJITA Tomonori Signed-off-by: Jens Axboe --- fs/bio.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 9d68ddb89b7..355302985e2 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -547,11 +547,11 @@ static int __bio_copy_iov(struct bio *bio, struct bio_vec *iovecs, int bio_uncopy_user(struct bio *bio) { struct bio_map_data *bmd = bio->bi_private; - int ret; - - ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, bmd->nr_sgvecs, 1, - bmd->is_our_pages); + int ret = 0; + if (!bio_flagged(bio, BIO_NULL_MAPPED)) + ret = __bio_copy_iov(bio, bmd->iovecs, bmd->sgvecs, + bmd->nr_sgvecs, 1, bmd->is_our_pages); bio_free_map_data(bmd); bio_put(bio); return ret; -- cgit v1.2.3 From 0c002c2f74e10baa9021d3ecc50585c6eafea568 Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Thu, 4 Sep 2008 14:27:20 -0600 Subject: Wrapper for lower-level revalidate_disk routines. This is a wrapper for the lower-level revalidate_disk call-backs such as sd_revalidate_disk(). It allows us to perform pre and post operations when calling them. We will use this wrapper in a later patch to adjust block device sizes after an online resize (a _post_ operation).
Signed-off-by: Andrew Patterson Signed-off-by: Jens Axboe --- fs/block_dev.c | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index c3fa19bd64d..4eeb69a8873 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -852,6 +852,27 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode) EXPORT_SYMBOL(open_by_devnum); +/** + * revalidate_disk - wrapper for lower-level driver's revalidate_disk + * call-back + * + * @disk: struct gendisk to be revalidated + * + * This routine is a wrapper for lower-level driver's revalidate_disk + * call-backs. It is used to do common pre and post operations needed + * for all revalidate_disk operations. + */ +int revalidate_disk(struct gendisk *disk) +{ + int ret = 0; + + if (disk->fops->revalidate_disk) + ret = disk->fops->revalidate_disk(disk); + + return ret; +} +EXPORT_SYMBOL(revalidate_disk); + /* * This routine checks whether a removable media has been changed, * and invalidates all buffer-cache-entries in that case. This -- cgit v1.2.3 From c3279d1454cdfed02a557d789d8a6d08ab4cbe70 Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Thu, 4 Sep 2008 14:27:25 -0600 Subject: Adjust block device size after an online resize of a disk. The revalidate_disk routine now checks if a disk has been resized by comparing the gendisk capacity to the bdev inode size. If they are different (usually because the disk has been resized underneath the kernel) the bdev inode size is adjusted to match the capacity. 
Signed-off-by: Andrew Patterson Signed-off-by: Jens Axboe --- fs/block_dev.c | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 4eeb69a8873..b721955d382 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -852,6 +852,34 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode) EXPORT_SYMBOL(open_by_devnum); +/** + * check_disk_size_change - checks for disk size change and adjusts + * bdev size. + * + * @disk: struct gendisk to check + * @bdev: struct bdev to adjust. + * + * This routine checks to see if the bdev size does not match the disk size + * and adjusts it if it differs. + */ +void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) +{ + loff_t disk_size, bdev_size; + + disk_size = (loff_t)get_capacity(disk) << 9; + bdev_size = i_size_read(bdev->bd_inode); + if (disk_size != bdev_size) { + char name[BDEVNAME_SIZE]; + + disk_name(disk, 0, name); + printk(KERN_INFO + "%s: detected capacity change from %lld to %lld\n", + name, bdev_size, disk_size); + i_size_write(bdev->bd_inode, disk_size); + } +} +EXPORT_SYMBOL(check_disk_size_change); + /** * revalidate_disk - wrapper for lower-level driver's revalidate_disk * call-back @@ -864,11 +892,20 @@ EXPORT_SYMBOL(open_by_devnum); */ int revalidate_disk(struct gendisk *disk) { + struct block_device *bdev; int ret = 0; if (disk->fops->revalidate_disk) ret = disk->fops->revalidate_disk(disk); + bdev = bdget_disk(disk, 0); + if (!bdev) + return ret; + + mutex_lock(&bdev->bd_mutex); + check_disk_size_change(disk, bdev); + mutex_unlock(&bdev->bd_mutex); + bdput(bdev); return ret; } EXPORT_SYMBOL(revalidate_disk); -- cgit v1.2.3 From 9bc3ffbfbdf71fefda8a261ef8d6fdc388a29b42 Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Thu, 4 Sep 2008 14:27:30 -0600 Subject: Check for device resize when rescanning partitions Check for device resize in the rescan_partitions() routine. 
If the device has been resized, the bdev size is set to match. The rescan_partitions() routine is called when opening the device and when calling the BLKRRPART ioctl. Signed-off-by: Andrew Patterson Signed-off-by: Jens Axboe --- fs/partitions/check.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/partitions/check.c b/fs/partitions/check.c index 0e411603fdf..7408227c49c 100644 --- a/fs/partitions/check.c +++ b/fs/partitions/check.c @@ -504,7 +504,6 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) res = invalidate_partition(disk, 0); if (res) return res; - bdev->bd_invalidated = 0; disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); while ((part = disk_part_iter_next(&piter))) @@ -513,6 +512,8 @@ int rescan_partitions(struct gendisk *disk, struct block_device *bdev) if (disk->fops->revalidate_disk) disk->fops->revalidate_disk(disk); + check_disk_size_change(disk, bdev); + bdev->bd_invalidated = 0; if (!get_capacity(disk) || !(state = check_partition(disk, bdev))) return 0; if (IS_ERR(state)) /* I/O error reading the partition table */ -- cgit v1.2.3 From 56ade44b46780fa291fa68b824f1dafdcb11b0ca Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Thu, 4 Sep 2008 14:27:40 -0600 Subject: Added flush_disk to factor out common buffer cache flushing code. We need to be able to flush the buffer cache for more than just when a disk is changed, so we factor out common cache flush code in check_disk_change() to an internal flush_disk() routine. This routine will then be used for both disk changes and disk resizes (in a later patch). Include the disk name in the text indicating that there are busy inodes on the device and increase the KERN severity of the message.
Signed-off-by: Andrew Patterson Signed-off-by: Jens Axboe --- fs/block_dev.c | 32 +++++++++++++++++++++++++++----- 1 file changed, 27 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index b721955d382..33650fc537c 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -852,6 +852,32 @@ struct block_device *open_by_devnum(dev_t dev, unsigned mode) EXPORT_SYMBOL(open_by_devnum); +/** + * flush_disk - invalidates all buffer-cache entries on a disk + * + * @bdev: struct block device to be flushed + * + * Invalidates all buffer-cache entries on a disk. It should be called + * when a disk has been changed -- either by a media change or online + * resize. + */ +static void flush_disk(struct block_device *bdev) +{ + if (__invalidate_device(bdev)) { + char name[BDEVNAME_SIZE] = ""; + + if (bdev->bd_disk) + disk_name(bdev->bd_disk, 0, name); + printk(KERN_WARNING "VFS: busy inodes on changed media or " + "resized disk %s\n", name); + } + + if (!bdev->bd_disk) + return; + if (disk_partitionable(bdev->bd_disk)) + bdev->bd_invalidated = 1; +} + /** * check_disk_size_change - checks for disk size change and adjusts * bdev size. @@ -929,13 +955,9 @@ int check_disk_change(struct block_device *bdev) if (!bdops->media_changed(bdev->bd_disk)) return 0; - if (__invalidate_device(bdev)) - printk("VFS: busy inodes on changed media.\n"); - + flush_disk(bdev); if (bdops->revalidate_disk) bdops->revalidate_disk(bdev->bd_disk); - if (disk_partitionable(bdev->bd_disk)) - bdev->bd_invalidated = 1; return 1; } -- cgit v1.2.3 From 608aeef17a91747d6303de4df5e2c2e6899a95e8 Mon Sep 17 00:00:00 2001 From: Andrew Patterson Date: Thu, 4 Sep 2008 14:27:45 -0600 Subject: Call flush_disk() after detecting an online resize. We call flush_disk() to make sure the buffer cache for the disk is flushed after a disk resize. There are two resize cases, growing and shrinking. 
Given that users can shrink/then grow a disk before revalidate_disk() is called, we treat the grow case identically to shrinking. We need to flush the buffer cache after an online shrink because, as James Bottomley puts it, The two use cases for shrinking I can see are 1. planned: the fs is already shrunk to within the new boundaries and all data is relocated, so invalidate is fine (any dirty buffers that might exist in the shrunk region are there only because they were relocated but not yet written to their original location). 2. unplanned: In this case, the fs is probably toast, so whether we invalidate or not isn't going to make a whole lot of difference; it's still going to try to read or write from sectors beyond the new size and get I/O errors. Immediately invalidating shrunk disks will cause errors for outstanding I/Os for reads/writes beyond the new end of the disk to be generated earlier than if we waited for the normal buffer cache operation. It also removes a potential security hole where we might keep old data around from beyond the end of the shrunk disk if the disk was not invalidated. Signed-off-by: Andrew Patterson Signed-off-by: Jens Axboe --- fs/block_dev.c | 1 + 1 file changed, 1 insertion(+) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 33650fc537c..57e2786dd2a 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -902,6 +902,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) "%s: detected capacity change from %lld to %lld\n", name, bdev_size, disk_size); i_size_write(bdev->bd_inode, disk_size); + flush_disk(bdev); } } EXPORT_SYMBOL(check_disk_size_change); -- cgit v1.2.3 From 0a0d96b03a1f3bfd6bc3ea08008699e8e59fccd9 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 11 Sep 2008 13:17:37 +0200 Subject: block: add bio_kmalloc() Not all callers need (or want!)
the mempool backing guarantee, it essentially means that you can only use bio_alloc() for short allocations and not for preallocating some bio's at setup or init time. So add bio_kmalloc() which does the same thing as bio_alloc(), except it just uses kmalloc() as the backing instead of the bio mempools. Signed-off-by: Jens Axboe --- fs/bio.c | 96 ++++++++++++++++++++++++++++++++++++++++++++++++++-------------- 1 file changed, 75 insertions(+), 21 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index 355302985e2..e56e7685af9 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -60,25 +60,46 @@ struct bio_vec *bvec_alloc_bs(gfp_t gfp_mask, int nr, unsigned long *idx, struct struct bio_vec *bvl; /* - * see comment near bvec_array define! + * If 'bs' is given, lookup the pool and do the mempool alloc. + * If not, this is a bio_kmalloc() allocation and just do a + * kzalloc() for the exact number of vecs right away. */ - switch (nr) { - case 1 : *idx = 0; break; - case 2 ... 4: *idx = 1; break; - case 5 ... 16: *idx = 2; break; - case 17 ... 64: *idx = 3; break; - case 65 ... 128: *idx = 4; break; - case 129 ... BIO_MAX_PAGES: *idx = 5; break; + if (bs) { + /* + * see comment near bvec_array define! + */ + switch (nr) { + case 1: + *idx = 0; + break; + case 2 ... 4: + *idx = 1; + break; + case 5 ... 16: + *idx = 2; + break; + case 17 ... 64: + *idx = 3; + break; + case 65 ... 128: + *idx = 4; + break; + case 129 ...
BIO_MAX_PAGES: + *idx = 5; + break; default: return NULL; - } - /* - * idx now points to the pool we want to allocate from - */ + } - bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); - if (bvl) - memset(bvl, 0, bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); + /* + * idx now points to the pool we want to allocate from + */ + bvl = mempool_alloc(bs->bvec_pools[*idx], gfp_mask); + if (bvl) + memset(bvl, 0, + bvec_nr_vecs(*idx) * sizeof(struct bio_vec)); + } else + bvl = kzalloc(nr * sizeof(struct bio_vec), gfp_mask); return bvl; } @@ -107,6 +128,12 @@ static void bio_fs_destructor(struct bio *bio) bio_free(bio, fs_bio_set); } +static void bio_kmalloc_destructor(struct bio *bio) +{ + kfree(bio->bi_io_vec); + kfree(bio); +} + void bio_init(struct bio *bio) { memset(bio, 0, sizeof(*bio)); @@ -119,19 +146,25 @@ void bio_init(struct bio *bio) * bio_alloc_bioset - allocate a bio for I/O * @gfp_mask: the GFP_ mask given to the slab allocator * @nr_iovecs: number of iovecs to pre-allocate - * @bs: the bio_set to allocate from + * @bs: the bio_set to allocate from. If %NULL, just use kmalloc * * Description: - * bio_alloc_bioset will first try it's on mempool to satisfy the allocation. + * bio_alloc_bioset will first try its own mempool to satisfy the allocation. * If %__GFP_WAIT is set then we will block on the internal pool waiting - * for a &struct bio to become free. + * for a &struct bio to become free. If a %NULL @bs is passed in, we will + * fall back to just using @kmalloc to allocate the required memory. * * allocate bio and iovecs from the memory pools specified by the - * bio_set structure. + * bio_set structure, or @kmalloc if none given. 
**/ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) { - struct bio *bio = mempool_alloc(bs->bio_pool, gfp_mask); + struct bio *bio; + + if (bs) + bio = mempool_alloc(bs->bio_pool, gfp_mask); + else + bio = kmalloc(sizeof(*bio), gfp_mask); if (likely(bio)) { struct bio_vec *bvl = NULL; @@ -142,7 +175,10 @@ struct bio *bio_alloc_bioset(gfp_t gfp_mask, int nr_iovecs, struct bio_set *bs) bvl = bvec_alloc_bs(gfp_mask, nr_iovecs, &idx, bs); if (unlikely(!bvl)) { - mempool_free(bio, bs->bio_pool); + if (bs) + mempool_free(bio, bs->bio_pool); + else + kfree(bio); bio = NULL; goto out; } @@ -165,6 +201,23 @@ struct bio *bio_alloc(gfp_t gfp_mask, int nr_iovecs) return bio; } +/* + * Like bio_alloc(), but doesn't use a mempool backing. This means that + * it CAN fail, but while bio_alloc() can only be used for allocations + * that have a short (finite) life span, bio_kmalloc() should be used + * for more permanent bio allocations (like allocating some bio's for + * initalization or setup purposes). + */ +struct bio *bio_kmalloc(gfp_t gfp_mask, int nr_iovecs) +{ + struct bio *bio = bio_alloc_bioset(gfp_mask, nr_iovecs, NULL); + + if (bio) + bio->bi_destructor = bio_kmalloc_destructor; + + return bio; +} + void zero_fill_bio(struct bio *bio) { unsigned long flags; @@ -1349,6 +1402,7 @@ static int __init init_bio(void) subsys_initcall(init_bio); EXPORT_SYMBOL(bio_alloc); +EXPORT_SYMBOL(bio_kmalloc); EXPORT_SYMBOL(bio_put); EXPORT_SYMBOL(bio_free); EXPORT_SYMBOL(bio_endio); -- cgit v1.2.3 From 9c02f2b02e29a2244e36c6e1f246080d8afc6cff Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 18 Sep 2008 09:31:53 -0700 Subject: block: cleanup some of the integrity stuff in blkdev.h Don't put functions that are only used in fs/bio-integrity.c in blkdev.h, it's much cleaner to just keep it in there. 
Also kill completely unused bdev_get_tag_size() Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index c3e174b35fe..ba4ada08564 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -150,6 +150,29 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_integrity_add_page); +static struct blk_integrity *bdev_get_integrity(struct block_device *bdev) +{ + return bdev->bd_disk->integrity; +} + +static int bdev_integrity_enabled(struct block_device *bdev, int rw) +{ + struct blk_integrity *bi = bdev_get_integrity(bdev); + + if (bi == NULL) + return 0; + + if (rw == READ && bi->verify_fn != NULL && + (bi->flags & INTEGRITY_FLAG_READ)) + return 1; + + if (rw == WRITE && bi->generate_fn != NULL && + (bi->flags & INTEGRITY_FLAG_WRITE)) + return 1; + + return 0; +} + /** * bio_integrity_enabled - Check whether integrity can be passed * @bio: bio to check @@ -313,6 +336,14 @@ static void bio_integrity_generate(struct bio *bio) } } +static inline unsigned short blk_integrity_tuple_size(struct blk_integrity *bi) +{ + if (bi) + return bi->tuple_size; + + return 0; +} + /** * bio_integrity_prep - Prepare bio for integrity I/O * @bio: bio to prepare -- cgit v1.2.3 From b04accc425d52ca59699290661e0dfd09b0feeeb Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Thu, 2 Oct 2008 12:53:22 +0200 Subject: block: revert part of d7533ad0e132f92e75c1b2eb7c26387b25a583c1 We need bdev_get_integrity() to support the pending md/dm patches. 
Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index ba4ada08564..6e28dcdd23a 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -150,11 +150,6 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_integrity_add_page); -static struct blk_integrity *bdev_get_integrity(struct block_device *bdev) -{ - return bdev->bd_disk->integrity; -} - static int bdev_integrity_enabled(struct block_device *bdev, int rw) { struct blk_integrity *bi = bdev_get_integrity(bdev); -- cgit v1.2.3 From 74aa8c2cc010035a7eef2b4ca4d6430e0dae206a Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Oct 2008 03:38:37 -0400 Subject: block: Introduce integrity data ownership flag A filesystem might supply its own integrity metadata. Introduce a flag that indicates whether the filesystem or the block layer owns the integrity buffer. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- fs/bio-integrity.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'fs') diff --git a/fs/bio-integrity.c b/fs/bio-integrity.c index 6e28dcdd23a..19caf7c962a 100644 --- a/fs/bio-integrity.c +++ b/fs/bio-integrity.c @@ -107,7 +107,8 @@ void bio_integrity_free(struct bio *bio, struct bio_set *bs) BUG_ON(bip == NULL); /* A cloned bio doesn't own the integrity metadata */ - if (!bio_flagged(bio, BIO_CLONED) && bip->bip_buf != NULL) + if (!bio_flagged(bio, BIO_CLONED) && !bio_flagged(bio, BIO_FS_INTEGRITY) + && bip->bip_buf != NULL) kfree(bip->bip_buf); mempool_free(bip->bip_vec, bs->bvec_pools[bip->bip_pool]); -- cgit v1.2.3 From ad3316bf4eeb53c89164f759767f911072b56203 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 1 Oct 2008 22:42:53 -0400 Subject: block: Find bio sector offset given idx and offset Helper function to find the sector offset in a bio given bvec index and page offset. Signed-off-by: Martin K. 
Petersen Signed-off-by: Jens Axboe --- fs/bio.c | 36 ++++++++++++++++++++++++++++++++++++ 1 file changed, 36 insertions(+) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index e56e7685af9..a5af5809f56 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -1300,6 +1300,42 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) return bp; } +/** + * bio_sector_offset - Find hardware sector offset in bio + * @bio: bio to inspect + * @index: bio_vec index + * @offset: offset in bv_page + * + * Return the number of hardware sectors between beginning of bio + * and an end point indicated by a bio_vec index and an offset + * within that vector's page. + */ +sector_t bio_sector_offset(struct bio *bio, unsigned short index, + unsigned int offset) +{ + unsigned int sector_sz = queue_hardsect_size(bio->bi_bdev->bd_disk->queue); + struct bio_vec *bv; + sector_t sectors; + int i; + + sectors = 0; + + if (index >= bio->bi_idx) + index = bio->bi_vcnt - 1; + + __bio_for_each_segment(bv, bio, i, 0) { + if (i == index) { + if (offset > bv->bv_offset) + sectors += (offset - bv->bv_offset) / sector_sz; + break; + } + + sectors += bv->bv_len / sector_sz; + } + + return sectors; +} +EXPORT_SYMBOL(bio_sector_offset); /* * create memory pools for biovec's in a bio_set. -- cgit v1.2.3 From 6feef531f55cf4a20fd9eb39f5352e5745203603 Mon Sep 17 00:00:00 2001 From: Denis ChengRq Date: Thu, 9 Oct 2008 08:57:05 +0200 Subject: block: mark bio_split_pool static Since all bio_split calls refer the same single bio_split_pool, the bio_split function can use bio_split_pool directly instead of the mempool_t parameter; then the mempool_t parameter can be removed from bio_split param list, and bio_split_pool is only referred in fs/bio.c file, can be marked static. 
Signed-off-by: Denis ChengRq Signed-off-by: Jens Axboe --- fs/bio.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) (limited to 'fs') diff --git a/fs/bio.c b/fs/bio.c index a5af5809f56..77a55bcceed 100644 --- a/fs/bio.c +++ b/fs/bio.c @@ -30,7 +30,7 @@ static struct kmem_cache *bio_slab __read_mostly; -mempool_t *bio_split_pool __read_mostly; +static mempool_t *bio_split_pool __read_mostly; /* * if you change this list, also change bvec_alloc or things will @@ -1256,9 +1256,9 @@ static void bio_pair_end_2(struct bio *bi, int err) * split a bio - only worry about a bio with a single page * in it's iovec */ -struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) +struct bio_pair *bio_split(struct bio *bi, int first_sectors) { - struct bio_pair *bp = mempool_alloc(pool, GFP_NOIO); + struct bio_pair *bp = mempool_alloc(bio_split_pool, GFP_NOIO); if (!bp) return bp; @@ -1292,7 +1292,7 @@ struct bio_pair *bio_split(struct bio *bi, mempool_t *pool, int first_sectors) bp->bio2.bi_end_io = bio_pair_end_2; bp->bio1.bi_private = bi; - bp->bio2.bi_private = pool; + bp->bio2.bi_private = bio_split_pool; if (bio_integrity(bi)) bio_integrity_split(bi, bp, first_sectors); @@ -1455,7 +1455,6 @@ EXPORT_SYMBOL(bio_map_kern); EXPORT_SYMBOL(bio_copy_kern); EXPORT_SYMBOL(bio_pair_release); EXPORT_SYMBOL(bio_split); -EXPORT_SYMBOL(bio_split_pool); EXPORT_SYMBOL(bio_copy_user); EXPORT_SYMBOL(bio_uncopy_user); EXPORT_SYMBOL(bioset_create); -- cgit v1.2.3 From 57d1b5366f46fe434e565b710baf683daff78dd8 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 9 Oct 2008 10:42:38 +0200 Subject: block_dev: fix kernel-doc in new functions Fix kernel-doc in new functions: Error(mmotm-2008-1002-1617//fs/block_dev.c:895): duplicate section name 'Description' Error(mmotm-2008-1002-1617//fs/block_dev.c:924): duplicate section name 'Description' Warning(mmotm-2008-1002-1617//fs/block_dev.c:1282): No description found for parameter 'pathname' Signed-off-by: Randy 
Dunlap cc: Andrew Patterson Signed-off-by: Jens Axboe --- fs/block_dev.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) (limited to 'fs') diff --git a/fs/block_dev.c b/fs/block_dev.c index 57e2786dd2a..d84f0469a01 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -879,9 +879,7 @@ static void flush_disk(struct block_device *bdev) } /** - * check_disk_size_change - checks for disk size change and adjusts - * bdev size. - * + * check_disk_size_change - checks for disk size change and adjusts bdev size. * @disk: struct gendisk to check * @bdev: struct bdev to adjust. * @@ -908,9 +906,7 @@ void check_disk_size_change(struct gendisk *disk, struct block_device *bdev) EXPORT_SYMBOL(check_disk_size_change); /** - * revalidate_disk - wrapper for lower-level driver's revalidate_disk - * call-back - * + * revalidate_disk - wrapper for lower-level driver's revalidate_disk call-back * @disk: struct gendisk to be revalidated * * This routine is a wrapper for lower-level driver's revalidate_disk @@ -1266,10 +1262,9 @@ EXPORT_SYMBOL(ioctl_by_bdev); /** * lookup_bdev - lookup a struct block_device by name + * @pathname: special file representing the block device * - * @path: special file representing the block device - * - * Get a reference to the blockdevice at @path in the current + * Get a reference to the blockdevice at @pathname in the current * namespace if possible and return it. Return ERR_PTR(error) * otherwise. */ -- cgit v1.2.3