about | summary | refs | log | tree | commit | diff
path: root/drivers/block
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/block')
-rw-r--r-- drivers/block/Kconfig 16
-rw-r--r-- drivers/block/Makefile 2
-rw-r--r-- drivers/block/aoe/aoe.h 69
-rw-r--r-- drivers/block/aoe/aoeblk.c 72
-rw-r--r-- drivers/block/aoe/aoechr.c 93
-rw-r--r-- drivers/block/aoe/aoecmd.c 742
-rw-r--r-- drivers/block/aoe/aoedev.c 277
-rw-r--r-- drivers/block/aoe/aoemain.c 2
-rw-r--r-- drivers/block/aoe/aoenet.c 15
-rw-r--r-- drivers/block/brd.c 583
-rw-r--r-- drivers/block/nbd.c 10
-rw-r--r-- drivers/block/rd.c 537
12 files changed, 1465 insertions, 953 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index 64e5148d82b..b6d230b3209 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -322,7 +322,7 @@ config BLK_DEV_UB
If unsure, say N.
config BLK_DEV_RAM
- tristate "RAM disk support"
+ tristate "RAM block device support"
---help---
Saying Y here will allow you to use a portion of your RAM memory as
a block device, so that you can make file systems on it, read and
@@ -357,15 +357,15 @@ config BLK_DEV_RAM_SIZE
The default value is 4096 kilobytes. Only change this if you know
what you are doing.
-config BLK_DEV_RAM_BLOCKSIZE
- int "Default RAM disk block size (bytes)"
+config BLK_DEV_XIP
+ bool "Support XIP filesystems on RAM block device"
depends on BLK_DEV_RAM
- default "1024"
+ default n
help
- The default value is 1024 bytes. PAGE_SIZE is a much more
- efficient choice however. The default is kept to ensure initrd
- setups function - apparently needed by the rd_load_image routine
- that supposes the filesystem in the image uses a 1024 blocksize.
+ Support XIP filesystems (such as ext2 with XIP support on) on
+ top of block ram device. This will slightly enlarge the kernel, and
+ will prevent RAM block device backing store memory from being
+ allocated from highmem (only a problem for highmem systems).
config CDROM_PKTCDVD
tristate "Packet writing on CD/DVD media"
diff --git a/drivers/block/Makefile b/drivers/block/Makefile
index 7691505a2e1..01c972415cb 100644
--- a/drivers/block/Makefile
+++ b/drivers/block/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_AMIGA_FLOPPY) += amiflop.o
obj-$(CONFIG_PS3_DISK) += ps3disk.o
obj-$(CONFIG_ATARI_FLOPPY) += ataflop.o
obj-$(CONFIG_AMIGA_Z2RAM) += z2ram.o
-obj-$(CONFIG_BLK_DEV_RAM) += rd.o
+obj-$(CONFIG_BLK_DEV_RAM) += brd.o
obj-$(CONFIG_BLK_DEV_LOOP) += loop.o
obj-$(CONFIG_BLK_DEV_PS2) += ps2esdi.o
obj-$(CONFIG_BLK_DEV_XD) += xd.o
diff --git a/drivers/block/aoe/aoe.h b/drivers/block/aoe/aoe.h
index 07f02f855ab..280e71ee744 100644
--- a/drivers/block/aoe/aoe.h
+++ b/drivers/block/aoe/aoe.h
@@ -1,5 +1,5 @@
-/* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */
-#define VERSION "32"
+/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
+#define VERSION "47"
#define AOE_MAJOR 152
#define DEVICE_NAME "aoe"
@@ -76,10 +76,8 @@ enum {
DEVFL_EXT = (1<<2), /* device accepts lba48 commands */
DEVFL_CLOSEWAIT = (1<<3), /* device is waiting for all closes to revalidate */
DEVFL_GDALLOC = (1<<4), /* need to alloc gendisk */
- DEVFL_PAUSE = (1<<5),
+ DEVFL_KICKME = (1<<5), /* slow polling network card catch */
DEVFL_NEWSIZE = (1<<6), /* need to update dev size in block layer */
- DEVFL_MAXBCNT = (1<<7), /* d->maxbcnt is not changeable */
- DEVFL_KICKME = (1<<8),
BUFFL_FAIL = 1,
};
@@ -88,17 +86,25 @@ enum {
DEFAULTBCNT = 2 * 512, /* 2 sectors */
NPERSHELF = 16, /* number of slots per shelf address */
FREETAG = -1,
- MIN_BUFS = 8,
+ MIN_BUFS = 16,
+ NTARGETS = 8,
+ NAOEIFS = 8,
+ NSKBPOOLMAX = 128,
+
+ TIMERTICK = HZ / 10,
+ MINTIMER = HZ >> 2,
+ MAXTIMER = HZ << 1,
+ HELPWAIT = 20,
};
struct buf {
struct list_head bufs;
- ulong start_time; /* for disk stats */
+ ulong stime; /* for disk stats */
ulong flags;
ulong nframesout;
- char *bufaddr;
ulong resid;
ulong bv_resid;
+ ulong bv_off;
sector_t sector;
struct bio *bio;
struct bio_vec *bv;
@@ -114,19 +120,38 @@ struct frame {
struct sk_buff *skb;
};
+struct aoeif {
+ struct net_device *nd;
+ unsigned char lost;
+ unsigned char lostjumbo;
+ ushort maxbcnt;
+};
+
+struct aoetgt {
+ unsigned char addr[6];
+ ushort nframes;
+ struct frame *frames;
+ struct aoeif ifs[NAOEIFS];
+ struct aoeif *ifp; /* current aoeif in use */
+ ushort nout;
+ ushort maxout;
+ u16 lasttag; /* last tag sent */
+ u16 useme;
+ ulong lastwadj; /* last window adjustment */
+ int wpkts, rpkts;
+ int dataref;
+};
+
struct aoedev {
struct aoedev *next;
- unsigned char addr[6]; /* remote mac addr */
- ushort flags;
ulong sysminor;
ulong aoemajor;
- ulong aoeminor;
+ u16 aoeminor;
+ u16 flags;
u16 nopen; /* (bd_openers isn't available without sleeping) */
- u16 lasttag; /* last tag sent */
u16 rttavg; /* round trip average of requests/responses */
u16 mintimer;
u16 fw_ver; /* version of blade's firmware */
- u16 maxbcnt;
struct work_struct work;/* disk create work struct */
struct gendisk *gd;
struct request_queue blkq;
@@ -134,15 +159,17 @@ struct aoedev {
sector_t ssize;
struct timer_list timer;
spinlock_t lock;
- struct net_device *ifp; /* interface ed is attached to */
struct sk_buff *sendq_hd; /* packets needing to be sent, list head */
struct sk_buff *sendq_tl;
+ struct sk_buff *skbpool_hd;
+ struct sk_buff *skbpool_tl;
+ int nskbpool;
mempool_t *bufpool; /* for deadlock-free Buf allocation */
struct list_head bufq; /* queue of bios to work on */
struct buf *inprocess; /* the one we're currently working on */
- ushort lostjumbo;
- ushort nframes; /* number of frames below */
- struct frame *frames;
+ struct aoetgt *targets[NTARGETS];
+ struct aoetgt **tgt; /* target in use when working */
+ struct aoetgt **htgt; /* target needing rexmit assistance */
};
@@ -160,14 +187,16 @@ void aoecmd_cfg(ushort aoemajor, unsigned char aoeminor);
void aoecmd_ata_rsp(struct sk_buff *);
void aoecmd_cfg_rsp(struct sk_buff *);
void aoecmd_sleepwork(struct work_struct *);
-struct sk_buff *new_skb(ulong);
+void aoecmd_cleanslate(struct aoedev *);
+struct sk_buff *aoecmd_ata_id(struct aoedev *);
int aoedev_init(void);
void aoedev_exit(void);
struct aoedev *aoedev_by_aoeaddr(int maj, int min);
-struct aoedev *aoedev_by_sysminor_m(ulong sysminor, ulong bufcnt);
+struct aoedev *aoedev_by_sysminor_m(ulong sysminor);
void aoedev_downdev(struct aoedev *d);
int aoedev_isbusy(struct aoedev *d);
+int aoedev_flush(const char __user *str, size_t size);
int aoenet_init(void);
void aoenet_exit(void);
@@ -175,4 +204,4 @@ void aoenet_xmit(struct sk_buff *);
int is_aoe_netif(struct net_device *ifp);
int set_aoe_iflist(const char __user *str, size_t size);
-u64 mac_addr(char addr[6]);
+unsigned long long mac_addr(char addr[6]);
diff --git a/drivers/block/aoe/aoeblk.c b/drivers/block/aoe/aoeblk.c
index 826d12381e2..0c39782b266 100644
--- a/drivers/block/aoe/aoeblk.c
+++ b/drivers/block/aoe/aoeblk.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoeblk.c
* block device routines
@@ -24,7 +24,7 @@ static ssize_t aoedisk_show_state(struct device *dev,
return snprintf(page, PAGE_SIZE,
"%s%s\n",
(d->flags & DEVFL_UP) ? "up" : "down",
- (d->flags & DEVFL_PAUSE) ? ",paused" :
+ (d->flags & DEVFL_KICKME) ? ",kickme" :
(d->nopen && !(d->flags & DEVFL_UP)) ? ",closewait" : "");
/* I'd rather see nopen exported so we can ditch closewait */
}
@@ -33,17 +33,48 @@ static ssize_t aoedisk_show_mac(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
struct aoedev *d = disk->private_data;
+ struct aoetgt *t = d->targets[0];
- return snprintf(page, PAGE_SIZE, "%012llx\n",
- (unsigned long long)mac_addr(d->addr));
+ if (t == NULL)
+ return snprintf(page, PAGE_SIZE, "none\n");
+ return snprintf(page, PAGE_SIZE, "%012llx\n", mac_addr(t->addr));
}
static ssize_t aoedisk_show_netif(struct device *dev,
struct device_attribute *attr, char *page)
{
struct gendisk *disk = dev_to_disk(dev);
struct aoedev *d = disk->private_data;
+ struct net_device *nds[8], **nd, **nnd, **ne;
+ struct aoetgt **t, **te;
+ struct aoeif *ifp, *e;
+ char *p;
+
+ memset(nds, 0, sizeof nds);
+ nd = nds;
+ ne = nd + ARRAY_SIZE(nds);
+ t = d->targets;
+ te = t + NTARGETS;
+ for (; t < te && *t; t++) {
+ ifp = (*t)->ifs;
+ e = ifp + NAOEIFS;
+ for (; ifp < e && ifp->nd; ifp++) {
+ for (nnd = nds; nnd < nd; nnd++)
+ if (*nnd == ifp->nd)
+ break;
+ if (nnd == nd && nd != ne)
+ *nd++ = ifp->nd;
+ }
+ }
- return snprintf(page, PAGE_SIZE, "%s\n", d->ifp->name);
+ ne = nd;
+ nd = nds;
+ if (*nd == NULL)
+ return snprintf(page, PAGE_SIZE, "none\n");
+ for (p = page; nd < ne; nd++)
+ p += snprintf(p, PAGE_SIZE - (p-page), "%s%s",
+ p == page ? "" : ",", (*nd)->name);
+ p += snprintf(p, PAGE_SIZE - (p-page), "\n");
+ return p-page;
}
/* firmware version */
static ssize_t aoedisk_show_fwver(struct device *dev,
@@ -134,7 +165,23 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
blk_queue_bounce(q, &bio);
+ if (bio == NULL) {
+ printk(KERN_ERR "aoe: bio is NULL\n");
+ BUG();
+ return 0;
+ }
d = bio->bi_bdev->bd_disk->private_data;
+ if (d == NULL) {
+ printk(KERN_ERR "aoe: bd_disk->private_data is NULL\n");
+ BUG();
+ bio_endio(bio, -ENXIO);
+ return 0;
+ } else if (bio->bi_io_vec == NULL) {
+ printk(KERN_ERR "aoe: bi_io_vec is NULL\n");
+ BUG();
+ bio_endio(bio, -ENXIO);
+ return 0;
+ }
buf = mempool_alloc(d->bufpool, GFP_NOIO);
if (buf == NULL) {
printk(KERN_INFO "aoe: buf allocation failure\n");
@@ -143,19 +190,19 @@ aoeblk_make_request(struct request_queue *q, struct bio *bio)
}
memset(buf, 0, sizeof(*buf));
INIT_LIST_HEAD(&buf->bufs);
- buf->start_time = jiffies;
+ buf->stime = jiffies;
buf->bio = bio;
buf->resid = bio->bi_size;
buf->sector = bio->bi_sector;
buf->bv = &bio->bi_io_vec[bio->bi_idx];
- WARN_ON(buf->bv->bv_len == 0);
buf->bv_resid = buf->bv->bv_len;
- buf->bufaddr = page_address(buf->bv->bv_page) + buf->bv->bv_offset;
+ WARN_ON(buf->bv_resid == 0);
+ buf->bv_off = buf->bv->bv_offset;
spin_lock_irqsave(&d->lock, flags);
if ((d->flags & DEVFL_UP) == 0) {
- printk(KERN_INFO "aoe: device %ld.%ld is not up\n",
+ printk(KERN_INFO "aoe: device %ld.%d is not up\n",
d->aoemajor, d->aoeminor);
spin_unlock_irqrestore(&d->lock, flags);
mempool_free(buf, d->bufpool);
@@ -208,14 +255,15 @@ aoeblk_gdalloc(void *vp)
gd = alloc_disk(AOE_PARTITIONS);
if (gd == NULL) {
- printk(KERN_ERR "aoe: cannot allocate disk structure for %ld.%ld\n",
+ printk(KERN_ERR
+ "aoe: cannot allocate disk structure for %ld.%d\n",
d->aoemajor, d->aoeminor);
goto err;
}
d->bufpool = mempool_create_slab_pool(MIN_BUFS, buf_pool_cache);
if (d->bufpool == NULL) {
- printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%ld\n",
+ printk(KERN_ERR "aoe: cannot allocate bufpool for %ld.%d\n",
d->aoemajor, d->aoeminor);
goto err_disk;
}
@@ -229,7 +277,7 @@ aoeblk_gdalloc(void *vp)
gd->fops = &aoe_bdops;
gd->private_data = d;
gd->capacity = d->ssize;
- snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%ld",
+ snprintf(gd->disk_name, sizeof gd->disk_name, "etherd/e%ld.%d",
d->aoemajor, d->aoeminor);
gd->queue = &d->blkq;
diff --git a/drivers/block/aoe/aoechr.c b/drivers/block/aoe/aoechr.c
index d5480e34cb2..e8e60e7a2e7 100644
--- a/drivers/block/aoe/aoechr.c
+++ b/drivers/block/aoe/aoechr.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoechr.c
* AoE character device driver
@@ -6,6 +6,7 @@
#include <linux/hdreg.h>
#include <linux/blkdev.h>
+#include <linux/delay.h>
#include "aoe.h"
enum {
@@ -14,6 +15,7 @@ enum {
MINOR_DISCOVER,
MINOR_INTERFACES,
MINOR_REVALIDATE,
+ MINOR_FLUSH,
MSGSZ = 2048,
NMSG = 100, /* message backlog to retain */
};
@@ -42,6 +44,7 @@ static struct aoe_chardev chardevs[] = {
{ MINOR_DISCOVER, "discover" },
{ MINOR_INTERFACES, "interfaces" },
{ MINOR_REVALIDATE, "revalidate" },
+ { MINOR_FLUSH, "flush" },
};
static int
@@ -68,6 +71,7 @@ revalidate(const char __user *str, size_t size)
int major, minor, n;
ulong flags;
struct aoedev *d;
+ struct sk_buff *skb;
char buf[16];
if (size >= sizeof buf)
@@ -85,13 +89,20 @@ revalidate(const char __user *str, size_t size)
d = aoedev_by_aoeaddr(major, minor);
if (!d)
return -EINVAL;
-
spin_lock_irqsave(&d->lock, flags);
- d->flags &= ~DEVFL_MAXBCNT;
- d->flags |= DEVFL_PAUSE;
+ aoecmd_cleanslate(d);
+loop:
+ skb = aoecmd_ata_id(d);
spin_unlock_irqrestore(&d->lock, flags);
+ /* try again if we are able to sleep a bit,
+ * otherwise give up this revalidation
+ */
+ if (!skb && !msleep_interruptible(200)) {
+ spin_lock_irqsave(&d->lock, flags);
+ goto loop;
+ }
+ aoenet_xmit(skb);
aoecmd_cfg(major, minor);
-
return 0;
}
@@ -149,6 +160,9 @@ aoechr_write(struct file *filp, const char __user *buf, size_t cnt, loff_t *offp
break;
case MINOR_REVALIDATE:
ret = revalidate(buf, cnt);
+ break;
+ case MINOR_FLUSH:
+ ret = aoedev_flush(buf, cnt);
}
if (ret == 0)
ret = cnt;
@@ -185,52 +199,51 @@ aoechr_read(struct file *filp, char __user *buf, size_t cnt, loff_t *off)
ulong flags;
n = (unsigned long) filp->private_data;
- switch (n) {
- case MINOR_ERR:
- spin_lock_irqsave(&emsgs_lock, flags);
-loop:
- em = emsgs + emsgs_head_idx;
- if ((em->flags & EMFL_VALID) == 0) {
- if (filp->f_flags & O_NDELAY) {
- spin_unlock_irqrestore(&emsgs_lock, flags);
- return -EAGAIN;
- }
- nblocked_emsgs_readers++;
+ if (n != MINOR_ERR)
+ return -EFAULT;
+
+ spin_lock_irqsave(&emsgs_lock, flags);
+ for (;;) {
+ em = emsgs + emsgs_head_idx;
+ if ((em->flags & EMFL_VALID) != 0)
+ break;
+ if (filp->f_flags & O_NDELAY) {
spin_unlock_irqrestore(&emsgs_lock, flags);
+ return -EAGAIN;
+ }
+ nblocked_emsgs_readers++;
- n = down_interruptible(&emsgs_sema);
+ spin_unlock_irqrestore(&emsgs_lock, flags);
+
+ n = down_interruptible(&emsgs_sema);
- spin_lock_irqsave(&emsgs_lock, flags);
+ spin_lock_irqsave(&emsgs_lock, flags);
- nblocked_emsgs_readers--;
+ nblocked_emsgs_readers--;
- if (n) {
- spin_unlock_irqrestore(&emsgs_lock, flags);
- return -ERESTARTSYS;
- }
- goto loop;
- }
- if (em->len > cnt) {
+ if (n) {
spin_unlock_irqrestore(&emsgs_lock, flags);
- return -EAGAIN;
+ return -ERESTARTSYS;
}
- mp = em->msg;
- len = em->len;
- em->msg = NULL;
- em->flags &= ~EMFL_VALID;
+ }
+ if (em->len > cnt) {
+ spin_unlock_irqrestore(&emsgs_lock, flags);
+ return -EAGAIN;
+ }
+ mp = em->msg;
+ len = em->len;
+ em->msg = NULL;
+ em->flags &= ~EMFL_VALID;
- emsgs_head_idx++;
- emsgs_head_idx %= ARRAY_SIZE(emsgs);
+ emsgs_head_idx++;
+ emsgs_head_idx %= ARRAY_SIZE(emsgs);
- spin_unlock_irqrestore(&emsgs_lock, flags);
+ spin_unlock_irqrestore(&emsgs_lock, flags);
- n = copy_to_user(buf, mp, len);
- kfree(mp);
- return n == 0 ? len : -EFAULT;
- default:
- return -EFAULT;
- }
+ n = copy_to_user(buf, mp, len);
+ kfree(mp);
+ return n == 0 ? len : -EFAULT;
}
static const struct file_operations aoe_fops = {
diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c
index 4d59d505773..44beb17e809 100644
--- a/drivers/block/aoe/aoecmd.c
+++ b/drivers/block/aoe/aoecmd.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoecmd.c
* Filesystem request handling methods
@@ -9,19 +9,21 @@
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/genhd.h>
+#include <linux/moduleparam.h>
#include <net/net_namespace.h>
#include <asm/unaligned.h>
#include "aoe.h"
-#define TIMERTICK (HZ / 10)
-#define MINTIMER (2 * TIMERTICK)
-#define MAXTIMER (HZ << 1)
-
static int aoe_deadsecs = 60 * 3;
module_param(aoe_deadsecs, int, 0644);
MODULE_PARM_DESC(aoe_deadsecs, "After aoe_deadsecs seconds, give up and fail dev.");
-struct sk_buff *
+static int aoe_maxout = 16;
+module_param(aoe_maxout, int, 0644);
+MODULE_PARM_DESC(aoe_maxout,
+ "Only aoe_maxout outstanding packets for every MAC on eX.Y.");
+
+static struct sk_buff *
new_skb(ulong len)
{
struct sk_buff *skb;
@@ -43,12 +45,12 @@ new_skb(ulong len)
}
static struct frame *
-getframe(struct aoedev *d, int tag)
+getframe(struct aoetgt *t, int tag)
{
struct frame *f, *e;
- f = d->frames;
- e = f + d->nframes;
+ f = t->frames;
+ e = f + t->nframes;
for (; f<e; f++)
if (f->tag == tag)
return f;
@@ -61,21 +63,21 @@ getframe(struct aoedev *d, int tag)
* This driver reserves tag -1 to mean "unused frame."
*/
static int
-newtag(struct aoedev *d)
+newtag(struct aoetgt *t)
{
register ulong n;
n = jiffies & 0xffff;
- return n |= (++d->lasttag & 0x7fff) << 16;
+ return n |= (++t->lasttag & 0x7fff) << 16;
}
static int
-aoehdr_atainit(struct aoedev *d, struct aoe_hdr *h)
+aoehdr_atainit(struct aoedev *d, struct aoetgt *t, struct aoe_hdr *h)
{
- u32 host_tag = newtag(d);
+ u32 host_tag = newtag(t);
- memcpy(h->src, d->ifp->dev_addr, sizeof h->src);
- memcpy(h->dst, d->addr, sizeof h->dst);
+ memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
+ memcpy(h->dst, t->addr, sizeof h->dst);
h->type = __constant_cpu_to_be16(ETH_P_AOE);
h->verfl = AOE_HVER;
h->major = cpu_to_be16(d->aoemajor);
@@ -98,42 +100,162 @@ put_lba(struct aoe_atahdr *ah, sector_t lba)
}
static void
-aoecmd_ata_rw(struct aoedev *d, struct frame *f)
+ifrotate(struct aoetgt *t)
+{
+ t->ifp++;
+ if (t->ifp >= &t->ifs[NAOEIFS] || t->ifp->nd == NULL)
+ t->ifp = t->ifs;
+ if (t->ifp->nd == NULL) {
+ printk(KERN_INFO "aoe: no interface to rotate to\n");
+ BUG();
+ }
+}
+
+static void
+skb_pool_put(struct aoedev *d, struct sk_buff *skb)
+{
+ if (!d->skbpool_hd)
+ d->skbpool_hd = skb;
+ else
+ d->skbpool_tl->next = skb;
+ d->skbpool_tl = skb;
+}
+
+static struct sk_buff *
+skb_pool_get(struct aoedev *d)
+{
+ struct sk_buff *skb;
+
+ skb = d->skbpool_hd;
+ if (skb && atomic_read(&skb_shinfo(skb)->dataref) == 1) {
+ d->skbpool_hd = skb->next;
+ skb->next = NULL;
+ return skb;
+ }
+ if (d->nskbpool < NSKBPOOLMAX
+ && (skb = new_skb(ETH_ZLEN))) {
+ d->nskbpool++;
+ return skb;
+ }
+ return NULL;
+}
+
+/* freeframe is where we do our load balancing so it's a little hairy. */
+static struct frame *
+freeframe(struct aoedev *d)
+{
+ struct frame *f, *e, *rf;
+ struct aoetgt **t;
+ struct sk_buff *skb;
+
+ if (d->targets[0] == NULL) { /* shouldn't happen, but I'm paranoid */
+ printk(KERN_ERR "aoe: NULL TARGETS!\n");
+ return NULL;
+ }
+ t = d->tgt;
+ t++;
+ if (t >= &d->targets[NTARGETS] || !*t)
+ t = d->targets;
+ for (;;) {
+ if ((*t)->nout < (*t)->maxout
+ && t != d->htgt
+ && (*t)->ifp->nd) {
+ rf = NULL;
+ f = (*t)->frames;
+ e = f + (*t)->nframes;
+ for (; f < e; f++) {
+ if (f->tag != FREETAG)
+ continue;
+ skb = f->skb;
+ if (!skb
+ && !(f->skb = skb = new_skb(ETH_ZLEN)))
+ continue;
+ if (atomic_read(&skb_shinfo(skb)->dataref)
+ != 1) {
+ if (!rf)
+ rf = f;
+ continue;
+ }
+gotone: skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+ skb_trim(skb, 0);
+ d->tgt = t;
+ ifrotate(*t);
+ return f;
+ }
+ /* Work can be done, but the network layer is
+ holding our precious packets. Try to grab
+ one from the pool. */
+ f = rf;
+ if (f == NULL) { /* more paranoia */
+ printk(KERN_ERR
+ "aoe: freeframe: %s.\n",
+ "unexpected null rf");
+ d->flags |= DEVFL_KICKME;
+ return NULL;
+ }
+ skb = skb_pool_get(d);
+ if (skb) {
+ skb_pool_put(d, f->skb);
+ f->skb = skb;
+ goto gotone;
+ }
+ (*t)->dataref++;
+ if ((*t)->nout == 0)
+ d->flags |= DEVFL_KICKME;
+ }
+ if (t == d->tgt) /* we've looped and found nada */
+ break;
+ t++;
+ if (t >= &d->targets[NTARGETS] || !*t)
+ t = d->targets;
+ }
+ return NULL;
+}
+
+static int
+aoecmd_ata_rw(struct aoedev *d)
{
+ struct frame *f;
struct aoe_hdr *h;
struct aoe_atahdr *ah;
struct buf *buf;
+ struct bio_vec *bv;
+ struct aoetgt *t;
struct sk_buff *skb;
ulong bcnt;
- register sector_t sector;
char writebit, extbit;
writebit = 0x10;
extbit = 0x4;
+ f = freeframe(d);
+ if (f == NULL)
+ return 0;
+ t = *d->tgt;
buf = d->inprocess;
-
- sector = buf->sector;
- bcnt = buf->bv_resid;
- if (bcnt > d->maxbcnt)
- bcnt = d->maxbcnt;
-
+ bv = buf->bv;
+ bcnt = t->ifp->maxbcnt;
+ if (bcnt == 0)
+ bcnt = DEFAULTBCNT;
+ if (bcnt > buf->bv_resid)
+ bcnt = buf->bv_resid;
/* initialize the headers & frame */
skb = f->skb;
h = (struct aoe_hdr *) skb_mac_header(skb);
ah = (struct aoe_atahdr *) (h+1);
skb_put(skb, sizeof *h + sizeof *ah);
memset(h, 0, skb->len);
- f->tag = aoehdr_atainit(d, h);
+ f->tag = aoehdr_atainit(d, t, h);
+ t->nout++;
f->waited = 0;
f->buf = buf;
- f->bufaddr = buf->bufaddr;
+ f->bufaddr = page_address(bv->bv_page) + buf->bv_off;
f->bcnt = bcnt;
- f->lba = sector;
+ f->lba = buf->sector;
/* set up ata header */
ah->scnt = bcnt >> 9;
- put_lba(ah, sector);
+ put_lba(ah, buf->sector);
if (d->flags & DEVFL_EXT) {
ah->aflags |= AOEAFL_EXT;
} else {
@@ -141,14 +263,14 @@ aoecmd_ata_rw(struct aoedev *d, struct frame *f)
ah->lba3 &= 0x0f;
ah->lba3 |= 0xe0; /* LBA bit + obsolete 0xa0 */
}
-
if (bio_data_dir(buf->bio) == WRITE) {
- skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
- offset_in_page(f->bufaddr), bcnt);
+ skb_fill_page_desc(skb, 0, bv->bv_page, buf->bv_off, bcnt);
ah->aflags |= AOEAFL_WRITE;
skb->len += bcnt;
skb->data_len = bcnt;
+ t->wpkts++;
} else {
+ t->rpkts++;
writebit = 0;
}
@@ -156,29 +278,29 @@ aoecmd_ata_rw(struct aoedev *d, struct frame *f)
/* mark all tracking fields and load out */
buf->nframesout += 1;
- buf->bufaddr += bcnt;
+ buf->bv_off += bcnt;
buf->bv_resid -= bcnt;
-/* printk(KERN_DEBUG "aoe: bv_resid=%ld\n", buf->bv_resid); */
buf->resid -= bcnt;
buf->sector += bcnt >> 9;
if (buf->resid == 0) {
d->inprocess = NULL;
} else if (buf->bv_resid == 0) {
- buf->bv++;
- WARN_ON(buf->bv->bv_len == 0);
- buf->bv_resid = buf->bv->bv_len;
- buf->bufaddr = page_address(buf->bv->bv_page) + buf->bv->bv_offset;
+ buf->bv = ++bv;
+ buf->bv_resid = bv->bv_len;
+ WARN_ON(buf->bv_resid == 0);
+ buf->bv_off = bv->bv_offset;
}
- skb->dev = d->ifp;
+ skb->dev = t->ifp->nd;
skb = skb_clone(skb, GFP_ATOMIC);
- if (skb == NULL)
- return;
- if (d->sendq_hd)
- d->sendq_tl->next = skb;
- else
- d->sendq_hd = skb;
- d->sendq_tl = skb;
+ if (skb) {
+ if (d->sendq_hd)
+ d->sendq_tl->next = skb;
+ else
+ d->sendq_hd = skb;
+ d->sendq_tl = skb;
+ }
+ return 1;
}
/* some callers cannot sleep, and they can call this function,
@@ -232,62 +354,8 @@ cont:
return sl;
}
-static struct frame *
-freeframe(struct aoedev *d)
-{
- struct frame *f, *e;
- int n = 0;
-
- f = d->frames;
- e = f + d->nframes;
- for (; f<e; f++) {
- if (f->tag != FREETAG)
- continue;
- if (atomic_read(&skb_shinfo(f->skb)->dataref) == 1) {
- skb_shinfo(f->skb)->nr_frags = f->skb->data_len = 0;
- skb_trim(f->skb, 0);
- return f;
- }
- n++;
- }
- if (n == d->nframes) /* wait for network layer */
- d->flags |= DEVFL_KICKME;
-
- return NULL;
-}
-
-/* enters with d->lock held */
-void
-aoecmd_work(struct aoedev *d)
-{
- struct frame *f;
- struct buf *buf;
-
- if (d->flags & DEVFL_PAUSE) {
- if (!aoedev_isbusy(d))
- d->sendq_hd = aoecmd_cfg_pkts(d->aoemajor,
- d->aoeminor, &d->sendq_tl);
- return;
- }
-
-loop:
- f = freeframe(d);
- if (f == NULL)
- return;
- if (d->inprocess == NULL) {
- if (list_empty(&d->bufq))
- return;
- buf = container_of(d->bufq.next, struct buf, bufs);
- list_del(d->bufq.next);
-/*printk(KERN_DEBUG "aoe: bi_size=%ld\n", buf->bio->bi_size); */
- d->inprocess = buf;
- }
- aoecmd_ata_rw(d, f);
- goto loop;
-}
-
static void
-rexmit(struct aoedev *d, struct frame *f)
+resend(struct aoedev *d, struct aoetgt *t, struct frame *f)
{
struct sk_buff *skb;
struct aoe_hdr *h;
@@ -295,41 +363,46 @@ rexmit(struct aoedev *d, struct frame *f)
char buf[128];
u32 n;
- n = newtag(d);
+ ifrotate(t);
+ n = newtag(t);
+ skb = f->skb;
+ h = (struct aoe_hdr *) skb_mac_header(skb);
+ ah = (struct aoe_atahdr *) (h+1);
snprintf(buf, sizeof buf,
- "%15s e%ld.%ld oldtag=%08x@%08lx newtag=%08x\n",
- "retransmit",
- d->aoemajor, d->aoeminor, f->tag, jiffies, n);
+ "%15s e%ld.%d oldtag=%08x@%08lx newtag=%08x "
+ "s=%012llx d=%012llx nout=%d\n",
+ "retransmit", d->aoemajor, d->aoeminor, f->tag, jiffies, n,
+ mac_addr(h->src),
+ mac_addr(h->dst), t->nout);
aoechr_error(buf);
- skb = f->skb;
- h = (struct aoe_hdr *) skb_mac_header(skb);
- ah = (struct aoe_atahdr *) (h+1);
f->tag = n;
h->tag = cpu_to_be32(n);
- memcpy(h->dst, d->addr, sizeof h->dst);
- memcpy(h->src, d->ifp->dev_addr, sizeof h->src);
-
- n = DEFAULTBCNT / 512;
- if (ah->scnt > n) {
- ah->scnt = n;
+ memcpy(h->dst, t->addr, sizeof h->dst);
+ memcpy(h->src, t->ifp->nd->dev_addr, sizeof h->src);
+
+ switch (ah->cmdstat) {
+ default:
+ break;
+ case WIN_READ:
+ case WIN_READ_EXT:
+ case WIN_WRITE:
+ case WIN_WRITE_EXT:
+ put_lba(ah, f->lba);
+
+ n = f->bcnt;
+ if (n > DEFAULTBCNT)
+ n = DEFAULTBCNT;
+ ah->scnt = n >> 9;
if (ah->aflags & AOEAFL_WRITE) {
skb_fill_page_desc(skb, 0, virt_to_page(f->bufaddr),
- offset_in_page(f->bufaddr), DEFAULTBCNT);
- skb->len = sizeof *h + sizeof *ah + DEFAULTBCNT;
- skb->data_len = DEFAULTBCNT;
- }
- if (++d->lostjumbo > (d->nframes << 1))
- if (d->maxbcnt != DEFAULTBCNT) {
- printk(KERN_INFO "aoe: e%ld.%ld: too many lost jumbo on %s - using 1KB frames.\n",
- d->aoemajor, d->aoeminor, d->ifp->name);
- d->maxbcnt = DEFAULTBCNT;
- d->flags |= DEVFL_MAXBCNT;
+ offset_in_page(f->bufaddr), n);
+ skb->len = sizeof *h + sizeof *ah + n;
+ skb->data_len = n;
}
}
-
- skb->dev = d->ifp;
+ skb->dev = t->ifp->nd;
skb = skb_clone(skb, GFP_ATOMIC);
if (skb == NULL)
return;
@@ -352,10 +425,92 @@ tsince(int tag)
return n;
}
+static struct aoeif *
+getif(struct aoetgt *t, struct net_device *nd)
+{
+ struct aoeif *p, *e;
+
+ p = t->ifs;
+ e = p + NAOEIFS;
+ for (; p < e; p++)
+ if (p->nd == nd)
+ return p;
+ return NULL;
+}
+
+static struct aoeif *
+addif(struct aoetgt *t, struct net_device *nd)
+{
+ struct aoeif *p;
+
+ p = getif(t, NULL);
+ if (!p)
+ return NULL;
+ p->nd = nd;
+ p->maxbcnt = DEFAULTBCNT;
+ p->lost = 0;
+ p->lostjumbo = 0;
+ return p;
+}
+
+static void
+ejectif(struct aoetgt *t, struct aoeif *ifp)
+{
+ struct aoeif *e;
+ ulong n;
+
+ e = t->ifs + NAOEIFS - 1;
+ n = (e - ifp) * sizeof *ifp;
+ memmove(ifp, ifp+1, n);
+ e->nd = NULL;
+}
+
+static int
+sthtith(struct aoedev *d)
+{
+ struct frame *f, *e, *nf;
+ struct sk_buff *skb;
+ struct aoetgt *ht = *d->htgt;
+
+ f = ht->frames;
+ e = f + ht->nframes;
+ for (; f < e; f++) {
+ if (f->tag == FREETAG)
+ continue;
+ nf = freeframe(d);
+ if (!nf)
+ return 0;
+ skb = nf->skb;
+ *nf = *f;
+ f->skb = skb;
+ f->tag = FREETAG;
+ nf->waited = 0;
+ ht->nout--;
+ (*d->tgt)->nout++;
+ resend(d, *d->tgt, nf);
+ }
+ /* he's clean, he's useless. take away his interfaces */
+ memset(ht->ifs, 0, sizeof ht->ifs);
+ d->htgt = NULL;
+ return 1;
+}
+
+static inline unsigned char
+ata_scnt(unsigned char *packet) {
+ struct aoe_hdr *h;
+ struct aoe_atahdr *ah;
+
+ h = (struct aoe_hdr *) packet;
+ ah = (struct aoe_atahdr *) (h+1);
+ return ah->scnt;
+}
+
static void
rexmit_timer(ulong vp)
{
struct aoedev *d;
+ struct aoetgt *t, **tt, **te;
+ struct aoeif *ifp;
struct frame *f, *e;
struct sk_buff *sl;
register long timeout;
@@ -374,31 +529,79 @@ rexmit_timer(ulong vp)
spin_unlock_irqrestore(&d->lock, flags);
return;
}
- f = d->frames;
- e = f + d->nframes;
- for (; f<e; f++) {
- if (f->tag != FREETAG && tsince(f->tag) >= timeout) {
+ tt = d->targets;
+ te = tt + NTARGETS;
+ for (; tt < te && *tt; tt++) {
+ t = *tt;
+ f = t->frames;
+ e = f + t->nframes;
+ for (; f < e; f++) {
+ if (f->tag == FREETAG
+ || tsince(f->tag) < timeout)
+ continue;
n = f->waited += timeout;
n /= HZ;
- if (n > aoe_deadsecs) { /* waited too long for response */
+ if (n > aoe_deadsecs) {
+ /* waited too long. device failure. */
aoedev_downdev(d);
break;
}
- rexmit(d, f);
+
+ if (n > HELPWAIT /* see if another target can help */
+ && (tt != d->targets || d->targets[1]))
+ d->htgt = tt;
+
+ if (t->nout == t->maxout) {
+ if (t->maxout > 1)
+ t->maxout--;
+ t->lastwadj = jiffies;
+ }
+
+ ifp = getif(t, f->skb->dev);
+ if (ifp && ++ifp->lost > (t->nframes << 1)
+ && (ifp != t->ifs || t->ifs[1].nd)) {
+ ejectif(t, ifp);
+ ifp = NULL;
+ }
+
+ if (ata_scnt(skb_mac_header(f->skb)) > DEFAULTBCNT / 512
+ && ifp && ++ifp->lostjumbo > (t->nframes << 1)
+ && ifp->maxbcnt != DEFAULTBCNT) {
+ printk(KERN_INFO
+ "aoe: e%ld.%d: "
+ "too many lost jumbo on "
+ "%s:%012llx - "
+ "falling back to %d frames.\n",
+ d->aoemajor, d->aoeminor,
+ ifp->nd->name, mac_addr(t->addr),
+ DEFAULTBCNT);
+ ifp->maxbcnt = 0;
+ }
+ resend(d, t, f);
+ }
+
+ /* window check */
+ if (t->nout == t->maxout
+ && t->maxout < t->nframes
+ && (jiffies - t->lastwadj)/HZ > 10) {
+ t->maxout++;
+ t->lastwadj = jiffies;
}
}
- if (d->flags & DEVFL_KICKME) {
+
+ if (d->sendq_hd) {
+ n = d->rttavg <<= 1;
+ if (n > MAXTIMER)
+ d->rttavg = MAXTIMER;
+ }
+
+ if (d->flags & DEVFL_KICKME || d->htgt) {
d->flags &= ~DEVFL_KICKME;
aoecmd_work(d);
}
sl = d->sendq_hd;
d->sendq_hd = d->sendq_tl = NULL;
- if (sl) {
- n = d->rttavg <<= 1;
- if (n > MAXTIMER)
- d->rttavg = MAXTIMER;
- }
d->timer.expires = jiffies + TIMERTICK;
add_timer(&d->timer);
@@ -408,6 +611,25 @@ rexmit_timer(ulong vp)
aoenet_xmit(sl);
}
+/* enters with d->lock held */
+void
+aoecmd_work(struct aoedev *d)
+{
+ struct buf *buf;
+loop:
+ if (d->htgt && !sthtith(d))
+ return;
+ if (d->inprocess == NULL) {
+ if (list_empty(&d->bufq))
+ return;
+ buf = container_of(d->bufq.next, struct buf, bufs);
+ list_del(d->bufq.next);
+ d->inprocess = buf;
+ }
+ if (aoecmd_ata_rw(d))
+ goto loop;
+}
+
/* this function performs work that has been deferred until sleeping is OK
*/
void
@@ -440,7 +662,7 @@ aoecmd_sleepwork(struct work_struct *work)
}
static void
-ataid_complete(struct aoedev *d, unsigned char *id)
+ataid_complete(struct aoedev *d, struct aoetgt *t, unsigned char *id)
{
u64 ssize;
u16 n;
@@ -475,24 +697,20 @@ ataid_complete(struct aoedev *d, unsigned char *id)
}
if (d->ssize != ssize)
- printk(KERN_INFO "aoe: %012llx e%lu.%lu v%04x has %llu sectors\n",
- (unsigned long long)mac_addr(d->addr),
+ printk(KERN_INFO
+ "aoe: %012llx e%ld.%d v%04x has %llu sectors\n",
+ mac_addr(t->addr),
d->aoemajor, d->aoeminor,
d->fw_ver, (long long)ssize);
d->ssize = ssize;
d->geo.start = 0;
+ if (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
+ return;
if (d->gd != NULL) {
d->gd->capacity = ssize;
d->flags |= DEVFL_NEWSIZE;
- } else {
- if (d->flags & DEVFL_GDALLOC) {
- printk(KERN_ERR "aoe: can't schedule work for e%lu.%lu, %s\n",
- d->aoemajor, d->aoeminor,
- "it's already on! This shouldn't happen.\n");
- return;
- }
+ } else
d->flags |= DEVFL_GDALLOC;
- }
schedule_work(&d->work);
}
@@ -519,6 +737,31 @@ calc_rttavg(struct aoedev *d, int rtt)
d->rttavg += n >> 2;
}
+static struct aoetgt *
+gettgt(struct aoedev *d, char *addr)
+{
+ struct aoetgt **t, **e;
+
+ t = d->targets;
+ e = t + NTARGETS;
+ for (; t < e && *t; t++)
+ if (memcmp((*t)->addr, addr, sizeof((*t)->addr)) == 0)
+ return *t;
+ return NULL;
+}
+
+static inline void
+diskstats(struct gendisk *disk, struct bio *bio, ulong duration)
+{
+ unsigned long n_sect = bio->bi_size >> 9;
+ const int rw = bio_data_dir(bio);
+
+ disk_stat_inc(disk, ios[rw]);
+ disk_stat_add(disk, ticks[rw], duration);
+ disk_stat_add(disk, sectors[rw], n_sect);
+ disk_stat_add(disk, io_ticks, duration);
+}
+
void
aoecmd_ata_rsp(struct sk_buff *skb)
{
@@ -528,6 +771,8 @@ aoecmd_ata_rsp(struct sk_buff *skb)
struct frame *f;
struct buf *buf;
struct sk_buff *sl;
+ struct aoetgt *t;
+ struct aoeif *ifp;
register long n;
ulong flags;
char ebuf[128];
@@ -547,7 +792,14 @@ aoecmd_ata_rsp(struct sk_buff *skb)
spin_lock_irqsave(&d->lock, flags);
n = be32_to_cpu(get_unaligned(&hin->tag));
- f = getframe(d, n);
+ t = gettgt(d, hin->src);
+ if (t == NULL) {
+ printk(KERN_INFO "aoe: can't find target e%ld.%d:%012llx\n",
+ d->aoemajor, d->aoeminor, mac_addr(hin->src));
+ spin_unlock_irqrestore(&d->lock, flags);
+ return;
+ }
+ f = getframe(t, n);
if (f == NULL) {
calc_rttavg(d, -tsince(n));
spin_unlock_irqrestore(&d->lock, flags);
@@ -569,24 +821,24 @@ aoecmd_ata_rsp(struct sk_buff *skb)
ahout = (struct aoe_atahdr *) (hout+1);
buf = f->buf;
- if (ahout->cmdstat == WIN_IDENTIFY)
- d->flags &= ~DEVFL_PAUSE;
if (ahin->cmdstat & 0xa9) { /* these bits cleared on success */
printk(KERN_ERR
- "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%ld\n",
+ "aoe: ata error cmd=%2.2Xh stat=%2.2Xh from e%ld.%d\n",
ahout->cmdstat, ahin->cmdstat,
d->aoemajor, d->aoeminor);
if (buf)
buf->flags |= BUFFL_FAIL;
} else {
+ if (d->htgt && t == *d->htgt) /* I'll help myself, thank you. */
+ d->htgt = NULL;
n = ahout->scnt << 9;
switch (ahout->cmdstat) {
case WIN_READ:
case WIN_READ_EXT:
if (skb->len - sizeof *hin - sizeof *ahin < n) {
printk(KERN_ERR
- "aoe: runt data size in read. skb->len=%d\n",
- skb->len);
+ "aoe: %s. skb->len=%d need=%ld\n",
+ "runt data size in read", skb->len, n);
/* fail frame f? just returning will rexmit. */
spin_unlock_irqrestore(&d->lock, flags);
return;
@@ -594,32 +846,18 @@ aoecmd_ata_rsp(struct sk_buff *skb)
memcpy(f->bufaddr, ahin+1, n);
case WIN_WRITE:
case WIN_WRITE_EXT:
+ ifp = getif(t, skb->dev);
+ if (ifp) {
+ ifp->lost = 0;
+ if (n > DEFAULTBCNT)
+ ifp->lostjumbo = 0;
+ }
if (f->bcnt -= n) {
- skb = f->skb;
+ f->lba += n >> 9;
f->bufaddr += n;
- put_lba(ahout, f->lba += ahout->scnt);
- n = f->bcnt;
- if (n > DEFAULTBCNT)
- n = DEFAULTBCNT;
- ahout->scnt = n >> 9;
- if (ahout->aflags & AOEAFL_WRITE) {
- skb_fill_page_desc(skb, 0,
- virt_to_page(f->bufaddr),
- offset_in_page(f->bufaddr), n);
- skb->len = sizeof *hout + sizeof *ahout + n;
- skb->data_len = n;
- }
- f->tag = newtag(d);
- hout->tag = cpu_to_be32(f->tag);
- skb->dev = d->ifp;
- skb = skb_clone(skb, GFP_ATOMIC);
- spin_unlock_irqrestore(&d->lock, flags);
- if (skb)
- aoenet_xmit(skb);
- return;
+ resend(d, t, f);
+ goto xmit;
}
- if (n > DEFAULTBCNT)
- d->lostjumbo = 0;
break;
case WIN_IDENTIFY:
if (skb->len - sizeof *hin - sizeof *ahin < 512) {
@@ -629,7 +867,7 @@ aoecmd_ata_rsp(struct sk_buff *skb)
spin_unlock_irqrestore(&d->lock, flags);
return;
}
- ataid_complete(d, (char *) (ahin+1));
+ ataid_complete(d, t, (char *) (ahin+1));
break;
default:
printk(KERN_INFO
@@ -640,28 +878,19 @@ aoecmd_ata_rsp(struct sk_buff *skb)
}
}
- if (buf) {
- buf->nframesout -= 1;
- if (buf->nframesout == 0 && buf->resid == 0) {
- unsigned long duration = jiffies - buf->start_time;
- unsigned long n_sect = buf->bio->bi_size >> 9;
- struct gendisk *disk = d->gd;
- const int rw = bio_data_dir(buf->bio);
-
- disk_stat_inc(disk, ios[rw]);
- disk_stat_add(disk, ticks[rw], duration);
- disk_stat_add(disk, sectors[rw], n_sect);
- disk_stat_add(disk, io_ticks, duration);
- n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
- bio_endio(buf->bio, n);
- mempool_free(buf, d->bufpool);
- }
+ if (buf && --buf->nframesout == 0 && buf->resid == 0) {
+ diskstats(d->gd, buf->bio, jiffies - buf->stime);
+ n = (buf->flags & BUFFL_FAIL) ? -EIO : 0;
+ bio_endio(buf->bio, n);
+ mempool_free(buf, d->bufpool);
}
f->buf = NULL;
f->tag = FREETAG;
+ t->nout--;
aoecmd_work(d);
+xmit:
sl = d->sendq_hd;
d->sendq_hd = d->sendq_tl = NULL;
@@ -679,23 +908,20 @@ aoecmd_cfg(ushort aoemajor, unsigned char aoeminor)
aoenet_xmit(sl);
}
-/*
- * Since we only call this in one place (and it only prepares one frame)
- * we just return the skb. Usually we'd chain it up to the aoedev sendq.
- */
-static struct sk_buff *
+struct sk_buff *
aoecmd_ata_id(struct aoedev *d)
{
struct aoe_hdr *h;
struct aoe_atahdr *ah;
struct frame *f;
struct sk_buff *skb;
+ struct aoetgt *t;
f = freeframe(d);
- if (f == NULL) {
- printk(KERN_ERR "aoe: can't get a frame. This shouldn't happen.\n");
+ if (f == NULL)
return NULL;
- }
+
+ t = *d->tgt;
/* initialize the headers & frame */
skb = f->skb;
@@ -703,7 +929,8 @@ aoecmd_ata_id(struct aoedev *d)
ah = (struct aoe_atahdr *) (h+1);
skb_put(skb, sizeof *h + sizeof *ah);
memset(h, 0, skb->len);
- f->tag = aoehdr_atainit(d, h);
+ f->tag = aoehdr_atainit(d, t, h);
+ t->nout++;
f->waited = 0;
/* set up ata header */
@@ -711,7 +938,7 @@ aoecmd_ata_id(struct aoedev *d)
ah->cmdstat = WIN_IDENTIFY;
ah->lba3 = 0xa0;
- skb->dev = d->ifp;
+ skb->dev = t->ifp->nd;
d->rttavg = MAXTIMER;
d->timer.function = rexmit_timer;
@@ -719,15 +946,52 @@ aoecmd_ata_id(struct aoedev *d)
return skb_clone(skb, GFP_ATOMIC);
}
+/*
+ * Create a target with address addr and nframes frames and store it in
+ * the first unused slot of d->targets.  Every frame starts out tagged
+ * FREETAG.  Returns the new target, or NULL when all slots are taken or
+ * an allocation fails.  Allocations are made with GFP_ATOMIC.
+ */
+static struct aoetgt *
+addtgt(struct aoedev *d, char *addr, ulong nframes)
+{
+	struct aoetgt **slot = d->targets;
+	struct aoetgt **end = slot + NTARGETS;
+	struct aoetgt *tgt;
+	struct frame *frames;
+	ulong i;
+
+	/* skip over the slots that are already in use */
+	while (slot < end && *slot)
+		slot++;
+	if (slot == end) {
+		printk(KERN_INFO
+			"aoe: device addtgt failure; too many targets\n");
+		return NULL;
+	}
+
+	tgt = kcalloc(1, sizeof *tgt, GFP_ATOMIC);
+	frames = kcalloc(nframes, sizeof *frames, GFP_ATOMIC);
+	if (tgt == NULL || frames == NULL) {
+		kfree(frames);
+		kfree(tgt);
+		printk(KERN_INFO "aoe: cannot allocate memory to add target\n");
+		return NULL;
+	}
+
+	for (i = 0; i < nframes; i++)
+		frames[i].tag = FREETAG;
+
+	tgt->nframes = nframes;
+	tgt->frames = frames;
+	memcpy(tgt->addr, addr, sizeof tgt->addr);
+	tgt->ifp = tgt->ifs;
+	tgt->maxout = tgt->nframes;
+
+	*slot = tgt;
+	return tgt;
+}
+
void
aoecmd_cfg_rsp(struct sk_buff *skb)
{
struct aoedev *d;
struct aoe_hdr *h;
struct aoe_cfghdr *ch;
+ struct aoetgt *t;
+ struct aoeif *ifp;
ulong flags, sysminor, aoemajor;
struct sk_buff *sl;
- enum { MAXFRAMES = 16 };
u16 n;
h = (struct aoe_hdr *) skb_mac_header(skb);
@@ -752,10 +1016,10 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
}
n = be16_to_cpu(ch->bufcnt);
- if (n > MAXFRAMES) /* keep it reasonable */
- n = MAXFRAMES;
+ if (n > aoe_maxout) /* keep it reasonable */
+ n = aoe_maxout;
- d = aoedev_by_sysminor_m(sysminor, n);
+ d = aoedev_by_sysminor_m(sysminor);
if (d == NULL) {
printk(KERN_INFO "aoe: device sysminor_m failure\n");
return;
@@ -763,38 +1027,74 @@ aoecmd_cfg_rsp(struct sk_buff *skb)
spin_lock_irqsave(&d->lock, flags);
- /* permit device to migrate mac and network interface */
- d->ifp = skb->dev;
- memcpy(d->addr, h->src, sizeof d->addr);
- if (!(d->flags & DEVFL_MAXBCNT)) {
- n = d->ifp->mtu;
+ t = gettgt(d, h->src);
+ if (!t) {
+ t = addtgt(d, h->src, n);
+ if (!t) {
+ spin_unlock_irqrestore(&d->lock, flags);
+ return;
+ }
+ }
+ ifp = getif(t, skb->dev);
+ if (!ifp) {
+ ifp = addif(t, skb->dev);
+ if (!ifp) {
+ printk(KERN_INFO
+ "aoe: device addif failure; "
+ "too many interfaces?\n");
+ spin_unlock_irqrestore(&d->lock, flags);
+ return;
+ }
+ }
+ if (ifp->maxbcnt) {
+ n = ifp->nd->mtu;
n -= sizeof (struct aoe_hdr) + sizeof (struct aoe_atahdr);
n /= 512;
if (n > ch->scnt)
n = ch->scnt;
n = n ? n * 512 : DEFAULTBCNT;
- if (n != d->maxbcnt) {
+ if (n != ifp->maxbcnt) {
printk(KERN_INFO
- "aoe: e%ld.%ld: setting %d byte data frames on %s\n",
- d->aoemajor, d->aoeminor, n, d->ifp->name);
- d->maxbcnt = n;
+ "aoe: e%ld.%d: setting %d%s%s:%012llx\n",
+ d->aoemajor, d->aoeminor, n,
+ " byte data frames on ", ifp->nd->name,
+ mac_addr(t->addr));
+ ifp->maxbcnt = n;
}
}
/* don't change users' perspective */
- if (d->nopen && !(d->flags & DEVFL_PAUSE)) {
+ if (d->nopen) {
spin_unlock_irqrestore(&d->lock, flags);
return;
}
- d->flags |= DEVFL_PAUSE; /* force pause */
- d->mintimer = MINTIMER;
d->fw_ver = be16_to_cpu(ch->fwver);
- /* check for already outstanding ataid */
- sl = aoedev_isbusy(d) == 0 ? aoecmd_ata_id(d) : NULL;
+ sl = aoecmd_ata_id(d);
spin_unlock_irqrestore(&d->lock, flags);
aoenet_xmit(sl);
}
+/*
+ * Reset the device's adaptive state: mintimer goes back to MINTIMER,
+ * and for every target in use maxout is restored to nframes and each
+ * interface slot gets its loss counters cleared and maxbcnt set to
+ * DEFAULTBCNT.
+ */
+void
+aoecmd_cleanslate(struct aoedev *d)
+{
+	struct aoetgt **slot, **end;
+
+	d->mintimer = MINTIMER;
+
+	end = d->targets + NTARGETS;
+	for (slot = d->targets; slot < end && *slot; slot++) {
+		struct aoetgt *tgt = *slot;
+		struct aoeif *ifp = tgt->ifs;
+		struct aoeif *last = ifp + NAOEIFS;
+
+		tgt->maxout = tgt->nframes;
+		while (ifp < last) {
+			ifp->lostjumbo = 0;
+			ifp->lost = 0;
+			ifp->maxbcnt = DEFAULTBCNT;
+			ifp++;
+		}
+	}
+}
diff --git a/drivers/block/aoe/aoedev.c b/drivers/block/aoe/aoedev.c
index 51f50710e5f..f9a1cd9edb7 100644
--- a/drivers/block/aoe/aoedev.c
+++ b/drivers/block/aoe/aoedev.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoedev.c
* AoE device utility functions; maintains device list.
@@ -7,23 +7,32 @@
#include <linux/hdreg.h>
#include <linux/blkdev.h>
#include <linux/netdevice.h>
+#include <linux/delay.h>
#include "aoe.h"
+static void dummy_timer(ulong);
+static void aoedev_freedev(struct aoedev *);
+static void freetgt(struct aoedev *d, struct aoetgt *t);
+static void skbpoolfree(struct aoedev *d);
+
static struct aoedev *devlist;
-static spinlock_t devlist_lock;
+static DEFINE_SPINLOCK(devlist_lock);
int
aoedev_isbusy(struct aoedev *d)
{
+ struct aoetgt **t, **te;
struct frame *f, *e;
- f = d->frames;
- e = f + d->nframes;
- do {
- if (f->tag != FREETAG)
- return 1;
- } while (++f < e);
-
+ t = d->targets;
+ te = t + NTARGETS;
+ for (; t < te && *t; t++) {
+ f = (*t)->frames;
+ e = f + (*t)->nframes;
+ for (; f < e; f++)
+ if (f->tag != FREETAG)
+ return 1;
+ }
return 0;
}
@@ -55,75 +64,41 @@ dummy_timer(ulong vp)
add_timer(&d->timer);
}
-/* called with devlist lock held */
-static struct aoedev *
-aoedev_newdev(ulong nframes)
-{
- struct aoedev *d;
- struct frame *f, *e;
-
- d = kzalloc(sizeof *d, GFP_ATOMIC);
- f = kcalloc(nframes, sizeof *f, GFP_ATOMIC);
- switch (!d || !f) {
- case 0:
- d->nframes = nframes;
- d->frames = f;
- e = f + nframes;
- for (; f<e; f++) {
- f->tag = FREETAG;
- f->skb = new_skb(ETH_ZLEN);
- if (!f->skb)
- break;
- }
- if (f == e)
- break;
- while (f > d->frames) {
- f--;
- dev_kfree_skb(f->skb);
- }
- default:
- if (f)
- kfree(f);
- if (d)
- kfree(d);
- return NULL;
- }
- INIT_WORK(&d->work, aoecmd_sleepwork);
- spin_lock_init(&d->lock);
- init_timer(&d->timer);
- d->timer.data = (ulong) d;
- d->timer.function = dummy_timer;
- d->timer.expires = jiffies + HZ;
- add_timer(&d->timer);
- d->bufpool = NULL; /* defer to aoeblk_gdalloc */
- INIT_LIST_HEAD(&d->bufq);
- d->next = devlist;
- devlist = d;
-
- return d;
-}
-
void
aoedev_downdev(struct aoedev *d)
{
+ struct aoetgt **t, **te;
struct frame *f, *e;
struct buf *buf;
struct bio *bio;
- f = d->frames;
- e = f + d->nframes;
- for (; f<e; f->tag = FREETAG, f->buf = NULL, f++) {
- if (f->tag == FREETAG || f->buf == NULL)
- continue;
- buf = f->buf;
- bio = buf->bio;
- if (--buf->nframesout == 0) {
- mempool_free(buf, d->bufpool);
- bio_endio(bio, -EIO);
+ t = d->targets;
+ te = t + NTARGETS;
+ for (; t < te && *t; t++) {
+ f = (*t)->frames;
+ e = f + (*t)->nframes;
+ for (; f < e; f->tag = FREETAG, f->buf = NULL, f++) {
+ if (f->tag == FREETAG || f->buf == NULL)
+ continue;
+ buf = f->buf;
+ bio = buf->bio;
+ if (--buf->nframesout == 0
+ && buf != d->inprocess) {
+ mempool_free(buf, d->bufpool);
+ bio_endio(bio, -EIO);
+ }
}
- skb_shinfo(f->skb)->nr_frags = f->skb->data_len = 0;
+ (*t)->maxout = (*t)->nframes;
+ (*t)->nout = 0;
+ }
+ buf = d->inprocess;
+ if (buf) {
+ bio = buf->bio;
+ mempool_free(buf, d->bufpool);
+ bio_endio(bio, -EIO);
}
d->inprocess = NULL;
+ d->htgt = NULL;
while (!list_empty(&d->bufq)) {
buf = container_of(d->bufq.next, struct buf, bufs);
@@ -136,12 +111,114 @@ aoedev_downdev(struct aoedev *d)
if (d->gd)
d->gd->capacity = 0;
- d->flags &= ~(DEVFL_UP | DEVFL_PAUSE);
+ d->flags &= ~DEVFL_UP;
+}
+
+/*
+ * Tear down a device: remove and release its gendisk (if any), free
+ * every target in use, destroy the buffer pool (if any), free the skb
+ * pool, and finally free the device structure itself.
+ */
+static void
+aoedev_freedev(struct aoedev *d)
+{
+	struct aoetgt **slot = d->targets;
+	struct aoetgt **end = slot + NTARGETS;
+
+	if (d->gd != NULL) {
+		aoedisk_rm_sysfs(d);
+		del_gendisk(d->gd);
+		put_disk(d->gd);
+	}
+
+	while (slot < end && *slot) {
+		freetgt(d, *slot);
+		slot++;
+	}
+
+	if (d->bufpool != NULL)
+		mempool_destroy(d->bufpool);
+	skbpoolfree(d);
+	kfree(d);
+}
+
+/*
+ * Remove devices from the device list and free them.
+ *
+ * str/cnt is a user-space buffer; when it holds at least three bytes
+ * and starts with "all", devices that are up are flushed as well.
+ * Devices that are open, or that have DEVFL_GDALLOC or DEVFL_NEWSIZE
+ * set, are always left alone.
+ *
+ * Victims are unlinked and downed under the list and device locks,
+ * collected on a private list, and freed only after the locks have
+ * been dropped because freeing must be able to sleep.
+ *
+ * Returns 0, or -EFAULT if the user buffer cannot be read.
+ */
+int
+aoedev_flush(const char __user *str, size_t cnt)
+{
+	char buf[16];
+	struct aoedev *d, **link;
+	struct aoedev *doomed = NULL;
+	ulong flags;
+	int all = 0;
+	int keep;
+
+	if (cnt >= 3) {
+		if (cnt > sizeof buf)
+			cnt = sizeof buf;
+		if (copy_from_user(buf, str, cnt))
+			return -EFAULT;
+		all = !strncmp(buf, "all", 3);
+	}
+
+	flush_scheduled_work();
+	spin_lock_irqsave(&devlist_lock, flags);
+	for (link = &devlist; (d = *link) != NULL; ) {
+		spin_lock(&d->lock);
+		keep = (!all && (d->flags & DEVFL_UP))
+			|| (d->flags & (DEVFL_GDALLOC|DEVFL_NEWSIZE))
+			|| d->nopen;
+		if (keep) {
+			spin_unlock(&d->lock);
+			link = &d->next;
+			continue;
+		}
+		/* unlink from devlist, then push onto the private list */
+		*link = d->next;
+		aoedev_downdev(d);
+		d->flags |= DEVFL_TKILL;
+		spin_unlock(&d->lock);
+		d->next = doomed;
+		doomed = d;
+	}
+	spin_unlock_irqrestore(&devlist_lock, flags);
+
+	while (doomed != NULL) {
+		d = doomed;
+		doomed = d->next;
+		del_timer_sync(&d->timer);
+		aoedev_freedev(d);	/* must be able to sleep */
+	}
+	return 0;
+}
+
+/*
+ * Free one of the driver's skbs once nobody else references its data.
+ *
+ * The shared-info dataref is polled every Sms milliseconds, for at most
+ * Tms milliseconds in total, until it reads 1.  May sleep (msleep).
+ * A NULL skb is ignored.
+ *
+ * I'm not really sure that this is a realistic problem, but if the
+ * network driver goes gonzo let's just leak memory after complaining.
+ */
+static void
+skbfree(struct sk_buff *skb)
+{
+	enum { Sms = 100, Tms = 3*1000};
+	int i = Tms / Sms;
+
+	if (skb == NULL)
+		return;
+	while (atomic_read(&skb_shinfo(skb)->dataref) != 1 && i-- > 0)
+		msleep(Sms);
+	/*
+	 * i only drops below zero when every poll found the data still
+	 * referenced.  i == 0 means the reference went away during the
+	 * final sleep, so the skb can be freed and must not be leaked.
+	 */
+	if (i < 0) {
+		printk(KERN_ERR
+			"aoe: %s holds ref: %s\n",
+			skb->dev ? skb->dev->name : "netif",
+			"cannot free skb -- memory leaked.");
+		return;
+	}
+	skb_shinfo(skb)->nr_frags = skb->data_len = 0;
+	skb_trim(skb, 0);
+	dev_kfree_skb(skb);
+}
+
+/*
+ * Empty the device's skb pool: detach each skb from the singly linked
+ * list starting at skbpool_hd and hand it to skbfree(), then clear the
+ * tail pointer.
+ */
+static void
+skbpoolfree(struct aoedev *d)
+{
+	struct sk_buff *skb;
+
+	for (;;) {
+		skb = d->skbpool_hd;
+		if (skb == NULL)
+			break;
+		d->skbpool_hd = skb->next;
+		skb->next = NULL;
+		skbfree(skb);
+	}
+	d->skbpool_tl = NULL;
+}
/* find it or malloc it */
struct aoedev *
-aoedev_by_sysminor_m(ulong sysminor, ulong bufcnt)
+aoedev_by_sysminor_m(ulong sysminor)
{
struct aoedev *d;
ulong flags;
@@ -151,43 +228,43 @@ aoedev_by_sysminor_m(ulong sysminor, ulong bufcnt)
for (d=devlist; d; d=d->next)
if (d->sysminor == sysminor)
break;
-
- if (d == NULL) {
- d = aoedev_newdev(bufcnt);
- if (d == NULL) {
- spin_unlock_irqrestore(&devlist_lock, flags);
- printk(KERN_INFO "aoe: aoedev_newdev failure.\n");
- return NULL;
- }
- d->sysminor = sysminor;
- d->aoemajor = AOEMAJOR(sysminor);
- d->aoeminor = AOEMINOR(sysminor);
- }
-
+ if (d)
+ goto out;
+ d = kcalloc(1, sizeof *d, GFP_ATOMIC);
+ if (!d)
+ goto out;
+ INIT_WORK(&d->work, aoecmd_sleepwork);
+ spin_lock_init(&d->lock);
+ init_timer(&d->timer);
+ d->timer.data = (ulong) d;
+ d->timer.function = dummy_timer;
+ d->timer.expires = jiffies + HZ;
+ add_timer(&d->timer);
+ d->bufpool = NULL; /* defer to aoeblk_gdalloc */
+ d->tgt = d->targets;
+ INIT_LIST_HEAD(&d->bufq);
+ d->sysminor = sysminor;
+ d->aoemajor = AOEMAJOR(sysminor);
+ d->aoeminor = AOEMINOR(sysminor);
+ d->mintimer = MINTIMER;
+ d->next = devlist;
+ devlist = d;
+ out:
spin_unlock_irqrestore(&devlist_lock, flags);
return d;
}
static void
-aoedev_freedev(struct aoedev *d)
+freetgt(struct aoedev *d, struct aoetgt *t)
{
struct frame *f, *e;
- if (d->gd) {
- aoedisk_rm_sysfs(d);
- del_gendisk(d->gd);
- put_disk(d->gd);
- }
- f = d->frames;
- e = f + d->nframes;
- for (; f<e; f++) {
- skb_shinfo(f->skb)->nr_frags = 0;
- dev_kfree_skb(f->skb);
- }
- kfree(d->frames);
- if (d->bufpool)
- mempool_destroy(d->bufpool);
- kfree(d);
+ f = t->frames;
+ e = f + t->nframes;
+ for (; f < e; f++)
+ skbfree(f->skb);
+ kfree(t->frames);
+ kfree(t);
}
void
@@ -214,7 +291,5 @@ aoedev_exit(void)
int __init
aoedev_init(void)
{
- spin_lock_init(&devlist_lock);
return 0;
}
-
diff --git a/drivers/block/aoe/aoemain.c b/drivers/block/aoe/aoemain.c
index a04b7d61329..7b15a5e9cec 100644
--- a/drivers/block/aoe/aoemain.c
+++ b/drivers/block/aoe/aoemain.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoemain.c
* Module initialization routines, discover timer
diff --git a/drivers/block/aoe/aoenet.c b/drivers/block/aoe/aoenet.c
index 4e6deb7f5c2..8460ef736d5 100644
--- a/drivers/block/aoe/aoenet.c
+++ b/drivers/block/aoe/aoenet.c
@@ -1,4 +1,4 @@
-/* Copyright (c) 2006 Coraid, Inc. See COPYING for GPL terms. */
+/* Copyright (c) 2007 Coraid, Inc. See COPYING for GPL terms. */
/*
* aoenet.c
* Ethernet portion of AoE driver
@@ -83,7 +83,7 @@ set_aoe_iflist(const char __user *user_str, size_t size)
return 0;
}
-u64
+unsigned long long
mac_addr(char addr[6])
{
__be64 n = 0;
@@ -91,7 +91,7 @@ mac_addr(char addr[6])
memcpy(p + 2, addr, 6); /* (sizeof addr != 6) */
- return __be64_to_cpu(n);
+ return (unsigned long long) __be64_to_cpu(n);
}
void
@@ -137,9 +137,12 @@ aoenet_rcv(struct sk_buff *skb, struct net_device *ifp, struct packet_type *pt,
if (n > NECODES)
n = 0;
if (net_ratelimit())
- printk(KERN_ERR "aoe: error packet from %d.%d; ecode=%d '%s'\n",
- be16_to_cpu(get_unaligned(&h->major)), h->minor,
- h->err, aoe_errlist[n]);
+ printk(KERN_ERR
+ "%s%d.%d@%s; ecode=%d '%s'\n",
+ "aoe: error packet from ",
+ be16_to_cpu(get_unaligned(&h->major)),
+ h->minor, skb->dev->name,
+ h->err, aoe_errlist[n]);
goto exit;
}
diff --git a/drivers/block/brd.c b/drivers/block/brd.c
new file mode 100644
index 00000000000..85364804364
--- /dev/null
+++ b/drivers/block/brd.c
@@ -0,0 +1,583 @@
+/*
+ * Ram backed block device driver.
+ *
+ * Copyright (C) 2007 Nick Piggin
+ * Copyright (C) 2007 Novell Inc.
+ *
+ * Parts derived from drivers/block/rd.c, and drivers/block/loop.c, copyright
+ * of their respective owners.
+ */
+
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/moduleparam.h>
+#include <linux/major.h>
+#include <linux/blkdev.h>
+#include <linux/bio.h>
+#include <linux/highmem.h>
+#include <linux/gfp.h>
+#include <linux/radix-tree.h>
+#include <linux/buffer_head.h> /* invalidate_bh_lrus() */
+
+#include <asm/uaccess.h>
+
+#define SECTOR_SHIFT 9
+#define PAGE_SECTORS_SHIFT (PAGE_SHIFT - SECTOR_SHIFT)
+#define PAGE_SECTORS (1 << PAGE_SECTORS_SHIFT)
+
+/*
+ * Each block ramdisk device has a radix_tree brd_pages of pages that stores
+ * the pages containing the block device's contents. A brd page's ->index is
+ * its offset in PAGE_SIZE units. This is similar to, but in no way connected
+ * with, the kernel's pagecache or buffer cache (which sit above our block
+ * device).
+ *
+ * brd_refcnt, brd_offset, brd_sizelimit and brd_blocksize are not referenced
+ * by any code in this file.
+ */
+struct brd_device {
+ int brd_number; /* device index; also the minor number and the N in "ramN" */
+ int brd_refcnt;
+ loff_t brd_offset;
+ loff_t brd_sizelimit;
+ unsigned brd_blocksize;
+
+ struct request_queue *brd_queue; /* queue served by brd_make_request */
+ struct gendisk *brd_disk; /* disk whose private_data points back at us */
+ struct list_head brd_list; /* link in the global brd_devices list */
+
+ /*
+ * Backing store of pages and lock to protect it. This is the contents
+ * of the block device. brd_lock is taken around insertion into
+ * brd_pages; lookups run under rcu_read_lock() instead.
+ */
+ spinlock_t brd_lock;
+ struct radix_tree_root brd_pages;
+};
+
+/*
+ * Return the backing page that holds the given sector, or NULL when no
+ * page has been inserted for that part of the device.
+ */
+static struct page *brd_lookup_page(struct brd_device *brd, sector_t sector)
+{
+	/* sector to page index */
+	pgoff_t index = sector >> PAGE_SECTORS_SHIFT;
+	struct page *found;
+
+	/*
+	 * The page lifetime is protected by the fact that we have opened the
+	 * device node -- brd pages will never be deleted under us, so we
+	 * don't need any further locking or refcounting.
+	 *
+	 * This is strictly true for the radix-tree nodes as well (ie. we
+	 * don't actually need the rcu_read_lock()), however that is not a
+	 * documented feature of the radix-tree API so it is better to be
+	 * safe here (we don't have total exclusion from radix tree updates
+	 * here, only deletes).
+	 */
+	rcu_read_lock();
+	found = radix_tree_lookup(&brd->brd_pages, index);
+	rcu_read_unlock();
+
+	/* a page found in the tree must carry the index it is stored under */
+	BUG_ON(found && found->index != index);
+
+	return found;
+}
+
+/*
+ * Look up and return a brd's page for a given sector.
+ * If one does not exist, allocate an empty page, and insert that. Then
+ * return it.
+ *
+ * Returns NULL if the page or the radix-tree preload cannot be allocated.
+ * If another insertion for the same index got there first, the freshly
+ * allocated page is freed and the page already in the tree is returned.
+ */
+static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
+{
+	pgoff_t idx;
+	struct page *page;
+	gfp_t gfp_flags;
+
+	page = brd_lookup_page(brd, sector);
+	if (page)
+		return page;
+
+	/*
+	 * Must use NOIO because we don't want to recurse back into the
+	 * block or filesystem layers from page reclaim.
+	 *
+	 * Cannot support XIP and highmem, because our ->direct_access
+	 * routine for XIP must return memory that is always addressable.
+	 * If XIP was reworked to use pfns and kmap throughout, this
+	 * restriction might be able to be lifted.
+	 */
+	gfp_flags = GFP_NOIO | __GFP_ZERO;
+#ifndef CONFIG_BLK_DEV_XIP
+	gfp_flags |= __GFP_HIGHMEM;
+#endif
+	/*
+	 * Allocate with the flags computed above; hard-coding
+	 * __GFP_HIGHMEM here would defeat the XIP restriction.
+	 */
+	page = alloc_page(gfp_flags);
+	if (!page)
+		return NULL;
+
+	if (radix_tree_preload(GFP_NOIO)) {
+		__free_page(page);
+		return NULL;
+	}
+
+	spin_lock(&brd->brd_lock);
+	idx = sector >> PAGE_SECTORS_SHIFT;
+	if (radix_tree_insert(&brd->brd_pages, idx, page)) {
+		/* lost the race: drop our page and use the one in the tree */
+		__free_page(page);
+		page = radix_tree_lookup(&brd->brd_pages, idx);
+		BUG_ON(!page);
+		BUG_ON(page->index != idx);
+	} else
+		page->index = idx;
+	spin_unlock(&brd->brd_lock);
+
+	radix_tree_preload_end();
+
+	return page;
+}
+
+/*
+ * Free all backing store pages and radix tree. This must only be called when
+ * there are no other users of the device.
+ *
+ * Pages are fetched from the tree in batches of up to FREE_BATCH, starting
+ * at index pos; each one is deleted from the tree and freed, and pos is
+ * advanced past the last page handled before the next batch is fetched.
+ */
+#define FREE_BATCH 16
+static void brd_free_pages(struct brd_device *brd)
+{
+ unsigned long pos = 0; /* lowest page index not yet processed */
+ struct page *pages[FREE_BATCH];
+ int nr_pages;
+
+ do {
+ int i;
+
+ nr_pages = radix_tree_gang_lookup(&brd->brd_pages,
+ (void **)pages, pos, FREE_BATCH);
+
+ for (i = 0; i < nr_pages; i++) {
+ void *ret;
+
+ BUG_ON(pages[i]->index < pos); /* indices must never go backwards */
+ pos = pages[i]->index;
+ ret = radix_tree_delete(&brd->brd_pages, pos);
+ BUG_ON(!ret || ret != pages[i]); /* the tree must hold exactly this page */
+ __free_page(pages[i]);
+ }
+
+ pos++; /* resume just after the last index handled */
+
+ /*
+ * This assumes radix_tree_gang_lookup always returns as
+ * many pages as possible. If the radix-tree code changes,
+ * so will this have to. A short batch is therefore taken
+ * to mean that the tree is now empty.
+ */
+ } while (nr_pages == FREE_BATCH);
+}
+
+/*
+ * copy_to_brd_setup must be called before copy_to_brd. It may sleep.
+ *
+ * Makes sure the backing page for the start of the range exists and, if
+ * the n bytes run past the end of that page, the following page too.
+ * Returns 0 on success or -ENOMEM if a page could not be set up.
+ */
+static int copy_to_brd_setup(struct brd_device *brd, sector_t sector, size_t n)
+{
+	unsigned int in_page = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t first = min_t(size_t, n, PAGE_SIZE - in_page);
+
+	if (brd_insert_page(brd, sector) == NULL)
+		return -ENOMEM;
+	if (first >= n)
+		return 0;
+
+	/* the range spills into the next page */
+	sector += first >> SECTOR_SHIFT;
+	if (brd_insert_page(brd, sector) == NULL)
+		return -ENOMEM;
+	return 0;
+}
+
+/*
+ * Copy n bytes from src to the brd starting at sector. Does not sleep.
+ *
+ * The backing pages must already exist (see copy_to_brd_setup); a missing
+ * page is a BUG.  The data is written in at most two pieces: up to the
+ * end of the first page, then the remainder at the start of the next.
+ */
+static void copy_to_brd(struct brd_device *brd, const void *src,
+			sector_t sector, size_t n)
+{
+	unsigned int in_page = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t first = min_t(size_t, n, PAGE_SIZE - in_page);
+	struct page *page;
+	void *dst;
+
+	page = brd_lookup_page(brd, sector);
+	BUG_ON(!page);
+
+	dst = kmap_atomic(page, KM_USER1);
+	memcpy(dst + in_page, src, first);
+	kunmap_atomic(dst, KM_USER1);
+
+	if (first >= n)
+		return;
+
+	/* remainder lands at the beginning of the following page */
+	sector += first >> SECTOR_SHIFT;
+	page = brd_lookup_page(brd, sector);
+	BUG_ON(!page);
+
+	dst = kmap_atomic(page, KM_USER1);
+	memcpy(dst, src + first, n - first);
+	kunmap_atomic(dst, KM_USER1);
+}
+
+/*
+ * Copy n bytes to dst from the brd starting at sector. Does not sleep.
+ *
+ * Parts of the range that have no backing page read back as zeroes.
+ * The data is read in at most two pieces: up to the end of the first
+ * page, then the remainder from the start of the next.
+ */
+static void copy_from_brd(void *dst, struct brd_device *brd,
+			sector_t sector, size_t n)
+{
+	unsigned int in_page = (sector & (PAGE_SECTORS-1)) << SECTOR_SHIFT;
+	size_t first = min_t(size_t, n, PAGE_SIZE - in_page);
+	struct page *page;
+	void *src;
+
+	page = brd_lookup_page(brd, sector);
+	if (page == NULL) {
+		memset(dst, 0, first);
+	} else {
+		src = kmap_atomic(page, KM_USER1);
+		memcpy(dst, src + in_page, first);
+		kunmap_atomic(src, KM_USER1);
+	}
+
+	if (first >= n)
+		return;
+
+	/* remainder comes from the beginning of the following page */
+	dst += first;
+	sector += first >> SECTOR_SHIFT;
+	page = brd_lookup_page(brd, sector);
+	if (page == NULL) {
+		memset(dst, 0, n - first);
+	} else {
+		src = kmap_atomic(page, KM_USER1);
+		memcpy(dst, src, n - first);
+		kunmap_atomic(src, KM_USER1);
+	}
+}
+
+/*
+ * Process a single bvec of a bio.
+ *
+ * For anything other than READ the backing pages are set up first, which
+ * may fail; the copy itself then runs with the bvec page mapped atomically.
+ * Returns 0 on success or the error from copy_to_brd_setup.
+ */
+static int brd_do_bvec(struct brd_device *brd, struct page *page,
+			unsigned int len, unsigned int off, int rw,
+			sector_t sector)
+{
+	void *mem;
+	int err;
+
+	if (rw != READ) {
+		err = copy_to_brd_setup(brd, sector, len);
+		if (err)
+			return err;
+	}
+
+	mem = kmap_atomic(page, KM_USER0);
+	if (rw != READ) {
+		copy_to_brd(brd, mem + off, sector, len);
+	} else {
+		copy_from_brd(mem + off, brd, sector, len);
+		flush_dcache_page(page);
+	}
+	kunmap_atomic(mem, KM_USER0);
+
+	return 0;
+}
+
+/*
+ * make_request function of the ram disk queue.
+ *
+ * A bio that extends past the disk capacity is completed with -EIO.
+ * Otherwise its segments are processed in order (READA is treated as
+ * READ) until one fails, and the bio is completed with the last status.
+ * Always returns 0.
+ */
+static int brd_make_request(struct request_queue *q, struct bio *bio)
+{
+	struct block_device *bdev = bio->bi_bdev;
+	struct brd_device *brd = bdev->bd_disk->private_data;
+	sector_t sector = bio->bi_sector;
+	struct bio_vec *bvec;
+	int status = -EIO;
+	int rw;
+	int i;
+
+	if (sector + (bio->bi_size >> SECTOR_SHIFT) <=
+					get_capacity(bdev->bd_disk)) {
+		rw = bio_rw(bio);
+		if (rw == READA)
+			rw = READ;
+
+		bio_for_each_segment(bvec, bio, i) {
+			unsigned int len = bvec->bv_len;
+
+			status = brd_do_bvec(brd, bvec->bv_page, len,
+						bvec->bv_offset, rw, sector);
+			if (status)
+				break;
+			sector += len >> SECTOR_SHIFT;
+		}
+	}
+
+	bio_endio(bio, status);
+
+	return 0;
+}
+
+#ifdef CONFIG_BLK_DEV_XIP
+/*
+ * direct_access method: store in *data the kernel address of the backing
+ * page for a page-aligned sector, creating the page if necessary.
+ *
+ * Returns 0 on success, -ENODEV without a brd device, -EINVAL for a
+ * sector that is not page aligned, -ERANGE past the disk capacity and
+ * -ENOMEM if the page cannot be set up.
+ */
+static int brd_direct_access (struct block_device *bdev, sector_t sector,
+			unsigned long *data)
+{
+	struct brd_device *brd = bdev->bd_disk->private_data;
+	struct page *page;
+
+	if (brd == NULL)
+		return -ENODEV;
+	if ((sector & (PAGE_SECTORS-1)) != 0)
+		return -EINVAL;
+	if (get_capacity(bdev->bd_disk) < sector + PAGE_SECTORS)
+		return -ERANGE;
+
+	page = brd_insert_page(brd, sector);
+	if (page == NULL)
+		return -ENOMEM;
+
+	*data = (unsigned long)page_address(page);
+	return 0;
+}
+#endif
+
+/*
+ * ioctl method; only BLKFLSBUF is supported, everything else gets -ENOTTY.
+ *
+ * Returns 0 after the ram disk contents have been destroyed, or -EBUSY
+ * when the block device has more than one opener.
+ */
+static int brd_ioctl(struct inode *inode, struct file *file,
+			unsigned int cmd, unsigned long arg)
+{
+	struct block_device *bdev = inode->i_bdev;
+	struct brd_device *brd = bdev->bd_disk->private_data;
+	int error = -EBUSY;
+
+	if (cmd != BLKFLSBUF)
+		return -ENOTTY;
+
+	/*
+	 * ram device BLKFLSBUF has special semantics, we want to actually
+	 * release and destroy the ramdisk data.
+	 */
+	mutex_lock(&bdev->bd_mutex);
+	if (bdev->bd_openers <= 1) {
+		/*
+		 * Invalidate the cache first, so it isn't written
+		 * back to the device.
+		 *
+		 * Another thread might instantiate more buffercache here,
+		 * but there is not much we can do to close that race.
+		 */
+		invalidate_bh_lrus();
+		truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
+		brd_free_pages(brd);
+		error = 0;
+	}
+	mutex_unlock(&bdev->bd_mutex);
+
+	return error;
+}
+
+static struct block_device_operations brd_fops = { /* operations installed on every ram disk by brd_alloc */
+ .owner = THIS_MODULE,
+ .ioctl = brd_ioctl, /* handles BLKFLSBUF only */
+#ifdef CONFIG_BLK_DEV_XIP
+ .direct_access = brd_direct_access, /* returns the kernel address of a backing page */
+#endif
+};
+
+/*
+ * And now the modules code and kernel interface.
+ *
+ * rd_nr:   when non-zero, the number of devices brd_init creates up front
+ *          and the size of the minor range it registers.
+ * rd_size: size of each device in kilobytes; brd_alloc sets the disk
+ *          capacity to rd_size * 2 sectors.
+ */
+static int rd_nr;
+int rd_size = CONFIG_BLK_DEV_RAM_SIZE;
+module_param(rd_nr, int, 0);
+MODULE_PARM_DESC(rd_nr, "Maximum number of brd devices");
+module_param(rd_size, int, 0);
+MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
+
+#ifndef MODULE
+/*
+ * Legacy boot options - nonmodular.
+ *
+ * "ramdisk=" and "ramdisk_size=" both set rd_size. The value is parsed
+ * with simple_strtol() using base 0 and is not validated.
+ */
+static int __init ramdisk_size(char *str)
+{
+ rd_size = simple_strtol(str, NULL, 0);
+ return 1;
+}
+static int __init ramdisk_size2(char *str) /* second entry point for the alternate spelling */
+{
+ return ramdisk_size(str);
+}
+__setup("ramdisk=", ramdisk_size);
+__setup("ramdisk_size=", ramdisk_size2);
+#endif
+
+/*
+ * The device scheme is derived from loop.c. Keep them in synch where possible
+ * (should share code eventually).
+ *
+ * brd_devices links every brd_device through its brd_list member.
+ * brd_devices_mutex is held by brd_probe while it looks up or creates a
+ * device; brd_init and brd_exit modify the list without taking it.
+ */
+static LIST_HEAD(brd_devices);
+static DEFINE_MUTEX(brd_devices_mutex);
+
+/*
+ * Allocate and initialise ram disk number i: the brd_device itself, its
+ * request queue and its gendisk ("ram<i>", minor i, rd_size kilobytes).
+ * The disk is not added to the system here.
+ * Returns the new device, or NULL if any allocation fails.
+ */
+static struct brd_device *brd_alloc(int i)
+{
+	struct brd_device *brd;
+	struct request_queue *queue;
+	struct gendisk *disk;
+
+	brd = kzalloc(sizeof(*brd), GFP_KERNEL);
+	if (brd == NULL)
+		goto out;
+	brd->brd_number = i;
+	spin_lock_init(&brd->brd_lock);
+	INIT_RADIX_TREE(&brd->brd_pages, GFP_ATOMIC);
+
+	queue = blk_alloc_queue(GFP_KERNEL);
+	brd->brd_queue = queue;
+	if (queue == NULL)
+		goto out_free_dev;
+	blk_queue_make_request(queue, brd_make_request);
+	blk_queue_max_sectors(queue, 1024);
+	blk_queue_bounce_limit(queue, BLK_BOUNCE_ANY);
+
+	disk = alloc_disk(1);
+	brd->brd_disk = disk;
+	if (disk == NULL)
+		goto out_free_queue;
+	disk->major = RAMDISK_MAJOR;
+	disk->first_minor = i;
+	disk->fops = &brd_fops;
+	disk->private_data = brd;
+	disk->queue = queue;
+	sprintf(disk->disk_name, "ram%d", i);
+	/* rd_size is in kilobytes, the capacity in 512-byte sectors */
+	set_capacity(disk, rd_size * 2);
+
+	return brd;
+
+out_free_queue:
+	blk_cleanup_queue(brd->brd_queue);
+out_free_dev:
+	kfree(brd);
+out:
+	return NULL;
+}
+
+static void brd_free(struct brd_device *brd)
+{ /* Undo brd_alloc and release the backing pages; does not call del_gendisk. */
+ put_disk(brd->brd_disk);
+ blk_cleanup_queue(brd->brd_queue);
+ brd_free_pages(brd); /* contents of the ram disk */
+ kfree(brd);
+}
+
+/*
+ * Return the device numbered i, creating it if it does not exist yet.
+ * A newly created device is added to the system and appended to
+ * brd_devices.  Returns NULL if a new device cannot be allocated.
+ */
+static struct brd_device *brd_init_one(int i)
+{
+	struct brd_device *brd;
+
+	list_for_each_entry(brd, &brd_devices, brd_list)
+		if (brd->brd_number == i)
+			return brd;
+
+	brd = brd_alloc(i);
+	if (brd == NULL)
+		return NULL;
+
+	add_disk(brd->brd_disk);
+	list_add_tail(&brd->brd_list, &brd_devices);
+	return brd;
+}
+
+static void brd_del_one(struct brd_device *brd)
+{ /* Remove one device: unlink it from brd_devices, delete its disk, free it. */
+ list_del(&brd->brd_list);
+ del_gendisk(brd->brd_disk);
+ brd_free(brd);
+}
+
+/*
+ * Probe callback for the registered device-number region: find or
+ * create the ram disk for the minor encoded in dev and return a
+ * reference to its disk, or ERR_PTR(-ENOMEM) if that fails.
+ * *part is always set to 0.
+ */
+static struct kobject *brd_probe(dev_t dev, int *part, void *data)
+{
+	struct brd_device *brd;
+	struct kobject *kobj;
+
+	mutex_lock(&brd_devices_mutex);
+	brd = brd_init_one(dev & MINORMASK);
+	if (brd != NULL)
+		kobj = get_disk(brd->brd_disk);
+	else
+		kobj = ERR_PTR(-ENOMEM);
+	mutex_unlock(&brd_devices_mutex);
+
+	*part = 0;
+	return kobj;
+}
+
+/*
+ * Module initialisation: register the ramdisk major, allocate the initial
+ * devices, add their disks and register the minor range for on-demand
+ * probing.
+ *
+ * Returns 0 on success, -EINVAL if rd_nr exceeds the minor space, -EIO if
+ * the major cannot be registered, and -ENOMEM if a device cannot be
+ * allocated (in which case everything set up so far is undone).
+ */
+static int __init brd_init(void)
+{
+	int i, nr;
+	unsigned long range;
+	struct brd_device *brd, *next;
+
+	/*
+	 * brd module now has a feature to instantiate underlying device
+	 * structure on-demand, provided that there is an access dev node.
+	 * However, this will not work well with user space tool that doesn't
+	 * know about such "feature". In order to not break any existing
+	 * tool, we do the following:
+	 *
+	 * (1) if rd_nr is specified, create that many upfront, and this
+	 *     also becomes a hard limit.
+	 * (2) if rd_nr is not specified, create CONFIG_BLK_DEV_RAM_COUNT
+	 *     devices on module load, user can further extend brd device
+	 *     by create dev node themselves and have kernel automatically
+	 *     instantiate actual device on-demand.
+	 */
+	if (rd_nr > 1UL << MINORBITS)
+		return -EINVAL;
+
+	if (rd_nr) {
+		nr = rd_nr;
+		range = rd_nr;
+	} else {
+		nr = CONFIG_BLK_DEV_RAM_COUNT;
+		range = 1UL << MINORBITS;
+	}
+
+	if (register_blkdev(RAMDISK_MAJOR, "ramdisk"))
+		return -EIO;
+
+	for (i = 0; i < nr; i++) {
+		brd = brd_alloc(i);
+		if (!brd)
+			goto out_free;
+		list_add_tail(&brd->brd_list, &brd_devices);
+	}
+
+	/* point of no return */
+
+	list_for_each_entry(brd, &brd_devices, brd_list)
+		add_disk(brd->brd_disk);
+
+	blk_register_region(MKDEV(RAMDISK_MAJOR, 0), range,
+				THIS_MODULE, brd_probe, NULL, NULL);
+
+	printk(KERN_INFO "brd: module loaded\n");
+	return 0;
+
+out_free:
+	/* no disk has been added yet, so freeing the devices is enough */
+	list_for_each_entry_safe(brd, next, &brd_devices, brd_list) {
+		list_del(&brd->brd_list);
+		brd_free(brd);
+	}
+
+	/* must use the same name the major was registered under */
+	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+	return -ENOMEM;
+}
+
+/*
+ * Module exit: remove every ram disk, then unregister the minor range
+ * (sized the same way brd_init sized it) and the ramdisk major.
+ */
+static void __exit brd_exit(void)
+{
+	struct brd_device *brd, *tmp;
+	unsigned long range;
+
+	if (rd_nr)
+		range = rd_nr;
+	else
+		range = 1UL << MINORBITS;
+
+	list_for_each_entry_safe(brd, tmp, &brd_devices, brd_list)
+		brd_del_one(brd);
+
+	blk_unregister_region(MKDEV(RAMDISK_MAJOR, 0), range);
+	unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
+}
+
+module_init(brd_init);
+module_exit(brd_exit);
+
diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
index ae3106045ee..018753c59b8 100644
--- a/drivers/block/nbd.c
+++ b/drivers/block/nbd.c
@@ -54,7 +54,7 @@ static unsigned int debugflags;
#endif /* NDEBUG */
static unsigned int nbds_max = 16;
-static struct nbd_device nbd_dev[MAX_NBD];
+static struct nbd_device *nbd_dev;
/*
* Use just one lock (or at most 1 per NIC). Two arguments for this:
@@ -649,11 +649,9 @@ static int __init nbd_init(void)
BUILD_BUG_ON(sizeof(struct nbd_request) != 28);
- if (nbds_max > MAX_NBD) {
- printk(KERN_CRIT "nbd: cannot allocate more than %u nbds; %u requested.\n", MAX_NBD,
- nbds_max);
- return -EINVAL;
- }
+ nbd_dev = kcalloc(nbds_max, sizeof(*nbd_dev), GFP_KERNEL);
+ if (!nbd_dev)
+ return -ENOMEM;
for (i = 0; i < nbds_max; i++) {
struct gendisk *disk = alloc_disk(1);
diff --git a/drivers/block/rd.c b/drivers/block/rd.c
deleted file mode 100644
index 06e23be7090..00000000000
--- a/drivers/block/rd.c
+++ /dev/null
@@ -1,537 +0,0 @@
-/*
- * ramdisk.c - Multiple RAM disk driver - gzip-loading version - v. 0.8 beta.
- *
- * (C) Chad Page, Theodore Ts'o, et. al, 1995.
- *
- * This RAM disk is designed to have filesystems created on it and mounted
- * just like a regular floppy disk.
- *
- * It also does something suggested by Linus: use the buffer cache as the
- * RAM disk data. This makes it possible to dynamically allocate the RAM disk
- * buffer - with some consequences I have to deal with as I write this.
- *
- * This code is based on the original ramdisk.c, written mostly by
- * Theodore Ts'o (TYT) in 1991. The code was largely rewritten by
- * Chad Page to use the buffer cache to store the RAM disk data in
- * 1995; Theodore then took over the driver again, and cleaned it up
- * for inclusion in the mainline kernel.
- *
- * The original CRAMDISK code was written by Richard Lyons, and
- * adapted by Chad Page to use the new RAM disk interface. Theodore
- * Ts'o rewrote it so that both the compressed RAM disk loader and the
- * kernel decompressor uses the same inflate.c codebase. The RAM disk
- * loader now also loads into a dynamic (buffer cache based) RAM disk,
- * not the old static RAM disk. Support for the old static RAM disk has
- * been completely removed.
- *
- * Loadable module support added by Tom Dyas.
- *
- * Further cleanups by Chad Page (page0588@sundance.sjsu.edu):
- * Cosmetic changes in #ifdef MODULE, code movement, etc.
- * When the RAM disk module is removed, free the protected buffers
- * Default RAM disk size changed to 2.88 MB
- *
- * Added initrd: Werner Almesberger & Hans Lermen, Feb '96
- *
- * 4/25/96 : Made RAM disk size a parameter (default is now 4 MB)
- * - Chad Page
- *
- * Add support for fs images split across >1 disk, Paul Gortmaker, Mar '98
- *
- * Make block size and block size shift for RAM disks a global macro
- * and set blk_size for -ENOSPC, Werner Fink <werner@suse.de>, Apr '99
- */
-
-#include <linux/string.h>
-#include <linux/slab.h>
-#include <asm/atomic.h>
-#include <linux/bio.h>
-#include <linux/module.h>
-#include <linux/moduleparam.h>
-#include <linux/init.h>
-#include <linux/pagemap.h>
-#include <linux/blkdev.h>
-#include <linux/genhd.h>
-#include <linux/buffer_head.h> /* for invalidate_bdev() */
-#include <linux/backing-dev.h>
-#include <linux/blkpg.h>
-#include <linux/writeback.h>
-#include <linux/log2.h>
-
-#include <asm/uaccess.h>
-
-/* Various static variables go here. Most are used only in the RAM disk code.
- */
-
-static struct gendisk *rd_disks[CONFIG_BLK_DEV_RAM_COUNT];
-static struct block_device *rd_bdev[CONFIG_BLK_DEV_RAM_COUNT];/* Protected device data */
-static struct request_queue *rd_queue[CONFIG_BLK_DEV_RAM_COUNT];
-
-/*
- * Parameters for the boot-loading of the RAM disk. These are set by
- * init/main.c (from arguments to the kernel command line) or from the
- * architecture-specific setup routine (from the stored boot sector
- * information).
- */
-int rd_size = CONFIG_BLK_DEV_RAM_SIZE; /* Size of the RAM disks */
-/*
- * It would be very desirable to have a soft-blocksize (that in the case
- * of the ramdisk driver is also the hardblocksize ;) of PAGE_SIZE because
- * doing that we'll achieve a far better MM footprint. Using a rd_blocksize of
- * BLOCK_SIZE in the worst case we'll make PAGE_SIZE/BLOCK_SIZE buffer-pages
- * unfreeable. With a rd_blocksize of PAGE_SIZE instead we are sure that only
- * 1 page will be protected. Depending on the size of the ramdisk you
- * may want to change the ramdisk blocksize to achieve a better or worse MM
- * behaviour. The default is still BLOCK_SIZE (needed by rd_load_image that
- * supposes the filesystem in the image uses a BLOCK_SIZE blocksize).
- */
-static int rd_blocksize = CONFIG_BLK_DEV_RAM_BLOCKSIZE;
-
-/*
- * Copyright (C) 2000 Linus Torvalds.
- * 2000 Transmeta Corp.
- * aops copied from ramfs.
- */
-
-/*
- * If a ramdisk page has buffers, some may be uptodate and some may be not.
- * To bring the page uptodate we zero out the non-uptodate buffers. The
- * page must be locked.
- */
-static void make_page_uptodate(struct page *page)
-{
- if (page_has_buffers(page)) {
- struct buffer_head *bh = page_buffers(page);
- struct buffer_head *head = bh;
-
- do {
- if (!buffer_uptodate(bh)) {
- memset(bh->b_data, 0, bh->b_size);
- /*
- * akpm: I'm totally undecided about this. The
- * buffer has just been magically brought "up to
- * date", but nobody should want to be reading
- * it anyway, because it hasn't been used for
- * anything yet. It is still in a "not read
- * from disk yet" state.
- *
- * But non-uptodate buffers against an uptodate
- * page are against the rules. So do it anyway.
- */
- set_buffer_uptodate(bh);
- }
- } while ((bh = bh->b_this_page) != head);
- } else {
- memset(page_address(page), 0, PAGE_CACHE_SIZE);
- }
- flush_dcache_page(page);
- SetPageUptodate(page);
-}
-
-static int ramdisk_readpage(struct file *file, struct page *page)
-{
- if (!PageUptodate(page))
- make_page_uptodate(page);
- unlock_page(page);
- return 0;
-}
-
-static int ramdisk_prepare_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
-{
- if (!PageUptodate(page))
- make_page_uptodate(page);
- return 0;
-}
-
-static int ramdisk_commit_write(struct file *file, struct page *page,
- unsigned offset, unsigned to)
-{
- set_page_dirty(page);
- return 0;
-}
-
-/*
- * ->writepage to the blockdev's mapping has to redirty the page so that the
- * VM doesn't go and steal it. We return AOP_WRITEPAGE_ACTIVATE so that the VM
- * won't try to (pointlessly) write the page again for a while.
- *
- * Really, these pages should not be on the LRU at all.
- */
-static int ramdisk_writepage(struct page *page, struct writeback_control *wbc)
-{
- if (!PageUptodate(page))
- make_page_uptodate(page);
- SetPageDirty(page);
- if (wbc->for_reclaim)
- return AOP_WRITEPAGE_ACTIVATE;
- unlock_page(page);
- return 0;
-}
-
-/*
- * This is a little speedup thing: short-circuit attempts to write back the
- * ramdisk blockdev inode to its non-existent backing store.
- */
-static int ramdisk_writepages(struct address_space *mapping,
- struct writeback_control *wbc)
-{
- return 0;
-}
-
-/*
- * ramdisk blockdev pages have their own ->set_page_dirty() because we don't
- * want them to contribute to dirty memory accounting.
- */
-static int ramdisk_set_page_dirty(struct page *page)
-{
- if (!TestSetPageDirty(page))
- return 1;
- return 0;
-}
-
-/*
- * releasepage is called by pagevec_strip/try_to_release_page if
- * buffers_heads_over_limit is true. Without a releasepage function
- * try_to_free_buffers is called instead. That can unset the dirty
- * bit of our ram disk pages, which will be eventually freed, even
- * if the page is still in use.
- */
-static int ramdisk_releasepage(struct page *page, gfp_t dummy)
-{
- return 0;
-}
-
-static const struct address_space_operations ramdisk_aops = {
- .readpage = ramdisk_readpage,
- .prepare_write = ramdisk_prepare_write,
- .commit_write = ramdisk_commit_write,
- .writepage = ramdisk_writepage,
- .set_page_dirty = ramdisk_set_page_dirty,
- .writepages = ramdisk_writepages,
- .releasepage = ramdisk_releasepage,
-};
-
-static int rd_blkdev_pagecache_IO(int rw, struct bio_vec *vec, sector_t sector,
- struct address_space *mapping)
-{
- pgoff_t index = sector >> (PAGE_CACHE_SHIFT - 9);
- unsigned int vec_offset = vec->bv_offset;
- int offset = (sector << 9) & ~PAGE_CACHE_MASK;
- int size = vec->bv_len;
- int err = 0;
-
- do {
- int count;
- struct page *page;
- char *src;
- char *dst;
-
- count = PAGE_CACHE_SIZE - offset;
- if (count > size)
- count = size;
- size -= count;
-
- page = grab_cache_page(mapping, index);
- if (!page) {
- err = -ENOMEM;
- goto out;
- }
-
- if (!PageUptodate(page))
- make_page_uptodate(page);
-
- index++;
-
- if (rw == READ) {
- src = kmap_atomic(page, KM_USER0) + offset;
- dst = kmap_atomic(vec->bv_page, KM_USER1) + vec_offset;
- } else {
- src = kmap_atomic(vec->bv_page, KM_USER0) + vec_offset;
- dst = kmap_atomic(page, KM_USER1) + offset;
- }
- offset = 0;
- vec_offset += count;
-
- memcpy(dst, src, count);
-
- kunmap_atomic(src, KM_USER0);
- kunmap_atomic(dst, KM_USER1);
-
- if (rw == READ)
- flush_dcache_page(vec->bv_page);
- else
- set_page_dirty(page);
- unlock_page(page);
- put_page(page);
- } while (size);
-
- out:
- return err;
-}
-
-/*
- * Basically, my strategy here is to set up a buffer-head which can't be
- * deleted, and make that my Ramdisk. If the request is outside of the
- * allocated size, we must get rid of it...
- *
- * 19-JAN-1998 Richard Gooch <rgooch@atnf.csiro.au> Added devfs support
- *
- */
-static int rd_make_request(struct request_queue *q, struct bio *bio)
-{
- struct block_device *bdev = bio->bi_bdev;
- struct address_space * mapping = bdev->bd_inode->i_mapping;
- sector_t sector = bio->bi_sector;
- unsigned long len = bio->bi_size >> 9;
- int rw = bio_data_dir(bio);
- struct bio_vec *bvec;
- int ret = 0, i;
-
- if (sector + len > get_capacity(bdev->bd_disk))
- goto fail;
-
- if (rw==READA)
- rw=READ;
-
- bio_for_each_segment(bvec, bio, i) {
- ret |= rd_blkdev_pagecache_IO(rw, bvec, sector, mapping);
- sector += bvec->bv_len >> 9;
- }
- if (ret)
- goto fail;
-
- bio_endio(bio, 0);
- return 0;
-fail:
- bio_io_error(bio);
- return 0;
-}
-
-static int rd_ioctl(struct inode *inode, struct file *file,
- unsigned int cmd, unsigned long arg)
-{
- int error;
- struct block_device *bdev = inode->i_bdev;
-
- if (cmd != BLKFLSBUF)
- return -ENOTTY;
-
- /*
- * special: we want to release the ramdisk memory, it's not like with
- * the other blockdevices where this ioctl only flushes away the buffer
- * cache
- */
- error = -EBUSY;
- mutex_lock(&bdev->bd_mutex);
- if (bdev->bd_openers <= 2) {
- truncate_inode_pages(bdev->bd_inode->i_mapping, 0);
- error = 0;
- }
- mutex_unlock(&bdev->bd_mutex);
- return error;
-}
-
-/*
- * This is the backing_dev_info for the blockdev inode itself. It doesn't need
- * writeback and it does not contribute to dirty memory accounting.
- */
-static struct backing_dev_info rd_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_NO_ACCT_DIRTY | BDI_CAP_NO_WRITEBACK | BDI_CAP_MAP_COPY,
- .unplug_io_fn = default_unplug_io_fn,
-};
-
-/*
- * This is the backing_dev_info for the files which live atop the ramdisk
- * "device". These files do need writeback and they do contribute to dirty
- * memory accounting.
- */
-static struct backing_dev_info rd_file_backing_dev_info = {
- .ra_pages = 0, /* No readahead */
- .capabilities = BDI_CAP_MAP_COPY, /* Does contribute to dirty memory */
- .unplug_io_fn = default_unplug_io_fn,
-};
-
-static int rd_open(struct inode *inode, struct file *filp)
-{
- unsigned unit = iminor(inode);
-
- if (rd_bdev[unit] == NULL) {
- struct block_device *bdev = inode->i_bdev;
- struct address_space *mapping;
- unsigned bsize;
- gfp_t gfp_mask;
-
- inode = igrab(bdev->bd_inode);
- rd_bdev[unit] = bdev;
- bdev->bd_openers++;
- bsize = bdev_hardsect_size(bdev);
- bdev->bd_block_size = bsize;
- inode->i_blkbits = blksize_bits(bsize);
- inode->i_size = get_capacity(bdev->bd_disk)<<9;
-
- mapping = inode->i_mapping;
- mapping->a_ops = &ramdisk_aops;
- mapping->backing_dev_info = &rd_backing_dev_info;
- bdev->bd_inode_backing_dev_info = &rd_file_backing_dev_info;
-
- /*
- * Deep badness. rd_blkdev_pagecache_IO() needs to allocate
- * pagecache pages within a request_fn. We cannot recur back
- * into the filesystem which is mounted atop the ramdisk, because
- * that would deadlock on fs locks. And we really don't want
- * to reenter rd_blkdev_pagecache_IO when we're already within
- * that function.
- *
- * So we turn off __GFP_FS and __GFP_IO.
- *
- * And to give this thing a hope of working, turn on __GFP_HIGH.
- * Hopefully, there's enough regular memory allocation going on
- * for the page allocator emergency pools to keep the ramdisk
- * driver happy.
- */
- gfp_mask = mapping_gfp_mask(mapping);
- gfp_mask &= ~(__GFP_FS|__GFP_IO);
- gfp_mask |= __GFP_HIGH;
- mapping_set_gfp_mask(mapping, gfp_mask);
- }
-
- return 0;
-}
-
-static struct block_device_operations rd_bd_op = {
- .owner = THIS_MODULE,
- .open = rd_open,
- .ioctl = rd_ioctl,
-};
-
-/*
- * Before freeing the module, invalidate all of the protected buffers!
- */
-static void __exit rd_cleanup(void)
-{
- int i;
-
- for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
- struct block_device *bdev = rd_bdev[i];
- rd_bdev[i] = NULL;
- if (bdev) {
- invalidate_bdev(bdev);
- blkdev_put(bdev);
- }
- del_gendisk(rd_disks[i]);
- put_disk(rd_disks[i]);
- blk_cleanup_queue(rd_queue[i]);
- }
- unregister_blkdev(RAMDISK_MAJOR, "ramdisk");
-
- bdi_destroy(&rd_file_backing_dev_info);
- bdi_destroy(&rd_backing_dev_info);
-}
-
-/*
- * This is the registration and initialization section of the RAM disk driver
- */
-static int __init rd_init(void)
-{
- int i;
- int err;
-
- err = bdi_init(&rd_backing_dev_info);
- if (err)
- goto out2;
-
- err = bdi_init(&rd_file_backing_dev_info);
- if (err) {
- bdi_destroy(&rd_backing_dev_info);
- goto out2;
- }
-
- err = -ENOMEM;
-
- if (rd_blocksize > PAGE_SIZE || rd_blocksize < 512 ||
- !is_power_of_2(rd_blocksize)) {
- printk("RAMDISK: wrong blocksize %d, reverting to defaults\n",
- rd_blocksize);
- rd_blocksize = BLOCK_SIZE;
- }
-
- for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
- rd_disks[i] = alloc_disk(1);
- if (!rd_disks[i])
- goto out;
-
- rd_queue[i] = blk_alloc_queue(GFP_KERNEL);
- if (!rd_queue[i]) {
- put_disk(rd_disks[i]);
- goto out;
- }
- }
-
- if (register_blkdev(RAMDISK_MAJOR, "ramdisk")) {
- err = -EIO;
- goto out;
- }
-
- for (i = 0; i < CONFIG_BLK_DEV_RAM_COUNT; i++) {
- struct gendisk *disk = rd_disks[i];
-
- blk_queue_make_request(rd_queue[i], &rd_make_request);
- blk_queue_hardsect_size(rd_queue[i], rd_blocksize);
-
- /* rd_size is given in kB */
- disk->major = RAMDISK_MAJOR;
- disk->first_minor = i;
- disk->fops = &rd_bd_op;
- disk->queue = rd_queue[i];
- disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
- sprintf(disk->disk_name, "ram%d", i);
- set_capacity(disk, rd_size * 2);
- add_disk(rd_disks[i]);
- }
-
- /* rd_size is given in kB */
- printk("RAMDISK driver initialized: "
- "%d RAM disks of %dK size %d blocksize\n",
- CONFIG_BLK_DEV_RAM_COUNT, rd_size, rd_blocksize);
-
- return 0;
-out:
- while (i--) {
- put_disk(rd_disks[i]);
- blk_cleanup_queue(rd_queue[i]);
- }
- bdi_destroy(&rd_backing_dev_info);
- bdi_destroy(&rd_file_backing_dev_info);
-out2:
- return err;
-}
-
-module_init(rd_init);
-module_exit(rd_cleanup);
-
-/* options - nonmodular */
-#ifndef MODULE
-static int __init ramdisk_size(char *str)
-{
- rd_size = simple_strtol(str,NULL,0);
- return 1;
-}
-static int __init ramdisk_blocksize(char *str)
-{
- rd_blocksize = simple_strtol(str,NULL,0);
- return 1;
-}
-__setup("ramdisk_size=", ramdisk_size);
-__setup("ramdisk_blocksize=", ramdisk_blocksize);
-#endif
-
-/* options - modular */
-module_param(rd_size, int, 0);
-MODULE_PARM_DESC(rd_size, "Size of each RAM disk in kbytes.");
-module_param(rd_blocksize, int, 0);
-MODULE_PARM_DESC(rd_blocksize, "Blocksize of each RAM disk in bytes.");
-MODULE_ALIAS_BLOCKDEV_MAJOR(RAMDISK_MAJOR);
-
-MODULE_LICENSE("GPL");