From 7c963ad1d113790a8c723a178988b675868f3abe Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 May 2005 16:57:59 -0700 Subject: [SPARC64]: Fix streaming buffer flushing on PCI and SBUS. Firstly, if the direction is TODEVICE, then dirty data in the streaming cache is impossible so we can elide the flush-flag synchronization in that case. Next, the context allocator is broken. It is highly likely that contexts get used multiple times for different dma mappings, which confuses the strbuf flushing code and makes it run inefficiently. Signed-off-by: David S. Miller --- arch/sparc64/kernel/pci_iommu.c | 90 +++++++++++++++++++++++++++++++--------- arch/sparc64/kernel/pci_psycho.c | 2 +- arch/sparc64/kernel/pci_sabre.c | 2 +- arch/sparc64/kernel/pci_schizo.c | 2 +- arch/sparc64/kernel/sbus.c | 20 ++++++--- 5 files changed, 87 insertions(+), 29 deletions(-) (limited to 'arch/sparc64/kernel') diff --git a/arch/sparc64/kernel/pci_iommu.c b/arch/sparc64/kernel/pci_iommu.c index 33ca56c90da..1807876f8c3 100644 --- a/arch/sparc64/kernel/pci_iommu.c +++ b/arch/sparc64/kernel/pci_iommu.c @@ -196,6 +196,34 @@ static iopte_t *alloc_consistent_cluster(struct pci_iommu *iommu, unsigned long return NULL; } +static int iommu_alloc_ctx(struct pci_iommu *iommu) +{ + int lowest = iommu->ctx_lowest_free; + int sz = IOMMU_NUM_CTXS - lowest; + int n = find_next_zero_bit(iommu->ctx_bitmap, sz, lowest); + + if (unlikely(n == sz)) { + n = find_next_zero_bit(iommu->ctx_bitmap, lowest, 1); + if (unlikely(n == lowest)) { + printk(KERN_WARNING "IOMMU: Ran out of contexts.\n"); + n = 0; + } + } + if (n) + __set_bit(n, iommu->ctx_bitmap); + + return n; +} + +static inline void iommu_free_ctx(struct pci_iommu *iommu, int ctx) +{ + if (likely(ctx)) { + __clear_bit(ctx, iommu->ctx_bitmap); + if (ctx < iommu->ctx_lowest_free) + iommu->ctx_lowest_free = ctx; + } +} + /* Allocate and map kernel buffer of size SIZE using consistent mode * DMA for PCI device PDEV. Return non-NULL cpu-side address if * successful and set *DMA_ADDRP to the PCI side dma address. @@ -236,7 +264,7 @@ void *pci_alloc_consistent(struct pci_dev *pdev, size_t size, dma_addr_t *dma_ad npages = size >> IO_PAGE_SHIFT; ctx = 0; if (iommu->iommu_ctxflush) - ctx = iommu->iommu_cur_ctx++; + ctx = iommu_alloc_ctx(iommu); first_page = __pa(first_page); while (npages--) { iopte_val(*iopte) = (IOPTE_CONSISTENT(ctx) | @@ -317,6 +345,8 @@ void pci_free_consistent(struct pci_dev *pdev, size_t size, void *cpu, dma_addr_ } } + iommu_free_ctx(iommu, ctx); + spin_unlock_irqrestore(&iommu->lock, flags); order = get_order(size); @@ -360,7 +390,7 @@ dma_addr_t pci_map_single(struct pci_dev *pdev, void *ptr, size_t sz, int direct base_paddr = __pa(oaddr & IO_PAGE_MASK); ctx = 0; if (iommu->iommu_ctxflush) - ctx = iommu->iommu_cur_ctx++; + ctx = iommu_alloc_ctx(iommu); if (strbuf->strbuf_enabled) iopte_protection = IOPTE_STREAMING(ctx); else @@ -380,39 +410,55 @@ bad: return PCI_DMA_ERROR_CODE; } -static void pci_strbuf_flush(struct pci_strbuf *strbuf, struct pci_iommu *iommu, u32 vaddr, unsigned long ctx, unsigned long npages) +static void pci_strbuf_flush(struct pci_strbuf *strbuf, struct pci_iommu *iommu, u32 vaddr, unsigned long ctx, unsigned long npages, int direction) { int limit; - PCI_STC_FLUSHFLAG_INIT(strbuf); if (strbuf->strbuf_ctxflush && iommu->iommu_ctxflush) { unsigned long matchreg, flushreg; + u64 val; flushreg = strbuf->strbuf_ctxflush; matchreg = PCI_STC_CTXMATCH_ADDR(strbuf, ctx); - limit = 100000; + if (pci_iommu_read(matchreg) == 0) + goto do_flush_sync; + pci_iommu_write(flushreg, ctx); - for(;;) { - if (((long)pci_iommu_read(matchreg)) >= 0L) - break; - limit--; - if (!limit) - break; - udelay(1); + if ((val = pci_iommu_read(matchreg)) == 0) + goto do_flush_sync; + + val &= 0xffff; + while (val) { + if (val & 0x1) + pci_iommu_write(flushreg, ctx); + val >>= 1; } - if (!limit) + val = pci_iommu_read(matchreg); + if (unlikely(val)) { printk(KERN_WARNING "pci_strbuf_flush: ctx flush " - "timeout vaddr[%08x] ctx[%lx]\n", - vaddr, ctx); + "timeout matchreg[%lx] ctx[%lx]\n", + val, ctx); + goto do_page_flush; + } } else { unsigned long i; + do_page_flush: for (i = 0; i < npages; i++, vaddr += IO_PAGE_SIZE) pci_iommu_write(strbuf->strbuf_pflush, vaddr); } +do_flush_sync: + /* If the device could not have possibly put dirty data into + * the streaming cache, no flush-flag synchronization needs + * to be performed. + */ + if (direction == PCI_DMA_TODEVICE) + return; + + PCI_STC_FLUSHFLAG_INIT(strbuf); pci_iommu_write(strbuf->strbuf_fsync, strbuf->strbuf_flushflag_pa); (void) pci_iommu_read(iommu->write_complete_reg); @@ -466,7 +512,7 @@ void pci_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int /* Step 1: Kick data out of streaming buffers if necessary. */ if (strbuf->strbuf_enabled) - pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages); + pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction); /* Step 2: Clear out first TSB entry. */ iopte_make_dummy(iommu, base); @@ -474,6 +520,8 @@ void pci_unmap_single(struct pci_dev *pdev, dma_addr_t bus_addr, size_t sz, int free_streaming_cluster(iommu, bus_addr - iommu->page_table_map_base, npages, ctx); + iommu_free_ctx(iommu, ctx); + spin_unlock_irqrestore(&iommu->lock, flags); } @@ -613,7 +661,7 @@ int pci_map_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, int /* Step 4: Choose a context if necessary. */ ctx = 0; if (iommu->iommu_ctxflush) - ctx = iommu->iommu_cur_ctx++; + ctx = iommu_alloc_ctx(iommu); /* Step 5: Create the mappings. */ if (strbuf->strbuf_enabled) @@ -678,7 +726,7 @@ void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, /* Step 1: Kick data out of streaming buffers if necessary. */ if (strbuf->strbuf_enabled) - pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages); + pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction); /* Step 2: Clear out first TSB entry. */ iopte_make_dummy(iommu, base); @@ -686,6 +734,8 @@ void pci_unmap_sg(struct pci_dev *pdev, struct scatterlist *sglist, int nelems, free_streaming_cluster(iommu, bus_addr - iommu->page_table_map_base, npages, ctx); + iommu_free_ctx(iommu, ctx); + spin_unlock_irqrestore(&iommu->lock, flags); } @@ -724,7 +774,7 @@ void pci_dma_sync_single_for_cpu(struct pci_dev *pdev, dma_addr_t bus_addr, size } /* Step 2: Kick data out of streaming buffers. */ - pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages); + pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction); spin_unlock_irqrestore(&iommu->lock, flags); } @@ -768,7 +818,7 @@ void pci_dma_sync_sg_for_cpu(struct pci_dev *pdev, struct scatterlist *sglist, i i--; npages = (IO_PAGE_ALIGN(sglist[i].dma_address + sglist[i].dma_length) - bus_addr) >> IO_PAGE_SHIFT; - pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages); + pci_strbuf_flush(strbuf, iommu, bus_addr, ctx, npages, direction); spin_unlock_irqrestore(&iommu->lock, flags); } diff --git a/arch/sparc64/kernel/pci_psycho.c b/arch/sparc64/kernel/pci_psycho.c index 3567fa879e1..534320ef0db 100644 --- a/arch/sparc64/kernel/pci_psycho.c +++ b/arch/sparc64/kernel/pci_psycho.c @@ -1212,7 +1212,7 @@ static void __init psycho_iommu_init(struct pci_controller_info *p) /* Setup initial software IOMMU state. */ spin_lock_init(&iommu->lock); - iommu->iommu_cur_ctx = 0; + iommu->ctx_lowest_free = 1; /* Register addresses. */ iommu->iommu_control = p->pbm_A.controller_regs + PSYCHO_IOMMU_CONTROL; diff --git a/arch/sparc64/kernel/pci_sabre.c b/arch/sparc64/kernel/pci_sabre.c index 5525d1ec4af..53d333b4a4e 100644 --- a/arch/sparc64/kernel/pci_sabre.c +++ b/arch/sparc64/kernel/pci_sabre.c @@ -1265,7 +1265,7 @@ static void __init sabre_iommu_init(struct pci_controller_info *p, /* Setup initial software IOMMU state. */ spin_lock_init(&iommu->lock); - iommu->iommu_cur_ctx = 0; + iommu->ctx_lowest_free = 1; /* Register addresses. */ iommu->iommu_control = p->pbm_A.controller_regs + SABRE_IOMMU_CONTROL; diff --git a/arch/sparc64/kernel/pci_schizo.c b/arch/sparc64/kernel/pci_schizo.c index e93fcadc372..5753175b94e 100644 --- a/arch/sparc64/kernel/pci_schizo.c +++ b/arch/sparc64/kernel/pci_schizo.c @@ -1753,7 +1753,7 @@ static void schizo_pbm_iommu_init(struct pci_pbm_info *pbm) /* Setup initial software IOMMU state. */ spin_lock_init(&iommu->lock); - iommu->iommu_cur_ctx = 0; + iommu->ctx_lowest_free = 1; /* Register addresses, SCHIZO has iommu ctx flushing. */ iommu->iommu_control = pbm->pbm_regs + SCHIZO_IOMMU_CONTROL; diff --git a/arch/sparc64/kernel/sbus.c b/arch/sparc64/kernel/sbus.c index 76ea6455433..89f5e019f24 100644 --- a/arch/sparc64/kernel/sbus.c +++ b/arch/sparc64/kernel/sbus.c @@ -117,17 +117,25 @@ static void iommu_flush(struct sbus_iommu *iommu, u32 base, unsigned long npages #define STRBUF_TAG_VALID 0x02UL -static void sbus_strbuf_flush(struct sbus_iommu *iommu, u32 base, unsigned long npages) +static void sbus_strbuf_flush(struct sbus_iommu *iommu, u32 base, unsigned long npages, int direction) { unsigned long n; int limit; - iommu->strbuf_flushflag = 0UL; n = npages; while (n--) upa_writeq(base + (n << IO_PAGE_SHIFT), iommu->strbuf_regs + STRBUF_PFLUSH); + /* If the device could not have possibly put dirty data into + * the streaming cache, no flush-flag synchronization needs + * to be performed. + */ + if (direction == SBUS_DMA_TODEVICE) + return; + + iommu->strbuf_flushflag = 0UL; + /* Whoopee cushion! */ upa_writeq(__pa(&iommu->strbuf_flushflag), iommu->strbuf_regs + STRBUF_FSYNC); @@ -421,7 +429,7 @@ void sbus_unmap_single(struct sbus_dev *sdev, dma_addr_t dma_addr, size_t size, spin_lock_irqsave(&iommu->lock, flags); free_streaming_cluster(iommu, dma_base, size >> IO_PAGE_SHIFT); - sbus_strbuf_flush(iommu, dma_base, size >> IO_PAGE_SHIFT); + sbus_strbuf_flush(iommu, dma_base, size >> IO_PAGE_SHIFT, direction); spin_unlock_irqrestore(&iommu->lock, flags); } @@ -584,7 +592,7 @@ void sbus_unmap_sg(struct sbus_dev *sdev, struct scatterlist *sg, int nents, int iommu = sdev->bus->iommu; spin_lock_irqsave(&iommu->lock, flags); free_streaming_cluster(iommu, dvma_base, size >> IO_PAGE_SHIFT); - sbus_strbuf_flush(iommu, dvma_base, size >> IO_PAGE_SHIFT); + sbus_strbuf_flush(iommu, dvma_base, size >> IO_PAGE_SHIFT, direction); spin_unlock_irqrestore(&iommu->lock, flags); } @@ -596,7 +604,7 @@ void sbus_dma_sync_single_for_cpu(struct sbus_dev *sdev, dma_addr_t base, size_t size = (IO_PAGE_ALIGN(base + size) - (base & IO_PAGE_MASK)); spin_lock_irqsave(&iommu->lock, flags); - sbus_strbuf_flush(iommu, base & IO_PAGE_MASK, size >> IO_PAGE_SHIFT); + sbus_strbuf_flush(iommu, base & IO_PAGE_MASK, size >> IO_PAGE_SHIFT, direction); spin_unlock_irqrestore(&iommu->lock, flags); } @@ -620,7 +628,7 @@ void sbus_dma_sync_sg_for_cpu(struct sbus_dev *sdev, struct scatterlist *sg, int size = IO_PAGE_ALIGN(sg[i].dma_address + sg[i].dma_length) - base; spin_lock_irqsave(&iommu->lock, flags); - sbus_strbuf_flush(iommu, base, size >> IO_PAGE_SHIFT); + sbus_strbuf_flush(iommu, base, size >> IO_PAGE_SHIFT, direction); spin_unlock_irqrestore(&iommu->lock, flags); } -- cgit v1.2.3 From 88314ee73fd75eb32abdcb3119cd303c116d4500 Mon Sep 17 00:00:00 2001 From: "David S. Miller" Date: Tue, 31 May 2005 19:13:52 -0700 Subject: [SPARC64]: Refine PCI strbuf ctx-based flush. The initial peek read PIO of the match register is just a waste. Just do the flush writes first, as that is more efficient. Signed-off-by: David S. Miller --- arch/sparc64/kernel/pci_iommu.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'arch/sparc64/kernel') diff --git a/arch/sparc64/kernel/pci_iommu.c b/arch/sparc64/kernel/pci_iommu.c index 1807876f8c3..2803bc7c2c7 100644 --- a/arch/sparc64/kernel/pci_iommu.c +++ b/arch/sparc64/kernel/pci_iommu.c @@ -422,14 +422,12 @@ static void pci_strbuf_flush(struct pci_strbuf *strbuf, struct pci_iommu *iommu, flushreg = strbuf->strbuf_ctxflush; matchreg = PCI_STC_CTXMATCH_ADDR(strbuf, ctx); - if (pci_iommu_read(matchreg) == 0) - goto do_flush_sync; - pci_iommu_write(flushreg, ctx); - if ((val = pci_iommu_read(matchreg)) == 0) + val = pci_iommu_read(matchreg); + val &= 0xffff; + if (!val) goto do_flush_sync; - val &= 0xffff; while (val) { if (val & 0x1) pci_iommu_write(flushreg, ctx); -- cgit v1.2.3 From 1363c3cd8603a913a27e2995dccbd70d5312d8e6 Mon Sep 17 00:00:00 2001 From: Wolfgang Wander Date: Tue, 21 Jun 2005 17:14:49 -0700 Subject: [PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander Credit-to: "Richard Purdie" Signed-off-by: Ken Chen Acked-by: Ingo Molnar (partly) Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc64/kernel/sys_sparc.c | 8 ++++++++ 1 file changed, 8 insertions(+) (limited to 'arch/sparc64/kernel') diff --git a/arch/sparc64/kernel/sys_sparc.c b/arch/sparc64/kernel/sys_sparc.c index 0077f02f4b3..5f8c822a2b4 100644 --- a/arch/sparc64/kernel/sys_sparc.c +++ b/arch/sparc64/kernel/sys_sparc.c @@ -84,6 +84,10 @@ unsigned long arch_get_unmapped_area(struct file *filp, unsigned long addr, unsi return addr; } + if (len <= mm->cached_hole_size) { + mm->cached_hole_size = 0; + mm->free_area_cache = TASK_UNMAPPED_BASE; + } start_addr = addr = mm->free_area_cache; task_size -= len; @@ -103,6 +107,7 @@ full_search: if (task_size < addr) { if (start_addr != TASK_UNMAPPED_BASE) { start_addr = addr = TASK_UNMAPPED_BASE; + mm->cached_hole_size = 0; goto full_search; } return -ENOMEM; @@ -114,6 +119,9 @@ full_search: mm->free_area_cache = addr + len; return addr; } + if (addr + mm->cached_hole_size < vma->vm_start) + mm->cached_hole_size = vma->vm_start - addr; + addr = vma->vm_end; if (do_color_align) addr = COLOUR_ALIGN(addr, pgoff); -- cgit v1.2.3