diff options
Diffstat (limited to 'drivers/pci/intel-iommu.c')
-rw-r--r-- | drivers/pci/intel-iommu.c | 759 |
1 files changed, 548 insertions, 211 deletions
diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index cd389162735..e53eacd75c8 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -39,6 +39,7 @@ #include <linux/sysdev.h> #include <asm/cacheflush.h> #include <asm/iommu.h> +#include <asm/e820.h> #include "pci.h" #define ROOT_SIZE VTD_PAGE_SIZE @@ -53,6 +54,8 @@ #define DEFAULT_DOMAIN_ADDRESS_WIDTH 48 +#define MAX_AGAW_WIDTH 64 + #define DOMAIN_MAX_ADDR(gaw) ((((u64)1) << gaw) - 1) #define IOVA_PFN(addr) ((addr) >> PAGE_SHIFT) @@ -131,8 +134,6 @@ static inline void context_set_fault_enable(struct context_entry *context) context->lo &= (((u64)-1) << 2) | 1; } -#define CONTEXT_TT_MULTI_LEVEL 0 - static inline void context_set_translation_type(struct context_entry *context, unsigned long value) { @@ -217,6 +218,14 @@ static inline bool dma_pte_present(struct dma_pte *pte) return (pte->val & 3) != 0; } +/* + * This domain is a statically identity mapping domain. + * 1. This domain creats a static 1:1 mapping to all usable memory. + * 2. It maps to each iommu if successful. + * 3. Each iommu mapps to this domain if successful. + */ +struct dmar_domain *si_domain; + /* devices under the same p2p bridge are owned in one domain */ #define DOMAIN_FLAG_P2P_MULTIPLE_DEVICES (1 << 0) @@ -225,6 +234,9 @@ static inline bool dma_pte_present(struct dma_pte *pte) */ #define DOMAIN_FLAG_VIRTUAL_MACHINE (1 << 1) +/* si_domain contains mulitple devices */ +#define DOMAIN_FLAG_STATIC_IDENTITY (1 << 2) + struct dmar_domain { int id; /* domain id */ unsigned long iommu_bmp; /* bitmap of iommus this domain uses*/ @@ -256,6 +268,7 @@ struct device_domain_info { u8 bus; /* PCI bus number */ u8 devfn; /* PCI devfn number */ struct pci_dev *dev; /* it's NULL for PCIE-to-PCI bridge */ + struct intel_iommu *iommu; /* IOMMU used by this device */ struct dmar_domain *domain; /* pointer to domain */ }; @@ -401,17 +414,13 @@ void free_iova_mem(struct iova *iova) static inline int width_to_agaw(int width); -/* calculate agaw for each iommu. - * "SAGAW" may be different across iommus, use a default agaw, and - * get a supported less agaw for iommus that don't support the default agaw. - */ -int iommu_calculate_agaw(struct intel_iommu *iommu) +static int __iommu_calculate_agaw(struct intel_iommu *iommu, int max_gaw) { unsigned long sagaw; int agaw = -1; sagaw = cap_sagaw(iommu->cap); - for (agaw = width_to_agaw(DEFAULT_DOMAIN_ADDRESS_WIDTH); + for (agaw = width_to_agaw(max_gaw); agaw >= 0; agaw--) { if (test_bit(agaw, &sagaw)) break; @@ -420,12 +429,32 @@ int iommu_calculate_agaw(struct intel_iommu *iommu) return agaw; } -/* in native case, each domain is related to only one iommu */ +/* + * Calculate max SAGAW for each iommu. + */ +int iommu_calculate_max_sagaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, MAX_AGAW_WIDTH); +} + +/* + * calculate agaw for each iommu. + * "SAGAW" may be different across iommus, use a default agaw, and + * get a supported less agaw for iommus that don't support the default agaw. + */ +int iommu_calculate_agaw(struct intel_iommu *iommu) +{ + return __iommu_calculate_agaw(iommu, DEFAULT_DOMAIN_ADDRESS_WIDTH); +} + +/* This functionin only returns single iommu in a domain */ static struct intel_iommu *domain_get_iommu(struct dmar_domain *domain) { int iommu_id; + /* si_domain and vm domain should not get here. */ BUG_ON(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE); + BUG_ON(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY); iommu_id = find_first_bit(&domain->iommu_bmp, g_num_of_iommus); if (iommu_id < 0 || iommu_id >= g_num_of_iommus) @@ -809,7 +838,7 @@ static int iommu_alloc_root_entry(struct intel_iommu *iommu) static void iommu_set_root_entry(struct intel_iommu *iommu) { void *addr; - u32 cmd, sts; + u32 sts; unsigned long flag; addr = iommu->root_entry; @@ -817,12 +846,11 @@ static void iommu_set_root_entry(struct intel_iommu *iommu) spin_lock_irqsave(&iommu->register_lock, flag); dmar_writeq(iommu->reg + DMAR_RTADDR_REG, virt_to_phys(addr)); - cmd = iommu->gcmd | DMA_GCMD_SRTP; - writel(cmd, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd | DMA_GCMD_SRTP, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_RTPS), sts); + readl, (sts & DMA_GSTS_RTPS), sts); spin_unlock_irqrestore(&iommu->register_lock, flag); } @@ -834,39 +862,25 @@ static void iommu_flush_write_buffer(struct intel_iommu *iommu) if (!rwbf_quirk && !cap_rwbf(iommu->cap)) return; - val = iommu->gcmd | DMA_GCMD_WBF; spin_lock_irqsave(&iommu->register_lock, flag); - writel(val, iommu->reg + DMAR_GCMD_REG); + writel(iommu->gcmd | DMA_GCMD_WBF, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (!(val & DMA_GSTS_WBFS)), val); + readl, (!(val & DMA_GSTS_WBFS)), val); spin_unlock_irqrestore(&iommu->register_lock, flag); } /* return value determine if we need a write buffer flush */ -static int __iommu_flush_context(struct intel_iommu *iommu, - u16 did, u16 source_id, u8 function_mask, u64 type, - int non_present_entry_flush) +static void __iommu_flush_context(struct intel_iommu *iommu, + u16 did, u16 source_id, u8 function_mask, + u64 type) { u64 val = 0; unsigned long flag; - /* - * In the non-present entry flush case, if hardware doesn't cache - * non-present entry we do nothing and if hardware cache non-present - * entry, we flush entries of domain 0 (the domain id is used to cache - * any non-present entries) - */ - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - switch (type) { case DMA_CCMD_GLOBAL_INVL: val = DMA_CCMD_GLOBAL_INVL; @@ -891,33 +905,16 @@ static int __iommu_flush_context(struct intel_iommu *iommu, dmar_readq, (!(val & DMA_CCMD_ICC)), val); spin_unlock_irqrestore(&iommu->register_lock, flag); - - /* flush context entry will implicitly flush write buffer */ - return 0; } /* return value determine if we need a write buffer flush */ -static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int size_order, u64 type, - int non_present_entry_flush) +static void __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int size_order, u64 type) { int tlb_offset = ecap_iotlb_offset(iommu->ecap); u64 val = 0, val_iva = 0; unsigned long flag; - /* - * In the non-present entry flush case, if hardware doesn't cache - * non-present entry we do nothing and if hardware cache non-present - * entry, we flush entries of domain 0 (the domain id is used to cache - * any non-present entries) - */ - if (non_present_entry_flush) { - if (!cap_caching_mode(iommu->cap)) - return 1; - else - did = 0; - } - switch (type) { case DMA_TLB_GLOBAL_FLUSH: /* global flush doesn't need set IVA_REG */ @@ -965,37 +962,101 @@ static int __iommu_flush_iotlb(struct intel_iommu *iommu, u16 did, pr_debug("IOMMU: tlb flush request %Lx, actual %Lx\n", (unsigned long long)DMA_TLB_IIRG(type), (unsigned long long)DMA_TLB_IAIG(val)); - /* flush iotlb entry will implicitly flush write buffer */ - return 0; } -static int iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, - u64 addr, unsigned int pages, int non_present_entry_flush) +static struct device_domain_info *iommu_support_dev_iotlb( + struct dmar_domain *domain, int segment, u8 bus, u8 devfn) +{ + int found = 0; + unsigned long flags; + struct device_domain_info *info; + struct intel_iommu *iommu = device_to_iommu(segment, bus, devfn); + + if (!ecap_dev_iotlb_support(iommu->ecap)) + return NULL; + + if (!iommu->qi) + return NULL; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) + if (info->bus == bus && info->devfn == devfn) { + found = 1; + break; + } + spin_unlock_irqrestore(&device_domain_lock, flags); + + if (!found || !info->dev) + return NULL; + + if (!pci_find_ext_capability(info->dev, PCI_EXT_CAP_ID_ATS)) + return NULL; + + if (!dmar_find_matched_atsr_unit(info->dev)) + return NULL; + + info->iommu = iommu; + + return info; +} + +static void iommu_enable_dev_iotlb(struct device_domain_info *info) +{ + if (!info) + return; + + pci_enable_ats(info->dev, VTD_PAGE_SHIFT); +} + +static void iommu_disable_dev_iotlb(struct device_domain_info *info) +{ + if (!info->dev || !pci_ats_enabled(info->dev)) + return; + + pci_disable_ats(info->dev); +} + +static void iommu_flush_dev_iotlb(struct dmar_domain *domain, + u64 addr, unsigned mask) { - unsigned int mask; + u16 sid, qdep; + unsigned long flags; + struct device_domain_info *info; + + spin_lock_irqsave(&device_domain_lock, flags); + list_for_each_entry(info, &domain->devices, link) { + if (!info->dev || !pci_ats_enabled(info->dev)) + continue; + + sid = info->bus << 8 | info->devfn; + qdep = pci_ats_queue_depth(info->dev); + qi_flush_dev_iotlb(info->iommu, sid, qdep, addr, mask); + } + spin_unlock_irqrestore(&device_domain_lock, flags); +} + +static void iommu_flush_iotlb_psi(struct intel_iommu *iommu, u16 did, + u64 addr, unsigned int pages) +{ + unsigned int mask = ilog2(__roundup_pow_of_two(pages)); BUG_ON(addr & (~VTD_PAGE_MASK)); BUG_ON(pages == 0); - /* Fallback to domain selective flush if no PSI support */ - if (!cap_pgsel_inv(iommu->cap)) - return iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH, - non_present_entry_flush); - /* + * Fallback to domain selective flush if no PSI support or the size is + * too big. * PSI requires page size to be 2 ^ x, and the base address is naturally * aligned to the size */ - mask = ilog2(__roundup_pow_of_two(pages)); - /* Fallback to domain selective flush if size is too big */ - if (mask > cap_max_amask_val(iommu->cap)) - return iommu->flush.flush_iotlb(iommu, did, 0, 0, - DMA_TLB_DSI_FLUSH, non_present_entry_flush); - - return iommu->flush.flush_iotlb(iommu, did, addr, mask, - DMA_TLB_PSI_FLUSH, - non_present_entry_flush); + if (!cap_pgsel_inv(iommu->cap) || mask > cap_max_amask_val(iommu->cap)) + iommu->flush.flush_iotlb(iommu, did, 0, 0, + DMA_TLB_DSI_FLUSH); + else + iommu->flush.flush_iotlb(iommu, did, addr, mask, + DMA_TLB_PSI_FLUSH); + if (did) + iommu_flush_dev_iotlb(iommu->domains[did], addr, mask); } static void iommu_disable_protect_mem_regions(struct intel_iommu *iommu) @@ -1021,13 +1082,13 @@ static int iommu_enable_translation(struct intel_iommu *iommu) unsigned long flags; spin_lock_irqsave(&iommu->register_lock, flags); - writel(iommu->gcmd|DMA_GCMD_TE, iommu->reg + DMAR_GCMD_REG); + iommu->gcmd |= DMA_GCMD_TE; + writel(iommu->gcmd, iommu->reg + DMAR_GCMD_REG); /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (sts & DMA_GSTS_TES), sts); + readl, (sts & DMA_GSTS_TES), sts); - iommu->gcmd |= DMA_GCMD_TE; spin_unlock_irqrestore(&iommu->register_lock, flags); return 0; } @@ -1043,7 +1104,7 @@ static int iommu_disable_translation(struct intel_iommu *iommu) /* Make sure hardware complete it */ IOMMU_WAIT_OP(iommu, DMAR_GSTS_REG, - readl, (!(sts & DMA_GSTS_TES)), sts); + readl, (!(sts & DMA_GSTS_TES)), sts); spin_unlock_irqrestore(&iommu->register_lock, flag); return 0; @@ -1142,48 +1203,71 @@ void free_dmar_iommu(struct intel_iommu *iommu) free_context_table(iommu); } -static struct dmar_domain * iommu_alloc_domain(struct intel_iommu *iommu) +static struct dmar_domain *alloc_domain(void) { - unsigned long num; - unsigned long ndomains; struct dmar_domain *domain; - unsigned long flags; domain = alloc_domain_mem(); if (!domain) return NULL; + memset(&domain->iommu_bmp, 0, sizeof(unsigned long)); + domain->flags = 0; + + return domain; +} + +static int iommu_attach_domain(struct dmar_domain *domain, + struct intel_iommu *iommu) +{ + int num; + unsigned long ndomains; + unsigned long flags; + ndomains = cap_ndoms(iommu->cap); spin_lock_irqsave(&iommu->lock, flags); + num = find_first_zero_bit(iommu->domain_ids, ndomains); if (num >= ndomains) { spin_unlock_irqrestore(&iommu->lock, flags); - free_domain_mem(domain); printk(KERN_ERR "IOMMU: no free domain ids\n"); - return NULL; + return -ENOMEM; } - set_bit(num, iommu->domain_ids); domain->id = num; - memset(&domain->iommu_bmp, 0, sizeof(unsigned long)); + set_bit(num, iommu->domain_ids); set_bit(iommu->seq_id, &domain->iommu_bmp); - domain->flags = 0; iommu->domains[num] = domain; spin_unlock_irqrestore(&iommu->lock, flags); - return domain; + return 0; } -static void iommu_free_domain(struct dmar_domain *domain) +static void iommu_detach_domain(struct dmar_domain *domain, + struct intel_iommu *iommu) { unsigned long flags; - struct intel_iommu *iommu; - - iommu = domain_get_iommu(domain); + int num, ndomains; + int found = 0; spin_lock_irqsave(&iommu->lock, flags); - clear_bit(domain->id, iommu->domain_ids); + ndomains = cap_ndoms(iommu->cap); + num = find_first_bit(iommu->domain_ids, ndomains); + for (; num < ndomains; ) { + if (iommu->domains[num] == domain) { + found = 1; + break; + } + num = find_next_bit(iommu->domain_ids, + cap_ndoms(iommu->cap), num+1); + } + + if (found) { + clear_bit(num, iommu->domain_ids); + clear_bit(iommu->seq_id, &domain->iommu_bmp); + iommu->domains[num] = NULL; + } spin_unlock_irqrestore(&iommu->lock, flags); } @@ -1303,6 +1387,8 @@ static int domain_init(struct dmar_domain *domain, int guest_width) static void domain_exit(struct dmar_domain *domain) { + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; u64 end; /* Domain 0 is reserved, so dont process it */ @@ -1321,12 +1407,15 @@ static void domain_exit(struct dmar_domain *domain) /* free page tables */ dma_pte_free_pagetable(domain, 0, end); - iommu_free_domain(domain); + for_each_active_iommu(iommu, drhd) + if (test_bit(iommu->seq_id, &domain->iommu_bmp)) + iommu_detach_domain(domain, iommu); + free_domain_mem(domain); } -static int domain_context_mapping_one(struct dmar_domain *domain, - int segment, u8 bus, u8 devfn) +static int domain_context_mapping_one(struct dmar_domain *domain, int segment, + u8 bus, u8 devfn, int translation) { struct context_entry *context; unsigned long flags; @@ -1336,10 +1425,14 @@ static int domain_context_mapping_one(struct dmar_domain *domain, unsigned long ndomains; int id; int agaw; + struct device_domain_info *info = NULL; pr_debug("Set context mapping for %02x:%02x.%d\n", bus, PCI_SLOT(devfn), PCI_FUNC(devfn)); + BUG_ON(!domain->pgd); + BUG_ON(translation != CONTEXT_TT_PASS_THROUGH && + translation != CONTEXT_TT_MULTI_LEVEL); iommu = device_to_iommu(segment, bus, devfn); if (!iommu) @@ -1357,7 +1450,8 @@ static int domain_context_mapping_one(struct dmar_domain *domain, id = domain->id; pgd = domain->pgd; - if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) { + if (domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE || + domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) { int found = 0; /* find an available domain id for this device in iommu */ @@ -1382,6 +1476,7 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } set_bit(num, iommu->domain_ids); + set_bit(iommu->seq_id, &domain->iommu_bmp); iommu->domains[num] = domain; id = num; } @@ -1399,21 +1494,44 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } context_set_domain_id(context, id); - context_set_address_width(context, iommu->agaw); - context_set_address_root(context, virt_to_phys(pgd)); - context_set_translation_type(context, CONTEXT_TT_MULTI_LEVEL); + + if (translation != CONTEXT_TT_PASS_THROUGH) { + info = iommu_support_dev_iotlb(domain, segment, bus, devfn); + translation = info ? CONTEXT_TT_DEV_IOTLB : + CONTEXT_TT_MULTI_LEVEL; + } + /* + * In pass through mode, AW must be programmed to indicate the largest + * AGAW value supported by hardware. And ASR is ignored by hardware. + */ + if (unlikely(translation == CONTEXT_TT_PASS_THROUGH)) + context_set_address_width(context, iommu->msagaw); + else { + context_set_address_root(context, virt_to_phys(pgd)); + context_set_address_width(context, iommu->agaw); + } + + context_set_translation_type(context, translation); context_set_fault_enable(context); context_set_present(context); domain_flush_cache(domain, context, sizeof(*context)); - /* it's a non-present to present mapping */ - if (iommu->flush.flush_context(iommu, domain->id, - (((u16)bus) << 8) | devfn, DMA_CCMD_MASK_NOBIT, - DMA_CCMD_DEVICE_INVL, 1)) + /* + * It's a non-present to present mapping. If hardware doesn't cache + * non-present entry we only need to flush the write-buffer. If the + * _does_ cache non-present entries, then it does so in the special + * domain #0, which we have to flush: + */ + if (cap_caching_mode(iommu->cap)) { + iommu->flush.flush_context(iommu, 0, + (((u16)bus) << 8) | devfn, + DMA_CCMD_MASK_NOBIT, + DMA_CCMD_DEVICE_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH); + } else { iommu_flush_write_buffer(iommu); - else - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_DSI_FLUSH, 0); - + } + iommu_enable_dev_iotlb(info); spin_unlock_irqrestore(&iommu->lock, flags); spin_lock_irqsave(&domain->iommu_lock, flags); @@ -1426,13 +1544,15 @@ static int domain_context_mapping_one(struct dmar_domain *domain, } static int -domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) +domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev, + int translation) { int ret; struct pci_dev *tmp, *parent; ret = domain_context_mapping_one(domain, pci_domain_nr(pdev->bus), - pdev->bus->number, pdev->devfn); + pdev->bus->number, pdev->devfn, + translation); if (ret) return ret; @@ -1446,7 +1566,7 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) ret = domain_context_mapping_one(domain, pci_domain_nr(parent->bus), parent->bus->number, - parent->devfn); + parent->devfn, translation); if (ret) return ret; parent = parent->bus->self; @@ -1454,12 +1574,14 @@ domain_context_mapping(struct dmar_domain *domain, struct pci_dev *pdev) if (tmp->is_pcie) /* this is a PCIE-to-PCI bridge */ return domain_context_mapping_one(domain, pci_domain_nr(tmp->subordinate), - tmp->subordinate->number, 0); + tmp->subordinate->number, 0, + translation); else /* this is a legacy PCI bridge */ return domain_context_mapping_one(domain, pci_domain_nr(tmp->bus), tmp->bus->number, - tmp->devfn); + tmp->devfn, + translation); } static int domain_context_mapped(struct pci_dev *pdev) @@ -1540,9 +1662,8 @@ static void iommu_detach_dev(struct intel_iommu *iommu, u8 bus, u8 devfn) clear_context_table(iommu, bus, devfn); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_CCMD_GLOBAL_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); } static void domain_remove_dev_info(struct dmar_domain *domain) @@ -1561,6 +1682,7 @@ static void domain_remove_dev_info(struct dmar_domain *domain) info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); free_devinfo_mem(info); @@ -1597,6 +1719,7 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) unsigned long flags; int bus = 0, devfn = 0; int segment; + int ret; domain = find_domain(pdev); if (domain) @@ -1629,6 +1752,10 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) } } + domain = alloc_domain(); + if (!domain) + goto error; + /* Allocate new domain for the device */ drhd = dmar_find_matched_drhd_unit(pdev); if (!drhd) { @@ -1638,9 +1765,11 @@ static struct dmar_domain *get_domain_for_dev(struct pci_dev *pdev, int gaw) } iommu = drhd->iommu; - domain = iommu_alloc_domain(iommu); - if (!domain) + ret = iommu_attach_domain(domain, iommu); + if (ret) { + domain_exit(domain); goto error; + } if (domain_init(domain, gaw)) { domain_exit(domain); @@ -1714,6 +1843,8 @@ error: return find_domain(pdev); } +static int iommu_identity_mapping; + static int iommu_prepare_identity_map(struct pci_dev *pdev, unsigned long long start, unsigned long long end) @@ -1726,8 +1857,11 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev, printk(KERN_INFO "IOMMU: Setting identity map for device %s [0x%Lx - 0x%Lx]\n", pci_name(pdev), start, end); - /* page table init */ - domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); + if (iommu_identity_mapping) + domain = si_domain; + else + /* page table init */ + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); if (!domain) return -ENOMEM; @@ -1756,7 +1890,7 @@ static int iommu_prepare_identity_map(struct pci_dev *pdev, goto error; /* context entry init */ - ret = domain_context_mapping(domain, pdev); + ret = domain_context_mapping(domain, pdev, CONTEXT_TT_MULTI_LEVEL); if (!ret) return 0; error: @@ -1857,13 +1991,141 @@ static inline void iommu_prepare_isa(void) } #endif /* !CONFIG_DMAR_FLPY_WA */ -static int __init init_dmars(void) +/* Initialize each context entry as pass through.*/ +static int __init init_context_pass_through(void) +{ + struct pci_dev *pdev = NULL; + struct dmar_domain *domain; + int ret; + + for_each_pci_dev(pdev) { + domain = get_domain_for_dev(pdev, DEFAULT_DOMAIN_ADDRESS_WIDTH); + ret = domain_context_mapping(domain, pdev, + CONTEXT_TT_PASS_THROUGH); + if (ret) + return ret; + } + return 0; +} + +static int md_domain_init(struct dmar_domain *domain, int guest_width); +static int si_domain_init(void) +{ + struct dmar_drhd_unit *drhd; + struct intel_iommu *iommu; + int ret = 0; + + si_domain = alloc_domain(); + if (!si_domain) + return -EFAULT; + + + for_each_active_iommu(iommu, drhd) { + ret = iommu_attach_domain(si_domain, iommu); + if (ret) { + domain_exit(si_domain); + return -EFAULT; + } + } + + if (md_domain_init(si_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + domain_exit(si_domain); + return -EFAULT; + } + + si_domain->flags = DOMAIN_FLAG_STATIC_IDENTITY; + + return 0; +} + +static void domain_remove_one_dev_info(struct dmar_domain *domain, + struct pci_dev *pdev); +static int identity_mapping(struct pci_dev *pdev) +{ + struct device_domain_info *info; + + if (likely(!iommu_identity_mapping)) + return 0; + + + list_for_each_entry(info, &si_domain->devices, link) + if (info->dev == pdev) + return 1; + return 0; +} + +static int domain_add_dev_info(struct dmar_domain *domain, + struct pci_dev *pdev) +{ + struct device_domain_info *info; + unsigned long flags; + + info = alloc_devinfo_mem(); + if (!info) + return -ENOMEM; + + info->segment = pci_domain_nr(pdev->bus); + info->bus = pdev->bus->number; + info->devfn = pdev->devfn; + info->dev = pdev; + info->domain = domain; + + spin_lock_irqsave(&device_domain_lock, flags); + list_add(&info->link, &domain->devices); + list_add(&info->global, &device_domain_list); + pdev->dev.archdata.iommu = info; + spin_unlock_irqrestore(&device_domain_lock, flags); + + return 0; +} + +static int iommu_prepare_static_identity_mapping(void) +{ + int i; + struct pci_dev *pdev = NULL; + int ret; + + ret = si_domain_init(); + if (ret) + return -EFAULT; + + printk(KERN_INFO "IOMMU: Setting identity map:\n"); + for_each_pci_dev(pdev) { + for (i = 0; i < e820.nr_map; i++) { + struct e820entry *ei = &e820.map[i]; + + if (ei->type == E820_RAM) { + ret = iommu_prepare_identity_map(pdev, + ei->addr, ei->addr + ei->size); + if (ret) { + printk(KERN_INFO "1:1 mapping to one domain failed.\n"); + return -EFAULT; + } + } + } + ret = domain_add_dev_info(si_domain, pdev); + if (ret) + return ret; + } + + return 0; +} + +int __init init_dmars(void) { struct dmar_drhd_unit *drhd; struct dmar_rmrr_unit *rmrr; struct pci_dev *pdev; struct intel_iommu *iommu; int i, ret; + int pass_through = 1; + + /* + * In case pass through can not be enabled, iommu tries to use identity + * mapping. + */ + if (iommu_pass_through) + iommu_identity_mapping = 1; /* * for each drhd @@ -1917,7 +2179,15 @@ static int __init init_dmars(void) printk(KERN_ERR "IOMMU: allocate root entry failed\n"); goto error; } + if (!ecap_pass_through(iommu->ecap)) + pass_through = 0; } + if (iommu_pass_through) + if (!pass_through) { + printk(KERN_INFO + "Pass Through is not supported by hardware.\n"); + iommu_pass_through = 0; + } /* * Start from the sane iommu hardware state. @@ -1973,35 +2243,60 @@ static int __init init_dmars(void) } /* - * For each rmrr - * for each dev attached to rmrr - * do - * locate drhd for dev, alloc domain for dev - * allocate free domain - * allocate page table entries for rmrr - * if context not allocated for bus - * allocate and init context - * set present in root table for this bus - * init context with domain, translation etc - * endfor - * endfor + * If pass through is set and enabled, context entries of all pci + * devices are intialized by pass through translation type. */ - for_each_rmrr_units(rmrr) { - for (i = 0; i < rmrr->devices_cnt; i++) { - pdev = rmrr->devices[i]; - /* some BIOS lists non-exist devices in DMAR table */ - if (!pdev) - continue; - ret = iommu_prepare_rmrr_dev(rmrr, pdev); - if (ret) - printk(KERN_ERR - "IOMMU: mapping reserved region failed\n"); + if (iommu_pass_through) { + ret = init_context_pass_through(); + if (ret) { + printk(KERN_ERR "IOMMU: Pass through init failed.\n"); + iommu_pass_through = 0; } } - iommu_prepare_gfx_mapping(); + /* + * If pass through is not set or not enabled, setup context entries for + * identity mappings for rmrr, gfx, and isa and may fall back to static + * identity mapping if iommu_identity_mapping is set. + */ + if (!iommu_pass_through) { + if (iommu_identity_mapping) + iommu_prepare_static_identity_mapping(); + /* + * For each rmrr + * for each dev attached to rmrr + * do + * locate drhd for dev, alloc domain for dev + * allocate free domain + * allocate page table entries for rmrr + * if context not allocated for bus + * allocate and init context + * set present in root table for this bus + * init context with domain, translation etc + * endfor + * endfor + */ + printk(KERN_INFO "IOMMU: Setting RMRR:\n"); + for_each_rmrr_units(rmrr) { + for (i = 0; i < rmrr->devices_cnt; i++) { + pdev = rmrr->devices[i]; + /* + * some BIOS lists non-exist devices in DMAR + * table. + */ + if (!pdev) + continue; + ret = iommu_prepare_rmrr_dev(rmrr, pdev); + if (ret) + printk(KERN_ERR + "IOMMU: mapping reserved region failed\n"); + } + } + + iommu_prepare_gfx_mapping(); - iommu_prepare_isa(); + iommu_prepare_isa(); + } /* * for each drhd @@ -2023,10 +2318,8 @@ static int __init init_dmars(void) iommu_set_root_entry(iommu); - iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL, - 0); - iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH, - 0); + iommu->flush.flush_context(iommu, 0, 0, 0, DMA_CCMD_GLOBAL_INVL); + iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); iommu_disable_protect_mem_regions(iommu); ret = iommu_enable_translation(iommu); @@ -2112,7 +2405,8 @@ get_valid_domain_for_dev(struct pci_dev *pdev) /* make sure context mapping is ok */ if (unlikely(!domain_context_mapped(pdev))) { - ret = domain_context_mapping(domain, pdev); + ret = domain_context_mapping(domain, pdev, + CONTEXT_TT_MULTI_LEVEL); if (ret) { printk(KERN_ERR "Domain context map for %s failed", @@ -2124,6 +2418,52 @@ get_valid_domain_for_dev(struct pci_dev *pdev) return domain; } +static int iommu_dummy(struct pci_dev *pdev) +{ + return pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO; +} + +/* Check if the pdev needs to go through non-identity map and unmap process.*/ +static int iommu_no_mapping(struct pci_dev *pdev) +{ + int found; + + if (!iommu_identity_mapping) + return iommu_dummy(pdev); + + found = identity_mapping(pdev); + if (found) { + if (pdev->dma_mask > DMA_BIT_MASK(32)) + return 1; + else { + /* + * 32 bit DMA is removed from si_domain and fall back + * to non-identity mapping. + */ + domain_remove_one_dev_info(si_domain, pdev); + printk(KERN_INFO "32bit %s uses non-identity mapping\n", + pci_name(pdev)); + return 0; + } + } else { + /* + * In case of a detached 64 bit DMA device from vm, the device + * is put into si_domain for identity mapping. + */ + if (pdev->dma_mask > DMA_BIT_MASK(32)) { + int ret; + ret = domain_add_dev_info(si_domain, pdev); + if (!ret) { + printk(KERN_INFO "64bit %s uses identity mapping\n", + pci_name(pdev)); + return 1; + } + } + } + + return iommu_dummy(pdev); +} + static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, size_t size, int dir, u64 dma_mask) { @@ -2136,7 +2476,8 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, struct intel_iommu *iommu; BUG_ON(dir == DMA_NONE); - if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + + if (iommu_no_mapping(pdev)) return paddr; domain = get_valid_domain_for_dev(pdev); @@ -2173,10 +2514,11 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, if (ret) goto error; - /* it's a non-present to present mapping */ - ret = iommu_flush_iotlb_psi(iommu, domain->id, - start_paddr, size >> VTD_PAGE_SHIFT, 1); - if (ret) + /* it's a non-present to present mapping. Only flush if caching mode */ + if (cap_caching_mode(iommu->cap)) + iommu_flush_iotlb_psi(iommu, 0, start_paddr, + size >> VTD_PAGE_SHIFT); + else iommu_flush_write_buffer(iommu); return start_paddr + ((u64)paddr & (~PAGE_MASK)); @@ -2210,15 +2552,22 @@ static void flush_unmaps(void) if (!iommu) continue; - if (deferred_flush[i].next) { - iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); - for (j = 0; j < deferred_flush[i].next; j++) { - __free_iova(&deferred_flush[i].domain[j]->iovad, - deferred_flush[i].iova[j]); - } - deferred_flush[i].next = 0; + if (!deferred_flush[i].next) + continue; + + iommu->flush.flush_iotlb(iommu, 0, 0, 0, + DMA_TLB_GLOBAL_FLUSH); + for (j = 0; j < deferred_flush[i].next; j++) { + unsigned long mask; + struct iova *iova = deferred_flush[i].iova[j]; + + mask = (iova->pfn_hi - iova->pfn_lo + 1) << PAGE_SHIFT; + mask = ilog2(mask >> VTD_PAGE_SHIFT); + iommu_flush_dev_iotlb(deferred_flush[i].domain[j], + iova->pfn_lo << PAGE_SHIFT, mask); + __free_iova(&deferred_flush[i].domain[j]->iovad, iova); } + deferred_flush[i].next = 0; } list_size = 0; @@ -2269,8 +2618,9 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, struct iova *iova; struct intel_iommu *iommu; - if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + if (iommu_no_mapping(pdev)) return; + domain = find_domain(pdev); BUG_ON(!domain); @@ -2291,9 +2641,8 @@ static void intel_unmap_page(struct device *dev, dma_addr_t dev_addr, /* free page tables */ dma_pte_free_pagetable(domain, start_addr, start_addr + size); if (intel_iommu_strict) { - if (iommu_flush_iotlb_psi(iommu, - domain->id, start_addr, size >> VTD_PAGE_SHIFT, 0)) - iommu_flush_write_buffer(iommu); + iommu_flush_iotlb_psi(iommu, domain->id, start_addr, + size >> VTD_PAGE_SHIFT); /* free iova */ __free_iova(&domain->iovad, iova); } else { @@ -2361,7 +2710,7 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, struct scatterlist *sg; struct intel_iommu *iommu; - if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + if (iommu_no_mapping(pdev)) return; domain = find_domain(pdev); @@ -2384,9 +2733,8 @@ static void intel_unmap_sg(struct device *hwdev, struct scatterlist *sglist, /* free page tables */ dma_pte_free_pagetable(domain, start_addr, start_addr + size); - if (iommu_flush_iotlb_psi(iommu, domain->id, start_addr, - size >> VTD_PAGE_SHIFT, 0)) - iommu_flush_write_buffer(iommu); + iommu_flush_iotlb_psi(iommu, domain->id, start_addr, + size >> VTD_PAGE_SHIFT); /* free iova */ __free_iova(&domain->iovad, iova); @@ -2423,7 +2771,7 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne struct intel_iommu *iommu; BUG_ON(dir == DMA_NONE); - if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) + if (iommu_no_mapping(pdev)) return intel_nontranslate_map_sg(hwdev, sglist, nelems, dir); domain = get_valid_domain_for_dev(pdev); @@ -2478,10 +2826,13 @@ static int intel_map_sg(struct device *hwdev, struct scatterlist *sglist, int ne offset += size; } - /* it's a non-present to present mapping */ - if (iommu_flush_iotlb_psi(iommu, domain->id, - start_addr, offset >> VTD_PAGE_SHIFT, 1)) + /* it's a non-present to present mapping. Only flush if caching mode */ + if (cap_caching_mode(iommu->cap)) + iommu_flush_iotlb_psi(iommu, 0, start_addr, + offset >> VTD_PAGE_SHIFT); + else iommu_flush_write_buffer(iommu); + return nelems; } @@ -2640,9 +2991,9 @@ static int init_iommu_hw(void) iommu_set_root_entry(iommu); iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); iommu_disable_protect_mem_regions(iommu); iommu_enable_translation(iommu); } @@ -2657,9 +3008,9 @@ static void iommu_flush_all(void) for_each_active_iommu(iommu, drhd) { iommu->flush.flush_context(iommu, 0, 0, 0, - DMA_CCMD_GLOBAL_INVL, 0); + DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, - DMA_TLB_GLOBAL_FLUSH, 0); + DMA_TLB_GLOBAL_FLUSH); } } @@ -2782,7 +3133,7 @@ int __init intel_iommu_init(void) * Check the need for DMA-remapping initialization now. * Above initialization will also be used by Interrupt-remapping. */ - if (no_iommu || swiotlb || dmar_disabled) + if (no_iommu || (swiotlb && !iommu_pass_through) || dmar_disabled) return -ENODEV; iommu_init_mempool(); @@ -2802,35 +3153,18 @@ int __init intel_iommu_init(void) init_timer(&unmap_timer); force_iommu = 1; - dma_ops = &intel_dma_ops; - init_iommu_sysfs(); - - register_iommu(&intel_iommu_ops); - return 0; -} + if (!iommu_pass_through) { + printk(KERN_INFO + "Multi-level page-table translation for DMAR.\n"); + dma_ops = &intel_dma_ops; + } else + printk(KERN_INFO + "DMAR: Pass through translation for DMAR.\n"); -static int vm_domain_add_dev_info(struct dmar_domain *domain, - struct pci_dev *pdev) -{ - struct device_domain_info *info; - unsigned long flags; - - info = alloc_devinfo_mem(); - if (!info) - return -ENOMEM; - - info->segment = pci_domain_nr(pdev->bus); - info->bus = pdev->bus->number; - info->devfn = pdev->devfn; - info->dev = pdev; - info->domain = domain; + init_iommu_sysfs(); - spin_lock_irqsave(&device_domain_lock, flags); - list_add(&info->link, &domain->devices); - list_add(&info->global, &device_domain_list); - pdev->dev.archdata.iommu = info; - spin_unlock_irqrestore(&device_domain_lock, flags); + register_iommu(&intel_iommu_ops); return 0; } @@ -2862,7 +3196,7 @@ static void iommu_detach_dependent_devices(struct intel_iommu *iommu, } } -static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, +static void domain_remove_one_dev_info(struct dmar_domain *domain, struct pci_dev *pdev) { struct device_domain_info *info; @@ -2888,6 +3222,7 @@ static void vm_domain_remove_one_dev_info(struct dmar_domain *domain, info->dev->dev.archdata.iommu = NULL; spin_unlock_irqrestore(&device_domain_lock, flags); + iommu_disable_dev_iotlb(info); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, pdev); free_devinfo_mem(info); @@ -2938,6 +3273,7 @@ static void vm_domain_remove_all_dev_info(struct dmar_domain *domain) spin_unlock_irqrestore(&device_domain_lock, flags1); + iommu_disable_dev_iotlb(info); iommu = device_to_iommu(info->segment, info->bus, info->devfn); iommu_detach_dev(iommu, info->bus, info->devfn); iommu_detach_dependent_devices(iommu, info->dev); @@ -2993,7 +3329,7 @@ static struct dmar_domain *iommu_alloc_vm_domain(void) return domain; } -static int vm_domain_init(struct dmar_domain *domain, int guest_width) +static int md_domain_init(struct dmar_domain *domain, int guest_width) { int adjust_width; @@ -3084,7 +3420,7 @@ static int intel_iommu_domain_init(struct iommu_domain *domain) "intel_iommu_domain_init: dmar_domain == NULL\n"); return -ENOMEM; } - if (vm_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { + if (md_domain_init(dmar_domain, DEFAULT_DOMAIN_ADDRESS_WIDTH)) { printk(KERN_ERR "intel_iommu_domain_init() failed\n"); vm_domain_exit(dmar_domain); @@ -3119,8 +3455,9 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, old_domain = find_domain(pdev); if (old_domain) { - if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) - vm_domain_remove_one_dev_info(old_domain, pdev); + if (dmar_domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE || + dmar_domain->flags & DOMAIN_FLAG_STATIC_IDENTITY) + domain_remove_one_dev_info(old_domain, pdev); else domain_remove_dev_info(old_domain); } @@ -3142,11 +3479,11 @@ static int intel_iommu_attach_device(struct iommu_domain *domain, return -EFAULT; } - ret = domain_context_mapping(dmar_domain, pdev); + ret = domain_add_dev_info(dmar_domain, pdev); if (ret) return ret; - ret = vm_domain_add_dev_info(dmar_domain, pdev); + ret = domain_context_mapping(dmar_domain, pdev, CONTEXT_TT_MULTI_LEVEL); return ret; } @@ -3156,7 +3493,7 @@ static void intel_iommu_detach_device(struct iommu_domain *domain, struct dmar_domain *dmar_domain = domain->priv; struct pci_dev *pdev = to_pci_dev(dev); - vm_domain_remove_one_dev_info(dmar_domain, pdev); + domain_remove_one_dev_info(dmar_domain, pdev); } static int intel_iommu_map_range(struct iommu_domain *domain, |