@@ -9,11 +9,20 @@
  * License as published by the Free Software Foundation.
  */
 
+#include <linux/slab.h>
+#include <linux/mmu_notifier.h>
+#include <linux/mmu_context.h>
+#include <linux/of.h>
 #include <linux/export.h>
 #include <linux/pci.h>
 #include <linux/memblock.h>
 #include <linux/iommu.h>
 
+#include <asm/tlb.h>
+#include <asm/powernv.h>
+#include <asm/reg.h>
+#include <asm/opal.h>
+#include <asm/io.h>
 #include <asm/iommu.h>
 #include <asm/pnv-pci.h>
 #include <asm/msi_bitmap.h>
@@ -22,6 +31,8 @@
 #include "powernv.h"
 #include "pci.h"
 
+#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
+
 /*
  * Other types of TCE cache invalidation are not functional in the
  * hardware.
@@ -371,3 +382,442 @@ struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
 
 	return gpe;
 }
+
+/* Maximum number of nvlinks per npu */
+#define NV_MAX_LINKS 6
+
+/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
+static int max_npu2_index;
+
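+/*
+ * One of these per mm that has at least one active GPU context. It is
+ * cached in mm->context.npu_context, reference counted with a kref and
+ * freed when the last GPU context for the mm is destroyed.
+ */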
+struct npu_context {
+	struct mm_struct *mm;
+	struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
+	struct mmu_notifier mn;
+	struct kref kref;
+
+	/* Callback to stop translation requests on a given GPU */
+	struct npu_context *(*release_cb)(struct npu_context *, void *);
+
+	/*
+	 * Private pointer passed to the above callback for usage by
+	 * device drivers.
+	 */
+	void *priv;
+};
+
+/*
+ * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
+ * if none are available.
+ */
+static int get_mmio_atsd_reg(struct npu *npu)
+{
+	int i;
+
+	for (i = 0; i < npu->mmio_atsd_count; i++) {
+		if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
+			return i;
+	}
+
+	return -ENOSPC;
+}
+
+static void put_mmio_atsd_reg(struct npu *npu, int reg)
+{
+	clear_bit(reg, &npu->mmio_atsd_usage);
+}
+
+/* MMIO ATSD register offsets, in units of the 8-byte registers */
+#define XTS_ATSD_AVA		1
+#define XTS_ATSD_STAT		2
+
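+/*
+ * Grab a free ATSD register (spinning until one becomes available),
+ * write the target address to the AVA register and then kick off the
+ * shootdown by writing the launch register. The eieio() orders the AVA
+ * write before the launch write. The caller is responsible for polling
+ * XTS_ATSD_STAT on the returned register and releasing it with
+ * put_mmio_atsd_reg() once the invalidate has completed.
+ */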
+static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
+				unsigned long va)
+{
+	int mmio_atsd_reg;
+
+	do {
+		mmio_atsd_reg = get_mmio_atsd_reg(npu);
+		cpu_relax();
+	} while (mmio_atsd_reg < 0);
+
+	__raw_writeq(cpu_to_be64(va),
+		npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
+	eieio();
+	__raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
+
+	return mmio_atsd_reg;
+}
+
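+/*
+ * The launch values built below use the powerpc big-endian bit macros:
+ * PPC_BIT(12) is 1UL << (63 - 12) and PPC_BITLSHIFT(38) is 63 - 38, so
+ * the least significant PID bit ends up at big-endian bit 38 of the
+ * launch register.
+ */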
+static int mmio_invalidate_pid(struct npu *npu, unsigned long pid)
+{
+	unsigned long launch;
+
+	/* IS set to invalidate matching PID */
+	launch = PPC_BIT(12);
+
+	/* PRS set to process-scoped */
+	launch |= PPC_BIT(13);
+
+	/* AP */
+	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+	/* PID */
+	launch |= pid << PPC_BITLSHIFT(38);
+
+	/* Invalidating the entire process doesn't use a va */
+	return mmio_launch_invalidate(npu, launch, 0);
+}
+
+static int mmio_invalidate_va(struct npu *npu, unsigned long va,
+			unsigned long pid)
+{
+	unsigned long launch;
+
+	/* IS set to invalidate target VA */
+	launch = 0;
+
+	/* PRS set to process scoped */
+	launch |= PPC_BIT(13);
+
+	/* AP */
+	launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
+
+	/* PID */
+	launch |= pid << PPC_BITLSHIFT(38);
+
+	return mmio_launch_invalidate(npu, launch, va);
+}
+
+#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
+
+/*
+ * Invalidate either a single address or an entire PID depending on
+ * the value of va.
+ */
+static void mmio_invalidate(struct npu_context *npu_context, int va,
+			unsigned long address)
+{
+	int i, j, reg;
+	struct npu *npu;
+	struct pnv_phb *nphb;
+	struct pci_dev *npdev;
+	struct {
+		struct npu *npu;
+		int reg;
+	} mmio_atsd_reg[NV_MAX_NPUS];
+	unsigned long pid = npu_context->mm->context.id;
+
+	/*
+	 * Loop over all the NPUs this process is active on and launch
+	 * an invalidate.
+	 */
+	for (i = 0; i <= max_npu2_index; i++) {
+		mmio_atsd_reg[i].reg = -1;
+		for (j = 0; j < NV_MAX_LINKS; j++) {
+			npdev = npu_context->npdev[i][j];
+			if (!npdev)
+				continue;
+
+			nphb = pci_bus_to_host(npdev->bus)->private_data;
+			npu = &nphb->npu;
+			mmio_atsd_reg[i].npu = npu;
+
+			if (va)
+				mmio_atsd_reg[i].reg =
+					mmio_invalidate_va(npu, address, pid);
+			else
+				mmio_atsd_reg[i].reg =
+					mmio_invalidate_pid(npu, pid);
+
+			/*
+			 * The NPU hardware forwards the shootdown to all GPUs
+			 * so we only have to launch one shootdown per NPU.
+			 */
+			break;
+		}
+	}
+
+	/*
+	 * Unfortunately the nest mmu does not support flushing specific
+	 * addresses so we have to flush the whole mm.
+	 */
+	flush_tlb_mm(npu_context->mm);
+
+	/* Wait for all invalidations to complete */
+	for (i = 0; i <= max_npu2_index; i++) {
+		if (mmio_atsd_reg[i].reg < 0)
+			continue;
+
+		/* Wait for completion */
+		npu = mmio_atsd_reg[i].npu;
+		reg = mmio_atsd_reg[i].reg;
+		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
+			cpu_relax();
+		put_mmio_atsd_reg(npu, reg);
+	}
+}
+
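+/*
+ * Called by the mmu_notifier core when the address space is being torn
+ * down. Give the device driver a chance to stop issuing translation
+ * requests before flushing anything the GPUs may still have cached for
+ * this PID.
+ */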
+static void pnv_npu2_mn_release(struct mmu_notifier *mn,
+				struct mm_struct *mm)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+
+	/* Call into device driver to stop requests to the NMMU */
+	if (npu_context->release_cb)
+		npu_context->release_cb(npu_context, npu_context->priv);
+
+	/*
+	 * There should be no more translation requests for this PID, but we
+	 * need to ensure any entries for it are removed from the TLB.
+	 */
+	mmio_invalidate(npu_context, 0, 0);
+}
+
+static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address,
+				pte_t pte)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+
+	mmio_invalidate(npu_context, 1, address);
+}
+
+static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
+				struct mm_struct *mm,
+				unsigned long address)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+
+	mmio_invalidate(npu_context, 1, address);
+}
+
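+/*
+ * The notifier passes a byte range with an exclusive end, so launch one
+ * address-based shootdown for each page in [start, end).
+ */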
+static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
+					struct mm_struct *mm,
+					unsigned long start, unsigned long end)
+{
+	struct npu_context *npu_context = mn_to_npu_context(mn);
+	unsigned long address;
+
+	for (address = start; address < end; address += PAGE_SIZE)
+		mmio_invalidate(npu_context, 1, address);
+}
+
+static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
+	.release = pnv_npu2_mn_release,
+	.change_pte = pnv_npu2_mn_change_pte,
+	.invalidate_page = pnv_npu2_mn_invalidate_page,
+	.invalidate_range = pnv_npu2_mn_invalidate_range,
+};
+
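+/*
+ * Rough sketch of how a GPU device driver is expected to use the API
+ * exported below (illustrative only; the my_* names are made up and
+ * error handling is omitted):
+ *
+ *	ctx = pnv_npu2_init_context(gpdev, flags, my_release_cb, my_data);
+ *
+ *	(for each translation fault reported by the device:)
+ *	rc = pnv_npu2_handle_fault(ctx, ea, fault_flags, status, count);
+ *
+ *	(at device or process teardown:)
+ *	pnv_npu2_destroy_context(ctx, gpdev);
+ */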
+/*
+ * Call into OPAL to setup the nmmu context for the current task in
+ * the NPU. This must be called to setup the context tables before the
+ * GPU issues ATRs (address translation requests). gpdev should be a
+ * pointer to the PCIe GPU device.
+ *
+ * A release callback should be registered to allow a device driver to
+ * be notified that it should not launch any new translation requests
+ * as the final TLB invalidate is about to occur.
+ *
+ * Returns an error pointer if no contexts are currently available, or
+ * an npu_context which should be passed to pnv_npu2_handle_fault().
+ *
+ * mmap_sem must be held in write mode.
+ */
+struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
+			unsigned long flags,
+			struct npu_context *(*cb)(struct npu_context *, void *),
+			void *priv)
+{
+	int rc;
+	u32 nvlink_index;
+	struct device_node *nvlink_dn;
+	struct mm_struct *mm = current->mm;
+	struct pnv_phb *nphb;
+	struct npu *npu;
+	struct npu_context *npu_context;
+
+	/*
+	 * At present we don't support GPUs connected to multiple NPUs and I'm
+	 * not sure the hardware does either.
+	 */
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return ERR_PTR(-ENODEV);
+
+	if (!npdev)
+		/* No nvlink associated with this GPU device */
+		return ERR_PTR(-ENODEV);
+
+	if (!mm) {
+		/* kernel thread contexts are not supported */
+		return ERR_PTR(-EINVAL);
+	}
+
+	nphb = pci_bus_to_host(npdev->bus)->private_data;
+	npu = &nphb->npu;
+
+	/*
+	 * Setup the NPU context table for a particular GPU. These need to be
+	 * per-GPU as we need the tables to filter ATSDs when there are no
+	 * active contexts on a particular GPU.
+	 */
+	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
+				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+	if (rc < 0)
+		return ERR_PTR(-ENOSPC);
+
+	/*
+	 * We store the npu pci device so we can more easily get at the
+	 * associated npus.
+	 */
+	npu_context = mm->context.npu_context;
+	if (!npu_context) {
+		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
+		if (!npu_context)
+			return ERR_PTR(-ENOMEM);
+
+		mm->context.npu_context = npu_context;
+		npu_context->mm = mm;
+		npu_context->mn.ops = &nv_nmmu_notifier_ops;
+		__mmu_notifier_register(&npu_context->mn, mm);
+		kref_init(&npu_context->kref);
+	} else {
+		kref_get(&npu_context->kref);
+	}
+
+	npu_context->release_cb = cb;
+	npu_context->priv = priv;
+	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+							&nvlink_index)))
+		return ERR_PTR(-ENODEV);
+	npu_context->npdev[npu->index][nvlink_index] = npdev;
+
+	return npu_context;
+}
+EXPORT_SYMBOL(pnv_npu2_init_context);
+
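+/*
+ * kref release function, run when the last pnv_npu2_destroy_context()
+ * call drops the final reference for this mm.
+ */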
+static void pnv_npu2_release_context(struct kref *kref)
+{
+	struct npu_context *npu_context =
+		container_of(kref, struct npu_context, kref);
+
+	npu_context->mm->context.npu_context = NULL;
+	mmu_notifier_unregister(&npu_context->mn,
+				npu_context->mm);
+
+	kfree(npu_context);
+}
+
+void pnv_npu2_destroy_context(struct npu_context *npu_context,
+			struct pci_dev *gpdev)
+{
+	struct pnv_phb *nphb, *phb;
+	struct npu *npu;
+	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
+	struct device_node *nvlink_dn;
+	u32 nvlink_index;
+
+	if (WARN_ON(!npdev))
+		return;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return;
+
+	nphb = pci_bus_to_host(npdev->bus)->private_data;
+	npu = &nphb->npu;
+	phb = pci_bus_to_host(gpdev->bus)->private_data;
+	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
+	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
+							&nvlink_index)))
+		return;
+	npu_context->npdev[npu->index][nvlink_index] = NULL;
+	opal_npu_destroy_context(phb->opal_id, npu_context->mm->context.id,
+				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
+	kref_put(&npu_context->kref, pnv_npu2_release_context);
+}
+EXPORT_SYMBOL(pnv_npu2_destroy_context);
+
+/*
+ * Fault pages in on behalf of the device for the given effective
+ * addresses. Assumes mmap_sem is held for the context's associated mm.
+ */
+int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
+			unsigned long *flags, unsigned long *status, int count)
+{
+	u64 rc = 0, result = 0;
+	int i, is_write;
+	struct page *page[1];
+
+	/* mmap_sem should be held so the mm_struct must be present */
+	struct mm_struct *mm = context->mm;
+
+	if (!firmware_has_feature(FW_FEATURE_OPAL))
+		return -ENODEV;
+
+	WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
+
+	for (i = 0; i < count; i++) {
+		is_write = flags[i] & NPU2_WRITE;
+		rc = get_user_pages_remote(NULL, mm, ea[i], 1,
+					is_write ? FOLL_WRITE : 0,
+					page, NULL, NULL);
+
+		/*
+		 * To support virtualised environments we will have to do an
+		 * access to the page to ensure it gets faulted into the
+		 * hypervisor. For the moment virtualisation is not supported in
+		 * other areas so leave the access out.
+		 */
+		if (rc != 1) {
+			status[i] = rc;
+			result = -EFAULT;
+			continue;
+		}
+
+		status[i] = 0;
+		put_page(page[0]);
+	}
+
+	return result;
+}
+EXPORT_SYMBOL(pnv_npu2_handle_fault);
+
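+/*
+ * Per-PHB initialisation: map each GPU device found under this NPU PHB
+ * to an LPAR in OPAL and ioremap the MMIO ATSD registers advertised by
+ * the "ibm,mmio-atsd" device tree property.
+ */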
+int pnv_npu2_init(struct pnv_phb *phb)
+{
+	unsigned int i;
+	u64 mmio_atsd;
+	struct device_node *dn;
+	struct pci_dev *gpdev;
+	static int npu_index;
+	uint64_t rc = 0;
+
+	for_each_child_of_node(phb->hose->dn, dn) {
+		gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
+		if (gpdev) {
+			rc = opal_npu_map_lpar(phb->opal_id,
+				PCI_DEVID(gpdev->bus->number, gpdev->devfn),
+				0, 0);
+			if (rc)
+				dev_err(&gpdev->dev,
+					"Error %lld mapping device to LPAR\n",
+					rc);
+		}
+	}
+
+	for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
+							i, &mmio_atsd); i++)
+		phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
+
+	pr_info("NPU%lld: Found %d MMIO ATSD registers\n", phb->opal_id, i);
+	phb->npu.mmio_atsd_count = i;
+	phb->npu.mmio_atsd_usage = 0;
+	npu_index++;
+	if (WARN_ON(npu_index >= NV_MAX_NPUS))
+		return -ENOSPC;
+	max_npu2_index = npu_index;
+	phb->npu.index = npu_index;
+
+	return 0;
+}