npu-dma.c 21 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859
  1. /*
  2. * This file implements the DMA operations for NVLink devices. The NPU
  3. * devices all point to the same iommu table as the parent PCI device.
  4. *
  5. * Copyright Alistair Popple, IBM Corporation 2015.
  6. *
  7. * This program is free software; you can redistribute it and/or
  8. * modify it under the terms of version 2 of the GNU General Public
  9. * License as published by the Free Software Foundation.
  10. */
  11. #include <linux/slab.h>
  12. #include <linux/mmu_notifier.h>
  13. #include <linux/mmu_context.h>
  14. #include <linux/of.h>
  15. #include <linux/export.h>
  16. #include <linux/pci.h>
  17. #include <linux/memblock.h>
  18. #include <linux/iommu.h>
  19. #include <asm/tlb.h>
  20. #include <asm/powernv.h>
  21. #include <asm/reg.h>
  22. #include <asm/opal.h>
  23. #include <asm/io.h>
  24. #include <asm/iommu.h>
  25. #include <asm/pnv-pci.h>
  26. #include <asm/msi_bitmap.h>
  27. #include <asm/opal.h>
  28. #include "powernv.h"
  29. #include "pci.h"
  30. #define npu_to_phb(x) container_of(x, struct pnv_phb, npu)
  31. /*
  32. * Other types of TCE cache invalidation are not functional in the
  33. * hardware.
  34. */
  35. static struct pci_dev *get_pci_dev(struct device_node *dn)
  36. {
  37. return PCI_DN(dn)->pcidev;
  38. }
  39. /* Given a NPU device get the associated PCI device. */
  40. struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
  41. {
  42. struct device_node *dn;
  43. struct pci_dev *gpdev;
  44. if (WARN_ON(!npdev))
  45. return NULL;
  46. if (WARN_ON(!npdev->dev.of_node))
  47. return NULL;
  48. /* Get assoicated PCI device */
  49. dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
  50. if (!dn)
  51. return NULL;
  52. gpdev = get_pci_dev(dn);
  53. of_node_put(dn);
  54. return gpdev;
  55. }
  56. EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
  57. /* Given the real PCI device get a linked NPU device. */
  58. struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
  59. {
  60. struct device_node *dn;
  61. struct pci_dev *npdev;
  62. if (WARN_ON(!gpdev))
  63. return NULL;
  64. /* Not all PCI devices have device-tree nodes */
  65. if (!gpdev->dev.of_node)
  66. return NULL;
  67. /* Get assoicated PCI device */
  68. dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
  69. if (!dn)
  70. return NULL;
  71. npdev = get_pci_dev(dn);
  72. of_node_put(dn);
  73. return npdev;
  74. }
  75. EXPORT_SYMBOL(pnv_pci_get_npu_dev);
  76. #define NPU_DMA_OP_UNSUPPORTED() \
  77. dev_err_once(dev, "%s operation unsupported for NVLink devices\n", \
  78. __func__)
  79. static void *dma_npu_alloc(struct device *dev, size_t size,
  80. dma_addr_t *dma_handle, gfp_t flag,
  81. unsigned long attrs)
  82. {
  83. NPU_DMA_OP_UNSUPPORTED();
  84. return NULL;
  85. }
  86. static void dma_npu_free(struct device *dev, size_t size,
  87. void *vaddr, dma_addr_t dma_handle,
  88. unsigned long attrs)
  89. {
  90. NPU_DMA_OP_UNSUPPORTED();
  91. }
  92. static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page,
  93. unsigned long offset, size_t size,
  94. enum dma_data_direction direction,
  95. unsigned long attrs)
  96. {
  97. NPU_DMA_OP_UNSUPPORTED();
  98. return 0;
  99. }
  100. static int dma_npu_map_sg(struct device *dev, struct scatterlist *sglist,
  101. int nelems, enum dma_data_direction direction,
  102. unsigned long attrs)
  103. {
  104. NPU_DMA_OP_UNSUPPORTED();
  105. return 0;
  106. }
  107. static int dma_npu_dma_supported(struct device *dev, u64 mask)
  108. {
  109. NPU_DMA_OP_UNSUPPORTED();
  110. return 0;
  111. }
  112. static u64 dma_npu_get_required_mask(struct device *dev)
  113. {
  114. NPU_DMA_OP_UNSUPPORTED();
  115. return 0;
  116. }
  117. static const struct dma_map_ops dma_npu_ops = {
  118. .map_page = dma_npu_map_page,
  119. .map_sg = dma_npu_map_sg,
  120. .alloc = dma_npu_alloc,
  121. .free = dma_npu_free,
  122. .dma_supported = dma_npu_dma_supported,
  123. .get_required_mask = dma_npu_get_required_mask,
  124. };
  125. /*
  126. * Returns the PE assoicated with the PCI device of the given
  127. * NPU. Returns the linked pci device if pci_dev != NULL.
  128. */
  129. static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
  130. struct pci_dev **gpdev)
  131. {
  132. struct pnv_phb *phb;
  133. struct pci_controller *hose;
  134. struct pci_dev *pdev;
  135. struct pnv_ioda_pe *pe;
  136. struct pci_dn *pdn;
  137. pdev = pnv_pci_get_gpu_dev(npe->pdev);
  138. if (!pdev)
  139. return NULL;
  140. pdn = pci_get_pdn(pdev);
  141. if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
  142. return NULL;
  143. hose = pci_bus_to_host(pdev->bus);
  144. phb = hose->private_data;
  145. pe = &phb->ioda.pe_array[pdn->pe_number];
  146. if (gpdev)
  147. *gpdev = pdev;
  148. return pe;
  149. }
  150. long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
  151. struct iommu_table *tbl)
  152. {
  153. struct pnv_phb *phb = npe->phb;
  154. int64_t rc;
  155. const unsigned long size = tbl->it_indirect_levels ?
  156. tbl->it_level_size : tbl->it_size;
  157. const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
  158. const __u64 win_size = tbl->it_size << tbl->it_page_shift;
  159. pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
  160. start_addr, start_addr + win_size - 1,
  161. IOMMU_PAGE_SIZE(tbl));
  162. rc = opal_pci_map_pe_dma_window(phb->opal_id,
  163. npe->pe_number,
  164. npe->pe_number,
  165. tbl->it_indirect_levels + 1,
  166. __pa(tbl->it_base),
  167. size << 3,
  168. IOMMU_PAGE_SIZE(tbl));
  169. if (rc) {
  170. pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
  171. return rc;
  172. }
  173. pnv_pci_ioda2_tce_invalidate_entire(phb, false);
  174. /* Add the table to the list so its TCE cache will get invalidated */
  175. pnv_pci_link_table_and_group(phb->hose->node, num,
  176. tbl, &npe->table_group);
  177. return 0;
  178. }
  179. long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
  180. {
  181. struct pnv_phb *phb = npe->phb;
  182. int64_t rc;
  183. pe_info(npe, "Removing DMA window\n");
  184. rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
  185. npe->pe_number,
  186. 0/* levels */, 0/* table address */,
  187. 0/* table size */, 0/* page size */);
  188. if (rc) {
  189. pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
  190. return rc;
  191. }
  192. pnv_pci_ioda2_tce_invalidate_entire(phb, false);
  193. pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
  194. &npe->table_group);
  195. return 0;
  196. }
  197. /*
  198. * Enables 32 bit DMA on NPU.
  199. */
  200. static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
  201. {
  202. struct pci_dev *gpdev;
  203. struct pnv_ioda_pe *gpe;
  204. int64_t rc;
  205. /*
  206. * Find the assoicated PCI devices and get the dma window
  207. * information from there.
  208. */
  209. if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
  210. return;
  211. gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
  212. if (!gpe)
  213. return;
  214. rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
  215. /*
  216. * We don't initialise npu_pe->tce32_table as we always use
  217. * dma_npu_ops which are nops.
  218. */
  219. set_dma_ops(&npe->pdev->dev, &dma_npu_ops);
  220. }
  221. /*
  222. * Enables bypass mode on the NPU. The NPU only supports one
  223. * window per link, so bypass needs to be explicitly enabled or
  224. * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
  225. * active at the same time.
  226. */
  227. static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
  228. {
  229. struct pnv_phb *phb = npe->phb;
  230. int64_t rc = 0;
  231. phys_addr_t top = memblock_end_of_DRAM();
  232. if (phb->type != PNV_PHB_NPU || !npe->pdev)
  233. return -EINVAL;
  234. rc = pnv_npu_unset_window(npe, 0);
  235. if (rc != OPAL_SUCCESS)
  236. return rc;
  237. /* Enable the bypass window */
  238. top = roundup_pow_of_two(top);
  239. dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
  240. npe->pe_number);
  241. rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
  242. npe->pe_number, npe->pe_number,
  243. 0 /* bypass base */, top);
  244. if (rc == OPAL_SUCCESS)
  245. pnv_pci_ioda2_tce_invalidate_entire(phb, false);
  246. return rc;
  247. }
  248. void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
  249. {
  250. int i;
  251. struct pnv_phb *phb;
  252. struct pci_dn *pdn;
  253. struct pnv_ioda_pe *npe;
  254. struct pci_dev *npdev;
  255. for (i = 0; ; ++i) {
  256. npdev = pnv_pci_get_npu_dev(gpdev, i);
  257. if (!npdev)
  258. break;
  259. pdn = pci_get_pdn(npdev);
  260. if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
  261. return;
  262. phb = pci_bus_to_host(npdev->bus)->private_data;
  263. /* We only do bypass if it's enabled on the linked device */
  264. npe = &phb->ioda.pe_array[pdn->pe_number];
  265. if (bypass) {
  266. dev_info(&npdev->dev,
  267. "Using 64-bit DMA iommu bypass\n");
  268. pnv_npu_dma_set_bypass(npe);
  269. } else {
  270. dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
  271. pnv_npu_dma_set_32(npe);
  272. }
  273. }
  274. }
  275. /* Switch ownership from platform code to external user (e.g. VFIO) */
  276. void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
  277. {
  278. struct pnv_phb *phb = npe->phb;
  279. int64_t rc;
  280. /*
  281. * Note: NPU has just a single TVE in the hardware which means that
  282. * while used by the kernel, it can have either 32bit window or
  283. * DMA bypass but never both. So we deconfigure 32bit window only
  284. * if it was enabled at the moment of ownership change.
  285. */
  286. if (npe->table_group.tables[0]) {
  287. pnv_npu_unset_window(npe, 0);
  288. return;
  289. }
  290. /* Disable bypass */
  291. rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
  292. npe->pe_number, npe->pe_number,
  293. 0 /* bypass base */, 0);
  294. if (rc) {
  295. pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
  296. return;
  297. }
  298. pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
  299. }
  300. struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
  301. {
  302. struct pnv_phb *phb = npe->phb;
  303. struct pci_bus *pbus = phb->hose->bus;
  304. struct pci_dev *npdev, *gpdev = NULL, *gptmp;
  305. struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
  306. if (!gpe || !gpdev)
  307. return NULL;
  308. list_for_each_entry(npdev, &pbus->devices, bus_list) {
  309. gptmp = pnv_pci_get_gpu_dev(npdev);
  310. if (gptmp != gpdev)
  311. continue;
  312. pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
  313. iommu_group_add_device(gpe->table_group.group, &npdev->dev);
  314. }
  315. return gpe;
  316. }
  317. /* Maximum number of nvlinks per npu */
  318. #define NV_MAX_LINKS 6
  319. /* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
  320. static int max_npu2_index;
  321. struct npu_context {
  322. struct mm_struct *mm;
  323. struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
  324. struct mmu_notifier mn;
  325. struct kref kref;
  326. /* Callback to stop translation requests on a given GPU */
  327. struct npu_context *(*release_cb)(struct npu_context *, void *);
  328. /*
  329. * Private pointer passed to the above callback for usage by
  330. * device drivers.
  331. */
  332. void *priv;
  333. };
  334. /*
  335. * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
  336. * if none are available.
  337. */
  338. static int get_mmio_atsd_reg(struct npu *npu)
  339. {
  340. int i;
  341. for (i = 0; i < npu->mmio_atsd_count; i++) {
  342. if (!test_and_set_bit(i, &npu->mmio_atsd_usage))
  343. return i;
  344. }
  345. return -ENOSPC;
  346. }
  347. static void put_mmio_atsd_reg(struct npu *npu, int reg)
  348. {
  349. clear_bit(reg, &npu->mmio_atsd_usage);
  350. }
  351. /* MMIO ATSD register offsets */
  352. #define XTS_ATSD_AVA 1
  353. #define XTS_ATSD_STAT 2
  354. static int mmio_launch_invalidate(struct npu *npu, unsigned long launch,
  355. unsigned long va)
  356. {
  357. int mmio_atsd_reg;
  358. do {
  359. mmio_atsd_reg = get_mmio_atsd_reg(npu);
  360. cpu_relax();
  361. } while (mmio_atsd_reg < 0);
  362. __raw_writeq(cpu_to_be64(va),
  363. npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA);
  364. eieio();
  365. __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]);
  366. return mmio_atsd_reg;
  367. }
  368. static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush)
  369. {
  370. unsigned long launch;
  371. /* IS set to invalidate matching PID */
  372. launch = PPC_BIT(12);
  373. /* PRS set to process-scoped */
  374. launch |= PPC_BIT(13);
  375. /* AP */
  376. launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
  377. /* PID */
  378. launch |= pid << PPC_BITLSHIFT(38);
  379. /* No flush */
  380. launch |= !flush << PPC_BITLSHIFT(39);
  381. /* Invalidating the entire process doesn't use a va */
  382. return mmio_launch_invalidate(npu, launch, 0);
  383. }
  384. static int mmio_invalidate_va(struct npu *npu, unsigned long va,
  385. unsigned long pid, bool flush)
  386. {
  387. unsigned long launch;
  388. /* IS set to invalidate target VA */
  389. launch = 0;
  390. /* PRS set to process scoped */
  391. launch |= PPC_BIT(13);
  392. /* AP */
  393. launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);
  394. /* PID */
  395. launch |= pid << PPC_BITLSHIFT(38);
  396. /* No flush */
  397. launch |= !flush << PPC_BITLSHIFT(39);
  398. return mmio_launch_invalidate(npu, launch, va);
  399. }
  400. #define mn_to_npu_context(x) container_of(x, struct npu_context, mn)
  401. struct mmio_atsd_reg {
  402. struct npu *npu;
  403. int reg;
  404. };
  405. static void mmio_invalidate_wait(
  406. struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush)
  407. {
  408. struct npu *npu;
  409. int i, reg;
  410. /* Wait for all invalidations to complete */
  411. for (i = 0; i <= max_npu2_index; i++) {
  412. if (mmio_atsd_reg[i].reg < 0)
  413. continue;
  414. /* Wait for completion */
  415. npu = mmio_atsd_reg[i].npu;
  416. reg = mmio_atsd_reg[i].reg;
  417. while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
  418. cpu_relax();
  419. put_mmio_atsd_reg(npu, reg);
  420. /*
  421. * The GPU requires two flush ATSDs to ensure all entries have
  422. * been flushed. We use PID 0 as it will never be used for a
  423. * process on the GPU.
  424. */
  425. if (flush)
  426. mmio_invalidate_pid(npu, 0, true);
  427. }
  428. }
  429. /*
  430. * Invalidate either a single address or an entire PID depending on
  431. * the value of va.
  432. */
  433. static void mmio_invalidate(struct npu_context *npu_context, int va,
  434. unsigned long address, bool flush)
  435. {
  436. int i, j;
  437. struct npu *npu;
  438. struct pnv_phb *nphb;
  439. struct pci_dev *npdev;
  440. struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
  441. unsigned long pid = npu_context->mm->context.id;
  442. /*
  443. * Loop over all the NPUs this process is active on and launch
  444. * an invalidate.
  445. */
  446. for (i = 0; i <= max_npu2_index; i++) {
  447. mmio_atsd_reg[i].reg = -1;
  448. for (j = 0; j < NV_MAX_LINKS; j++) {
  449. npdev = npu_context->npdev[i][j];
  450. if (!npdev)
  451. continue;
  452. nphb = pci_bus_to_host(npdev->bus)->private_data;
  453. npu = &nphb->npu;
  454. mmio_atsd_reg[i].npu = npu;
  455. if (va)
  456. mmio_atsd_reg[i].reg =
  457. mmio_invalidate_va(npu, address, pid,
  458. flush);
  459. else
  460. mmio_atsd_reg[i].reg =
  461. mmio_invalidate_pid(npu, pid, flush);
  462. /*
  463. * The NPU hardware forwards the shootdown to all GPUs
  464. * so we only have to launch one shootdown per NPU.
  465. */
  466. break;
  467. }
  468. }
  469. /*
  470. * Unfortunately the nest mmu does not support flushing specific
  471. * addresses so we have to flush the whole mm.
  472. */
  473. flush_tlb_mm(npu_context->mm);
  474. mmio_invalidate_wait(mmio_atsd_reg, flush);
  475. if (flush)
  476. /* Wait for the flush to complete */
  477. mmio_invalidate_wait(mmio_atsd_reg, false);
  478. }
  479. static void pnv_npu2_mn_release(struct mmu_notifier *mn,
  480. struct mm_struct *mm)
  481. {
  482. struct npu_context *npu_context = mn_to_npu_context(mn);
  483. /* Call into device driver to stop requests to the NMMU */
  484. if (npu_context->release_cb)
  485. npu_context->release_cb(npu_context, npu_context->priv);
  486. /*
  487. * There should be no more translation requests for this PID, but we
  488. * need to ensure any entries for it are removed from the TLB.
  489. */
  490. mmio_invalidate(npu_context, 0, 0, true);
  491. }
  492. static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
  493. struct mm_struct *mm,
  494. unsigned long address,
  495. pte_t pte)
  496. {
  497. struct npu_context *npu_context = mn_to_npu_context(mn);
  498. mmio_invalidate(npu_context, 1, address, true);
  499. }
  500. static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn,
  501. struct mm_struct *mm,
  502. unsigned long address)
  503. {
  504. struct npu_context *npu_context = mn_to_npu_context(mn);
  505. mmio_invalidate(npu_context, 1, address, true);
  506. }
  507. static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
  508. struct mm_struct *mm,
  509. unsigned long start, unsigned long end)
  510. {
  511. struct npu_context *npu_context = mn_to_npu_context(mn);
  512. unsigned long address;
  513. for (address = start; address < end; address += PAGE_SIZE)
  514. mmio_invalidate(npu_context, 1, address, false);
  515. /* Do the flush only on the final addess == end */
  516. mmio_invalidate(npu_context, 1, address, true);
  517. }
  518. static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
  519. .release = pnv_npu2_mn_release,
  520. .change_pte = pnv_npu2_mn_change_pte,
  521. .invalidate_page = pnv_npu2_mn_invalidate_page,
  522. .invalidate_range = pnv_npu2_mn_invalidate_range,
  523. };
  524. /*
  525. * Call into OPAL to setup the nmmu context for the current task in
  526. * the NPU. This must be called to setup the context tables before the
  527. * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
  528. *
  529. * A release callback should be registered to allow a device driver to
  530. * be notified that it should not launch any new translation requests
  531. * as the final TLB invalidate is about to occur.
  532. *
  533. * Returns an error if there no contexts are currently available or a
  534. * npu_context which should be passed to pnv_npu2_handle_fault().
  535. *
  536. * mmap_sem must be held in write mode.
  537. */
  538. struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
  539. unsigned long flags,
  540. struct npu_context *(*cb)(struct npu_context *, void *),
  541. void *priv)
  542. {
  543. int rc;
  544. u32 nvlink_index;
  545. struct device_node *nvlink_dn;
  546. struct mm_struct *mm = current->mm;
  547. struct pnv_phb *nphb;
  548. struct npu *npu;
  549. struct npu_context *npu_context;
  550. /*
  551. * At present we don't support GPUs connected to multiple NPUs and I'm
  552. * not sure the hardware does either.
  553. */
  554. struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
  555. if (!firmware_has_feature(FW_FEATURE_OPAL))
  556. return ERR_PTR(-ENODEV);
  557. if (!npdev)
  558. /* No nvlink associated with this GPU device */
  559. return ERR_PTR(-ENODEV);
  560. if (!mm || mm->context.id == 0) {
  561. /*
  562. * Kernel thread contexts are not supported and context id 0 is
  563. * reserved on the GPU.
  564. */
  565. return ERR_PTR(-EINVAL);
  566. }
  567. nphb = pci_bus_to_host(npdev->bus)->private_data;
  568. npu = &nphb->npu;
  569. /*
  570. * Setup the NPU context table for a particular GPU. These need to be
  571. * per-GPU as we need the tables to filter ATSDs when there are no
  572. * active contexts on a particular GPU.
  573. */
  574. rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
  575. PCI_DEVID(gpdev->bus->number, gpdev->devfn));
  576. if (rc < 0)
  577. return ERR_PTR(-ENOSPC);
  578. /*
  579. * We store the npu pci device so we can more easily get at the
  580. * associated npus.
  581. */
  582. npu_context = mm->context.npu_context;
  583. if (!npu_context) {
  584. npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
  585. if (!npu_context)
  586. return ERR_PTR(-ENOMEM);
  587. mm->context.npu_context = npu_context;
  588. npu_context->mm = mm;
  589. npu_context->mn.ops = &nv_nmmu_notifier_ops;
  590. __mmu_notifier_register(&npu_context->mn, mm);
  591. kref_init(&npu_context->kref);
  592. } else {
  593. kref_get(&npu_context->kref);
  594. }
  595. npu_context->release_cb = cb;
  596. npu_context->priv = priv;
  597. nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
  598. if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
  599. &nvlink_index)))
  600. return ERR_PTR(-ENODEV);
  601. npu_context->npdev[npu->index][nvlink_index] = npdev;
  602. return npu_context;
  603. }
  604. EXPORT_SYMBOL(pnv_npu2_init_context);
  605. static void pnv_npu2_release_context(struct kref *kref)
  606. {
  607. struct npu_context *npu_context =
  608. container_of(kref, struct npu_context, kref);
  609. npu_context->mm->context.npu_context = NULL;
  610. mmu_notifier_unregister(&npu_context->mn,
  611. npu_context->mm);
  612. kfree(npu_context);
  613. }
  614. void pnv_npu2_destroy_context(struct npu_context *npu_context,
  615. struct pci_dev *gpdev)
  616. {
  617. struct pnv_phb *nphb;
  618. struct npu *npu;
  619. struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
  620. struct device_node *nvlink_dn;
  621. u32 nvlink_index;
  622. if (WARN_ON(!npdev))
  623. return;
  624. if (!firmware_has_feature(FW_FEATURE_OPAL))
  625. return;
  626. nphb = pci_bus_to_host(npdev->bus)->private_data;
  627. npu = &nphb->npu;
  628. nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
  629. if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
  630. &nvlink_index)))
  631. return;
  632. npu_context->npdev[npu->index][nvlink_index] = NULL;
  633. opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
  634. PCI_DEVID(gpdev->bus->number, gpdev->devfn));
  635. kref_put(&npu_context->kref, pnv_npu2_release_context);
  636. }
  637. EXPORT_SYMBOL(pnv_npu2_destroy_context);
  638. /*
  639. * Assumes mmap_sem is held for the contexts associated mm.
  640. */
  641. int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
  642. unsigned long *flags, unsigned long *status, int count)
  643. {
  644. u64 rc = 0, result = 0;
  645. int i, is_write;
  646. struct page *page[1];
  647. /* mmap_sem should be held so the struct_mm must be present */
  648. struct mm_struct *mm = context->mm;
  649. if (!firmware_has_feature(FW_FEATURE_OPAL))
  650. return -ENODEV;
  651. WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
  652. for (i = 0; i < count; i++) {
  653. is_write = flags[i] & NPU2_WRITE;
  654. rc = get_user_pages_remote(NULL, mm, ea[i], 1,
  655. is_write ? FOLL_WRITE : 0,
  656. page, NULL, NULL);
  657. /*
  658. * To support virtualised environments we will have to do an
  659. * access to the page to ensure it gets faulted into the
  660. * hypervisor. For the moment virtualisation is not supported in
  661. * other areas so leave the access out.
  662. */
  663. if (rc != 1) {
  664. status[i] = rc;
  665. result = -EFAULT;
  666. continue;
  667. }
  668. status[i] = 0;
  669. put_page(page[0]);
  670. }
  671. return result;
  672. }
  673. EXPORT_SYMBOL(pnv_npu2_handle_fault);
  674. int pnv_npu2_init(struct pnv_phb *phb)
  675. {
  676. unsigned int i;
  677. u64 mmio_atsd;
  678. struct device_node *dn;
  679. struct pci_dev *gpdev;
  680. static int npu_index;
  681. uint64_t rc = 0;
  682. for_each_child_of_node(phb->hose->dn, dn) {
  683. gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
  684. if (gpdev) {
  685. rc = opal_npu_map_lpar(phb->opal_id,
  686. PCI_DEVID(gpdev->bus->number, gpdev->devfn),
  687. 0, 0);
  688. if (rc)
  689. dev_err(&gpdev->dev,
  690. "Error %lld mapping device to LPAR\n",
  691. rc);
  692. }
  693. }
  694. for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
  695. i, &mmio_atsd); i++)
  696. phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
  697. pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
  698. phb->npu.mmio_atsd_count = i;
  699. phb->npu.mmio_atsd_usage = 0;
  700. npu_index++;
  701. if (WARN_ON(npu_index >= NV_MAX_NPUS))
  702. return -ENOSPC;
  703. max_npu2_index = npu_index;
  704. phb->npu.index = npu_index;
  705. return 0;
  706. }