/*
 * This file implements the DMA operations for NVLink devices. The NPU
 * devices all point to the same iommu table as the parent PCI device.
 *
 * Copyright Alistair Popple, IBM Corporation 2015.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 */

#include <linux/slab.h>
#include <linux/mmu_notifier.h>
#include <linux/mmu_context.h>
#include <linux/of.h>
#include <linux/export.h>
#include <linux/pci.h>
#include <linux/memblock.h>
#include <linux/iommu.h>

#include <asm/tlb.h>
#include <asm/powernv.h>
#include <asm/reg.h>
#include <asm/opal.h>
#include <asm/io.h>
#include <asm/iommu.h>
#include <asm/pnv-pci.h>
#include <asm/msi_bitmap.h>

#include "powernv.h"
#include "pci.h"

#define npu_to_phb(x) container_of(x, struct pnv_phb, npu)

/*
 * spinlock to protect initialisation of an npu_context for a particular
 * mm_struct.
 */
static DEFINE_SPINLOCK(npu_context_lock);

/*
 * When an address shootdown range exceeds this threshold we invalidate the
 * entire TLB on the GPU for the given PID rather than each specific address in
 * the range.
 */
#define ATSD_THRESHOLD (2*1024*1024)

/*
 * Other types of TCE cache invalidation are not functional in the
 * hardware.
 */
static struct pci_dev *get_pci_dev(struct device_node *dn)
{
	struct pci_dn *pdn = PCI_DN(dn);

	return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
					   pdn->busno, pdn->devfn);
}

/* Given a NPU device get the associated PCI device. */
struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
{
	struct device_node *dn;
	struct pci_dev *gpdev;

	if (WARN_ON(!npdev))
		return NULL;

	if (WARN_ON(!npdev->dev.of_node))
		return NULL;

	/* Get associated PCI device */
	dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
	if (!dn)
		return NULL;

	gpdev = get_pci_dev(dn);
	of_node_put(dn);

	return gpdev;
}
EXPORT_SYMBOL(pnv_pci_get_gpu_dev);

/* Given the real PCI device get a linked NPU device. */
struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
{
	struct device_node *dn;
	struct pci_dev *npdev;

	if (WARN_ON(!gpdev))
		return NULL;

	/* Not all PCI devices have device-tree nodes */
	if (!gpdev->dev.of_node)
		return NULL;

	/* Get associated PCI device */
	dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
	if (!dn)
		return NULL;

	npdev = get_pci_dev(dn);
	of_node_put(dn);

	return npdev;
}
EXPORT_SYMBOL(pnv_pci_get_npu_dev);
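
/*
 * Usage sketch (an assumption, mirroring pnv_npu_try_dma_set_bypass()
 * below): callers walk all the links of a GPU by passing an increasing
 * index until NULL is returned:
 *
 *	struct pci_dev *npdev;
 *	int i;
 *
 *	for (i = 0; (npdev = pnv_pci_get_npu_dev(gpdev, i)); i++)
 *		configure_link(npdev);	// hypothetical per-link helper
 */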

#define NPU_DMA_OP_UNSUPPORTED()	\
	dev_err_once(dev, "%s operation unsupported for NVLink devices\n", \
		__func__)

static void *dma_npu_alloc(struct device *dev, size_t size,
			   dma_addr_t *dma_handle, gfp_t flag,
			   unsigned long attrs)
{
	NPU_DMA_OP_UNSUPPORTED();
	return NULL;
}

static void dma_npu_free(struct device *dev, size_t size,
			 void *vaddr, dma_addr_t dma_handle,
			 unsigned long attrs)
{
	NPU_DMA_OP_UNSUPPORTED();
}

static dma_addr_t dma_npu_map_page(struct device *dev, struct page *page,
				   unsigned long offset, size_t size,
				   enum dma_data_direction direction,
				   unsigned long attrs)
{
	NPU_DMA_OP_UNSUPPORTED();
	return 0;
}

static int dma_npu_map_sg(struct device *dev, struct scatterlist *sglist,
			  int nelems, enum dma_data_direction direction,
			  unsigned long attrs)
{
	NPU_DMA_OP_UNSUPPORTED();
	return 0;
}

static int dma_npu_dma_supported(struct device *dev, u64 mask)
{
	NPU_DMA_OP_UNSUPPORTED();
	return 0;
}

static u64 dma_npu_get_required_mask(struct device *dev)
{
	NPU_DMA_OP_UNSUPPORTED();
	return 0;
}

static const struct dma_map_ops dma_npu_ops = {
	.map_page = dma_npu_map_page,
	.map_sg = dma_npu_map_sg,
	.alloc = dma_npu_alloc,
	.free = dma_npu_free,
	.dma_supported = dma_npu_dma_supported,
	.get_required_mask = dma_npu_get_required_mask,
};
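
/*
 * Note that every op above is a stub: the NPU device borrows the GPU's
 * iommu table (see the file header), so dma_npu_ops exists only so that
 * an accidental DMA mapping attempt against the NPU device reports an
 * error via dev_err_once() rather than silently misbehaving. It is
 * installed with set_dma_ops() in pnv_npu_dma_set_32() below.
 */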

/*
 * Returns the PE associated with the PCI device of the given
 * NPU. Also returns the linked GPU pci device in *gpdev if gpdev != NULL.
 */
static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
						  struct pci_dev **gpdev)
{
	struct pnv_phb *phb;
	struct pci_controller *hose;
	struct pci_dev *pdev;
	struct pnv_ioda_pe *pe;
	struct pci_dn *pdn;

	pdev = pnv_pci_get_gpu_dev(npe->pdev);
	if (!pdev)
		return NULL;

	pdn = pci_get_pdn(pdev);
	if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
		return NULL;

	hose = pci_bus_to_host(pdev->bus);
	phb = hose->private_data;
	pe = &phb->ioda.pe_array[pdn->pe_number];

	if (gpdev)
		*gpdev = pdev;

	return pe;
}

long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
		struct iommu_table *tbl)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc;
	const unsigned long size = tbl->it_indirect_levels ?
		tbl->it_level_size : tbl->it_size;
	const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
	const __u64 win_size = tbl->it_size << tbl->it_page_shift;

	pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
			start_addr, start_addr + win_size - 1,
			IOMMU_PAGE_SIZE(tbl));

	rc = opal_pci_map_pe_dma_window(phb->opal_id,
			npe->pe_number,
			npe->pe_number,
			tbl->it_indirect_levels + 1,
			__pa(tbl->it_base),
			size << 3,
			IOMMU_PAGE_SIZE(tbl));
	if (rc) {
		pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
		return rc;
	}
	pnv_pci_ioda2_tce_invalidate_entire(phb, false);

	/* Add the table to the list so its TCE cache will get invalidated */
	pnv_pci_link_table_and_group(phb->hose->node, num,
			tbl, &npe->table_group);

	return 0;
}
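
/*
 * Worked example with illustrative numbers (not taken from the source):
 * a single-level table with it_size = 0x80000 entries, it_page_shift = 16
 * (64K pages) and it_offset = 0 describes a 0..32GB window
 * (0x80000 * 64K), backed by a TCE table of size << 3 = 4MB, since each
 * TCE entry is 8 bytes.
 */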

long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc;

	pe_info(npe, "Removing DMA window\n");

	rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
			npe->pe_number,
			0/* levels */, 0/* table address */,
			0/* table size */, 0/* page size */);
	if (rc) {
		pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
		return rc;
	}
	pnv_pci_ioda2_tce_invalidate_entire(phb, false);

	pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
			&npe->table_group);

	return 0;
}

/*
 * Enables 32 bit DMA on NPU.
 */
static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
{
	struct pci_dev *gpdev;
	struct pnv_ioda_pe *gpe;
	int64_t rc;

	/*
	 * Find the associated PCI devices and get the dma window
	 * information from there.
	 */
	if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
		return;

	gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
	if (!gpe)
		return;

	rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);

	/*
	 * We don't initialise npu_pe->tce32_table as we always use
	 * dma_npu_ops which are nops.
	 */
	set_dma_ops(&npe->pdev->dev, &dma_npu_ops);
}

/*
 * Enables bypass mode on the NPU. The NPU only supports one
 * window per link, so bypass needs to be explicitly enabled or
 * disabled. Unlike for a PHB3, bypass and non-bypass modes can't be
 * active at the same time.
 */
static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc = 0;
	phys_addr_t top = memblock_end_of_DRAM();

	if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
		return -EINVAL;

	rc = pnv_npu_unset_window(npe, 0);
	if (rc != OPAL_SUCCESS)
		return rc;

	/* Enable the bypass window */

	top = roundup_pow_of_two(top);
	dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
			npe->pe_number);
	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
			npe->pe_number, npe->pe_number,
			0 /* bypass base */, top);

	if (rc == OPAL_SUCCESS)
		pnv_pci_ioda2_tce_invalidate_entire(phb, false);

	return rc;
}
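
/*
 * For example (illustrative figure, not from the source): on a system
 * whose DRAM ends at 48GB, roundup_pow_of_two() widens the bypass window
 * to 0..64GB so that the single TVE covers all of system memory 1:1.
 */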

void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
{
	int i;
	struct pnv_phb *phb;
	struct pci_dn *pdn;
	struct pnv_ioda_pe *npe;
	struct pci_dev *npdev;

	for (i = 0; ; ++i) {
		npdev = pnv_pci_get_npu_dev(gpdev, i);

		if (!npdev)
			break;

		pdn = pci_get_pdn(npdev);
		if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
			return;

		phb = pci_bus_to_host(npdev->bus)->private_data;

		/* We only do bypass if it's enabled on the linked device */
		npe = &phb->ioda.pe_array[pdn->pe_number];

		if (bypass) {
			dev_info(&npdev->dev,
					"Using 64-bit DMA iommu bypass\n");
			pnv_npu_dma_set_bypass(npe);
		} else {
			dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
			pnv_npu_dma_set_32(npe);
		}
	}
}

/* Switch ownership from platform code to external user (e.g. VFIO) */
void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
{
	struct pnv_phb *phb = npe->phb;
	int64_t rc;

	/*
	 * Note: NPU has just a single TVE in the hardware which means that
	 * while used by the kernel, it can have either 32bit window or
	 * DMA bypass but never both. So we deconfigure 32bit window only
	 * if it was enabled at the moment of ownership change.
	 */
	if (npe->table_group.tables[0]) {
		pnv_npu_unset_window(npe, 0);
		return;
	}

	/* Disable bypass */
	rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
			npe->pe_number, npe->pe_number,
			0 /* bypass base */, 0);
	if (rc) {
		pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
		return;
	}
	pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
}

struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
{
	struct pnv_phb *phb = npe->phb;
	struct pci_bus *pbus = phb->hose->bus;
	struct pci_dev *npdev, *gpdev = NULL, *gptmp;
	struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);

	if (!gpe || !gpdev)
		return NULL;

	list_for_each_entry(npdev, &pbus->devices, bus_list) {
		gptmp = pnv_pci_get_gpu_dev(npdev);

		if (gptmp != gpdev)
			continue;

		pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
		iommu_group_add_device(gpe->table_group.group, &npdev->dev);
	}

	return gpe;
}

/* Maximum number of nvlinks per npu */
#define NV_MAX_LINKS 6

/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */
static int max_npu2_index;

struct npu_context {
	struct mm_struct *mm;
	struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
	struct mmu_notifier mn;
	struct kref kref;
	bool nmmu_flush;

	/* Callback to stop translation requests on a given GPU */
	void (*release_cb)(struct npu_context *context, void *priv);

	/*
	 * Private pointer passed to the above callback for usage by
	 * device drivers.
	 */
	void *priv;
};

struct mmio_atsd_reg {
	struct npu *npu;
	int reg;
};

/*
 * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
 * if none are available.
 */
static int get_mmio_atsd_reg(struct npu *npu)
{
	int i;

	for (i = 0; i < npu->mmio_atsd_count; i++) {
		if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
			return i;
	}

	return -ENOSPC;
}

static void put_mmio_atsd_reg(struct npu *npu, int reg)
{
	clear_bit_unlock(reg, &npu->mmio_atsd_usage);
}

/* MMIO ATSD register offsets */
#define XTS_ATSD_AVA  1
#define XTS_ATSD_STAT 2

static void mmio_launch_invalidate(struct mmio_atsd_reg *mmio_atsd_reg,
				unsigned long launch, unsigned long va)
{
	struct npu *npu = mmio_atsd_reg->npu;
	int reg = mmio_atsd_reg->reg;

	__raw_writeq_be(va, npu->mmio_atsd_regs[reg] + XTS_ATSD_AVA);
	eieio();
	__raw_writeq_be(launch, npu->mmio_atsd_regs[reg]);
}

static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
				unsigned long pid, bool flush)
{
	int i;
	unsigned long launch;

	for (i = 0; i <= max_npu2_index; i++) {
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		/* IS set to invalidate matching PID */
		launch = PPC_BIT(12);

		/* PRS set to process-scoped */
		launch |= PPC_BIT(13);

		/* AP */
		launch |= (u64)
			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);

		/* PID */
		launch |= pid << PPC_BITLSHIFT(38);

		/* No flush */
		launch |= !flush << PPC_BITLSHIFT(39);

		/* Invalidating the entire process doesn't use a va */
		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, 0);
	}
}

static void mmio_invalidate_va(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
			unsigned long va, unsigned long pid, bool flush)
{
	int i;
	unsigned long launch;

	for (i = 0; i <= max_npu2_index; i++) {
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		/* IS set to invalidate target VA */
		launch = 0;

		/* PRS set to process scoped */
		launch |= PPC_BIT(13);

		/* AP */
		launch |= (u64)
			mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17);

		/* PID */
		launch |= pid << PPC_BITLSHIFT(38);

		/* No flush */
		launch |= !flush << PPC_BITLSHIFT(39);

		mmio_launch_invalidate(&mmio_atsd_reg[i], launch, va);
	}
}
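
/*
 * Summary of the launch word assembled by the two helpers above
 * (big-endian bit numbering via PPC_BIT()/PPC_BITLSHIFT()):
 *
 *	bit 12     IS       - set: invalidate whole PID; clear: single VA
 *	bit 13     PRS      - process-scoped invalidation
 *	bits ..17  AP       - actual page size, from mmu_get_ap()
 *	bits ..38  PID      - the mm context id being shot down
 *	bit 39     no-flush - set when flush == false
 */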

#define mn_to_npu_context(x) container_of(x, struct npu_context, mn)

static void mmio_invalidate_wait(
	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
{
	struct npu *npu;
	int i, reg;

	/* Wait for all invalidations to complete */
	for (i = 0; i <= max_npu2_index; i++) {
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		/* Wait for completion */
		npu = mmio_atsd_reg[i].npu;
		reg = mmio_atsd_reg[i].reg;
		while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
			cpu_relax();
	}
}

/*
 * Acquires all the address translation shootdown (ATSD) registers required to
 * launch an ATSD on all links this npu_context is active on.
 */
static void acquire_atsd_reg(struct npu_context *npu_context,
			struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
{
	int i, j;
	struct npu *npu;
	struct pci_dev *npdev;
	struct pnv_phb *nphb;

	for (i = 0; i <= max_npu2_index; i++) {
		mmio_atsd_reg[i].reg = -1;
		for (j = 0; j < NV_MAX_LINKS; j++) {
			/*
			 * There are no ordering requirements with respect to
			 * the setup of struct npu_context, but to ensure
			 * consistent behaviour we need to ensure npdev[][] is
			 * only read once.
			 */
			npdev = READ_ONCE(npu_context->npdev[i][j]);
			if (!npdev)
				continue;

			nphb = pci_bus_to_host(npdev->bus)->private_data;
			npu = &nphb->npu;
			mmio_atsd_reg[i].npu = npu;
			mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
			while (mmio_atsd_reg[i].reg < 0) {
				mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
				cpu_relax();
			}
			break;
		}
	}
}

/*
 * Release previously acquired ATSD registers. To avoid deadlocks the registers
 * must be released in the same order they were acquired above in
 * acquire_atsd_reg.
 */
static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
{
	int i;

	for (i = 0; i <= max_npu2_index; i++) {
		/*
		 * We can't rely on npu_context->npdev[][] being the same here
		 * as when acquire_atsd_reg() was called, hence we use the
		 * values stored in mmio_atsd_reg during the acquire phase
		 * rather than re-reading npdev[][].
		 */
		if (mmio_atsd_reg[i].reg < 0)
			continue;

		put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
	}
}

/*
 * Invalidate either a single address or an entire PID depending on
 * the value of va.
 */
static void mmio_invalidate(struct npu_context *npu_context, int va,
			unsigned long address, bool flush)
{
	struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
	unsigned long pid = npu_context->mm->context.id;

	if (npu_context->nmmu_flush)
		/*
		 * Unfortunately the nest mmu does not support flushing specific
		 * addresses so we have to flush the whole mm once before
		 * shooting down the GPU translation.
		 */
		flush_all_mm(npu_context->mm);

	/*
	 * Loop over all the NPUs this process is active on and launch
	 * an invalidate.
	 */
	acquire_atsd_reg(npu_context, mmio_atsd_reg);
	if (va)
		mmio_invalidate_va(mmio_atsd_reg, address, pid, flush);
	else
		mmio_invalidate_pid(mmio_atsd_reg, pid, flush);

	mmio_invalidate_wait(mmio_atsd_reg);
	if (flush) {
		/*
		 * The GPU requires two flush ATSDs to ensure all entries have
		 * been flushed. We use PID 0 as it will never be used for a
		 * process on the GPU.
		 */
		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
		mmio_invalidate_wait(mmio_atsd_reg);
		mmio_invalidate_pid(mmio_atsd_reg, 0, true);
		mmio_invalidate_wait(mmio_atsd_reg);
	}
	release_atsd_reg(mmio_atsd_reg);
}

static void pnv_npu2_mn_release(struct mmu_notifier *mn,
				struct mm_struct *mm)
{
	struct npu_context *npu_context = mn_to_npu_context(mn);

	/* Call into device driver to stop requests to the NMMU */
	if (npu_context->release_cb)
		npu_context->release_cb(npu_context, npu_context->priv);

	/*
	 * There should be no more translation requests for this PID, but we
	 * need to ensure any entries for it are removed from the TLB.
	 */
	mmio_invalidate(npu_context, 0, 0, true);
}

static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
				struct mm_struct *mm,
				unsigned long address,
				pte_t pte)
{
	struct npu_context *npu_context = mn_to_npu_context(mn);

	mmio_invalidate(npu_context, 1, address, true);
}

static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start, unsigned long end)
{
	struct npu_context *npu_context = mn_to_npu_context(mn);
	unsigned long address;

	if (end - start > ATSD_THRESHOLD) {
		/*
		 * Just invalidate the entire PID if the address range is too
		 * large.
		 */
		mmio_invalidate(npu_context, 0, 0, true);
	} else {
		for (address = start; address < end; address += PAGE_SIZE)
			mmio_invalidate(npu_context, 1, address, false);

		/* Do the flush only on the final address == end */
		mmio_invalidate(npu_context, 1, address, true);
	}
}

static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
	.release = pnv_npu2_mn_release,
	.change_pte = pnv_npu2_mn_change_pte,
	.invalidate_range = pnv_npu2_mn_invalidate_range,
};

/*
 * Call into OPAL to setup the nmmu context for the current task in
 * the NPU. This must be called to setup the context tables before the
 * GPU issues ATRs. gpdev should be a pointer to the PCIe GPU device.
 *
 * A release callback should be registered to allow a device driver to
 * be notified that it should not launch any new translation requests
 * as the final TLB invalidate is about to occur.
 *
 * Returns an error if no contexts are currently available, or an
 * npu_context which should be passed to pnv_npu2_handle_fault().
 *
 * mmap_sem must be held in write mode, and this function must not be
 * called from interrupt context.
 */
struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
			unsigned long flags,
			void (*cb)(struct npu_context *, void *),
			void *priv)
{
	int rc;
	u32 nvlink_index;
	struct device_node *nvlink_dn;
	struct mm_struct *mm = current->mm;
	struct pnv_phb *nphb;
	struct npu *npu;
	struct npu_context *npu_context;

	/*
	 * At present we don't support GPUs connected to multiple NPUs and I'm
	 * not sure the hardware does either.
	 */
	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);

	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return ERR_PTR(-ENODEV);

	if (!npdev)
		/* No nvlink associated with this GPU device */
		return ERR_PTR(-ENODEV);

	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
							&nvlink_index)))
		return ERR_PTR(-ENODEV);

	if (!mm || mm->context.id == 0) {
		/*
		 * Kernel thread contexts are not supported and context id 0 is
		 * reserved on the GPU.
		 */
		return ERR_PTR(-EINVAL);
	}

	nphb = pci_bus_to_host(npdev->bus)->private_data;
	npu = &nphb->npu;

	/*
	 * Setup the NPU context table for a particular GPU. These need to be
	 * per-GPU as we need the tables to filter ATSDs when there are no
	 * active contexts on a particular GPU. It is safe for these to be
	 * called concurrently with destroy as the OPAL call takes appropriate
	 * locks and refcounts on init/destroy.
	 */
	rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
	if (rc < 0)
		return ERR_PTR(-ENOSPC);

	/*
	 * We store the npu pci device so we can more easily get at the
	 * associated npus.
	 */
	spin_lock(&npu_context_lock);
	npu_context = mm->context.npu_context;
	if (npu_context) {
		if (npu_context->release_cb != cb ||
			npu_context->priv != priv) {
			spin_unlock(&npu_context_lock);
			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
						PCI_DEVID(gpdev->bus->number,
							gpdev->devfn));
			return ERR_PTR(-EINVAL);
		}

		WARN_ON(!kref_get_unless_zero(&npu_context->kref));
	}
	spin_unlock(&npu_context_lock);

	if (!npu_context) {
		/*
		 * We can set up these fields without holding the
		 * npu_context_lock as the npu_context hasn't been returned to
		 * the caller meaning it can't be destroyed. Parallel allocation
		 * is protected against by mmap_sem.
		 */
		rc = -ENOMEM;
		npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
		if (npu_context) {
			kref_init(&npu_context->kref);
			npu_context->mm = mm;
			npu_context->mn.ops = &nv_nmmu_notifier_ops;
			rc = __mmu_notifier_register(&npu_context->mn, mm);
		}

		if (rc) {
			kfree(npu_context);
			opal_npu_destroy_context(nphb->opal_id, mm->context.id,
					PCI_DEVID(gpdev->bus->number,
						gpdev->devfn));
			return ERR_PTR(rc);
		}

		mm->context.npu_context = npu_context;
	}

	npu_context->release_cb = cb;
	npu_context->priv = priv;

	/*
	 * npdev is a pci_dev pointer setup by the PCI code. We assign it to
	 * npdev[][] to indicate to the mmu notifiers that an invalidation
	 * should also be sent over this nvlink. The notifiers don't use any
	 * other fields in npu_context, so we just need to ensure that when they
	 * dereference npu_context->npdev[][] it is either a valid pointer or
	 * NULL.
	 */
	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);

	if (!nphb->npu.nmmu_flush) {
		/*
		 * If we're not explicitly flushing ourselves we need to mark
		 * the thread for global flushes
		 */
		npu_context->nmmu_flush = false;
		mm_context_add_copro(mm);
	} else
		npu_context->nmmu_flush = true;

	return npu_context;
}
EXPORT_SYMBOL(pnv_npu2_init_context);
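
/*
 * Usage sketch (an assumption, not taken from this file): a GPU driver
 * would pair init/destroy around the lifetime of its user context,
 * roughly:
 *
 *	down_write(&current->mm->mmap_sem);
 *	ctx = pnv_npu2_init_context(gpdev, flags, my_release_cb, my_priv);
 *	up_write(&current->mm->mmap_sem);
 *	if (IS_ERR(ctx))
 *		return PTR_ERR(ctx);
 *	...
 *	(service GPU faults via pnv_npu2_handle_fault(ctx, ...))
 *	...
 *	pnv_npu2_destroy_context(ctx, gpdev);
 *
 * my_release_cb and my_priv are hypothetical driver callback/cookie names.
 */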

static void pnv_npu2_release_context(struct kref *kref)
{
	struct npu_context *npu_context =
		container_of(kref, struct npu_context, kref);

	if (!npu_context->nmmu_flush)
		mm_context_remove_copro(npu_context->mm);

	npu_context->mm->context.npu_context = NULL;
}

/*
 * Destroy a context on the given GPU. May free the npu_context if it is no
 * longer active on any GPUs. Must not be called from interrupt context.
 */
void pnv_npu2_destroy_context(struct npu_context *npu_context,
			struct pci_dev *gpdev)
{
	int removed;
	struct pnv_phb *nphb;
	struct npu *npu;
	struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
	struct device_node *nvlink_dn;
	u32 nvlink_index;

	if (WARN_ON(!npdev))
		return;

	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return;

	nphb = pci_bus_to_host(npdev->bus)->private_data;
	npu = &nphb->npu;
	nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
	if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
							&nvlink_index)))
		return;

	WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
	opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
				PCI_DEVID(gpdev->bus->number, gpdev->devfn));
	spin_lock(&npu_context_lock);
	removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
	spin_unlock(&npu_context_lock);

	/*
	 * We need to do this outside of pnv_npu2_release_context so that it is
	 * outside the spinlock, as mmu_notifier_unregister uses SRCU.
	 */
	if (removed) {
		mmu_notifier_unregister(&npu_context->mn,
					npu_context->mm);

		kfree(npu_context);
	}
}
EXPORT_SYMBOL(pnv_npu2_destroy_context);

/*
 * Assumes mmap_sem is held for the context's associated mm.
 */
int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
			unsigned long *flags, unsigned long *status, int count)
{
	u64 rc = 0, result = 0;
	int i, is_write;
	struct page *page[1];

	/* mmap_sem should be held so the mm_struct must be present */
	struct mm_struct *mm = context->mm;

	if (!firmware_has_feature(FW_FEATURE_OPAL))
		return -ENODEV;

	WARN_ON(!rwsem_is_locked(&mm->mmap_sem));

	for (i = 0; i < count; i++) {
		is_write = flags[i] & NPU2_WRITE;
		rc = get_user_pages_remote(NULL, mm, ea[i], 1,
					is_write ? FOLL_WRITE : 0,
					page, NULL, NULL);

		/*
		 * To support virtualised environments we will have to do an
		 * access to the page to ensure it gets faulted into the
		 * hypervisor. For the moment virtualisation is not supported in
		 * other areas so leave the access out.
		 */
		if (rc != 1) {
			status[i] = rc;
			result = -EFAULT;
			continue;
		}

		status[i] = 0;
		put_page(page[0]);
	}

	return result;
}
EXPORT_SYMBOL(pnv_npu2_handle_fault);
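
/*
 * Illustrative call (hypothetical values): to fault in two effective
 * addresses reported by the GPU, a driver might do
 *
 *	uintptr_t ea[2] = { fault_ea0, fault_ea1 };
 *	unsigned long flags[2] = { NPU2_WRITE, 0 };
 *	unsigned long status[2];
 *
 *	down_read(&mm->mmap_sem);
 *	rc = pnv_npu2_handle_fault(ctx, ea, flags, status, 2);
 *	up_read(&mm->mmap_sem);
 *
 * status[i] is zero for each address that faulted in successfully, and
 * the function returns -EFAULT if any address could not be resolved.
 */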

int pnv_npu2_init(struct pnv_phb *phb)
{
	unsigned int i;
	u64 mmio_atsd;
	struct device_node *dn;
	struct pci_dev *gpdev;
	static int npu_index;
	uint64_t rc = 0;

	phb->npu.nmmu_flush =
		of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
	for_each_child_of_node(phb->hose->dn, dn) {
		gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
		if (gpdev) {
			rc = opal_npu_map_lpar(phb->opal_id,
				PCI_DEVID(gpdev->bus->number, gpdev->devfn),
				0, 0);
			if (rc)
				dev_err(&gpdev->dev,
					"Error %lld mapping device to LPAR\n",
					rc);
		}
	}

	for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
							i, &mmio_atsd); i++)
		phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);

	pr_info("NPU%lld: Found %d MMIO ATSD registers\n", phb->opal_id, i);
	phb->npu.mmio_atsd_count = i;
	phb->npu.mmio_atsd_usage = 0;
	npu_index++;
	if (WARN_ON(npu_index >= NV_MAX_NPUS))
		return -ENOSPC;
	max_npu2_index = npu_index;
	phb->npu.index = npu_index;

	return 0;
}