npu-dma.c 24 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933
  1. /*
  2. * This file implements the DMA operations for NVLink devices. The NPU
  3. * devices all point to the same iommu table as the parent PCI device.
  4. *
  5. * Copyright Alistair Popple, IBM Corporation 2015.
  6. *
  7. * This program is free software; you can redistribute it and/or
  8. * modify it under the terms of version 2 of the GNU General Public
  9. * License as published by the Free Software Foundation.
  10. */
  11. #include <linux/slab.h>
  12. #include <linux/mmu_notifier.h>
  13. #include <linux/mmu_context.h>
  14. #include <linux/of.h>
  15. #include <linux/export.h>
  16. #include <linux/pci.h>
  17. #include <linux/memblock.h>
  18. #include <linux/iommu.h>
  19. #include <linux/sizes.h>
  20. #include <asm/debugfs.h>
  21. #include <asm/tlb.h>
  22. #include <asm/powernv.h>
  23. #include <asm/reg.h>
  24. #include <asm/opal.h>
  25. #include <asm/io.h>
  26. #include <asm/iommu.h>
  27. #include <asm/pnv-pci.h>
  28. #include <asm/msi_bitmap.h>
  29. #include <asm/opal.h>
  30. #include "powernv.h"
  31. #include "pci.h"
  /* Recover the struct pnv_phb that embeds the given struct npu. */
  #define npu_to_phb(npu_ptr) container_of(npu_ptr, struct pnv_phb, npu)

  /*
   * Taken while looking up an mm's existing npu_context and adjusting its
   * reference count, so that context initialisation for a particular
   * mm_struct does not race with the final put.
   */
  static DEFINE_SPINLOCK(npu_context_lock);
  38. /*
  39. * Other types of TCE cache invalidation are not functional in the
  40. * hardware.
  41. */
  42. static struct pci_dev *get_pci_dev(struct device_node *dn)
  43. {
  44. struct pci_dn *pdn = PCI_DN(dn);
  45. return pci_get_domain_bus_and_slot(pci_domain_nr(pdn->phb->bus),
  46. pdn->busno, pdn->devfn);
  47. }
  48. /* Given a NPU device get the associated PCI device. */
  49. struct pci_dev *pnv_pci_get_gpu_dev(struct pci_dev *npdev)
  50. {
  51. struct device_node *dn;
  52. struct pci_dev *gpdev;
  53. if (WARN_ON(!npdev))
  54. return NULL;
  55. if (WARN_ON(!npdev->dev.of_node))
  56. return NULL;
  57. /* Get assoicated PCI device */
  58. dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0);
  59. if (!dn)
  60. return NULL;
  61. gpdev = get_pci_dev(dn);
  62. of_node_put(dn);
  63. return gpdev;
  64. }
  65. EXPORT_SYMBOL(pnv_pci_get_gpu_dev);
  66. /* Given the real PCI device get a linked NPU device. */
  67. struct pci_dev *pnv_pci_get_npu_dev(struct pci_dev *gpdev, int index)
  68. {
  69. struct device_node *dn;
  70. struct pci_dev *npdev;
  71. if (WARN_ON(!gpdev))
  72. return NULL;
  73. /* Not all PCI devices have device-tree nodes */
  74. if (!gpdev->dev.of_node)
  75. return NULL;
  76. /* Get assoicated PCI device */
  77. dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index);
  78. if (!dn)
  79. return NULL;
  80. npdev = get_pci_dev(dn);
  81. of_node_put(dn);
  82. return npdev;
  83. }
  84. EXPORT_SYMBOL(pnv_pci_get_npu_dev);
  85. /*
  86. * Returns the PE assoicated with the PCI device of the given
  87. * NPU. Returns the linked pci device if pci_dev != NULL.
  88. */
  89. static struct pnv_ioda_pe *get_gpu_pci_dev_and_pe(struct pnv_ioda_pe *npe,
  90. struct pci_dev **gpdev)
  91. {
  92. struct pnv_phb *phb;
  93. struct pci_controller *hose;
  94. struct pci_dev *pdev;
  95. struct pnv_ioda_pe *pe;
  96. struct pci_dn *pdn;
  97. pdev = pnv_pci_get_gpu_dev(npe->pdev);
  98. if (!pdev)
  99. return NULL;
  100. pdn = pci_get_pdn(pdev);
  101. if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
  102. return NULL;
  103. hose = pci_bus_to_host(pdev->bus);
  104. phb = hose->private_data;
  105. pe = &phb->ioda.pe_array[pdn->pe_number];
  106. if (gpdev)
  107. *gpdev = pdev;
  108. return pe;
  109. }
  110. long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num,
  111. struct iommu_table *tbl)
  112. {
  113. struct pnv_phb *phb = npe->phb;
  114. int64_t rc;
  115. const unsigned long size = tbl->it_indirect_levels ?
  116. tbl->it_level_size : tbl->it_size;
  117. const __u64 start_addr = tbl->it_offset << tbl->it_page_shift;
  118. const __u64 win_size = tbl->it_size << tbl->it_page_shift;
  119. pe_info(npe, "Setting up window %llx..%llx pg=%lx\n",
  120. start_addr, start_addr + win_size - 1,
  121. IOMMU_PAGE_SIZE(tbl));
  122. rc = opal_pci_map_pe_dma_window(phb->opal_id,
  123. npe->pe_number,
  124. npe->pe_number,
  125. tbl->it_indirect_levels + 1,
  126. __pa(tbl->it_base),
  127. size << 3,
  128. IOMMU_PAGE_SIZE(tbl));
  129. if (rc) {
  130. pe_err(npe, "Failed to configure TCE table, err %lld\n", rc);
  131. return rc;
  132. }
  133. pnv_pci_ioda2_tce_invalidate_entire(phb, false);
  134. /* Add the table to the list so its TCE cache will get invalidated */
  135. pnv_pci_link_table_and_group(phb->hose->node, num,
  136. tbl, &npe->table_group);
  137. return 0;
  138. }
  139. long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num)
  140. {
  141. struct pnv_phb *phb = npe->phb;
  142. int64_t rc;
  143. pe_info(npe, "Removing DMA window\n");
  144. rc = opal_pci_map_pe_dma_window(phb->opal_id, npe->pe_number,
  145. npe->pe_number,
  146. 0/* levels */, 0/* table address */,
  147. 0/* table size */, 0/* page size */);
  148. if (rc) {
  149. pe_err(npe, "Unmapping failed, ret = %lld\n", rc);
  150. return rc;
  151. }
  152. pnv_pci_ioda2_tce_invalidate_entire(phb, false);
  153. pnv_pci_unlink_table_and_group(npe->table_group.tables[num],
  154. &npe->table_group);
  155. return 0;
  156. }
  157. /*
  158. * Enables 32 bit DMA on NPU.
  159. */
  160. static void pnv_npu_dma_set_32(struct pnv_ioda_pe *npe)
  161. {
  162. struct pci_dev *gpdev;
  163. struct pnv_ioda_pe *gpe;
  164. int64_t rc;
  165. /*
  166. * Find the assoicated PCI devices and get the dma window
  167. * information from there.
  168. */
  169. if (!npe->pdev || !(npe->flags & PNV_IODA_PE_DEV))
  170. return;
  171. gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
  172. if (!gpe)
  173. return;
  174. rc = pnv_npu_set_window(npe, 0, gpe->table_group.tables[0]);
  175. /*
  176. * NVLink devices use the same TCE table configuration as
  177. * their parent device so drivers shouldn't be doing DMA
  178. * operations directly on these devices.
  179. */
  180. set_dma_ops(&npe->pdev->dev, NULL);
  181. }
  182. /*
  183. * Enables bypass mode on the NPU. The NPU only supports one
  184. * window per link, so bypass needs to be explicitly enabled or
  185. * disabled. Unlike for a PHB3 bypass and non-bypass modes can't be
  186. * active at the same time.
  187. */
  188. static int pnv_npu_dma_set_bypass(struct pnv_ioda_pe *npe)
  189. {
  190. struct pnv_phb *phb = npe->phb;
  191. int64_t rc = 0;
  192. phys_addr_t top = memblock_end_of_DRAM();
  193. if (phb->type != PNV_PHB_NPU_NVLINK || !npe->pdev)
  194. return -EINVAL;
  195. rc = pnv_npu_unset_window(npe, 0);
  196. if (rc != OPAL_SUCCESS)
  197. return rc;
  198. /* Enable the bypass window */
  199. top = roundup_pow_of_two(top);
  200. dev_info(&npe->pdev->dev, "Enabling bypass for PE %x\n",
  201. npe->pe_number);
  202. rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
  203. npe->pe_number, npe->pe_number,
  204. 0 /* bypass base */, top);
  205. if (rc == OPAL_SUCCESS)
  206. pnv_pci_ioda2_tce_invalidate_entire(phb, false);
  207. return rc;
  208. }
  209. void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass)
  210. {
  211. int i;
  212. struct pnv_phb *phb;
  213. struct pci_dn *pdn;
  214. struct pnv_ioda_pe *npe;
  215. struct pci_dev *npdev;
  216. for (i = 0; ; ++i) {
  217. npdev = pnv_pci_get_npu_dev(gpdev, i);
  218. if (!npdev)
  219. break;
  220. pdn = pci_get_pdn(npdev);
  221. if (WARN_ON(!pdn || pdn->pe_number == IODA_INVALID_PE))
  222. return;
  223. phb = pci_bus_to_host(npdev->bus)->private_data;
  224. /* We only do bypass if it's enabled on the linked device */
  225. npe = &phb->ioda.pe_array[pdn->pe_number];
  226. if (bypass) {
  227. dev_info(&npdev->dev,
  228. "Using 64-bit DMA iommu bypass\n");
  229. pnv_npu_dma_set_bypass(npe);
  230. } else {
  231. dev_info(&npdev->dev, "Using 32-bit DMA via iommu\n");
  232. pnv_npu_dma_set_32(npe);
  233. }
  234. }
  235. }
  236. /* Switch ownership from platform code to external user (e.g. VFIO) */
  237. void pnv_npu_take_ownership(struct pnv_ioda_pe *npe)
  238. {
  239. struct pnv_phb *phb = npe->phb;
  240. int64_t rc;
  241. /*
  242. * Note: NPU has just a single TVE in the hardware which means that
  243. * while used by the kernel, it can have either 32bit window or
  244. * DMA bypass but never both. So we deconfigure 32bit window only
  245. * if it was enabled at the moment of ownership change.
  246. */
  247. if (npe->table_group.tables[0]) {
  248. pnv_npu_unset_window(npe, 0);
  249. return;
  250. }
  251. /* Disable bypass */
  252. rc = opal_pci_map_pe_dma_window_real(phb->opal_id,
  253. npe->pe_number, npe->pe_number,
  254. 0 /* bypass base */, 0);
  255. if (rc) {
  256. pe_err(npe, "Failed to disable bypass, err %lld\n", rc);
  257. return;
  258. }
  259. pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false);
  260. }
  261. struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe)
  262. {
  263. struct pnv_phb *phb = npe->phb;
  264. struct pci_bus *pbus = phb->hose->bus;
  265. struct pci_dev *npdev, *gpdev = NULL, *gptmp;
  266. struct pnv_ioda_pe *gpe = get_gpu_pci_dev_and_pe(npe, &gpdev);
  267. if (!gpe || !gpdev)
  268. return NULL;
  269. list_for_each_entry(npdev, &pbus->devices, bus_list) {
  270. gptmp = pnv_pci_get_gpu_dev(npdev);
  271. if (gptmp != gpdev)
  272. continue;
  273. pe_info(gpe, "Attached NPU %s\n", dev_name(&npdev->dev));
  274. iommu_group_add_device(gpe->table_group.group, &npdev->dev);
  275. }
  276. return gpe;
  277. }
  /* Maximum number of nvlinks per npu; second dimension of npu_context.npdev */
  #define NV_MAX_LINKS 6

  /*
   * Highest NPU index handed out so far by pnv_npu2_init(). Always
   * < NV_MAX_NPUS. The ATSD helpers iterate from 0 up to and including this
   * value.
   */
  static int max_npu2_index;
  /*
   * Per-mm state used to shoot down GPU address translations. One instance is
   * hung off mm->context.npu_context by pnv_npu2_init_context() and shared by
   * every GPU the mm is attached to.
   */
  struct npu_context {
  	/* The address space this context belongs to */
  	struct mm_struct *mm;
  	/*
  	 * NPU PCI devices indexed by [npu index][nvlink index]. A non-NULL
  	 * entry means invalidations must also be sent over that link.
  	 * Accessed with READ_ONCE()/WRITE_ONCE().
  	 */
  	struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS];
  	/* MMU notifier registered on mm; see nv_nmmu_notifier_ops */
  	struct mmu_notifier mn;
  	/* One reference per successful pnv_npu2_init_context() call */
  	struct kref kref;
  	/* When true, mmio_invalidate() flushes the whole mm before the ATSD */
  	bool nmmu_flush;
  	/* Callback to stop translation requests on a given GPU */
  	void (*release_cb)(struct npu_context *context, void *priv);
  	/*
  	 * Private pointer passed to the above callback for usage by
  	 * device drivers.
  	 */
  	void *priv;
  };
  /*
   * An MMIO ATSD register claimed on one NPU for the duration of a single
   * invalidation. reg is the index returned by get_mmio_atsd_reg(), or a
   * negative value when no register is held for that NPU.
   */
  struct mmio_atsd_reg {
  	struct npu *npu;
  	int reg;
  };
  300. /*
  301. * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC
  302. * if none are available.
  303. */
  304. static int get_mmio_atsd_reg(struct npu *npu)
  305. {
  306. int i;
  307. for (i = 0; i < npu->mmio_atsd_count; i++) {
  308. if (!test_bit(i, &npu->mmio_atsd_usage))
  309. if (!test_and_set_bit_lock(i, &npu->mmio_atsd_usage))
  310. return i;
  311. }
  312. return -ENOSPC;
  313. }
  314. static void put_mmio_atsd_reg(struct npu *npu, int reg)
  315. {
  316. clear_bit_unlock(reg, &npu->mmio_atsd_usage);
  317. }
  /*
   * MMIO ATSD register offsets. They are added to the per-register base
   * pointer kept in npu->mmio_atsd_regs[].
   * NOTE(review): the step size is that of the pointer's element type
   * (presumably one 64-bit register) - confirm against struct npu.
   */
  #define XTS_ATSD_LAUNCH 0	/* written last to start an invalidation */
  #define XTS_ATSD_AVA 1	/* virtual address for range invalidations */
  #define XTS_ATSD_STAT 2	/* polled until it reads zero */
  322. static unsigned long get_atsd_launch_val(unsigned long pid, unsigned long psize)
  323. {
  324. unsigned long launch = 0;
  325. if (psize == MMU_PAGE_COUNT) {
  326. /* IS set to invalidate entire matching PID */
  327. launch |= PPC_BIT(12);
  328. } else {
  329. /* AP set to invalidate region of psize */
  330. launch |= (u64)mmu_get_ap(psize) << PPC_BITLSHIFT(17);
  331. }
  332. /* PRS set to process-scoped */
  333. launch |= PPC_BIT(13);
  334. /* PID */
  335. launch |= pid << PPC_BITLSHIFT(38);
  336. /* Leave "No flush" (bit 39) 0 so every ATSD performs a flush */
  337. return launch;
  338. }
  339. static void mmio_atsd_regs_write(struct mmio_atsd_reg
  340. mmio_atsd_reg[NV_MAX_NPUS], unsigned long offset,
  341. unsigned long val)
  342. {
  343. struct npu *npu;
  344. int i, reg;
  345. for (i = 0; i <= max_npu2_index; i++) {
  346. reg = mmio_atsd_reg[i].reg;
  347. if (reg < 0)
  348. continue;
  349. npu = mmio_atsd_reg[i].npu;
  350. __raw_writeq_be(val, npu->mmio_atsd_regs[reg] + offset);
  351. }
  352. }
  353. static void mmio_invalidate_pid(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS],
  354. unsigned long pid)
  355. {
  356. unsigned long launch = get_atsd_launch_val(pid, MMU_PAGE_COUNT);
  357. /* Invalidating the entire process doesn't use a va */
  358. mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
  359. }
  360. static void mmio_invalidate_range(struct mmio_atsd_reg
  361. mmio_atsd_reg[NV_MAX_NPUS], unsigned long pid,
  362. unsigned long start, unsigned long psize)
  363. {
  364. unsigned long launch = get_atsd_launch_val(pid, psize);
  365. /* Write all VAs first */
  366. mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_AVA, start);
  367. /* Issue one barrier for all address writes */
  368. eieio();
  369. /* Launch */
  370. mmio_atsd_regs_write(mmio_atsd_reg, XTS_ATSD_LAUNCH, launch);
  371. }
  /* Recover the npu_context that embeds the given mmu_notifier. */
  #define mn_to_npu_context(notifier) \
  	container_of(notifier, struct npu_context, mn)
  373. static void mmio_invalidate_wait(
  374. struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
  375. {
  376. struct npu *npu;
  377. int i, reg;
  378. /* Wait for all invalidations to complete */
  379. for (i = 0; i <= max_npu2_index; i++) {
  380. if (mmio_atsd_reg[i].reg < 0)
  381. continue;
  382. /* Wait for completion */
  383. npu = mmio_atsd_reg[i].npu;
  384. reg = mmio_atsd_reg[i].reg;
  385. while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT))
  386. cpu_relax();
  387. }
  388. }
  389. /*
  390. * Acquires all the address translation shootdown (ATSD) registers required to
  391. * launch an ATSD on all links this npu_context is active on.
  392. */
  393. static void acquire_atsd_reg(struct npu_context *npu_context,
  394. struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
  395. {
  396. int i, j;
  397. struct npu *npu;
  398. struct pci_dev *npdev;
  399. struct pnv_phb *nphb;
  400. for (i = 0; i <= max_npu2_index; i++) {
  401. mmio_atsd_reg[i].reg = -1;
  402. for (j = 0; j < NV_MAX_LINKS; j++) {
  403. /*
  404. * There are no ordering requirements with respect to
  405. * the setup of struct npu_context, but to ensure
  406. * consistent behaviour we need to ensure npdev[][] is
  407. * only read once.
  408. */
  409. npdev = READ_ONCE(npu_context->npdev[i][j]);
  410. if (!npdev)
  411. continue;
  412. nphb = pci_bus_to_host(npdev->bus)->private_data;
  413. npu = &nphb->npu;
  414. mmio_atsd_reg[i].npu = npu;
  415. mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
  416. while (mmio_atsd_reg[i].reg < 0) {
  417. mmio_atsd_reg[i].reg = get_mmio_atsd_reg(npu);
  418. cpu_relax();
  419. }
  420. break;
  421. }
  422. }
  423. }
  424. /*
  425. * Release previously acquired ATSD registers. To avoid deadlocks the registers
  426. * must be released in the same order they were acquired above in
  427. * acquire_atsd_reg.
  428. */
  429. static void release_atsd_reg(struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS])
  430. {
  431. int i;
  432. for (i = 0; i <= max_npu2_index; i++) {
  433. /*
  434. * We can't rely on npu_context->npdev[][] being the same here
  435. * as when acquire_atsd_reg() was called, hence we use the
  436. * values stored in mmio_atsd_reg during the acquire phase
  437. * rather than re-reading npdev[][].
  438. */
  439. if (mmio_atsd_reg[i].reg < 0)
  440. continue;
  441. put_mmio_atsd_reg(mmio_atsd_reg[i].npu, mmio_atsd_reg[i].reg);
  442. }
  443. }
  444. /*
  445. * Invalidate a virtual address range
  446. */
  447. static void mmio_invalidate(struct npu_context *npu_context,
  448. unsigned long start, unsigned long size)
  449. {
  450. struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS];
  451. unsigned long pid = npu_context->mm->context.id;
  452. unsigned long atsd_start = 0;
  453. unsigned long end = start + size - 1;
  454. int atsd_psize = MMU_PAGE_COUNT;
  455. /*
  456. * Convert the input range into one of the supported sizes. If the range
  457. * doesn't fit, use the next larger supported size. Invalidation latency
  458. * is high, so over-invalidation is preferred to issuing multiple
  459. * invalidates.
  460. *
  461. * A 4K page size isn't supported by NPU/GPU ATS, so that case is
  462. * ignored.
  463. */
  464. if (size == SZ_64K) {
  465. atsd_start = start;
  466. atsd_psize = MMU_PAGE_64K;
  467. } else if (ALIGN_DOWN(start, SZ_2M) == ALIGN_DOWN(end, SZ_2M)) {
  468. atsd_start = ALIGN_DOWN(start, SZ_2M);
  469. atsd_psize = MMU_PAGE_2M;
  470. } else if (ALIGN_DOWN(start, SZ_1G) == ALIGN_DOWN(end, SZ_1G)) {
  471. atsd_start = ALIGN_DOWN(start, SZ_1G);
  472. atsd_psize = MMU_PAGE_1G;
  473. }
  474. if (npu_context->nmmu_flush)
  475. /*
  476. * Unfortunately the nest mmu does not support flushing specific
  477. * addresses so we have to flush the whole mm once before
  478. * shooting down the GPU translation.
  479. */
  480. flush_all_mm(npu_context->mm);
  481. /*
  482. * Loop over all the NPUs this process is active on and launch
  483. * an invalidate.
  484. */
  485. acquire_atsd_reg(npu_context, mmio_atsd_reg);
  486. if (atsd_psize == MMU_PAGE_COUNT)
  487. mmio_invalidate_pid(mmio_atsd_reg, pid);
  488. else
  489. mmio_invalidate_range(mmio_atsd_reg, pid, atsd_start,
  490. atsd_psize);
  491. mmio_invalidate_wait(mmio_atsd_reg);
  492. /*
  493. * The GPU requires two flush ATSDs to ensure all entries have been
  494. * flushed. We use PID 0 as it will never be used for a process on the
  495. * GPU.
  496. */
  497. mmio_invalidate_pid(mmio_atsd_reg, 0);
  498. mmio_invalidate_wait(mmio_atsd_reg);
  499. mmio_invalidate_pid(mmio_atsd_reg, 0);
  500. mmio_invalidate_wait(mmio_atsd_reg);
  501. release_atsd_reg(mmio_atsd_reg);
  502. }
  503. static void pnv_npu2_mn_release(struct mmu_notifier *mn,
  504. struct mm_struct *mm)
  505. {
  506. struct npu_context *npu_context = mn_to_npu_context(mn);
  507. /* Call into device driver to stop requests to the NMMU */
  508. if (npu_context->release_cb)
  509. npu_context->release_cb(npu_context, npu_context->priv);
  510. /*
  511. * There should be no more translation requests for this PID, but we
  512. * need to ensure any entries for it are removed from the TLB.
  513. */
  514. mmio_invalidate(npu_context, 0, ~0UL);
  515. }
  516. static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn,
  517. struct mm_struct *mm,
  518. unsigned long address,
  519. pte_t pte)
  520. {
  521. struct npu_context *npu_context = mn_to_npu_context(mn);
  522. mmio_invalidate(npu_context, address, PAGE_SIZE);
  523. }
  /*
   * MMU notifier .invalidate_range hook: invalidate [start, end). The mm
   * argument is not used.
   */
  static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn,
  			struct mm_struct *mm,
  			unsigned long start, unsigned long end)
  {
  	const unsigned long len = end - start;

  	mmio_invalidate(mn_to_npu_context(mn), start, len);
  }
  531. static const struct mmu_notifier_ops nv_nmmu_notifier_ops = {
  532. .release = pnv_npu2_mn_release,
  533. .change_pte = pnv_npu2_mn_change_pte,
  534. .invalidate_range = pnv_npu2_mn_invalidate_range,
  535. };
  536. /*
  537. * Call into OPAL to setup the nmmu context for the current task in
  538. * the NPU. This must be called to setup the context tables before the
  539. * GPU issues ATRs. pdev should be a pointed to PCIe GPU device.
  540. *
  541. * A release callback should be registered to allow a device driver to
  542. * be notified that it should not launch any new translation requests
  543. * as the final TLB invalidate is about to occur.
  544. *
  545. * Returns an error if there no contexts are currently available or a
  546. * npu_context which should be passed to pnv_npu2_handle_fault().
  547. *
  548. * mmap_sem must be held in write mode and must not be called from interrupt
  549. * context.
  550. */
  551. struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev,
  552. unsigned long flags,
  553. void (*cb)(struct npu_context *, void *),
  554. void *priv)
  555. {
  556. int rc;
  557. u32 nvlink_index;
  558. struct device_node *nvlink_dn;
  559. struct mm_struct *mm = current->mm;
  560. struct pnv_phb *nphb;
  561. struct npu *npu;
  562. struct npu_context *npu_context;
  563. /*
  564. * At present we don't support GPUs connected to multiple NPUs and I'm
  565. * not sure the hardware does either.
  566. */
  567. struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
  568. if (!firmware_has_feature(FW_FEATURE_OPAL))
  569. return ERR_PTR(-ENODEV);
  570. if (!npdev)
  571. /* No nvlink associated with this GPU device */
  572. return ERR_PTR(-ENODEV);
  573. nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
  574. if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
  575. &nvlink_index)))
  576. return ERR_PTR(-ENODEV);
  577. if (!mm || mm->context.id == 0) {
  578. /*
  579. * Kernel thread contexts are not supported and context id 0 is
  580. * reserved on the GPU.
  581. */
  582. return ERR_PTR(-EINVAL);
  583. }
  584. nphb = pci_bus_to_host(npdev->bus)->private_data;
  585. npu = &nphb->npu;
  586. /*
  587. * Setup the NPU context table for a particular GPU. These need to be
  588. * per-GPU as we need the tables to filter ATSDs when there are no
  589. * active contexts on a particular GPU. It is safe for these to be
  590. * called concurrently with destroy as the OPAL call takes appropriate
  591. * locks and refcounts on init/destroy.
  592. */
  593. rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags,
  594. PCI_DEVID(gpdev->bus->number, gpdev->devfn));
  595. if (rc < 0)
  596. return ERR_PTR(-ENOSPC);
  597. /*
  598. * We store the npu pci device so we can more easily get at the
  599. * associated npus.
  600. */
  601. spin_lock(&npu_context_lock);
  602. npu_context = mm->context.npu_context;
  603. if (npu_context) {
  604. if (npu_context->release_cb != cb ||
  605. npu_context->priv != priv) {
  606. spin_unlock(&npu_context_lock);
  607. opal_npu_destroy_context(nphb->opal_id, mm->context.id,
  608. PCI_DEVID(gpdev->bus->number,
  609. gpdev->devfn));
  610. return ERR_PTR(-EINVAL);
  611. }
  612. WARN_ON(!kref_get_unless_zero(&npu_context->kref));
  613. }
  614. spin_unlock(&npu_context_lock);
  615. if (!npu_context) {
  616. /*
  617. * We can set up these fields without holding the
  618. * npu_context_lock as the npu_context hasn't been returned to
  619. * the caller meaning it can't be destroyed. Parallel allocation
  620. * is protected against by mmap_sem.
  621. */
  622. rc = -ENOMEM;
  623. npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL);
  624. if (npu_context) {
  625. kref_init(&npu_context->kref);
  626. npu_context->mm = mm;
  627. npu_context->mn.ops = &nv_nmmu_notifier_ops;
  628. rc = __mmu_notifier_register(&npu_context->mn, mm);
  629. }
  630. if (rc) {
  631. kfree(npu_context);
  632. opal_npu_destroy_context(nphb->opal_id, mm->context.id,
  633. PCI_DEVID(gpdev->bus->number,
  634. gpdev->devfn));
  635. return ERR_PTR(rc);
  636. }
  637. mm->context.npu_context = npu_context;
  638. }
  639. npu_context->release_cb = cb;
  640. npu_context->priv = priv;
  641. /*
  642. * npdev is a pci_dev pointer setup by the PCI code. We assign it to
  643. * npdev[][] to indicate to the mmu notifiers that an invalidation
  644. * should also be sent over this nvlink. The notifiers don't use any
  645. * other fields in npu_context, so we just need to ensure that when they
  646. * deference npu_context->npdev[][] it is either a valid pointer or
  647. * NULL.
  648. */
  649. WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], npdev);
  650. if (!nphb->npu.nmmu_flush) {
  651. /*
  652. * If we're not explicitly flushing ourselves we need to mark
  653. * the thread for global flushes
  654. */
  655. npu_context->nmmu_flush = false;
  656. mm_context_add_copro(mm);
  657. } else
  658. npu_context->nmmu_flush = true;
  659. return npu_context;
  660. }
  661. EXPORT_SYMBOL(pnv_npu2_init_context);
  662. static void pnv_npu2_release_context(struct kref *kref)
  663. {
  664. struct npu_context *npu_context =
  665. container_of(kref, struct npu_context, kref);
  666. if (!npu_context->nmmu_flush)
  667. mm_context_remove_copro(npu_context->mm);
  668. npu_context->mm->context.npu_context = NULL;
  669. }
  670. /*
  671. * Destroy a context on the given GPU. May free the npu_context if it is no
  672. * longer active on any GPUs. Must not be called from interrupt context.
  673. */
  674. void pnv_npu2_destroy_context(struct npu_context *npu_context,
  675. struct pci_dev *gpdev)
  676. {
  677. int removed;
  678. struct pnv_phb *nphb;
  679. struct npu *npu;
  680. struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0);
  681. struct device_node *nvlink_dn;
  682. u32 nvlink_index;
  683. if (WARN_ON(!npdev))
  684. return;
  685. if (!firmware_has_feature(FW_FEATURE_OPAL))
  686. return;
  687. nphb = pci_bus_to_host(npdev->bus)->private_data;
  688. npu = &nphb->npu;
  689. nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0);
  690. if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index",
  691. &nvlink_index)))
  692. return;
  693. WRITE_ONCE(npu_context->npdev[npu->index][nvlink_index], NULL);
  694. opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id,
  695. PCI_DEVID(gpdev->bus->number, gpdev->devfn));
  696. spin_lock(&npu_context_lock);
  697. removed = kref_put(&npu_context->kref, pnv_npu2_release_context);
  698. spin_unlock(&npu_context_lock);
  699. /*
  700. * We need to do this outside of pnv_npu2_release_context so that it is
  701. * outside the spinlock as mmu_notifier_destroy uses SRCU.
  702. */
  703. if (removed) {
  704. mmu_notifier_unregister(&npu_context->mn,
  705. npu_context->mm);
  706. kfree(npu_context);
  707. }
  708. }
  709. EXPORT_SYMBOL(pnv_npu2_destroy_context);
  710. /*
  711. * Assumes mmap_sem is held for the contexts associated mm.
  712. */
  713. int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea,
  714. unsigned long *flags, unsigned long *status, int count)
  715. {
  716. u64 rc = 0, result = 0;
  717. int i, is_write;
  718. struct page *page[1];
  719. /* mmap_sem should be held so the struct_mm must be present */
  720. struct mm_struct *mm = context->mm;
  721. if (!firmware_has_feature(FW_FEATURE_OPAL))
  722. return -ENODEV;
  723. WARN_ON(!rwsem_is_locked(&mm->mmap_sem));
  724. for (i = 0; i < count; i++) {
  725. is_write = flags[i] & NPU2_WRITE;
  726. rc = get_user_pages_remote(NULL, mm, ea[i], 1,
  727. is_write ? FOLL_WRITE : 0,
  728. page, NULL, NULL);
  729. /*
  730. * To support virtualised environments we will have to do an
  731. * access to the page to ensure it gets faulted into the
  732. * hypervisor. For the moment virtualisation is not supported in
  733. * other areas so leave the access out.
  734. */
  735. if (rc != 1) {
  736. status[i] = rc;
  737. result = -EFAULT;
  738. continue;
  739. }
  740. status[i] = 0;
  741. put_page(page[0]);
  742. }
  743. return result;
  744. }
  745. EXPORT_SYMBOL(pnv_npu2_handle_fault);
  746. int pnv_npu2_init(struct pnv_phb *phb)
  747. {
  748. unsigned int i;
  749. u64 mmio_atsd;
  750. struct device_node *dn;
  751. struct pci_dev *gpdev;
  752. static int npu_index;
  753. uint64_t rc = 0;
  754. phb->npu.nmmu_flush =
  755. of_property_read_bool(phb->hose->dn, "ibm,nmmu-flush");
  756. for_each_child_of_node(phb->hose->dn, dn) {
  757. gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn));
  758. if (gpdev) {
  759. rc = opal_npu_map_lpar(phb->opal_id,
  760. PCI_DEVID(gpdev->bus->number, gpdev->devfn),
  761. 0, 0);
  762. if (rc)
  763. dev_err(&gpdev->dev,
  764. "Error %lld mapping device to LPAR\n",
  765. rc);
  766. }
  767. }
  768. for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd",
  769. i, &mmio_atsd); i++)
  770. phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32);
  771. pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i);
  772. phb->npu.mmio_atsd_count = i;
  773. phb->npu.mmio_atsd_usage = 0;
  774. npu_index++;
  775. if (WARN_ON(npu_index >= NV_MAX_NPUS))
  776. return -ENOSPC;
  777. max_npu2_index = npu_index;
  778. phb->npu.index = npu_index;
  779. return 0;
  780. }