virtio_ring.c 33 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080108110821083108410851086108710881089109010911092109310941095109610971098109911001101110211031104110511061107110811091110111111121113111411151116111711181119112011211122112311241125112611271128112911301131113211331134113511361137113811391140114111421143114411451146114711481149115011511152115311541155115611571158115911601161116211631164116511661167116811691170117111721173117411751176117711781179118011811182118311841185118611871188118911901191119211931194119511961197119811991200120112021203
  1. /* Virtio ring implementation.
  2. *
  3. * Copyright 2007 Rusty Russell IBM Corporation
  4. *
  5. * This program is free software; you can redistribute it and/or modify
  6. * it under the terms of the GNU General Public License as published by
  7. * the Free Software Foundation; either version 2 of the License, or
  8. * (at your option) any later version.
  9. *
  10. * This program is distributed in the hope that it will be useful,
  11. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  12. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  13. * GNU General Public License for more details.
  14. *
  15. * You should have received a copy of the GNU General Public License
  16. * along with this program; if not, write to the Free Software
  17. * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
  18. */
  19. #include <linux/virtio.h>
  20. #include <linux/virtio_ring.h>
  21. #include <linux/virtio_config.h>
  22. #include <linux/device.h>
  23. #include <linux/slab.h>
  24. #include <linux/module.h>
  25. #include <linux/hrtimer.h>
  26. #include <linux/kmemleak.h>
  27. #include <linux/dma-mapping.h>
  28. #include <xen/xen.h>
  29. #ifdef DEBUG
  30. /* For development, we want to crash whenever the ring is screwed. */
  31. #define BAD_RING(_vq, fmt, args...) \
  32. do { \
  33. dev_err(&(_vq)->vq.vdev->dev, \
  34. "%s:"fmt, (_vq)->vq.name, ##args); \
  35. BUG(); \
  36. } while (0)
  37. /* Caller is supposed to guarantee no reentry. */
  38. #define START_USE(_vq) \
  39. do { \
  40. if ((_vq)->in_use) \
  41. panic("%s:in_use = %i\n", \
  42. (_vq)->vq.name, (_vq)->in_use); \
  43. (_vq)->in_use = __LINE__; \
  44. } while (0)
  45. #define END_USE(_vq) \
  46. do { BUG_ON(!(_vq)->in_use); (_vq)->in_use = 0; } while(0)
  47. #else
  48. #define BAD_RING(_vq, fmt, args...) \
  49. do { \
  50. dev_err(&_vq->vq.vdev->dev, \
  51. "%s:"fmt, (_vq)->vq.name, ##args); \
  52. (_vq)->broken = true; \
  53. } while (0)
  54. #define START_USE(vq)
  55. #define END_USE(vq)
  56. #endif
  57. struct vring_desc_state {
  58. void *data; /* Data for callback. */
  59. struct vring_desc *indir_desc; /* Indirect descriptor, if any. */
  60. };
  61. struct vring_virtqueue {
  62. struct virtqueue vq;
  63. /* Actual memory layout for this queue */
  64. struct vring vring;
  65. /* Can we use weak barriers? */
  66. bool weak_barriers;
  67. /* Other side has made a mess, don't try any more. */
  68. bool broken;
  69. /* Host supports indirect buffers */
  70. bool indirect;
  71. /* Host publishes avail event idx */
  72. bool event;
  73. /* Head of free buffer list. */
  74. unsigned int free_head;
  75. /* Number we've added since last sync. */
  76. unsigned int num_added;
  77. /* Last used index we've seen. */
  78. u16 last_used_idx;
  79. /* Last written value to avail->flags */
  80. u16 avail_flags_shadow;
  81. /* Last written value to avail->idx in guest byte order */
  82. u16 avail_idx_shadow;
  83. /* How to notify other side. FIXME: commonalize hcalls! */
  84. bool (*notify)(struct virtqueue *vq);
  85. /* DMA, allocation, and size information */
  86. bool we_own_ring;
  87. size_t queue_size_in_bytes;
  88. dma_addr_t queue_dma_addr;
  89. #ifdef DEBUG
  90. /* They're supposed to lock for us. */
  91. unsigned int in_use;
  92. /* Figure out if their kicks are too delayed. */
  93. bool last_add_time_valid;
  94. ktime_t last_add_time;
  95. #endif
  96. /* Per-descriptor state. */
  97. struct vring_desc_state desc_state[];
  98. };
  99. #define to_vvq(_vq) container_of(_vq, struct vring_virtqueue, vq)
  100. /*
  101. * Modern virtio devices have feature bits to specify whether they need a
  102. * quirk and bypass the IOMMU. If not there, just use the DMA API.
  103. *
  104. * If there, the interaction between virtio and DMA API is messy.
  105. *
  106. * On most systems with virtio, physical addresses match bus addresses,
  107. * and it doesn't particularly matter whether we use the DMA API.
  108. *
  109. * On some systems, including Xen and any system with a physical device
  110. * that speaks virtio behind a physical IOMMU, we must use the DMA API
  111. * for virtio DMA to work at all.
  112. *
  113. * On other systems, including SPARC and PPC64, virtio-pci devices are
  114. * enumerated as though they are behind an IOMMU, but the virtio host
  115. * ignores the IOMMU, so we must either pretend that the IOMMU isn't
  116. * there or somehow map everything as the identity.
  117. *
  118. * For the time being, we preserve historic behavior and bypass the DMA
  119. * API.
  120. *
  121. * TODO: install a per-device DMA ops structure that does the right thing
  122. * taking into account all the above quirks, and use the DMA API
  123. * unconditionally on data path.
  124. */
  125. static bool vring_use_dma_api(struct virtio_device *vdev)
  126. {
  127. if (!virtio_has_iommu_quirk(vdev))
  128. return true;
  129. /* Otherwise, we are left to guess. */
  130. /*
  131. * In theory, it's possible to have a buggy QEMU-supposed
  132. * emulated Q35 IOMMU and Xen enabled at the same time. On
  133. * such a configuration, virtio has never worked and will
  134. * not work without an even larger kludge. Instead, enable
  135. * the DMA API if we're a Xen guest, which at least allows
  136. * all of the sensible Xen configurations to work correctly.
  137. */
  138. if (xen_domain())
  139. return true;
  140. return false;
  141. }
  142. /*
  143. * The DMA ops on various arches are rather gnarly right now, and
  144. * making all of the arch DMA ops work on the vring device itself
  145. * is a mess. For now, we use the parent device for DMA ops.
  146. */
  147. struct device *vring_dma_dev(const struct vring_virtqueue *vq)
  148. {
  149. return vq->vq.vdev->dev.parent;
  150. }
  151. /* Map one sg entry. */
  152. static dma_addr_t vring_map_one_sg(const struct vring_virtqueue *vq,
  153. struct scatterlist *sg,
  154. enum dma_data_direction direction)
  155. {
  156. if (!vring_use_dma_api(vq->vq.vdev))
  157. return (dma_addr_t)sg_phys(sg);
  158. /*
  159. * We can't use dma_map_sg, because we don't use scatterlists in
  160. * the way it expects (we don't guarantee that the scatterlist
  161. * will exist for the lifetime of the mapping).
  162. */
  163. return dma_map_page(vring_dma_dev(vq),
  164. sg_page(sg), sg->offset, sg->length,
  165. direction);
  166. }
  167. static dma_addr_t vring_map_single(const struct vring_virtqueue *vq,
  168. void *cpu_addr, size_t size,
  169. enum dma_data_direction direction)
  170. {
  171. if (!vring_use_dma_api(vq->vq.vdev))
  172. return (dma_addr_t)virt_to_phys(cpu_addr);
  173. return dma_map_single(vring_dma_dev(vq),
  174. cpu_addr, size, direction);
  175. }
  176. static void vring_unmap_one(const struct vring_virtqueue *vq,
  177. struct vring_desc *desc)
  178. {
  179. u16 flags;
  180. if (!vring_use_dma_api(vq->vq.vdev))
  181. return;
  182. flags = virtio16_to_cpu(vq->vq.vdev, desc->flags);
  183. if (flags & VRING_DESC_F_INDIRECT) {
  184. dma_unmap_single(vring_dma_dev(vq),
  185. virtio64_to_cpu(vq->vq.vdev, desc->addr),
  186. virtio32_to_cpu(vq->vq.vdev, desc->len),
  187. (flags & VRING_DESC_F_WRITE) ?
  188. DMA_FROM_DEVICE : DMA_TO_DEVICE);
  189. } else {
  190. dma_unmap_page(vring_dma_dev(vq),
  191. virtio64_to_cpu(vq->vq.vdev, desc->addr),
  192. virtio32_to_cpu(vq->vq.vdev, desc->len),
  193. (flags & VRING_DESC_F_WRITE) ?
  194. DMA_FROM_DEVICE : DMA_TO_DEVICE);
  195. }
  196. }
  197. static int vring_mapping_error(const struct vring_virtqueue *vq,
  198. dma_addr_t addr)
  199. {
  200. if (!vring_use_dma_api(vq->vq.vdev))
  201. return 0;
  202. return dma_mapping_error(vring_dma_dev(vq), addr);
  203. }
  204. static struct vring_desc *alloc_indirect(struct virtqueue *_vq,
  205. unsigned int total_sg, gfp_t gfp)
  206. {
  207. struct vring_desc *desc;
  208. unsigned int i;
  209. /*
  210. * We require lowmem mappings for the descriptors because
  211. * otherwise virt_to_phys will give us bogus addresses in the
  212. * virtqueue.
  213. */
  214. gfp &= ~__GFP_HIGHMEM;
  215. desc = kmalloc(total_sg * sizeof(struct vring_desc), gfp);
  216. if (!desc)
  217. return NULL;
  218. for (i = 0; i < total_sg; i++)
  219. desc[i].next = cpu_to_virtio16(_vq->vdev, i + 1);
  220. return desc;
  221. }
  222. static inline int virtqueue_add(struct virtqueue *_vq,
  223. struct scatterlist *sgs[],
  224. unsigned int total_sg,
  225. unsigned int out_sgs,
  226. unsigned int in_sgs,
  227. void *data,
  228. gfp_t gfp)
  229. {
  230. struct vring_virtqueue *vq = to_vvq(_vq);
  231. struct scatterlist *sg;
  232. struct vring_desc *desc;
  233. unsigned int i, n, avail, descs_used, uninitialized_var(prev), err_idx;
  234. int head;
  235. bool indirect;
  236. START_USE(vq);
  237. BUG_ON(data == NULL);
  238. if (unlikely(vq->broken)) {
  239. END_USE(vq);
  240. return -EIO;
  241. }
  242. #ifdef DEBUG
  243. {
  244. ktime_t now = ktime_get();
  245. /* No kick or get, with .1 second between? Warn. */
  246. if (vq->last_add_time_valid)
  247. WARN_ON(ktime_to_ms(ktime_sub(now, vq->last_add_time))
  248. > 100);
  249. vq->last_add_time = now;
  250. vq->last_add_time_valid = true;
  251. }
  252. #endif
  253. BUG_ON(total_sg > vq->vring.num);
  254. BUG_ON(total_sg == 0);
  255. head = vq->free_head;
  256. /* If the host supports indirect descriptor tables, and we have multiple
  257. * buffers, then go indirect. FIXME: tune this threshold */
  258. if (vq->indirect && total_sg > 1 && vq->vq.num_free)
  259. desc = alloc_indirect(_vq, total_sg, gfp);
  260. else
  261. desc = NULL;
  262. if (desc) {
  263. /* Use a single buffer which doesn't continue */
  264. indirect = true;
  265. /* Set up rest to use this indirect table. */
  266. i = 0;
  267. descs_used = 1;
  268. } else {
  269. indirect = false;
  270. desc = vq->vring.desc;
  271. i = head;
  272. descs_used = total_sg;
  273. }
  274. if (vq->vq.num_free < descs_used) {
  275. pr_debug("Can't add buf len %i - avail = %i\n",
  276. descs_used, vq->vq.num_free);
  277. /* FIXME: for historical reasons, we force a notify here if
  278. * there are outgoing parts to the buffer. Presumably the
  279. * host should service the ring ASAP. */
  280. if (out_sgs)
  281. vq->notify(&vq->vq);
  282. if (indirect)
  283. kfree(desc);
  284. END_USE(vq);
  285. return -ENOSPC;
  286. }
  287. for (n = 0; n < out_sgs; n++) {
  288. for (sg = sgs[n]; sg; sg = sg_next(sg)) {
  289. dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_TO_DEVICE);
  290. if (vring_mapping_error(vq, addr))
  291. goto unmap_release;
  292. desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT);
  293. desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
  294. desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
  295. prev = i;
  296. i = virtio16_to_cpu(_vq->vdev, desc[i].next);
  297. }
  298. }
  299. for (; n < (out_sgs + in_sgs); n++) {
  300. for (sg = sgs[n]; sg; sg = sg_next(sg)) {
  301. dma_addr_t addr = vring_map_one_sg(vq, sg, DMA_FROM_DEVICE);
  302. if (vring_mapping_error(vq, addr))
  303. goto unmap_release;
  304. desc[i].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_NEXT | VRING_DESC_F_WRITE);
  305. desc[i].addr = cpu_to_virtio64(_vq->vdev, addr);
  306. desc[i].len = cpu_to_virtio32(_vq->vdev, sg->length);
  307. prev = i;
  308. i = virtio16_to_cpu(_vq->vdev, desc[i].next);
  309. }
  310. }
  311. /* Last one doesn't continue. */
  312. desc[prev].flags &= cpu_to_virtio16(_vq->vdev, ~VRING_DESC_F_NEXT);
  313. if (indirect) {
  314. /* Now that the indirect table is filled in, map it. */
  315. dma_addr_t addr = vring_map_single(
  316. vq, desc, total_sg * sizeof(struct vring_desc),
  317. DMA_TO_DEVICE);
  318. if (vring_mapping_error(vq, addr))
  319. goto unmap_release;
  320. vq->vring.desc[head].flags = cpu_to_virtio16(_vq->vdev, VRING_DESC_F_INDIRECT);
  321. vq->vring.desc[head].addr = cpu_to_virtio64(_vq->vdev, addr);
  322. vq->vring.desc[head].len = cpu_to_virtio32(_vq->vdev, total_sg * sizeof(struct vring_desc));
  323. }
  324. /* We're using some buffers from the free list. */
  325. vq->vq.num_free -= descs_used;
  326. /* Update free pointer */
  327. if (indirect)
  328. vq->free_head = virtio16_to_cpu(_vq->vdev, vq->vring.desc[head].next);
  329. else
  330. vq->free_head = i;
  331. /* Store token and indirect buffer state. */
  332. vq->desc_state[head].data = data;
  333. if (indirect)
  334. vq->desc_state[head].indir_desc = desc;
  335. /* Put entry in available array (but don't update avail->idx until they
  336. * do sync). */
  337. avail = vq->avail_idx_shadow & (vq->vring.num - 1);
  338. vq->vring.avail->ring[avail] = cpu_to_virtio16(_vq->vdev, head);
  339. /* Descriptors and available array need to be set before we expose the
  340. * new available array entries. */
  341. virtio_wmb(vq->weak_barriers);
  342. vq->avail_idx_shadow++;
  343. vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
  344. vq->num_added++;
  345. pr_debug("Added buffer head %i to %p\n", head, vq);
  346. END_USE(vq);
  347. /* This is very unlikely, but theoretically possible. Kick
  348. * just in case. */
  349. if (unlikely(vq->num_added == (1 << 16) - 1))
  350. virtqueue_kick(_vq);
  351. return 0;
  352. unmap_release:
  353. err_idx = i;
  354. i = head;
  355. for (n = 0; n < total_sg; n++) {
  356. if (i == err_idx)
  357. break;
  358. vring_unmap_one(vq, &desc[i]);
  359. i = vq->vring.desc[i].next;
  360. }
  361. vq->vq.num_free += total_sg;
  362. if (indirect)
  363. kfree(desc);
  364. END_USE(vq);
  365. return -EIO;
  366. }
  367. /**
  368. * virtqueue_add_sgs - expose buffers to other end
  369. * @vq: the struct virtqueue we're talking about.
  370. * @sgs: array of terminated scatterlists.
  371. * @out_num: the number of scatterlists readable by other side
  372. * @in_num: the number of scatterlists which are writable (after readable ones)
  373. * @data: the token identifying the buffer.
  374. * @gfp: how to do memory allocations (if necessary).
  375. *
  376. * Caller must ensure we don't call this with other virtqueue operations
  377. * at the same time (except where noted).
  378. *
  379. * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
  380. */
  381. int virtqueue_add_sgs(struct virtqueue *_vq,
  382. struct scatterlist *sgs[],
  383. unsigned int out_sgs,
  384. unsigned int in_sgs,
  385. void *data,
  386. gfp_t gfp)
  387. {
  388. unsigned int i, total_sg = 0;
  389. /* Count them first. */
  390. for (i = 0; i < out_sgs + in_sgs; i++) {
  391. struct scatterlist *sg;
  392. for (sg = sgs[i]; sg; sg = sg_next(sg))
  393. total_sg++;
  394. }
  395. return virtqueue_add(_vq, sgs, total_sg, out_sgs, in_sgs, data, gfp);
  396. }
  397. EXPORT_SYMBOL_GPL(virtqueue_add_sgs);
  398. /**
  399. * virtqueue_add_outbuf - expose output buffers to other end
  400. * @vq: the struct virtqueue we're talking about.
  401. * @sg: scatterlist (must be well-formed and terminated!)
  402. * @num: the number of entries in @sg readable by other side
  403. * @data: the token identifying the buffer.
  404. * @gfp: how to do memory allocations (if necessary).
  405. *
  406. * Caller must ensure we don't call this with other virtqueue operations
  407. * at the same time (except where noted).
  408. *
  409. * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
  410. */
  411. int virtqueue_add_outbuf(struct virtqueue *vq,
  412. struct scatterlist *sg, unsigned int num,
  413. void *data,
  414. gfp_t gfp)
  415. {
  416. return virtqueue_add(vq, &sg, num, 1, 0, data, gfp);
  417. }
  418. EXPORT_SYMBOL_GPL(virtqueue_add_outbuf);
  419. /**
  420. * virtqueue_add_inbuf - expose input buffers to other end
  421. * @vq: the struct virtqueue we're talking about.
  422. * @sg: scatterlist (must be well-formed and terminated!)
  423. * @num: the number of entries in @sg writable by other side
  424. * @data: the token identifying the buffer.
  425. * @gfp: how to do memory allocations (if necessary).
  426. *
  427. * Caller must ensure we don't call this with other virtqueue operations
  428. * at the same time (except where noted).
  429. *
  430. * Returns zero or a negative error (ie. ENOSPC, ENOMEM, EIO).
  431. */
  432. int virtqueue_add_inbuf(struct virtqueue *vq,
  433. struct scatterlist *sg, unsigned int num,
  434. void *data,
  435. gfp_t gfp)
  436. {
  437. return virtqueue_add(vq, &sg, num, 0, 1, data, gfp);
  438. }
  439. EXPORT_SYMBOL_GPL(virtqueue_add_inbuf);
  440. /**
  441. * virtqueue_kick_prepare - first half of split virtqueue_kick call.
  442. * @vq: the struct virtqueue
  443. *
  444. * Instead of virtqueue_kick(), you can do:
  445. * if (virtqueue_kick_prepare(vq))
  446. * virtqueue_notify(vq);
  447. *
  448. * This is sometimes useful because the virtqueue_kick_prepare() needs
  449. * to be serialized, but the actual virtqueue_notify() call does not.
  450. */
  451. bool virtqueue_kick_prepare(struct virtqueue *_vq)
  452. {
  453. struct vring_virtqueue *vq = to_vvq(_vq);
  454. u16 new, old;
  455. bool needs_kick;
  456. START_USE(vq);
  457. /* We need to expose available array entries before checking avail
  458. * event. */
  459. virtio_mb(vq->weak_barriers);
  460. old = vq->avail_idx_shadow - vq->num_added;
  461. new = vq->avail_idx_shadow;
  462. vq->num_added = 0;
  463. #ifdef DEBUG
  464. if (vq->last_add_time_valid) {
  465. WARN_ON(ktime_to_ms(ktime_sub(ktime_get(),
  466. vq->last_add_time)) > 100);
  467. }
  468. vq->last_add_time_valid = false;
  469. #endif
  470. if (vq->event) {
  471. needs_kick = vring_need_event(virtio16_to_cpu(_vq->vdev, vring_avail_event(&vq->vring)),
  472. new, old);
  473. } else {
  474. needs_kick = !(vq->vring.used->flags & cpu_to_virtio16(_vq->vdev, VRING_USED_F_NO_NOTIFY));
  475. }
  476. END_USE(vq);
  477. return needs_kick;
  478. }
  479. EXPORT_SYMBOL_GPL(virtqueue_kick_prepare);
  480. /**
  481. * virtqueue_notify - second half of split virtqueue_kick call.
  482. * @vq: the struct virtqueue
  483. *
  484. * This does not need to be serialized.
  485. *
  486. * Returns false if host notify failed or queue is broken, otherwise true.
  487. */
  488. bool virtqueue_notify(struct virtqueue *_vq)
  489. {
  490. struct vring_virtqueue *vq = to_vvq(_vq);
  491. if (unlikely(vq->broken))
  492. return false;
  493. /* Prod other side to tell it about changes. */
  494. if (!vq->notify(_vq)) {
  495. vq->broken = true;
  496. return false;
  497. }
  498. return true;
  499. }
  500. EXPORT_SYMBOL_GPL(virtqueue_notify);
  501. /**
  502. * virtqueue_kick - update after add_buf
  503. * @vq: the struct virtqueue
  504. *
  505. * After one or more virtqueue_add_* calls, invoke this to kick
  506. * the other side.
  507. *
  508. * Caller must ensure we don't call this with other virtqueue
  509. * operations at the same time (except where noted).
  510. *
  511. * Returns false if kick failed, otherwise true.
  512. */
  513. bool virtqueue_kick(struct virtqueue *vq)
  514. {
  515. if (virtqueue_kick_prepare(vq))
  516. return virtqueue_notify(vq);
  517. return true;
  518. }
  519. EXPORT_SYMBOL_GPL(virtqueue_kick);
  520. static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
  521. {
  522. unsigned int i, j;
  523. u16 nextflag = cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_NEXT);
  524. /* Clear data ptr. */
  525. vq->desc_state[head].data = NULL;
  526. /* Put back on free list: unmap first-level descriptors and find end */
  527. i = head;
  528. while (vq->vring.desc[i].flags & nextflag) {
  529. vring_unmap_one(vq, &vq->vring.desc[i]);
  530. i = virtio16_to_cpu(vq->vq.vdev, vq->vring.desc[i].next);
  531. vq->vq.num_free++;
  532. }
  533. vring_unmap_one(vq, &vq->vring.desc[i]);
  534. vq->vring.desc[i].next = cpu_to_virtio16(vq->vq.vdev, vq->free_head);
  535. vq->free_head = head;
  536. /* Plus final descriptor */
  537. vq->vq.num_free++;
  538. /* Free the indirect table, if any, now that it's unmapped. */
  539. if (vq->desc_state[head].indir_desc) {
  540. struct vring_desc *indir_desc = vq->desc_state[head].indir_desc;
  541. u32 len = virtio32_to_cpu(vq->vq.vdev, vq->vring.desc[head].len);
  542. BUG_ON(!(vq->vring.desc[head].flags &
  543. cpu_to_virtio16(vq->vq.vdev, VRING_DESC_F_INDIRECT)));
  544. BUG_ON(len == 0 || len % sizeof(struct vring_desc));
  545. for (j = 0; j < len / sizeof(struct vring_desc); j++)
  546. vring_unmap_one(vq, &indir_desc[j]);
  547. kfree(vq->desc_state[head].indir_desc);
  548. vq->desc_state[head].indir_desc = NULL;
  549. }
  550. }
  551. static inline bool more_used(const struct vring_virtqueue *vq)
  552. {
  553. return vq->last_used_idx != virtio16_to_cpu(vq->vq.vdev, vq->vring.used->idx);
  554. }
  555. /**
  556. * virtqueue_get_buf - get the next used buffer
  557. * @vq: the struct virtqueue we're talking about.
  558. * @len: the length written into the buffer
  559. *
  560. * If the driver wrote data into the buffer, @len will be set to the
  561. * amount written. This means you don't need to clear the buffer
  562. * beforehand to ensure there's no data leakage in the case of short
  563. * writes.
  564. *
  565. * Caller must ensure we don't call this with other virtqueue
  566. * operations at the same time (except where noted).
  567. *
  568. * Returns NULL if there are no used buffers, or the "data" token
  569. * handed to virtqueue_add_*().
  570. */
  571. void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
  572. {
  573. struct vring_virtqueue *vq = to_vvq(_vq);
  574. void *ret;
  575. unsigned int i;
  576. u16 last_used;
  577. START_USE(vq);
  578. if (unlikely(vq->broken)) {
  579. END_USE(vq);
  580. return NULL;
  581. }
  582. if (!more_used(vq)) {
  583. pr_debug("No more buffers in queue\n");
  584. END_USE(vq);
  585. return NULL;
  586. }
  587. /* Only get used array entries after they have been exposed by host. */
  588. virtio_rmb(vq->weak_barriers);
  589. last_used = (vq->last_used_idx & (vq->vring.num - 1));
  590. i = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].id);
  591. *len = virtio32_to_cpu(_vq->vdev, vq->vring.used->ring[last_used].len);
  592. if (unlikely(i >= vq->vring.num)) {
  593. BAD_RING(vq, "id %u out of range\n", i);
  594. return NULL;
  595. }
  596. if (unlikely(!vq->desc_state[i].data)) {
  597. BAD_RING(vq, "id %u is not a head!\n", i);
  598. return NULL;
  599. }
  600. /* detach_buf clears data, so grab it now. */
  601. ret = vq->desc_state[i].data;
  602. detach_buf(vq, i);
  603. vq->last_used_idx++;
  604. /* If we expect an interrupt for the next entry, tell host
  605. * by writing event index and flush out the write before
  606. * the read in the next get_buf call. */
  607. if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT))
  608. virtio_store_mb(vq->weak_barriers,
  609. &vring_used_event(&vq->vring),
  610. cpu_to_virtio16(_vq->vdev, vq->last_used_idx));
  611. #ifdef DEBUG
  612. vq->last_add_time_valid = false;
  613. #endif
  614. END_USE(vq);
  615. return ret;
  616. }
  617. EXPORT_SYMBOL_GPL(virtqueue_get_buf);
  618. /**
  619. * virtqueue_disable_cb - disable callbacks
  620. * @vq: the struct virtqueue we're talking about.
  621. *
  622. * Note that this is not necessarily synchronous, hence unreliable and only
  623. * useful as an optimization.
  624. *
  625. * Unlike other operations, this need not be serialized.
  626. */
  627. void virtqueue_disable_cb(struct virtqueue *_vq)
  628. {
  629. struct vring_virtqueue *vq = to_vvq(_vq);
  630. if (!(vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT)) {
  631. vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
  632. vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
  633. }
  634. }
  635. EXPORT_SYMBOL_GPL(virtqueue_disable_cb);
  636. /**
  637. * virtqueue_enable_cb_prepare - restart callbacks after disable_cb
  638. * @vq: the struct virtqueue we're talking about.
  639. *
  640. * This re-enables callbacks; it returns current queue state
  641. * in an opaque unsigned value. This value should be later tested by
  642. * virtqueue_poll, to detect a possible race between the driver checking for
  643. * more work, and enabling callbacks.
  644. *
  645. * Caller must ensure we don't call this with other virtqueue
  646. * operations at the same time (except where noted).
  647. */
  648. unsigned virtqueue_enable_cb_prepare(struct virtqueue *_vq)
  649. {
  650. struct vring_virtqueue *vq = to_vvq(_vq);
  651. u16 last_used_idx;
  652. START_USE(vq);
  653. /* We optimistically turn back on interrupts, then check if there was
  654. * more to do. */
  655. /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
  656. * either clear the flags bit or point the event index at the next
  657. * entry. Always do both to keep code simple. */
  658. if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
  659. vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
  660. vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
  661. }
  662. vring_used_event(&vq->vring) = cpu_to_virtio16(_vq->vdev, last_used_idx = vq->last_used_idx);
  663. END_USE(vq);
  664. return last_used_idx;
  665. }
  666. EXPORT_SYMBOL_GPL(virtqueue_enable_cb_prepare);
  667. /**
  668. * virtqueue_poll - query pending used buffers
  669. * @vq: the struct virtqueue we're talking about.
  670. * @last_used_idx: virtqueue state (from call to virtqueue_enable_cb_prepare).
  671. *
  672. * Returns "true" if there are pending used buffers in the queue.
  673. *
  674. * This does not need to be serialized.
  675. */
  676. bool virtqueue_poll(struct virtqueue *_vq, unsigned last_used_idx)
  677. {
  678. struct vring_virtqueue *vq = to_vvq(_vq);
  679. virtio_mb(vq->weak_barriers);
  680. return (u16)last_used_idx != virtio16_to_cpu(_vq->vdev, vq->vring.used->idx);
  681. }
  682. EXPORT_SYMBOL_GPL(virtqueue_poll);
  683. /**
  684. * virtqueue_enable_cb - restart callbacks after disable_cb.
  685. * @vq: the struct virtqueue we're talking about.
  686. *
  687. * This re-enables callbacks; it returns "false" if there are pending
  688. * buffers in the queue, to detect a possible race between the driver
  689. * checking for more work, and enabling callbacks.
  690. *
  691. * Caller must ensure we don't call this with other virtqueue
  692. * operations at the same time (except where noted).
  693. */
  694. bool virtqueue_enable_cb(struct virtqueue *_vq)
  695. {
  696. unsigned last_used_idx = virtqueue_enable_cb_prepare(_vq);
  697. return !virtqueue_poll(_vq, last_used_idx);
  698. }
  699. EXPORT_SYMBOL_GPL(virtqueue_enable_cb);
  700. /**
  701. * virtqueue_enable_cb_delayed - restart callbacks after disable_cb.
  702. * @vq: the struct virtqueue we're talking about.
  703. *
  704. * This re-enables callbacks but hints to the other side to delay
  705. * interrupts until most of the available buffers have been processed;
  706. * it returns "false" if there are many pending buffers in the queue,
  707. * to detect a possible race between the driver checking for more work,
  708. * and enabling callbacks.
  709. *
  710. * Caller must ensure we don't call this with other virtqueue
  711. * operations at the same time (except where noted).
  712. */
  713. bool virtqueue_enable_cb_delayed(struct virtqueue *_vq)
  714. {
  715. struct vring_virtqueue *vq = to_vvq(_vq);
  716. u16 bufs;
  717. START_USE(vq);
  718. /* We optimistically turn back on interrupts, then check if there was
  719. * more to do. */
  720. /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to
  721. * either clear the flags bit or point the event index at the next
  722. * entry. Always do both to keep code simple. */
  723. if (vq->avail_flags_shadow & VRING_AVAIL_F_NO_INTERRUPT) {
  724. vq->avail_flags_shadow &= ~VRING_AVAIL_F_NO_INTERRUPT;
  725. vq->vring.avail->flags = cpu_to_virtio16(_vq->vdev, vq->avail_flags_shadow);
  726. }
  727. /* TODO: tune this threshold */
  728. bufs = (u16)(vq->avail_idx_shadow - vq->last_used_idx) * 3 / 4;
  729. virtio_store_mb(vq->weak_barriers,
  730. &vring_used_event(&vq->vring),
  731. cpu_to_virtio16(_vq->vdev, vq->last_used_idx + bufs));
  732. if (unlikely((u16)(virtio16_to_cpu(_vq->vdev, vq->vring.used->idx) - vq->last_used_idx) > bufs)) {
  733. END_USE(vq);
  734. return false;
  735. }
  736. END_USE(vq);
  737. return true;
  738. }
  739. EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed);
  740. /**
  741. * virtqueue_detach_unused_buf - detach first unused buffer
  742. * @vq: the struct virtqueue we're talking about.
  743. *
  744. * Returns NULL or the "data" token handed to virtqueue_add_*().
  745. * This is not valid on an active queue; it is useful only for device
  746. * shutdown.
  747. */
  748. void *virtqueue_detach_unused_buf(struct virtqueue *_vq)
  749. {
  750. struct vring_virtqueue *vq = to_vvq(_vq);
  751. unsigned int i;
  752. void *buf;
  753. START_USE(vq);
  754. for (i = 0; i < vq->vring.num; i++) {
  755. if (!vq->desc_state[i].data)
  756. continue;
  757. /* detach_buf clears data, so grab it now. */
  758. buf = vq->desc_state[i].data;
  759. detach_buf(vq, i);
  760. vq->avail_idx_shadow--;
  761. vq->vring.avail->idx = cpu_to_virtio16(_vq->vdev, vq->avail_idx_shadow);
  762. END_USE(vq);
  763. return buf;
  764. }
  765. /* That should have freed everything. */
  766. BUG_ON(vq->vq.num_free != vq->vring.num);
  767. END_USE(vq);
  768. return NULL;
  769. }
  770. EXPORT_SYMBOL_GPL(virtqueue_detach_unused_buf);
  771. irqreturn_t vring_interrupt(int irq, void *_vq)
  772. {
  773. struct vring_virtqueue *vq = to_vvq(_vq);
  774. if (!more_used(vq)) {
  775. pr_debug("virtqueue interrupt with no work for %p\n", vq);
  776. return IRQ_NONE;
  777. }
  778. if (unlikely(vq->broken))
  779. return IRQ_HANDLED;
  780. pr_debug("virtqueue callback for %p (%p)\n", vq, vq->vq.callback);
  781. if (vq->vq.callback)
  782. vq->vq.callback(&vq->vq);
  783. return IRQ_HANDLED;
  784. }
  785. EXPORT_SYMBOL_GPL(vring_interrupt);
  786. struct virtqueue *__vring_new_virtqueue(unsigned int index,
  787. struct vring vring,
  788. struct virtio_device *vdev,
  789. bool weak_barriers,
  790. bool (*notify)(struct virtqueue *),
  791. void (*callback)(struct virtqueue *),
  792. const char *name)
  793. {
  794. unsigned int i;
  795. struct vring_virtqueue *vq;
  796. vq = kmalloc(sizeof(*vq) + vring.num * sizeof(struct vring_desc_state),
  797. GFP_KERNEL);
  798. if (!vq)
  799. return NULL;
  800. vq->vring = vring;
  801. vq->vq.callback = callback;
  802. vq->vq.vdev = vdev;
  803. vq->vq.name = name;
  804. vq->vq.num_free = vring.num;
  805. vq->vq.index = index;
  806. vq->we_own_ring = false;
  807. vq->queue_dma_addr = 0;
  808. vq->queue_size_in_bytes = 0;
  809. vq->notify = notify;
  810. vq->weak_barriers = weak_barriers;
  811. vq->broken = false;
  812. vq->last_used_idx = 0;
  813. vq->avail_flags_shadow = 0;
  814. vq->avail_idx_shadow = 0;
  815. vq->num_added = 0;
  816. list_add_tail(&vq->vq.list, &vdev->vqs);
  817. #ifdef DEBUG
  818. vq->in_use = false;
  819. vq->last_add_time_valid = false;
  820. #endif
  821. vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC);
  822. vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);
  823. /* No callback? Tell other side not to bother us. */
  824. if (!callback) {
  825. vq->avail_flags_shadow |= VRING_AVAIL_F_NO_INTERRUPT;
  826. vq->vring.avail->flags = cpu_to_virtio16(vdev, vq->avail_flags_shadow);
  827. }
  828. /* Put everything in free lists. */
  829. vq->free_head = 0;
  830. for (i = 0; i < vring.num-1; i++)
  831. vq->vring.desc[i].next = cpu_to_virtio16(vdev, i + 1);
  832. memset(vq->desc_state, 0, vring.num * sizeof(struct vring_desc_state));
  833. return &vq->vq;
  834. }
  835. EXPORT_SYMBOL_GPL(__vring_new_virtqueue);
  836. static void *vring_alloc_queue(struct virtio_device *vdev, size_t size,
  837. dma_addr_t *dma_handle, gfp_t flag)
  838. {
  839. if (vring_use_dma_api(vdev)) {
  840. return dma_alloc_coherent(vdev->dev.parent, size,
  841. dma_handle, flag);
  842. } else {
  843. void *queue = alloc_pages_exact(PAGE_ALIGN(size), flag);
  844. if (queue) {
  845. phys_addr_t phys_addr = virt_to_phys(queue);
  846. *dma_handle = (dma_addr_t)phys_addr;
  847. /*
  848. * Sanity check: make sure we dind't truncate
  849. * the address. The only arches I can find that
  850. * have 64-bit phys_addr_t but 32-bit dma_addr_t
  851. * are certain non-highmem MIPS and x86
  852. * configurations, but these configurations
  853. * should never allocate physical pages above 32
  854. * bits, so this is fine. Just in case, throw a
  855. * warning and abort if we end up with an
  856. * unrepresentable address.
  857. */
  858. if (WARN_ON_ONCE(*dma_handle != phys_addr)) {
  859. free_pages_exact(queue, PAGE_ALIGN(size));
  860. return NULL;
  861. }
  862. }
  863. return queue;
  864. }
  865. }
  866. static void vring_free_queue(struct virtio_device *vdev, size_t size,
  867. void *queue, dma_addr_t dma_handle)
  868. {
  869. if (vring_use_dma_api(vdev)) {
  870. dma_free_coherent(vdev->dev.parent, size, queue, dma_handle);
  871. } else {
  872. free_pages_exact(queue, PAGE_ALIGN(size));
  873. }
  874. }
  875. struct virtqueue *vring_create_virtqueue(
  876. unsigned int index,
  877. unsigned int num,
  878. unsigned int vring_align,
  879. struct virtio_device *vdev,
  880. bool weak_barriers,
  881. bool may_reduce_num,
  882. bool (*notify)(struct virtqueue *),
  883. void (*callback)(struct virtqueue *),
  884. const char *name)
  885. {
  886. struct virtqueue *vq;
  887. void *queue = NULL;
  888. dma_addr_t dma_addr;
  889. size_t queue_size_in_bytes;
  890. struct vring vring;
  891. /* We assume num is a power of 2. */
  892. if (num & (num - 1)) {
  893. dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
  894. return NULL;
  895. }
  896. /* TODO: allocate each queue chunk individually */
  897. for (; num && vring_size(num, vring_align) > PAGE_SIZE; num /= 2) {
  898. queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
  899. &dma_addr,
  900. GFP_KERNEL|__GFP_NOWARN|__GFP_ZERO);
  901. if (queue)
  902. break;
  903. }
  904. if (!num)
  905. return NULL;
  906. if (!queue) {
  907. /* Try to get a single page. You are my only hope! */
  908. queue = vring_alloc_queue(vdev, vring_size(num, vring_align),
  909. &dma_addr, GFP_KERNEL|__GFP_ZERO);
  910. }
  911. if (!queue)
  912. return NULL;
  913. queue_size_in_bytes = vring_size(num, vring_align);
  914. vring_init(&vring, num, queue, vring_align);
  915. vq = __vring_new_virtqueue(index, vring, vdev, weak_barriers,
  916. notify, callback, name);
  917. if (!vq) {
  918. vring_free_queue(vdev, queue_size_in_bytes, queue,
  919. dma_addr);
  920. return NULL;
  921. }
  922. to_vvq(vq)->queue_dma_addr = dma_addr;
  923. to_vvq(vq)->queue_size_in_bytes = queue_size_in_bytes;
  924. to_vvq(vq)->we_own_ring = true;
  925. return vq;
  926. }
  927. EXPORT_SYMBOL_GPL(vring_create_virtqueue);
  928. struct virtqueue *vring_new_virtqueue(unsigned int index,
  929. unsigned int num,
  930. unsigned int vring_align,
  931. struct virtio_device *vdev,
  932. bool weak_barriers,
  933. void *pages,
  934. bool (*notify)(struct virtqueue *vq),
  935. void (*callback)(struct virtqueue *vq),
  936. const char *name)
  937. {
  938. struct vring vring;
  939. vring_init(&vring, num, pages, vring_align);
  940. return __vring_new_virtqueue(index, vring, vdev, weak_barriers,
  941. notify, callback, name);
  942. }
  943. EXPORT_SYMBOL_GPL(vring_new_virtqueue);
  944. void vring_del_virtqueue(struct virtqueue *_vq)
  945. {
  946. struct vring_virtqueue *vq = to_vvq(_vq);
  947. if (vq->we_own_ring) {
  948. vring_free_queue(vq->vq.vdev, vq->queue_size_in_bytes,
  949. vq->vring.desc, vq->queue_dma_addr);
  950. }
  951. list_del(&_vq->list);
  952. kfree(vq);
  953. }
  954. EXPORT_SYMBOL_GPL(vring_del_virtqueue);
  955. /* Manipulates transport-specific feature bits. */
  956. void vring_transport_features(struct virtio_device *vdev)
  957. {
  958. unsigned int i;
  959. for (i = VIRTIO_TRANSPORT_F_START; i < VIRTIO_TRANSPORT_F_END; i++) {
  960. switch (i) {
  961. case VIRTIO_RING_F_INDIRECT_DESC:
  962. break;
  963. case VIRTIO_RING_F_EVENT_IDX:
  964. break;
  965. case VIRTIO_F_VERSION_1:
  966. break;
  967. case VIRTIO_F_IOMMU_PLATFORM:
  968. break;
  969. default:
  970. /* We don't understand this bit. */
  971. __virtio_clear_bit(vdev, i);
  972. }
  973. }
  974. }
  975. EXPORT_SYMBOL_GPL(vring_transport_features);
  976. /**
  977. * virtqueue_get_vring_size - return the size of the virtqueue's vring
  978. * @vq: the struct virtqueue containing the vring of interest.
  979. *
  980. * Returns the size of the vring. This is mainly used for boasting to
  981. * userspace. Unlike other operations, this need not be serialized.
  982. */
  983. unsigned int virtqueue_get_vring_size(struct virtqueue *_vq)
  984. {
  985. struct vring_virtqueue *vq = to_vvq(_vq);
  986. return vq->vring.num;
  987. }
  988. EXPORT_SYMBOL_GPL(virtqueue_get_vring_size);
  989. bool virtqueue_is_broken(struct virtqueue *_vq)
  990. {
  991. struct vring_virtqueue *vq = to_vvq(_vq);
  992. return vq->broken;
  993. }
  994. EXPORT_SYMBOL_GPL(virtqueue_is_broken);
  995. /*
  996. * This should prevent the device from being used, allowing drivers to
  997. * recover. You may need to grab appropriate locks to flush.
  998. */
  999. void virtio_break_device(struct virtio_device *dev)
  1000. {
  1001. struct virtqueue *_vq;
  1002. list_for_each_entry(_vq, &dev->vqs, list) {
  1003. struct vring_virtqueue *vq = to_vvq(_vq);
  1004. vq->broken = true;
  1005. }
  1006. }
  1007. EXPORT_SYMBOL_GPL(virtio_break_device);
  1008. dma_addr_t virtqueue_get_desc_addr(struct virtqueue *_vq)
  1009. {
  1010. struct vring_virtqueue *vq = to_vvq(_vq);
  1011. BUG_ON(!vq->we_own_ring);
  1012. return vq->queue_dma_addr;
  1013. }
  1014. EXPORT_SYMBOL_GPL(virtqueue_get_desc_addr);
  1015. dma_addr_t virtqueue_get_avail_addr(struct virtqueue *_vq)
  1016. {
  1017. struct vring_virtqueue *vq = to_vvq(_vq);
  1018. BUG_ON(!vq->we_own_ring);
  1019. return vq->queue_dma_addr +
  1020. ((char *)vq->vring.avail - (char *)vq->vring.desc);
  1021. }
  1022. EXPORT_SYMBOL_GPL(virtqueue_get_avail_addr);
  1023. dma_addr_t virtqueue_get_used_addr(struct virtqueue *_vq)
  1024. {
  1025. struct vring_virtqueue *vq = to_vvq(_vq);
  1026. BUG_ON(!vq->we_own_ring);
  1027. return vq->queue_dma_addr +
  1028. ((char *)vq->vring.used - (char *)vq->vring.desc);
  1029. }
  1030. EXPORT_SYMBOL_GPL(virtqueue_get_used_addr);
  1031. const struct vring *virtqueue_get_vring(struct virtqueue *vq)
  1032. {
  1033. return &to_vvq(vq)->vring;
  1034. }
  1035. EXPORT_SYMBOL_GPL(virtqueue_get_vring);
  1036. MODULE_LICENSE("GPL");