zbud.c 18 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622
  1. /*
  2. * zbud.c
  3. *
  4. * Copyright (C) 2013, Seth Jennings, IBM
  5. *
  6. * Concepts based on zcache internal zbud allocator by Dan Magenheimer.
  7. *
  8. * zbud is a special purpose allocator for storing compressed pages. Contrary
  9. * to what its name may suggest, zbud is not a buddy allocator, but rather an
  10. * allocator that "buddies" two compressed pages together in a single memory
  11. * page.
  12. *
  13. * While this design limits storage density, it has simple and deterministic
  14. * reclaim properties that make it preferable to a higher density approach when
  15. * reclaim will be used.
  16. *
  17. * zbud works by storing compressed pages, or "zpages", together in pairs in a
  18. * single memory page called a "zbud page". The first buddy is "left
  19. * justified" at the beginning of the zbud page, and the last buddy is "right
  20. * justified" at the end of the zbud page. The benefit is that if either
  21. * buddy is freed, the freed buddy space, coalesced with whatever slack space
  22. * that existed between the buddies, results in the largest possible free region
  23. * within the zbud page.
  24. *
  25. * zbud also provides an attractive lower bound on density. The ratio of zpages
  26. * to zbud pages can not be less than 1. This ensures that zbud can never "do
  27. * harm" by using more pages to store zpages than the uncompressed zpages would
  28. * have used on their own.
  29. *
  30. * zbud pages are divided into "chunks". The size of the chunks is fixed at
  31. * compile time and determined by NCHUNKS_ORDER below. Dividing zbud pages
  32. * into chunks allows organizing unbuddied zbud pages into a manageable number
  33. * of unbuddied lists according to the number of free chunks available in the
  34. * zbud page.
  35. *
  36. * The zbud API differs from that of conventional allocators in that the
  37. * allocation function, zbud_alloc(), returns an opaque handle to the user,
  38. * not a dereferenceable pointer. The user must map the handle using
  39. * zbud_map() in order to get a usable pointer by which to access the
  40. * allocation data and unmap the handle with zbud_unmap() when operations
  41. * on the allocation data are complete.
  42. */
  43. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  44. #include <linux/atomic.h>
  45. #include <linux/list.h>
  46. #include <linux/mm.h>
  47. #include <linux/module.h>
  48. #include <linux/preempt.h>
  49. #include <linux/slab.h>
  50. #include <linux/spinlock.h>
  51. #include <linux/zbud.h>
  52. #include <linux/zpool.h>
  53. /*****************
  54. * Structures
  55. *****************/
  /*
   * Chunk geometry.
   *
   * Every zbud page is split into NCHUNKS equally sized chunks; buddy sizes
   * are tracked in whole chunks. NCHUNKS_ORDER therefore sets both the
   * allocation granularity (and with it the internal fragmentation) and the
   * number of unbuddied lists kept per pool. With NCHUNKS_ORDER == 6 a chunk
   * is PAGE_SIZE/64 bytes and each pool has 64 unbuddied lists.
   */
  #define NCHUNKS_ORDER       6
  /* log2 of the chunk size in bytes */
  #define CHUNK_SHIFT         (PAGE_SHIFT - NCHUNKS_ORDER)
  /* chunk size in bytes */
  #define CHUNK_SIZE          (1 << CHUNK_SHIFT)
  /* number of chunks in one zbud page */
  #define NCHUNKS             (PAGE_SIZE >> CHUNK_SHIFT)
  /* space reserved at the start of each zbud page for the header: one chunk */
  #define ZHDR_SIZE_ALIGNED   CHUNK_SIZE
  68. /**
  69. * struct zbud_pool - stores metadata for each zbud pool
  70. * @lock: protects all pool fields and first|last_chunk fields of any
  71. * zbud page in the pool
  72. * @unbuddied: array of lists tracking zbud pages that only contain one buddy;
  73. * the lists each zbud page is added to depends on the size of
  74. * its free region.
  75. * @buddied: list tracking the zbud pages that contain two buddies;
  76. * these zbud pages are full
  77. * @lru: list tracking the zbud pages in LRU order by most recently
  78. * added buddy.
  79. * @pages_nr: number of zbud pages in the pool.
  80. * @ops: pointer to a structure of user defined operations specified at
  81. * pool creation time.
  82. *
  83. * This structure is allocated at pool creation time and maintains metadata
  84. * pertaining to a particular zbud pool.
  85. */
  86. struct zbud_pool {
  87. spinlock_t lock;
  88. struct list_head unbuddied[NCHUNKS];
  89. struct list_head buddied;
  90. struct list_head lru;
  91. u64 pages_nr;
  92. struct zbud_ops *ops;
  93. };
  94. /*
  95. * struct zbud_header - zbud page metadata occupying the first chunk of each
  96. * zbud page.
  97. * @buddy: links the zbud page into the unbuddied/buddied lists in the pool
  98. * @lru: links the zbud page into the lru list in the pool
  99. * @first_chunks: the size of the first buddy in chunks, 0 if free
  100. * @last_chunks: the size of the last buddy in chunks, 0 if free
  101. */
  102. struct zbud_header {
  103. struct list_head buddy;
  104. struct list_head lru;
  105. unsigned int first_chunks;
  106. unsigned int last_chunks;
  107. bool under_reclaim;
  108. };
  109. /*****************
  110. * zpool
  111. ****************/
  112. #ifdef CONFIG_ZPOOL
  /*
   * Eviction callback used by zbud pools created through zpool: forwards the
   * request to zpool_evict() and returns its result unchanged.
   */
  static int zbud_zpool_evict(struct zbud_pool *pool, unsigned long handle)
  {
      int err = zpool_evict(pool, handle);

      return err;
  }
  117. static struct zbud_ops zbud_zpool_ops = {
  118. .evict = zbud_zpool_evict
  119. };
  120. static void *zbud_zpool_create(gfp_t gfp, struct zpool_ops *zpool_ops)
  121. {
  122. return zbud_create_pool(gfp, &zbud_zpool_ops);
  123. }
  /* zpool ->destroy hook: tears the pool down via zbud_destroy_pool() */
  static void zbud_zpool_destroy(void *pool)
  {
      struct zbud_pool *zpool_pool = pool;

      zbud_destroy_pool(zpool_pool);
  }
  128. static int zbud_zpool_malloc(void *pool, size_t size, gfp_t gfp,
  129. unsigned long *handle)
  130. {
  131. return zbud_alloc(pool, size, gfp, handle);
  132. }
  /* zpool ->free hook: releases the allocation behind @handle via zbud_free() */
  static void zbud_zpool_free(void *pool, unsigned long handle)
  {
      struct zbud_pool *zpool_pool = pool;

      zbud_free(zpool_pool, handle);
  }
  137. static int zbud_zpool_shrink(void *pool, unsigned int pages,
  138. unsigned int *reclaimed)
  139. {
  140. unsigned int total = 0;
  141. int ret = -EINVAL;
  142. while (total < pages) {
  143. ret = zbud_reclaim_page(pool, 8);
  144. if (ret < 0)
  145. break;
  146. total++;
  147. }
  148. if (reclaimed)
  149. *reclaimed = total;
  150. return ret;
  151. }
  152. static void *zbud_zpool_map(void *pool, unsigned long handle,
  153. enum zpool_mapmode mm)
  154. {
  155. return zbud_map(pool, handle);
  156. }
  /* zpool ->unmap hook: hands @handle to zbud_unmap() */
  static void zbud_zpool_unmap(void *pool, unsigned long handle)
  {
      struct zbud_pool *zpool_pool = pool;

      zbud_unmap(zpool_pool, handle);
  }
  161. static u64 zbud_zpool_total_size(void *pool)
  162. {
  163. return zbud_get_pool_size(pool) * PAGE_SIZE;
  164. }
  165. static struct zpool_driver zbud_zpool_driver = {
  166. .type = "zbud",
  167. .owner = THIS_MODULE,
  168. .create = zbud_zpool_create,
  169. .destroy = zbud_zpool_destroy,
  170. .malloc = zbud_zpool_malloc,
  171. .free = zbud_zpool_free,
  172. .shrink = zbud_zpool_shrink,
  173. .map = zbud_zpool_map,
  174. .unmap = zbud_zpool_unmap,
  175. .total_size = zbud_zpool_total_size,
  176. };
  177. MODULE_ALIAS("zpool-zbud");
  178. #endif /* CONFIG_ZPOOL */
  179. /*****************
  180. * Helpers
  181. *****************/
  /* Names the two allocation slots of a zbud page; purely for readability */
  enum buddy {
      FIRST,  /* slot placed right after the zbud header */
      LAST    /* slot that ends at the end of the page */
  };
  187. /* Converts an allocation size in bytes to size in zbud chunks */
  188. static int size_to_chunks(size_t size)
  189. {
  190. return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
  191. }
  /*
   * Walks the unbuddied list indices from @_begin up to NCHUNKS - 1, storing
   * the current index in @_iter.
   */
  #define for_each_unbuddied_list(_iter, _begin) \
      for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)
  194. /* Initializes the zbud header of a newly allocated zbud page */
  195. static struct zbud_header *init_zbud_page(struct page *page)
  196. {
  197. struct zbud_header *zhdr = page_address(page);
  198. zhdr->first_chunks = 0;
  199. zhdr->last_chunks = 0;
  200. INIT_LIST_HEAD(&zhdr->buddy);
  201. INIT_LIST_HEAD(&zhdr->lru);
  202. zhdr->under_reclaim = 0;
  203. return zhdr;
  204. }
  /* Returns the page that holds @zhdr to the page allocator */
  static void free_zbud_page(struct zbud_header *zhdr)
  {
      struct page *page = virt_to_page(zhdr);

      __free_page(page);
  }
  210. /*
  211. * Encodes the handle of a particular buddy within a zbud page
  212. * Pool lock should be held as this function accesses first|last_chunks
  213. */
  214. static unsigned long encode_handle(struct zbud_header *zhdr, enum buddy bud)
  215. {
  216. unsigned long handle;
  217. /*
  218. * For now, the encoded handle is actually just the pointer to the data
  219. * but this might not always be the case. A little information hiding.
  220. * Add CHUNK_SIZE to the handle if it is the first allocation to jump
  221. * over the zbud header in the first chunk.
  222. */
  223. handle = (unsigned long)zhdr;
  224. if (bud == FIRST)
  225. /* skip over zbud header */
  226. handle += ZHDR_SIZE_ALIGNED;
  227. else /* bud == LAST */
  228. handle += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
  229. return handle;
  230. }
  231. /* Returns the zbud page where a given handle is stored */
  232. static struct zbud_header *handle_to_zbud_header(unsigned long handle)
  233. {
  234. return (struct zbud_header *)(handle & PAGE_MASK);
  235. }
  236. /* Returns the number of free chunks in a zbud page */
  237. static int num_free_chunks(struct zbud_header *zhdr)
  238. {
  239. /*
  240. * Rather than branch for different situations, just use the fact that
  241. * free buddies have a length of zero to simplify everything. -1 at the
  242. * end for the zbud header.
  243. */
  244. return NCHUNKS - zhdr->first_chunks - zhdr->last_chunks - 1;
  245. }
  246. /*****************
  247. * API Functions
  248. *****************/
  249. /**
  250. * zbud_create_pool() - create a new zbud pool
  251. * @gfp: gfp flags when allocating the zbud pool structure
  252. * @ops: user-defined operations for the zbud pool
  253. *
  254. * Return: pointer to the new zbud pool or NULL if the metadata allocation
  255. * failed.
  256. */
  257. struct zbud_pool *zbud_create_pool(gfp_t gfp, struct zbud_ops *ops)
  258. {
  259. struct zbud_pool *pool;
  260. int i;
  261. pool = kmalloc(sizeof(struct zbud_pool), gfp);
  262. if (!pool)
  263. return NULL;
  264. spin_lock_init(&pool->lock);
  265. for_each_unbuddied_list(i, 0)
  266. INIT_LIST_HEAD(&pool->unbuddied[i]);
  267. INIT_LIST_HEAD(&pool->buddied);
  268. INIT_LIST_HEAD(&pool->lru);
  269. pool->pages_nr = 0;
  270. pool->ops = ops;
  271. return pool;
  272. }
  /**
   * zbud_destroy_pool() - destroys an existing zbud pool
   * @pool: the zbud pool to be destroyed
   *
   * Only the pool structure itself is released here, so the caller should
   * have emptied the pool beforehand.
   */
  void zbud_destroy_pool(struct zbud_pool *pool)
  {
      kfree(pool);
  }
  283. /**
  284. * zbud_alloc() - allocates a region of a given size
  285. * @pool: zbud pool from which to allocate
  286. * @size: size in bytes of the desired allocation
  287. * @gfp: gfp flags used if the pool needs to grow
  288. * @handle: handle of the new allocation
  289. *
  290. * This function will attempt to find a free region in the pool large enough to
  291. * satisfy the allocation request. A search of the unbuddied lists is
  292. * performed first. If no suitable free region is found, then a new page is
  293. * allocated and added to the pool to satisfy the request.
  294. *
  295. * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
  296. * as zbud pool pages.
  297. *
  298. * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
  299. * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
  300. * a new page.
  301. */
  302. int zbud_alloc(struct zbud_pool *pool, size_t size, gfp_t gfp,
  303. unsigned long *handle)
  304. {
  305. int chunks, i, freechunks;
  306. struct zbud_header *zhdr = NULL;
  307. enum buddy bud;
  308. struct page *page;
  309. if (!size || (gfp & __GFP_HIGHMEM))
  310. return -EINVAL;
  311. if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
  312. return -ENOSPC;
  313. chunks = size_to_chunks(size);
  314. spin_lock(&pool->lock);
  315. /* First, try to find an unbuddied zbud page. */
  316. zhdr = NULL;
  317. for_each_unbuddied_list(i, chunks) {
  318. if (!list_empty(&pool->unbuddied[i])) {
  319. zhdr = list_first_entry(&pool->unbuddied[i],
  320. struct zbud_header, buddy);
  321. list_del(&zhdr->buddy);
  322. if (zhdr->first_chunks == 0)
  323. bud = FIRST;
  324. else
  325. bud = LAST;
  326. goto found;
  327. }
  328. }
  329. /* Couldn't find unbuddied zbud page, create new one */
  330. spin_unlock(&pool->lock);
  331. page = alloc_page(gfp);
  332. if (!page)
  333. return -ENOMEM;
  334. spin_lock(&pool->lock);
  335. pool->pages_nr++;
  336. zhdr = init_zbud_page(page);
  337. bud = FIRST;
  338. found:
  339. if (bud == FIRST)
  340. zhdr->first_chunks = chunks;
  341. else
  342. zhdr->last_chunks = chunks;
  343. if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0) {
  344. /* Add to unbuddied list */
  345. freechunks = num_free_chunks(zhdr);
  346. list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
  347. } else {
  348. /* Add to buddied list */
  349. list_add(&zhdr->buddy, &pool->buddied);
  350. }
  351. /* Add/move zbud page to beginning of LRU */
  352. if (!list_empty(&zhdr->lru))
  353. list_del(&zhdr->lru);
  354. list_add(&zhdr->lru, &pool->lru);
  355. *handle = encode_handle(zhdr, bud);
  356. spin_unlock(&pool->lock);
  357. return 0;
  358. }
  359. /**
  360. * zbud_free() - frees the allocation associated with the given handle
  361. * @pool: pool in which the allocation resided
  362. * @handle: handle associated with the allocation returned by zbud_alloc()
  363. *
  364. * In the case that the zbud page in which the allocation resides is under
  365. * reclaim, as indicated by the PG_reclaim flag being set, this function
  366. * only sets the first|last_chunks to 0. The page is actually freed
  367. * once both buddies are evicted (see zbud_reclaim_page() below).
  368. */
  369. void zbud_free(struct zbud_pool *pool, unsigned long handle)
  370. {
  371. struct zbud_header *zhdr;
  372. int freechunks;
  373. spin_lock(&pool->lock);
  374. zhdr = handle_to_zbud_header(handle);
  375. /* If first buddy, handle will be page aligned */
  376. if ((handle - ZHDR_SIZE_ALIGNED) & ~PAGE_MASK)
  377. zhdr->last_chunks = 0;
  378. else
  379. zhdr->first_chunks = 0;
  380. if (zhdr->under_reclaim) {
  381. /* zbud page is under reclaim, reclaim will free */
  382. spin_unlock(&pool->lock);
  383. return;
  384. }
  385. /* Remove from existing buddy list */
  386. list_del(&zhdr->buddy);
  387. if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
  388. /* zbud page is empty, free */
  389. list_del(&zhdr->lru);
  390. free_zbud_page(zhdr);
  391. pool->pages_nr--;
  392. } else {
  393. /* Add to unbuddied list */
  394. freechunks = num_free_chunks(zhdr);
  395. list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
  396. }
  397. spin_unlock(&pool->lock);
  398. }
  /* Entry that contains the last node of the list headed by @ptr */
  #define list_tail_entry(ptr, type, member) \
      list_entry((ptr)->prev, type, member)
  401. /**
  402. * zbud_reclaim_page() - evicts allocations from a pool page and frees it
  403. * @pool: pool from which a page will attempt to be evicted
  404. * @retires: number of pages on the LRU list for which eviction will
  405. * be attempted before failing
  406. *
  407. * zbud reclaim is different from normal system reclaim in that the reclaim is
  408. * done from the bottom, up. This is because only the bottom layer, zbud, has
  409. * information on how the allocations are organized within each zbud page. This
  410. * has the potential to create interesting locking situations between zbud and
  411. * the user, however.
  412. *
  413. * To avoid these, this is how zbud_reclaim_page() should be called:
  414. * The user detects a page should be reclaimed and calls zbud_reclaim_page().
  415. * zbud_reclaim_page() will remove a zbud page from the pool LRU list and call
  416. * the user-defined eviction handler with the pool and handle as arguments.
  417. *
  418. * If the handle can not be evicted, the eviction handler should return
  419. * non-zero. zbud_reclaim_page() will add the zbud page back to the
  420. * appropriate list and try the next zbud page on the LRU up to
  421. * a user defined number of retries.
  422. *
  423. * If the handle is successfully evicted, the eviction handler should
  424. * return 0 _and_ should have called zbud_free() on the handle. zbud_free()
  425. * contains logic to delay freeing the page if the page is under reclaim,
  426. * as indicated by the setting of the PG_reclaim flag on the underlying page.
  427. *
  428. * If all buddies in the zbud page are successfully evicted, then the
  429. * zbud page can be freed.
  430. *
  431. * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
  432. * no pages to evict or an eviction handler is not registered, -EAGAIN if
  433. * the retry limit was hit.
  434. */
  435. int zbud_reclaim_page(struct zbud_pool *pool, unsigned int retries)
  436. {
  437. int i, ret, freechunks;
  438. struct zbud_header *zhdr;
  439. unsigned long first_handle = 0, last_handle = 0;
  440. spin_lock(&pool->lock);
  441. if (!pool->ops || !pool->ops->evict || list_empty(&pool->lru) ||
  442. retries == 0) {
  443. spin_unlock(&pool->lock);
  444. return -EINVAL;
  445. }
  446. for (i = 0; i < retries; i++) {
  447. zhdr = list_tail_entry(&pool->lru, struct zbud_header, lru);
  448. list_del(&zhdr->lru);
  449. list_del(&zhdr->buddy);
  450. /* Protect zbud page against free */
  451. zhdr->under_reclaim = true;
  452. /*
  453. * We need encode the handles before unlocking, since we can
  454. * race with free that will set (first|last)_chunks to 0
  455. */
  456. first_handle = 0;
  457. last_handle = 0;
  458. if (zhdr->first_chunks)
  459. first_handle = encode_handle(zhdr, FIRST);
  460. if (zhdr->last_chunks)
  461. last_handle = encode_handle(zhdr, LAST);
  462. spin_unlock(&pool->lock);
  463. /* Issue the eviction callback(s) */
  464. if (first_handle) {
  465. ret = pool->ops->evict(pool, first_handle);
  466. if (ret)
  467. goto next;
  468. }
  469. if (last_handle) {
  470. ret = pool->ops->evict(pool, last_handle);
  471. if (ret)
  472. goto next;
  473. }
  474. next:
  475. spin_lock(&pool->lock);
  476. zhdr->under_reclaim = false;
  477. if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
  478. /*
  479. * Both buddies are now free, free the zbud page and
  480. * return success.
  481. */
  482. free_zbud_page(zhdr);
  483. pool->pages_nr--;
  484. spin_unlock(&pool->lock);
  485. return 0;
  486. } else if (zhdr->first_chunks == 0 ||
  487. zhdr->last_chunks == 0) {
  488. /* add to unbuddied list */
  489. freechunks = num_free_chunks(zhdr);
  490. list_add(&zhdr->buddy, &pool->unbuddied[freechunks]);
  491. } else {
  492. /* add to buddied list */
  493. list_add(&zhdr->buddy, &pool->buddied);
  494. }
  495. /* add to beginning of LRU */
  496. list_add(&zhdr->lru, &pool->lru);
  497. }
  498. spin_unlock(&pool->lock);
  499. return -EAGAIN;
  500. }
  /**
   * zbud_map() - maps the allocation associated with the given handle
   * @pool:   pool in which the allocation resides
   * @handle: handle associated with the allocation to be mapped
   *
   * For zbud this is trivial because the handle is already the address of the
   * data. Other allocators implementing this allocation API could encode more
   * complex information in the handle and create temporary mappings to make
   * the data accessible to the user.
   *
   * Returns: a pointer to the mapped allocation
   */
  void *zbud_map(struct zbud_pool *pool, unsigned long handle)
  {
      void *addr = (void *)handle;

      return addr;
  }
  /**
   * zbud_unmap() - unmaps the allocation associated with the given handle
   * @pool:   pool in which the allocation resides
   * @handle: handle associated with the allocation to be unmapped
   *
   * Does nothing: zbud_map() only converts the handle to a pointer, so there
   * is no mapping to tear down.
   */
  void zbud_unmap(struct zbud_pool *pool, unsigned long handle)
  {
  }
  525. /**
  526. * zbud_get_pool_size() - gets the zbud pool size in pages
  527. * @pool: pool whose size is being queried
  528. *
  529. * Returns: size in pages of the given pool. The pool lock need not be
  530. * taken to access pages_nr.
  531. */
  532. u64 zbud_get_pool_size(struct zbud_pool *pool)
  533. {
  534. return pool->pages_nr;
  535. }
  536. static int __init init_zbud(void)
  537. {
  538. /* Make sure the zbud header will fit in one chunk */
  539. BUILD_BUG_ON(sizeof(struct zbud_header) > ZHDR_SIZE_ALIGNED);
  540. pr_info("loaded\n");
  541. #ifdef CONFIG_ZPOOL
  542. zpool_register_driver(&zbud_zpool_driver);
  543. #endif
  544. return 0;
  545. }
  546. static void __exit exit_zbud(void)
  547. {
  548. #ifdef CONFIG_ZPOOL
  549. zpool_unregister_driver(&zbud_zpool_driver);
  550. #endif
  551. pr_info("unloaded\n");
  552. }
  553. module_init(init_zbud);
  554. module_exit(exit_zbud);
  555. MODULE_LICENSE("GPL");
  556. MODULE_AUTHOR("Seth Jennings <sjenning@linux.vnet.ibm.com>");
  557. MODULE_DESCRIPTION("Buddy Allocator for Compressed Pages");