/*
 * z3fold.c
 *
 * Author: Vitaly Wool <vitaly.wool@konsulko.com>
 * Copyright (C) 2016, Sony Mobile Communications Inc.
 *
 * This implementation is based on zbud written by Seth Jennings.
 *
 * z3fold is a special purpose allocator for storing compressed pages. It
 * can store up to three compressed pages per page which improves the
 * compression ratio of zbud while retaining its main concepts (e.g. always
 * storing an integral number of objects per page) and simplicity.
 * It still has simple and deterministic reclaim properties that make it
 * preferable to a higher density approach (with no requirement on integral
 * number of objects per page) when reclaim is used.
 *
 * As in zbud, pages are divided into "chunks". The size of the chunks is
 * fixed at compile time and is determined by NCHUNKS_ORDER below.
 *
 * z3fold doesn't export any API and is meant to be used via zpool API.
 */

#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/atomic.h>
#include <linux/sched.h>
#include <linux/list.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/percpu.h>
#include <linux/preempt.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/zpool.h>

/*****************
 * Structures
 *****************/
struct z3fold_pool;
struct z3fold_ops {
	int (*evict)(struct z3fold_pool *pool, unsigned long handle);
};

enum buddy {
	HEADLESS = 0,
	FIRST,
	MIDDLE,
	LAST,
	BUDDIES_MAX
};

/*
 * struct z3fold_header - z3fold page metadata occupying first chunks of each
 *			z3fold page, except for HEADLESS pages
 * @buddy:		links the z3fold page into the relevant list in the
 *			pool
 * @page_lock:		per-page lock
 * @refcount:		reference count for the z3fold page
 * @work:		work_struct for page layout optimization
 * @pool:		pointer to the pool which this page belongs to
 * @cpu:		CPU which this page "belongs" to
 * @first_chunks:	the size of the first buddy in chunks, 0 if free
 * @middle_chunks:	the size of the middle buddy in chunks, 0 if free
 * @last_chunks:	the size of the last buddy in chunks, 0 if free
 * @start_middle:	index of the first chunk occupied by the middle buddy
 * @first_num:		the starting number (for the first handle)
 */
struct z3fold_header {
	struct list_head buddy;
	spinlock_t page_lock;
	struct kref refcount;
	struct work_struct work;
	struct z3fold_pool *pool;
	short cpu;
	unsigned short first_chunks;
	unsigned short middle_chunks;
	unsigned short last_chunks;
	unsigned short start_middle;
	unsigned short first_num:2;
};

/*
 * NCHUNKS_ORDER determines the internal allocation granularity, effectively
 * adjusting internal fragmentation. It also determines the number of
 * freelists maintained in each pool. NCHUNKS_ORDER of 6 means that the
 * allocation granularity will be in chunks of size PAGE_SIZE/64. Some chunks
 * in the beginning of an allocated page are occupied by the z3fold header, so
 * NCHUNKS will be calculated to 63 (or 62 in case CONFIG_DEBUG_SPINLOCK=y),
 * which is the maximum number of free chunks in a z3fold page; accordingly
 * there will be 63 (or 62) freelists per pool.
 */
#define NCHUNKS_ORDER	6

#define CHUNK_SHIFT	(PAGE_SHIFT - NCHUNKS_ORDER)
#define CHUNK_SIZE	(1 << CHUNK_SHIFT)
#define ZHDR_SIZE_ALIGNED round_up(sizeof(struct z3fold_header), CHUNK_SIZE)
#define ZHDR_CHUNKS	(ZHDR_SIZE_ALIGNED >> CHUNK_SHIFT)
#define TOTAL_CHUNKS	(PAGE_SIZE >> CHUNK_SHIFT)
#define NCHUNKS		((PAGE_SIZE - ZHDR_SIZE_ALIGNED) >> CHUNK_SHIFT)

#define BUDDY_MASK	(0x3)
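
/*
 * Illustrative example, assuming 4 KiB pages (PAGE_SHIFT == 12): with
 * NCHUNKS_ORDER == 6 we get CHUNK_SHIFT == 6, so CHUNK_SIZE == 64 bytes and
 * TOTAL_CHUNKS == 64. ZHDR_SIZE_ALIGNED is sizeof(struct z3fold_header)
 * rounded up to a multiple of CHUNK_SIZE, and NCHUNKS is whatever is left of
 * the page after the header, counted in chunks. A 100-byte allocation would
 * then occupy size_to_chunks(100) == (100 + 63) >> 6 == 2 chunks.
 */
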
/**
 * struct z3fold_pool - stores metadata for each z3fold pool
 * @name:	pool name
 * @lock:	protects pool unbuddied/lru lists
 * @stale_lock:	protects pool stale page list
 * @unbuddied:	per-cpu array of lists tracking z3fold pages that contain 2-
 *		buddies; the list each z3fold page is added to depends on
 *		the size of its free region.
 * @lru:	list tracking the z3fold pages in LRU order by most recently
 *		added buddy.
 * @stale:	list of pages marked for freeing
 * @pages_nr:	number of z3fold pages in the pool.
 * @ops:	pointer to a structure of user defined operations specified at
 *		pool creation time.
 * @zpool:	zpool driver
 * @zpool_ops:	zpool operations structure with an evict callback
 * @compact_wq:	workqueue for page layout background optimization
 * @release_wq:	workqueue for safe page release
 * @work:	work_struct for safe page release
 *
 * This structure is allocated at pool creation time and maintains metadata
 * pertaining to a particular z3fold pool.
 */
struct z3fold_pool {
	const char *name;
	spinlock_t lock;
	spinlock_t stale_lock;
	struct list_head *unbuddied;
	struct list_head lru;
	struct list_head stale;
	atomic64_t pages_nr;
	const struct z3fold_ops *ops;
	struct zpool *zpool;
	const struct zpool_ops *zpool_ops;
	struct workqueue_struct *compact_wq;
	struct workqueue_struct *release_wq;
	struct work_struct work;
};

/*
 * Internal z3fold page flags
 */
enum z3fold_page_flags {
	PAGE_HEADLESS = 0,
	MIDDLE_CHUNK_MAPPED,
	NEEDS_COMPACTING,
	PAGE_STALE
};

/*****************
 * Helpers
 *****************/

/* Converts an allocation size in bytes to size in z3fold chunks */
static int size_to_chunks(size_t size)
{
	return (size + CHUNK_SIZE - 1) >> CHUNK_SHIFT;
}

#define for_each_unbuddied_list(_iter, _begin) \
	for ((_iter) = (_begin); (_iter) < NCHUNKS; (_iter)++)

static void compact_page_work(struct work_struct *w);

/* Initializes the z3fold header of a newly allocated z3fold page */
static struct z3fold_header *init_z3fold_page(struct page *page,
					struct z3fold_pool *pool)
{
	struct z3fold_header *zhdr = page_address(page);

	INIT_LIST_HEAD(&page->lru);
	clear_bit(PAGE_HEADLESS, &page->private);
	clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
	clear_bit(NEEDS_COMPACTING, &page->private);
	clear_bit(PAGE_STALE, &page->private);

	spin_lock_init(&zhdr->page_lock);
	kref_init(&zhdr->refcount);
	zhdr->first_chunks = 0;
	zhdr->middle_chunks = 0;
	zhdr->last_chunks = 0;
	zhdr->first_num = 0;
	zhdr->start_middle = 0;
	zhdr->cpu = -1;
	zhdr->pool = pool;
	INIT_LIST_HEAD(&zhdr->buddy);
	INIT_WORK(&zhdr->work, compact_page_work);
	return zhdr;
}

/* Resets the struct page fields and frees the page */
static void free_z3fold_page(struct page *page)
{
	__free_page(page);
}

/* Lock a z3fold page */
static inline void z3fold_page_lock(struct z3fold_header *zhdr)
{
	spin_lock(&zhdr->page_lock);
}

/* Try to lock a z3fold page */
static inline int z3fold_page_trylock(struct z3fold_header *zhdr)
{
	return spin_trylock(&zhdr->page_lock);
}

/* Unlock a z3fold page */
static inline void z3fold_page_unlock(struct z3fold_header *zhdr)
{
	spin_unlock(&zhdr->page_lock);
}

/*
 * Encodes the handle of a particular buddy within a z3fold page
 * Pool lock should be held as this function accesses first_num
 */
static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud)
{
	unsigned long handle;

	handle = (unsigned long)zhdr;
	if (bud != HEADLESS)
		handle += (bud + zhdr->first_num) & BUDDY_MASK;
	return handle;
}

/* Returns the z3fold page where a given handle is stored */
static struct z3fold_header *handle_to_z3fold_header(unsigned long handle)
{
	return (struct z3fold_header *)(handle & PAGE_MASK);
}

/*
 * (handle & BUDDY_MASK) < zhdr->first_num is possible in encode_handle
 * but that doesn't matter, because the masking will result in the
 * correct buddy number.
 */
static enum buddy handle_to_buddy(unsigned long handle)
{
	struct z3fold_header *zhdr = handle_to_z3fold_header(handle);
	return (handle - zhdr->first_num) & BUDDY_MASK;
}
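
/*
 * Illustration of the handle encoding above: the z3fold header sits at the
 * start of its page, so the zhdr address is page aligned and the two low
 * bits of a non-HEADLESS handle carry (bud + first_num) & 3. For example,
 * with first_num == 2 and bud == LAST (3), encode_handle() stores
 * (3 + 2) & 3 == 1 in the low bits; handle_to_buddy() then computes
 * (handle - 2) & 3 == 3 == LAST, because the page-aligned address
 * contributes nothing modulo 4.
 */
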
static void __release_z3fold_page(struct z3fold_header *zhdr, bool locked)
{
	struct page *page = virt_to_page(zhdr);
	struct z3fold_pool *pool = zhdr->pool;

	WARN_ON(!list_empty(&zhdr->buddy));
	set_bit(PAGE_STALE, &page->private);
	clear_bit(NEEDS_COMPACTING, &page->private);
	spin_lock(&pool->lock);
	if (!list_empty(&page->lru))
		list_del(&page->lru);
	spin_unlock(&pool->lock);
	if (locked)
		z3fold_page_unlock(zhdr);
	spin_lock(&pool->stale_lock);
	list_add(&zhdr->buddy, &pool->stale);
	queue_work(pool->release_wq, &pool->work);
	spin_unlock(&pool->stale_lock);
}

static void __attribute__((__unused__))
			release_z3fold_page(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	__release_z3fold_page(zhdr, false);
}

static void release_z3fold_page_locked(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	WARN_ON(z3fold_page_trylock(zhdr));
	__release_z3fold_page(zhdr, true);
}

static void release_z3fold_page_locked_list(struct kref *ref)
{
	struct z3fold_header *zhdr = container_of(ref, struct z3fold_header,
						refcount);
	spin_lock(&zhdr->pool->lock);
	list_del_init(&zhdr->buddy);
	spin_unlock(&zhdr->pool->lock);

	WARN_ON(z3fold_page_trylock(zhdr));
	__release_z3fold_page(zhdr, true);
}

static void free_pages_work(struct work_struct *w)
{
	struct z3fold_pool *pool = container_of(w, struct z3fold_pool, work);

	spin_lock(&pool->stale_lock);
	while (!list_empty(&pool->stale)) {
		struct z3fold_header *zhdr = list_first_entry(&pool->stale,
						struct z3fold_header, buddy);
		struct page *page = virt_to_page(zhdr);

		list_del(&zhdr->buddy);
		if (WARN_ON(!test_bit(PAGE_STALE, &page->private)))
			continue;
		spin_unlock(&pool->stale_lock);
		cancel_work_sync(&zhdr->work);
		free_z3fold_page(page);
		cond_resched();
		spin_lock(&pool->stale_lock);
	}
	spin_unlock(&pool->stale_lock);
}

/*
 * Returns the number of free chunks in a z3fold page.
 * NB: can't be used with HEADLESS pages.
 */
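/*
 * Illustration, assuming 4 KiB pages (TOTAL_CHUNKS == 64): if the first
 * buddy is in use, the middle buddy occupies chunks 10..29 (start_middle
 * == 10, middle_chunks == 20) and the last buddy is free, then
 * nfree_before == 0 and nfree_after == 64 - (10 + 20) == 34, so the page
 * is reported as having 34 free chunks, i.e. only the larger of the two
 * gaps around the middle buddy counts as usable space.
 */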
static int num_free_chunks(struct z3fold_header *zhdr)
{
	int nfree;
	/*
	 * If there is a middle object, pick up the bigger free space
	 * either before or after it. Otherwise just subtract the number
	 * of chunks occupied by the first and the last objects.
	 */
	if (zhdr->middle_chunks != 0) {
		int nfree_before = zhdr->first_chunks ?
			0 : zhdr->start_middle - ZHDR_CHUNKS;
		int nfree_after = zhdr->last_chunks ?
			0 : TOTAL_CHUNKS -
				(zhdr->start_middle + zhdr->middle_chunks);
		nfree = max(nfree_before, nfree_after);
	} else
		nfree = NCHUNKS - zhdr->first_chunks - zhdr->last_chunks;
	return nfree;
}

static inline void *mchunk_memmove(struct z3fold_header *zhdr,
				unsigned short dst_chunk)
{
	void *beg = zhdr;
	return memmove(beg + (dst_chunk << CHUNK_SHIFT),
		       beg + (zhdr->start_middle << CHUNK_SHIFT),
		       zhdr->middle_chunks << CHUNK_SHIFT);
}

#define BIG_CHUNK_GAP	3
/* Has to be called with lock held */
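/*
 * Note on the lone-middle case below: when only the middle buddy is in use
 * it is moved to the front of the page and accounted as the first buddy;
 * bumping first_num by one then makes a handle that was encoded for MIDDLE
 * decode as FIRST afterwards ((MIDDLE + old first_num) - (old first_num + 1)
 * == 1 == FIRST, modulo 4), so existing handles keep resolving to the data.
 */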
static int z3fold_compact_page(struct z3fold_header *zhdr)
{
	struct page *page = virt_to_page(zhdr);

	if (test_bit(MIDDLE_CHUNK_MAPPED, &page->private))
		return 0; /* can't move middle chunk, it's used */

	if (zhdr->middle_chunks == 0)
		return 0; /* nothing to compact */

	if (zhdr->first_chunks == 0 && zhdr->last_chunks == 0) {
		/* move to the beginning */
		mchunk_memmove(zhdr, ZHDR_CHUNKS);
		zhdr->first_chunks = zhdr->middle_chunks;
		zhdr->middle_chunks = 0;
		zhdr->start_middle = 0;
		zhdr->first_num++;
		return 1;
	}

	/*
	 * moving data is expensive, so let's only do that if
	 * there's substantial gain (at least BIG_CHUNK_GAP chunks)
	 */
	if (zhdr->first_chunks != 0 && zhdr->last_chunks == 0 &&
	    zhdr->start_middle - (zhdr->first_chunks + ZHDR_CHUNKS) >=
			BIG_CHUNK_GAP) {
		mchunk_memmove(zhdr, zhdr->first_chunks + ZHDR_CHUNKS);
		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
		return 1;
	} else if (zhdr->last_chunks != 0 && zhdr->first_chunks == 0 &&
		   TOTAL_CHUNKS - (zhdr->last_chunks + zhdr->start_middle
					+ zhdr->middle_chunks) >=
			BIG_CHUNK_GAP) {
		unsigned short new_start = TOTAL_CHUNKS - zhdr->last_chunks -
			zhdr->middle_chunks;
		mchunk_memmove(zhdr, new_start);
		zhdr->start_middle = new_start;
		return 1;
	}

	return 0;
}

static void do_compact_page(struct z3fold_header *zhdr, bool locked)
{
	struct z3fold_pool *pool = zhdr->pool;
	struct page *page;
	struct list_head *unbuddied;
	int fchunks;

	page = virt_to_page(zhdr);
	if (locked)
		WARN_ON(z3fold_page_trylock(zhdr));
	else
		z3fold_page_lock(zhdr);
	if (WARN_ON(!test_and_clear_bit(NEEDS_COMPACTING, &page->private))) {
		z3fold_page_unlock(zhdr);
		return;
	}
	spin_lock(&pool->lock);
	list_del_init(&zhdr->buddy);
	spin_unlock(&pool->lock);

	if (kref_put(&zhdr->refcount, release_z3fold_page_locked)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}

	z3fold_compact_page(zhdr);
	unbuddied = get_cpu_ptr(pool->unbuddied);
	fchunks = num_free_chunks(zhdr);
	if (fchunks < NCHUNKS &&
	    (!zhdr->first_chunks || !zhdr->middle_chunks ||
			!zhdr->last_chunks)) {
		/* the page's not completely free and it's unbuddied */
		spin_lock(&pool->lock);
		list_add(&zhdr->buddy, &unbuddied[fchunks]);
		spin_unlock(&pool->lock);
		zhdr->cpu = smp_processor_id();
	}
	put_cpu_ptr(pool->unbuddied);

	z3fold_page_unlock(zhdr);
}

static void compact_page_work(struct work_struct *w)
{
	struct z3fold_header *zhdr = container_of(w, struct z3fold_header,
						work);

	do_compact_page(zhdr, false);
}

/*
 * API Functions
 */

/**
 * z3fold_create_pool() - create a new z3fold pool
 * @name:	pool name
 * @gfp:	gfp flags when allocating the z3fold pool structure
 * @ops:	user-defined operations for the z3fold pool
 *
 * Return: pointer to the new z3fold pool or NULL if the metadata allocation
 * failed.
 */
static struct z3fold_pool *z3fold_create_pool(const char *name, gfp_t gfp,
		const struct z3fold_ops *ops)
{
	struct z3fold_pool *pool = NULL;
	int i, cpu;

	pool = kzalloc(sizeof(struct z3fold_pool), gfp);
	if (!pool)
		goto out;
	spin_lock_init(&pool->lock);
	spin_lock_init(&pool->stale_lock);
	pool->unbuddied = __alloc_percpu(sizeof(struct list_head)*NCHUNKS, 2);
	for_each_possible_cpu(cpu) {
		struct list_head *unbuddied =
				per_cpu_ptr(pool->unbuddied, cpu);
		for_each_unbuddied_list(i, 0)
			INIT_LIST_HEAD(&unbuddied[i]);
	}
	INIT_LIST_HEAD(&pool->lru);
	INIT_LIST_HEAD(&pool->stale);
	atomic64_set(&pool->pages_nr, 0);
	pool->name = name;
	pool->compact_wq = create_singlethread_workqueue(pool->name);
	if (!pool->compact_wq)
		goto out;
	pool->release_wq = create_singlethread_workqueue(pool->name);
	if (!pool->release_wq)
		goto out_wq;
	INIT_WORK(&pool->work, free_pages_work);
	pool->ops = ops;
	return pool;

out_wq:
	destroy_workqueue(pool->compact_wq);
out:
	kfree(pool);
	return NULL;
}

/**
 * z3fold_destroy_pool() - destroys an existing z3fold pool
 * @pool:	the z3fold pool to be destroyed
 *
 * The pool should be emptied before this function is called.
 */
static void z3fold_destroy_pool(struct z3fold_pool *pool)
{
	destroy_workqueue(pool->release_wq);
	destroy_workqueue(pool->compact_wq);
	kfree(pool);
}

/**
 * z3fold_alloc() - allocates a region of a given size
 * @pool:	z3fold pool from which to allocate
 * @size:	size in bytes of the desired allocation
 * @gfp:	gfp flags used if the pool needs to grow
 * @handle:	handle of the new allocation
 *
 * This function will attempt to find a free region in the pool large enough
 * to satisfy the allocation request. A search of the unbuddied lists is
 * performed first. If no suitable free region is found, then a new page is
 * allocated and added to the pool to satisfy the request.
 *
 * gfp should not set __GFP_HIGHMEM as highmem pages cannot be used
 * as z3fold pool pages.
 *
 * Return: 0 if success and handle is set, otherwise -EINVAL if the size or
 * gfp arguments are invalid or -ENOMEM if the pool was unable to allocate
 * a new page.
 */
static int z3fold_alloc(struct z3fold_pool *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	int chunks = 0, i, freechunks;
	struct z3fold_header *zhdr = NULL;
	struct page *page = NULL;
	enum buddy bud;
	bool can_sleep = (gfp & __GFP_RECLAIM) == __GFP_RECLAIM;

	if (!size || (gfp & __GFP_HIGHMEM))
		return -EINVAL;

	if (size > PAGE_SIZE)
		return -ENOSPC;

	if (size > PAGE_SIZE - ZHDR_SIZE_ALIGNED - CHUNK_SIZE)
		bud = HEADLESS;
	else {
		struct list_head *unbuddied;
		chunks = size_to_chunks(size);

lookup:
		/* First, try to find an unbuddied z3fold page. */
		unbuddied = get_cpu_ptr(pool->unbuddied);
		for_each_unbuddied_list(i, chunks) {
			struct list_head *l = &unbuddied[i];

			zhdr = list_first_entry_or_null(READ_ONCE(l),
						struct z3fold_header, buddy);

			if (!zhdr)
				continue;

			/* Re-check under lock. */
			spin_lock(&pool->lock);
			l = &unbuddied[i];
			if (unlikely(zhdr != list_first_entry(READ_ONCE(l),
					struct z3fold_header, buddy)) ||
			    !z3fold_page_trylock(zhdr)) {
				spin_unlock(&pool->lock);
				put_cpu_ptr(pool->unbuddied);
				goto lookup;
			}
			list_del_init(&zhdr->buddy);
			zhdr->cpu = -1;
			spin_unlock(&pool->lock);

			page = virt_to_page(zhdr);
			if (test_bit(NEEDS_COMPACTING, &page->private)) {
				z3fold_page_unlock(zhdr);
				zhdr = NULL;
				put_cpu_ptr(pool->unbuddied);
				if (can_sleep)
					cond_resched();
				goto lookup;
			}

			/*
			 * this page could not be removed from its unbuddied
			 * list while pool lock was held, and then we've taken
			 * page lock so kref_put could not be called before
			 * we got here, so it's safe to just call kref_get()
			 */
			kref_get(&zhdr->refcount);
			break;
		}
		put_cpu_ptr(pool->unbuddied);

		if (zhdr) {
			if (zhdr->first_chunks == 0) {
				if (zhdr->middle_chunks != 0 &&
				    chunks >= zhdr->start_middle)
					bud = LAST;
				else
					bud = FIRST;
			} else if (zhdr->last_chunks == 0)
				bud = LAST;
			else if (zhdr->middle_chunks == 0)
				bud = MIDDLE;
			else {
				if (kref_put(&zhdr->refcount,
					     release_z3fold_page_locked))
					atomic64_dec(&pool->pages_nr);
				else
					z3fold_page_unlock(zhdr);
				pr_err("No free chunks in unbuddied\n");
				WARN_ON(1);
				goto lookup;
			}
			goto found;
		}
		bud = FIRST;
	}

	spin_lock(&pool->stale_lock);
	zhdr = list_first_entry_or_null(&pool->stale,
					struct z3fold_header, buddy);
	/*
	 * Before allocating a page, let's see if we can take one from the
	 * stale pages list. cancel_work_sync() can sleep so we must make
	 * sure it won't be called in case we're in atomic context.
	 */
	if (zhdr && (can_sleep || !work_pending(&zhdr->work))) {
		list_del(&zhdr->buddy);
		spin_unlock(&pool->stale_lock);
		if (can_sleep)
			cancel_work_sync(&zhdr->work);
		page = virt_to_page(zhdr);
	} else {
		spin_unlock(&pool->stale_lock);
		page = alloc_page(gfp);
	}

	if (!page)
		return -ENOMEM;

	atomic64_inc(&pool->pages_nr);
	zhdr = init_z3fold_page(page, pool);

	if (bud == HEADLESS) {
		set_bit(PAGE_HEADLESS, &page->private);
		goto headless;
	}
	z3fold_page_lock(zhdr);

found:
	if (bud == FIRST)
		zhdr->first_chunks = chunks;
	else if (bud == LAST)
		zhdr->last_chunks = chunks;
	else {
		zhdr->middle_chunks = chunks;
		zhdr->start_middle = zhdr->first_chunks + ZHDR_CHUNKS;
	}

	if (zhdr->first_chunks == 0 || zhdr->last_chunks == 0 ||
	    zhdr->middle_chunks == 0) {
		struct list_head *unbuddied = get_cpu_ptr(pool->unbuddied);

		/* Add to unbuddied list */
		freechunks = num_free_chunks(zhdr);
		spin_lock(&pool->lock);
		list_add(&zhdr->buddy, &unbuddied[freechunks]);
		spin_unlock(&pool->lock);
		zhdr->cpu = smp_processor_id();
		put_cpu_ptr(pool->unbuddied);
	}

headless:
	spin_lock(&pool->lock);
	/* Add/move z3fold page to beginning of LRU */
	if (!list_empty(&page->lru))
		list_del(&page->lru);

	list_add(&page->lru, &pool->lru);

	*handle = encode_handle(zhdr, bud);
	spin_unlock(&pool->lock);
	if (bud != HEADLESS)
		z3fold_page_unlock(zhdr);

	return 0;
}

/**
 * z3fold_free() - frees the allocation associated with the given handle
 * @pool:	pool in which the allocation resided
 * @handle:	handle associated with the allocation returned by z3fold_alloc()
 *
 * In the case that the z3fold page in which the allocation resides is under
 * reclaim, as indicated by the PG_reclaim flag being set, this function
 * only sets the first|last_chunks to 0. The page is actually freed
 * once all its buddies are evicted (see z3fold_reclaim_page() below).
 */
static void z3fold_free(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	enum buddy bud;

	zhdr = handle_to_z3fold_header(handle);
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private)) {
		/* HEADLESS page stored */
		bud = HEADLESS;
	} else {
		z3fold_page_lock(zhdr);
		bud = handle_to_buddy(handle);

		switch (bud) {
		case FIRST:
			zhdr->first_chunks = 0;
			break;
		case MIDDLE:
			zhdr->middle_chunks = 0;
			zhdr->start_middle = 0;
			break;
		case LAST:
			zhdr->last_chunks = 0;
			break;
		default:
			pr_err("%s: unknown bud %d\n", __func__, bud);
			WARN_ON(1);
			z3fold_page_unlock(zhdr);
			return;
		}
	}

	if (bud == HEADLESS) {
		spin_lock(&pool->lock);
		list_del(&page->lru);
		spin_unlock(&pool->lock);
		free_z3fold_page(page);
		atomic64_dec(&pool->pages_nr);
		return;
	}

	if (kref_put(&zhdr->refcount, release_z3fold_page_locked_list)) {
		atomic64_dec(&pool->pages_nr);
		return;
	}
	if (test_and_set_bit(NEEDS_COMPACTING, &page->private)) {
		z3fold_page_unlock(zhdr);
		return;
	}
	if (zhdr->cpu < 0 || !cpu_online(zhdr->cpu)) {
		spin_lock(&pool->lock);
		list_del_init(&zhdr->buddy);
		spin_unlock(&pool->lock);
		zhdr->cpu = -1;
		kref_get(&zhdr->refcount);
		do_compact_page(zhdr, true);
		return;
	}
	kref_get(&zhdr->refcount);
	queue_work_on(zhdr->cpu, pool->compact_wq, &zhdr->work);
	z3fold_page_unlock(zhdr);
}

/**
 * z3fold_reclaim_page() - evicts allocations from a pool page and frees it
 * @pool:	pool from which a page will attempt to be evicted
 * @retries:	number of pages on the LRU list for which eviction will
 *		be attempted before failing
 *
 * z3fold reclaim is different from normal system reclaim in that it is done
 * from the bottom, up. This is because only the bottom layer, z3fold, has
 * information on how the allocations are organized within each z3fold page.
 * This has the potential to create interesting locking situations between
 * z3fold and the user, however.
 *
 * To avoid these, this is how z3fold_reclaim_page() should be called:
 * The user detects a page should be reclaimed and calls z3fold_reclaim_page().
 * z3fold_reclaim_page() will remove a z3fold page from the pool LRU list and
 * call the user-defined eviction handler with the pool and handle as
 * arguments.
 *
 * If the handle can not be evicted, the eviction handler should return
 * non-zero. z3fold_reclaim_page() will add the z3fold page back to the
 * appropriate list and try the next z3fold page on the LRU up to
 * a user defined number of retries.
 *
 * If the handle is successfully evicted, the eviction handler should
 * return 0 _and_ should have called z3fold_free() on the handle. z3fold_free()
 * contains logic to delay freeing the page if the page is under reclaim,
 * as indicated by the setting of the PG_reclaim flag on the underlying page.
 *
 * If all buddies in the z3fold page are successfully evicted, then the
 * z3fold page can be freed.
 *
 * Returns: 0 if page is successfully freed, otherwise -EINVAL if there are
 * no pages to evict or an eviction handler is not registered, -EAGAIN if
 * the retry limit was hit.
 */
static int z3fold_reclaim_page(struct z3fold_pool *pool, unsigned int retries)
{
	int i, ret = 0;
	struct z3fold_header *zhdr = NULL;
	struct page *page = NULL;
	struct list_head *pos;
	unsigned long first_handle = 0, middle_handle = 0, last_handle = 0;

	spin_lock(&pool->lock);
	if (!pool->ops || !pool->ops->evict || retries == 0) {
		spin_unlock(&pool->lock);
		return -EINVAL;
	}
	for (i = 0; i < retries; i++) {
		if (list_empty(&pool->lru)) {
			spin_unlock(&pool->lock);
			return -EINVAL;
		}
		list_for_each_prev(pos, &pool->lru) {
			page = list_entry(pos, struct page, lru);
			if (test_bit(PAGE_HEADLESS, &page->private))
				/* candidate found */
				break;

			zhdr = page_address(page);
			if (!z3fold_page_trylock(zhdr))
				continue; /* can't evict at this point */
			kref_get(&zhdr->refcount);
			list_del_init(&zhdr->buddy);
			zhdr->cpu = -1;
		}

		list_del_init(&page->lru);
		spin_unlock(&pool->lock);

		if (!test_bit(PAGE_HEADLESS, &page->private)) {
			/*
			 * We need to encode the handles before unlocking,
			 * since we can race with free that will set
			 * (first|last)_chunks to 0
			 */
			first_handle = 0;
			last_handle = 0;
			middle_handle = 0;
			if (zhdr->first_chunks)
				first_handle = encode_handle(zhdr, FIRST);
			if (zhdr->middle_chunks)
				middle_handle = encode_handle(zhdr, MIDDLE);
			if (zhdr->last_chunks)
				last_handle = encode_handle(zhdr, LAST);
			/*
			 * it's safe to unlock here because we hold a
			 * reference to this page
			 */
			z3fold_page_unlock(zhdr);
		} else {
			first_handle = encode_handle(zhdr, HEADLESS);
			last_handle = middle_handle = 0;
		}

		/* Issue the eviction callback(s) */
		if (middle_handle) {
			ret = pool->ops->evict(pool, middle_handle);
			if (ret)
				goto next;
		}
		if (first_handle) {
			ret = pool->ops->evict(pool, first_handle);
			if (ret)
				goto next;
		}
		if (last_handle) {
			ret = pool->ops->evict(pool, last_handle);
			if (ret)
				goto next;
		}
next:
		spin_lock(&pool->lock);
		if (test_bit(PAGE_HEADLESS, &page->private)) {
			if (ret == 0) {
				spin_unlock(&pool->lock);
				free_z3fold_page(page);
				return 0;
			}
		} else if (kref_put(&zhdr->refcount, release_z3fold_page)) {
			atomic64_dec(&pool->pages_nr);
			spin_unlock(&pool->lock);
			return 0;
		}

		/*
		 * Add to the beginning of LRU.
		 * Pool lock has to be kept here to ensure the page has
		 * not already been released
		 */
		list_add(&page->lru, &pool->lru);
	}
	spin_unlock(&pool->lock);
	return -EAGAIN;
}

/**
 * z3fold_map() - maps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be mapped
 *
 * Extracts the buddy number from handle and constructs the pointer to the
 * correct starting chunk within the page.
 *
 * Returns: a pointer to the mapped allocation
 */
static void *z3fold_map(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	void *addr;
	enum buddy buddy;

	zhdr = handle_to_z3fold_header(handle);
	addr = zhdr;
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private))
		goto out;

	z3fold_page_lock(zhdr);
	buddy = handle_to_buddy(handle);
	switch (buddy) {
	case FIRST:
		addr += ZHDR_SIZE_ALIGNED;
		break;
	case MIDDLE:
		addr += zhdr->start_middle << CHUNK_SHIFT;
		set_bit(MIDDLE_CHUNK_MAPPED, &page->private);
		break;
	case LAST:
		addr += PAGE_SIZE - (zhdr->last_chunks << CHUNK_SHIFT);
		break;
	default:
		pr_err("unknown buddy id %d\n", buddy);
		WARN_ON(1);
		addr = NULL;
		break;
	}

	z3fold_page_unlock(zhdr);
out:
	return addr;
}

/**
 * z3fold_unmap() - unmaps the allocation associated with the given handle
 * @pool:	pool in which the allocation resides
 * @handle:	handle associated with the allocation to be unmapped
 */
static void z3fold_unmap(struct z3fold_pool *pool, unsigned long handle)
{
	struct z3fold_header *zhdr;
	struct page *page;
	enum buddy buddy;

	zhdr = handle_to_z3fold_header(handle);
	page = virt_to_page(zhdr);

	if (test_bit(PAGE_HEADLESS, &page->private))
		return;

	z3fold_page_lock(zhdr);
	buddy = handle_to_buddy(handle);
	if (buddy == MIDDLE)
		clear_bit(MIDDLE_CHUNK_MAPPED, &page->private);
	z3fold_page_unlock(zhdr);
}

/**
 * z3fold_get_pool_size() - gets the z3fold pool size in pages
 * @pool:	pool whose size is being queried
 *
 * Returns: size in pages of the given pool.
 */
static u64 z3fold_get_pool_size(struct z3fold_pool *pool)
{
	return atomic64_read(&pool->pages_nr);
}

/*****************
 * zpool
 ****************/

static int z3fold_zpool_evict(struct z3fold_pool *pool, unsigned long handle)
{
	if (pool->zpool && pool->zpool_ops && pool->zpool_ops->evict)
		return pool->zpool_ops->evict(pool->zpool, handle);
	else
		return -ENOENT;
}

static const struct z3fold_ops z3fold_zpool_ops = {
	.evict =	z3fold_zpool_evict
};

static void *z3fold_zpool_create(const char *name, gfp_t gfp,
			       const struct zpool_ops *zpool_ops,
			       struct zpool *zpool)
{
	struct z3fold_pool *pool;

	pool = z3fold_create_pool(name, gfp,
				zpool_ops ? &z3fold_zpool_ops : NULL);
	if (pool) {
		pool->zpool = zpool;
		pool->zpool_ops = zpool_ops;
	}
	return pool;
}

static void z3fold_zpool_destroy(void *pool)
{
	z3fold_destroy_pool(pool);
}

static int z3fold_zpool_malloc(void *pool, size_t size, gfp_t gfp,
			unsigned long *handle)
{
	return z3fold_alloc(pool, size, gfp, handle);
}

static void z3fold_zpool_free(void *pool, unsigned long handle)
{
	z3fold_free(pool, handle);
}

static int z3fold_zpool_shrink(void *pool, unsigned int pages,
			unsigned int *reclaimed)
{
	unsigned int total = 0;
	int ret = -EINVAL;

	while (total < pages) {
		ret = z3fold_reclaim_page(pool, 8);
		if (ret < 0)
			break;
		total++;
	}

	if (reclaimed)
		*reclaimed = total;

	return ret;
}

static void *z3fold_zpool_map(void *pool, unsigned long handle,
			enum zpool_mapmode mm)
{
	return z3fold_map(pool, handle);
}

static void z3fold_zpool_unmap(void *pool, unsigned long handle)
{
	z3fold_unmap(pool, handle);
}

static u64 z3fold_zpool_total_size(void *pool)
{
	return z3fold_get_pool_size(pool) * PAGE_SIZE;
}

static struct zpool_driver z3fold_zpool_driver = {
	.type =		"z3fold",
	.owner =	THIS_MODULE,
	.create =	z3fold_zpool_create,
	.destroy =	z3fold_zpool_destroy,
	.malloc =	z3fold_zpool_malloc,
	.free =		z3fold_zpool_free,
	.shrink =	z3fold_zpool_shrink,
	.map =		z3fold_zpool_map,
	.unmap =	z3fold_zpool_unmap,
	.total_size =	z3fold_zpool_total_size,
};
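
/*
 * Illustrative usage sketch (not part of this file): a zpool client such as
 * zswap reaches this driver roughly as follows. Exact zpool API signatures
 * depend on the kernel version, and client_zpool_ops, compressed_data and
 * length are placeholders, so treat this as an approximation only.
 *
 *	struct zpool *zp = zpool_create_pool("z3fold", "mypool", GFP_KERNEL,
 *					     &client_zpool_ops);
 *	unsigned long handle;
 *
 *	if (zp && zpool_malloc(zp, length, GFP_KERNEL, &handle) == 0) {
 *		void *dst = zpool_map_handle(zp, handle, ZPOOL_MM_WO);
 *
 *		memcpy(dst, compressed_data, length);
 *		zpool_unmap_handle(zp, handle);
 *	}
 */
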
MODULE_ALIAS("zpool-z3fold");

static int __init init_z3fold(void)
{
	/* Make sure the z3fold header is not larger than the page size */
	BUILD_BUG_ON(ZHDR_SIZE_ALIGNED > PAGE_SIZE);
	zpool_register_driver(&z3fold_zpool_driver);

	return 0;
}

static void __exit exit_z3fold(void)
{
	zpool_unregister_driver(&z3fold_zpool_driver);
}

module_init(init_z3fold);
module_exit(exit_z3fold);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Vitaly Wool <vitalywool@gmail.com>");
MODULE_DESCRIPTION("3-Fold Allocator for Compressed Pages");