/*
 * zswap.c - zswap driver file
 *
 * zswap is a backend for frontswap that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/mempool.h>
#include <linux/zpool.h>

#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
static u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */

/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;

/*********************************
* tunables
**********************************/

/* Enable/disable zswap (disabled by default) */
static bool zswap_enabled;
static int zswap_enabled_param_set(const char *,
				   const struct kernel_param *);
static struct kernel_param_ops zswap_enabled_param_ops = {
	.set = zswap_enabled_param_set,
	.get = param_get_bool,
};
module_param_cb(enabled, &zswap_enabled_param_ops, &zswap_enabled, 0644);

/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static struct kernel_param_ops zswap_compressor_param_ops = {
	.set = zswap_compressor_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor, 0644);

/* Compressed storage zpool to use */
#define ZSWAP_ZPOOL_DEFAULT "zbud"
static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static struct kernel_param_ops zswap_zpool_param_ops = {
	.set = zswap_zpool_param_set,
	.get = param_get_charp,
	.free = param_free_charp,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
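
/*
 * Note: like other module parameters, the tunables above are exposed at
 * runtime under /sys/module/zswap/parameters/ and can also be set on the
 * kernel command line with a "zswap." prefix, e.g. zswap.enabled=1.
 */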
/*********************************
* data structures
**********************************/

struct zswap_pool {
	struct zpool *zpool;
	struct crypto_comp * __percpu *tfm;
	struct kref kref;
	struct list_head list;
	struct work_struct work;
	struct hlist_node node;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
};

/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * refcount - the number of outstanding references to the entry.  This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 */
struct zswap_entry {
	struct rb_node rbnode;
	pgoff_t offset;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	unsigned long handle;
};
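
/*
 * A zswap_header is stored at the front of every compressed object; it
 * records the swap entry so that zswap_writeback_entry() can find the
 * owning zswap_tree and offset when the zpool asks us to evict the object.
 */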
struct zswap_header {
	swp_entry_t swpentry;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};

static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);
/* pool counter to provide unique names to zpool */
static atomic_t zswap_pools_count = ATOMIC_INIT(0);

/* used by param callback function */
static bool zswap_init_started;

/* fatal error during init */
static bool zswap_init_failed;

/*********************************
* helpers and fwd declarations
**********************************/

#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpool))

static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

static const struct zpool_ops zswap_zpool_ops = {
	.evict = zswap_writeback_entry
};
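
/* returns true when compressed storage exceeds zswap_max_pool_percent of RAM */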
static bool zswap_is_full(void)
{
	return totalram_pages * zswap_max_pool_percent / 100 <
		DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}

static void zswap_update_total_size(void)
{
	struct zswap_pool *pool;
	u64 total = 0;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		total += zpool_get_total_size(pool->zpool);

	rcu_read_unlock();

	zswap_pool_total_size = total;
}

/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static int __init zswap_entry_cache_create(void)
{
	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	return zswap_entry_cache == NULL;
}

static void __init zswap_entry_cache_destroy(void)
{
	kmem_cache_destroy(zswap_entry_cache);
}

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
	struct zswap_entry *entry;

	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
	if (!entry)
		return NULL;
	entry->refcount = 1;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}

/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		if (entry->offset > offset)
			node = node->rb_left;
		else if (entry->offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}
/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		if (myentry->offset > entry->offset)
			link = &(*link)->rb_left;
		else if (myentry->offset < entry->offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		RB_CLEAR_NODE(&entry->rbnode);
	}
}

/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
	zpool_free(entry->pool->zpool, entry->handle);
	zswap_pool_put(entry->pool);
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_update_total_size();
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}

/*
 * caller must hold the tree lock
 * remove from the tree and free it, if nobody references the entry
 */
static void zswap_entry_put(struct zswap_tree *tree,
			struct zswap_entry *entry)
{
	int refcount = --entry->refcount;

	BUG_ON(refcount < 0);
	if (refcount == 0) {
		zswap_rb_erase(&tree->rbroot, entry);
		zswap_free_entry(entry);
	}
}

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
				pgoff_t offset)
{
	struct zswap_entry *entry;

	entry = zswap_rb_search(root, offset);
	if (entry)
		zswap_entry_get(entry);

	return entry;
}

/*********************************
* per-cpu code
**********************************/
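
/*
 * Per-cpu scratch buffer used as the destination for compression.  It is
 * sized to two pages to leave headroom in case a poorly compressible page
 * produces output larger than PAGE_SIZE.
 */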
static DEFINE_PER_CPU(u8 *, zswap_dstmem);

static int zswap_dstmem_prepare(unsigned int cpu)
{
	u8 *dst;

	dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
	if (!dst) {
		pr_err("can't allocate compressor buffer\n");
		return -ENOMEM;
	}
	per_cpu(zswap_dstmem, cpu) = dst;
	return 0;
}

static int zswap_dstmem_dead(unsigned int cpu)
{
	u8 *dst;

	dst = per_cpu(zswap_dstmem, cpu);
	kfree(dst);
	per_cpu(zswap_dstmem, cpu) = NULL;

	return 0;
}

static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_comp *tfm;

	if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
		return 0;

	tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
	if (IS_ERR_OR_NULL(tfm)) {
		pr_err("could not alloc crypto comp %s : %ld\n",
		       pool->tfm_name, PTR_ERR(tfm));
		return -ENOMEM;
	}
	*per_cpu_ptr(pool->tfm, cpu) = tfm;
	return 0;
}

static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
{
	struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
	struct crypto_comp *tfm;

	tfm = *per_cpu_ptr(pool->tfm, cpu);
	if (!IS_ERR_OR_NULL(tfm))
		crypto_free_comp(tfm);
	*per_cpu_ptr(pool->tfm, cpu) = NULL;
	return 0;
}

/*********************************
* pool functions
**********************************/
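
/*
 * The first pool on the RCU-protected zswap_pools list is the current pool:
 * new stores go to it.  Older pools remain on the list until their entries
 * are freed or written back and their kref drops to zero.
 */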
static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ON(!pool);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!pool || !zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}

static struct zswap_pool *zswap_pool_last_get(void)
{
	struct zswap_pool *pool, *last = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		last = pool;
	if (!WARN_ON(!last) && !zswap_pool_get(last))
		last = NULL;

	rcu_read_unlock();

	return last;
}

/* type and compressor must be null-terminated */
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strcmp(pool->tfm_name, compressor))
			continue;
		if (strcmp(zpool_get_type(pool->zpool), type))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}

static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	struct zswap_pool *pool;
	char name[38]; /* 'zswap' + 32 char (max) num + \0 */
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
	int ret;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool) {
		pr_err("pool alloc failed\n");
		return NULL;
	}

	/* unique name for each pool specifically required by zsmalloc */
	snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));

	pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
	if (!pool->zpool) {
		pr_err("%s zpool not available\n", type);
		goto error;
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));

	strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
	pool->tfm = alloc_percpu(struct crypto_comp *);
	if (!pool->tfm) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
				       &pool->node);
	if (ret)
		goto error;
	pr_debug("using %s compressor\n", pool->tfm_name);

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	kref_init(&pool->kref);
	INIT_LIST_HEAD(&pool->list);

	zswap_pool_debug("created", pool);

	return pool;

error:
	free_percpu(pool->tfm);
	if (pool->zpool)
		zpool_destroy_pool(pool->zpool);
	kfree(pool);
	return NULL;
}
static __init struct zswap_pool *__zswap_pool_create_fallback(void)
{
	if (!crypto_has_comp(zswap_compressor, 0, 0)) {
		if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
			pr_err("default compressor %s not available\n",
			       zswap_compressor);
			return NULL;
		}
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
		param_free_charp(&zswap_compressor);
		zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
	}
	if (!zpool_has_pool(zswap_zpool_type)) {
		if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
			pr_err("default zpool %s not available\n",
			       zswap_zpool_type);
			return NULL;
		}
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
		param_free_charp(&zswap_zpool_type);
		zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
	}

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}

static void zswap_pool_destroy(struct zswap_pool *pool)
{
	zswap_pool_debug("destroying", pool);

	cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
	free_percpu(pool->tfm);
	zpool_destroy_pool(pool->zpool);
	kfree(pool);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	return kref_get_unless_zero(&pool->kref);
}

static void __zswap_pool_release(struct work_struct *work)
{
	struct zswap_pool *pool = container_of(work, typeof(*pool), work);

	synchronize_rcu();

	/* nobody should have been able to get a kref... */
	WARN_ON(kref_get_unless_zero(&pool->kref));

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}

static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);

	INIT_WORK(&pool->work, __zswap_pool_release);
	schedule_work(&pool->work);

	spin_unlock(&zswap_pools_lock);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/
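
/*
 * Changing the compressor or zpool parameter at runtime switches pools: a
 * pool matching the new type/compressor is looked up (or created), made the
 * current pool, and the reference held for the old current pool is dropped
 * so it can be destroyed once it is empty.
 */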
/* val must be a null-terminated string */
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char *s = strstrip((char *)val);
	int ret;

	if (zswap_init_failed) {
		pr_err("can't set param, initialization failed\n");
		return -ENODEV;
	}

	/* no change required */
	if (!strcmp(s, *(char **)kp->arg))
		return 0;

	/* if this is load-time (pre-init) param setting,
	 * don't create a pool; that's done during init.
	 */
	if (!zswap_init_started)
		return param_set_charp(s, kp);

	if (!type) {
		if (!zpool_has_pool(s)) {
			pr_err("zpool %s not available\n", s);
			return -ENOENT;
		}
		type = s;
	} else if (!compressor) {
		if (!crypto_has_comp(s, 0, 0)) {
			pr_err("compressor %s not available\n", s);
			return -ENOENT;
		}
		compressor = s;
	} else {
		WARN_ON(1);
		return -EINVAL;
	}

	spin_lock(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		list_del_rcu(&pool->list);
	} else {
		spin_unlock(&zswap_pools_lock);
		pool = zswap_pool_create(type, compressor);
		spin_lock(&zswap_pools_lock);
	}

	if (pool)
		ret = param_set_charp(s, kp);
	else
		ret = -EINVAL;

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}

static int zswap_enabled_param_set(const char *val,
				   const struct kernel_param *kp)
{
	if (zswap_init_failed) {
		pr_err("can't enable, initialization failed\n");
		return -ENODEV;
	}
	return param_set_bool(val, kp);
}

/*********************************
* writeback code
**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	ZSWAP_SWAPCACHE_NEW,
	ZSWAP_SWAPCACHE_EXIST,
	ZSWAP_SWAPCACHE_FAIL,
};

/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache).  If the page
 * is found, it is returned in retpage.  Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * On success, the swap cache page is returned in retpage
 * Returns ZSWAP_SWAPCACHE_EXIST if the page was already in the swap cache
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 *     the new page is added to swapcache and locked
 * Returns ZSWAP_SWAPCACHE_FAIL on error
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
				struct page **retpage)
{
	bool page_was_allocated;

	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
			NULL, 0, &page_was_allocated);
	if (page_was_allocated)
		return ZSWAP_SWAPCACHE_NEW;
	if (!*retpage)
		return ZSWAP_SWAPCACHE_FAIL;
	return ZSWAP_SWAPCACHE_EXIST;
}

/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted with the frontswap_store()
 * in the first place.  After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
{
	struct zswap_header *zhdr;
	swp_entry_t swpentry;
	struct zswap_tree *tree;
	pgoff_t offset;
	struct zswap_entry *entry;
	struct page *page;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* extract swpentry from data */
	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	swpentry = zhdr->swpentry; /* here */
	zpool_unmap_handle(pool, handle);
	tree = zswap_trees[swp_type(swpentry)];
	offset = swp_offset(swpentry);

	/* find and ref zswap entry */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was invalidated */
		spin_unlock(&tree->lock);
		return 0;
	}
	spin_unlock(&tree->lock);
	BUG_ON(offset != entry->offset);

	/* try to allocate swap cache page */
	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
		ret = -ENOMEM;
		goto fail;

	case ZSWAP_SWAPCACHE_EXIST:
		/* page is already in the swap cache, ignore for now */
		put_page(page);
		ret = -EEXIST;
		goto fail;

	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
		/* decompress */
		dlen = PAGE_SIZE;
		src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
				ZPOOL_MM_RO) + sizeof(struct zswap_header);
		dst = kmap_atomic(page);
		tfm = *get_cpu_ptr(entry->pool->tfm);
		ret = crypto_comp_decompress(tfm, src, entry->length,
					     dst, &dlen);
		put_cpu_ptr(entry->pool->tfm);
		kunmap_atomic(dst);
		zpool_unmap_handle(entry->pool->zpool, entry->handle);
		BUG_ON(ret);
		BUG_ON(dlen != PAGE_SIZE);

		/* page is up to date */
		SetPageUptodate(page);
	}

	/* move it to the tail of the inactive list after end_writeback */
	SetPageReclaim(page);

	/* start writeback */
	__swap_writepage(page, &wbc, end_swap_bio_write);
	put_page(page);
	zswap_written_back_pages++;

	spin_lock(&tree->lock);
	/* drop local reference */
	zswap_entry_put(tree, entry);

	/*
	 * There are two possible situations for the entry here:
	 * (1) refcount is 1 (normal case), entry is valid and on the tree
	 * (2) refcount is 0, entry is freed and not on the tree
	 *     because invalidate happened during writeback
	 * search the tree and free the entry if it is still found
	 */
	if (entry == zswap_rb_search(&tree->rbroot, offset))
		zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	goto end;

	/*
	 * if we get here due to ZSWAP_SWAPCACHE_EXIST,
	 * a load may be happening concurrently;
	 * it is safe and okay to not free the entry.
	 * if we free the entry in the following put,
	 * it is also okay to return !0
	 */
fail:
	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

end:
	return ret;
}
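
/*
 * Ask the oldest pool's zpool to evict one object; the zpool calls back
 * into zswap_writeback_entry() via zswap_zpool_ops to do the actual
 * decompress-and-writeback work.
 */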
static int zswap_shrink(void)
{
	struct zswap_pool *pool;
	int ret;

	pool = zswap_pool_last_get();
	if (!pool)
		return -ENOENT;

	ret = zpool_shrink(pool->zpool, 1, NULL);

	zswap_pool_put(pool);

	return ret;
}

/*********************************
* frontswap hooks
**********************************/
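
/*
 * These hooks are registered with frontswap in init_zswap() and are called
 * from the swap path: store when a page is swapped out, load when it is
 * swapped back in, and the invalidate hooks when swap slots or a whole
 * swap area are freed.
 */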
/* attempts to compress and store a single page */
static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	struct crypto_comp *tfm;
	int ret;
	unsigned int dlen = PAGE_SIZE, len;
	unsigned long handle;
	char *buf;
	u8 *src, *dst;
	struct zswap_header *zhdr;

	if (!zswap_enabled || !tree) {
		ret = -ENODEV;
		goto reject;
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		if (zswap_shrink()) {
			zswap_reject_reclaim_fail++;
			ret = -ENOMEM;
			goto reject;
		}
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		ret = -ENOMEM;
		goto reject;
	}

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool) {
		ret = -EINVAL;
		goto freepage;
	}

	/* compress */
	dst = get_cpu_var(zswap_dstmem);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	src = kmap_atomic(page);
	ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
	kunmap_atomic(src);
	put_cpu_ptr(entry->pool->tfm);
	if (ret) {
		ret = -EINVAL;
		goto put_dstmem;
	}

	/* store */
	len = dlen + sizeof(struct zswap_header);
	ret = zpool_malloc(entry->pool->zpool, len,
			   __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
			   &handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto put_dstmem;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto put_dstmem;
	}
	zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
	zhdr->swpentry = swp_entry(type, offset);
	buf = (u8 *)(zhdr + 1);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(entry->pool->zpool, handle);
	put_cpu_var(zswap_dstmem);

	/* populate entry */
	entry->offset = offset;
	entry->handle = handle;
	entry->length = dlen;

	/* map */
	spin_lock(&tree->lock);
	do {
		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
		if (ret == -EEXIST) {
			zswap_duplicate_entry++;
			/* remove from rbtree */
			zswap_rb_erase(&tree->rbroot, dupentry);
			zswap_entry_put(tree, dupentry);
		}
	} while (ret == -EEXIST);
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();

	return 0;

put_dstmem:
	put_cpu_var(zswap_dstmem);
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	return ret;
}
/*
 * returns 0 if the page was successfully decompressed
 * returns -1 if the entry was not found or on error
 */
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return -1;
	}
	spin_unlock(&tree->lock);

	/* decompress */
	dlen = PAGE_SIZE;
	src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
			ZPOOL_MM_RO) + sizeof(struct zswap_header);
	dst = kmap_atomic(page);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
	put_cpu_ptr(entry->pool->tfm);
	kunmap_atomic(dst);
	zpool_unmap_handle(entry->pool->zpool, entry->handle);
	BUG_ON(ret);

	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return 0;
}
/* frees an entry in zswap */
static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}

	/* remove from rbtree */
	zswap_rb_erase(&tree->rbroot, entry);

	/* drop the initial reference from entry creation */
	zswap_entry_put(tree, entry);

	spin_unlock(&tree->lock);
}

/* frees all zswap entries for the given swap type */
static void zswap_frontswap_invalidate_area(unsigned type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *n;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
		zswap_free_entry(entry);
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
	kfree(tree);
	zswap_trees[type] = NULL;
}

static void zswap_frontswap_init(unsigned type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
	if (!tree) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return;
	}

	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
}

static struct frontswap_ops zswap_frontswap_ops = {
	.store = zswap_frontswap_store,
	.load = zswap_frontswap_load,
	.invalidate_page = zswap_frontswap_invalidate_page,
	.invalidate_area = zswap_frontswap_invalidate_area,
	.init = zswap_frontswap_init
};

/*********************************
* debugfs functions
**********************************/
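
/*
 * The statistics counters above are exported read-only under
 * <debugfs>/zswap/ (normally /sys/kernel/debug/zswap/).
 */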
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int __init zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
	if (!zswap_debugfs_root)
		return -ENOMEM;

	debugfs_create_u64("pool_limit_hit", S_IRUGO,
			   zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", S_IRUGO,
			   zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", S_IRUGO,
			   zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", S_IRUGO,
			   zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", S_IRUGO,
				zswap_debugfs_root, &zswap_stored_pages);

	return 0;
}

static void __exit zswap_debugfs_exit(void)
{
	debugfs_remove_recursive(zswap_debugfs_root);
}
#else
static int __init zswap_debugfs_init(void)
{
	return 0;
}

static void __exit zswap_debugfs_exit(void) { }
#endif

/*********************************
* module init and exit
**********************************/
static int __init init_zswap(void)
{
	struct zswap_pool *pool;
	int ret;

	zswap_init_started = true;

	if (zswap_entry_cache_create()) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
				zswap_dstmem_prepare, zswap_dstmem_dead);
	if (ret) {
		pr_err("dstmem alloc failed\n");
		goto dstmem_fail;
	}

	ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
				      "mm/zswap_pool:prepare",
				      zswap_cpu_comp_prepare,
				      zswap_cpu_comp_dead);
	if (ret)
		goto hp_fail;

	pool = __zswap_pool_create_fallback();
	if (!pool) {
		pr_err("pool creation failed\n");
		goto pool_fail;
	}
	pr_info("loaded using pool %s/%s\n", pool->tfm_name,
		zpool_get_type(pool->zpool));

	list_add(&pool->list, &zswap_pools);

	frontswap_register_ops(&zswap_frontswap_ops);
	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	return 0;

pool_fail:
	cpuhp_remove_state_nocalls(CPUHP_MM_ZSWP_POOL_PREPARE);
hp_fail:
	cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
dstmem_fail:
	zswap_entry_cache_destroy();
cache_fail:
	/* if built-in, we aren't unloaded on failure; don't allow use */
	zswap_init_failed = true;
	zswap_enabled = false;
	return -ENOMEM;
}
/* must be late so crypto has time to come up */
late_initcall(init_zswap);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");