  /*
   * zswap.c - zswap driver file
   *
   * zswap is a backend for frontswap that takes pages that are in the process
   * of being swapped out and attempts to compress and store them in a
   * RAM-based memory pool. This can result in a significant I/O reduction on
   * the swap device and, in the case where decompressing from RAM is faster
   * than reading from the swap device, can also improve workload performance.
   *
   * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
   *
   * This program is free software; you can redistribute it and/or
   * modify it under the terms of the GNU General Public License
   * as published by the Free Software Foundation; either version 2
   * of the License, or (at your option) any later version.
   *
   * This program is distributed in the hope that it will be useful,
   * but WITHOUT ANY WARRANTY; without even the implied warranty of
   * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
   * GNU General Public License for more details.
   */
  #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

  #include <linux/module.h>
  #include <linux/cpu.h>
  #include <linux/highmem.h>
  #include <linux/slab.h>
  #include <linux/spinlock.h>
  #include <linux/types.h>
  #include <linux/atomic.h>
  #include <linux/frontswap.h>
  #include <linux/rbtree.h>
  #include <linux/swap.h>
  #include <linux/crypto.h>
  #include <linux/mempool.h>
  #include <linux/zpool.h>
  #include <linux/mm_types.h>
  #include <linux/page-flags.h>
  #include <linux/swapops.h>
  #include <linux/writeback.h>
  #include <linux/pagemap.h>
  41. /*********************************
  42. * statistics
  43. **********************************/
  44. /* Total bytes used by the compressed storage */
  45. static u64 zswap_pool_total_size;
  46. /* The number of compressed pages currently stored in zswap */
  47. static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
  48. /*
  49. * The statistics below are not protected from concurrent access for
  50. * performance reasons so they may not be a 100% accurate. However,
  51. * they do provide useful information on roughly how many times a
  52. * certain event is occurring.
  53. */
  54. /* Pool limit was hit (see zswap_max_pool_percent) */
  55. static u64 zswap_pool_limit_hit;
  56. /* Pages written back when pool limit was reached */
  57. static u64 zswap_written_back_pages;
  58. /* Store failed due to a reclaim failure after pool limit was reached */
  59. static u64 zswap_reject_reclaim_fail;
  60. /* Compressed page was too big for the allocator to (optimally) store */
  61. static u64 zswap_reject_compress_poor;
  62. /* Store failed because underlying allocator could not get memory */
  63. static u64 zswap_reject_alloc_fail;
  64. /* Store failed because the entry metadata could not be allocated (rare) */
  65. static u64 zswap_reject_kmemcache_fail;
  66. /* Duplicate store was encountered (rare) */
  67. static u64 zswap_duplicate_entry;
  68. /*********************************
  69. * tunables
  70. **********************************/
  71. /* Enable/disable zswap (disabled by default) */
  72. static bool zswap_enabled;
  73. module_param_named(enabled, zswap_enabled, bool, 0644);
  74. /* Crypto compressor to use */
  75. #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
  76. static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
  77. static int zswap_compressor_param_set(const char *,
  78. const struct kernel_param *);
  79. static struct kernel_param_ops zswap_compressor_param_ops = {
  80. .set = zswap_compressor_param_set,
  81. .get = param_get_charp,
  82. .free = param_free_charp,
  83. };
  84. module_param_cb(compressor, &zswap_compressor_param_ops,
  85. &zswap_compressor, 0644);
  86. /* Compressed storage zpool to use */
  87. #define ZSWAP_ZPOOL_DEFAULT "zbud"
  88. static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
  89. static int zswap_zpool_param_set(const char *, const struct kernel_param *);
  90. static struct kernel_param_ops zswap_zpool_param_ops = {
  91. .set = zswap_zpool_param_set,
  92. .get = param_get_charp,
  93. .free = param_free_charp,
  94. };
  95. module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
  96. /* The maximum percentage of memory that the compressed pool can occupy */
  97. static unsigned int zswap_max_pool_percent = 20;
  98. module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
  99. /*********************************
  100. * data structures
  101. **********************************/
  102. struct zswap_pool {
  103. struct zpool *zpool;
  104. struct crypto_comp * __percpu *tfm;
  105. struct kref kref;
  106. struct list_head list;
  107. struct work_struct work;
  108. struct notifier_block notifier;
  109. char tfm_name[CRYPTO_MAX_ALG_NAME];
  110. };
  111. /*
  112. * struct zswap_entry
  113. *
  114. * This structure contains the metadata for tracking a single compressed
  115. * page within zswap.
  116. *
  117. * rbnode - links the entry into red-black tree for the appropriate swap type
  118. * offset - the swap offset for the entry. Index into the red-black tree.
  119. * refcount - the number of outstanding reference to the entry. This is needed
  120. * to protect against premature freeing of the entry by code
  121. * concurrent calls to load, invalidate, and writeback. The lock
  122. * for the zswap_tree structure that contains the entry must
  123. * be held while changing the refcount. Since the lock must
  124. * be held, there is no reason to also make refcount atomic.
  125. * length - the length in bytes of the compressed page data. Needed during
  126. * decompression
  127. * pool - the zswap_pool the entry's data is in
  128. * handle - zpool allocation handle that stores the compressed page data
  129. */
  130. struct zswap_entry {
  131. struct rb_node rbnode;
  132. pgoff_t offset;
  133. int refcount;
  134. unsigned int length;
  135. struct zswap_pool *pool;
  136. unsigned long handle;
  137. };
  138. struct zswap_header {
  139. swp_entry_t swpentry;
  140. };
  141. /*
  142. * The tree lock in the zswap_tree struct protects a few things:
  143. * - the rbtree
  144. * - the refcount field of each entry in the tree
  145. */
  146. struct zswap_tree {
  147. struct rb_root rbroot;
  148. spinlock_t lock;
  149. };
  150. static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
  151. /* RCU-protected iteration */
  152. static LIST_HEAD(zswap_pools);
  153. /* protects zswap_pools list modification */
  154. static DEFINE_SPINLOCK(zswap_pools_lock);
  155. /* pool counter to provide unique names to zpool */
  156. static atomic_t zswap_pools_count = ATOMIC_INIT(0);
  157. /* used by param callback function */
  158. static bool zswap_init_started;
  159. /*********************************
  160. * helpers and fwd declarations
  161. **********************************/
  162. #define zswap_pool_debug(msg, p) \
  163. pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
  164. zpool_get_type((p)->zpool))
  165. static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
  166. static int zswap_pool_get(struct zswap_pool *pool);
  167. static void zswap_pool_put(struct zswap_pool *pool);
  168. static const struct zpool_ops zswap_zpool_ops = {
  169. .evict = zswap_writeback_entry
  170. };
  171. static bool zswap_is_full(void)
  172. {
  173. return totalram_pages * zswap_max_pool_percent / 100 <
  174. DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
  175. }
  176. static void zswap_update_total_size(void)
  177. {
  178. struct zswap_pool *pool;
  179. u64 total = 0;
  180. rcu_read_lock();
  181. list_for_each_entry_rcu(pool, &zswap_pools, list)
  182. total += zpool_get_total_size(pool->zpool);
  183. rcu_read_unlock();
  184. zswap_pool_total_size = total;
  185. }
  186. /*********************************
  187. * zswap entry functions
  188. **********************************/
  189. static struct kmem_cache *zswap_entry_cache;
  190. static int __init zswap_entry_cache_create(void)
  191. {
  192. zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
  193. return zswap_entry_cache == NULL;
  194. }
  195. static void __init zswap_entry_cache_destroy(void)
  196. {
  197. kmem_cache_destroy(zswap_entry_cache);
  198. }
  199. static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
  200. {
  201. struct zswap_entry *entry;
  202. entry = kmem_cache_alloc(zswap_entry_cache, gfp);
  203. if (!entry)
  204. return NULL;
  205. entry->refcount = 1;
  206. RB_CLEAR_NODE(&entry->rbnode);
  207. return entry;
  208. }
  209. static void zswap_entry_cache_free(struct zswap_entry *entry)
  210. {
  211. kmem_cache_free(zswap_entry_cache, entry);
  212. }
  213. /*********************************
  214. * rbtree functions
  215. **********************************/
  216. static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
  217. {
  218. struct rb_node *node = root->rb_node;
  219. struct zswap_entry *entry;
  220. while (node) {
  221. entry = rb_entry(node, struct zswap_entry, rbnode);
  222. if (entry->offset > offset)
  223. node = node->rb_left;
  224. else if (entry->offset < offset)
  225. node = node->rb_right;
  226. else
  227. return entry;
  228. }
  229. return NULL;
  230. }
  231. /*
  232. * In the case that a entry with the same offset is found, a pointer to
  233. * the existing entry is stored in dupentry and the function returns -EEXIST
  234. */
  235. static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
  236. struct zswap_entry **dupentry)
  237. {
  238. struct rb_node **link = &root->rb_node, *parent = NULL;
  239. struct zswap_entry *myentry;
  240. while (*link) {
  241. parent = *link;
  242. myentry = rb_entry(parent, struct zswap_entry, rbnode);
  243. if (myentry->offset > entry->offset)
  244. link = &(*link)->rb_left;
  245. else if (myentry->offset < entry->offset)
  246. link = &(*link)->rb_right;
  247. else {
  248. *dupentry = myentry;
  249. return -EEXIST;
  250. }
  251. }
  252. rb_link_node(&entry->rbnode, parent, link);
  253. rb_insert_color(&entry->rbnode, root);
  254. return 0;
  255. }
  256. static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
  257. {
  258. if (!RB_EMPTY_NODE(&entry->rbnode)) {
  259. rb_erase(&entry->rbnode, root);
  260. RB_CLEAR_NODE(&entry->rbnode);
  261. }
  262. }
  263. /*
  264. * Carries out the common pattern of freeing and entry's zpool allocation,
  265. * freeing the entry itself, and decrementing the number of stored pages.
  266. */
  267. static void zswap_free_entry(struct zswap_entry *entry)
  268. {
  269. zpool_free(entry->pool->zpool, entry->handle);
  270. zswap_pool_put(entry->pool);
  271. zswap_entry_cache_free(entry);
  272. atomic_dec(&zswap_stored_pages);
  273. zswap_update_total_size();
  274. }
  275. /* caller must hold the tree lock */
  276. static void zswap_entry_get(struct zswap_entry *entry)
  277. {
  278. entry->refcount++;
  279. }
  280. /* caller must hold the tree lock
  281. * remove from the tree and free it, if nobody reference the entry
  282. */
  283. static void zswap_entry_put(struct zswap_tree *tree,
  284. struct zswap_entry *entry)
  285. {
  286. int refcount = --entry->refcount;
  287. BUG_ON(refcount < 0);
  288. if (refcount == 0) {
  289. zswap_rb_erase(&tree->rbroot, entry);
  290. zswap_free_entry(entry);
  291. }
  292. }
  293. /* caller must hold the tree lock */
  294. static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
  295. pgoff_t offset)
  296. {
  297. struct zswap_entry *entry;
  298. entry = zswap_rb_search(root, offset);
  299. if (entry)
  300. zswap_entry_get(entry);
  301. return entry;
  302. }
  303. /*********************************
  304. * per-cpu code
  305. **********************************/
  306. static DEFINE_PER_CPU(u8 *, zswap_dstmem);
  307. static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
  308. {
  309. u8 *dst;
  310. switch (action) {
  311. case CPU_UP_PREPARE:
  312. dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
  313. if (!dst) {
  314. pr_err("can't allocate compressor buffer\n");
  315. return NOTIFY_BAD;
  316. }
  317. per_cpu(zswap_dstmem, cpu) = dst;
  318. break;
  319. case CPU_DEAD:
  320. case CPU_UP_CANCELED:
  321. dst = per_cpu(zswap_dstmem, cpu);
  322. kfree(dst);
  323. per_cpu(zswap_dstmem, cpu) = NULL;
  324. break;
  325. default:
  326. break;
  327. }
  328. return NOTIFY_OK;
  329. }
  /*
   * notifier_block callback: converts the opaque @pcpu argument to a cpu
   * number and forwards to __zswap_cpu_dstmem_notifier().
   */
  static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
                                       unsigned long action, void *pcpu)
  {
          unsigned long cpu = (unsigned long)pcpu;

          return __zswap_cpu_dstmem_notifier(action, cpu);
  }
  335. static struct notifier_block zswap_dstmem_notifier = {
  336. .notifier_call = zswap_cpu_dstmem_notifier,
  337. };
  338. static int __init zswap_cpu_dstmem_init(void)
  339. {
  340. unsigned long cpu;
  341. cpu_notifier_register_begin();
  342. for_each_online_cpu(cpu)
  343. if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
  344. NOTIFY_BAD)
  345. goto cleanup;
  346. __register_cpu_notifier(&zswap_dstmem_notifier);
  347. cpu_notifier_register_done();
  348. return 0;
  349. cleanup:
  350. for_each_online_cpu(cpu)
  351. __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
  352. cpu_notifier_register_done();
  353. return -ENOMEM;
  354. }
  355. static void zswap_cpu_dstmem_destroy(void)
  356. {
  357. unsigned long cpu;
  358. cpu_notifier_register_begin();
  359. for_each_online_cpu(cpu)
  360. __zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
  361. __unregister_cpu_notifier(&zswap_dstmem_notifier);
  362. cpu_notifier_register_done();
  363. }
  364. static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
  365. unsigned long action, unsigned long cpu)
  366. {
  367. struct crypto_comp *tfm;
  368. switch (action) {
  369. case CPU_UP_PREPARE:
  370. if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
  371. break;
  372. tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
  373. if (IS_ERR_OR_NULL(tfm)) {
  374. pr_err("could not alloc crypto comp %s : %ld\n",
  375. pool->tfm_name, PTR_ERR(tfm));
  376. return NOTIFY_BAD;
  377. }
  378. *per_cpu_ptr(pool->tfm, cpu) = tfm;
  379. break;
  380. case CPU_DEAD:
  381. case CPU_UP_CANCELED:
  382. tfm = *per_cpu_ptr(pool->tfm, cpu);
  383. if (!IS_ERR_OR_NULL(tfm))
  384. crypto_free_comp(tfm);
  385. *per_cpu_ptr(pool->tfm, cpu) = NULL;
  386. break;
  387. default:
  388. break;
  389. }
  390. return NOTIFY_OK;
  391. }
  392. static int zswap_cpu_comp_notifier(struct notifier_block *nb,
  393. unsigned long action, void *pcpu)
  394. {
  395. unsigned long cpu = (unsigned long)pcpu;
  396. struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);
  397. return __zswap_cpu_comp_notifier(pool, action, cpu);
  398. }
  399. static int zswap_cpu_comp_init(struct zswap_pool *pool)
  400. {
  401. unsigned long cpu;
  402. memset(&pool->notifier, 0, sizeof(pool->notifier));
  403. pool->notifier.notifier_call = zswap_cpu_comp_notifier;
  404. cpu_notifier_register_begin();
  405. for_each_online_cpu(cpu)
  406. if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
  407. NOTIFY_BAD)
  408. goto cleanup;
  409. __register_cpu_notifier(&pool->notifier);
  410. cpu_notifier_register_done();
  411. return 0;
  412. cleanup:
  413. for_each_online_cpu(cpu)
  414. __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
  415. cpu_notifier_register_done();
  416. return -ENOMEM;
  417. }
  418. static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
  419. {
  420. unsigned long cpu;
  421. cpu_notifier_register_begin();
  422. for_each_online_cpu(cpu)
  423. __zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
  424. __unregister_cpu_notifier(&pool->notifier);
  425. cpu_notifier_register_done();
  426. }
  427. /*********************************
  428. * pool functions
  429. **********************************/
  430. static struct zswap_pool *__zswap_pool_current(void)
  431. {
  432. struct zswap_pool *pool;
  433. pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
  434. WARN_ON(!pool);
  435. return pool;
  436. }
  437. static struct zswap_pool *zswap_pool_current(void)
  438. {
  439. assert_spin_locked(&zswap_pools_lock);
  440. return __zswap_pool_current();
  441. }
  442. static struct zswap_pool *zswap_pool_current_get(void)
  443. {
  444. struct zswap_pool *pool;
  445. rcu_read_lock();
  446. pool = __zswap_pool_current();
  447. if (!pool || !zswap_pool_get(pool))
  448. pool = NULL;
  449. rcu_read_unlock();
  450. return pool;
  451. }
  452. static struct zswap_pool *zswap_pool_last_get(void)
  453. {
  454. struct zswap_pool *pool, *last = NULL;
  455. rcu_read_lock();
  456. list_for_each_entry_rcu(pool, &zswap_pools, list)
  457. last = pool;
  458. if (!WARN_ON(!last) && !zswap_pool_get(last))
  459. last = NULL;
  460. rcu_read_unlock();
  461. return last;
  462. }
  463. /* type and compressor must be null-terminated */
  464. static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
  465. {
  466. struct zswap_pool *pool;
  467. assert_spin_locked(&zswap_pools_lock);
  468. list_for_each_entry_rcu(pool, &zswap_pools, list) {
  469. if (strcmp(pool->tfm_name, compressor))
  470. continue;
  471. if (strcmp(zpool_get_type(pool->zpool), type))
  472. continue;
  473. /* if we can't get it, it's about to be destroyed */
  474. if (!zswap_pool_get(pool))
  475. continue;
  476. return pool;
  477. }
  478. return NULL;
  479. }
  480. static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
  481. {
  482. struct zswap_pool *pool;
  483. char name[38]; /* 'zswap' + 32 char (max) num + \0 */
  484. gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
  485. pool = kzalloc(sizeof(*pool), GFP_KERNEL);
  486. if (!pool) {
  487. pr_err("pool alloc failed\n");
  488. return NULL;
  489. }
  490. /* unique name for each pool specifically required by zsmalloc */
  491. snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
  492. pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
  493. if (!pool->zpool) {
  494. pr_err("%s zpool not available\n", type);
  495. goto error;
  496. }
  497. pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
  498. strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
  499. pool->tfm = alloc_percpu(struct crypto_comp *);
  500. if (!pool->tfm) {
  501. pr_err("percpu alloc failed\n");
  502. goto error;
  503. }
  504. if (zswap_cpu_comp_init(pool))
  505. goto error;
  506. pr_debug("using %s compressor\n", pool->tfm_name);
  507. /* being the current pool takes 1 ref; this func expects the
  508. * caller to always add the new pool as the current pool
  509. */
  510. kref_init(&pool->kref);
  511. INIT_LIST_HEAD(&pool->list);
  512. zswap_pool_debug("created", pool);
  513. return pool;
  514. error:
  515. free_percpu(pool->tfm);
  516. if (pool->zpool)
  517. zpool_destroy_pool(pool->zpool);
  518. kfree(pool);
  519. return NULL;
  520. }
  521. static __init struct zswap_pool *__zswap_pool_create_fallback(void)
  522. {
  523. if (!crypto_has_comp(zswap_compressor, 0, 0)) {
  524. if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
  525. pr_err("default compressor %s not available\n",
  526. zswap_compressor);
  527. return NULL;
  528. }
  529. pr_err("compressor %s not available, using default %s\n",
  530. zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
  531. param_free_charp(&zswap_compressor);
  532. zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
  533. }
  534. if (!zpool_has_pool(zswap_zpool_type)) {
  535. if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
  536. pr_err("default zpool %s not available\n",
  537. zswap_zpool_type);
  538. return NULL;
  539. }
  540. pr_err("zpool %s not available, using default %s\n",
  541. zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
  542. param_free_charp(&zswap_zpool_type);
  543. zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
  544. }
  545. return zswap_pool_create(zswap_zpool_type, zswap_compressor);
  546. }
  547. static void zswap_pool_destroy(struct zswap_pool *pool)
  548. {
  549. zswap_pool_debug("destroying", pool);
  550. zswap_cpu_comp_destroy(pool);
  551. free_percpu(pool->tfm);
  552. zpool_destroy_pool(pool->zpool);
  553. kfree(pool);
  554. }
  555. static int __must_check zswap_pool_get(struct zswap_pool *pool)
  556. {
  557. return kref_get_unless_zero(&pool->kref);
  558. }
  559. static void __zswap_pool_release(struct work_struct *work)
  560. {
  561. struct zswap_pool *pool = container_of(work, typeof(*pool), work);
  562. synchronize_rcu();
  563. /* nobody should have been able to get a kref... */
  564. WARN_ON(kref_get_unless_zero(&pool->kref));
  565. /* pool is now off zswap_pools list and has no references. */
  566. zswap_pool_destroy(pool);
  567. }
  568. static void __zswap_pool_empty(struct kref *kref)
  569. {
  570. struct zswap_pool *pool;
  571. pool = container_of(kref, typeof(*pool), kref);
  572. spin_lock(&zswap_pools_lock);
  573. WARN_ON(pool == zswap_pool_current());
  574. list_del_rcu(&pool->list);
  575. INIT_WORK(&pool->work, __zswap_pool_release);
  576. schedule_work(&pool->work);
  577. spin_unlock(&zswap_pools_lock);
  578. }
  579. static void zswap_pool_put(struct zswap_pool *pool)
  580. {
  581. kref_put(&pool->kref, __zswap_pool_empty);
  582. }
  583. /*********************************
  584. * param callbacks
  585. **********************************/
  586. /* val must be a null-terminated string */
  587. static int __zswap_param_set(const char *val, const struct kernel_param *kp,
  588. char *type, char *compressor)
  589. {
  590. struct zswap_pool *pool, *put_pool = NULL;
  591. char *s = strstrip((char *)val);
  592. int ret;
  593. /* no change required */
  594. if (!strcmp(s, *(char **)kp->arg))
  595. return 0;
  596. /* if this is load-time (pre-init) param setting,
  597. * don't create a pool; that's done during init.
  598. */
  599. if (!zswap_init_started)
  600. return param_set_charp(s, kp);
  601. if (!type) {
  602. if (!zpool_has_pool(s)) {
  603. pr_err("zpool %s not available\n", s);
  604. return -ENOENT;
  605. }
  606. type = s;
  607. } else if (!compressor) {
  608. if (!crypto_has_comp(s, 0, 0)) {
  609. pr_err("compressor %s not available\n", s);
  610. return -ENOENT;
  611. }
  612. compressor = s;
  613. } else {
  614. WARN_ON(1);
  615. return -EINVAL;
  616. }
  617. spin_lock(&zswap_pools_lock);
  618. pool = zswap_pool_find_get(type, compressor);
  619. if (pool) {
  620. zswap_pool_debug("using existing", pool);
  621. list_del_rcu(&pool->list);
  622. } else {
  623. spin_unlock(&zswap_pools_lock);
  624. pool = zswap_pool_create(type, compressor);
  625. spin_lock(&zswap_pools_lock);
  626. }
  627. if (pool)
  628. ret = param_set_charp(s, kp);
  629. else
  630. ret = -EINVAL;
  631. if (!ret) {
  632. put_pool = zswap_pool_current();
  633. list_add_rcu(&pool->list, &zswap_pools);
  634. } else if (pool) {
  635. /* add the possibly pre-existing pool to the end of the pools
  636. * list; if it's new (and empty) then it'll be removed and
  637. * destroyed by the put after we drop the lock
  638. */
  639. list_add_tail_rcu(&pool->list, &zswap_pools);
  640. put_pool = pool;
  641. }
  642. spin_unlock(&zswap_pools_lock);
  643. /* drop the ref from either the old current pool,
  644. * or the new pool we failed to add
  645. */
  646. if (put_pool)
  647. zswap_pool_put(put_pool);
  648. return ret;
  649. }
  650. static int zswap_compressor_param_set(const char *val,
  651. const struct kernel_param *kp)
  652. {
  653. return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
  654. }
  655. static int zswap_zpool_param_set(const char *val,
  656. const struct kernel_param *kp)
  657. {
  658. return __zswap_param_set(val, kp, NULL, zswap_compressor);
  659. }
  /*********************************
   * writeback code
   **********************************/

  /* outcome reported by zswap_get_swap_cache_page() */
  enum zswap_get_swap_ret {
          ZSWAP_SWAPCACHE_NEW,    /* a new page was added to the swap cache */
          ZSWAP_SWAPCACHE_EXIST,  /* the page was already in the swap cache */
          ZSWAP_SWAPCACHE_FAIL,   /* no page could be obtained */
  };
  669. /*
  670. * zswap_get_swap_cache_page
  671. *
  672. * This is an adaption of read_swap_cache_async()
  673. *
  674. * This function tries to find a page with the given swap entry
  675. * in the swapper_space address space (the swap cache). If the page
  676. * is found, it is returned in retpage. Otherwise, a page is allocated,
  677. * added to the swap cache, and returned in retpage.
  678. *
  679. * If success, the swap cache page is returned in retpage
  680. * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
  681. * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
  682. * the new page is added to swapcache and locked
  683. * Returns ZSWAP_SWAPCACHE_FAIL on error
  684. */
  685. static int zswap_get_swap_cache_page(swp_entry_t entry,
  686. struct page **retpage)
  687. {
  688. bool page_was_allocated;
  689. *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
  690. NULL, 0, &page_was_allocated);
  691. if (page_was_allocated)
  692. return ZSWAP_SWAPCACHE_NEW;
  693. if (!*retpage)
  694. return ZSWAP_SWAPCACHE_FAIL;
  695. return ZSWAP_SWAPCACHE_EXIST;
  696. }
  697. /*
  698. * Attempts to free an entry by adding a page to the swap cache,
  699. * decompressing the entry data into the page, and issuing a
  700. * bio write to write the page back to the swap device.
  701. *
  702. * This can be thought of as a "resumed writeback" of the page
  703. * to the swap device. We are basically resuming the same swap
  704. * writeback path that was intercepted with the frontswap_store()
  705. * in the first place. After the page has been decompressed into
  706. * the swap cache, the compressed version stored by zswap can be
  707. * freed.
  708. */
  709. static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
  710. {
  711. struct zswap_header *zhdr;
  712. swp_entry_t swpentry;
  713. struct zswap_tree *tree;
  714. pgoff_t offset;
  715. struct zswap_entry *entry;
  716. struct page *page;
  717. struct crypto_comp *tfm;
  718. u8 *src, *dst;
  719. unsigned int dlen;
  720. int ret;
  721. struct writeback_control wbc = {
  722. .sync_mode = WB_SYNC_NONE,
  723. };
  724. /* extract swpentry from data */
  725. zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
  726. swpentry = zhdr->swpentry; /* here */
  727. zpool_unmap_handle(pool, handle);
  728. tree = zswap_trees[swp_type(swpentry)];
  729. offset = swp_offset(swpentry);
  730. /* find and ref zswap entry */
  731. spin_lock(&tree->lock);
  732. entry = zswap_entry_find_get(&tree->rbroot, offset);
  733. if (!entry) {
  734. /* entry was invalidated */
  735. spin_unlock(&tree->lock);
  736. return 0;
  737. }
  738. spin_unlock(&tree->lock);
  739. BUG_ON(offset != entry->offset);
  740. /* try to allocate swap cache page */
  741. switch (zswap_get_swap_cache_page(swpentry, &page)) {
  742. case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
  743. ret = -ENOMEM;
  744. goto fail;
  745. case ZSWAP_SWAPCACHE_EXIST:
  746. /* page is already in the swap cache, ignore for now */
  747. put_page(page);
  748. ret = -EEXIST;
  749. goto fail;
  750. case ZSWAP_SWAPCACHE_NEW: /* page is locked */
  751. /* decompress */
  752. dlen = PAGE_SIZE;
  753. src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
  754. ZPOOL_MM_RO) + sizeof(struct zswap_header);
  755. dst = kmap_atomic(page);
  756. tfm = *get_cpu_ptr(entry->pool->tfm);
  757. ret = crypto_comp_decompress(tfm, src, entry->length,
  758. dst, &dlen);
  759. put_cpu_ptr(entry->pool->tfm);
  760. kunmap_atomic(dst);
  761. zpool_unmap_handle(entry->pool->zpool, entry->handle);
  762. BUG_ON(ret);
  763. BUG_ON(dlen != PAGE_SIZE);
  764. /* page is up to date */
  765. SetPageUptodate(page);
  766. }
  767. /* move it to the tail of the inactive list after end_writeback */
  768. SetPageReclaim(page);
  769. /* start writeback */
  770. __swap_writepage(page, &wbc, end_swap_bio_write);
  771. put_page(page);
  772. zswap_written_back_pages++;
  773. spin_lock(&tree->lock);
  774. /* drop local reference */
  775. zswap_entry_put(tree, entry);
  776. /*
  777. * There are two possible situations for entry here:
  778. * (1) refcount is 1(normal case), entry is valid and on the tree
  779. * (2) refcount is 0, entry is freed and not on the tree
  780. * because invalidate happened during writeback
  781. * search the tree and free the entry if find entry
  782. */
  783. if (entry == zswap_rb_search(&tree->rbroot, offset))
  784. zswap_entry_put(tree, entry);
  785. spin_unlock(&tree->lock);
  786. goto end;
  787. /*
  788. * if we get here due to ZSWAP_SWAPCACHE_EXIST
  789. * a load may happening concurrently
  790. * it is safe and okay to not free the entry
  791. * if we free the entry in the following put
  792. * it it either okay to return !0
  793. */
  794. fail:
  795. spin_lock(&tree->lock);
  796. zswap_entry_put(tree, entry);
  797. spin_unlock(&tree->lock);
  798. end:
  799. return ret;
  800. }
  801. static int zswap_shrink(void)
  802. {
  803. struct zswap_pool *pool;
  804. int ret;
  805. pool = zswap_pool_last_get();
  806. if (!pool)
  807. return -ENOENT;
  808. ret = zpool_shrink(pool->zpool, 1, NULL);
  809. zswap_pool_put(pool);
  810. return ret;
  811. }
  812. /*********************************
  813. * frontswap hooks
  814. **********************************/
  815. /* attempts to compress and store an single page */
  816. static int zswap_frontswap_store(unsigned type, pgoff_t offset,
  817. struct page *page)
  818. {
  819. struct zswap_tree *tree = zswap_trees[type];
  820. struct zswap_entry *entry, *dupentry;
  821. struct crypto_comp *tfm;
  822. int ret;
  823. unsigned int dlen = PAGE_SIZE, len;
  824. unsigned long handle;
  825. char *buf;
  826. u8 *src, *dst;
  827. struct zswap_header *zhdr;
  828. if (!zswap_enabled || !tree) {
  829. ret = -ENODEV;
  830. goto reject;
  831. }
  832. /* reclaim space if needed */
  833. if (zswap_is_full()) {
  834. zswap_pool_limit_hit++;
  835. if (zswap_shrink()) {
  836. zswap_reject_reclaim_fail++;
  837. ret = -ENOMEM;
  838. goto reject;
  839. }
  840. }
  841. /* allocate entry */
  842. entry = zswap_entry_cache_alloc(GFP_KERNEL);
  843. if (!entry) {
  844. zswap_reject_kmemcache_fail++;
  845. ret = -ENOMEM;
  846. goto reject;
  847. }
  848. /* if entry is successfully added, it keeps the reference */
  849. entry->pool = zswap_pool_current_get();
  850. if (!entry->pool) {
  851. ret = -EINVAL;
  852. goto freepage;
  853. }
  854. /* compress */
  855. dst = get_cpu_var(zswap_dstmem);
  856. tfm = *get_cpu_ptr(entry->pool->tfm);
  857. src = kmap_atomic(page);
  858. ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
  859. kunmap_atomic(src);
  860. put_cpu_ptr(entry->pool->tfm);
  861. if (ret) {
  862. ret = -EINVAL;
  863. goto put_dstmem;
  864. }
  865. /* store */
  866. len = dlen + sizeof(struct zswap_header);
  867. ret = zpool_malloc(entry->pool->zpool, len,
  868. __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
  869. &handle);
  870. if (ret == -ENOSPC) {
  871. zswap_reject_compress_poor++;
  872. goto put_dstmem;
  873. }
  874. if (ret) {
  875. zswap_reject_alloc_fail++;
  876. goto put_dstmem;
  877. }
  878. zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
  879. zhdr->swpentry = swp_entry(type, offset);
  880. buf = (u8 *)(zhdr + 1);
  881. memcpy(buf, dst, dlen);
  882. zpool_unmap_handle(entry->pool->zpool, handle);
  883. put_cpu_var(zswap_dstmem);
  884. /* populate entry */
  885. entry->offset = offset;
  886. entry->handle = handle;
  887. entry->length = dlen;
  888. /* map */
  889. spin_lock(&tree->lock);
  890. do {
  891. ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
  892. if (ret == -EEXIST) {
  893. zswap_duplicate_entry++;
  894. /* remove from rbtree */
  895. zswap_rb_erase(&tree->rbroot, dupentry);
  896. zswap_entry_put(tree, dupentry);
  897. }
  898. } while (ret == -EEXIST);
  899. spin_unlock(&tree->lock);
  900. /* update stats */
  901. atomic_inc(&zswap_stored_pages);
  902. zswap_update_total_size();
  903. return 0;
  904. put_dstmem:
  905. put_cpu_var(zswap_dstmem);
  906. zswap_pool_put(entry->pool);
  907. freepage:
  908. zswap_entry_cache_free(entry);
  909. reject:
  910. return ret;
  911. }
  912. /*
  913. * returns 0 if the page was successfully decompressed
  914. * return -1 on entry not found or error
  915. */
  916. static int zswap_frontswap_load(unsigned type, pgoff_t offset,
  917. struct page *page)
  918. {
  919. struct zswap_tree *tree = zswap_trees[type];
  920. struct zswap_entry *entry;
  921. struct crypto_comp *tfm;
  922. u8 *src, *dst;
  923. unsigned int dlen;
  924. int ret;
  925. /* find */
  926. spin_lock(&tree->lock);
  927. entry = zswap_entry_find_get(&tree->rbroot, offset);
  928. if (!entry) {
  929. /* entry was written back */
  930. spin_unlock(&tree->lock);
  931. return -1;
  932. }
  933. spin_unlock(&tree->lock);
  934. /* decompress */
  935. dlen = PAGE_SIZE;
  936. src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
  937. ZPOOL_MM_RO) + sizeof(struct zswap_header);
  938. dst = kmap_atomic(page);
  939. tfm = *get_cpu_ptr(entry->pool->tfm);
  940. ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
  941. put_cpu_ptr(entry->pool->tfm);
  942. kunmap_atomic(dst);
  943. zpool_unmap_handle(entry->pool->zpool, entry->handle);
  944. BUG_ON(ret);
  945. spin_lock(&tree->lock);
  946. zswap_entry_put(tree, entry);
  947. spin_unlock(&tree->lock);
  948. return 0;
  949. }
  950. /* frees an entry in zswap */
  951. static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
  952. {
  953. struct zswap_tree *tree = zswap_trees[type];
  954. struct zswap_entry *entry;
  955. /* find */
  956. spin_lock(&tree->lock);
  957. entry = zswap_rb_search(&tree->rbroot, offset);
  958. if (!entry) {
  959. /* entry was written back */
  960. spin_unlock(&tree->lock);
  961. return;
  962. }
  963. /* remove from rbtree */
  964. zswap_rb_erase(&tree->rbroot, entry);
  965. /* drop the initial reference from entry creation */
  966. zswap_entry_put(tree, entry);
  967. spin_unlock(&tree->lock);
  968. }
  969. /* frees all zswap entries for the given swap type */
  970. static void zswap_frontswap_invalidate_area(unsigned type)
  971. {
  972. struct zswap_tree *tree = zswap_trees[type];
  973. struct zswap_entry *entry, *n;
  974. if (!tree)
  975. return;
  976. /* walk the tree and free everything */
  977. spin_lock(&tree->lock);
  978. rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
  979. zswap_free_entry(entry);
  980. tree->rbroot = RB_ROOT;
  981. spin_unlock(&tree->lock);
  982. kfree(tree);
  983. zswap_trees[type] = NULL;
  984. }
  985. static void zswap_frontswap_init(unsigned type)
  986. {
  987. struct zswap_tree *tree;
  988. tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
  989. if (!tree) {
  990. pr_err("alloc failed, zswap disabled for swap type %d\n", type);
  991. return;
  992. }
  993. tree->rbroot = RB_ROOT;
  994. spin_lock_init(&tree->lock);
  995. zswap_trees[type] = tree;
  996. }
  997. static struct frontswap_ops zswap_frontswap_ops = {
  998. .store = zswap_frontswap_store,
  999. .load = zswap_frontswap_load,
  1000. .invalidate_page = zswap_frontswap_invalidate_page,
  1001. .invalidate_area = zswap_frontswap_invalidate_area,
  1002. .init = zswap_frontswap_init
  1003. };
  1004. /*********************************
  1005. * debugfs functions
  1006. **********************************/
  1007. #ifdef CONFIG_DEBUG_FS
  1008. #include <linux/debugfs.h>
  1009. static struct dentry *zswap_debugfs_root;
  1010. static int __init zswap_debugfs_init(void)
  1011. {
  1012. if (!debugfs_initialized())
  1013. return -ENODEV;
  1014. zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
  1015. if (!zswap_debugfs_root)
  1016. return -ENOMEM;
  1017. debugfs_create_u64("pool_limit_hit", S_IRUGO,
  1018. zswap_debugfs_root, &zswap_pool_limit_hit);
  1019. debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
  1020. zswap_debugfs_root, &zswap_reject_reclaim_fail);
  1021. debugfs_create_u64("reject_alloc_fail", S_IRUGO,
  1022. zswap_debugfs_root, &zswap_reject_alloc_fail);
  1023. debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
  1024. zswap_debugfs_root, &zswap_reject_kmemcache_fail);
  1025. debugfs_create_u64("reject_compress_poor", S_IRUGO,
  1026. zswap_debugfs_root, &zswap_reject_compress_poor);
  1027. debugfs_create_u64("written_back_pages", S_IRUGO,
  1028. zswap_debugfs_root, &zswap_written_back_pages);
  1029. debugfs_create_u64("duplicate_entry", S_IRUGO,
  1030. zswap_debugfs_root, &zswap_duplicate_entry);
  1031. debugfs_create_u64("pool_total_size", S_IRUGO,
  1032. zswap_debugfs_root, &zswap_pool_total_size);
  1033. debugfs_create_atomic_t("stored_pages", S_IRUGO,
  1034. zswap_debugfs_root, &zswap_stored_pages);
  1035. return 0;
  1036. }
  1037. static void __exit zswap_debugfs_exit(void)
  1038. {
  1039. debugfs_remove_recursive(zswap_debugfs_root);
  1040. }
  1041. #else
  1042. static int __init zswap_debugfs_init(void)
  1043. {
  1044. return 0;
  1045. }
  1046. static void __exit zswap_debugfs_exit(void) { }
  1047. #endif
  1048. /*********************************
  1049. * module init and exit
  1050. **********************************/
  1051. static int __init init_zswap(void)
  1052. {
  1053. struct zswap_pool *pool;
  1054. zswap_init_started = true;
  1055. if (zswap_entry_cache_create()) {
  1056. pr_err("entry cache creation failed\n");
  1057. goto cache_fail;
  1058. }
  1059. if (zswap_cpu_dstmem_init()) {
  1060. pr_err("dstmem alloc failed\n");
  1061. goto dstmem_fail;
  1062. }
  1063. pool = __zswap_pool_create_fallback();
  1064. if (!pool) {
  1065. pr_err("pool creation failed\n");
  1066. goto pool_fail;
  1067. }
  1068. pr_info("loaded using pool %s/%s\n", pool->tfm_name,
  1069. zpool_get_type(pool->zpool));
  1070. list_add(&pool->list, &zswap_pools);
  1071. frontswap_register_ops(&zswap_frontswap_ops);
  1072. if (zswap_debugfs_init())
  1073. pr_warn("debugfs initialization failed\n");
  1074. return 0;
  1075. pool_fail:
  1076. zswap_cpu_dstmem_destroy();
  1077. dstmem_fail:
  1078. zswap_entry_cache_destroy();
  1079. cache_fail:
  1080. return -ENOMEM;
  1081. }
  /* Must run as a late initcall so that crypto has had time to come up before init_zswap() creates its first pool. */
  1083. late_initcall(init_zswap);
  1084. MODULE_LICENSE("GPL");
  1085. MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
  1086. MODULE_DESCRIPTION("Compressed cache for swap pages");