zswap.c
/*
 * zswap.c - zswap driver file
 *
 * zswap is a backend for frontswap that takes pages that are in the process
 * of being swapped out and attempts to compress and store them in a
 * RAM-based memory pool.  This can result in a significant I/O reduction on
 * the swap device and, in the case where decompressing from RAM is faster
 * than reading from the swap device, can also improve workload performance.
 *
 * Copyright (C) 2012  Seth Jennings <sjenning@linux.vnet.ibm.com>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version 2
 * of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/module.h>
#include <linux/cpu.h>
#include <linux/highmem.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/types.h>
#include <linux/atomic.h>
#include <linux/frontswap.h>
#include <linux/rbtree.h>
#include <linux/swap.h>
#include <linux/crypto.h>
#include <linux/mempool.h>
#include <linux/zpool.h>

#include <linux/mm_types.h>
#include <linux/page-flags.h>
#include <linux/swapops.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
/*********************************
* statistics
**********************************/
/* Total bytes used by the compressed storage */
static u64 zswap_pool_total_size;
/* The number of compressed pages currently stored in zswap */
static atomic_t zswap_stored_pages = ATOMIC_INIT(0);

/*
 * The statistics below are not protected from concurrent access for
 * performance reasons so they may not be 100% accurate.  However,
 * they do provide useful information on roughly how many times a
 * certain event is occurring.
 */
/* Pool limit was hit (see zswap_max_pool_percent) */
static u64 zswap_pool_limit_hit;
/* Pages written back when pool limit was reached */
static u64 zswap_written_back_pages;
/* Store failed due to a reclaim failure after pool limit was reached */
static u64 zswap_reject_reclaim_fail;
/* Compressed page was too big for the allocator to (optimally) store */
static u64 zswap_reject_compress_poor;
/* Store failed because underlying allocator could not get memory */
static u64 zswap_reject_alloc_fail;
/* Store failed because the entry metadata could not be allocated (rare) */
static u64 zswap_reject_kmemcache_fail;
/* Duplicate store was encountered (rare) */
static u64 zswap_duplicate_entry;
/*********************************
* tunables
**********************************/

/* Enable/disable zswap (disabled by default) */
static bool zswap_enabled;
module_param_named(enabled, zswap_enabled, bool, 0644);

/* Crypto compressor to use */
#define ZSWAP_COMPRESSOR_DEFAULT "lzo"
static char zswap_compressor[CRYPTO_MAX_ALG_NAME] = ZSWAP_COMPRESSOR_DEFAULT;
static struct kparam_string zswap_compressor_kparam = {
	.string = zswap_compressor,
	.maxlen = sizeof(zswap_compressor),
};
static int zswap_compressor_param_set(const char *,
				      const struct kernel_param *);
static struct kernel_param_ops zswap_compressor_param_ops = {
	.set = zswap_compressor_param_set,
	.get = param_get_string,
};
module_param_cb(compressor, &zswap_compressor_param_ops,
		&zswap_compressor_kparam, 0644);

/* Compressed storage zpool to use */
#define ZSWAP_ZPOOL_DEFAULT "zbud"
static char zswap_zpool_type[32 /* arbitrary */] = ZSWAP_ZPOOL_DEFAULT;
static struct kparam_string zswap_zpool_kparam = {
	.string = zswap_zpool_type,
	.maxlen = sizeof(zswap_zpool_type),
};
static int zswap_zpool_param_set(const char *, const struct kernel_param *);
static struct kernel_param_ops zswap_zpool_param_ops = {
	.set = zswap_zpool_param_set,
	.get = param_get_string,
};
module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_kparam, 0644);

/* The maximum percentage of memory that the compressed pool can occupy */
static unsigned int zswap_max_pool_percent = 20;
module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
/*********************************
* data structures
**********************************/
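/*
 * struct zswap_pool
 *
 * One instance per zpool/compressor combination.  It owns the backing
 * zpool, a per-cpu crypto_comp transform, and a kref; the first pool on
 * the zswap_pools list is the "current" pool used for new stores.
 */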
struct zswap_pool {
	struct zpool *zpool;
	struct crypto_comp * __percpu *tfm;
	struct kref kref;
	struct list_head list;
	struct rcu_head rcu_head;
	struct notifier_block notifier;
	char tfm_name[CRYPTO_MAX_ALG_NAME];
};
/*
 * struct zswap_entry
 *
 * This structure contains the metadata for tracking a single compressed
 * page within zswap.
 *
 * rbnode - links the entry into red-black tree for the appropriate swap type
 * offset - the swap offset for the entry.  Index into the red-black tree.
 * refcount - the number of outstanding references to the entry.  This is
 *            needed to protect against premature freeing of the entry by
 *            concurrent calls to load, invalidate, and writeback.  The lock
 *            for the zswap_tree structure that contains the entry must
 *            be held while changing the refcount.  Since the lock must
 *            be held, there is no reason to also make refcount atomic.
 * length - the length in bytes of the compressed page data.  Needed during
 *          decompression
 * pool - the zswap_pool the entry's data is in
 * handle - zpool allocation handle that stores the compressed page data
 */
struct zswap_entry {
	struct rb_node rbnode;
	pgoff_t offset;
	int refcount;
	unsigned int length;
	struct zswap_pool *pool;
	unsigned long handle;
};

struct zswap_header {
	swp_entry_t swpentry;
};

/*
 * The tree lock in the zswap_tree struct protects a few things:
 * - the rbtree
 * - the refcount field of each entry in the tree
 */
struct zswap_tree {
	struct rb_root rbroot;
	spinlock_t lock;
};
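/* one tree per swap type, allocated by zswap_frontswap_init() at swapon */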
static struct zswap_tree *zswap_trees[MAX_SWAPFILES];

/* RCU-protected iteration */
static LIST_HEAD(zswap_pools);
/* protects zswap_pools list modification */
static DEFINE_SPINLOCK(zswap_pools_lock);

/* used by param callback function */
static bool zswap_init_started;
/*********************************
* helpers and fwd declarations
**********************************/

#define zswap_pool_debug(msg, p)				\
	pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name,		\
		 zpool_get_type((p)->zpool))

static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
static int zswap_pool_get(struct zswap_pool *pool);
static void zswap_pool_put(struct zswap_pool *pool);

static const struct zpool_ops zswap_zpool_ops = {
	.evict = zswap_writeback_entry
};
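/*
 * The pool is considered full when the pages consumed by the compressed
 * storage (rounded up) exceed zswap_max_pool_percent of total RAM.
 */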
static bool zswap_is_full(void)
{
	return totalram_pages * zswap_max_pool_percent / 100 <
		DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
}
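/*
 * Recompute the global total over all pools under RCU; more than one
 * pool can be on the list while an old pool drains after a parameter
 * change.
 */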
static void zswap_update_total_size(void)
{
	struct zswap_pool *pool;
	u64 total = 0;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		total += zpool_get_total_size(pool->zpool);

	rcu_read_unlock();

	zswap_pool_total_size = total;
}
/*********************************
* zswap entry functions
**********************************/
static struct kmem_cache *zswap_entry_cache;

static int __init zswap_entry_cache_create(void)
{
	zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
	return zswap_entry_cache == NULL;
}

static void __init zswap_entry_cache_destroy(void)
{
	kmem_cache_destroy(zswap_entry_cache);
}

static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
{
	struct zswap_entry *entry;

	entry = kmem_cache_alloc(zswap_entry_cache, gfp);
	if (!entry)
		return NULL;
	entry->refcount = 1;
	RB_CLEAR_NODE(&entry->rbnode);
	return entry;
}

static void zswap_entry_cache_free(struct zswap_entry *entry)
{
	kmem_cache_free(zswap_entry_cache, entry);
}
/*********************************
* rbtree functions
**********************************/
static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
{
	struct rb_node *node = root->rb_node;
	struct zswap_entry *entry;

	while (node) {
		entry = rb_entry(node, struct zswap_entry, rbnode);
		if (entry->offset > offset)
			node = node->rb_left;
		else if (entry->offset < offset)
			node = node->rb_right;
		else
			return entry;
	}
	return NULL;
}
/*
 * In the case that an entry with the same offset is found, a pointer to
 * the existing entry is stored in dupentry and the function returns -EEXIST
 */
static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
			   struct zswap_entry **dupentry)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;
	struct zswap_entry *myentry;

	while (*link) {
		parent = *link;
		myentry = rb_entry(parent, struct zswap_entry, rbnode);
		if (myentry->offset > entry->offset)
			link = &(*link)->rb_left;
		else if (myentry->offset < entry->offset)
			link = &(*link)->rb_right;
		else {
			*dupentry = myentry;
			return -EEXIST;
		}
	}
	rb_link_node(&entry->rbnode, parent, link);
	rb_insert_color(&entry->rbnode, root);
	return 0;
}

static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
{
	if (!RB_EMPTY_NODE(&entry->rbnode)) {
		rb_erase(&entry->rbnode, root);
		RB_CLEAR_NODE(&entry->rbnode);
	}
}
/*
 * Carries out the common pattern of freeing an entry's zpool allocation,
 * freeing the entry itself, and decrementing the number of stored pages.
 */
static void zswap_free_entry(struct zswap_entry *entry)
{
	zpool_free(entry->pool->zpool, entry->handle);
	zswap_pool_put(entry->pool);
	zswap_entry_cache_free(entry);
	atomic_dec(&zswap_stored_pages);
	zswap_update_total_size();
}

/* caller must hold the tree lock */
static void zswap_entry_get(struct zswap_entry *entry)
{
	entry->refcount++;
}
/* caller must hold the tree lock
 * remove from the tree and free it, if nobody references the entry
 */
static void zswap_entry_put(struct zswap_tree *tree,
			    struct zswap_entry *entry)
{
	int refcount = --entry->refcount;

	BUG_ON(refcount < 0);
	if (refcount == 0) {
		zswap_rb_erase(&tree->rbroot, entry);
		zswap_free_entry(entry);
	}
}

/* caller must hold the tree lock */
static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
						pgoff_t offset)
{
	struct zswap_entry *entry = NULL;

	entry = zswap_rb_search(root, offset);
	if (entry)
		zswap_entry_get(entry);

	return entry;
}
/*********************************
* per-cpu code
**********************************/
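/*
 * Per-cpu scratch buffer for compression output.  Two pages are
 * allocated since the compressed result can be larger than PAGE_SIZE
 * in the worst case.
 */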
static DEFINE_PER_CPU(u8 *, zswap_dstmem);

static int __zswap_cpu_dstmem_notifier(unsigned long action, unsigned long cpu)
{
	u8 *dst;

	switch (action) {
	case CPU_UP_PREPARE:
		dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
		if (!dst) {
			pr_err("can't allocate compressor buffer\n");
			return NOTIFY_BAD;
		}
		per_cpu(zswap_dstmem, cpu) = dst;
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		dst = per_cpu(zswap_dstmem, cpu);
		kfree(dst);
		per_cpu(zswap_dstmem, cpu) = NULL;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}
static int zswap_cpu_dstmem_notifier(struct notifier_block *nb,
				     unsigned long action, void *pcpu)
{
	return __zswap_cpu_dstmem_notifier(action, (unsigned long)pcpu);
}

static struct notifier_block zswap_dstmem_notifier = {
	.notifier_call = zswap_cpu_dstmem_notifier,
};

static int __init zswap_cpu_dstmem_init(void)
{
	unsigned long cpu;

	cpu_notifier_register_begin();
	for_each_online_cpu(cpu)
		if (__zswap_cpu_dstmem_notifier(CPU_UP_PREPARE, cpu) ==
		    NOTIFY_BAD)
			goto cleanup;
	__register_cpu_notifier(&zswap_dstmem_notifier);
	cpu_notifier_register_done();
	return 0;

cleanup:
	for_each_online_cpu(cpu)
		__zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
	cpu_notifier_register_done();
	return -ENOMEM;
}

static void zswap_cpu_dstmem_destroy(void)
{
	unsigned long cpu;

	cpu_notifier_register_begin();
	for_each_online_cpu(cpu)
		__zswap_cpu_dstmem_notifier(CPU_UP_CANCELED, cpu);
	__unregister_cpu_notifier(&zswap_dstmem_notifier);
	cpu_notifier_register_done();
}
static int __zswap_cpu_comp_notifier(struct zswap_pool *pool,
				     unsigned long action, unsigned long cpu)
{
	struct crypto_comp *tfm;

	switch (action) {
	case CPU_UP_PREPARE:
		if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
			break;
		tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
		if (IS_ERR_OR_NULL(tfm)) {
			pr_err("could not alloc crypto comp %s : %ld\n",
			       pool->tfm_name, PTR_ERR(tfm));
			return NOTIFY_BAD;
		}
		*per_cpu_ptr(pool->tfm, cpu) = tfm;
		break;
	case CPU_DEAD:
	case CPU_UP_CANCELED:
		tfm = *per_cpu_ptr(pool->tfm, cpu);
		if (!IS_ERR_OR_NULL(tfm))
			crypto_free_comp(tfm);
		*per_cpu_ptr(pool->tfm, cpu) = NULL;
		break;
	default:
		break;
	}
	return NOTIFY_OK;
}

static int zswap_cpu_comp_notifier(struct notifier_block *nb,
				   unsigned long action, void *pcpu)
{
	unsigned long cpu = (unsigned long)pcpu;
	struct zswap_pool *pool = container_of(nb, typeof(*pool), notifier);

	return __zswap_cpu_comp_notifier(pool, action, cpu);
}

static int zswap_cpu_comp_init(struct zswap_pool *pool)
{
	unsigned long cpu;

	memset(&pool->notifier, 0, sizeof(pool->notifier));
	pool->notifier.notifier_call = zswap_cpu_comp_notifier;

	cpu_notifier_register_begin();
	for_each_online_cpu(cpu)
		if (__zswap_cpu_comp_notifier(pool, CPU_UP_PREPARE, cpu) ==
		    NOTIFY_BAD)
			goto cleanup;
	__register_cpu_notifier(&pool->notifier);
	cpu_notifier_register_done();
	return 0;

cleanup:
	for_each_online_cpu(cpu)
		__zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
	cpu_notifier_register_done();
	return -ENOMEM;
}

static void zswap_cpu_comp_destroy(struct zswap_pool *pool)
{
	unsigned long cpu;

	cpu_notifier_register_begin();
	for_each_online_cpu(cpu)
		__zswap_cpu_comp_notifier(pool, CPU_UP_CANCELED, cpu);
	__unregister_cpu_notifier(&pool->notifier);
	cpu_notifier_register_done();
}
/*********************************
* pool functions
**********************************/
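/* the current pool is the first one on the RCU-protected zswap_pools list */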
static struct zswap_pool *__zswap_pool_current(void)
{
	struct zswap_pool *pool;

	pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
	WARN_ON(!pool);

	return pool;
}

static struct zswap_pool *zswap_pool_current(void)
{
	assert_spin_locked(&zswap_pools_lock);

	return __zswap_pool_current();
}

static struct zswap_pool *zswap_pool_current_get(void)
{
	struct zswap_pool *pool;

	rcu_read_lock();

	pool = __zswap_pool_current();
	if (!pool || !zswap_pool_get(pool))
		pool = NULL;

	rcu_read_unlock();

	return pool;
}
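/*
 * New pools are added at the head of zswap_pools, so the last pool on
 * the list is the oldest; it is the one targeted by zswap_shrink() so
 * that a pool being phased out is emptied before the current one.
 */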
static struct zswap_pool *zswap_pool_last_get(void)
{
	struct zswap_pool *pool, *last = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(pool, &zswap_pools, list)
		last = pool;
	if (!WARN_ON(!last) && !zswap_pool_get(last))
		last = NULL;

	rcu_read_unlock();

	return last;
}
static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
{
	struct zswap_pool *pool;

	assert_spin_locked(&zswap_pools_lock);

	list_for_each_entry_rcu(pool, &zswap_pools, list) {
		if (strncmp(pool->tfm_name, compressor, sizeof(pool->tfm_name)))
			continue;
		if (strncmp(zpool_get_type(pool->zpool), type,
			    sizeof(zswap_zpool_type)))
			continue;
		/* if we can't get it, it's about to be destroyed */
		if (!zswap_pool_get(pool))
			continue;
		return pool;
	}

	return NULL;
}
static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
{
	struct zswap_pool *pool;
	gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN;

	pool = kzalloc(sizeof(*pool), GFP_KERNEL);
	if (!pool) {
		pr_err("pool alloc failed\n");
		return NULL;
	}

	pool->zpool = zpool_create_pool(type, "zswap", gfp, &zswap_zpool_ops);
	if (!pool->zpool) {
		pr_err("%s zpool not available\n", type);
		goto error;
	}
	pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));

	strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
	pool->tfm = alloc_percpu(struct crypto_comp *);
	if (!pool->tfm) {
		pr_err("percpu alloc failed\n");
		goto error;
	}

	if (zswap_cpu_comp_init(pool))
		goto error;
	pr_debug("using %s compressor\n", pool->tfm_name);

	/* being the current pool takes 1 ref; this func expects the
	 * caller to always add the new pool as the current pool
	 */
	kref_init(&pool->kref);
	INIT_LIST_HEAD(&pool->list);

	zswap_pool_debug("created", pool);

	return pool;

error:
	free_percpu(pool->tfm);
	if (pool->zpool)
		zpool_destroy_pool(pool->zpool);
	kfree(pool);
	return NULL;
}
static struct zswap_pool *__zswap_pool_create_fallback(void)
{
	if (!crypto_has_comp(zswap_compressor, 0, 0)) {
		pr_err("compressor %s not available, using default %s\n",
		       zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
		strncpy(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT,
			sizeof(zswap_compressor));
	}
	if (!zpool_has_pool(zswap_zpool_type)) {
		pr_err("zpool %s not available, using default %s\n",
		       zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
		strncpy(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT,
			sizeof(zswap_zpool_type));
	}

	return zswap_pool_create(zswap_zpool_type, zswap_compressor);
}
static void zswap_pool_destroy(struct zswap_pool *pool)
{
	zswap_pool_debug("destroying", pool);

	zswap_cpu_comp_destroy(pool);
	free_percpu(pool->tfm);
	zpool_destroy_pool(pool->zpool);
	kfree(pool);
}

static int __must_check zswap_pool_get(struct zswap_pool *pool)
{
	return kref_get_unless_zero(&pool->kref);
}

static void __zswap_pool_release(struct rcu_head *head)
{
	struct zswap_pool *pool = container_of(head, typeof(*pool), rcu_head);

	/* nobody should have been able to get a kref... */
	WARN_ON(kref_get_unless_zero(&pool->kref));

	/* pool is now off zswap_pools list and has no references. */
	zswap_pool_destroy(pool);
}
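/*
 * kref release callback: the pool has no users left, so take it off the
 * zswap_pools list and destroy it after an RCU grace period.
 */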
static void __zswap_pool_empty(struct kref *kref)
{
	struct zswap_pool *pool;

	pool = container_of(kref, typeof(*pool), kref);

	spin_lock(&zswap_pools_lock);

	WARN_ON(pool == zswap_pool_current());

	list_del_rcu(&pool->list);
	call_rcu(&pool->rcu_head, __zswap_pool_release);

	spin_unlock(&zswap_pools_lock);
}

static void zswap_pool_put(struct zswap_pool *pool)
{
	kref_put(&pool->kref, __zswap_pool_empty);
}

/*********************************
* param callbacks
**********************************/
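/*
 * Common handler for the zpool and compressor params: validate the new
 * value, then either reuse a matching existing pool or create a new one,
 * and swap it in at the head of zswap_pools as the current pool.  The
 * displaced pool (or a new pool we failed to switch to) has its
 * reference dropped and is destroyed once all its entries are freed.
 */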
static int __zswap_param_set(const char *val, const struct kernel_param *kp,
			     char *type, char *compressor)
{
	struct zswap_pool *pool, *put_pool = NULL;
	char str[kp->str->maxlen], *s;
	int ret;

	/*
	 * kp is either zswap_zpool_kparam or zswap_compressor_kparam, defined
	 * at the top of this file, so maxlen is CRYPTO_MAX_ALG_NAME (64) or
	 * 32 (arbitrary).
	 */
	strlcpy(str, val, kp->str->maxlen);
	s = strim(str);

	/* if this is load-time (pre-init) param setting,
	 * don't create a pool; that's done during init.
	 */
	if (!zswap_init_started)
		return param_set_copystring(s, kp);

	/* no change required */
	if (!strncmp(kp->str->string, s, kp->str->maxlen))
		return 0;

	if (!type) {
		type = s;
		if (!zpool_has_pool(type)) {
			pr_err("zpool %s not available\n", type);
			return -ENOENT;
		}
	} else if (!compressor) {
		compressor = s;
		if (!crypto_has_comp(compressor, 0, 0)) {
			pr_err("compressor %s not available\n", compressor);
			return -ENOENT;
		}
	}

	spin_lock(&zswap_pools_lock);

	pool = zswap_pool_find_get(type, compressor);
	if (pool) {
		zswap_pool_debug("using existing", pool);
		list_del_rcu(&pool->list);
	} else {
		spin_unlock(&zswap_pools_lock);
		pool = zswap_pool_create(type, compressor);
		spin_lock(&zswap_pools_lock);
	}

	if (pool)
		ret = param_set_copystring(s, kp);
	else
		ret = -EINVAL;

	if (!ret) {
		put_pool = zswap_pool_current();
		list_add_rcu(&pool->list, &zswap_pools);
	} else if (pool) {
		/* add the possibly pre-existing pool to the end of the pools
		 * list; if it's new (and empty) then it'll be removed and
		 * destroyed by the put after we drop the lock
		 */
		list_add_tail_rcu(&pool->list, &zswap_pools);
		put_pool = pool;
	}

	spin_unlock(&zswap_pools_lock);

	/* drop the ref from either the old current pool,
	 * or the new pool we failed to add
	 */
	if (put_pool)
		zswap_pool_put(put_pool);

	return ret;
}

static int zswap_compressor_param_set(const char *val,
				      const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
}

static int zswap_zpool_param_set(const char *val,
				 const struct kernel_param *kp)
{
	return __zswap_param_set(val, kp, NULL, zswap_compressor);
}
/*********************************
* writeback code
**********************************/
/* return enum for zswap_get_swap_cache_page */
enum zswap_get_swap_ret {
	ZSWAP_SWAPCACHE_NEW,
	ZSWAP_SWAPCACHE_EXIST,
	ZSWAP_SWAPCACHE_FAIL,
};

/*
 * zswap_get_swap_cache_page
 *
 * This is an adaptation of read_swap_cache_async()
 *
 * This function tries to find a page with the given swap entry
 * in the swapper_space address space (the swap cache).  If the page
 * is found, it is returned in retpage.  Otherwise, a page is allocated,
 * added to the swap cache, and returned in retpage.
 *
 * On success, the swap cache page is returned in retpage.
 * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
 * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
 *     the new page is added to swapcache and locked
 * Returns ZSWAP_SWAPCACHE_FAIL on error
 */
static int zswap_get_swap_cache_page(swp_entry_t entry,
				     struct page **retpage)
{
	bool page_was_allocated;

	*retpage = __read_swap_cache_async(entry, GFP_KERNEL,
					   NULL, 0, &page_was_allocated);
	if (page_was_allocated)
		return ZSWAP_SWAPCACHE_NEW;
	if (!*retpage)
		return ZSWAP_SWAPCACHE_FAIL;
	return ZSWAP_SWAPCACHE_EXIST;
}
/*
 * Attempts to free an entry by adding a page to the swap cache,
 * decompressing the entry data into the page, and issuing a
 * bio write to write the page back to the swap device.
 *
 * This can be thought of as a "resumed writeback" of the page
 * to the swap device.  We are basically resuming the same swap
 * writeback path that was intercepted with the frontswap_store()
 * in the first place.  After the page has been decompressed into
 * the swap cache, the compressed version stored by zswap can be
 * freed.
 */
static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
{
	struct zswap_header *zhdr;
	swp_entry_t swpentry;
	struct zswap_tree *tree;
	pgoff_t offset;
	struct zswap_entry *entry;
	struct page *page;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;
	struct writeback_control wbc = {
		.sync_mode = WB_SYNC_NONE,
	};

	/* extract swpentry from data */
	zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
	swpentry = zhdr->swpentry; /* here */
	zpool_unmap_handle(pool, handle);
	tree = zswap_trees[swp_type(swpentry)];
	offset = swp_offset(swpentry);

	/* find and ref zswap entry */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was invalidated */
		spin_unlock(&tree->lock);
		return 0;
	}
	spin_unlock(&tree->lock);
	BUG_ON(offset != entry->offset);

	/* try to allocate swap cache page */
	switch (zswap_get_swap_cache_page(swpentry, &page)) {
	case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
		ret = -ENOMEM;
		goto fail;

	case ZSWAP_SWAPCACHE_EXIST:
		/* page is already in the swap cache, ignore for now */
		page_cache_release(page);
		ret = -EEXIST;
		goto fail;

	case ZSWAP_SWAPCACHE_NEW: /* page is locked */
		/* decompress */
		dlen = PAGE_SIZE;
		src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
				ZPOOL_MM_RO) + sizeof(struct zswap_header);
		dst = kmap_atomic(page);
		tfm = *get_cpu_ptr(entry->pool->tfm);
		ret = crypto_comp_decompress(tfm, src, entry->length,
					     dst, &dlen);
		put_cpu_ptr(entry->pool->tfm);
		kunmap_atomic(dst);
		zpool_unmap_handle(entry->pool->zpool, entry->handle);
		BUG_ON(ret);
		BUG_ON(dlen != PAGE_SIZE);

		/* page is up to date */
		SetPageUptodate(page);
	}

	/* move it to the tail of the inactive list after end_writeback */
	SetPageReclaim(page);

	/* start writeback */
	__swap_writepage(page, &wbc, end_swap_bio_write);
	page_cache_release(page);
	zswap_written_back_pages++;

	spin_lock(&tree->lock);
	/* drop local reference */
	zswap_entry_put(tree, entry);

	/*
	 * There are two possible situations for entry here:
	 * (1) refcount is 1 (normal case): entry is valid and on the tree
	 * (2) refcount is 0: entry is freed and not on the tree because
	 *     an invalidate happened during writeback
	 * Search the tree and free the entry if it is still found.
	 */
	if (entry == zswap_rb_search(&tree->rbroot, offset))
		zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	goto end;

	/*
	 * If we get here due to ZSWAP_SWAPCACHE_EXIST, a load may be
	 * happening concurrently, so it is safe and okay not to free the
	 * entry here.  If the entry does get freed by the following put,
	 * it is also okay to return non-zero.
	 */
fail:
	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

end:
	return ret;
}
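/*
 * Evict one compressed page from the last (oldest) pool; the zpool calls
 * back into zswap_writeback_entry() via zswap_zpool_ops to do the work.
 */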
static int zswap_shrink(void)
{
	struct zswap_pool *pool;
	int ret;

	pool = zswap_pool_last_get();
	if (!pool)
		return -ENOENT;

	ret = zpool_shrink(pool->zpool, 1, NULL);

	zswap_pool_put(pool);

	return ret;
}
/*********************************
* frontswap hooks
**********************************/
/* attempts to compress and store a single page */
static int zswap_frontswap_store(unsigned type, pgoff_t offset,
				 struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *dupentry;
	struct crypto_comp *tfm;
	int ret;
	unsigned int dlen = PAGE_SIZE, len;
	unsigned long handle;
	char *buf;
	u8 *src, *dst;
	struct zswap_header *zhdr;

	if (!zswap_enabled || !tree) {
		ret = -ENODEV;
		goto reject;
	}

	/* reclaim space if needed */
	if (zswap_is_full()) {
		zswap_pool_limit_hit++;
		if (zswap_shrink()) {
			zswap_reject_reclaim_fail++;
			ret = -ENOMEM;
			goto reject;
		}
	}

	/* allocate entry */
	entry = zswap_entry_cache_alloc(GFP_KERNEL);
	if (!entry) {
		zswap_reject_kmemcache_fail++;
		ret = -ENOMEM;
		goto reject;
	}

	/* if entry is successfully added, it keeps the reference */
	entry->pool = zswap_pool_current_get();
	if (!entry->pool) {
		ret = -EINVAL;
		goto freepage;
	}

	/* compress */
	dst = get_cpu_var(zswap_dstmem);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	src = kmap_atomic(page);
	ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
	kunmap_atomic(src);
	put_cpu_ptr(entry->pool->tfm);
	if (ret) {
		ret = -EINVAL;
		goto put_dstmem;
	}

	/* store */
	len = dlen + sizeof(struct zswap_header);
	ret = zpool_malloc(entry->pool->zpool, len,
			   __GFP_NORETRY | __GFP_NOWARN, &handle);
	if (ret == -ENOSPC) {
		zswap_reject_compress_poor++;
		goto put_dstmem;
	}
	if (ret) {
		zswap_reject_alloc_fail++;
		goto put_dstmem;
	}
	zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
	zhdr->swpentry = swp_entry(type, offset);
	buf = (u8 *)(zhdr + 1);
	memcpy(buf, dst, dlen);
	zpool_unmap_handle(entry->pool->zpool, handle);
	put_cpu_var(zswap_dstmem);

	/* populate entry */
	entry->offset = offset;
	entry->handle = handle;
	entry->length = dlen;

	/* map */
	spin_lock(&tree->lock);
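	/*
	 * A store to an offset that already has an entry (e.g. the page
	 * was swapped out again before the old entry was invalidated)
	 * replaces the stale entry: erase it, drop its tree reference,
	 * and retry the insert.
	 */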
	do {
		ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
		if (ret == -EEXIST) {
			zswap_duplicate_entry++;
			/* remove from rbtree */
			zswap_rb_erase(&tree->rbroot, dupentry);
			zswap_entry_put(tree, dupentry);
		}
	} while (ret == -EEXIST);
	spin_unlock(&tree->lock);

	/* update stats */
	atomic_inc(&zswap_stored_pages);
	zswap_update_total_size();

	return 0;

put_dstmem:
	put_cpu_var(zswap_dstmem);
	zswap_pool_put(entry->pool);
freepage:
	zswap_entry_cache_free(entry);
reject:
	return ret;
}
/*
 * Returns 0 if the page was successfully decompressed.
 * Returns -1 if the entry was not found or an error occurred.
 */
static int zswap_frontswap_load(unsigned type, pgoff_t offset,
				struct page *page)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;
	struct crypto_comp *tfm;
	u8 *src, *dst;
	unsigned int dlen;
	int ret;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_entry_find_get(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return -1;
	}
	spin_unlock(&tree->lock);

	/* decompress */
	dlen = PAGE_SIZE;
	src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
			ZPOOL_MM_RO) + sizeof(struct zswap_header);
	dst = kmap_atomic(page);
	tfm = *get_cpu_ptr(entry->pool->tfm);
	ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
	put_cpu_ptr(entry->pool->tfm);
	kunmap_atomic(dst);
	zpool_unmap_handle(entry->pool->zpool, entry->handle);
	BUG_ON(ret);

	spin_lock(&tree->lock);
	zswap_entry_put(tree, entry);
	spin_unlock(&tree->lock);

	return 0;
}
/* frees an entry in zswap */
static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry;

	/* find */
	spin_lock(&tree->lock);
	entry = zswap_rb_search(&tree->rbroot, offset);
	if (!entry) {
		/* entry was written back */
		spin_unlock(&tree->lock);
		return;
	}

	/* remove from rbtree */
	zswap_rb_erase(&tree->rbroot, entry);

	/* drop the initial reference from entry creation */
	zswap_entry_put(tree, entry);

	spin_unlock(&tree->lock);
}
/* frees all zswap entries for the given swap type */
static void zswap_frontswap_invalidate_area(unsigned type)
{
	struct zswap_tree *tree = zswap_trees[type];
	struct zswap_entry *entry, *n;

	if (!tree)
		return;

	/* walk the tree and free everything */
	spin_lock(&tree->lock);
	rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
		zswap_free_entry(entry);
	tree->rbroot = RB_ROOT;
	spin_unlock(&tree->lock);
	kfree(tree);
	zswap_trees[type] = NULL;
}

static void zswap_frontswap_init(unsigned type)
{
	struct zswap_tree *tree;

	tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
	if (!tree) {
		pr_err("alloc failed, zswap disabled for swap type %d\n", type);
		return;
	}

	tree->rbroot = RB_ROOT;
	spin_lock_init(&tree->lock);
	zswap_trees[type] = tree;
}

static struct frontswap_ops zswap_frontswap_ops = {
	.store = zswap_frontswap_store,
	.load = zswap_frontswap_load,
	.invalidate_page = zswap_frontswap_invalidate_page,
	.invalidate_area = zswap_frontswap_invalidate_area,
	.init = zswap_frontswap_init
};
/*********************************
* debugfs functions
**********************************/
#ifdef CONFIG_DEBUG_FS
#include <linux/debugfs.h>

static struct dentry *zswap_debugfs_root;

static int __init zswap_debugfs_init(void)
{
	if (!debugfs_initialized())
		return -ENODEV;

	zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
	if (!zswap_debugfs_root)
		return -ENOMEM;

	debugfs_create_u64("pool_limit_hit", S_IRUGO,
			   zswap_debugfs_root, &zswap_pool_limit_hit);
	debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_reclaim_fail);
	debugfs_create_u64("reject_alloc_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_alloc_fail);
	debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_kmemcache_fail);
	debugfs_create_u64("reject_compress_poor", S_IRUGO,
			   zswap_debugfs_root, &zswap_reject_compress_poor);
	debugfs_create_u64("written_back_pages", S_IRUGO,
			   zswap_debugfs_root, &zswap_written_back_pages);
	debugfs_create_u64("duplicate_entry", S_IRUGO,
			   zswap_debugfs_root, &zswap_duplicate_entry);
	debugfs_create_u64("pool_total_size", S_IRUGO,
			   zswap_debugfs_root, &zswap_pool_total_size);
	debugfs_create_atomic_t("stored_pages", S_IRUGO,
				zswap_debugfs_root, &zswap_stored_pages);

	return 0;
}

static void __exit zswap_debugfs_exit(void)
{
	debugfs_remove_recursive(zswap_debugfs_root);
}
#else
static int __init zswap_debugfs_init(void)
{
	return 0;
}

static void __exit zswap_debugfs_exit(void) { }
#endif
/*********************************
* module init and exit
**********************************/
static int __init init_zswap(void)
{
	struct zswap_pool *pool;

	zswap_init_started = true;

	if (zswap_entry_cache_create()) {
		pr_err("entry cache creation failed\n");
		goto cache_fail;
	}

	if (zswap_cpu_dstmem_init()) {
		pr_err("dstmem alloc failed\n");
		goto dstmem_fail;
	}

	pool = __zswap_pool_create_fallback();
	if (!pool) {
		pr_err("pool creation failed\n");
		goto pool_fail;
	}
	pr_info("loaded using pool %s/%s\n", pool->tfm_name,
		zpool_get_type(pool->zpool));

	list_add(&pool->list, &zswap_pools);

	frontswap_register_ops(&zswap_frontswap_ops);
	if (zswap_debugfs_init())
		pr_warn("debugfs initialization failed\n");
	return 0;

pool_fail:
	zswap_cpu_dstmem_destroy();
dstmem_fail:
	zswap_entry_cache_destroy();
cache_fail:
	return -ENOMEM;
}
/* must be late so crypto has time to come up */
late_initcall(init_zswap);

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");