zswap.c 31 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211
  1. /*
  2. * zswap.c - zswap driver file
  3. *
  4. * zswap is a backend for frontswap that takes pages that are in the process
  5. * of being swapped out and attempts to compress and store them in a
  6. * RAM-based memory pool. This can result in a significant I/O reduction on
  7. * the swap device and, in the case where decompressing from RAM is faster
  8. * than reading from the swap device, can also improve workload performance.
  9. *
  10. * Copyright (C) 2012 Seth Jennings <sjenning@linux.vnet.ibm.com>
  11. *
  12. * This program is free software; you can redistribute it and/or
  13. * modify it under the terms of the GNU General Public License
  14. * as published by the Free Software Foundation; either version 2
  15. * of the License, or (at your option) any later version.
  16. *
  17. * This program is distributed in the hope that it will be useful,
  18. * but WITHOUT ANY WARRANTY; without even the implied warranty of
  19. * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  20. * GNU General Public License for more details.
  21. */
  22. #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
  23. #include <linux/module.h>
  24. #include <linux/cpu.h>
  25. #include <linux/highmem.h>
  26. #include <linux/slab.h>
  27. #include <linux/spinlock.h>
  28. #include <linux/types.h>
  29. #include <linux/atomic.h>
  30. #include <linux/frontswap.h>
  31. #include <linux/rbtree.h>
  32. #include <linux/swap.h>
  33. #include <linux/crypto.h>
  34. #include <linux/mempool.h>
  35. #include <linux/zpool.h>
  36. #include <linux/mm_types.h>
  37. #include <linux/page-flags.h>
  38. #include <linux/swapops.h>
  39. #include <linux/writeback.h>
  40. #include <linux/pagemap.h>
  41. /*********************************
  42. * statistics
  43. **********************************/
  44. /* Total bytes used by the compressed storage */
  45. static u64 zswap_pool_total_size;
  46. /* The number of compressed pages currently stored in zswap */
  47. static atomic_t zswap_stored_pages = ATOMIC_INIT(0);
  48. /*
  49. * The statistics below are not protected from concurrent access for
  50. * performance reasons so they may not be a 100% accurate. However,
  51. * they do provide useful information on roughly how many times a
  52. * certain event is occurring.
  53. */
  54. /* Pool limit was hit (see zswap_max_pool_percent) */
  55. static u64 zswap_pool_limit_hit;
  56. /* Pages written back when pool limit was reached */
  57. static u64 zswap_written_back_pages;
  58. /* Store failed due to a reclaim failure after pool limit was reached */
  59. static u64 zswap_reject_reclaim_fail;
  60. /* Compressed page was too big for the allocator to (optimally) store */
  61. static u64 zswap_reject_compress_poor;
  62. /* Store failed because underlying allocator could not get memory */
  63. static u64 zswap_reject_alloc_fail;
  64. /* Store failed because the entry metadata could not be allocated (rare) */
  65. static u64 zswap_reject_kmemcache_fail;
  66. /* Duplicate store was encountered (rare) */
  67. static u64 zswap_duplicate_entry;
  68. /*********************************
  69. * tunables
  70. **********************************/
  71. /* Enable/disable zswap (disabled by default) */
  72. static bool zswap_enabled;
  73. module_param_named(enabled, zswap_enabled, bool, 0644);
  74. /* Crypto compressor to use */
  75. #define ZSWAP_COMPRESSOR_DEFAULT "lzo"
  76. static char *zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
  77. static int zswap_compressor_param_set(const char *,
  78. const struct kernel_param *);
  79. static struct kernel_param_ops zswap_compressor_param_ops = {
  80. .set = zswap_compressor_param_set,
  81. .get = param_get_charp,
  82. .free = param_free_charp,
  83. };
  84. module_param_cb(compressor, &zswap_compressor_param_ops,
  85. &zswap_compressor, 0644);
  86. /* Compressed storage zpool to use */
  87. #define ZSWAP_ZPOOL_DEFAULT "zbud"
  88. static char *zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
  89. static int zswap_zpool_param_set(const char *, const struct kernel_param *);
  90. static struct kernel_param_ops zswap_zpool_param_ops = {
  91. .set = zswap_zpool_param_set,
  92. .get = param_get_charp,
  93. .free = param_free_charp,
  94. };
  95. module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644);
  96. /* The maximum percentage of memory that the compressed pool can occupy */
  97. static unsigned int zswap_max_pool_percent = 20;
  98. module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644);
  99. /*********************************
  100. * data structures
  101. **********************************/
  102. struct zswap_pool {
  103. struct zpool *zpool;
  104. struct crypto_comp * __percpu *tfm;
  105. struct kref kref;
  106. struct list_head list;
  107. struct work_struct work;
  108. struct hlist_node node;
  109. char tfm_name[CRYPTO_MAX_ALG_NAME];
  110. };
  111. /*
  112. * struct zswap_entry
  113. *
  114. * This structure contains the metadata for tracking a single compressed
  115. * page within zswap.
  116. *
  117. * rbnode - links the entry into red-black tree for the appropriate swap type
  118. * offset - the swap offset for the entry. Index into the red-black tree.
  119. * refcount - the number of outstanding reference to the entry. This is needed
  120. * to protect against premature freeing of the entry by code
  121. * concurrent calls to load, invalidate, and writeback. The lock
  122. * for the zswap_tree structure that contains the entry must
  123. * be held while changing the refcount. Since the lock must
  124. * be held, there is no reason to also make refcount atomic.
  125. * length - the length in bytes of the compressed page data. Needed during
  126. * decompression
  127. * pool - the zswap_pool the entry's data is in
  128. * handle - zpool allocation handle that stores the compressed page data
  129. */
  130. struct zswap_entry {
  131. struct rb_node rbnode;
  132. pgoff_t offset;
  133. int refcount;
  134. unsigned int length;
  135. struct zswap_pool *pool;
  136. unsigned long handle;
  137. };
  138. struct zswap_header {
  139. swp_entry_t swpentry;
  140. };
  141. /*
  142. * The tree lock in the zswap_tree struct protects a few things:
  143. * - the rbtree
  144. * - the refcount field of each entry in the tree
  145. */
  146. struct zswap_tree {
  147. struct rb_root rbroot;
  148. spinlock_t lock;
  149. };
  150. static struct zswap_tree *zswap_trees[MAX_SWAPFILES];
  151. /* RCU-protected iteration */
  152. static LIST_HEAD(zswap_pools);
  153. /* protects zswap_pools list modification */
  154. static DEFINE_SPINLOCK(zswap_pools_lock);
  155. /* pool counter to provide unique names to zpool */
  156. static atomic_t zswap_pools_count = ATOMIC_INIT(0);
  157. /* used by param callback function */
  158. static bool zswap_init_started;
  159. /*********************************
  160. * helpers and fwd declarations
  161. **********************************/
  162. #define zswap_pool_debug(msg, p) \
  163. pr_debug("%s pool %s/%s\n", msg, (p)->tfm_name, \
  164. zpool_get_type((p)->zpool))
  165. static int zswap_writeback_entry(struct zpool *pool, unsigned long handle);
  166. static int zswap_pool_get(struct zswap_pool *pool);
  167. static void zswap_pool_put(struct zswap_pool *pool);
  168. static const struct zpool_ops zswap_zpool_ops = {
  169. .evict = zswap_writeback_entry
  170. };
  171. static bool zswap_is_full(void)
  172. {
  173. return totalram_pages * zswap_max_pool_percent / 100 <
  174. DIV_ROUND_UP(zswap_pool_total_size, PAGE_SIZE);
  175. }
  176. static void zswap_update_total_size(void)
  177. {
  178. struct zswap_pool *pool;
  179. u64 total = 0;
  180. rcu_read_lock();
  181. list_for_each_entry_rcu(pool, &zswap_pools, list)
  182. total += zpool_get_total_size(pool->zpool);
  183. rcu_read_unlock();
  184. zswap_pool_total_size = total;
  185. }
  186. /*********************************
  187. * zswap entry functions
  188. **********************************/
  189. static struct kmem_cache *zswap_entry_cache;
  190. static int __init zswap_entry_cache_create(void)
  191. {
  192. zswap_entry_cache = KMEM_CACHE(zswap_entry, 0);
  193. return zswap_entry_cache == NULL;
  194. }
  195. static void __init zswap_entry_cache_destroy(void)
  196. {
  197. kmem_cache_destroy(zswap_entry_cache);
  198. }
  199. static struct zswap_entry *zswap_entry_cache_alloc(gfp_t gfp)
  200. {
  201. struct zswap_entry *entry;
  202. entry = kmem_cache_alloc(zswap_entry_cache, gfp);
  203. if (!entry)
  204. return NULL;
  205. entry->refcount = 1;
  206. RB_CLEAR_NODE(&entry->rbnode);
  207. return entry;
  208. }
  209. static void zswap_entry_cache_free(struct zswap_entry *entry)
  210. {
  211. kmem_cache_free(zswap_entry_cache, entry);
  212. }
  213. /*********************************
  214. * rbtree functions
  215. **********************************/
  216. static struct zswap_entry *zswap_rb_search(struct rb_root *root, pgoff_t offset)
  217. {
  218. struct rb_node *node = root->rb_node;
  219. struct zswap_entry *entry;
  220. while (node) {
  221. entry = rb_entry(node, struct zswap_entry, rbnode);
  222. if (entry->offset > offset)
  223. node = node->rb_left;
  224. else if (entry->offset < offset)
  225. node = node->rb_right;
  226. else
  227. return entry;
  228. }
  229. return NULL;
  230. }
  231. /*
  232. * In the case that a entry with the same offset is found, a pointer to
  233. * the existing entry is stored in dupentry and the function returns -EEXIST
  234. */
  235. static int zswap_rb_insert(struct rb_root *root, struct zswap_entry *entry,
  236. struct zswap_entry **dupentry)
  237. {
  238. struct rb_node **link = &root->rb_node, *parent = NULL;
  239. struct zswap_entry *myentry;
  240. while (*link) {
  241. parent = *link;
  242. myentry = rb_entry(parent, struct zswap_entry, rbnode);
  243. if (myentry->offset > entry->offset)
  244. link = &(*link)->rb_left;
  245. else if (myentry->offset < entry->offset)
  246. link = &(*link)->rb_right;
  247. else {
  248. *dupentry = myentry;
  249. return -EEXIST;
  250. }
  251. }
  252. rb_link_node(&entry->rbnode, parent, link);
  253. rb_insert_color(&entry->rbnode, root);
  254. return 0;
  255. }
  256. static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry)
  257. {
  258. if (!RB_EMPTY_NODE(&entry->rbnode)) {
  259. rb_erase(&entry->rbnode, root);
  260. RB_CLEAR_NODE(&entry->rbnode);
  261. }
  262. }
  263. /*
  264. * Carries out the common pattern of freeing and entry's zpool allocation,
  265. * freeing the entry itself, and decrementing the number of stored pages.
  266. */
  267. static void zswap_free_entry(struct zswap_entry *entry)
  268. {
  269. zpool_free(entry->pool->zpool, entry->handle);
  270. zswap_pool_put(entry->pool);
  271. zswap_entry_cache_free(entry);
  272. atomic_dec(&zswap_stored_pages);
  273. zswap_update_total_size();
  274. }
  275. /* caller must hold the tree lock */
  276. static void zswap_entry_get(struct zswap_entry *entry)
  277. {
  278. entry->refcount++;
  279. }
  280. /* caller must hold the tree lock
  281. * remove from the tree and free it, if nobody reference the entry
  282. */
  283. static void zswap_entry_put(struct zswap_tree *tree,
  284. struct zswap_entry *entry)
  285. {
  286. int refcount = --entry->refcount;
  287. BUG_ON(refcount < 0);
  288. if (refcount == 0) {
  289. zswap_rb_erase(&tree->rbroot, entry);
  290. zswap_free_entry(entry);
  291. }
  292. }
  293. /* caller must hold the tree lock */
  294. static struct zswap_entry *zswap_entry_find_get(struct rb_root *root,
  295. pgoff_t offset)
  296. {
  297. struct zswap_entry *entry;
  298. entry = zswap_rb_search(root, offset);
  299. if (entry)
  300. zswap_entry_get(entry);
  301. return entry;
  302. }
  303. /*********************************
  304. * per-cpu code
  305. **********************************/
  306. static DEFINE_PER_CPU(u8 *, zswap_dstmem);
  307. static int zswap_dstmem_prepare(unsigned int cpu)
  308. {
  309. u8 *dst;
  310. dst = kmalloc_node(PAGE_SIZE * 2, GFP_KERNEL, cpu_to_node(cpu));
  311. if (!dst) {
  312. pr_err("can't allocate compressor buffer\n");
  313. return -ENOMEM;
  314. }
  315. per_cpu(zswap_dstmem, cpu) = dst;
  316. return 0;
  317. }
  318. static int zswap_dstmem_dead(unsigned int cpu)
  319. {
  320. u8 *dst;
  321. dst = per_cpu(zswap_dstmem, cpu);
  322. kfree(dst);
  323. per_cpu(zswap_dstmem, cpu) = NULL;
  324. return 0;
  325. }
  326. static int zswap_cpu_comp_prepare(unsigned int cpu, struct hlist_node *node)
  327. {
  328. struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
  329. struct crypto_comp *tfm;
  330. if (WARN_ON(*per_cpu_ptr(pool->tfm, cpu)))
  331. return 0;
  332. tfm = crypto_alloc_comp(pool->tfm_name, 0, 0);
  333. if (IS_ERR_OR_NULL(tfm)) {
  334. pr_err("could not alloc crypto comp %s : %ld\n",
  335. pool->tfm_name, PTR_ERR(tfm));
  336. return -ENOMEM;
  337. }
  338. *per_cpu_ptr(pool->tfm, cpu) = tfm;
  339. return 0;
  340. }
  341. static int zswap_cpu_comp_dead(unsigned int cpu, struct hlist_node *node)
  342. {
  343. struct zswap_pool *pool = hlist_entry(node, struct zswap_pool, node);
  344. struct crypto_comp *tfm;
  345. tfm = *per_cpu_ptr(pool->tfm, cpu);
  346. if (!IS_ERR_OR_NULL(tfm))
  347. crypto_free_comp(tfm);
  348. *per_cpu_ptr(pool->tfm, cpu) = NULL;
  349. return 0;
  350. }
  351. /*********************************
  352. * pool functions
  353. **********************************/
  354. static struct zswap_pool *__zswap_pool_current(void)
  355. {
  356. struct zswap_pool *pool;
  357. pool = list_first_or_null_rcu(&zswap_pools, typeof(*pool), list);
  358. WARN_ON(!pool);
  359. return pool;
  360. }
  361. static struct zswap_pool *zswap_pool_current(void)
  362. {
  363. assert_spin_locked(&zswap_pools_lock);
  364. return __zswap_pool_current();
  365. }
  366. static struct zswap_pool *zswap_pool_current_get(void)
  367. {
  368. struct zswap_pool *pool;
  369. rcu_read_lock();
  370. pool = __zswap_pool_current();
  371. if (!pool || !zswap_pool_get(pool))
  372. pool = NULL;
  373. rcu_read_unlock();
  374. return pool;
  375. }
  376. static struct zswap_pool *zswap_pool_last_get(void)
  377. {
  378. struct zswap_pool *pool, *last = NULL;
  379. rcu_read_lock();
  380. list_for_each_entry_rcu(pool, &zswap_pools, list)
  381. last = pool;
  382. if (!WARN_ON(!last) && !zswap_pool_get(last))
  383. last = NULL;
  384. rcu_read_unlock();
  385. return last;
  386. }
  387. /* type and compressor must be null-terminated */
  388. static struct zswap_pool *zswap_pool_find_get(char *type, char *compressor)
  389. {
  390. struct zswap_pool *pool;
  391. assert_spin_locked(&zswap_pools_lock);
  392. list_for_each_entry_rcu(pool, &zswap_pools, list) {
  393. if (strcmp(pool->tfm_name, compressor))
  394. continue;
  395. if (strcmp(zpool_get_type(pool->zpool), type))
  396. continue;
  397. /* if we can't get it, it's about to be destroyed */
  398. if (!zswap_pool_get(pool))
  399. continue;
  400. return pool;
  401. }
  402. return NULL;
  403. }
  404. static struct zswap_pool *zswap_pool_create(char *type, char *compressor)
  405. {
  406. struct zswap_pool *pool;
  407. char name[38]; /* 'zswap' + 32 char (max) num + \0 */
  408. gfp_t gfp = __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM;
  409. int ret;
  410. pool = kzalloc(sizeof(*pool), GFP_KERNEL);
  411. if (!pool) {
  412. pr_err("pool alloc failed\n");
  413. return NULL;
  414. }
  415. /* unique name for each pool specifically required by zsmalloc */
  416. snprintf(name, 38, "zswap%x", atomic_inc_return(&zswap_pools_count));
  417. pool->zpool = zpool_create_pool(type, name, gfp, &zswap_zpool_ops);
  418. if (!pool->zpool) {
  419. pr_err("%s zpool not available\n", type);
  420. goto error;
  421. }
  422. pr_debug("using %s zpool\n", zpool_get_type(pool->zpool));
  423. strlcpy(pool->tfm_name, compressor, sizeof(pool->tfm_name));
  424. pool->tfm = alloc_percpu(struct crypto_comp *);
  425. if (!pool->tfm) {
  426. pr_err("percpu alloc failed\n");
  427. goto error;
  428. }
  429. ret = cpuhp_state_add_instance(CPUHP_MM_ZSWP_POOL_PREPARE,
  430. &pool->node);
  431. if (ret)
  432. goto error;
  433. pr_debug("using %s compressor\n", pool->tfm_name);
  434. /* being the current pool takes 1 ref; this func expects the
  435. * caller to always add the new pool as the current pool
  436. */
  437. kref_init(&pool->kref);
  438. INIT_LIST_HEAD(&pool->list);
  439. zswap_pool_debug("created", pool);
  440. return pool;
  441. error:
  442. free_percpu(pool->tfm);
  443. if (pool->zpool)
  444. zpool_destroy_pool(pool->zpool);
  445. kfree(pool);
  446. return NULL;
  447. }
  448. static __init struct zswap_pool *__zswap_pool_create_fallback(void)
  449. {
  450. if (!crypto_has_comp(zswap_compressor, 0, 0)) {
  451. if (!strcmp(zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT)) {
  452. pr_err("default compressor %s not available\n",
  453. zswap_compressor);
  454. return NULL;
  455. }
  456. pr_err("compressor %s not available, using default %s\n",
  457. zswap_compressor, ZSWAP_COMPRESSOR_DEFAULT);
  458. param_free_charp(&zswap_compressor);
  459. zswap_compressor = ZSWAP_COMPRESSOR_DEFAULT;
  460. }
  461. if (!zpool_has_pool(zswap_zpool_type)) {
  462. if (!strcmp(zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT)) {
  463. pr_err("default zpool %s not available\n",
  464. zswap_zpool_type);
  465. return NULL;
  466. }
  467. pr_err("zpool %s not available, using default %s\n",
  468. zswap_zpool_type, ZSWAP_ZPOOL_DEFAULT);
  469. param_free_charp(&zswap_zpool_type);
  470. zswap_zpool_type = ZSWAP_ZPOOL_DEFAULT;
  471. }
  472. return zswap_pool_create(zswap_zpool_type, zswap_compressor);
  473. }
  474. static void zswap_pool_destroy(struct zswap_pool *pool)
  475. {
  476. zswap_pool_debug("destroying", pool);
  477. cpuhp_state_remove_instance(CPUHP_MM_ZSWP_POOL_PREPARE, &pool->node);
  478. free_percpu(pool->tfm);
  479. zpool_destroy_pool(pool->zpool);
  480. kfree(pool);
  481. }
  482. static int __must_check zswap_pool_get(struct zswap_pool *pool)
  483. {
  484. return kref_get_unless_zero(&pool->kref);
  485. }
  486. static void __zswap_pool_release(struct work_struct *work)
  487. {
  488. struct zswap_pool *pool = container_of(work, typeof(*pool), work);
  489. synchronize_rcu();
  490. /* nobody should have been able to get a kref... */
  491. WARN_ON(kref_get_unless_zero(&pool->kref));
  492. /* pool is now off zswap_pools list and has no references. */
  493. zswap_pool_destroy(pool);
  494. }
  495. static void __zswap_pool_empty(struct kref *kref)
  496. {
  497. struct zswap_pool *pool;
  498. pool = container_of(kref, typeof(*pool), kref);
  499. spin_lock(&zswap_pools_lock);
  500. WARN_ON(pool == zswap_pool_current());
  501. list_del_rcu(&pool->list);
  502. INIT_WORK(&pool->work, __zswap_pool_release);
  503. schedule_work(&pool->work);
  504. spin_unlock(&zswap_pools_lock);
  505. }
  506. static void zswap_pool_put(struct zswap_pool *pool)
  507. {
  508. kref_put(&pool->kref, __zswap_pool_empty);
  509. }
  510. /*********************************
  511. * param callbacks
  512. **********************************/
  513. /* val must be a null-terminated string */
  514. static int __zswap_param_set(const char *val, const struct kernel_param *kp,
  515. char *type, char *compressor)
  516. {
  517. struct zswap_pool *pool, *put_pool = NULL;
  518. char *s = strstrip((char *)val);
  519. int ret;
  520. /* no change required */
  521. if (!strcmp(s, *(char **)kp->arg))
  522. return 0;
  523. /* if this is load-time (pre-init) param setting,
  524. * don't create a pool; that's done during init.
  525. */
  526. if (!zswap_init_started)
  527. return param_set_charp(s, kp);
  528. if (!type) {
  529. if (!zpool_has_pool(s)) {
  530. pr_err("zpool %s not available\n", s);
  531. return -ENOENT;
  532. }
  533. type = s;
  534. } else if (!compressor) {
  535. if (!crypto_has_comp(s, 0, 0)) {
  536. pr_err("compressor %s not available\n", s);
  537. return -ENOENT;
  538. }
  539. compressor = s;
  540. } else {
  541. WARN_ON(1);
  542. return -EINVAL;
  543. }
  544. spin_lock(&zswap_pools_lock);
  545. pool = zswap_pool_find_get(type, compressor);
  546. if (pool) {
  547. zswap_pool_debug("using existing", pool);
  548. list_del_rcu(&pool->list);
  549. } else {
  550. spin_unlock(&zswap_pools_lock);
  551. pool = zswap_pool_create(type, compressor);
  552. spin_lock(&zswap_pools_lock);
  553. }
  554. if (pool)
  555. ret = param_set_charp(s, kp);
  556. else
  557. ret = -EINVAL;
  558. if (!ret) {
  559. put_pool = zswap_pool_current();
  560. list_add_rcu(&pool->list, &zswap_pools);
  561. } else if (pool) {
  562. /* add the possibly pre-existing pool to the end of the pools
  563. * list; if it's new (and empty) then it'll be removed and
  564. * destroyed by the put after we drop the lock
  565. */
  566. list_add_tail_rcu(&pool->list, &zswap_pools);
  567. put_pool = pool;
  568. }
  569. spin_unlock(&zswap_pools_lock);
  570. /* drop the ref from either the old current pool,
  571. * or the new pool we failed to add
  572. */
  573. if (put_pool)
  574. zswap_pool_put(put_pool);
  575. return ret;
  576. }
  577. static int zswap_compressor_param_set(const char *val,
  578. const struct kernel_param *kp)
  579. {
  580. return __zswap_param_set(val, kp, zswap_zpool_type, NULL);
  581. }
  582. static int zswap_zpool_param_set(const char *val,
  583. const struct kernel_param *kp)
  584. {
  585. return __zswap_param_set(val, kp, NULL, zswap_compressor);
  586. }
  587. /*********************************
  588. * writeback code
  589. **********************************/
  590. /* return enum for zswap_get_swap_cache_page */
  591. enum zswap_get_swap_ret {
  592. ZSWAP_SWAPCACHE_NEW,
  593. ZSWAP_SWAPCACHE_EXIST,
  594. ZSWAP_SWAPCACHE_FAIL,
  595. };
  596. /*
  597. * zswap_get_swap_cache_page
  598. *
  599. * This is an adaption of read_swap_cache_async()
  600. *
  601. * This function tries to find a page with the given swap entry
  602. * in the swapper_space address space (the swap cache). If the page
  603. * is found, it is returned in retpage. Otherwise, a page is allocated,
  604. * added to the swap cache, and returned in retpage.
  605. *
  606. * If success, the swap cache page is returned in retpage
  607. * Returns ZSWAP_SWAPCACHE_EXIST if page was already in the swap cache
  608. * Returns ZSWAP_SWAPCACHE_NEW if the new page needs to be populated,
  609. * the new page is added to swapcache and locked
  610. * Returns ZSWAP_SWAPCACHE_FAIL on error
  611. */
  612. static int zswap_get_swap_cache_page(swp_entry_t entry,
  613. struct page **retpage)
  614. {
  615. bool page_was_allocated;
  616. *retpage = __read_swap_cache_async(entry, GFP_KERNEL,
  617. NULL, 0, &page_was_allocated);
  618. if (page_was_allocated)
  619. return ZSWAP_SWAPCACHE_NEW;
  620. if (!*retpage)
  621. return ZSWAP_SWAPCACHE_FAIL;
  622. return ZSWAP_SWAPCACHE_EXIST;
  623. }
  624. /*
  625. * Attempts to free an entry by adding a page to the swap cache,
  626. * decompressing the entry data into the page, and issuing a
  627. * bio write to write the page back to the swap device.
  628. *
  629. * This can be thought of as a "resumed writeback" of the page
  630. * to the swap device. We are basically resuming the same swap
  631. * writeback path that was intercepted with the frontswap_store()
  632. * in the first place. After the page has been decompressed into
  633. * the swap cache, the compressed version stored by zswap can be
  634. * freed.
  635. */
  636. static int zswap_writeback_entry(struct zpool *pool, unsigned long handle)
  637. {
  638. struct zswap_header *zhdr;
  639. swp_entry_t swpentry;
  640. struct zswap_tree *tree;
  641. pgoff_t offset;
  642. struct zswap_entry *entry;
  643. struct page *page;
  644. struct crypto_comp *tfm;
  645. u8 *src, *dst;
  646. unsigned int dlen;
  647. int ret;
  648. struct writeback_control wbc = {
  649. .sync_mode = WB_SYNC_NONE,
  650. };
  651. /* extract swpentry from data */
  652. zhdr = zpool_map_handle(pool, handle, ZPOOL_MM_RO);
  653. swpentry = zhdr->swpentry; /* here */
  654. zpool_unmap_handle(pool, handle);
  655. tree = zswap_trees[swp_type(swpentry)];
  656. offset = swp_offset(swpentry);
  657. /* find and ref zswap entry */
  658. spin_lock(&tree->lock);
  659. entry = zswap_entry_find_get(&tree->rbroot, offset);
  660. if (!entry) {
  661. /* entry was invalidated */
  662. spin_unlock(&tree->lock);
  663. return 0;
  664. }
  665. spin_unlock(&tree->lock);
  666. BUG_ON(offset != entry->offset);
  667. /* try to allocate swap cache page */
  668. switch (zswap_get_swap_cache_page(swpentry, &page)) {
  669. case ZSWAP_SWAPCACHE_FAIL: /* no memory or invalidate happened */
  670. ret = -ENOMEM;
  671. goto fail;
  672. case ZSWAP_SWAPCACHE_EXIST:
  673. /* page is already in the swap cache, ignore for now */
  674. put_page(page);
  675. ret = -EEXIST;
  676. goto fail;
  677. case ZSWAP_SWAPCACHE_NEW: /* page is locked */
  678. /* decompress */
  679. dlen = PAGE_SIZE;
  680. src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
  681. ZPOOL_MM_RO) + sizeof(struct zswap_header);
  682. dst = kmap_atomic(page);
  683. tfm = *get_cpu_ptr(entry->pool->tfm);
  684. ret = crypto_comp_decompress(tfm, src, entry->length,
  685. dst, &dlen);
  686. put_cpu_ptr(entry->pool->tfm);
  687. kunmap_atomic(dst);
  688. zpool_unmap_handle(entry->pool->zpool, entry->handle);
  689. BUG_ON(ret);
  690. BUG_ON(dlen != PAGE_SIZE);
  691. /* page is up to date */
  692. SetPageUptodate(page);
  693. }
  694. /* move it to the tail of the inactive list after end_writeback */
  695. SetPageReclaim(page);
  696. /* start writeback */
  697. __swap_writepage(page, &wbc, end_swap_bio_write);
  698. put_page(page);
  699. zswap_written_back_pages++;
  700. spin_lock(&tree->lock);
  701. /* drop local reference */
  702. zswap_entry_put(tree, entry);
  703. /*
  704. * There are two possible situations for entry here:
  705. * (1) refcount is 1(normal case), entry is valid and on the tree
  706. * (2) refcount is 0, entry is freed and not on the tree
  707. * because invalidate happened during writeback
  708. * search the tree and free the entry if find entry
  709. */
  710. if (entry == zswap_rb_search(&tree->rbroot, offset))
  711. zswap_entry_put(tree, entry);
  712. spin_unlock(&tree->lock);
  713. goto end;
  714. /*
  715. * if we get here due to ZSWAP_SWAPCACHE_EXIST
  716. * a load may happening concurrently
  717. * it is safe and okay to not free the entry
  718. * if we free the entry in the following put
  719. * it it either okay to return !0
  720. */
  721. fail:
  722. spin_lock(&tree->lock);
  723. zswap_entry_put(tree, entry);
  724. spin_unlock(&tree->lock);
  725. end:
  726. return ret;
  727. }
  728. static int zswap_shrink(void)
  729. {
  730. struct zswap_pool *pool;
  731. int ret;
  732. pool = zswap_pool_last_get();
  733. if (!pool)
  734. return -ENOENT;
  735. ret = zpool_shrink(pool->zpool, 1, NULL);
  736. zswap_pool_put(pool);
  737. return ret;
  738. }
  739. /*********************************
  740. * frontswap hooks
  741. **********************************/
/* attempts to compress and store a single page */
  743. static int zswap_frontswap_store(unsigned type, pgoff_t offset,
  744. struct page *page)
  745. {
  746. struct zswap_tree *tree = zswap_trees[type];
  747. struct zswap_entry *entry, *dupentry;
  748. struct crypto_comp *tfm;
  749. int ret;
  750. unsigned int dlen = PAGE_SIZE, len;
  751. unsigned long handle;
  752. char *buf;
  753. u8 *src, *dst;
  754. struct zswap_header *zhdr;
  755. if (!zswap_enabled || !tree) {
  756. ret = -ENODEV;
  757. goto reject;
  758. }
  759. /* reclaim space if needed */
  760. if (zswap_is_full()) {
  761. zswap_pool_limit_hit++;
  762. if (zswap_shrink()) {
  763. zswap_reject_reclaim_fail++;
  764. ret = -ENOMEM;
  765. goto reject;
  766. }
  767. }
  768. /* allocate entry */
  769. entry = zswap_entry_cache_alloc(GFP_KERNEL);
  770. if (!entry) {
  771. zswap_reject_kmemcache_fail++;
  772. ret = -ENOMEM;
  773. goto reject;
  774. }
  775. /* if entry is successfully added, it keeps the reference */
  776. entry->pool = zswap_pool_current_get();
  777. if (!entry->pool) {
  778. ret = -EINVAL;
  779. goto freepage;
  780. }
  781. /* compress */
  782. dst = get_cpu_var(zswap_dstmem);
  783. tfm = *get_cpu_ptr(entry->pool->tfm);
  784. src = kmap_atomic(page);
  785. ret = crypto_comp_compress(tfm, src, PAGE_SIZE, dst, &dlen);
  786. kunmap_atomic(src);
  787. put_cpu_ptr(entry->pool->tfm);
  788. if (ret) {
  789. ret = -EINVAL;
  790. goto put_dstmem;
  791. }
  792. /* store */
  793. len = dlen + sizeof(struct zswap_header);
  794. ret = zpool_malloc(entry->pool->zpool, len,
  795. __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM,
  796. &handle);
  797. if (ret == -ENOSPC) {
  798. zswap_reject_compress_poor++;
  799. goto put_dstmem;
  800. }
  801. if (ret) {
  802. zswap_reject_alloc_fail++;
  803. goto put_dstmem;
  804. }
  805. zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW);
  806. zhdr->swpentry = swp_entry(type, offset);
  807. buf = (u8 *)(zhdr + 1);
  808. memcpy(buf, dst, dlen);
  809. zpool_unmap_handle(entry->pool->zpool, handle);
  810. put_cpu_var(zswap_dstmem);
  811. /* populate entry */
  812. entry->offset = offset;
  813. entry->handle = handle;
  814. entry->length = dlen;
  815. /* map */
  816. spin_lock(&tree->lock);
  817. do {
  818. ret = zswap_rb_insert(&tree->rbroot, entry, &dupentry);
  819. if (ret == -EEXIST) {
  820. zswap_duplicate_entry++;
  821. /* remove from rbtree */
  822. zswap_rb_erase(&tree->rbroot, dupentry);
  823. zswap_entry_put(tree, dupentry);
  824. }
  825. } while (ret == -EEXIST);
  826. spin_unlock(&tree->lock);
  827. /* update stats */
  828. atomic_inc(&zswap_stored_pages);
  829. zswap_update_total_size();
  830. return 0;
  831. put_dstmem:
  832. put_cpu_var(zswap_dstmem);
  833. zswap_pool_put(entry->pool);
  834. freepage:
  835. zswap_entry_cache_free(entry);
  836. reject:
  837. return ret;
  838. }
  839. /*
  840. * returns 0 if the page was successfully decompressed
  841. * return -1 on entry not found or error
  842. */
  843. static int zswap_frontswap_load(unsigned type, pgoff_t offset,
  844. struct page *page)
  845. {
  846. struct zswap_tree *tree = zswap_trees[type];
  847. struct zswap_entry *entry;
  848. struct crypto_comp *tfm;
  849. u8 *src, *dst;
  850. unsigned int dlen;
  851. int ret;
  852. /* find */
  853. spin_lock(&tree->lock);
  854. entry = zswap_entry_find_get(&tree->rbroot, offset);
  855. if (!entry) {
  856. /* entry was written back */
  857. spin_unlock(&tree->lock);
  858. return -1;
  859. }
  860. spin_unlock(&tree->lock);
  861. /* decompress */
  862. dlen = PAGE_SIZE;
  863. src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle,
  864. ZPOOL_MM_RO) + sizeof(struct zswap_header);
  865. dst = kmap_atomic(page);
  866. tfm = *get_cpu_ptr(entry->pool->tfm);
  867. ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen);
  868. put_cpu_ptr(entry->pool->tfm);
  869. kunmap_atomic(dst);
  870. zpool_unmap_handle(entry->pool->zpool, entry->handle);
  871. BUG_ON(ret);
  872. spin_lock(&tree->lock);
  873. zswap_entry_put(tree, entry);
  874. spin_unlock(&tree->lock);
  875. return 0;
  876. }
  877. /* frees an entry in zswap */
  878. static void zswap_frontswap_invalidate_page(unsigned type, pgoff_t offset)
  879. {
  880. struct zswap_tree *tree = zswap_trees[type];
  881. struct zswap_entry *entry;
  882. /* find */
  883. spin_lock(&tree->lock);
  884. entry = zswap_rb_search(&tree->rbroot, offset);
  885. if (!entry) {
  886. /* entry was written back */
  887. spin_unlock(&tree->lock);
  888. return;
  889. }
  890. /* remove from rbtree */
  891. zswap_rb_erase(&tree->rbroot, entry);
  892. /* drop the initial reference from entry creation */
  893. zswap_entry_put(tree, entry);
  894. spin_unlock(&tree->lock);
  895. }
  896. /* frees all zswap entries for the given swap type */
  897. static void zswap_frontswap_invalidate_area(unsigned type)
  898. {
  899. struct zswap_tree *tree = zswap_trees[type];
  900. struct zswap_entry *entry, *n;
  901. if (!tree)
  902. return;
  903. /* walk the tree and free everything */
  904. spin_lock(&tree->lock);
  905. rbtree_postorder_for_each_entry_safe(entry, n, &tree->rbroot, rbnode)
  906. zswap_free_entry(entry);
  907. tree->rbroot = RB_ROOT;
  908. spin_unlock(&tree->lock);
  909. kfree(tree);
  910. zswap_trees[type] = NULL;
  911. }
  912. static void zswap_frontswap_init(unsigned type)
  913. {
  914. struct zswap_tree *tree;
  915. tree = kzalloc(sizeof(struct zswap_tree), GFP_KERNEL);
  916. if (!tree) {
  917. pr_err("alloc failed, zswap disabled for swap type %d\n", type);
  918. return;
  919. }
  920. tree->rbroot = RB_ROOT;
  921. spin_lock_init(&tree->lock);
  922. zswap_trees[type] = tree;
  923. }
  924. static struct frontswap_ops zswap_frontswap_ops = {
  925. .store = zswap_frontswap_store,
  926. .load = zswap_frontswap_load,
  927. .invalidate_page = zswap_frontswap_invalidate_page,
  928. .invalidate_area = zswap_frontswap_invalidate_area,
  929. .init = zswap_frontswap_init
  930. };
  931. /*********************************
  932. * debugfs functions
  933. **********************************/
  934. #ifdef CONFIG_DEBUG_FS
  935. #include <linux/debugfs.h>
  936. static struct dentry *zswap_debugfs_root;
  937. static int __init zswap_debugfs_init(void)
  938. {
  939. if (!debugfs_initialized())
  940. return -ENODEV;
  941. zswap_debugfs_root = debugfs_create_dir("zswap", NULL);
  942. if (!zswap_debugfs_root)
  943. return -ENOMEM;
  944. debugfs_create_u64("pool_limit_hit", S_IRUGO,
  945. zswap_debugfs_root, &zswap_pool_limit_hit);
  946. debugfs_create_u64("reject_reclaim_fail", S_IRUGO,
  947. zswap_debugfs_root, &zswap_reject_reclaim_fail);
  948. debugfs_create_u64("reject_alloc_fail", S_IRUGO,
  949. zswap_debugfs_root, &zswap_reject_alloc_fail);
  950. debugfs_create_u64("reject_kmemcache_fail", S_IRUGO,
  951. zswap_debugfs_root, &zswap_reject_kmemcache_fail);
  952. debugfs_create_u64("reject_compress_poor", S_IRUGO,
  953. zswap_debugfs_root, &zswap_reject_compress_poor);
  954. debugfs_create_u64("written_back_pages", S_IRUGO,
  955. zswap_debugfs_root, &zswap_written_back_pages);
  956. debugfs_create_u64("duplicate_entry", S_IRUGO,
  957. zswap_debugfs_root, &zswap_duplicate_entry);
  958. debugfs_create_u64("pool_total_size", S_IRUGO,
  959. zswap_debugfs_root, &zswap_pool_total_size);
  960. debugfs_create_atomic_t("stored_pages", S_IRUGO,
  961. zswap_debugfs_root, &zswap_stored_pages);
  962. return 0;
  963. }
  964. static void __exit zswap_debugfs_exit(void)
  965. {
  966. debugfs_remove_recursive(zswap_debugfs_root);
  967. }
  968. #else
  969. static int __init zswap_debugfs_init(void)
  970. {
  971. return 0;
  972. }
  973. static void __exit zswap_debugfs_exit(void) { }
  974. #endif
  975. /*********************************
  976. * module init and exit
  977. **********************************/
  978. static int __init init_zswap(void)
  979. {
  980. struct zswap_pool *pool;
  981. int ret;
  982. zswap_init_started = true;
  983. if (zswap_entry_cache_create()) {
  984. pr_err("entry cache creation failed\n");
  985. goto cache_fail;
  986. }
  987. ret = cpuhp_setup_state(CPUHP_MM_ZSWP_MEM_PREPARE, "mm/zswap:prepare",
  988. zswap_dstmem_prepare, zswap_dstmem_dead);
  989. if (ret) {
  990. pr_err("dstmem alloc failed\n");
  991. goto dstmem_fail;
  992. }
  993. ret = cpuhp_setup_state_multi(CPUHP_MM_ZSWP_POOL_PREPARE,
  994. "mm/zswap_pool:prepare",
  995. zswap_cpu_comp_prepare,
  996. zswap_cpu_comp_dead);
  997. if (ret)
  998. goto hp_fail;
  999. pool = __zswap_pool_create_fallback();
  1000. if (!pool) {
  1001. pr_err("pool creation failed\n");
  1002. goto pool_fail;
  1003. }
  1004. pr_info("loaded using pool %s/%s\n", pool->tfm_name,
  1005. zpool_get_type(pool->zpool));
  1006. list_add(&pool->list, &zswap_pools);
  1007. frontswap_register_ops(&zswap_frontswap_ops);
  1008. if (zswap_debugfs_init())
  1009. pr_warn("debugfs initialization failed\n");
  1010. return 0;
  1011. pool_fail:
  1012. cpuhp_remove_state_nocalls(CPUHP_MM_ZSWP_POOL_PREPARE);
  1013. hp_fail:
  1014. cpuhp_remove_state(CPUHP_MM_ZSWP_MEM_PREPARE);
  1015. dstmem_fail:
  1016. zswap_entry_cache_destroy();
  1017. cache_fail:
  1018. return -ENOMEM;
  1019. }
/*
 * Register init_zswap() as a late initcall:
 * must be late so crypto has time to come up
 */
late_initcall(init_zswap);

/* module metadata */
MODULE_LICENSE("GPL");
MODULE_AUTHOR("Seth Jennings <sjennings@variantweb.net>");
MODULE_DESCRIPTION("Compressed cache for swap pages");