rhashtable.c

/*
 * Resizable, Scalable, Concurrent Hash Table
 *
 * Copyright (c) 2015 Herbert Xu <herbert@gondor.apana.org.au>
 * Copyright (c) 2014-2015 Thomas Graf <tgraf@suug.ch>
 * Copyright (c) 2008-2014 Patrick McHardy <kaber@trash.net>
 *
 * Code partially derived from nft_hash
 * Rewritten with rehash code from br_multicast plus single list
 * pointer as suggested by Josh Triplett
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */

#include <linux/atomic.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/log2.h>
#include <linux/sched.h>
#include <linux/rculist.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>
#include <linux/mm.h>
#include <linux/jhash.h>
#include <linux/random.h>
#include <linux/rhashtable.h>
#include <linux/err.h>
#include <linux/export.h>

#define HASH_DEFAULT_SIZE	64UL
#define HASH_MIN_SIZE		4U
#define BUCKET_LOCKS_PER_CPU	32UL

union nested_table {
	union nested_table __rcu *table;
	struct rhash_head __rcu *bucket;
};

static u32 head_hashfn(struct rhashtable *ht,
		       const struct bucket_table *tbl,
		       const struct rhash_head *he)
{
	return rht_head_hashfn(ht, tbl, he, ht->p);
}

#ifdef CONFIG_PROVE_LOCKING
#define ASSERT_RHT_MUTEX(HT) BUG_ON(!lockdep_rht_mutex_is_held(HT))

int lockdep_rht_mutex_is_held(struct rhashtable *ht)
{
	return (debug_locks) ? lockdep_is_held(&ht->mutex) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_mutex_is_held);

int lockdep_rht_bucket_is_held(const struct bucket_table *tbl, u32 hash)
{
	spinlock_t *lock = rht_bucket_lock(tbl, hash);

	return (debug_locks) ? lockdep_is_held(lock) : 1;
}
EXPORT_SYMBOL_GPL(lockdep_rht_bucket_is_held);
#else
#define ASSERT_RHT_MUTEX(HT)
#endif
static void nested_table_free(union nested_table *ntbl, unsigned int size)
{
	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
	const unsigned int len = 1 << shift;
	unsigned int i;

	ntbl = rcu_dereference_raw(ntbl->table);
	if (!ntbl)
		return;

	if (size > len) {
		size >>= shift;
		for (i = 0; i < len; i++)
			nested_table_free(ntbl + i, size);
	}

	kfree(ntbl);
}

static void nested_bucket_table_free(const struct bucket_table *tbl)
{
	unsigned int size = tbl->size >> tbl->nest;
	unsigned int len = 1 << tbl->nest;
	union nested_table *ntbl;
	unsigned int i;

	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);

	for (i = 0; i < len; i++)
		nested_table_free(ntbl + i, size);

	kfree(ntbl);
}

static void bucket_table_free(const struct bucket_table *tbl)
{
	if (tbl->nest)
		nested_bucket_table_free(tbl);

	free_bucket_spinlocks(tbl->locks);
	kvfree(tbl);
}

static void bucket_table_free_rcu(struct rcu_head *head)
{
	bucket_table_free(container_of(head, struct bucket_table, rcu));
}

static union nested_table *nested_table_alloc(struct rhashtable *ht,
					      union nested_table __rcu **prev,
					      bool leaf)
{
	union nested_table *ntbl;
	int i;

	ntbl = rcu_dereference(*prev);
	if (ntbl)
		return ntbl;

	ntbl = kzalloc(PAGE_SIZE, GFP_ATOMIC);

	if (ntbl && leaf) {
		for (i = 0; i < PAGE_SIZE / sizeof(ntbl[0]); i++)
			INIT_RHT_NULLS_HEAD(ntbl[i].bucket);
	}

	rcu_assign_pointer(*prev, ntbl);

	return ntbl;
}

static struct bucket_table *nested_bucket_table_alloc(struct rhashtable *ht,
						      size_t nbuckets,
						      gfp_t gfp)
{
	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
	struct bucket_table *tbl;
	size_t size;

	if (nbuckets < (1 << (shift + 1)))
		return NULL;

	size = sizeof(*tbl) + sizeof(tbl->buckets[0]);

	tbl = kzalloc(size, gfp);
	if (!tbl)
		return NULL;

	if (!nested_table_alloc(ht, (union nested_table __rcu **)tbl->buckets,
				false)) {
		kfree(tbl);
		return NULL;
	}

	tbl->nest = (ilog2(nbuckets) - 1) % shift + 1;

	return tbl;
}

static struct bucket_table *bucket_table_alloc(struct rhashtable *ht,
					       size_t nbuckets,
					       gfp_t gfp)
{
	struct bucket_table *tbl = NULL;
	size_t size, max_locks;
	int i;

	size = sizeof(*tbl) + nbuckets * sizeof(tbl->buckets[0]);
	tbl = kvzalloc(size, gfp);

	size = nbuckets;

	if (tbl == NULL && (gfp & ~__GFP_NOFAIL) != GFP_KERNEL) {
		tbl = nested_bucket_table_alloc(ht, nbuckets, gfp);
		nbuckets = 0;
	}
	if (tbl == NULL)
		return NULL;

	tbl->size = size;

	max_locks = size >> 1;
	if (tbl->nest)
		max_locks = min_t(size_t, max_locks, 1U << tbl->nest);

	if (alloc_bucket_spinlocks(&tbl->locks, &tbl->locks_mask, max_locks,
				   ht->p.locks_mul, gfp) < 0) {
		bucket_table_free(tbl);
		return NULL;
	}

	INIT_LIST_HEAD(&tbl->walkers);

	tbl->hash_rnd = get_random_u32();

	for (i = 0; i < nbuckets; i++)
		INIT_RHT_NULLS_HEAD(tbl->buckets[i]);

	return tbl;
}
static struct bucket_table *rhashtable_last_table(struct rhashtable *ht,
						  struct bucket_table *tbl)
{
	struct bucket_table *new_tbl;

	do {
		new_tbl = tbl;
		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	} while (tbl);

	return new_tbl;
}

static int rhashtable_rehash_one(struct rhashtable *ht, unsigned int old_hash)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	struct bucket_table *new_tbl = rhashtable_last_table(ht, old_tbl);
	struct rhash_head __rcu **pprev = rht_bucket_var(old_tbl, old_hash);
	int err = -EAGAIN;
	struct rhash_head *head, *next, *entry;
	spinlock_t *new_bucket_lock;
	unsigned int new_hash;

	if (new_tbl->nest)
		goto out;

	err = -ENOENT;

	rht_for_each(entry, old_tbl, old_hash) {
		err = 0;
		next = rht_dereference_bucket(entry->next, old_tbl, old_hash);

		if (rht_is_a_nulls(next))
			break;

		pprev = &entry->next;
	}

	if (err)
		goto out;

	new_hash = head_hashfn(ht, new_tbl, entry);

	new_bucket_lock = rht_bucket_lock(new_tbl, new_hash);

	spin_lock_nested(new_bucket_lock, SINGLE_DEPTH_NESTING);
	head = rht_dereference_bucket(new_tbl->buckets[new_hash],
				      new_tbl, new_hash);

	RCU_INIT_POINTER(entry->next, head);

	rcu_assign_pointer(new_tbl->buckets[new_hash], entry);
	spin_unlock(new_bucket_lock);

	rcu_assign_pointer(*pprev, next);

out:
	return err;
}

static int rhashtable_rehash_chain(struct rhashtable *ht,
				   unsigned int old_hash)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	spinlock_t *old_bucket_lock;
	int err;

	old_bucket_lock = rht_bucket_lock(old_tbl, old_hash);

	spin_lock_bh(old_bucket_lock);
	while (!(err = rhashtable_rehash_one(ht, old_hash)))
		;

	if (err == -ENOENT) {
		old_tbl->rehash++;
		err = 0;
	}
	spin_unlock_bh(old_bucket_lock);

	return err;
}

static int rhashtable_rehash_attach(struct rhashtable *ht,
				    struct bucket_table *old_tbl,
				    struct bucket_table *new_tbl)
{
	/* Make insertions go into the new, empty table right away. Deletions
	 * and lookups will be attempted in both tables until we synchronize.
	 * As cmpxchg() provides strong barriers, we do not need
	 * rcu_assign_pointer().
	 */
	if (cmpxchg(&old_tbl->future_tbl, NULL, new_tbl) != NULL)
		return -EEXIST;

	return 0;
}

static int rhashtable_rehash_table(struct rhashtable *ht)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	struct bucket_table *new_tbl;
	struct rhashtable_walker *walker;
	unsigned int old_hash;
	int err;

	new_tbl = rht_dereference(old_tbl->future_tbl, ht);
	if (!new_tbl)
		return 0;

	for (old_hash = 0; old_hash < old_tbl->size; old_hash++) {
		err = rhashtable_rehash_chain(ht, old_hash);
		if (err)
			return err;
		cond_resched();
	}

	/* Publish the new table pointer. */
	rcu_assign_pointer(ht->tbl, new_tbl);

	spin_lock(&ht->lock);
	list_for_each_entry(walker, &old_tbl->walkers, list)
		walker->tbl = NULL;
	spin_unlock(&ht->lock);

	/* Wait for readers. All new readers will see the new
	 * table, and thus no references to the old table will
	 * remain.
	 */
	call_rcu(&old_tbl->rcu, bucket_table_free_rcu);

	return rht_dereference(new_tbl->future_tbl, ht) ? -EAGAIN : 0;
}

static int rhashtable_rehash_alloc(struct rhashtable *ht,
				   struct bucket_table *old_tbl,
				   unsigned int size)
{
	struct bucket_table *new_tbl;
	int err;

	ASSERT_RHT_MUTEX(ht);

	new_tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
	if (new_tbl == NULL)
		return -ENOMEM;

	err = rhashtable_rehash_attach(ht, old_tbl, new_tbl);
	if (err)
		bucket_table_free(new_tbl);

	return err;
}
/**
 * rhashtable_shrink - Shrink hash table while allowing concurrent lookups
 * @ht: the hash table to shrink
 *
 * This function shrinks the hash table to fit, i.e., to the smallest
 * size that would not cause it to expand right away automatically.
 *
 * The caller must ensure that no concurrent resizing occurs by holding
 * ht->mutex.
 *
 * The caller must ensure that no concurrent table mutations take place.
 * It is however valid to have concurrent lookups if they are RCU protected.
 *
 * It is valid to have concurrent insertions and deletions protected by per
 * bucket locks or concurrent RCU protected lookups and traversals.
 */
static int rhashtable_shrink(struct rhashtable *ht)
{
	struct bucket_table *old_tbl = rht_dereference(ht->tbl, ht);
	unsigned int nelems = atomic_read(&ht->nelems);
	unsigned int size = 0;

	if (nelems)
		size = roundup_pow_of_two(nelems * 3 / 2);
	if (size < ht->p.min_size)
		size = ht->p.min_size;

	if (old_tbl->size <= size)
		return 0;

	if (rht_dereference(old_tbl->future_tbl, ht))
		return -EEXIST;

	return rhashtable_rehash_alloc(ht, old_tbl, size);
}
static void rht_deferred_worker(struct work_struct *work)
{
	struct rhashtable *ht;
	struct bucket_table *tbl;
	int err = 0;

	ht = container_of(work, struct rhashtable, run_work);
	mutex_lock(&ht->mutex);

	tbl = rht_dereference(ht->tbl, ht);
	tbl = rhashtable_last_table(ht, tbl);

	if (rht_grow_above_75(ht, tbl))
		err = rhashtable_rehash_alloc(ht, tbl, tbl->size * 2);
	else if (ht->p.automatic_shrinking && rht_shrink_below_30(ht, tbl))
		err = rhashtable_shrink(ht);
	else if (tbl->nest)
		err = rhashtable_rehash_alloc(ht, tbl, tbl->size);

	if (!err)
		err = rhashtable_rehash_table(ht);

	mutex_unlock(&ht->mutex);

	if (err)
		schedule_work(&ht->run_work);
}

static int rhashtable_insert_rehash(struct rhashtable *ht,
				    struct bucket_table *tbl)
{
	struct bucket_table *old_tbl;
	struct bucket_table *new_tbl;
	unsigned int size;
	int err;

	old_tbl = rht_dereference_rcu(ht->tbl, ht);

	size = tbl->size;

	err = -EBUSY;

	if (rht_grow_above_75(ht, tbl))
		size *= 2;
	/* Do not schedule more than one rehash */
	else if (old_tbl != tbl)
		goto fail;

	err = -ENOMEM;

	new_tbl = bucket_table_alloc(ht, size, GFP_ATOMIC | __GFP_NOWARN);
	if (new_tbl == NULL)
		goto fail;

	err = rhashtable_rehash_attach(ht, tbl, new_tbl);
	if (err) {
		bucket_table_free(new_tbl);
		if (err == -EEXIST)
			err = 0;
	} else
		schedule_work(&ht->run_work);

	return err;

fail:
	/* Do not fail the insert if someone else did a rehash. */
	if (likely(rcu_access_pointer(tbl->future_tbl)))
		return 0;

	/* Schedule async rehash to retry allocation in process context. */
	if (err == -ENOMEM)
		schedule_work(&ht->run_work);

	return err;
}
static void *rhashtable_lookup_one(struct rhashtable *ht,
				   struct bucket_table *tbl, unsigned int hash,
				   const void *key, struct rhash_head *obj)
{
	struct rhashtable_compare_arg arg = {
		.ht = ht,
		.key = key,
	};
	struct rhash_head __rcu **pprev;
	struct rhash_head *head;
	int elasticity;

	elasticity = RHT_ELASTICITY;
	pprev = rht_bucket_var(tbl, hash);
	rht_for_each_continue(head, *pprev, tbl, hash) {
		struct rhlist_head *list;
		struct rhlist_head *plist;

		elasticity--;
		if (!key ||
		    (ht->p.obj_cmpfn ?
		     ht->p.obj_cmpfn(&arg, rht_obj(ht, head)) :
		     rhashtable_compare(&arg, rht_obj(ht, head)))) {
			pprev = &head->next;
			continue;
		}

		if (!ht->rhlist)
			return rht_obj(ht, head);

		list = container_of(obj, struct rhlist_head, rhead);
		plist = container_of(head, struct rhlist_head, rhead);

		RCU_INIT_POINTER(list->next, plist);
		head = rht_dereference_bucket(head->next, tbl, hash);
		RCU_INIT_POINTER(list->rhead.next, head);
		rcu_assign_pointer(*pprev, obj);

		return NULL;
	}

	if (elasticity <= 0)
		return ERR_PTR(-EAGAIN);

	return ERR_PTR(-ENOENT);
}

static struct bucket_table *rhashtable_insert_one(struct rhashtable *ht,
						  struct bucket_table *tbl,
						  unsigned int hash,
						  struct rhash_head *obj,
						  void *data)
{
	struct rhash_head __rcu **pprev;
	struct bucket_table *new_tbl;
	struct rhash_head *head;

	if (!IS_ERR_OR_NULL(data))
		return ERR_PTR(-EEXIST);

	if (PTR_ERR(data) != -EAGAIN && PTR_ERR(data) != -ENOENT)
		return ERR_CAST(data);

	new_tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	if (new_tbl)
		return new_tbl;

	if (PTR_ERR(data) != -ENOENT)
		return ERR_CAST(data);

	if (unlikely(rht_grow_above_max(ht, tbl)))
		return ERR_PTR(-E2BIG);

	if (unlikely(rht_grow_above_100(ht, tbl)))
		return ERR_PTR(-EAGAIN);

	pprev = rht_bucket_insert(ht, tbl, hash);
	if (!pprev)
		return ERR_PTR(-ENOMEM);

	head = rht_dereference_bucket(*pprev, tbl, hash);

	RCU_INIT_POINTER(obj->next, head);
	if (ht->rhlist) {
		struct rhlist_head *list;

		list = container_of(obj, struct rhlist_head, rhead);
		RCU_INIT_POINTER(list->next, NULL);
	}

	rcu_assign_pointer(*pprev, obj);

	atomic_inc(&ht->nelems);
	if (rht_grow_above_75(ht, tbl))
		schedule_work(&ht->run_work);

	return NULL;
}
static void *rhashtable_try_insert(struct rhashtable *ht, const void *key,
				   struct rhash_head *obj)
{
	struct bucket_table *new_tbl;
	struct bucket_table *tbl;
	unsigned int hash;
	spinlock_t *lock;
	void *data;

	tbl = rcu_dereference(ht->tbl);

	/* All insertions must grab the oldest table containing
	 * the hashed bucket that is yet to be rehashed.
	 */
	for (;;) {
		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
		lock = rht_bucket_lock(tbl, hash);
		spin_lock_bh(lock);

		if (tbl->rehash <= hash)
			break;

		spin_unlock_bh(lock);
		tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	}

	data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
	new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
	if (PTR_ERR(new_tbl) != -EEXIST)
		data = ERR_CAST(new_tbl);

	while (!IS_ERR_OR_NULL(new_tbl)) {
		tbl = new_tbl;
		hash = rht_head_hashfn(ht, tbl, obj, ht->p);
		spin_lock_nested(rht_bucket_lock(tbl, hash),
				 SINGLE_DEPTH_NESTING);

		data = rhashtable_lookup_one(ht, tbl, hash, key, obj);
		new_tbl = rhashtable_insert_one(ht, tbl, hash, obj, data);
		if (PTR_ERR(new_tbl) != -EEXIST)
			data = ERR_CAST(new_tbl);

		spin_unlock(rht_bucket_lock(tbl, hash));
	}

	spin_unlock_bh(lock);

	if (PTR_ERR(data) == -EAGAIN)
		data = ERR_PTR(rhashtable_insert_rehash(ht, tbl) ?:
			       -EAGAIN);

	return data;
}

void *rhashtable_insert_slow(struct rhashtable *ht, const void *key,
			     struct rhash_head *obj)
{
	void *data;

	do {
		rcu_read_lock();
		data = rhashtable_try_insert(ht, key, obj);
		rcu_read_unlock();
	} while (PTR_ERR(data) == -EAGAIN);

	return data;
}
EXPORT_SYMBOL_GPL(rhashtable_insert_slow);
/**
 * rhashtable_walk_enter - Initialise an iterator
 * @ht: Table to walk over
 * @iter: Hash table Iterator
 *
 * This function prepares a hash table walk.
 *
 * Note that if you restart a walk after rhashtable_walk_stop you
 * may see the same object twice. Also, you may miss objects if
 * there are removals in between rhashtable_walk_stop and the next
 * call to rhashtable_walk_start.
 *
 * For a completely stable walk you should construct your own data
 * structure outside the hash table.
 *
 * This function may be called from any process context, including
 * non-preemptable context, but cannot be called from softirq or
 * hardirq context.
 *
 * You must call rhashtable_walk_exit after this function returns.
 */
void rhashtable_walk_enter(struct rhashtable *ht, struct rhashtable_iter *iter)
{
	iter->ht = ht;
	iter->p = NULL;
	iter->slot = 0;
	iter->skip = 0;
	iter->end_of_table = 0;

	spin_lock(&ht->lock);
	iter->walker.tbl =
		rcu_dereference_protected(ht->tbl, lockdep_is_held(&ht->lock));
	list_add(&iter->walker.list, &iter->walker.tbl->walkers);
	spin_unlock(&ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_enter);

/**
 * rhashtable_walk_exit - Free an iterator
 * @iter: Hash table Iterator
 *
 * This function frees resources allocated by rhashtable_walk_init.
 */
void rhashtable_walk_exit(struct rhashtable_iter *iter)
{
	spin_lock(&iter->ht->lock);
	if (iter->walker.tbl)
		list_del(&iter->walker.list);
	spin_unlock(&iter->ht->lock);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_exit);
/**
 * rhashtable_walk_start_check - Start a hash table walk
 * @iter: Hash table iterator
 *
 * Start a hash table walk at the current iterator position. Note that we take
 * the RCU lock in all cases including when we return an error. So you must
 * always call rhashtable_walk_stop to clean up.
 *
 * Returns zero if successful.
 *
 * Returns -EAGAIN if a resize event occurred. Note that the iterator
 * will rewind back to the beginning and you may use it immediately
 * by calling rhashtable_walk_next.
 *
 * rhashtable_walk_start is defined as an inline variant that returns
 * void. This is preferred in cases where the caller would ignore
 * resize events and always continue.
 */
int rhashtable_walk_start_check(struct rhashtable_iter *iter)
	__acquires(RCU)
{
	struct rhashtable *ht = iter->ht;
	bool rhlist = ht->rhlist;

	rcu_read_lock();

	spin_lock(&ht->lock);
	if (iter->walker.tbl)
		list_del(&iter->walker.list);
	spin_unlock(&ht->lock);

	if (iter->end_of_table)
		return 0;
	if (!iter->walker.tbl) {
		iter->walker.tbl = rht_dereference_rcu(ht->tbl, ht);
		iter->slot = 0;
		iter->skip = 0;
		return -EAGAIN;
	}

	if (iter->p && !rhlist) {
		/*
		 * We need to validate that 'p' is still in the table, and
		 * if so, update 'skip'
		 */
		struct rhash_head *p;
		int skip = 0;

		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
			skip++;
			if (p == iter->p) {
				iter->skip = skip;
				goto found;
			}
		}
		iter->p = NULL;
	} else if (iter->p && rhlist) {
		/* Need to validate that 'list' is still in the table, and
		 * if so, update 'skip' and 'p'.
		 */
		struct rhash_head *p;
		struct rhlist_head *list;
		int skip = 0;

		rht_for_each_rcu(p, iter->walker.tbl, iter->slot) {
			for (list = container_of(p, struct rhlist_head, rhead);
			     list;
			     list = rcu_dereference(list->next)) {
				skip++;
				if (list == iter->list) {
					iter->p = p;
					iter->skip = skip;
					goto found;
				}
			}
		}
		iter->p = NULL;
	}

found:
	return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_walk_start_check);
/**
 * __rhashtable_walk_find_next - Find the next element in a table (or the first
 * one in case of a new walk).
 *
 * @iter: Hash table iterator
 *
 * Returns the found object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred.
 */
static void *__rhashtable_walk_find_next(struct rhashtable_iter *iter)
{
	struct bucket_table *tbl = iter->walker.tbl;
	struct rhlist_head *list = iter->list;
	struct rhashtable *ht = iter->ht;
	struct rhash_head *p = iter->p;
	bool rhlist = ht->rhlist;

	if (!tbl)
		return NULL;

	for (; iter->slot < tbl->size; iter->slot++) {
		int skip = iter->skip;

		rht_for_each_rcu(p, tbl, iter->slot) {
			if (rhlist) {
				list = container_of(p, struct rhlist_head,
						    rhead);
				do {
					if (!skip)
						goto next;
					skip--;
					list = rcu_dereference(list->next);
				} while (list);

				continue;
			}
			if (!skip)
				break;
			skip--;
		}

next:
		if (!rht_is_a_nulls(p)) {
			iter->skip++;
			iter->p = p;
			iter->list = list;
			return rht_obj(ht, rhlist ? &list->rhead : p);
		}

		iter->skip = 0;
	}

	iter->p = NULL;

	/* Ensure we see any new tables. */
	smp_rmb();

	iter->walker.tbl = rht_dereference_rcu(tbl->future_tbl, ht);
	if (iter->walker.tbl) {
		iter->slot = 0;
		iter->skip = 0;
		return ERR_PTR(-EAGAIN);
	} else {
		iter->end_of_table = true;
	}

	return NULL;
}
/**
 * rhashtable_walk_next - Return the next object and advance the iterator
 * @iter: Hash table iterator
 *
 * Note that you must call rhashtable_walk_stop when you are finished
 * with the walk.
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred. Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_next(struct rhashtable_iter *iter)
{
	struct rhlist_head *list = iter->list;
	struct rhashtable *ht = iter->ht;
	struct rhash_head *p = iter->p;
	bool rhlist = ht->rhlist;

	if (p) {
		if (!rhlist || !(list = rcu_dereference(list->next))) {
			p = rcu_dereference(p->next);
			list = container_of(p, struct rhlist_head, rhead);
		}
		if (!rht_is_a_nulls(p)) {
			iter->skip++;
			iter->p = p;
			iter->list = list;
			return rht_obj(ht, rhlist ? &list->rhead : p);
		}

		/* At the end of this slot, switch to next one and then find
		 * next entry from that point.
		 */
		iter->skip = 0;
		iter->slot++;
	}

	return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_next);

/**
 * rhashtable_walk_peek - Return the next object but don't advance the iterator
 * @iter: Hash table iterator
 *
 * Returns the next object or NULL when the end of the table is reached.
 *
 * Returns -EAGAIN if resize event occurred. Note that the iterator
 * will rewind back to the beginning and you may continue to use it.
 */
void *rhashtable_walk_peek(struct rhashtable_iter *iter)
{
	struct rhlist_head *list = iter->list;
	struct rhashtable *ht = iter->ht;
	struct rhash_head *p = iter->p;

	if (p)
		return rht_obj(ht, ht->rhlist ? &list->rhead : p);

	/* No object found in current iter, find next one in the table. */

	if (iter->skip) {
		/* A nonzero skip value points to the next entry in the table
		 * beyond that last one that was found. Decrement skip so
		 * we find the current value. __rhashtable_walk_find_next
		 * will restore the original value of skip assuming that
		 * the table hasn't changed.
		 */
		iter->skip--;
	}

	return __rhashtable_walk_find_next(iter);
}
EXPORT_SYMBOL_GPL(rhashtable_walk_peek);
/**
 * rhashtable_walk_stop - Finish a hash table walk
 * @iter: Hash table iterator
 *
 * Finish a hash table walk. Does not reset the iterator to the start of the
 * hash table.
 */
void rhashtable_walk_stop(struct rhashtable_iter *iter)
	__releases(RCU)
{
	struct rhashtable *ht;
	struct bucket_table *tbl = iter->walker.tbl;

	if (!tbl)
		goto out;

	ht = iter->ht;

	spin_lock(&ht->lock);
	if (tbl->rehash < tbl->size)
		list_add(&iter->walker.list, &tbl->walkers);
	else
		iter->walker.tbl = NULL;
	spin_unlock(&ht->lock);

out:
	rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(rhashtable_walk_stop);
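
/*
 * Illustrative walker usage (editor's sketch, not part of the original
 * source): a typical enter/start/next/stop/exit cycle, using the void
 * rhashtable_walk_start() variant mentioned above so -EAGAIN from a resize
 * simply rewinds the walk.  "struct test_obj" (with an int key member) and
 * "my_ht" are hypothetical names used only for this example.
 *
 *	struct rhashtable_iter iter;
 *	struct test_obj *obj;
 *
 *	rhashtable_walk_enter(&my_ht, &iter);
 *	rhashtable_walk_start(&iter);
 *
 *	while ((obj = rhashtable_walk_next(&iter)) != NULL) {
 *		if (IS_ERR(obj)) {
 *			if (PTR_ERR(obj) == -EAGAIN)
 *				continue;
 *			break;
 *		}
 *		pr_info("key %d\n", obj->key);
 *	}
 *
 *	rhashtable_walk_stop(&iter);
 *	rhashtable_walk_exit(&iter);
 */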
static size_t rounded_hashtable_size(const struct rhashtable_params *params)
{
	size_t retsize;

	if (params->nelem_hint)
		retsize = max(roundup_pow_of_two(params->nelem_hint * 4 / 3),
			      (unsigned long)params->min_size);
	else
		retsize = max(HASH_DEFAULT_SIZE,
			      (unsigned long)params->min_size);

	return retsize;
}

static u32 rhashtable_jhash2(const void *key, u32 length, u32 seed)
{
	return jhash2(key, length, seed);
}

/**
 * rhashtable_init - initialize a new hash table
 * @ht: hash table to be initialized
 * @params: configuration parameters
 *
 * Initializes a new hash table based on the provided configuration
 * parameters. A table can be configured either with a variable or
 * fixed length key:
 *
 * Configuration Example 1: Fixed length keys
 * struct test_obj {
 *	int			key;
 *	void *			my_member;
 *	struct rhash_head	node;
 * };
 *
 * struct rhashtable_params params = {
 *	.head_offset = offsetof(struct test_obj, node),
 *	.key_offset = offsetof(struct test_obj, key),
 *	.key_len = sizeof(int),
 *	.hashfn = jhash,
 * };
 *
 * Configuration Example 2: Variable length keys
 * struct test_obj {
 *	[...]
 *	struct rhash_head	node;
 * };
 *
 * u32 my_hash_fn(const void *data, u32 len, u32 seed)
 * {
 *	struct test_obj *obj = data;
 *
 *	return [... hash ...];
 * }
 *
 * struct rhashtable_params params = {
 *	.head_offset = offsetof(struct test_obj, node),
 *	.hashfn = jhash,
 *	.obj_hashfn = my_hash_fn,
 * };
 */
int rhashtable_init(struct rhashtable *ht,
		    const struct rhashtable_params *params)
{
	struct bucket_table *tbl;
	size_t size;

	if ((!params->key_len && !params->obj_hashfn) ||
	    (params->obj_hashfn && !params->obj_cmpfn))
		return -EINVAL;

	memset(ht, 0, sizeof(*ht));
	mutex_init(&ht->mutex);
	spin_lock_init(&ht->lock);
	memcpy(&ht->p, params, sizeof(*params));

	if (params->min_size)
		ht->p.min_size = roundup_pow_of_two(params->min_size);

	/* Cap total entries at 2^31 to avoid nelems overflow. */
	ht->max_elems = 1u << 31;

	if (params->max_size) {
		ht->p.max_size = rounddown_pow_of_two(params->max_size);
		if (ht->p.max_size < ht->max_elems / 2)
			ht->max_elems = ht->p.max_size * 2;
	}

	ht->p.min_size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);

	size = rounded_hashtable_size(&ht->p);

	if (params->locks_mul)
		ht->p.locks_mul = roundup_pow_of_two(params->locks_mul);
	else
		ht->p.locks_mul = BUCKET_LOCKS_PER_CPU;

	ht->key_len = ht->p.key_len;
	if (!params->hashfn) {
		ht->p.hashfn = jhash;

		if (!(ht->key_len & (sizeof(u32) - 1))) {
			ht->key_len /= sizeof(u32);
			ht->p.hashfn = rhashtable_jhash2;
		}
	}

	/*
	 * This is api initialization and thus we need to guarantee the
	 * initial rhashtable allocation. Upon failure, retry with the
	 * smallest possible size with __GFP_NOFAIL semantics.
	 */
	tbl = bucket_table_alloc(ht, size, GFP_KERNEL);
	if (unlikely(tbl == NULL)) {
		size = max_t(u16, ht->p.min_size, HASH_MIN_SIZE);
		tbl = bucket_table_alloc(ht, size, GFP_KERNEL | __GFP_NOFAIL);
	}

	atomic_set(&ht->nelems, 0);

	RCU_INIT_POINTER(ht->tbl, tbl);

	INIT_WORK(&ht->run_work, rht_deferred_worker);

	return 0;
}
EXPORT_SYMBOL_GPL(rhashtable_init);
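
/*
 * Illustrative runtime usage (editor's sketch, not part of the original
 * source), continuing Configuration Example 1 above.  It assumes the
 * *_fast() helpers declared in <linux/rhashtable.h> and the same
 * caller-defined "params"; "my_ht" is a hypothetical name.
 *
 *	struct rhashtable my_ht;
 *	struct test_obj *obj;
 *	int key = 42, err;
 *
 *	err = rhashtable_init(&my_ht, &params);
 *	if (err)
 *		return err;
 *
 *	obj = kzalloc(sizeof(*obj), GFP_KERNEL);
 *	if (!obj)
 *		return -ENOMEM;
 *	obj->key = key;
 *	err = rhashtable_insert_fast(&my_ht, &obj->node, params);
 *
 *	obj = rhashtable_lookup_fast(&my_ht, &key, params);
 *	if (obj)
 *		err = rhashtable_remove_fast(&my_ht, &obj->node, params);
 */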
/**
 * rhltable_init - initialize a new hash list table
 * @hlt: hash list table to be initialized
 * @params: configuration parameters
 *
 * Initializes a new hash list table.
 *
 * See documentation for rhashtable_init.
 */
int rhltable_init(struct rhltable *hlt, const struct rhashtable_params *params)
{
	int err;

	err = rhashtable_init(&hlt->ht, params);
	hlt->ht.rhlist = true;

	return err;
}
EXPORT_SYMBOL_GPL(rhltable_init);

static void rhashtable_free_one(struct rhashtable *ht, struct rhash_head *obj,
				void (*free_fn)(void *ptr, void *arg),
				void *arg)
{
	struct rhlist_head *list;

	if (!ht->rhlist) {
		free_fn(rht_obj(ht, obj), arg);
		return;
	}

	list = container_of(obj, struct rhlist_head, rhead);
	do {
		obj = &list->rhead;
		list = rht_dereference(list->next, ht);
		free_fn(rht_obj(ht, obj), arg);
	} while (list);
}
/**
 * rhashtable_free_and_destroy - free elements and destroy hash table
 * @ht: the hash table to destroy
 * @free_fn: callback to release resources of element
 * @arg: pointer passed to free_fn
 *
 * Stops an eventual async resize. If defined, invokes free_fn for each
 * element to release resources. Please note that RCU protected
 * readers may still be accessing the elements. Releasing of resources
 * must occur in a compatible manner. Then frees the bucket array.
 *
 * This function will eventually sleep to wait for an async resize
 * to complete. The caller is responsible for ensuring that no further
 * write operations occur in parallel.
 */
void rhashtable_free_and_destroy(struct rhashtable *ht,
				 void (*free_fn)(void *ptr, void *arg),
				 void *arg)
{
	struct bucket_table *tbl, *next_tbl;
	unsigned int i;

	cancel_work_sync(&ht->run_work);

	mutex_lock(&ht->mutex);
	tbl = rht_dereference(ht->tbl, ht);
restart:
	if (free_fn) {
		for (i = 0; i < tbl->size; i++) {
			struct rhash_head *pos, *next;

			cond_resched();
			for (pos = rht_dereference(*rht_bucket(tbl, i), ht),
			     next = !rht_is_a_nulls(pos) ?
					rht_dereference(pos->next, ht) : NULL;
			     !rht_is_a_nulls(pos);
			     pos = next,
			     next = !rht_is_a_nulls(pos) ?
					rht_dereference(pos->next, ht) : NULL)
				rhashtable_free_one(ht, pos, free_fn, arg);
		}
	}

	next_tbl = rht_dereference(tbl->future_tbl, ht);
	bucket_table_free(tbl);
	if (next_tbl) {
		tbl = next_tbl;
		goto restart;
	}
	mutex_unlock(&ht->mutex);
}
EXPORT_SYMBOL_GPL(rhashtable_free_and_destroy);
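
/*
 * Illustrative teardown (editor's sketch, not part of the original source):
 * a free_fn callback that releases entries allocated as whole kmalloc'd
 * objects, again using the hypothetical struct test_obj and my_ht from the
 * examples above.  The ptr argument is the object pointer, i.e. rht_obj().
 *
 *	static void test_obj_free(void *ptr, void *arg)
 *	{
 *		kfree(ptr);
 *	}
 *
 *	rhashtable_free_and_destroy(&my_ht, test_obj_free, NULL);
 */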
void rhashtable_destroy(struct rhashtable *ht)
{
	return rhashtable_free_and_destroy(ht, NULL, NULL);
}
EXPORT_SYMBOL_GPL(rhashtable_destroy);

struct rhash_head __rcu **rht_bucket_nested(const struct bucket_table *tbl,
					    unsigned int hash)
{
	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
	static struct rhash_head __rcu *rhnull =
		(struct rhash_head __rcu *)NULLS_MARKER(0);
	unsigned int index = hash & ((1 << tbl->nest) - 1);
	unsigned int size = tbl->size >> tbl->nest;
	unsigned int subhash = hash;
	union nested_table *ntbl;

	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
	ntbl = rht_dereference_bucket_rcu(ntbl[index].table, tbl, hash);
	subhash >>= tbl->nest;
	while (ntbl && size > (1 << shift)) {
		index = subhash & ((1 << shift) - 1);
		ntbl = rht_dereference_bucket_rcu(ntbl[index].table,
						  tbl, hash);
		size >>= shift;
		subhash >>= shift;
	}

	if (!ntbl)
		return &rhnull;

	return &ntbl[subhash].bucket;
}
EXPORT_SYMBOL_GPL(rht_bucket_nested);

struct rhash_head __rcu **rht_bucket_nested_insert(struct rhashtable *ht,
						   struct bucket_table *tbl,
						   unsigned int hash)
{
	const unsigned int shift = PAGE_SHIFT - ilog2(sizeof(void *));
	unsigned int index = hash & ((1 << tbl->nest) - 1);
	unsigned int size = tbl->size >> tbl->nest;
	union nested_table *ntbl;

	ntbl = (union nested_table *)rcu_dereference_raw(tbl->buckets[0]);
	hash >>= tbl->nest;
	ntbl = nested_table_alloc(ht, &ntbl[index].table,
				  size <= (1 << shift));

	while (ntbl && size > (1 << shift)) {
		index = hash & ((1 << shift) - 1);
		size >>= shift;
		hash >>= shift;
		ntbl = nested_table_alloc(ht, &ntbl[index].table,
					  size <= (1 << shift));
	}

	if (!ntbl)
		return NULL;

	return &ntbl[hash].bucket;
}
EXPORT_SYMBOL_GPL(rht_bucket_nested_insert);