hashtab.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include <linux/vmalloc.h>

struct bucket {
	struct hlist_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	atomic_t count;		/* number of elements in this hashtable */
	u32 n_buckets;		/* number of hash buckets */
	u32 elem_size;		/* size of each element in bytes */
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	struct hlist_node hash_node;
	struct rcu_head rcu;
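	/* 'hash' is used while the element is linked in a bucket; the same
	 * slot is reused as 'key_size' once the element has been unlinked
	 * and queued for per-cpu RCU free (see free_htab_elem()).
	 */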
	union {
		u32 hash;
		u32 key_size;
	};
	char key[0] __aligned(8);
};
/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = attr->map_type == BPF_MAP_TYPE_PERCPU_HASH;
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.map_type = attr->map_type;
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= (1 << (KMALLOC_SHIFT_MAX - 1)) -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;

	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
		/* make sure the size for pcpu_alloc() is reasonable */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += htab->map.value_size;

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;

	if (cost >= U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	err = -ENOMEM;
	htab->buckets = kmalloc_array(htab->n_buckets, sizeof(struct bucket),
				      GFP_USER | __GFP_NOWARN);

	if (!htab->buckets) {
		htab->buckets = vmalloc(htab->n_buckets * sizeof(struct bucket));
		if (!htab->buckets)
			goto free_htab;
	}

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_HEAD(&htab->buckets[i].head);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	atomic_set(&htab->count, 0);

	return &htab->map;

free_htab:
	kfree(htab);
	return ERR_PTR(err);
}
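
/* keys are hashed with jhash over their raw bytes; since n_buckets is a
 * power of two, the bucket index is just the low bits of the hash.
 */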
static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}
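
/* walk the RCU-protected bucket list; the full hash is compared first to
 * avoid memcmp() on keys that cannot match.
 */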
static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* Called from syscall or from eBPF program */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	return l;
}
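
/* for the regular hash map the value lives in the element right after the
 * key (rounded up to 8 bytes), so lookup returns a pointer into the element
 * itself.
 */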
static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if there is a next elem in this hash list, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}
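
/* a per-cpu element does not embed the value itself; a pointer to the
 * per-cpu value area is stored right after the key bytes instead.
 */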
static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static void htab_percpu_elem_free(struct htab_elem *l)
{
	free_percpu(htab_elem_get_ptr(l, l->key_size));
	kfree(l);
}

static void htab_percpu_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);

	htab_percpu_elem_free(l);
}
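
/* per-cpu elements need a real RCU callback so the per-cpu area can be
 * freed; key_size is stashed in the union because 'hash' is no longer
 * needed once the element has been unlinked.
 */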
static void free_htab_elem(struct htab_elem *l, bool percpu, u32 key_size)
{
	if (percpu) {
		l->key_size = key_size;
		call_rcu(&l->rcu, htab_percpu_elem_free_rcu);
	} else {
		kfree_rcu(l, rcu);
	}
}
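
/* allocate and fill a new element; GFP_ATOMIC is used because updates can
 * run from eBPF program context. With onallcpus the value buffer already
 * holds one value per possible CPU, each rounded up to 8 bytes.
 */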
static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus)
{
	u32 size = htab->map.value_size;
	struct htab_elem *l_new;
	void __percpu *pptr;

	l_new = kmalloc(htab->elem_size, GFP_ATOMIC | __GFP_NOWARN);
	if (!l_new)
		return NULL;

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		/* round up value_size to 8 bytes */
		size = round_up(size, 8);

		/* alloc_percpu zero-fills */
		pptr = __alloc_percpu_gfp(size, 8, GFP_ATOMIC | __GFP_NOWARN);
		if (!pptr) {
			kfree(l_new);
			return NULL;
		}

		if (!onallcpus) {
			/* copy true value_size bytes */
			memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
		} else {
			int off = 0, cpu;

			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
						value + off, size);
				off += size;
			}
		}
		htab_elem_set_ptr(l_new, key_size, pptr);
	} else {
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	}

	l_new->hash = hash;
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (!l_old && unlikely(atomic_read(&htab->count) >= htab->map.max_entries))
		/* if elem with this 'key' doesn't exist and we've reached
		 * max_entries limit, fail insertion of new elem
		 */
		return -E2BIG;

	if (l_old && map_flags == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && map_flags == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}
/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	/* allocate new element outside of the lock, since
	 * we're most likely going to insert it
	 */
	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false);
	if (!l_new)
		return -ENOMEM;

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		kfree_rcu(l_old, rcu);
	} else {
		atomic_inc(&htab->count);
	}
	raw_spin_unlock_irqrestore(&b->lock, flags);

	return 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	kfree(l_new);
	return ret;
}
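
/* unlike the regular hash map, an existing per-cpu element is updated in
 * place under the bucket lock; a new element is only allocated when the key
 * is not present.
 */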
static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		void __percpu *pptr = htab_elem_get_ptr(l_old, key_size);
		u32 size = htab->map.value_size;

		/* per-cpu hash map can update value in-place */
		if (!onallcpus) {
			memcpy(this_cpu_ptr(pptr), value, size);
		} else {
			int off = 0, cpu;

			size = round_up(size, 8);
			for_each_possible_cpu(cpu) {
				bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
						value + off, size);
				off += size;
			}
		}
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus);
		if (!l_new) {
			ret = -ENOMEM;
			goto err;
		}
		hlist_add_head_rcu(&l_new->hash_node, head);
		atomic_inc(&htab->count);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}
/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	bool percpu = map->map_type == BPF_MAP_TYPE_PERCPU_HASH;
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		atomic_dec(&htab->count);
		free_htab_elem(l, percpu, key_size);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}
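
/* called from htab_map_free() after synchronize_rcu(), when no program or
 * syscall can touch the map anymore, so elements are freed immediately
 * instead of through RCU callbacks.
 */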
static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			atomic_dec(&htab->count);
			if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH) {
				l->key_size = htab->map.key_size;
				htab_percpu_elem_free(l);
			} else {
				kfree(l);
			}
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (there can be more than one using this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of the kfree_rcu() callbacks for elements of this map may not
	 * have executed yet. That's ok. Proceed to free residual elements and
	 * the map itself
	 */
	delete_all_elements(htab);
	kvfree(htab->buckets);
	kfree(htab);
}
static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __read_mostly = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}
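
/* syscall-side lookup for per-cpu hash maps: gather the values of all
 * possible CPUs into one flat buffer, one value_size-rounded-up-to-8 slot
 * per CPU.
 */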
int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	int ret;

	rcu_read_lock();
	ret = __htab_percpu_map_update_elem(map, key, value, map_flags, true);
	rcu_read_unlock();

	return ret;
}

static const struct bpf_map_ops htab_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_percpu_type __read_mostly = {
	.ops = &htab_percpu_ops,
	.type = BPF_MAP_TYPE_PERCPU_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	bpf_register_map_type(&htab_percpu_type);
	return 0;
}
late_initcall(register_htab_map);