hashtab.c

/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 * Copyright (c) 2016 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/bpf.h>
#include <linux/jhash.h>
#include <linux/filter.h>
#include "percpu_freelist.h"
#include "bpf_lru_list.h"

struct bucket {
	struct hlist_head head;
	raw_spinlock_t lock;
};

struct bpf_htab {
	struct bpf_map map;
	struct bucket *buckets;
	void *elems;
	union {
		struct pcpu_freelist freelist;
		struct bpf_lru lru;
	};
	void __percpu *extra_elems;
	atomic_t count;	/* number of elements in this hashtable */
	u32 n_buckets;	/* number of hash buckets */
	u32 elem_size;	/* size of each element in bytes */
};

enum extra_elem_state {
	HTAB_NOT_AN_EXTRA_ELEM = 0,
	HTAB_EXTRA_ELEM_FREE,
	HTAB_EXTRA_ELEM_USED
};

/* each htab element is struct htab_elem + key + value */
struct htab_elem {
	union {
		struct hlist_node hash_node;
		struct bpf_htab *htab;
		struct pcpu_freelist_node fnode;
	};
	union {
		struct rcu_head rcu;
		enum extra_elem_state state;
		struct bpf_lru_node lru_node;
	};
	u32 hash;
	char key[0] __aligned(8);
};
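
/* Element layout, as relied upon by the accessors below:
 *
 *	struct htab_elem | key (key_size bytes) | value area
 *
 * For regular hash maps the value is stored at key + round_up(key_size, 8).
 * For the per-cpu flavours the value area holds a single void __percpu
 * pointer placed right after the key (at key + key_size); the values
 * themselves live in per-cpu memory and are reached via
 * htab_elem_get_ptr()/htab_elem_set_ptr().
 */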

static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node);

static bool htab_is_lru(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_LRU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static bool htab_is_percpu(const struct bpf_htab *htab)
{
	return htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		htab->map.map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH;
}

static inline void htab_elem_set_ptr(struct htab_elem *l, u32 key_size,
				     void __percpu *pptr)
{
	*(void __percpu **)(l->key + key_size) = pptr;
}

static inline void __percpu *htab_elem_get_ptr(struct htab_elem *l, u32 key_size)
{
	return *(void __percpu **)(l->key + key_size);
}

static struct htab_elem *get_htab_elem(struct bpf_htab *htab, int i)
{
	return (struct htab_elem *) (htab->elems + i * htab->elem_size);
}

static void htab_free_elems(struct bpf_htab *htab)
{
	int i;

	if (!htab_is_percpu(htab))
		goto free_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		void __percpu *pptr;

		pptr = htab_elem_get_ptr(get_htab_elem(htab, i),
					 htab->map.key_size);
		free_percpu(pptr);
	}
free_elems:
	bpf_map_area_free(htab->elems);
}

static struct htab_elem *prealloc_lru_pop(struct bpf_htab *htab, void *key,
					  u32 hash)
{
	struct bpf_lru_node *node = bpf_lru_pop_free(&htab->lru, hash);
	struct htab_elem *l;

	if (node) {
		l = container_of(node, struct htab_elem, lru_node);
		memcpy(l->key, key, htab->map.key_size);
		return l;
	}

	return NULL;
}
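
/* Pre-allocate every element up front (the default, i.e. when
 * BPF_F_NO_PREALLOC is not set).  The elements are carved out of a single
 * bpf_map_area_alloc() region and handed either to the LRU free list or
 * to the per-cpu freelist, so that later updates never need to allocate
 * memory at run time.
 */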

static int prealloc_init(struct bpf_htab *htab)
{
	int err = -ENOMEM, i;

	htab->elems = bpf_map_area_alloc(htab->elem_size *
					 htab->map.max_entries);
	if (!htab->elems)
		return -ENOMEM;

	if (!htab_is_percpu(htab))
		goto skip_percpu_elems;

	for (i = 0; i < htab->map.max_entries; i++) {
		u32 size = round_up(htab->map.value_size, 8);
		void __percpu *pptr;

		pptr = __alloc_percpu_gfp(size, 8, GFP_USER | __GFP_NOWARN);
		if (!pptr)
			goto free_elems;
		htab_elem_set_ptr(get_htab_elem(htab, i), htab->map.key_size,
				  pptr);
	}

skip_percpu_elems:
	if (htab_is_lru(htab))
		err = bpf_lru_init(&htab->lru,
				   htab->map.map_flags & BPF_F_NO_COMMON_LRU,
				   offsetof(struct htab_elem, hash) -
				   offsetof(struct htab_elem, lru_node),
				   htab_lru_map_delete_node,
				   htab);
	else
		err = pcpu_freelist_init(&htab->freelist);

	if (err)
		goto free_elems;

	if (htab_is_lru(htab))
		bpf_lru_populate(&htab->lru, htab->elems,
				 offsetof(struct htab_elem, lru_node),
				 htab->elem_size, htab->map.max_entries);
	else
		pcpu_freelist_populate(&htab->freelist, htab->elems,
				       htab->elem_size, htab->map.max_entries);

	return 0;

free_elems:
	htab_free_elems(htab);
	return err;
}

static void prealloc_destroy(struct bpf_htab *htab)
{
	htab_free_elems(htab);

	if (htab_is_lru(htab))
		bpf_lru_destroy(&htab->lru);
	else
		pcpu_freelist_destroy(&htab->freelist);
}
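
/* One spare htab_elem per possible CPU.  alloc_htab_elem() falls back to
 * this spare when the table is already at max_entries but the caller is
 * replacing an existing key, so such updates can still succeed instead of
 * failing with -E2BIG.
 */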

static int alloc_extra_elems(struct bpf_htab *htab)
{
	void __percpu *pptr;
	int cpu;

	pptr = __alloc_percpu_gfp(htab->elem_size, 8, GFP_USER | __GFP_NOWARN);
	if (!pptr)
		return -ENOMEM;

	for_each_possible_cpu(cpu) {
		((struct htab_elem *)per_cpu_ptr(pptr, cpu))->state =
			HTAB_EXTRA_ELEM_FREE;
	}
	htab->extra_elems = pptr;
	return 0;
}

/* Called from syscall */
static struct bpf_map *htab_map_alloc(union bpf_attr *attr)
{
	bool percpu = (attr->map_type == BPF_MAP_TYPE_PERCPU_HASH ||
		       attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	bool lru = (attr->map_type == BPF_MAP_TYPE_LRU_HASH ||
		    attr->map_type == BPF_MAP_TYPE_LRU_PERCPU_HASH);
	/* percpu_lru means each cpu has its own LRU list.
	 * It is different from BPF_MAP_TYPE_PERCPU_HASH where
	 * the map's value itself is percpu.  percpu_lru has
	 * nothing to do with the map's value.
	 */
	bool percpu_lru = (attr->map_flags & BPF_F_NO_COMMON_LRU);
	bool prealloc = !(attr->map_flags & BPF_F_NO_PREALLOC);
	struct bpf_htab *htab;
	int err, i;
	u64 cost;

	if (lru && !capable(CAP_SYS_ADMIN))
		/* The LRU implementation is much more complicated than
		 * the other maps, hence limit it to CAP_SYS_ADMIN for now.
		 */
		return ERR_PTR(-EPERM);

	if (attr->map_flags & ~(BPF_F_NO_PREALLOC | BPF_F_NO_COMMON_LRU))
		/* reserved bits should not be used */
		return ERR_PTR(-EINVAL);

	if (!lru && percpu_lru)
		return ERR_PTR(-EINVAL);

	if (lru && !prealloc)
		return ERR_PTR(-ENOTSUPP);

	htab = kzalloc(sizeof(*htab), GFP_USER);
	if (!htab)
		return ERR_PTR(-ENOMEM);

	/* mandatory map attributes */
	htab->map.map_type = attr->map_type;
	htab->map.key_size = attr->key_size;
	htab->map.value_size = attr->value_size;
	htab->map.max_entries = attr->max_entries;
	htab->map.map_flags = attr->map_flags;

	/* check sanity of attributes.
	 * value_size == 0 may be allowed in the future to use map as a set
	 */
	err = -EINVAL;
	if (htab->map.max_entries == 0 || htab->map.key_size == 0 ||
	    htab->map.value_size == 0)
		goto free_htab;

	if (percpu_lru) {
		/* Ensure each CPU's LRU list has >= 1 element.
		 * While we are at it, give each LRU list the same
		 * number of elements.
		 */
		htab->map.max_entries = roundup(attr->max_entries,
						num_possible_cpus());
		if (htab->map.max_entries < attr->max_entries)
			htab->map.max_entries = rounddown(attr->max_entries,
							  num_possible_cpus());
	}

	/* hash table size must be power of 2 */
	htab->n_buckets = roundup_pow_of_two(htab->map.max_entries);

	err = -E2BIG;
	if (htab->map.key_size > MAX_BPF_STACK)
		/* eBPF programs initialize keys on stack, so they cannot be
		 * larger than max stack size
		 */
		goto free_htab;

	if (htab->map.value_size >= KMALLOC_MAX_SIZE -
	    MAX_BPF_STACK - sizeof(struct htab_elem))
		/* if value_size is bigger, the user space won't be able to
		 * access the elements via bpf syscall. This check also makes
		 * sure that the elem_size doesn't overflow and it's
		 * kmalloc-able later in htab_map_update_elem()
		 */
		goto free_htab;

	if (percpu && round_up(htab->map.value_size, 8) > PCPU_MIN_UNIT_SIZE)
		/* make sure the size for pcpu_alloc() is reasonable */
		goto free_htab;

	htab->elem_size = sizeof(struct htab_elem) +
			  round_up(htab->map.key_size, 8);
	if (percpu)
		htab->elem_size += sizeof(void *);
	else
		htab->elem_size += round_up(htab->map.value_size, 8);

	/* prevent zero size kmalloc and check for u32 overflow */
	if (htab->n_buckets == 0 ||
	    htab->n_buckets > U32_MAX / sizeof(struct bucket))
		goto free_htab;

	cost = (u64) htab->n_buckets * sizeof(struct bucket) +
	       (u64) htab->elem_size * htab->map.max_entries;

	if (percpu)
		cost += (u64) round_up(htab->map.value_size, 8) *
			num_possible_cpus() * htab->map.max_entries;
	else
		cost += (u64) htab->elem_size * num_possible_cpus();

	if (cost >= U32_MAX - PAGE_SIZE)
		/* make sure page count doesn't overflow */
		goto free_htab;

	htab->map.pages = round_up(cost, PAGE_SIZE) >> PAGE_SHIFT;

	/* if map size is larger than memlock limit, reject it early */
	err = bpf_map_precharge_memlock(htab->map.pages);
	if (err)
		goto free_htab;

	err = -ENOMEM;
	htab->buckets = bpf_map_area_alloc(htab->n_buckets *
					   sizeof(struct bucket));
	if (!htab->buckets)
		goto free_htab;

	for (i = 0; i < htab->n_buckets; i++) {
		INIT_HLIST_HEAD(&htab->buckets[i].head);
		raw_spin_lock_init(&htab->buckets[i].lock);
	}

	if (!percpu && !lru) {
		/* lru itself can remove the least used element, so
		 * there is no need for an extra elem during map_update.
		 */
		err = alloc_extra_elems(htab);
		if (err)
			goto free_buckets;
	}

	if (prealloc) {
		err = prealloc_init(htab);
		if (err)
			goto free_extra_elems;
	}

	return &htab->map;

free_extra_elems:
	free_percpu(htab->extra_elems);
free_buckets:
	bpf_map_area_free(htab->buckets);
free_htab:
	kfree(htab);
	return ERR_PTR(err);
}
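
/* n_buckets is always a power of two, so masking with (n_buckets - 1)
 * maps a jhash value onto a bucket index without a modulo; e.g. with
 * n_buckets == 1024, bucket = hash & 0x3ff.
 */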

static inline u32 htab_map_hash(const void *key, u32 key_len)
{
	return jhash(key, key_len, 0);
}

static inline struct bucket *__select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &htab->buckets[hash & (htab->n_buckets - 1)];
}

static inline struct hlist_head *select_bucket(struct bpf_htab *htab, u32 hash)
{
	return &__select_bucket(htab, hash)->head;
}

static struct htab_elem *lookup_elem_raw(struct hlist_head *head, u32 hash,
					 void *key, u32 key_size)
{
	struct htab_elem *l;

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l->hash == hash && !memcmp(&l->key, key, key_size))
			return l;

	return NULL;
}

/* Called from syscall or from eBPF program */
static void *__htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l;
	u32 hash, key_size;

	/* Must be called with rcu_read_lock. */
	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	l = lookup_elem_raw(head, hash, key, key_size);

	return l;
}

static void *htab_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return l->key + round_up(map->key_size, 8);

	return NULL;
}

static void *htab_lru_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return l->key + round_up(map->key_size, 8);
	}

	return NULL;
}

/* It is called from the bpf_lru_list when the LRU needs to delete
 * older elements from the htab.
 */
static bool htab_lru_map_delete_node(void *arg, struct bpf_lru_node *node)
{
	struct bpf_htab *htab = (struct bpf_htab *)arg;
	struct htab_elem *l, *tgt_l;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;

	tgt_l = container_of(node, struct htab_elem, lru_node);
	b = __select_bucket(htab, tgt_l->hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	hlist_for_each_entry_rcu(l, head, hash_node)
		if (l == tgt_l) {
			hlist_del_rcu(&l->hash_node);
			break;
		}

	raw_spin_unlock_irqrestore(&b->lock, flags);

	return l == tgt_l;
}

/* Called from syscall */
static int htab_map_get_next_key(struct bpf_map *map, void *key, void *next_key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct htab_elem *l, *next_l;
	u32 hash, key_size;
	int i;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	head = select_bucket(htab, hash);

	/* lookup the key */
	l = lookup_elem_raw(head, hash, key, key_size);

	if (!l) {
		i = 0;
		goto find_first_elem;
	}

	/* key was found, get next key in the same bucket */
	next_l = hlist_entry_safe(rcu_dereference_raw(hlist_next_rcu(&l->hash_node)),
				  struct htab_elem, hash_node);

	if (next_l) {
		/* if next elem in this hash list is non-zero, just return it */
		memcpy(next_key, next_l->key, key_size);
		return 0;
	}

	/* no more elements in this hash list, go to the next bucket */
	i = hash & (htab->n_buckets - 1);
	i++;

find_first_elem:
	/* iterate over buckets */
	for (; i < htab->n_buckets; i++) {
		head = select_bucket(htab, i);

		/* pick first element in the bucket */
		next_l = hlist_entry_safe(rcu_dereference_raw(hlist_first_rcu(head)),
					  struct htab_elem, hash_node);
		if (next_l) {
			/* if it's not empty, just return it */
			memcpy(next_key, next_l->key, key_size);
			return 0;
		}
	}

	/* iterated over all buckets and all elements */
	return -ENOENT;
}

static void htab_elem_free(struct bpf_htab *htab, struct htab_elem *l)
{
	if (htab->map.map_type == BPF_MAP_TYPE_PERCPU_HASH)
		free_percpu(htab_elem_get_ptr(l, htab->map.key_size));
	kfree(l);
}

static void htab_elem_free_rcu(struct rcu_head *head)
{
	struct htab_elem *l = container_of(head, struct htab_elem, rcu);
	struct bpf_htab *htab = l->htab;

	/* must increment bpf_prog_active to avoid kprobe+bpf triggering while
	 * we're calling kfree, otherwise deadlock is possible if kprobes
	 * are placed somewhere inside of slub
	 */
	preempt_disable();
	__this_cpu_inc(bpf_prog_active);
	htab_elem_free(htab, l);
	__this_cpu_dec(bpf_prog_active);
	preempt_enable();
}

static void free_htab_elem(struct bpf_htab *htab, struct htab_elem *l)
{
	if (l->state == HTAB_EXTRA_ELEM_USED) {
		l->state = HTAB_EXTRA_ELEM_FREE;
		return;
	}

	if (!(htab->map.map_flags & BPF_F_NO_PREALLOC)) {
		pcpu_freelist_push(&htab->freelist, &l->fnode);
	} else {
		atomic_dec(&htab->count);
		l->htab = htab;
		call_rcu(&l->rcu, htab_elem_free_rcu);
	}
}

static void pcpu_copy_value(struct bpf_htab *htab, void __percpu *pptr,
			    void *value, bool onallcpus)
{
	if (!onallcpus) {
		/* copy true value_size bytes */
		memcpy(this_cpu_ptr(pptr), value, htab->map.value_size);
	} else {
		u32 size = round_up(htab->map.value_size, 8);
		int off = 0, cpu;

		for_each_possible_cpu(cpu) {
			bpf_long_memcpy(per_cpu_ptr(pptr, cpu),
					value + off, size);
			off += size;
		}
	}
}
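
/* A new element comes from one of three places, depending on how the map
 * was created and how full it is:
 *  - the per-cpu freelist of preallocated elements (the default),
 *  - kmalloc(GFP_ATOMIC) when BPF_F_NO_PREALLOC is set,
 *  - this CPU's extra_elems spare, when the table is full but an existing
 *    element with the same key is being replaced.
 */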

static struct htab_elem *alloc_htab_elem(struct bpf_htab *htab, void *key,
					 void *value, u32 key_size, u32 hash,
					 bool percpu, bool onallcpus,
					 bool old_elem_exists)
{
	u32 size = htab->map.value_size;
	bool prealloc = !(htab->map.map_flags & BPF_F_NO_PREALLOC);
	struct htab_elem *l_new;
	void __percpu *pptr;
	int err = 0;

	if (prealloc) {
		l_new = (struct htab_elem *)pcpu_freelist_pop(&htab->freelist);
		if (!l_new)
			err = -E2BIG;
	} else {
		if (atomic_inc_return(&htab->count) > htab->map.max_entries) {
			atomic_dec(&htab->count);
			err = -E2BIG;
		} else {
			l_new = kmalloc(htab->elem_size,
					GFP_ATOMIC | __GFP_NOWARN);
			if (!l_new)
				return ERR_PTR(-ENOMEM);
		}
	}

	if (err) {
		if (!old_elem_exists)
			return ERR_PTR(err);

		/* if we're updating the existing element and the hash table
		 * is full, use per-cpu extra elems
		 */
		l_new = this_cpu_ptr(htab->extra_elems);
		if (l_new->state != HTAB_EXTRA_ELEM_FREE)
			return ERR_PTR(-E2BIG);
		l_new->state = HTAB_EXTRA_ELEM_USED;
	} else {
		l_new->state = HTAB_NOT_AN_EXTRA_ELEM;
	}

	memcpy(l_new->key, key, key_size);
	if (percpu) {
		/* round up value_size to 8 bytes */
		size = round_up(size, 8);

		if (prealloc) {
			pptr = htab_elem_get_ptr(l_new, key_size);
		} else {
			/* alloc_percpu zero-fills */
			pptr = __alloc_percpu_gfp(size, 8,
						  GFP_ATOMIC | __GFP_NOWARN);
			if (!pptr) {
				kfree(l_new);
				return ERR_PTR(-ENOMEM);
			}
		}

		pcpu_copy_value(htab, pptr, value, onallcpus);

		if (!prealloc)
			htab_elem_set_ptr(l_new, key_size, pptr);
	} else {
		memcpy(l_new->key + round_up(key_size, 8), value, size);
	}

	l_new->hash = hash;
	return l_new;
}

static int check_flags(struct bpf_htab *htab, struct htab_elem *l_old,
		       u64 map_flags)
{
	if (l_old && map_flags == BPF_NOEXIST)
		/* elem already exists */
		return -EEXIST;

	if (!l_old && map_flags == BPF_EXIST)
		/* elem doesn't exist, cannot update it */
		return -ENOENT;

	return 0;
}
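
/* Update flags follow the usual bpf(2) semantics: BPF_ANY (0) creates or
 * replaces, BPF_NOEXIST only creates, BPF_EXIST only replaces.
 * check_flags() above rejects the last two when they cannot be honoured.
 */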

/* Called from syscall or from eBPF program */
static int htab_map_update_elem(struct bpf_map *map, void *key, void *value,
				u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	l_new = alloc_htab_elem(htab, key, value, key_size, hash, false, false,
				!!l_old);
	if (IS_ERR(l_new)) {
		/* all pre-allocated elements are in use or memory exhausted */
		ret = PTR_ERR(l_new);
		goto err;
	}

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		hlist_del_rcu(&l_old->hash_node);
		free_htab_elem(htab, l_old);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}
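
/* LRU flavour of the update: the replacement element is taken from the
 * LRU free list *before* the bucket lock is held, because reclaiming a
 * node may itself evict an element and take a bucket lock.  Whichever
 * node ends up unused (the new one on error, the old one on a successful
 * replace) is pushed back to the LRU free list after the bucket lock is
 * dropped.
 */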

static int htab_lru_map_update_elem(struct bpf_map *map, void *key, void *value,
				    u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new, *l_old = NULL;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because getting free nodes from LRU may need
	 * to remove older elements from htab and this removal
	 * operation will need a bucket lock.
	 */
	l_new = prealloc_lru_pop(htab, key, hash);
	if (!l_new)
		return -ENOMEM;
	memcpy(l_new->key + round_up(map->key_size, 8), value, map->value_size);

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	/* add new element to the head of the list, so that
	 * concurrent search will find it before old elem
	 */
	hlist_add_head_rcu(&l_new->hash_node, head);
	if (l_old) {
		bpf_lru_node_set_ref(&l_new->lru_node);
		hlist_del_rcu(&l_old->hash_node);
	}
	ret = 0;

err:
	raw_spin_unlock_irqrestore(&b->lock, flags);

	if (ret)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	else if (l_old)
		bpf_lru_push_free(&htab->lru, &l_old->lru_node);

	return ret;
}

static int __htab_percpu_map_update_elem(struct bpf_map *map, void *key,
					 void *value, u64 map_flags,
					 bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		l_new = alloc_htab_elem(htab, key, value, key_size,
					hash, true, onallcpus, false);
		if (IS_ERR(l_new)) {
			ret = PTR_ERR(l_new);
			goto err;
		}
		hlist_add_head_rcu(&l_new->hash_node, head);
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int __htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					     void *value, u64 map_flags,
					     bool onallcpus)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l_new = NULL, *l_old;
	struct hlist_head *head;
	unsigned long flags;
	struct bucket *b;
	u32 key_size, hash;
	int ret;

	if (unlikely(map_flags > BPF_EXIST))
		/* unknown flags */
		return -EINVAL;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);

	b = __select_bucket(htab, hash);
	head = &b->head;

	/* For LRU, we need to alloc before taking bucket's
	 * spinlock because LRU's elem alloc may need
	 * to remove older elem from htab and this removal
	 * operation will need a bucket lock.
	 */
	if (map_flags != BPF_EXIST) {
		l_new = prealloc_lru_pop(htab, key, hash);
		if (!l_new)
			return -ENOMEM;
	}

	/* bpf_map_update_elem() can be called in_irq() */
	raw_spin_lock_irqsave(&b->lock, flags);

	l_old = lookup_elem_raw(head, hash, key, key_size);

	ret = check_flags(htab, l_old, map_flags);
	if (ret)
		goto err;

	if (l_old) {
		bpf_lru_node_set_ref(&l_old->lru_node);

		/* per-cpu hash map can update value in-place */
		pcpu_copy_value(htab, htab_elem_get_ptr(l_old, key_size),
				value, onallcpus);
	} else {
		pcpu_copy_value(htab, htab_elem_get_ptr(l_new, key_size),
				value, onallcpus);
		hlist_add_head_rcu(&l_new->hash_node, head);
		l_new = NULL;
	}
	ret = 0;
err:
	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l_new)
		bpf_lru_push_free(&htab->lru, &l_new->lru_node);
	return ret;
}

static int htab_percpu_map_update_elem(struct bpf_map *map, void *key,
				       void *value, u64 map_flags)
{
	return __htab_percpu_map_update_elem(map, key, value, map_flags, false);
}

static int htab_lru_percpu_map_update_elem(struct bpf_map *map, void *key,
					   void *value, u64 map_flags)
{
	return __htab_lru_percpu_map_update_elem(map, key, value, map_flags,
						 false);
}

/* Called from syscall or from eBPF program */
static int htab_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		free_htab_elem(htab, l);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	return ret;
}

static int htab_lru_map_delete_elem(struct bpf_map *map, void *key)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct hlist_head *head;
	struct bucket *b;
	struct htab_elem *l;
	unsigned long flags;
	u32 hash, key_size;
	int ret = -ENOENT;

	WARN_ON_ONCE(!rcu_read_lock_held());

	key_size = map->key_size;

	hash = htab_map_hash(key, key_size);
	b = __select_bucket(htab, hash);
	head = &b->head;

	raw_spin_lock_irqsave(&b->lock, flags);

	l = lookup_elem_raw(head, hash, key, key_size);

	if (l) {
		hlist_del_rcu(&l->hash_node);
		ret = 0;
	}

	raw_spin_unlock_irqrestore(&b->lock, flags);
	if (l)
		bpf_lru_push_free(&htab->lru, &l->lru_node);
	return ret;
}

static void delete_all_elements(struct bpf_htab *htab)
{
	int i;

	for (i = 0; i < htab->n_buckets; i++) {
		struct hlist_head *head = select_bucket(htab, i);
		struct hlist_node *n;
		struct htab_elem *l;

		hlist_for_each_entry_safe(l, n, head, hash_node) {
			hlist_del_rcu(&l->hash_node);
			if (l->state != HTAB_EXTRA_ELEM_USED)
				htab_elem_free(htab, l);
		}
	}
}

/* Called when map->refcnt goes to zero, either from workqueue or from syscall */
static void htab_map_free(struct bpf_map *map)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);

	/* at this point bpf_prog->aux->refcnt == 0 and this map->refcnt == 0,
	 * so the programs (can be more than one that used this map) were
	 * disconnected from events. Wait for outstanding critical sections in
	 * these programs to complete
	 */
	synchronize_rcu();

	/* some of free_htab_elem() callbacks for elements of this map may
	 * not have executed. Wait for them.
	 */
	rcu_barrier();
	if (htab->map.map_flags & BPF_F_NO_PREALLOC)
		delete_all_elements(htab);
	else
		prealloc_destroy(htab);

	free_percpu(htab->extra_elems);
	bpf_map_area_free(htab->buckets);
	kfree(htab);
}

static const struct bpf_map_ops htab_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_map_lookup_elem,
	.map_update_elem = htab_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_type __ro_after_init = {
	.ops = &htab_ops,
	.type = BPF_MAP_TYPE_HASH,
};

static const struct bpf_map_ops htab_lru_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_map_lookup_elem,
	.map_update_elem = htab_lru_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_type __ro_after_init = {
	.ops = &htab_lru_ops,
	.type = BPF_MAP_TYPE_LRU_HASH,
};

/* Called from eBPF program */
static void *htab_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l)
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	else
		return NULL;
}

static void *htab_lru_percpu_map_lookup_elem(struct bpf_map *map, void *key)
{
	struct htab_elem *l = __htab_map_lookup_elem(map, key);

	if (l) {
		bpf_lru_node_set_ref(&l->lru_node);
		return this_cpu_ptr(htab_elem_get_ptr(l, map->key_size));
	}

	return NULL;
}
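
/* Syscall-side helpers for per-cpu maps.  Unlike the program-side lookups
 * above, which return only the current CPU's slot, the syscall copies
 * every CPU's value, so the user buffer must hold
 * num_possible_cpus() * round_up(value_size, 8) bytes.
 */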

int bpf_percpu_hash_copy(struct bpf_map *map, void *key, void *value)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	struct htab_elem *l;
	void __percpu *pptr;
	int ret = -ENOENT;
	int cpu, off = 0;
	u32 size;

	/* per_cpu areas are zero-filled and bpf programs can only
	 * access 'value_size' of them, so copying rounded areas
	 * will not leak any kernel data
	 */
	size = round_up(map->value_size, 8);
	rcu_read_lock();
	l = __htab_map_lookup_elem(map, key);
	if (!l)
		goto out;
	if (htab_is_lru(htab))
		bpf_lru_node_set_ref(&l->lru_node);
	pptr = htab_elem_get_ptr(l, map->key_size);
	for_each_possible_cpu(cpu) {
		bpf_long_memcpy(value + off,
				per_cpu_ptr(pptr, cpu), size);
		off += size;
	}
	ret = 0;
out:
	rcu_read_unlock();
	return ret;
}

int bpf_percpu_hash_update(struct bpf_map *map, void *key, void *value,
			   u64 map_flags)
{
	struct bpf_htab *htab = container_of(map, struct bpf_htab, map);
	int ret;

	rcu_read_lock();
	if (htab_is_lru(htab))
		ret = __htab_lru_percpu_map_update_elem(map, key, value,
							map_flags, true);
	else
		ret = __htab_percpu_map_update_elem(map, key, value, map_flags,
						    true);
	rcu_read_unlock();

	return ret;
}

static const struct bpf_map_ops htab_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_percpu_map_lookup_elem,
	.map_update_elem = htab_percpu_map_update_elem,
	.map_delete_elem = htab_map_delete_elem,
};

static struct bpf_map_type_list htab_percpu_type __ro_after_init = {
	.ops = &htab_percpu_ops,
	.type = BPF_MAP_TYPE_PERCPU_HASH,
};

static const struct bpf_map_ops htab_lru_percpu_ops = {
	.map_alloc = htab_map_alloc,
	.map_free = htab_map_free,
	.map_get_next_key = htab_map_get_next_key,
	.map_lookup_elem = htab_lru_percpu_map_lookup_elem,
	.map_update_elem = htab_lru_percpu_map_update_elem,
	.map_delete_elem = htab_lru_map_delete_elem,
};

static struct bpf_map_type_list htab_lru_percpu_type __ro_after_init = {
	.ops = &htab_lru_percpu_ops,
	.type = BPF_MAP_TYPE_LRU_PERCPU_HASH,
};

static int __init register_htab_map(void)
{
	bpf_register_map_type(&htab_type);
	bpf_register_map_type(&htab_percpu_type);
	bpf_register_map_type(&htab_lru_type);
	bpf_register_map_type(&htab_lru_percpu_type);
	return 0;
}
late_initcall(register_htab_map);