@@ -9,16 +9,19 @@
 #include <linux/filter.h>
 #include <linux/stacktrace.h>
 #include <linux/perf_event.h>
+#include <linux/elf.h>
+#include <linux/pagemap.h>
 #include "percpu_freelist.h"
 
-#define STACK_CREATE_FLAG_MASK \
-	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY)
+#define STACK_CREATE_FLAG_MASK					\
+	(BPF_F_NUMA_NODE | BPF_F_RDONLY | BPF_F_WRONLY |	\
+	 BPF_F_STACK_BUILD_ID)
 
 struct stack_map_bucket {
 	struct pcpu_freelist_node fnode;
 	u32 hash;
 	u32 nr;
-	u64 ip[];
+	u64 data[];
 };
 
 struct bpf_stack_map {
@@ -29,6 +32,17 @@ struct bpf_stack_map {
 	struct stack_map_bucket *buckets[];
 };
 
+static inline bool stack_map_use_build_id(struct bpf_map *map)
+{
+	return (map->map_flags & BPF_F_STACK_BUILD_ID);
+}
+
+static inline int stack_map_data_size(struct bpf_map *map)
+{
+	return stack_map_use_build_id(map) ?
+		sizeof(struct bpf_stack_build_id) : sizeof(u64);
+}
+
 static int prealloc_elems_and_freelist(struct bpf_stack_map *smap)
 {
 	u32 elem_size = sizeof(struct stack_map_bucket) + smap->map.value_size;
@@ -68,8 +82,16 @@ static struct bpf_map *stack_map_alloc(union bpf_attr *attr)
 
 	/* check sanity of attributes */
 	if (attr->max_entries == 0 || attr->key_size != 4 ||
-	    value_size < 8 || value_size % 8 ||
-	    value_size / 8 > sysctl_perf_event_max_stack)
+	    value_size < 8 || value_size % 8)
+		return ERR_PTR(-EINVAL);
+
+	BUILD_BUG_ON(sizeof(struct bpf_stack_build_id) % sizeof(u64));
+	if (attr->map_flags & BPF_F_STACK_BUILD_ID) {
+		if (value_size % sizeof(struct bpf_stack_build_id) ||
+		    value_size / sizeof(struct bpf_stack_build_id)
+		    > sysctl_perf_event_max_stack)
+			return ERR_PTR(-EINVAL);
+	} else if (value_size / 8 > sysctl_perf_event_max_stack)
 		return ERR_PTR(-EINVAL);
 
 	/* hash table size must be power of 2 */
@@ -114,13 +136,184 @@ free_smap:
 	return ERR_PTR(err);
 }
 
+#define BPF_BUILD_ID 3
+/*
+ * Parse build id from the note segment. This logic can be shared between
+ * 32-bit and 64-bit system, because Elf32_Nhdr and Elf64_Nhdr are
+ * identical.
+ */
+static inline int stack_map_parse_build_id(void *page_addr,
+					   unsigned char *build_id,
+					   void *note_start,
+					   Elf32_Word note_size)
+{
+	Elf32_Word note_offs = 0, new_offs;
+
+	/* check for overflow */
+	if (note_start < page_addr || note_start + note_size < note_start)
+		return -EINVAL;
+
+	/* only supports note that fits in the first page */
+	if (note_start + note_size > page_addr + PAGE_SIZE)
+		return -EINVAL;
+
+	while (note_offs + sizeof(Elf32_Nhdr) < note_size) {
+		Elf32_Nhdr *nhdr = (Elf32_Nhdr *)(note_start + note_offs);
+
+		if (nhdr->n_type == BPF_BUILD_ID &&
+		    nhdr->n_namesz == sizeof("GNU") &&
+		    nhdr->n_descsz == BPF_BUILD_ID_SIZE) {
+			memcpy(build_id,
+			       note_start + note_offs +
+			       ALIGN(sizeof("GNU"), 4) + sizeof(Elf32_Nhdr),
+			       BPF_BUILD_ID_SIZE);
+			return 0;
+		}
+		new_offs = note_offs + sizeof(Elf32_Nhdr) +
+			ALIGN(nhdr->n_namesz, 4) + ALIGN(nhdr->n_descsz, 4);
+		if (new_offs <= note_offs)	/* overflow */
+			break;
+		note_offs = new_offs;
+	}
+	return -EINVAL;
+}
+
+/* Parse build ID from 32-bit ELF */
+static int stack_map_get_build_id_32(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr = (Elf32_Ehdr *)page_addr;
+	Elf32_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf32_Ehdr)) / sizeof(Elf32_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf32_Phdr *)(page_addr + sizeof(Elf32_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID from 64-bit ELF */
+static int stack_map_get_build_id_64(void *page_addr,
+				     unsigned char *build_id)
+{
+	Elf64_Ehdr *ehdr = (Elf64_Ehdr *)page_addr;
+	Elf64_Phdr *phdr;
+	int i;
+
+	/* only supports phdr that fits in one page */
+	if (ehdr->e_phnum >
+	    (PAGE_SIZE - sizeof(Elf64_Ehdr)) / sizeof(Elf64_Phdr))
+		return -EINVAL;
+
+	phdr = (Elf64_Phdr *)(page_addr + sizeof(Elf64_Ehdr));
+
+	for (i = 0; i < ehdr->e_phnum; ++i)
+		if (phdr[i].p_type == PT_NOTE)
+			return stack_map_parse_build_id(page_addr, build_id,
+					page_addr + phdr[i].p_offset,
+					phdr[i].p_filesz);
+	return -EINVAL;
+}
+
+/* Parse build ID of ELF file mapped to vma */
+static int stack_map_get_build_id(struct vm_area_struct *vma,
+				  unsigned char *build_id)
+{
+	Elf32_Ehdr *ehdr;
+	struct page *page;
+	void *page_addr;
+	int ret;
+
+	/* only works for page backed storage */
+	if (!vma->vm_file)
+		return -EINVAL;
+
+	page = find_get_page(vma->vm_file->f_mapping, 0);
+	if (!page)
+		return -EFAULT;	/* page not mapped */
+
+	ret = -EINVAL;
+	page_addr = page_address(page);
+	ehdr = (Elf32_Ehdr *)page_addr;
+
+	/* compare magic x7f "ELF" */
+	if (memcmp(ehdr->e_ident, ELFMAG, SELFMAG) != 0)
+		goto out;
+
+	/* only support executable file and shared object file */
+	if (ehdr->e_type != ET_EXEC && ehdr->e_type != ET_DYN)
+		goto out;
+
+	if (ehdr->e_ident[EI_CLASS] == ELFCLASS32)
+		ret = stack_map_get_build_id_32(page_addr, build_id);
+	else if (ehdr->e_ident[EI_CLASS] == ELFCLASS64)
+		ret = stack_map_get_build_id_64(page_addr, build_id);
+out:
+	put_page(page);
+	return ret;
+}
+
+static void stack_map_get_build_id_offset(struct bpf_map *map,
+					  struct stack_map_bucket *bucket,
+					  u64 *ips, u32 trace_nr, bool user)
+{
+	int i;
+	struct vm_area_struct *vma;
+	struct bpf_stack_build_id *id_offs;
+
+	bucket->nr = trace_nr;
+	id_offs = (struct bpf_stack_build_id *)bucket->data;
+
+	/*
+	 * We cannot do up_read() in nmi context, so build_id lookup is
+	 * only supported for non-nmi events. If at some point, it is
+	 * possible to run find_vma() without taking the semaphore, we
+	 * would like to allow build_id lookup in nmi context.
+	 *
+	 * Same fallback is used for kernel stack (!user) on a stackmap
+	 * with build_id.
+	 */
+	if (!user || !current || !current->mm || in_nmi() ||
+	    down_read_trylock(&current->mm->mmap_sem) == 0) {
+		/* cannot access current->mm, fall back to ips */
+		for (i = 0; i < trace_nr; i++) {
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+		}
+		return;
+	}
+
+	for (i = 0; i < trace_nr; i++) {
+		vma = find_vma(current->mm, ips[i]);
+		if (!vma || stack_map_get_build_id(vma, id_offs[i].build_id)) {
+			/* per entry fall back to ips */
+			id_offs[i].status = BPF_STACK_BUILD_ID_IP;
+			id_offs[i].ip = ips[i];
+			continue;
+		}
+		id_offs[i].offset = (vma->vm_pgoff << PAGE_SHIFT) + ips[i]
+			- vma->vm_start;
+		id_offs[i].status = BPF_STACK_BUILD_ID_VALID;
+	}
+	up_read(&current->mm->mmap_sem);
+}
+
 BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	   u64, flags)
 {
 	struct bpf_stack_map *smap = container_of(map, struct bpf_stack_map, map);
 	struct perf_callchain_entry *trace;
 	struct stack_map_bucket *bucket, *new_bucket, *old_bucket;
-	u32 max_depth = map->value_size / 8;
+	u32 max_depth = map->value_size / stack_map_data_size(map);
 	/* stack_map_alloc() checks that max_depth <= sysctl_perf_event_max_stack */
 	u32 init_nr = sysctl_perf_event_max_stack - max_depth;
 	u32 skip = flags & BPF_F_SKIP_FIELD_MASK;
@@ -128,6 +321,7 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	bool user = flags & BPF_F_USER_STACK;
 	bool kernel = !user;
 	u64 *ips;
+	bool hash_matches;
 
 	if (unlikely(flags & ~(BPF_F_SKIP_FIELD_MASK | BPF_F_USER_STACK |
 			       BPF_F_FAST_STACK_CMP | BPF_F_REUSE_STACKID)))
@@ -156,24 +350,43 @@ BPF_CALL_3(bpf_get_stackid, struct pt_regs *, regs, struct bpf_map *, map,
 	id = hash & (smap->n_buckets - 1);
 	bucket = READ_ONCE(smap->buckets[id]);
 
-	if (bucket && bucket->hash == hash) {
-		if (flags & BPF_F_FAST_STACK_CMP)
+	hash_matches = bucket && bucket->hash == hash;
+	/* fast cmp */
+	if (hash_matches && flags & BPF_F_FAST_STACK_CMP)
+		return id;
+
+	if (stack_map_use_build_id(map)) {
+		/* for build_id+offset, pop a bucket before slow cmp */
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		stack_map_get_build_id_offset(map, new_bucket, ips,
+					      trace_nr, user);
+		trace_len = trace_nr * sizeof(struct bpf_stack_build_id);
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, new_bucket->data, trace_len) == 0) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
 			return id;
-		if (bucket->nr == trace_nr &&
-		    memcmp(bucket->ip, ips, trace_len) == 0)
+		}
+		if (bucket && !(flags & BPF_F_REUSE_STACKID)) {
+			pcpu_freelist_push(&smap->freelist, &new_bucket->fnode);
+			return -EEXIST;
+		}
+	} else {
+		if (hash_matches && bucket->nr == trace_nr &&
+		    memcmp(bucket->data, ips, trace_len) == 0)
 			return id;
+		if (bucket && !(flags & BPF_F_REUSE_STACKID))
+			return -EEXIST;
+
+		new_bucket = (struct stack_map_bucket *)
+			pcpu_freelist_pop(&smap->freelist);
+		if (unlikely(!new_bucket))
+			return -ENOMEM;
+		memcpy(new_bucket->data, ips, trace_len);
 	}
 
-	/* this call stack is not in the map, try to add it */
-	if (bucket && !(flags & BPF_F_REUSE_STACKID))
-		return -EEXIST;
-
-	new_bucket = (struct stack_map_bucket *)
-		pcpu_freelist_pop(&smap->freelist);
-	if (unlikely(!new_bucket))
-		return -ENOMEM;
-
-	memcpy(new_bucket->ip, ips, trace_len);
 	new_bucket->hash = hash;
 	new_bucket->nr = trace_nr;
 
@@ -212,8 +425,8 @@ int bpf_stackmap_copy(struct bpf_map *map, void *key, void *value)
 	if (!bucket)
 		return -ENOENT;
 
-	trace_len = bucket->nr * sizeof(u64);
-	memcpy(value, bucket->ip, trace_len);
+	trace_len = bucket->nr * stack_map_data_size(map);
+	memcpy(value, bucket->data, trace_len);
 	memset(value + trace_len, 0, map->value_size - trace_len);
 
 	old_bucket = xchg(&smap->buckets[id], bucket);
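
For reference, here is a minimal user-space sketch (not part of the patch) of how a stack trace map created with BPF_F_STACK_BUILD_ID might be set up through the bpf(2) syscall and how its value entries could be decoded. It assumes the uapi definitions that accompany this feature (BPF_F_STACK_BUILD_ID, BPF_BUILD_ID_SIZE, struct bpf_stack_build_id and the BPF_STACK_BUILD_ID_* status values) are visible via <linux/bpf.h>; the helper names and the depth/entry counts below are illustrative only.

/*
 * Hypothetical example, not part of this patch.  Sketch only.
 */
#include <linux/bpf.h>
#include <stdio.h>
#include <string.h>
#include <sys/syscall.h>
#include <unistd.h>

#define STACK_DEPTH	127	/* <= default sysctl_perf_event_max_stack */

/* Create a stack trace map whose values are build_id+offset entries. */
static int create_build_id_stackmap(void)
{
	union bpf_attr attr;

	memset(&attr, 0, sizeof(attr));
	attr.map_type    = BPF_MAP_TYPE_STACK_TRACE;
	attr.key_size    = sizeof(__u32);
	/* value is an array of struct bpf_stack_build_id, not of u64 */
	attr.value_size  = STACK_DEPTH * sizeof(struct bpf_stack_build_id);
	attr.max_entries = 1024;
	attr.map_flags   = BPF_F_STACK_BUILD_ID;

	return syscall(__NR_bpf, BPF_MAP_CREATE, &attr, sizeof(attr));
}

/* Decode one frame of a value looked up by stack id. */
static void print_frame(const struct bpf_stack_build_id *e)
{
	int i;

	if (e->status == BPF_STACK_BUILD_ID_VALID) {
		for (i = 0; i < BPF_BUILD_ID_SIZE; i++)
			printf("%02x", e->build_id[i]);
		printf(" +0x%llx\n", (unsigned long long)e->offset);
	} else if (e->status == BPF_STACK_BUILD_ID_IP) {
		/* kernel fell back to a raw instruction pointer */
		printf("ip 0x%llx\n", (unsigned long long)e->ip);
	}
	/* BPF_STACK_BUILD_ID_EMPTY entries mark the end of the trace */
}

Because the kernel side falls back to BPF_STACK_BUILD_ID_IP when the build ID cannot be resolved (for example in NMI context, for kernel stacks, or when the first page of the mapped file is not resident), user space should always check the status field before interpreting build_id and offset.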