Merge branch 'uprobes/core' of git://git.kernel.org/pub/scm/linux/kernel/git/oleg/misc into perf/core

Pull uprobes updates from Oleg Nesterov:

 - "uretprobes" - an optimization to uprobes, like kretprobes are an optimization
   to kprobes. "perf probe -x file sym%return" now works like kretprobes.

 - PowerPC fixes plus a couple of cleanups/optimizations in uprobes and trace_uprobes.

Signed-off-by: Ingo Molnar <mingo@kernel.org>
Ingo Molnar, 12 years ago
parent
current commit
b5210b2a34
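
To make the new uretprobe consumer contract concrete, here is a minimal
sketch of an in-kernel user of the API. The uprobe_consumer layout and the
uprobe_register()/uprobe_unregister() signatures are taken from the
include/linux/uprobes.h hunk in this merge; the module boilerplate, the
kern_path()/igrab() inode lookup, and the /bin/zsh:0x46420 target are
illustrative assumptions, not part of this merge.

    #include <linux/fs.h>
    #include <linux/module.h>
    #include <linux/namei.h>
    #include <linux/uprobes.h>

    /* Entry handler: unchanged semantics, runs when the probe is hit. */
    static int demo_handler(struct uprobe_consumer *self, struct pt_regs *regs)
    {
    	pr_info("uprobe hit, ip=%lx\n", instruction_pointer(regs));
    	return 0;	/* returning UPROBE_HANDLER_REMOVE would drop the probe */
    }

    /* New in this merge: runs when the probed function returns. */
    static int demo_ret_handler(struct uprobe_consumer *self,
    			    unsigned long func, struct pt_regs *regs)
    {
    	pr_info("uretprobe: %lx returned to %lx\n",
    		func, instruction_pointer(regs));
    	return 0;
    }

    static struct uprobe_consumer demo_consumer = {
    	.handler	= demo_handler,
    	.ret_handler	= demo_ret_handler,
    };

    static struct inode *demo_inode;

    static int __init demo_init(void)
    {
    	struct path path;
    	int ret;

    	/* Illustrative target: the zfree example from the documentation. */
    	ret = kern_path("/bin/zsh", LOOKUP_FOLLOW, &path);
    	if (ret)
    		return ret;

    	demo_inode = igrab(path.dentry->d_inode);
    	path_put(&path);

    	/* Fails with -EINVAL if neither handler nor ret_handler is set. */
    	ret = uprobe_register(demo_inode, 0x46420, &demo_consumer);
    	if (ret)
    		iput(demo_inode);
    	return ret;
    }

    static void __exit demo_exit(void)
    {
    	uprobe_unregister(demo_inode, 0x46420, &demo_consumer);
    	iput(demo_inode);
    }

    module_init(demo_init);
    module_exit(demo_exit);
    MODULE_LICENSE("GPL");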

+ 67 - 47
Documentation/trace/uprobetracer.txt

@@ -1,6 +1,8 @@
-		Uprobe-tracer: Uprobe-based Event Tracing
-		=========================================
-                 Documentation written by Srikar Dronamraju
+            Uprobe-tracer: Uprobe-based Event Tracing
+            =========================================
+
+           Documentation written by Srikar Dronamraju
+
 
 Overview
 --------
@@ -13,78 +15,94 @@ current_tracer. Instead of that, add probe points via
 /sys/kernel/debug/tracing/events/uprobes/<EVENT>/enabled.
 
 However unlike kprobe-event tracer, the uprobe event interface expects the
-user to calculate the offset of the probepoint in the object
+user to calculate the offset of the probepoint in the object.
 
 Synopsis of uprobe_tracer
 -------------------------
-  p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]	: Set a probe
+  p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] : Set a uprobe
+  r[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS] : Set a return uprobe (uretprobe)
+  -:[GRP/]EVENT                                  : Clear uprobe or uretprobe event
 
- GRP		: Group name. If omitted, use "uprobes" for it.
- EVENT		: Event name. If omitted, the event name is generated
-		  based on SYMBOL+offs.
- PATH		: path to an executable or a library.
- SYMBOL[+offs]	: Symbol+offset where the probe is inserted.
+  GRP           : Group name. If omitted, "uprobes" is the default value.
+  EVENT         : Event name. If omitted, the event name is generated based
+                  on SYMBOL+offs.
+  PATH          : Path to an executable or a library.
+  SYMBOL[+offs] : Symbol+offset where the probe is inserted.
 
- FETCHARGS	: Arguments. Each probe can have up to 128 args.
-  %REG		: Fetch register REG
+  FETCHARGS     : Arguments. Each probe can have up to 128 args.
+   %REG         : Fetch register REG
 
 Event Profiling
 ---------------
- You can check the total number of probe hits and probe miss-hits via
+You can check the total number of probe hits and probe miss-hits via
 /sys/kernel/debug/tracing/uprobe_profile.
- The first column is event name, the second is the number of probe hits,
+The first column is event name, the second is the number of probe hits,
 the third is the number of probe miss-hits.
 
 Usage examples
 --------------
-To add a probe as a new event, write a new definition to uprobe_events
-as below.
+ * Add a probe as a new uprobe event, write a new definition to uprobe_events
+as below: (sets a uprobe at an offset of 0x4245c0 in the executable /bin/bash)
+
+    echo 'p: /bin/bash:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events
+
+ * Add a probe as a new uretprobe event:
+
+    echo 'r: /bin/bash:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events
+
+ * Unset registered event:
 
-  echo 'p: /bin/bash:0x4245c0' > /sys/kernel/debug/tracing/uprobe_events
+    echo '-:bash_0x4245c0' >> /sys/kernel/debug/tracing/uprobe_events
 
- This sets a uprobe at an offset of 0x4245c0 in the executable /bin/bash
+ * Print out the events that are registered:
 
-  echo > /sys/kernel/debug/tracing/uprobe_events
+    cat /sys/kernel/debug/tracing/uprobe_events
 
- This clears all probe points.
+ * Clear all events:
 
-The following example shows how to dump the instruction pointer and %ax
-a register at the probed text address.  Here we are trying to probe
-function zfree in /bin/zsh
+    echo > /sys/kernel/debug/tracing/uprobe_events
+
+Following example shows how to dump the instruction pointer and %ax register
+at the probed text address. Probe zfree function in /bin/zsh:
 
     # cd /sys/kernel/debug/tracing/
-    # cat /proc/`pgrep  zsh`/maps | grep /bin/zsh | grep r-xp
+    # cat /proc/`pgrep zsh`/maps | grep /bin/zsh | grep r-xp
     00400000-0048a000 r-xp 00000000 08:03 130904 /bin/zsh
     # objdump -T /bin/zsh | grep -w zfree
     0000000000446420 g    DF .text  0000000000000012  Base        zfree
 
-0x46420 is the offset of zfree in object /bin/zsh that is loaded at
-0x00400000. Hence the command to probe would be :
+  0x46420 is the offset of zfree in object /bin/zsh that is loaded at
+  0x00400000. Hence the command to uprobe would be:
+
+    # echo 'p:zfree_entry /bin/zsh:0x46420 %ip %ax' > uprobe_events
+
+  And the same for the uretprobe would be:
 
-    # echo 'p /bin/zsh:0x46420 %ip %ax' > uprobe_events
+    # echo 'r:zfree_exit /bin/zsh:0x46420 %ip %ax' >> uprobe_events
 
-Please note: User has to explicitly calculate the offset of the probepoint
+Please note: User has to explicitly calculate the offset of the probe-point
 in the object. We can see the events that are registered by looking at the
 uprobe_events file.
 
     # cat uprobe_events
-    p:uprobes/p_zsh_0x46420 /bin/zsh:0x00046420 arg1=%ip arg2=%ax
+    p:uprobes/zfree_entry /bin/zsh:0x00046420 arg1=%ip arg2=%ax
+    r:uprobes/zfree_exit /bin/zsh:0x00046420 arg1=%ip arg2=%ax
 
-The format of events can be seen by viewing the file events/uprobes/p_zsh_0x46420/format
+Format of events can be seen by viewing the file events/uprobes/zfree_entry/format
 
-    # cat events/uprobes/p_zsh_0x46420/format
-    name: p_zsh_0x46420
+    # cat events/uprobes/zfree_entry/format
+    name: zfree_entry
     ID: 922
     format:
-	field:unsigned short common_type;	offset:0;	size:2;	signed:0;
-	field:unsigned char common_flags;	offset:2;	size:1;	signed:0;
-	field:unsigned char common_preempt_count;	offset:3;	size:1;	signed:0;
-	field:int common_pid;	offset:4;	size:4;	signed:1;
-	field:int common_padding;	offset:8;	size:4;	signed:1;
+         field:unsigned short common_type;         offset:0;  size:2; signed:0;
+         field:unsigned char common_flags;         offset:2;  size:1; signed:0;
+         field:unsigned char common_preempt_count; offset:3;  size:1; signed:0;
+         field:int common_pid;                     offset:4;  size:4; signed:1;
+         field:int common_padding;                 offset:8;  size:4; signed:1;
 
-	field:unsigned long __probe_ip;	offset:12;	size:4;	signed:0;
-	field:u32 arg1;	offset:16;	size:4;	signed:0;
-	field:u32 arg2;	offset:20;	size:4;	signed:0;
+         field:unsigned long __probe_ip;           offset:12; size:4; signed:0;
+         field:u32 arg1;                           offset:16; size:4; signed:0;
+         field:u32 arg2;                           offset:20; size:4; signed:0;
 
    print fmt: "(%lx) arg1=%lx arg2=%lx", REC->__probe_ip, REC->arg1, REC->arg2
 
@@ -94,6 +112,7 @@ events, you need to enable it by:
    # echo 1 > events/uprobes/enable
 
 Lets disable the event after sleeping for some time.
+
    # sleep 20
    # echo 0 > events/uprobes/enable
 
@@ -104,10 +123,11 @@ And you can see the traced information via /sys/kernel/debug/tracing/trace.
    #
    #           TASK-PID    CPU#    TIMESTAMP  FUNCTION
    #              | |       |          |         |
-                 zsh-24842 [006] 258544.995456: p_zsh_0x46420: (0x446420) arg1=446421 arg2=79
-                 zsh-24842 [007] 258545.000270: p_zsh_0x46420: (0x446420) arg1=446421 arg2=79
-                 zsh-24842 [002] 258545.043929: p_zsh_0x46420: (0x446420) arg1=446421 arg2=79
-                 zsh-24842 [004] 258547.046129: p_zsh_0x46420: (0x446420) arg1=446421 arg2=79
-
-Each line shows us probes were triggered for a pid 24842 with ip being
-0x446421 and contents of ax register being 79.
+                 zsh-24842 [006] 258544.995456: zfree_entry: (0x446420) arg1=446420 arg2=79
+                 zsh-24842 [007] 258545.000270: zfree_exit:  (0x446540 <- 0x446420) arg1=446540 arg2=0
+                 zsh-24842 [002] 258545.043929: zfree_entry: (0x446420) arg1=446420 arg2=79
+                 zsh-24842 [004] 258547.046129: zfree_exit:  (0x446540 <- 0x446420) arg1=446540 arg2=0
+
+Output shows us uprobe was triggered for a pid 24842 with ip being 0x446420
+and contents of ax register being 79. And uretprobe was triggered with ip at
+0x446540 with counterpart function entry at 0x446420.
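
The documented flow can also be driven programmatically. Below is a small
user-space sketch of the same steps, assuming debugfs is mounted at
/sys/kernel/debug and root privileges; the zfree offset is the example from
the text above:

    #include <stdio.h>
    #include <string.h>

    #define TRACING "/sys/kernel/debug/tracing/"

    static int write_file(const char *path, const char *s)
    {
    	FILE *f = fopen(path, "w");

    	if (!f)
    		return -1;
    	fputs(s, f);
    	return fclose(f);
    }

    int main(void)
    {
    	char line[256];
    	FILE *f;

    	/* 'r:' sets a return uprobe (uretprobe), new in this merge. */
    	if (write_file(TRACING "uprobe_events",
    		       "r:zfree_exit /bin/zsh:0x46420 %ip %ax\n"))
    		return 1;
    	if (write_file(TRACING "events/uprobes/enable", "1\n"))
    		return 1;

    	/* Print the registered events back. */
    	f = fopen(TRACING "uprobe_events", "r");
    	if (f) {
    		while (fgets(line, sizeof(line), f))
    			fputs(line, stdout);
    		fclose(f);
    	}
    	return 0;
    }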

+ 1 - 0
arch/powerpc/include/asm/uprobes.h

@@ -51,4 +51,5 @@ extern int  arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
 extern int  arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data);
 extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs);
+extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs);
 #endif	/* _ASM_UPROBES_H */

+ 23 - 6
arch/powerpc/kernel/uprobes.c

@@ -30,6 +30,16 @@
 
 #define UPROBE_TRAP_NR	UINT_MAX
 
+/**
+ * is_trap_insn - check if the instruction is a trap variant
+ * @insn: instruction to be checked.
+ * Returns true if @insn is a trap variant.
+ */
+bool is_trap_insn(uprobe_opcode_t *insn)
+{
+	return (is_trap(*insn));
+}
+
 /**
  * arch_uprobe_analyze_insn
  * @mm: the probed address space.
@@ -43,12 +53,6 @@ int arch_uprobe_analyze_insn(struct arch_uprobe *auprobe,
 	if (addr & 0x03)
 		return -EINVAL;
 
-	/*
-	 * We currently don't support a uprobe on an already
-	 * existing breakpoint instruction underneath
-	 */
-	if (is_trap(auprobe->ainsn))
-		return -ENOTSUPP;
 	return 0;
 }
 
@@ -188,3 +192,16 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
 
 	return false;
 }
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+{
+	unsigned long orig_ret_vaddr;
+
+	orig_ret_vaddr = regs->link;
+
+	/* Replace the return addr with trampoline addr */
+	regs->link = trampoline_vaddr;
+
+	return orig_ret_vaddr;
+}

+ 1 - 0
arch/x86/include/asm/uprobes.h

@@ -55,4 +55,5 @@ extern int  arch_uprobe_post_xol(struct arch_uprobe *aup, struct pt_regs *regs);
 extern bool arch_uprobe_xol_was_trapped(struct task_struct *tsk);
 extern int  arch_uprobe_exception_notify(struct notifier_block *self, unsigned long val, void *data);
 extern void arch_uprobe_abort_xol(struct arch_uprobe *aup, struct pt_regs *regs);
+extern unsigned long arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs);
 #endif	/* _ASM_UPROBES_H */

+ 29 - 0
arch/x86/kernel/uprobes.c

@@ -697,3 +697,32 @@ bool arch_uprobe_skip_sstep(struct arch_uprobe *auprobe, struct pt_regs *regs)
 		send_sig(SIGTRAP, current, 0);
 	return ret;
 }
+
+unsigned long
+arch_uretprobe_hijack_return_addr(unsigned long trampoline_vaddr, struct pt_regs *regs)
+{
+	int rasize, ncopied;
+	unsigned long orig_ret_vaddr = 0; /* clear high bits for 32-bit apps */
+
+	rasize = is_ia32_task() ? 4 : 8;
+	ncopied = copy_from_user(&orig_ret_vaddr, (void __user *)regs->sp, rasize);
+	if (unlikely(ncopied))
+		return -1;
+
+	/* check whether address has been already hijacked */
+	if (orig_ret_vaddr == trampoline_vaddr)
+		return orig_ret_vaddr;
+
+	ncopied = copy_to_user((void __user *)regs->sp, &trampoline_vaddr, rasize);
+	if (likely(!ncopied))
+		return orig_ret_vaddr;
+
+	if (ncopied != rasize) {
+		pr_err("uprobe: return address clobbered: pid=%d, %%sp=%#lx, "
+			"%%ip=%#lx\n", current->pid, regs->sp, regs->ip);
+
+		force_sig_info(SIGSEGV, SEND_SIG_FORCED, current);
+	}
+
+	return -1;
+}
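
For readers unfamiliar with the mechanism: on x86 the return address lives
on the user stack at regs->sp, so hijacking means swapping that one slot
(on powerpc above, it is the link register instead). A conceptual
user-space model, not kernel code; the simulated one-slot stack and the
addresses are illustrative assumptions:

    #include <stdint.h>
    #include <stdio.h>

    static uintptr_t hijack_return_addr(uintptr_t trampoline, uintptr_t *sp)
    {
    	uintptr_t orig = *sp;

    	if (orig == trampoline)	/* already hijacked: chained call */
    		return orig;

    	*sp = trampoline;	/* function will "return" into the trampoline */
    	return orig;
    }

    int main(void)
    {
    	uintptr_t stack[1] = { 0x446540 };	/* caller's return address */
    	uintptr_t orig = hijack_return_addr(0x7fff1000, stack);

    	printf("orig=%#lx, stack top now %#lx\n",
    	       (unsigned long)orig, (unsigned long)stack[0]);
    	return 0;
    }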

+ 8 - 0
include/linux/uprobes.h

@@ -38,6 +38,8 @@ struct inode;
 #define UPROBE_HANDLER_REMOVE		1
 #define UPROBE_HANDLER_MASK		1
 
+#define MAX_URETPROBE_DEPTH		64
+
 enum uprobe_filter_ctx {
 	UPROBE_FILTER_REGISTER,
 	UPROBE_FILTER_UNREGISTER,
@@ -46,6 +48,9 @@ enum uprobe_filter_ctx {
 
 struct uprobe_consumer {
 	int (*handler)(struct uprobe_consumer *self, struct pt_regs *regs);
+	int (*ret_handler)(struct uprobe_consumer *self,
+				unsigned long func,
+				struct pt_regs *regs);
 	bool (*filter)(struct uprobe_consumer *self,
 				enum uprobe_filter_ctx ctx,
 				struct mm_struct *mm);
@@ -68,6 +73,8 @@ struct uprobe_task {
 	enum uprobe_task_state		state;
 	struct arch_uprobe_task		autask;
 
+	struct return_instance		*return_instances;
+	unsigned int			depth;
 	struct uprobe			*active_uprobe;
 
 	unsigned long			xol_vaddr;
@@ -100,6 +107,7 @@ struct uprobes_state {
 extern int __weak set_swbp(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
 extern int __weak set_orig_insn(struct arch_uprobe *aup, struct mm_struct *mm, unsigned long vaddr);
 extern bool __weak is_swbp_insn(uprobe_opcode_t *insn);
+extern bool __weak is_trap_insn(uprobe_opcode_t *insn);
 extern int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);
 extern int uprobe_apply(struct inode *inode, loff_t offset, struct uprobe_consumer *uc, bool);
 extern void uprobe_unregister(struct inode *inode, loff_t offset, struct uprobe_consumer *uc);

+ 251 - 49
kernel/events/uprobes.c

@@ -75,6 +75,15 @@ struct uprobe {
 	struct arch_uprobe	arch;
 };
 
+struct return_instance {
+	struct uprobe		*uprobe;
+	unsigned long		func;
+	unsigned long		orig_ret_vaddr; /* original return address */
+	bool			chained;	/* true, if instance is nested */
+
+	struct return_instance	*next;		/* keep as stack */
+};
+
 /*
  * valid_vma: Verify if the specified vma is an executable vma
  * Relax restrictions while unregistering: vm_flags might have
@@ -173,10 +182,31 @@ bool __weak is_swbp_insn(uprobe_opcode_t *insn)
 	return *insn == UPROBE_SWBP_INSN;
 }
 
-static void copy_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t *opcode)
+/**
+ * is_trap_insn - check if instruction is breakpoint instruction.
+ * @insn: instruction to be checked.
+ * Default implementation of is_trap_insn
+ * Returns true if @insn is a breakpoint instruction.
+ *
+ * This function is needed for the case where an architecture has multiple
+ * trap instructions (like powerpc).
+ */
+bool __weak is_trap_insn(uprobe_opcode_t *insn)
+{
+	return is_swbp_insn(insn);
+}
+
+static void copy_from_page(struct page *page, unsigned long vaddr, void *dst, int len)
 {
 	void *kaddr = kmap_atomic(page);
-	memcpy(opcode, kaddr + (vaddr & ~PAGE_MASK), UPROBE_SWBP_INSN_SIZE);
+	memcpy(dst, kaddr + (vaddr & ~PAGE_MASK), len);
+	kunmap_atomic(kaddr);
+}
+
+static void copy_to_page(struct page *page, unsigned long vaddr, const void *src, int len)
+{
+	void *kaddr = kmap_atomic(page);
+	memcpy(kaddr + (vaddr & ~PAGE_MASK), src, len);
 	kunmap_atomic(kaddr);
 }
 
@@ -185,7 +215,16 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
 	uprobe_opcode_t old_opcode;
 	bool is_swbp;
 
-	copy_opcode(page, vaddr, &old_opcode);
+	/*
+	 * Note: We only check if the old_opcode is UPROBE_SWBP_INSN here.
+	 * We do not check if it is any other 'trap variant' which could
+	 * be conditional trap instruction such as the one powerpc supports.
+	 *
+	 * The logic is that we do not care if the underlying instruction
+	 * is a trap variant; uprobes always wins over any other (gdb)
+	 * breakpoint.
+	 */
+	copy_from_page(page, vaddr, &old_opcode, UPROBE_SWBP_INSN_SIZE);
 	is_swbp = is_swbp_insn(&old_opcode);
 
 	if (is_swbp_insn(new_opcode)) {
@@ -204,7 +243,7 @@ static int verify_opcode(struct page *page, unsigned long vaddr, uprobe_opcode_t
  * Expect the breakpoint instruction to be the smallest size instruction for
  * the architecture. If an arch has variable length instruction and the
  * breakpoint instruction is not of the smallest length instruction
- * supported by that architecture then we need to modify is_swbp_at_addr and
+ * supported by that architecture then we need to modify is_trap_at_addr and
  * write_opcode accordingly. This would never be a problem for archs that
  * have fixed length instructions.
  */
@@ -225,7 +264,6 @@ static int write_opcode(struct mm_struct *mm, unsigned long vaddr,
 			uprobe_opcode_t opcode)
 {
 	struct page *old_page, *new_page;
-	void *vaddr_old, *vaddr_new;
 	struct vm_area_struct *vma;
 	int ret;
 
@@ -246,15 +284,8 @@ retry:
 
 	__SetPageUptodate(new_page);
 
-	/* copy the page now that we've got it stable */
-	vaddr_old = kmap_atomic(old_page);
-	vaddr_new = kmap_atomic(new_page);
-
-	memcpy(vaddr_new, vaddr_old, PAGE_SIZE);
-	memcpy(vaddr_new + (vaddr & ~PAGE_MASK), &opcode, UPROBE_SWBP_INSN_SIZE);
-
-	kunmap_atomic(vaddr_new);
-	kunmap_atomic(vaddr_old);
+	copy_highpage(new_page, old_page);
+	copy_to_page(new_page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 
 	ret = anon_vma_prepare(vma);
 	if (ret)
@@ -477,30 +508,18 @@ __copy_insn(struct address_space *mapping, struct file *filp, char *insn,
 			unsigned long nbytes, loff_t offset)
 {
 	struct page *page;
-	void *vaddr;
-	unsigned long off;
-	pgoff_t idx;
-
-	if (!filp)
-		return -EINVAL;
 
 	if (!mapping->a_ops->readpage)
 		return -EIO;
-
-	idx = offset >> PAGE_CACHE_SHIFT;
-	off = offset & ~PAGE_MASK;
-
 	/*
 	 * Ensure that the page that has the original instruction is
 	 * populated and in page-cache.
 	 */
-	page = read_mapping_page(mapping, idx, filp);
+	page = read_mapping_page(mapping, offset >> PAGE_CACHE_SHIFT, filp);
 	if (IS_ERR(page))
 		return PTR_ERR(page);
 
-	vaddr = kmap_atomic(page);
-	memcpy(insn, vaddr + off, nbytes);
-	kunmap_atomic(vaddr);
+	copy_from_page(page, offset, insn, nbytes);
 	page_cache_release(page);
 
 	return 0;
@@ -550,7 +569,7 @@ static int prepare_uprobe(struct uprobe *uprobe, struct file *file,
 		goto out;
 
 	ret = -ENOTSUPP;
-	if (is_swbp_insn((uprobe_opcode_t *)uprobe->arch.insn))
+	if (is_trap_insn((uprobe_opcode_t *)uprobe->arch.insn))
 		goto out;
 
 	ret = arch_uprobe_analyze_insn(&uprobe->arch, mm, vaddr);
@@ -758,7 +777,7 @@ register_for_each_vma(struct uprobe *uprobe, struct uprobe_consumer *new)
 		down_write(&mm->mmap_sem);
 		vma = find_vma(mm, info->vaddr);
 		if (!vma || !valid_vma(vma, is_register) ||
-		    vma->vm_file->f_mapping->host != uprobe->inode)
+		    file_inode(vma->vm_file) != uprobe->inode)
 			goto unlock;
 
 		if (vma->vm_start > info->vaddr ||
@@ -828,6 +847,10 @@ int uprobe_register(struct inode *inode, loff_t offset, struct uprobe_consumer *
 	struct uprobe *uprobe;
 	int ret;
 
+	/* Uprobe must have at least one set consumer */
+	if (!uc->handler && !uc->ret_handler)
+		return -EINVAL;
+
 	/* Racy, just to catch the obvious mistakes */
 	if (offset > i_size_read(inode))
 		return -EINVAL;
@@ -917,7 +940,7 @@ static int unapply_uprobe(struct uprobe *uprobe, struct mm_struct *mm)
 		loff_t offset;
 
 		if (!valid_vma(vma, false) ||
-		    vma->vm_file->f_mapping->host != uprobe->inode)
+		    file_inode(vma->vm_file) != uprobe->inode)
 			continue;
 
 		offset = (loff_t)vma->vm_pgoff << PAGE_SHIFT;
@@ -1010,7 +1033,7 @@ int uprobe_mmap(struct vm_area_struct *vma)
 	if (no_uprobe_events() || !valid_vma(vma, true))
 		return 0;
 
-	inode = vma->vm_file->f_mapping->host;
+	inode = file_inode(vma->vm_file);
 	if (!inode)
 		return 0;
 
@@ -1041,7 +1064,7 @@ vma_has_uprobes(struct vm_area_struct *vma, unsigned long start, unsigned long e
 	struct inode *inode;
 	struct rb_node *n;
 
-	inode = vma->vm_file->f_mapping->host;
+	inode = file_inode(vma->vm_file);
 
 	min = vaddr_to_offset(vma, start);
 	max = min + (end - start) - 1;
@@ -1114,6 +1137,7 @@ static struct xol_area *get_xol_area(void)
 {
 	struct mm_struct *mm = current->mm;
 	struct xol_area *area;
+	uprobe_opcode_t insn = UPROBE_SWBP_INSN;
 
 	area = mm->uprobes_state.xol_area;
 	if (area)
@@ -1131,7 +1155,12 @@ static struct xol_area *get_xol_area(void)
 	if (!area->page)
 		goto free_bitmap;
 
+	/* allocate first slot of task's xol_area for the return probes */
+	set_bit(0, area->bitmap);
+	copy_to_page(area->page, 0, &insn, UPROBE_SWBP_INSN_SIZE);
+	atomic_set(&area->slot_count, 1);
 	init_waitqueue_head(&area->wq);
+
 	if (!xol_add_vma(area))
 		return area;
 
@@ -1216,9 +1245,7 @@ static unsigned long xol_take_insn_slot(struct xol_area *area)
 static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 {
 	struct xol_area *area;
-	unsigned long offset;
 	unsigned long xol_vaddr;
-	void *vaddr;
 
 	area = get_xol_area();
 	if (!area)
@@ -1229,10 +1256,7 @@ static unsigned long xol_get_insn_slot(struct uprobe *uprobe)
 		return 0;
 
 	/* Initialize the slot */
-	offset = xol_vaddr & ~PAGE_MASK;
-	vaddr = kmap_atomic(area->page);
-	memcpy(vaddr + offset, uprobe->arch.insn, MAX_UINSN_BYTES);
-	kunmap_atomic(vaddr);
+	copy_to_page(area->page, xol_vaddr, uprobe->arch.insn, MAX_UINSN_BYTES);
 	/*
 	 * We probably need flush_icache_user_range() but it needs vma.
 	 * This should work on supported architectures too.
@@ -1298,6 +1322,7 @@ unsigned long __weak uprobe_get_swbp_addr(struct pt_regs *regs)
 void uprobe_free_utask(struct task_struct *t)
 {
 	struct uprobe_task *utask = t->utask;
+	struct return_instance *ri, *tmp;
 
 	if (!utask)
 		return;
@@ -1305,6 +1330,15 @@ void uprobe_free_utask(struct task_struct *t)
 	if (utask->active_uprobe)
 		put_uprobe(utask->active_uprobe);
 
+	ri = utask->return_instances;
+	while (ri) {
+		tmp = ri;
+		ri = ri->next;
+
+		put_uprobe(tmp->uprobe);
+		kfree(tmp);
+	}
+
 	xol_free_insn_slot(t);
 	kfree(utask);
 	t->utask = NULL;
@@ -1333,6 +1367,93 @@ static struct uprobe_task *get_utask(void)
 	return current->utask;
 }
 
+/*
+ * Current area->vaddr notion assume the trampoline address is always
+ * equal area->vaddr.
+ *
+ * Returns -1 in case the xol_area is not allocated.
+ */
+static unsigned long get_trampoline_vaddr(void)
+{
+	struct xol_area *area;
+	unsigned long trampoline_vaddr = -1;
+
+	area = current->mm->uprobes_state.xol_area;
+	smp_read_barrier_depends();
+	if (area)
+		trampoline_vaddr = area->vaddr;
+
+	return trampoline_vaddr;
+}
+
+static void prepare_uretprobe(struct uprobe *uprobe, struct pt_regs *regs)
+{
+	struct return_instance *ri;
+	struct uprobe_task *utask;
+	unsigned long orig_ret_vaddr, trampoline_vaddr;
+	bool chained = false;
+
+	if (!get_xol_area())
+		return;
+
+	utask = get_utask();
+	if (!utask)
+		return;
+
+	if (utask->depth >= MAX_URETPROBE_DEPTH) {
+		printk_ratelimited(KERN_INFO "uprobe: omit uretprobe due to"
+				" nestedness limit pid/tgid=%d/%d\n",
+				current->pid, current->tgid);
+		return;
+	}
+
+	ri = kzalloc(sizeof(struct return_instance), GFP_KERNEL);
+	if (!ri)
+		goto fail;
+
+	trampoline_vaddr = get_trampoline_vaddr();
+	orig_ret_vaddr = arch_uretprobe_hijack_return_addr(trampoline_vaddr, regs);
+	if (orig_ret_vaddr == -1)
+		goto fail;
+
+	/*
+	 * We don't want to keep trampoline address in stack, rather keep the
+	 * original return address of first caller thru all the consequent
+	 * instances. This also makes breakpoint unwrapping easier.
+	 */
+	if (orig_ret_vaddr == trampoline_vaddr) {
+		if (!utask->return_instances) {
+			/*
+			 * This situation is not possible. Likely we have an
+			 * attack from user-space.
+			 */
+			pr_warn("uprobe: unable to set uretprobe pid/tgid=%d/%d\n",
+						current->pid, current->tgid);
+			goto fail;
+		}
+
+		chained = true;
+		orig_ret_vaddr = utask->return_instances->orig_ret_vaddr;
+	}
+
+	atomic_inc(&uprobe->ref);
+	ri->uprobe = uprobe;
+	ri->func = instruction_pointer(regs);
+	ri->orig_ret_vaddr = orig_ret_vaddr;
+	ri->chained = chained;
+
+	utask->depth++;
+
+	/* add instance to the stack */
+	ri->next = utask->return_instances;
+	utask->return_instances = ri;
+
+	return;
+
+ fail:
+	kfree(ri);
+}
+
 /* Prepare to single-step probed instruction out of line. */
 static int
 pre_ssout(struct uprobe *uprobe, struct pt_regs *regs, unsigned long bp_vaddr)
@@ -1431,7 +1552,7 @@ static void mmf_recalc_uprobes(struct mm_struct *mm)
 	clear_bit(MMF_HAS_UPROBES, &mm->flags);
 }
 
-static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
+static int is_trap_at_addr(struct mm_struct *mm, unsigned long vaddr)
 {
 	struct page *page;
 	uprobe_opcode_t opcode;
@@ -1449,10 +1570,11 @@ static int is_swbp_at_addr(struct mm_struct *mm, unsigned long vaddr)
 	if (result < 0)
 		return result;
 
-	copy_opcode(page, vaddr, &opcode);
+	copy_from_page(page, vaddr, &opcode, UPROBE_SWBP_INSN_SIZE);
 	put_page(page);
  out:
-	return is_swbp_insn(&opcode);
+	/* This needs to return true for any variant of the trap insn */
+	return is_trap_insn(&opcode);
 }
 
 static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
@@ -1465,14 +1587,14 @@ static struct uprobe *find_active_uprobe(unsigned long bp_vaddr, int *is_swbp)
 	vma = find_vma(mm, bp_vaddr);
 	if (vma && vma->vm_start <= bp_vaddr) {
 		if (valid_vma(vma, false)) {
-			struct inode *inode = vma->vm_file->f_mapping->host;
+			struct inode *inode = file_inode(vma->vm_file);
 			loff_t offset = vaddr_to_offset(vma, bp_vaddr);
 
 			uprobe = find_uprobe(inode, offset);
 		}
 
 		if (!uprobe)
-			*is_swbp = is_swbp_at_addr(mm, bp_vaddr);
+			*is_swbp = is_trap_at_addr(mm, bp_vaddr);
 	} else {
 		*is_swbp = -EFAULT;
 	}
@@ -1488,16 +1610,27 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 {
 	struct uprobe_consumer *uc;
 	int remove = UPROBE_HANDLER_REMOVE;
+	bool need_prep = false; /* prepare return uprobe, when needed */
 
 	down_read(&uprobe->register_rwsem);
 	for (uc = uprobe->consumers; uc; uc = uc->next) {
-		int rc = uc->handler(uc, regs);
+		int rc = 0;
+
+		if (uc->handler) {
+			rc = uc->handler(uc, regs);
+			WARN(rc & ~UPROBE_HANDLER_MASK,
+				"bad rc=0x%x from %pf()\n", rc, uc->handler);
+		}
+
+		if (uc->ret_handler)
+			need_prep = true;
 
-		WARN(rc & ~UPROBE_HANDLER_MASK,
-			"bad rc=0x%x from %pf()\n", rc, uc->handler);
 		remove &= rc;
 	}
 
+	if (need_prep && !remove)
+		prepare_uretprobe(uprobe, regs); /* put bp at return */
+
 	if (remove && uprobe->consumers) {
 		WARN_ON(!uprobe_is_active(uprobe));
 		unapply_uprobe(uprobe, current->mm);
@@ -1505,6 +1638,64 @@ static void handler_chain(struct uprobe *uprobe, struct pt_regs *regs)
 	up_read(&uprobe->register_rwsem);
 }
 
+static void
+handle_uretprobe_chain(struct return_instance *ri, struct pt_regs *regs)
+{
+	struct uprobe *uprobe = ri->uprobe;
+	struct uprobe_consumer *uc;
+
+	down_read(&uprobe->register_rwsem);
+	for (uc = uprobe->consumers; uc; uc = uc->next) {
+		if (uc->ret_handler)
+			uc->ret_handler(uc, ri->func, regs);
+	}
+	up_read(&uprobe->register_rwsem);
+}
+
+static bool handle_trampoline(struct pt_regs *regs)
+{
+	struct uprobe_task *utask;
+	struct return_instance *ri, *tmp;
+	bool chained;
+
+	utask = current->utask;
+	if (!utask)
+		return false;
+
+	ri = utask->return_instances;
+	if (!ri)
+		return false;
+
+	/*
+	 * TODO: we should throw out return_instance's invalidated by
+	 * longjmp(), currently we assume that the probed function always
+	 * returns.
+	 */
+	instruction_pointer_set(regs, ri->orig_ret_vaddr);
+
+	for (;;) {
+		handle_uretprobe_chain(ri, regs);
+
+		chained = ri->chained;
+		put_uprobe(ri->uprobe);
+
+		tmp = ri;
+		ri = ri->next;
+		kfree(tmp);
+
+		if (!chained)
+			break;
+
+		utask->depth--;
+
+		BUG_ON(!ri);
+	}
+
+	utask->return_instances = ri;
+
+	return true;
+}
+
 /*
  * Run handler and ask thread to singlestep.
  * Ensure all non-fatal signals cannot interrupt thread while it singlesteps.
@@ -1516,8 +1707,15 @@ static void handle_swbp(struct pt_regs *regs)
 	int uninitialized_var(is_swbp);
 
 	bp_vaddr = uprobe_get_swbp_addr(regs);
-	uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
+	if (bp_vaddr == get_trampoline_vaddr()) {
+		if (handle_trampoline(regs))
+			return;
+
+		pr_warn("uprobe: unable to handle uretprobe pid/tgid=%d/%d\n",
+						current->pid, current->tgid);
+	}
 
+	uprobe = find_active_uprobe(bp_vaddr, &is_swbp);
 	if (!uprobe) {
 		if (is_swbp > 0) {
 			/* No matching uprobe; signal SIGTRAP. */
@@ -1616,7 +1814,11 @@ void uprobe_notify_resume(struct pt_regs *regs)
  */
 int uprobe_pre_sstep_notifier(struct pt_regs *regs)
 {
-	if (!current->mm || !test_bit(MMF_HAS_UPROBES, &current->mm->flags))
+	if (!current->mm)
+		return 0;
+
+	if (!test_bit(MMF_HAS_UPROBES, &current->mm->flags) &&
+	    (!current->utask || !current->utask->return_instances))
 		return 0;
 
 	set_thread_flag(TIF_UPROBE);
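
The per-task bookkeeping added in this file keeps hijacked returns on a
linked stack of return_instance entries. A user-space model of that
push/pop discipline, assuming simplified fields (no uprobe refcounting or
depth counter) and the chained-unwind rule from handle_trampoline():

    #include <stdbool.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct return_instance {
    	unsigned long func;		/* probed function entry */
    	unsigned long orig_ret_vaddr;	/* original return address */
    	bool chained;			/* nested under the same hijack */
    	struct return_instance *next;	/* keep as stack */
    };

    static struct return_instance *stack;

    static void push(unsigned long func, unsigned long ret, bool chained)
    {
    	struct return_instance *ri = malloc(sizeof(*ri));

    	if (!ri)
    		exit(1);
    	ri->func = func;
    	ri->orig_ret_vaddr = ret;
    	ri->chained = chained;
    	ri->next = stack;
    	stack = ri;
    }

    /* Mirrors the pop loop in handle_trampoline(): chained entries share
     * one hijacked return address and unwind together. */
    static void trampoline_hit(void)
    {
    	struct return_instance *ri = stack, *tmp;
    	bool chained;

    	printf("return to %#lx\n", ri->orig_ret_vaddr);
    	for (;;) {
    		printf("  ret_handler for func %#lx\n", ri->func);
    		chained = ri->chained;
    		tmp = ri;
    		ri = ri->next;
    		free(tmp);
    		if (!chained)
    			break;
    	}
    	stack = ri;
    }

    int main(void)
    {
    	push(0x446420, 0x400123, false);	/* outer probed call */
    	push(0x446420, 0x400123, true);		/* recursive call, chained */
    	trampoline_hit();			/* unwinds both entries */
    	return 0;
    }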

+ 0 - 5
kernel/trace/trace.h

@@ -103,11 +103,6 @@ struct kretprobe_trace_entry_head {
 	unsigned long		ret_ip;
 };
 
-struct uprobe_trace_entry_head {
-	struct trace_entry	ent;
-	unsigned long		ip;
-};
-
 /*
  * trace_flag_type is an enumeration that holds different
  * states when a trace occurs. These are:

+ 146 - 57
kernel/trace/trace_uprobe.c

@@ -28,6 +28,18 @@
 
 #define UPROBE_EVENT_SYSTEM	"uprobes"
 
+struct uprobe_trace_entry_head {
+	struct trace_entry	ent;
+	unsigned long		vaddr[];
+};
+
+#define SIZEOF_TRACE_ENTRY(is_return)			\
+	(sizeof(struct uprobe_trace_entry_head) +	\
+	 sizeof(unsigned long) * (is_return ? 2 : 1))
+
+#define DATAOF_TRACE_ENTRY(entry, is_return)		\
+	((void*)(entry) + SIZEOF_TRACE_ENTRY(is_return))
+
 struct trace_uprobe_filter {
 	rwlock_t		rwlock;
 	int			nr_systemwide;
@@ -64,6 +76,8 @@ static DEFINE_MUTEX(uprobe_lock);
 static LIST_HEAD(uprobe_list);
 
 static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs);
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+				unsigned long func, struct pt_regs *regs);
 
 static inline void init_trace_uprobe_filter(struct trace_uprobe_filter *filter)
 {
@@ -77,11 +91,16 @@ static inline bool uprobe_filter_is_empty(struct trace_uprobe_filter *filter)
 	return !filter->nr_systemwide && list_empty(&filter->perf_events);
 }
 
+static inline bool is_ret_probe(struct trace_uprobe *tu)
+{
+	return tu->consumer.ret_handler != NULL;
+}
+
 /*
  * Allocate new trace_uprobe and initialize it (including uprobes).
  */
 static struct trace_uprobe *
-alloc_trace_uprobe(const char *group, const char *event, int nargs)
+alloc_trace_uprobe(const char *group, const char *event, int nargs, bool is_ret)
 {
 	struct trace_uprobe *tu;
 
@@ -106,6 +125,8 @@ alloc_trace_uprobe(const char *group, const char *event, int nargs)
 
 	INIT_LIST_HEAD(&tu->list);
 	tu->consumer.handler = uprobe_dispatcher;
+	if (is_ret)
+		tu->consumer.ret_handler = uretprobe_dispatcher;
 	init_trace_uprobe_filter(&tu->filter);
 	return tu;
 
@@ -180,7 +201,7 @@ end:
 
 /*
  * Argument syntax:
- *  - Add uprobe: p[:[GRP/]EVENT] PATH:SYMBOL[+offs] [FETCHARGS]
+ *  - Add uprobe: p|r[:[GRP/]EVENT] PATH:SYMBOL [FETCHARGS]
  *
  *  - Remove uprobe: -:[GRP/]EVENT
  */
@@ -192,20 +213,23 @@ static int create_trace_uprobe(int argc, char **argv)
 	char buf[MAX_EVENT_NAME_LEN];
 	struct path path;
 	unsigned long offset;
-	bool is_delete;
+	bool is_delete, is_return;
 	int i, ret;
 
 	inode = NULL;
 	ret = 0;
 	is_delete = false;
+	is_return = false;
 	event = NULL;
 	group = NULL;
 
 	/* argc must be >= 1 */
 	if (argv[0][0] == '-')
 		is_delete = true;
+	else if (argv[0][0] == 'r')
+		is_return = true;
 	else if (argv[0][0] != 'p') {
-		pr_info("Probe definition must be started with 'p' or '-'.\n");
+		pr_info("Probe definition must be started with 'p', 'r' or '-'.\n");
 		return -EINVAL;
 	}
 
@@ -303,7 +327,7 @@ static int create_trace_uprobe(int argc, char **argv)
 		kfree(tail);
 	}
 
-	tu = alloc_trace_uprobe(group, event, argc);
+	tu = alloc_trace_uprobe(group, event, argc, is_return);
 	if (IS_ERR(tu)) {
 		pr_info("Failed to allocate trace_uprobe.(%d)\n", (int)PTR_ERR(tu));
 		ret = PTR_ERR(tu);
@@ -414,9 +438,10 @@ static void probes_seq_stop(struct seq_file *m, void *v)
 static int probes_seq_show(struct seq_file *m, void *v)
 {
 	struct trace_uprobe *tu = v;
+	char c = is_ret_probe(tu) ? 'r' : 'p';
 	int i;
 
-	seq_printf(m, "p:%s/%s", tu->call.class->system, tu->call.name);
+	seq_printf(m, "%c:%s/%s", c, tu->call.class->system, tu->call.name);
 	seq_printf(m, " %s:0x%p", tu->filename, (void *)tu->offset);
 
 	for (i = 0; i < tu->nr_args; i++)
@@ -485,65 +510,81 @@ static const struct file_operations uprobe_profile_ops = {
 	.release	= seq_release,
 };
 
-/* uprobe handler */
-static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static void uprobe_trace_print(struct trace_uprobe *tu,
+				unsigned long func, struct pt_regs *regs)
 {
 	struct uprobe_trace_entry_head *entry;
 	struct ring_buffer_event *event;
 	struct ring_buffer *buffer;
-	u8 *data;
-	int size, i, pc;
-	unsigned long irq_flags;
+	void *data;
+	int size, i;
 	struct ftrace_event_call *call = &tu->call;
 
-	local_save_flags(irq_flags);
-	pc = preempt_count();
-
-	size = sizeof(*entry) + tu->size;
-
+	size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
 	event = trace_current_buffer_lock_reserve(&buffer, call->event.type,
-						  size, irq_flags, pc);
+						  size + tu->size, 0, 0);
 	if (!event)
-		return 0;
+		return;
 
 	entry = ring_buffer_event_data(event);
-	entry->ip = instruction_pointer(task_pt_regs(current));
-	data = (u8 *)&entry[1];
+	if (is_ret_probe(tu)) {
+		entry->vaddr[0] = func;
+		entry->vaddr[1] = instruction_pointer(regs);
+		data = DATAOF_TRACE_ENTRY(entry, true);
+	} else {
+		entry->vaddr[0] = instruction_pointer(regs);
+		data = DATAOF_TRACE_ENTRY(entry, false);
+	}
+
 	for (i = 0; i < tu->nr_args; i++)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
 	if (!filter_current_check_discard(buffer, call, entry, event))
-		trace_buffer_unlock_commit(buffer, event, irq_flags, pc);
+		trace_buffer_unlock_commit(buffer, event, 0, 0);
+}
 
+/* uprobe handler */
+static int uprobe_trace_func(struct trace_uprobe *tu, struct pt_regs *regs)
+{
+	if (!is_ret_probe(tu))
+		uprobe_trace_print(tu, 0, regs);
 	return 0;
 }
 
+static void uretprobe_trace_func(struct trace_uprobe *tu, unsigned long func,
+				struct pt_regs *regs)
+{
+	uprobe_trace_print(tu, func, regs);
+}
+
 /* Event entry printers */
 static enum print_line_t
 print_uprobe_event(struct trace_iterator *iter, int flags, struct trace_event *event)
 {
-	struct uprobe_trace_entry_head *field;
+	struct uprobe_trace_entry_head *entry;
 	struct trace_seq *s = &iter->seq;
 	struct trace_uprobe *tu;
 	u8 *data;
 	int i;
 
-	field = (struct uprobe_trace_entry_head *)iter->ent;
+	entry = (struct uprobe_trace_entry_head *)iter->ent;
 	tu = container_of(event, struct trace_uprobe, call.event);
 
-	if (!trace_seq_printf(s, "%s: (", tu->call.name))
-		goto partial;
-
-	if (!seq_print_ip_sym(s, field->ip, flags | TRACE_ITER_SYM_OFFSET))
-		goto partial;
-
-	if (!trace_seq_puts(s, ")"))
-		goto partial;
+	if (is_ret_probe(tu)) {
+		if (!trace_seq_printf(s, "%s: (0x%lx <- 0x%lx)", tu->call.name,
+					entry->vaddr[1], entry->vaddr[0]))
+			goto partial;
+		data = DATAOF_TRACE_ENTRY(entry, true);
+	} else {
+		if (!trace_seq_printf(s, "%s: (0x%lx)", tu->call.name,
+					entry->vaddr[0]))
+			goto partial;
+		data = DATAOF_TRACE_ENTRY(entry, false);
+	}
 
-	data = (u8 *)&field[1];
 	for (i = 0; i < tu->nr_args; i++) {
 		if (!tu->args[i].type->print(s, tu->args[i].name,
-					     data + tu->args[i].offset, field))
+					     data + tu->args[i].offset, entry))
 			goto partial;
 	}
 
@@ -595,16 +636,23 @@ static void probe_event_disable(struct trace_uprobe *tu, int flag)
 
 static int uprobe_event_define_fields(struct ftrace_event_call *event_call)
 {
-	int ret, i;
+	int ret, i, size;
 	struct uprobe_trace_entry_head field;
-	struct trace_uprobe *tu = (struct trace_uprobe *)event_call->data;
+	struct trace_uprobe *tu = event_call->data;
 
-	DEFINE_FIELD(unsigned long, ip, FIELD_STRING_IP, 0);
+	if (is_ret_probe(tu)) {
+		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_FUNC, 0);
+		DEFINE_FIELD(unsigned long, vaddr[1], FIELD_STRING_RETIP, 0);
+		size = SIZEOF_TRACE_ENTRY(true);
+	} else {
+		DEFINE_FIELD(unsigned long, vaddr[0], FIELD_STRING_IP, 0);
+		size = SIZEOF_TRACE_ENTRY(false);
+	}
 	/* Set argument names as fields */
 	for (i = 0; i < tu->nr_args; i++) {
 		ret = trace_define_field(event_call, tu->args[i].type->fmttype,
 					 tu->args[i].name,
-					 sizeof(field) + tu->args[i].offset,
+					 size + tu->args[i].offset,
 					 tu->args[i].type->size,
 					 tu->args[i].type->is_signed,
 					 FILTER_OTHER);
@@ -622,8 +670,13 @@ static int __set_print_fmt(struct trace_uprobe *tu, char *buf, int len)
 	int i;
 	int pos = 0;
 
-	fmt = "(%lx)";
-	arg = "REC->" FIELD_STRING_IP;
+	if (is_ret_probe(tu)) {
+		fmt = "(%lx <- %lx)";
+		arg = "REC->" FIELD_STRING_FUNC ", REC->" FIELD_STRING_RETIP;
+	} else {
+		fmt = "(%lx)";
+		arg = "REC->" FIELD_STRING_IP;
+	}
 
 	/* When len=0, we just calculate the needed length */
 
@@ -752,49 +805,68 @@ static bool uprobe_perf_filter(struct uprobe_consumer *uc,
 	return ret;
 }
 
-/* uprobe profile handler */
-static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+static void uprobe_perf_print(struct trace_uprobe *tu,
+				unsigned long func, struct pt_regs *regs)
 {
 	struct ftrace_event_call *call = &tu->call;
 	struct uprobe_trace_entry_head *entry;
 	struct hlist_head *head;
-	u8 *data;
-	int size, __size, i;
-	int rctx;
+	void *data;
+	int size, rctx, i;
 
-	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
-		return UPROBE_HANDLER_REMOVE;
-
-	__size = sizeof(*entry) + tu->size;
-	size = ALIGN(__size + sizeof(u32), sizeof(u64));
-	size -= sizeof(u32);
+	size = SIZEOF_TRACE_ENTRY(is_ret_probe(tu));
+	size = ALIGN(size + tu->size + sizeof(u32), sizeof(u64)) - sizeof(u32);
 	if (WARN_ONCE(size > PERF_MAX_TRACE_SIZE, "profile buffer not large enough"))
-		return 0;
+		return;
 
 	preempt_disable();
+	head = this_cpu_ptr(call->perf_events);
+	if (hlist_empty(head))
+		goto out;
 
 	entry = perf_trace_buf_prepare(size, call->event.type, regs, &rctx);
 	if (!entry)
 		goto out;
 
-	entry->ip = instruction_pointer(task_pt_regs(current));
-	data = (u8 *)&entry[1];
+	if (is_ret_probe(tu)) {
+		entry->vaddr[0] = func;
+		entry->vaddr[1] = instruction_pointer(regs);
+		data = DATAOF_TRACE_ENTRY(entry, true);
+	} else {
+		entry->vaddr[0] = instruction_pointer(regs);
+		data = DATAOF_TRACE_ENTRY(entry, false);
+	}
+
 	for (i = 0; i < tu->nr_args; i++)
 		call_fetch(&tu->args[i].fetch, regs, data + tu->args[i].offset);
 
-	head = this_cpu_ptr(call->perf_events);
-	perf_trace_buf_submit(entry, size, rctx, entry->ip, 1, regs, head, NULL);
-
+	perf_trace_buf_submit(entry, size, rctx, 0, 1, regs, head, NULL);
  out:
 	preempt_enable();
+}
+
+/* uprobe profile handler */
+static int uprobe_perf_func(struct trace_uprobe *tu, struct pt_regs *regs)
+{
+	if (!uprobe_perf_filter(&tu->consumer, 0, current->mm))
+		return UPROBE_HANDLER_REMOVE;
+
+	if (!is_ret_probe(tu))
+		uprobe_perf_print(tu, 0, regs);
 	return 0;
 }
+
+static void uretprobe_perf_func(struct trace_uprobe *tu, unsigned long func,
+				struct pt_regs *regs)
+{
+	uprobe_perf_print(tu, func, regs);
+}
 #endif	/* CONFIG_PERF_EVENTS */
 
 static
 int trace_uprobe_register(struct ftrace_event_call *event, enum trace_reg type, void *data)
 {
-	struct trace_uprobe *tu = (struct trace_uprobe *)event->data;
+	struct trace_uprobe *tu = event->data;
 
 	switch (type) {
 	case TRACE_REG_REGISTER:
@@ -843,6 +915,23 @@ static int uprobe_dispatcher(struct uprobe_consumer *con, struct pt_regs *regs)
 	return ret;
 }
 
+static int uretprobe_dispatcher(struct uprobe_consumer *con,
+				unsigned long func, struct pt_regs *regs)
+{
+	struct trace_uprobe *tu;
+
+	tu = container_of(con, struct trace_uprobe, consumer);
+
+	if (tu->flags & TP_FLAG_TRACE)
+		uretprobe_trace_func(tu, func, regs);
+
+#ifdef CONFIG_PERF_EVENTS
+	if (tu->flags & TP_FLAG_PROFILE)
+		uretprobe_perf_func(tu, func, regs);
+#endif
+	return 0;
+}
+
 static struct trace_event_functions uprobe_funcs = {
 	.trace		= print_uprobe_event
 };
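
As a quick sanity check of the new entry layout, the SIZEOF_TRACE_ENTRY
macro from this file can be exercised in user space: a flexible vaddr[]
array replaces the old fixed ip field, holding one slot for a uprobe and
two (func plus return ip) for a uretprobe. A sketch, assuming a simplified
stand-in for struct trace_entry (the real one lives in the tracing core):

    #include <stdbool.h>
    #include <stdio.h>

    struct trace_entry {			/* simplified stand-in */
    	unsigned short	type;
    	unsigned char	flags;
    	unsigned char	preempt_count;
    	int		pid;
    };

    struct uprobe_trace_entry_head {
    	struct trace_entry	ent;
    	unsigned long		vaddr[];
    };

    #define SIZEOF_TRACE_ENTRY(is_return)			\
    	(sizeof(struct uprobe_trace_entry_head) +	\
    	 sizeof(unsigned long) * ((is_return) ? 2 : 1))

    int main(void)
    {
    	/* one vaddr slot for a uprobe, two for a uretprobe */
    	printf("uprobe entry:    %zu bytes\n", SIZEOF_TRACE_ENTRY(false));
    	printf("uretprobe entry: %zu bytes\n", SIZEOF_TRACE_ENTRY(true));
    	return 0;
    }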