|
@@ -25,6 +25,9 @@ struct reciprocal_value {
|
|
|
u8 sh1, sh2;
|
|
|
};
|
|
|
|
|
|
+/* "reciprocal_value" and "reciprocal_divide" together implement the basic
|
|
|
+ * version of the algorithm described in Figure 4.1 of the paper.
|
|
|
+ */
|
|
|
struct reciprocal_value reciprocal_value(u32 d);
|
|
|
|
|
|
static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
|
|
@@ -33,4 +36,69 @@ static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
|
|
|
return (t + ((a - t) >> R.sh1)) >> R.sh2;
|
|
|
}
|
|
|
|
|
|
+struct reciprocal_value_adv {
|
|
|
+ u32 m;
|
|
|
+ u8 sh, exp;
|
|
|
+ bool is_wide_m;
|
|
|
+};
|
|
|
+
|
|
|
+/* "reciprocal_value_adv" implements the advanced version of the algorithm
|
|
|
+ * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
|
|
|
+ * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
|
|
|
+ * exception case could be easily handled before calling "reciprocal_value_adv".
|
|
|
+ *
|
|
|
+ * The advanced version requires more complex calculation to get the reciprocal
|
|
|
+ * multiplier and other control variables, but then could reduce the required
|
|
|
+ * emulation operations.
|
|
|
+ *
|
|
|
+ * It makes no sense to use this advanced version for host divide emulation,
|
|
|
+ * those extra complexities for calculating multiplier etc could completely
|
|
|
+ * waive our saving on emulation operations.
|
|
|
+ *
|
|
|
+ * However, it makes sense to use it for JIT divide code generation for which
|
|
|
+ * we are willing to trade performance of JITed code with that of host. As shown
|
|
|
+ * by the following pseudo code, the required emulation operations could go down
|
|
|
+ * from 6 (the basic version) to 3 or 4.
|
|
|
+ *
|
|
|
+ * To use the result of "reciprocal_value_adv", suppose we want to calculate
|
|
|
+ * n/d, the pseudo C code will be:
|
|
|
+ *
|
|
|
+ * struct reciprocal_value_adv rvalue;
|
|
|
+ * u8 pre_shift, exp;
|
|
|
+ *
|
|
|
+ * // handle exception case.
|
|
|
+ * if (d >= (1U << 31)) {
|
|
|
+ * result = n >= d;
|
|
|
+ * return;
|
|
|
+ * }
|
|
|
+ *
|
|
|
+ * rvalue = reciprocal_value_adv(d, 32)
|
|
|
+ * exp = rvalue.exp;
|
|
|
+ * if (rvalue.is_wide_m && !(d & 1)) {
|
|
|
+ * // floor(log2(d & (2^32 -d)))
|
|
|
+ * pre_shift = fls(d & -d) - 1;
|
|
|
+ * rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
|
|
|
+ * } else {
|
|
|
+ * pre_shift = 0;
|
|
|
+ * }
|
|
|
+ *
|
|
|
+ * // code generation starts.
|
|
|
+ * if (imm == 1U << exp) {
|
|
|
+ * result = n >> exp;
|
|
|
+ * } else if (rvalue.is_wide_m) {
|
|
|
+ * // pre_shift must be zero when reached here.
|
|
|
+ * t = (n * rvalue.m) >> 32;
|
|
|
+ * result = n - t;
|
|
|
+ * result >>= 1;
|
|
|
+ * result += t;
|
|
|
+ * result >>= rvalue.sh - 1;
|
|
|
+ * } else {
|
|
|
+ * if (pre_shift)
|
|
|
+ * result = n >> pre_shift;
|
|
|
+ * result = ((u64)result * rvalue.m) >> 32;
|
|
|
+ * result >>= rvalue.sh;
|
|
|
+ * }
|
|
|
+ */
|
|
|
+struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
|
|
|
+
|
|
|
#endif /* _LINUX_RECIPROCAL_DIV_H */
|