@@ -196,6 +196,12 @@
#include "i915_drv.h"
#include "i915_oa_hsw.h"
+#include "i915_oa_bdw.h"
+#include "i915_oa_chv.h"
+#include "i915_oa_sklgt2.h"
+#include "i915_oa_sklgt3.h"
+#include "i915_oa_sklgt4.h"
+#include "i915_oa_bxt.h"
/* HW requires this to be a power of two, between 128k and 16M, though driver
|
|
|
* is currently generally designed assuming the largest 16M size is used such
|
|
@@ -215,7 +221,7 @@
|
|
|
*
|
|
|
* Although this can be observed explicitly while copying reports to userspace
|
|
|
* by checking for a zeroed report-id field in tail reports, we want to account
|
|
|
- * for this earlier, as part of the _oa_buffer_check to avoid lots of redundant
|
|
|
+ * for this earlier, as part of the oa_buffer_check to avoid lots of redundant
|
|
|
* read() attempts.
|
|
|
*
|
|
|
* In effect we define a tail pointer for reading that lags the real tail
|
|
@@ -237,7 +243,7 @@
|
|
|
* indicates that an updated tail pointer is needed.
|
|
|
*
|
|
|
* Most of the implementation details for this workaround are in
|
|
|
- * gen7_oa_buffer_check_unlocked() and gen7_appand_oa_reports()
|
|
|
+ * oa_buffer_check_unlocked() and _append_oa_reports()
|
|
|
*
|
|
|
* Note for posterity: previously the driver used to define an effective tail
|
|
|
* pointer that lagged the real pointer by a 'tail margin' measured in bytes
|
|
@@ -272,6 +278,13 @@ static u32 i915_perf_stream_paranoid = true;
|
|
|
|
|
|
#define INVALID_CTX_ID 0xffffffff
|
|
|
|
|
|
+/* On Gen8+ automatically triggered OA reports include a 'reason' field... */
|
|
|
+#define OAREPORT_REASON_MASK 0x3f
|
|
|
+#define OAREPORT_REASON_SHIFT 19
|
|
|
+#define OAREPORT_REASON_TIMER (1<<0)
|
|
|
+#define OAREPORT_REASON_CTX_SWITCH (1<<3)
|
|
|
+#define OAREPORT_REASON_CLK_RATIO (1<<5)
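+/*
+ * The reason is extracted from the first RPT_ID dword of each report,
+ * i.e. (report32[0] >> OAREPORT_REASON_SHIFT) & OAREPORT_REASON_MASK;
+ * a reason of zero is treated as a stale/invalid report when reading.
+ */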
|
|
|
+
|
|
|
|
|
|
/* For sysctl proc_dointvec_minmax of i915_oa_max_sample_rate
|
|
|
*
|
|
@@ -303,6 +316,13 @@ static struct i915_oa_format hsw_oa_formats[I915_OA_FORMAT_MAX] = {
|
|
|
[I915_OA_FORMAT_C4_B8] = { 7, 64 },
|
|
|
};
|
|
|
|
|
|
+static struct i915_oa_format gen8_plus_oa_formats[I915_OA_FORMAT_MAX] = {
|
|
|
+ [I915_OA_FORMAT_A12] = { 0, 64 },
|
|
|
+ [I915_OA_FORMAT_A12_B8_C8] = { 2, 128 },
|
|
|
+ [I915_OA_FORMAT_A32u40_A4u32_B8_C8] = { 5, 256 },
|
|
|
+ [I915_OA_FORMAT_C4_B8] = { 7, 64 },
|
|
|
+};
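+/*
+ * As with the HSW table above, each entry is
+ * { HW OA report format field value, report size in bytes }.
+ */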
|
|
|
+
|
|
|
#define SAMPLE_OA_REPORT (1<<0)
|
|
|
|
|
|
/**
|
|
@@ -332,8 +352,20 @@ struct perf_open_properties {
|
|
|
int oa_period_exponent;
|
|
|
};
|
|
|
|
|
|
+static u32 gen8_oa_hw_tail_read(struct drm_i915_private *dev_priv)
|
|
|
+{
|
|
|
+ return I915_READ(GEN8_OATAILPTR) & GEN8_OATAILPTR_MASK;
|
|
|
+}
|
|
|
+
|
|
|
+static u32 gen7_oa_hw_tail_read(struct drm_i915_private *dev_priv)
|
|
|
+{
|
|
|
+ u32 oastatus1 = I915_READ(GEN7_OASTATUS1);
|
|
|
+
|
|
|
+ return oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
|
|
|
+}
|
|
|
+
|
|
|
/**
|
|
|
- * gen7_oa_buffer_check_unlocked - check for data and update tail ptr state
|
|
|
+ * oa_buffer_check_unlocked - check for data and update tail ptr state
|
|
|
* @dev_priv: i915 device instance
|
|
|
*
|
|
|
* This is either called via fops (for blocking reads in user ctx) or the poll
|
|
@@ -356,12 +388,11 @@ struct perf_open_properties {
|
|
|
*
|
|
|
* Returns: %true if the OA buffer contains data, else %false
|
|
|
*/
|
|
|
-static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
|
|
|
+static bool oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
|
|
|
{
|
|
|
int report_size = dev_priv->perf.oa.oa_buffer.format_size;
|
|
|
unsigned long flags;
|
|
|
unsigned int aged_idx;
|
|
|
- u32 oastatus1;
|
|
|
u32 head, hw_tail, aged_tail, aging_tail;
|
|
|
u64 now;
|
|
|
|
|
@@ -381,8 +412,7 @@ static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
|
|
|
aged_tail = dev_priv->perf.oa.oa_buffer.tails[aged_idx].offset;
|
|
|
aging_tail = dev_priv->perf.oa.oa_buffer.tails[!aged_idx].offset;
|
|
|
|
|
|
- oastatus1 = I915_READ(GEN7_OASTATUS1);
|
|
|
- hw_tail = oastatus1 & GEN7_OASTATUS1_TAIL_MASK;
|
|
|
+ hw_tail = dev_priv->perf.oa.ops.oa_hw_tail_read(dev_priv);
|
|
|
|
|
|
/* The tail pointer increases in 64 byte increments,
|
|
|
* not in report_size steps...
|
|
@@ -404,6 +434,7 @@ static bool gen7_oa_buffer_check_unlocked(struct drm_i915_private *dev_priv)
|
|
|
if (aging_tail != INVALID_TAIL_PTR &&
|
|
|
((now - dev_priv->perf.oa.oa_buffer.aging_timestamp) >
|
|
|
OA_TAIL_MARGIN_NSEC)) {
|
|
|
+
|
|
|
aged_idx ^= 1;
|
|
|
dev_priv->perf.oa.oa_buffer.aged_tail_idx = aged_idx;
|
|
|
|
|
@@ -533,6 +564,300 @@ static int append_oa_sample(struct i915_perf_stream *stream,
|
|
|
return 0;
|
|
|
}
|
|
|
|
|
|
+/**
|
|
|
+ * gen8_append_oa_reports - Copies all buffered OA reports into userspace read() buffer.
|
|
|
+ * @stream: An i915-perf stream opened for OA metrics
|
|
|
+ * @buf: destination buffer given by userspace
|
|
|
+ * @count: the number of bytes userspace wants to read
|
|
|
+ * @offset: (inout): the current position for writing into @buf
|
|
|
+ *
|
|
|
+ * Notably any error condition resulting in a short read (-%ENOSPC or
|
|
|
+ * -%EFAULT) will be returned even though one or more records may
|
|
|
+ * have been successfully copied. In this case it's up to the caller
|
|
|
+ * to decide if the error should be squashed before returning to
|
|
|
+ * userspace.
|
|
|
+ *
|
|
|
+ * Note: reports are consumed from the head, and appended to the
|
|
|
+ * tail, so the tail chases the head?... If you think that's mad
|
|
|
+ * and back-to-front you're not alone, but this follows the
|
|
|
+ * Gen PRM naming convention.
|
|
|
+ *
|
|
|
+ * Returns: 0 on success, negative error code on failure.
|
|
|
+ */
|
|
|
+static int gen8_append_oa_reports(struct i915_perf_stream *stream,
|
|
|
+ char __user *buf,
|
|
|
+ size_t count,
|
|
|
+ size_t *offset)
|
|
|
+{
|
|
|
+ struct drm_i915_private *dev_priv = stream->dev_priv;
|
|
|
+ int report_size = dev_priv->perf.oa.oa_buffer.format_size;
|
|
|
+ u8 *oa_buf_base = dev_priv->perf.oa.oa_buffer.vaddr;
|
|
|
+ u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
|
|
|
+ u32 mask = (OA_BUFFER_SIZE - 1);
|
|
|
+ size_t start_offset = *offset;
|
|
|
+ unsigned long flags;
|
|
|
+ unsigned int aged_tail_idx;
|
|
|
+ u32 head, tail;
|
|
|
+ u32 taken;
|
|
|
+ int ret = 0;
|
|
|
+
|
|
|
+ if (WARN_ON(!stream->enabled))
|
|
|
+ return -EIO;
|
|
|
+
|
|
|
+ spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
|
|
|
+
|
|
|
+ head = dev_priv->perf.oa.oa_buffer.head;
|
|
|
+ aged_tail_idx = dev_priv->perf.oa.oa_buffer.aged_tail_idx;
|
|
|
+ tail = dev_priv->perf.oa.oa_buffer.tails[aged_tail_idx].offset;
|
|
|
+
|
|
|
+ spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * An invalid tail pointer here means we're still waiting for the poll
|
|
|
+ * hrtimer callback to give us a pointer
|
|
|
+ */
|
|
|
+ if (tail == INVALID_TAIL_PTR)
|
|
|
+ return -EAGAIN;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * NB: oa_buffer.head/tail include the gtt_offset which we don't want
|
|
|
+ * while indexing relative to oa_buf_base.
|
|
|
+ */
|
|
|
+ head -= gtt_offset;
|
|
|
+ tail -= gtt_offset;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * An out of bounds or misaligned head or tail pointer implies a driver
|
|
|
+ * bug since we validate + align the tail pointers we read from the
|
|
|
+ * hardware and we are in full control of the head pointer which should
|
|
|
+ * only be incremented by multiples of the report size (notably also
|
|
|
+ * all a power of two).
|
|
|
+ */
|
|
|
+ if (WARN_ONCE(head > OA_BUFFER_SIZE || head % report_size ||
|
|
|
+ tail > OA_BUFFER_SIZE || tail % report_size,
|
|
|
+ "Inconsistent OA buffer pointers: head = %u, tail = %u\n",
|
|
|
+ head, tail))
|
|
|
+ return -EIO;
|
|
|
+
|
|
|
+
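+ /*
+ * OA_TAKEN() yields the number of bytes between head and the aged
+ * tail modulo the (power of two) buffer size, so this loop consumes
+ * one report_size at a time until head catches up with the tail.
+ */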
|
|
|
+ for (/* none */;
|
|
|
+ (taken = OA_TAKEN(tail, head));
|
|
|
+ head = (head + report_size) & mask) {
|
|
|
+ u8 *report = oa_buf_base + head;
|
|
|
+ u32 *report32 = (void *)report;
|
|
|
+ u32 ctx_id;
|
|
|
+ u32 reason;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * All the report sizes factor neatly into the buffer
|
|
|
+ * size so we never expect to see a report split
|
|
|
+ * between the beginning and end of the buffer.
|
|
|
+ *
|
|
|
+ * Given the initial alignment check a misalignment
|
|
|
+ * here would imply a driver bug that would result
|
|
|
+ * in an overrun.
|
|
|
+ */
|
|
|
+ if (WARN_ON((OA_BUFFER_SIZE - head) < report_size)) {
|
|
|
+ DRM_ERROR("Spurious OA head ptr: non-integral report offset\n");
|
|
|
+ break;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * The reason field includes flags identifying what
|
|
|
+ * triggered this specific report (mostly timer
|
|
|
+ * triggered or e.g. due to a context switch).
|
|
|
+ *
|
|
|
+ * This field is never expected to be zero so we can
|
|
|
+ * check that the report isn't invalid before copying
|
|
|
+ * it to userspace...
|
|
|
+ */
|
|
|
+ reason = ((report32[0] >> OAREPORT_REASON_SHIFT) &
|
|
|
+ OAREPORT_REASON_MASK);
|
|
|
+ if (reason == 0) {
|
|
|
+ if (__ratelimit(&dev_priv->perf.oa.spurious_report_rs))
|
|
|
+ DRM_NOTE("Skipping spurious, invalid OA report\n");
|
|
|
+ continue;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * XXX: Just keep the lower 21 bits for now since I'm not
|
|
|
+ * entirely sure if the HW touches any of the higher bits in
|
|
|
+ * this field
|
|
|
+ */
|
|
|
+ ctx_id = report32[2] & 0x1fffff;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Squash whatever is in the CTX_ID field if it's marked as
|
|
|
+ * invalid to be sure we avoid false-positive, single-context
|
|
|
+ * filtering below...
|
|
|
+ *
|
|
|
+ * Note that we don't clear the valid_ctx_bit so userspace can
|
|
|
+ * understand that the ID has been squashed by the kernel.
|
|
|
+ */
|
|
|
+ if (!(report32[0] & dev_priv->perf.oa.gen8_valid_ctx_bit))
|
|
|
+ ctx_id = report32[2] = INVALID_CTX_ID;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * NB: For Gen 8 the OA unit no longer supports clock gating
|
|
|
+ * off for a specific context and the kernel can't securely
|
|
|
+ * stop the counters from updating as system-wide / global
|
|
|
+ * values.
|
|
|
+ *
|
|
|
+ * Automatic reports now include a context ID so reports can be
|
|
|
+ * filtered on the cpu but it's not worth trying to
|
|
|
+ * automatically subtract/hide counter progress for other
|
|
|
+ * contexts while filtering since we can't stop userspace
|
|
|
+ * issuing MI_REPORT_PERF_COUNT commands which would still
|
|
|
+ * provide a side-band view of the real values.
|
|
|
+ *
|
|
|
+ * To allow userspace (such as Mesa/GL_INTEL_performance_query)
|
|
|
+ * to normalize counters for a single filtered context, it needs
+ * to be forwarded bookend context-switch reports so that it
|
|
|
+ * can track switches in between MI_REPORT_PERF_COUNT commands
|
|
|
+ * and can itself subtract/ignore the progress of counters
|
|
|
+ * associated with other contexts. Note that the hardware
|
|
|
+ * automatically triggers reports when switching to a new
|
|
|
+ * context which are tagged with the ID of the newly active
|
|
|
+ * context. To avoid the complexity (and likely fragility) of
|
|
|
+ * reading ahead while parsing reports to try and minimize
|
|
|
+ * forwarding redundant context switch reports (i.e. between
|
|
|
+ * other, unrelated contexts) we simply elect to forward them
|
|
|
+ * all.
|
|
|
+ *
|
|
|
+ * We don't rely solely on the reason field to identify context
|
|
|
+ * switches since it's not uncommon for periodic samples to
|
|
|
+ * identify a switch before any 'context switch' report.
|
|
|
+ */
|
|
|
+ if (!dev_priv->perf.oa.exclusive_stream->ctx ||
|
|
|
+ dev_priv->perf.oa.specific_ctx_id == ctx_id ||
|
|
|
+ (dev_priv->perf.oa.oa_buffer.last_ctx_id ==
|
|
|
+ dev_priv->perf.oa.specific_ctx_id) ||
|
|
|
+ reason & OAREPORT_REASON_CTX_SWITCH) {
|
|
|
+
|
|
|
+ /*
|
|
|
+ * While filtering for a single context we avoid
|
|
|
+ * leaking the IDs of other contexts.
|
|
|
+ */
|
|
|
+ if (dev_priv->perf.oa.exclusive_stream->ctx &&
|
|
|
+ dev_priv->perf.oa.specific_ctx_id != ctx_id) {
|
|
|
+ report32[2] = INVALID_CTX_ID;
|
|
|
+ }
|
|
|
+
|
|
|
+ ret = append_oa_sample(stream, buf, count, offset,
|
|
|
+ report);
|
|
|
+ if (ret)
|
|
|
+ break;
|
|
|
+
|
|
|
+ dev_priv->perf.oa.oa_buffer.last_ctx_id = ctx_id;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * The above reason field sanity check is based on
|
|
|
+ * the assumption that the OA buffer is initially
|
|
|
+ * zeroed and we reset the field after copying so the
|
|
|
+ * check is still meaningful once old reports start
|
|
|
+ * being overwritten.
|
|
|
+ */
|
|
|
+ report32[0] = 0;
|
|
|
+ }
|
|
|
+
|
|
|
+ if (start_offset != *offset) {
|
|
|
+ spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We removed the gtt_offset for the copy loop above, indexing
|
|
|
+ * relative to oa_buf_base so put back here...
|
|
|
+ */
|
|
|
+ head += gtt_offset;
|
|
|
+
|
|
|
+ I915_WRITE(GEN8_OAHEADPTR, head & GEN8_OAHEADPTR_MASK);
|
|
|
+ dev_priv->perf.oa.oa_buffer.head = head;
|
|
|
+
|
|
|
+ spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
|
|
|
+ }
|
|
|
+
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
+/**
|
|
|
+ * gen8_oa_read - copy status records then buffered OA reports
|
|
|
+ * @stream: An i915-perf stream opened for OA metrics
|
|
|
+ * @buf: destination buffer given by userspace
|
|
|
+ * @count: the number of bytes userspace wants to read
|
|
|
+ * @offset: (inout): the current position for writing into @buf
|
|
|
+ *
|
|
|
+ * Checks OA unit status registers and if necessary appends corresponding
|
|
|
+ * status records for userspace (such as for a buffer full condition) and then
|
|
|
+ * initiates appending any buffered OA reports.
|
|
|
+ *
|
|
|
+ * Updates @offset according to the number of bytes successfully copied into
|
|
|
+ * the userspace buffer.
|
|
|
+ *
|
|
|
+ * NB: some data may be successfully copied to the userspace buffer
|
|
|
+ * even if an error is returned, and this is reflected in the
|
|
|
+ * updated @offset.
|
|
|
+ *
|
|
|
+ * Returns: zero on success or a negative error code
|
|
|
+ */
|
|
|
+static int gen8_oa_read(struct i915_perf_stream *stream,
|
|
|
+ char __user *buf,
|
|
|
+ size_t count,
|
|
|
+ size_t *offset)
|
|
|
+{
|
|
|
+ struct drm_i915_private *dev_priv = stream->dev_priv;
|
|
|
+ u32 oastatus;
|
|
|
+ int ret;
|
|
|
+
|
|
|
+ if (WARN_ON(!dev_priv->perf.oa.oa_buffer.vaddr))
|
|
|
+ return -EIO;
|
|
|
+
|
|
|
+ oastatus = I915_READ(GEN8_OASTATUS);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We treat OABUFFER_OVERFLOW as a significant error:
|
|
|
+ *
|
|
|
+ * Although theoretically we could handle this more gracefully
|
|
|
+ * sometimes, some Gens don't correctly suppress certain
|
|
|
+ * automatically triggered reports in this condition and so we
|
|
|
+ * have to assume that old reports are now being trampled
|
|
|
+ * over.
|
|
|
+ *
|
|
|
+ * Considering that we don't currently give userspace control
+ * over the OA buffer size and always configure a large 16MB
+ * buffer, a buffer overflow very likely indicates that
+ * something has gone quite badly wrong.
|
|
|
+ */
|
|
|
+ if (oastatus & GEN8_OASTATUS_OABUFFER_OVERFLOW) {
|
|
|
+ ret = append_oa_status(stream, buf, count, offset,
|
|
|
+ DRM_I915_PERF_RECORD_OA_BUFFER_LOST);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
+
|
|
|
+ DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
|
|
|
+ dev_priv->perf.oa.period_exponent);
|
|
|
+
|
|
|
+ dev_priv->perf.oa.ops.oa_disable(dev_priv);
|
|
|
+ dev_priv->perf.oa.ops.oa_enable(dev_priv);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Note: .oa_enable() is expected to re-init the oabuffer and
|
|
|
+ * reset GEN8_OASTATUS for us
|
|
|
+ */
|
|
|
+ oastatus = I915_READ(GEN8_OASTATUS);
|
|
|
+ }
|
|
|
+
|
|
|
+ if (oastatus & GEN8_OASTATUS_REPORT_LOST) {
|
|
|
+ ret = append_oa_status(stream, buf, count, offset,
|
|
|
+ DRM_I915_PERF_RECORD_OA_REPORT_LOST);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
+ I915_WRITE(GEN8_OASTATUS,
|
|
|
+ oastatus & ~GEN8_OASTATUS_REPORT_LOST);
|
|
|
+ }
|
|
|
+
|
|
|
+ return gen8_append_oa_reports(stream, buf, count, offset);
|
|
|
+}
|
|
|
+
|
|
|
/**
|
|
|
* Copies all buffered OA reports into userspace read() buffer.
|
|
|
* @stream: An i915-perf stream opened for OA metrics
|
|
@@ -732,7 +1057,8 @@ static int gen7_oa_read(struct i915_perf_stream *stream,
|
|
|
if (ret)
|
|
|
return ret;
|
|
|
|
|
|
- DRM_DEBUG("OA buffer overflow: force restart\n");
|
|
|
+ DRM_DEBUG("OA buffer overflow (exponent = %d): force restart\n",
|
|
|
+ dev_priv->perf.oa.period_exponent);
|
|
|
|
|
|
dev_priv->perf.oa.ops.oa_disable(dev_priv);
|
|
|
dev_priv->perf.oa.ops.oa_enable(dev_priv);
|
|
@@ -775,7 +1101,7 @@ static int i915_oa_wait_unlocked(struct i915_perf_stream *stream)
|
|
|
return -EIO;
|
|
|
|
|
|
return wait_event_interruptible(dev_priv->perf.oa.poll_wq,
|
|
|
- dev_priv->perf.oa.ops.oa_buffer_check(dev_priv));
|
|
|
+ oa_buffer_check_unlocked(dev_priv));
|
|
|
}
|
|
|
|
|
|
/**
|
|
@@ -832,30 +1158,38 @@ static int i915_oa_read(struct i915_perf_stream *stream,
|
|
|
static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
|
|
|
{
|
|
|
struct drm_i915_private *dev_priv = stream->dev_priv;
|
|
|
- struct intel_engine_cs *engine = dev_priv->engine[RCS];
|
|
|
- struct intel_ring *ring;
|
|
|
- int ret;
|
|
|
|
|
|
- ret = i915_mutex_lock_interruptible(&dev_priv->drm);
|
|
|
- if (ret)
|
|
|
- return ret;
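+ /*
+ * With execlists the context ID written into gen8+ OA reports
+ * corresponds to ctx->hw_id (the ID the driver programs into the
+ * context descriptor), so it can be used directly without
+ * pinning anything.
+ */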
|
|
|
+ if (i915.enable_execlists)
|
|
|
+ dev_priv->perf.oa.specific_ctx_id = stream->ctx->hw_id;
|
|
|
+ else {
|
|
|
+ struct intel_engine_cs *engine = dev_priv->engine[RCS];
|
|
|
+ struct intel_ring *ring;
|
|
|
+ int ret;
|
|
|
|
|
|
- /* As the ID is the gtt offset of the context's vma we pin
|
|
|
- * the vma to ensure the ID remains fixed.
|
|
|
- *
|
|
|
- * NB: implied RCS engine...
|
|
|
- */
|
|
|
- ring = engine->context_pin(engine, stream->ctx);
|
|
|
- mutex_unlock(&dev_priv->drm.struct_mutex);
|
|
|
- if (IS_ERR(ring))
|
|
|
- return PTR_ERR(ring);
|
|
|
+ ret = i915_mutex_lock_interruptible(&dev_priv->drm);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * As the ID is the gtt offset of the context's vma we
|
|
|
+ * pin the vma to ensure the ID remains fixed.
|
|
|
+ *
|
|
|
+ * NB: implied RCS engine...
|
|
|
+ */
|
|
|
+ ring = engine->context_pin(engine, stream->ctx);
|
|
|
+ mutex_unlock(&dev_priv->drm.struct_mutex);
|
|
|
+ if (IS_ERR(ring))
|
|
|
+ return PTR_ERR(ring);
|
|
|
|
|
|
- /* Explicitly track the ID (instead of calling i915_ggtt_offset()
|
|
|
- * on the fly) considering the difference with gen8+ and
|
|
|
- * execlists
|
|
|
- */
|
|
|
- dev_priv->perf.oa.specific_ctx_id =
|
|
|
- i915_ggtt_offset(stream->ctx->engine[engine->id].state);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Explicitly track the ID (instead of calling
|
|
|
+ * i915_ggtt_offset() on the fly) considering the difference
|
|
|
+ * with gen8+ and execlists
|
|
|
+ */
|
|
|
+ dev_priv->perf.oa.specific_ctx_id =
|
|
|
+ i915_ggtt_offset(stream->ctx->engine[engine->id].state);
|
|
|
+ }
|
|
|
|
|
|
return 0;
|
|
|
}
|
|
@@ -870,14 +1204,19 @@ static int oa_get_render_ctx_id(struct i915_perf_stream *stream)
|
|
|
static void oa_put_render_ctx_id(struct i915_perf_stream *stream)
|
|
|
{
|
|
|
struct drm_i915_private *dev_priv = stream->dev_priv;
|
|
|
- struct intel_engine_cs *engine = dev_priv->engine[RCS];
|
|
|
|
|
|
- mutex_lock(&dev_priv->drm.struct_mutex);
|
|
|
+ if (i915.enable_execlists) {
|
|
|
+ dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
|
|
|
+ } else {
|
|
|
+ struct intel_engine_cs *engine = dev_priv->engine[RCS];
|
|
|
|
|
|
- dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
|
|
|
- engine->context_unpin(engine, stream->ctx);
|
|
|
+ mutex_lock(&dev_priv->drm.struct_mutex);
|
|
|
|
|
|
- mutex_unlock(&dev_priv->drm.struct_mutex);
|
|
|
+ dev_priv->perf.oa.specific_ctx_id = INVALID_CTX_ID;
|
|
|
+ engine->context_unpin(engine, stream->ctx);
|
|
|
+
|
|
|
+ mutex_unlock(&dev_priv->drm.struct_mutex);
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
static void
|
|
@@ -901,6 +1240,12 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
|
|
|
|
|
|
BUG_ON(stream != dev_priv->perf.oa.exclusive_stream);
|
|
|
|
|
|
+ /*
|
|
|
+ * Unset exclusive_stream first, as it might be checked while
|
|
|
+ * disabling the metric set on gen8+.
|
|
|
+ */
|
|
|
+ dev_priv->perf.oa.exclusive_stream = NULL;
|
|
|
+
|
|
|
dev_priv->perf.oa.ops.disable_metric_set(dev_priv);
|
|
|
|
|
|
free_oa_buffer(dev_priv);
|
|
@@ -911,8 +1256,6 @@ static void i915_oa_stream_destroy(struct i915_perf_stream *stream)
|
|
|
if (stream->ctx)
|
|
|
oa_put_render_ctx_id(stream);
|
|
|
|
|
|
- dev_priv->perf.oa.exclusive_stream = NULL;
|
|
|
-
|
|
|
if (dev_priv->perf.oa.spurious_report_rs.missed) {
|
|
|
DRM_NOTE("%d spurious OA report notices suppressed due to ratelimiting\n",
|
|
|
dev_priv->perf.oa.spurious_report_rs.missed);
|
|
@@ -967,6 +1310,65 @@ static void gen7_init_oa_buffer(struct drm_i915_private *dev_priv)
|
|
|
dev_priv->perf.oa.pollin = false;
|
|
|
}
|
|
|
|
|
|
+static void gen8_init_oa_buffer(struct drm_i915_private *dev_priv)
|
|
|
+{
|
|
|
+ u32 gtt_offset = i915_ggtt_offset(dev_priv->perf.oa.oa_buffer.vma);
|
|
|
+ unsigned long flags;
|
|
|
+
|
|
|
+ spin_lock_irqsave(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
|
|
|
+
|
|
|
+ I915_WRITE(GEN8_OASTATUS, 0);
|
|
|
+ I915_WRITE(GEN8_OAHEADPTR, gtt_offset);
|
|
|
+ dev_priv->perf.oa.oa_buffer.head = gtt_offset;
|
|
|
+
|
|
|
+ I915_WRITE(GEN8_OABUFFER_UDW, 0);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * PRM says:
|
|
|
+ *
|
|
|
+ * "This MMIO must be set before the OATAILPTR
|
|
|
+ * register and after the OAHEADPTR register. This is
|
|
|
+ * to enable proper functionality of the overflow
|
|
|
+ * bit."
|
|
|
+ */
|
|
|
+ I915_WRITE(GEN8_OABUFFER, gtt_offset |
|
|
|
+ OABUFFER_SIZE_16M | OA_MEM_SELECT_GGTT);
|
|
|
+ I915_WRITE(GEN8_OATAILPTR, gtt_offset & GEN8_OATAILPTR_MASK);
|
|
|
+
|
|
|
+ /* Mark that we need updated tail pointers to read from... */
|
|
|
+ dev_priv->perf.oa.oa_buffer.tails[0].offset = INVALID_TAIL_PTR;
|
|
|
+ dev_priv->perf.oa.oa_buffer.tails[1].offset = INVALID_TAIL_PTR;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Reset state used to recognise context switches, affecting which
|
|
|
+ * reports we will forward to userspace while filtering for a single
|
|
|
+ * context.
|
|
|
+ */
|
|
|
+ dev_priv->perf.oa.oa_buffer.last_ctx_id = INVALID_CTX_ID;
|
|
|
+
|
|
|
+ spin_unlock_irqrestore(&dev_priv->perf.oa.oa_buffer.ptr_lock, flags);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * NB: although the OA buffer will initially be allocated
|
|
|
+ * zeroed via shmfs (and so this memset is redundant when
|
|
|
+ * first allocating), we may re-init the OA buffer, either
|
|
|
+ * when re-enabling a stream or in error/reset paths.
|
|
|
+ *
|
|
|
+ * The reason we clear the buffer for each re-init is for the
|
|
|
+ * sanity check in gen8_append_oa_reports() that looks at the
|
|
|
+ * reason field to make sure it's non-zero which relies on
|
|
|
+ * the assumption that new reports are being written to zeroed
|
|
|
+ * memory...
|
|
|
+ */
|
|
|
+ memset(dev_priv->perf.oa.oa_buffer.vaddr, 0, OA_BUFFER_SIZE);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Maybe make ->pollin per-stream state if we support multiple
|
|
|
+ * concurrent streams in the future.
|
|
|
+ */
|
|
|
+ dev_priv->perf.oa.pollin = false;
|
|
|
+}
|
|
|
+
|
|
|
static int alloc_oa_buffer(struct drm_i915_private *dev_priv)
|
|
|
{
|
|
|
struct drm_i915_gem_object *bo;
|
|
@@ -1114,6 +1516,324 @@ static void hsw_disable_metric_set(struct drm_i915_private *dev_priv)
|
|
|
~GT_NOA_ENABLE));
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * NB: It must always remain pointer safe to run this even if the OA unit
|
|
|
+ * has been disabled.
|
|
|
+ *
|
|
|
+ * It's fine to put out-of-date values into these per-context registers
|
|
|
+ * in the case that the OA unit has been disabled.
|
|
|
+ */
|
|
|
+static void gen8_update_reg_state_unlocked(struct i915_gem_context *ctx,
|
|
|
+ u32 *reg_state)
|
|
|
+{
|
|
|
+ struct drm_i915_private *dev_priv = ctx->i915;
|
|
|
+ const struct i915_oa_reg *flex_regs = dev_priv->perf.oa.flex_regs;
|
|
|
+ int n_flex_regs = dev_priv->perf.oa.flex_regs_len;
|
|
|
+ u32 ctx_oactxctrl = dev_priv->perf.oa.ctx_oactxctrl_offset;
|
|
|
+ u32 ctx_flexeu0 = dev_priv->perf.oa.ctx_flexeu0_offset;
|
|
|
+ /* The MMIO offsets for Flex EU registers aren't contiguous */
|
|
|
+ u32 flex_mmio[] = {
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL0),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL1),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL2),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL3),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL4),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL5),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL6),
|
|
|
+ };
|
|
|
+ int i;
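+ /*
+ * The register state image is a flat array of (MMIO offset, value)
+ * dword pairs: ctx_oactxctrl indexes the OACTXCONTROL entry and
+ * ctx_flexeu0 the first EU_PERF_CNTL entry, with the corresponding
+ * value stored at index + 1 in each case.
+ */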
|
|
|
+
|
|
|
+ reg_state[ctx_oactxctrl] = i915_mmio_reg_offset(GEN8_OACTXCONTROL);
|
|
|
+ reg_state[ctx_oactxctrl+1] = (dev_priv->perf.oa.period_exponent <<
|
|
|
+ GEN8_OA_TIMER_PERIOD_SHIFT) |
|
|
|
+ (dev_priv->perf.oa.periodic ?
|
|
|
+ GEN8_OA_TIMER_ENABLE : 0) |
|
|
|
+ GEN8_OA_COUNTER_RESUME;
|
|
|
+
|
|
|
+ for (i = 0; i < ARRAY_SIZE(flex_mmio); i++) {
|
|
|
+ u32 state_offset = ctx_flexeu0 + i * 2;
|
|
|
+ u32 mmio = flex_mmio[i];
|
|
|
+
|
|
|
+ /*
|
|
|
+ * This arbitrary default will select the 'EU FPU0 Pipeline
|
|
|
+ * Active' event. In the future it's anticipated that there
|
|
|
+ * will be an explicit 'No Event' we can select, but not yet...
|
|
|
+ */
|
|
|
+ u32 value = 0;
|
|
|
+ int j;
|
|
|
+
|
|
|
+ for (j = 0; j < n_flex_regs; j++) {
|
|
|
+ if (i915_mmio_reg_offset(flex_regs[j].addr) == mmio) {
|
|
|
+ value = flex_regs[j].value;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ reg_state[state_offset] = mmio;
|
|
|
+ reg_state[state_offset+1] = value;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Same as gen8_update_reg_state_unlocked, only through the batchbuffer. This
|
|
|
+ * is only used by the kernel context.
|
|
|
+ */
|
|
|
+static int gen8_emit_oa_config(struct drm_i915_gem_request *req)
|
|
|
+{
|
|
|
+ struct drm_i915_private *dev_priv = req->i915;
|
|
|
+ const struct i915_oa_reg *flex_regs = dev_priv->perf.oa.flex_regs;
|
|
|
+ int n_flex_regs = dev_priv->perf.oa.flex_regs_len;
|
|
|
+ /* The MMIO offsets for Flex EU registers aren't contiguous */
|
|
|
+ u32 flex_mmio[] = {
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL0),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL1),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL2),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL3),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL4),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL5),
|
|
|
+ i915_mmio_reg_offset(EU_PERF_CNTL6),
|
|
|
+ };
|
|
|
+ u32 *cs;
|
|
|
+ int i;
|
|
|
+
|
|
|
+ cs = intel_ring_begin(req, n_flex_regs * 2 + 4);
|
|
|
+ if (IS_ERR(cs))
|
|
|
+ return PTR_ERR(cs);
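+
+ /*
+ * A single MI_LOAD_REGISTER_IMM packet is emitted below: a header
+ * dword followed by (register offset, value) pairs covering
+ * OACTXCONTROL and each of the flex EU counter select registers.
+ */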
|
|
|
+
|
|
|
+ *cs++ = MI_LOAD_REGISTER_IMM(n_flex_regs + 1);
|
|
|
+
|
|
|
+ *cs++ = i915_mmio_reg_offset(GEN8_OACTXCONTROL);
|
|
|
+ *cs++ = (dev_priv->perf.oa.period_exponent << GEN8_OA_TIMER_PERIOD_SHIFT) |
|
|
|
+ (dev_priv->perf.oa.periodic ? GEN8_OA_TIMER_ENABLE : 0) |
|
|
|
+ GEN8_OA_COUNTER_RESUME;
|
|
|
+
|
|
|
+ for (i = 0; i < ARRAY_SIZE(flex_mmio); i++) {
|
|
|
+ u32 mmio = flex_mmio[i];
|
|
|
+
|
|
|
+ /*
|
|
|
+ * This arbitrary default will select the 'EU FPU0 Pipeline
|
|
|
+ * Active' event. In the future it's anticipated that there
|
|
|
+ * will be an explicit 'No Event' we can select, but not
|
|
|
+ * yet...
|
|
|
+ */
|
|
|
+ u32 value = 0;
|
|
|
+ int j;
|
|
|
+
|
|
|
+ for (j = 0; j < n_flex_regs; j++) {
|
|
|
+ if (i915_mmio_reg_offset(flex_regs[j].addr) == mmio) {
|
|
|
+ value = flex_regs[j].value;
|
|
|
+ break;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ *cs++ = mmio;
|
|
|
+ *cs++ = value;
|
|
|
+ }
|
|
|
+
|
|
|
+ *cs++ = MI_NOOP;
|
|
|
+ intel_ring_advance(req, cs);
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static int gen8_switch_to_updated_kernel_context(struct drm_i915_private *dev_priv)
|
|
|
+{
|
|
|
+ struct intel_engine_cs *engine = dev_priv->engine[RCS];
|
|
|
+ struct i915_gem_timeline *timeline;
|
|
|
+ struct drm_i915_gem_request *req;
|
|
|
+ int ret;
|
|
|
+
|
|
|
+ lockdep_assert_held(&dev_priv->drm.struct_mutex);
|
|
|
+
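+ /*
+ * Retiring completed requests first should mean the timeline walk
+ * below only queues this switch behind work that is still
+ * outstanding.
+ */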
|
|
|
+ i915_gem_retire_requests(dev_priv);
|
|
|
+
|
|
|
+ req = i915_gem_request_alloc(engine, dev_priv->kernel_context);
|
|
|
+ if (IS_ERR(req))
|
|
|
+ return PTR_ERR(req);
|
|
|
+
|
|
|
+ ret = gen8_emit_oa_config(req);
|
|
|
+ if (ret) {
|
|
|
+ i915_add_request(req);
|
|
|
+ return ret;
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Queue this switch after all other activity */
|
|
|
+ list_for_each_entry(timeline, &dev_priv->gt.timelines, link) {
|
|
|
+ struct drm_i915_gem_request *prev;
|
|
|
+ struct intel_timeline *tl;
|
|
|
+
|
|
|
+ tl = &timeline->engine[engine->id];
|
|
|
+ prev = i915_gem_active_raw(&tl->last_request,
|
|
|
+ &dev_priv->drm.struct_mutex);
|
|
|
+ if (prev)
|
|
|
+ i915_sw_fence_await_sw_fence_gfp(&req->submit,
|
|
|
+ &prev->submit,
|
|
|
+ GFP_KERNEL);
|
|
|
+ }
|
|
|
+
|
|
|
+ ret = i915_switch_context(req);
|
|
|
+ i915_add_request(req);
|
|
|
+
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
+/*
|
|
|
+ * Manages updating the per-context aspects of the OA stream
|
|
|
+ * configuration across all contexts.
|
|
|
+ *
|
|
|
+ * The awkward consideration here is that OACTXCONTROL controls the
|
|
|
+ * exponent for periodic sampling which is primarily used for system
|
|
|
+ * wide profiling where we'd like a consistent sampling period even in
|
|
|
+ * the face of context switches.
|
|
|
+ *
|
|
|
+ * Our approach of updating the register state context (as opposed to
|
|
|
+ * say using a workaround batch buffer) ensures that the hardware
|
|
|
+ * won't automatically reload an out-of-date timer exponent even
|
|
|
+ * transiently before a WA BB could be parsed.
|
|
|
+ *
|
|
|
+ * This function needs to:
|
|
|
+ * - Ensure the currently running context's per-context OA state is
|
|
|
+ * updated
|
|
|
+ * - Ensure that all existing contexts will have the correct per-context
|
|
|
+ * OA state if they are scheduled for use.
|
|
|
+ * - Ensure any new contexts will be initialized with the correct
|
|
|
+ * per-context OA state.
|
|
|
+ *
|
|
|
+ * Note: it's only the RCS/Render context that has any OA state.
|
|
|
+ */
|
|
|
+static int gen8_configure_all_contexts(struct drm_i915_private *dev_priv,
|
|
|
+ bool interruptible)
|
|
|
+{
|
|
|
+ struct i915_gem_context *ctx;
|
|
|
+ int ret;
|
|
|
+ unsigned int wait_flags = I915_WAIT_LOCKED;
|
|
|
+
|
|
|
+ if (interruptible) {
|
|
|
+ ret = i915_mutex_lock_interruptible(&dev_priv->drm);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
+
|
|
|
+ wait_flags |= I915_WAIT_INTERRUPTIBLE;
|
|
|
+ } else {
|
|
|
+ mutex_lock(&dev_priv->drm.struct_mutex);
|
|
|
+ }
|
|
|
+
|
|
|
+ /* Switch away from any user context. */
|
|
|
+ ret = gen8_switch_to_updated_kernel_context(dev_priv);
|
|
|
+ if (ret)
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * The OA register config is set up through the context image. This image
|
|
|
+ * might be written to by the GPU on context switch (in particular on
|
|
|
+ * lite-restore). This means we can't safely update a context's image,
|
|
|
+ * if this context is scheduled/submitted to run on the GPU.
|
|
|
+ *
|
|
|
+ * We could emit the OA register config through the batch buffer but
|
|
|
+ * this might leave a small interval of time where the OA unit is
|
|
|
+ * configured at an invalid sampling period.
|
|
|
+ *
|
|
|
+ * So far the best way to work around this issue seems to be draining
|
|
|
+ * the GPU from any submitted work.
|
|
|
+ */
|
|
|
+ ret = i915_gem_wait_for_idle(dev_priv, wait_flags);
|
|
|
+ if (ret)
|
|
|
+ goto out;
|
|
|
+
|
|
|
+ /* Update all contexts now that we've stalled the submission. */
|
|
|
+ list_for_each_entry(ctx, &dev_priv->context_list, link) {
|
|
|
+ struct intel_context *ce = &ctx->engine[RCS];
|
|
|
+ u32 *regs;
|
|
|
+
|
|
|
+ /* OA settings will be set upon first use */
|
|
|
+ if (!ce->state)
|
|
|
+ continue;
|
|
|
+
|
|
|
+ regs = i915_gem_object_pin_map(ce->state->obj, I915_MAP_WB);
|
|
|
+ if (IS_ERR(regs)) {
|
|
|
+ ret = PTR_ERR(regs);
|
|
|
+ goto out;
|
|
|
+ }
|
|
|
+
|
|
|
+ ce->state->obj->mm.dirty = true;
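+ /*
+ * The register state lives LRC_STATE_PN pages into the pinned
+ * context image, so skip ahead to it before patching in the new
+ * OA configuration.
+ */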
|
|
|
+ regs += LRC_STATE_PN * PAGE_SIZE / sizeof(*regs);
|
|
|
+
|
|
|
+ gen8_update_reg_state_unlocked(ctx, regs);
|
|
|
+
|
|
|
+ i915_gem_object_unpin_map(ce->state->obj);
|
|
|
+ }
|
|
|
+
|
|
|
+ out:
|
|
|
+ mutex_unlock(&dev_priv->drm.struct_mutex);
|
|
|
+
|
|
|
+ return ret;
|
|
|
+}
|
|
|
+
|
|
|
+static int gen8_enable_metric_set(struct drm_i915_private *dev_priv)
|
|
|
+{
|
|
|
+ int ret = dev_priv->perf.oa.ops.select_metric_set(dev_priv);
|
|
|
+ int i;
|
|
|
+
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * We disable slice/unslice clock ratio change reports on SKL since
|
|
|
+ * they are too noisy. The HW generates a lot of redundant reports
|
|
|
+ * where the ratio hasn't really changed, causing a lot of redundant
+ * work for the processes reading the reports and increasing the
+ * chances we'll hit buffer
|
|
|
+ * overruns.
|
|
|
+ *
|
|
|
+ * Although we don't currently use the 'disable overrun' OABUFFER
|
|
|
+ * feature it's worth noting that clock ratio reports have to be
|
|
|
+ * disabled before considering to use that feature since the HW doesn't
|
|
|
+ * correctly block these reports.
|
|
|
+ *
|
|
|
+ * Currently none of the high-level metrics we have depend on knowing
|
|
|
+ * this ratio to normalize.
|
|
|
+ *
|
|
|
+ * Note: This register is not power context saved and restored, but
|
|
|
+ * that's OK considering that we disable RC6 while the OA unit is
|
|
|
+ * enabled.
|
|
|
+ *
|
|
|
+ * The _INCLUDE_CLK_RATIO bit allows the slice/unslice frequency to
|
|
|
+ * be read back from automatically triggered reports, as part of the
|
|
|
+ * RPT_ID field.
|
|
|
+ */
|
|
|
+ if (IS_SKYLAKE(dev_priv) || IS_BROXTON(dev_priv)) {
|
|
|
+ I915_WRITE(GEN8_OA_DEBUG,
|
|
|
+ _MASKED_BIT_ENABLE(GEN9_OA_DEBUG_DISABLE_CLK_RATIO_REPORTS |
|
|
|
+ GEN9_OA_DEBUG_INCLUDE_CLK_RATIO));
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Update all contexts prior to writing the mux configurations, as we need
|
|
|
+ * to make sure all slices/subslices are ON before writing to NOA
|
|
|
+ * registers.
|
|
|
+ */
|
|
|
+ ret = gen8_configure_all_contexts(dev_priv, true);
|
|
|
+ if (ret)
|
|
|
+ return ret;
|
|
|
+
|
|
|
+ I915_WRITE(GDT_CHICKEN_BITS, 0xA0);
|
|
|
+ for (i = 0; i < dev_priv->perf.oa.n_mux_configs; i++) {
|
|
|
+ config_oa_regs(dev_priv, dev_priv->perf.oa.mux_regs[i],
|
|
|
+ dev_priv->perf.oa.mux_regs_lens[i]);
|
|
|
+ }
|
|
|
+ I915_WRITE(GDT_CHICKEN_BITS, 0x80);
|
|
|
+
|
|
|
+ config_oa_regs(dev_priv, dev_priv->perf.oa.b_counter_regs,
|
|
|
+ dev_priv->perf.oa.b_counter_regs_len);
|
|
|
+
|
|
|
+ return 0;
|
|
|
+}
|
|
|
+
|
|
|
+static void gen8_disable_metric_set(struct drm_i915_private *dev_priv)
|
|
|
+{
|
|
|
+ /* Reset all contexts' slices/subslices configurations. */
|
|
|
+ gen8_configure_all_contexts(dev_priv, false);
|
|
|
+}
|
|
|
+
|
|
|
static void gen7_update_oacontrol_locked(struct drm_i915_private *dev_priv)
|
|
|
{
|
|
|
lockdep_assert_held(&dev_priv->perf.hook_lock);
|
|
@@ -1158,6 +1878,31 @@ static void gen7_oa_enable(struct drm_i915_private *dev_priv)
|
|
|
spin_unlock_irqrestore(&dev_priv->perf.hook_lock, flags);
|
|
|
}
|
|
|
|
|
|
+static void gen8_oa_enable(struct drm_i915_private *dev_priv)
|
|
|
+{
|
|
|
+ u32 report_format = dev_priv->perf.oa.oa_buffer.format;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Reset buf pointers so we don't forward reports from before now.
|
|
|
+ *
|
|
|
+ * Think carefully before trying to avoid this, since it
|
|
|
+ * also ensures status flags and the buffer itself are cleared
|
|
|
+ * in error paths, and we have checks for invalid reports based
|
|
|
+ * on the assumption that certain fields are written to zeroed
|
|
|
+ * memory, which this helps maintain.
|
|
|
+ */
|
|
|
+ gen8_init_oa_buffer(dev_priv);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Note: we don't rely on the hardware to perform single context
|
|
|
+ * filtering and instead filter on the cpu based on the context-id
|
|
|
+ * field of reports
|
|
|
+ */
|
|
|
+ I915_WRITE(GEN8_OACONTROL, (report_format <<
|
|
|
+ GEN8_OA_REPORT_FORMAT_SHIFT) |
|
|
|
+ GEN8_OA_COUNTER_ENABLE);
|
|
|
+}
|
|
|
+
|
|
|
/**
|
|
|
* i915_oa_stream_enable - handle `I915_PERF_IOCTL_ENABLE` for OA stream
|
|
|
* @stream: An i915 perf stream opened for OA metrics
|
|
@@ -1184,6 +1929,11 @@ static void gen7_oa_disable(struct drm_i915_private *dev_priv)
|
|
|
I915_WRITE(GEN7_OACONTROL, 0);
|
|
|
}
|
|
|
|
|
|
+static void gen8_oa_disable(struct drm_i915_private *dev_priv)
|
|
|
+{
|
|
|
+ I915_WRITE(GEN8_OACONTROL, 0);
|
|
|
+}
|
|
|
+
|
|
|
/**
|
|
|
* i915_oa_stream_disable - handle `I915_PERF_IOCTL_DISABLE` for OA stream
|
|
|
* @stream: An i915 perf stream opened for OA metrics
|
|
@@ -1362,6 +2112,21 @@ err_oa_buf_alloc:
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
|
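+/*
+ * Expected to be called while a context's register state image is being
+ * populated (gen8+ / execlists) so that new contexts start out with the
+ * current OA configuration; only the render engine carries any OA state.
+ */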
+void i915_oa_init_reg_state(struct intel_engine_cs *engine,
|
|
|
+ struct i915_gem_context *ctx,
|
|
|
+ u32 *reg_state)
|
|
|
+{
|
|
|
+ struct drm_i915_private *dev_priv = engine->i915;
|
|
|
+
|
|
|
+ if (engine->id != RCS)
|
|
|
+ return;
|
|
|
+
|
|
|
+ if (!dev_priv->perf.initialized)
|
|
|
+ return;
|
|
|
+
|
|
|
+ gen8_update_reg_state_unlocked(ctx, reg_state);
|
|
|
+}
|
|
|
+
|
|
|
/**
|
|
|
* i915_perf_read_locked - &i915_perf_stream_ops->read with error normalisation
|
|
|
* @stream: An i915 perf stream
|
|
@@ -1487,7 +2252,7 @@ static enum hrtimer_restart oa_poll_check_timer_cb(struct hrtimer *hrtimer)
|
|
|
container_of(hrtimer, typeof(*dev_priv),
|
|
|
perf.oa.poll_check_timer);
|
|
|
|
|
|
- if (dev_priv->perf.oa.ops.oa_buffer_check(dev_priv)) {
|
|
|
+ if (oa_buffer_check_unlocked(dev_priv)) {
|
|
|
dev_priv->perf.oa.pollin = true;
|
|
|
wake_up(&dev_priv->perf.oa.poll_wq);
|
|
|
}
|
|
@@ -1776,6 +2541,7 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
|
|
|
struct i915_gem_context *specific_ctx = NULL;
|
|
|
struct i915_perf_stream *stream = NULL;
|
|
|
unsigned long f_flags = 0;
|
|
|
+ bool privileged_op = true;
|
|
|
int stream_fd;
|
|
|
int ret;
|
|
|
|
|
@@ -1793,12 +2559,29 @@ i915_perf_open_ioctl_locked(struct drm_i915_private *dev_priv,
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+ /*
|
|
|
+ * On Haswell the OA unit supports clock gating off for a specific
|
|
|
+ * context and in this mode there's no visibility of metrics for the
|
|
|
+ * rest of the system, which we consider acceptable for a
|
|
|
+ * non-privileged client.
|
|
|
+ *
|
|
|
+ * For Gen8+ the OA unit no longer supports clock gating off for a
|
|
|
+ * specific context and the kernel can't securely stop the counters
|
|
|
+ * from updating as system-wide / global values. Even though we can
|
|
|
+ * filter reports based on the included context ID we can't block
|
|
|
+ * clients from seeing the raw / global counter values via
|
|
|
+ * MI_REPORT_PERF_COUNT commands and so we consider it a privileged op to
|
|
|
+ * enable the OA unit by default.
|
|
|
+ */
|
|
|
+ if (IS_HASWELL(dev_priv) && specific_ctx)
|
|
|
+ privileged_op = false;
|
|
|
+
|
|
|
/* Similar to perf's kernel.perf_paranoid_cpu sysctl option
|
|
|
* we check a dev.i915.perf_stream_paranoid sysctl option
|
|
|
* to determine if it's ok to access system wide OA counters
|
|
|
* without CAP_SYS_ADMIN privileges.
|
|
|
*/
|
|
|
- if (!specific_ctx &&
|
|
|
+ if (privileged_op &&
|
|
|
i915_perf_stream_paranoid && !capable(CAP_SYS_ADMIN)) {
|
|
|
DRM_DEBUG("Insufficient privileges to open system-wide i915 perf stream\n");
|
|
|
ret = -EACCES;
|
|
@@ -2070,9 +2853,6 @@ int i915_perf_open_ioctl(struct drm_device *dev, void *data,
|
|
|
*/
|
|
|
void i915_perf_register(struct drm_i915_private *dev_priv)
|
|
|
{
|
|
|
- if (!IS_HASWELL(dev_priv))
|
|
|
- return;
|
|
|
-
|
|
|
if (!dev_priv->perf.initialized)
|
|
|
return;
|
|
|
|
|
@@ -2088,11 +2868,38 @@ void i915_perf_register(struct drm_i915_private *dev_priv)
|
|
|
if (!dev_priv->perf.metrics_kobj)
|
|
|
goto exit;
|
|
|
|
|
|
- if (i915_perf_register_sysfs_hsw(dev_priv)) {
|
|
|
- kobject_put(dev_priv->perf.metrics_kobj);
|
|
|
- dev_priv->perf.metrics_kobj = NULL;
|
|
|
+ if (IS_HASWELL(dev_priv)) {
|
|
|
+ if (i915_perf_register_sysfs_hsw(dev_priv))
|
|
|
+ goto sysfs_error;
|
|
|
+ } else if (IS_BROADWELL(dev_priv)) {
|
|
|
+ if (i915_perf_register_sysfs_bdw(dev_priv))
|
|
|
+ goto sysfs_error;
|
|
|
+ } else if (IS_CHERRYVIEW(dev_priv)) {
|
|
|
+ if (i915_perf_register_sysfs_chv(dev_priv))
|
|
|
+ goto sysfs_error;
|
|
|
+ } else if (IS_SKYLAKE(dev_priv)) {
|
|
|
+ if (IS_SKL_GT2(dev_priv)) {
|
|
|
+ if (i915_perf_register_sysfs_sklgt2(dev_priv))
|
|
|
+ goto sysfs_error;
|
|
|
+ } else if (IS_SKL_GT3(dev_priv)) {
|
|
|
+ if (i915_perf_register_sysfs_sklgt3(dev_priv))
|
|
|
+ goto sysfs_error;
|
|
|
+ } else if (IS_SKL_GT4(dev_priv)) {
|
|
|
+ if (i915_perf_register_sysfs_sklgt4(dev_priv))
|
|
|
+ goto sysfs_error;
|
|
|
+ } else
|
|
|
+ goto sysfs_error;
|
|
|
+ } else if (IS_BROXTON(dev_priv)) {
|
|
|
+ if (i915_perf_register_sysfs_bxt(dev_priv))
|
|
|
+ goto sysfs_error;
|
|
|
}
|
|
|
|
|
|
+ goto exit;
|
|
|
+
|
|
|
+sysfs_error:
|
|
|
+ kobject_put(dev_priv->perf.metrics_kobj);
|
|
|
+ dev_priv->perf.metrics_kobj = NULL;
|
|
|
+
|
|
|
exit:
|
|
|
mutex_unlock(&dev_priv->perf.lock);
|
|
|
}
|
|
@@ -2108,13 +2915,24 @@ exit:
|
|
|
*/
|
|
|
void i915_perf_unregister(struct drm_i915_private *dev_priv)
|
|
|
{
|
|
|
- if (!IS_HASWELL(dev_priv))
|
|
|
- return;
|
|
|
-
|
|
|
if (!dev_priv->perf.metrics_kobj)
|
|
|
return;
|
|
|
|
|
|
- i915_perf_unregister_sysfs_hsw(dev_priv);
|
|
|
+ if (IS_HASWELL(dev_priv))
|
|
|
+ i915_perf_unregister_sysfs_hsw(dev_priv);
|
|
|
+ else if (IS_BROADWELL(dev_priv))
|
|
|
+ i915_perf_unregister_sysfs_bdw(dev_priv);
|
|
|
+ else if (IS_CHERRYVIEW(dev_priv))
|
|
|
+ i915_perf_unregister_sysfs_chv(dev_priv);
|
|
|
+ else if (IS_SKYLAKE(dev_priv)) {
|
|
|
+ if (IS_SKL_GT2(dev_priv))
|
|
|
+ i915_perf_unregister_sysfs_sklgt2(dev_priv);
|
|
|
+ else if (IS_SKL_GT3(dev_priv))
|
|
|
+ i915_perf_unregister_sysfs_sklgt3(dev_priv);
|
|
|
+ else if (IS_SKL_GT4(dev_priv))
|
|
|
+ i915_perf_unregister_sysfs_sklgt4(dev_priv);
|
|
|
+ } else if (IS_BROXTON(dev_priv))
|
|
|
+ i915_perf_unregister_sysfs_bxt(dev_priv);
|
|
|
|
|
|
kobject_put(dev_priv->perf.metrics_kobj);
|
|
|
dev_priv->perf.metrics_kobj = NULL;
|
|
@@ -2173,36 +2991,105 @@ static struct ctl_table dev_root[] = {
|
|
|
*/
|
|
|
void i915_perf_init(struct drm_i915_private *dev_priv)
|
|
|
{
|
|
|
- if (!IS_HASWELL(dev_priv))
|
|
|
- return;
|
|
|
-
|
|
|
- hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
|
|
|
- CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
|
|
- dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
|
|
|
- init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
|
|
|
+ dev_priv->perf.oa.n_builtin_sets = 0;
|
|
|
+
|
|
|
+ if (IS_HASWELL(dev_priv)) {
|
|
|
+ dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
|
|
|
+ dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
|
|
|
+ dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
|
|
|
+ dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
|
|
|
+ dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
|
|
|
+ dev_priv->perf.oa.ops.read = gen7_oa_read;
|
|
|
+ dev_priv->perf.oa.ops.oa_hw_tail_read =
|
|
|
+ gen7_oa_hw_tail_read;
|
|
|
+
|
|
|
+ dev_priv->perf.oa.oa_formats = hsw_oa_formats;
|
|
|
+
|
|
|
+ dev_priv->perf.oa.n_builtin_sets =
|
|
|
+ i915_oa_n_builtin_metric_sets_hsw;
|
|
|
+ } else if (i915.enable_execlists) {
|
|
|
+ /* Note that although we could theoretically also support the
|
|
|
+ * legacy ringbuffer mode on BDW (and earlier iterations of
|
|
|
+ * this driver, before upstreaming, did this) it didn't seem
|
|
|
+ * worth the complexity to maintain now that BDW+ enable
|
|
|
+ * execlist mode by default.
|
|
|
+ */
|
|
|
|
|
|
- INIT_LIST_HEAD(&dev_priv->perf.streams);
|
|
|
- mutex_init(&dev_priv->perf.lock);
|
|
|
- spin_lock_init(&dev_priv->perf.hook_lock);
|
|
|
- spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
|
|
|
+ if (IS_GEN8(dev_priv)) {
|
|
|
+ dev_priv->perf.oa.ctx_oactxctrl_offset = 0x120;
|
|
|
+ dev_priv->perf.oa.ctx_flexeu0_offset = 0x2ce;
|
|
|
+ dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<25);
|
|
|
+
|
|
|
+ if (IS_BROADWELL(dev_priv)) {
|
|
|
+ dev_priv->perf.oa.n_builtin_sets =
|
|
|
+ i915_oa_n_builtin_metric_sets_bdw;
|
|
|
+ dev_priv->perf.oa.ops.select_metric_set =
|
|
|
+ i915_oa_select_metric_set_bdw;
|
|
|
+ } else if (IS_CHERRYVIEW(dev_priv)) {
|
|
|
+ dev_priv->perf.oa.n_builtin_sets =
|
|
|
+ i915_oa_n_builtin_metric_sets_chv;
|
|
|
+ dev_priv->perf.oa.ops.select_metric_set =
|
|
|
+ i915_oa_select_metric_set_chv;
|
|
|
+ }
|
|
|
+ } else if (IS_GEN9(dev_priv)) {
|
|
|
+ dev_priv->perf.oa.ctx_oactxctrl_offset = 0x128;
|
|
|
+ dev_priv->perf.oa.ctx_flexeu0_offset = 0x3de;
|
|
|
+ dev_priv->perf.oa.gen8_valid_ctx_bit = (1<<16);
|
|
|
+
|
|
|
+ if (IS_SKL_GT2(dev_priv)) {
|
|
|
+ dev_priv->perf.oa.n_builtin_sets =
|
|
|
+ i915_oa_n_builtin_metric_sets_sklgt2;
|
|
|
+ dev_priv->perf.oa.ops.select_metric_set =
|
|
|
+ i915_oa_select_metric_set_sklgt2;
|
|
|
+ } else if (IS_SKL_GT3(dev_priv)) {
|
|
|
+ dev_priv->perf.oa.n_builtin_sets =
|
|
|
+ i915_oa_n_builtin_metric_sets_sklgt3;
|
|
|
+ dev_priv->perf.oa.ops.select_metric_set =
|
|
|
+ i915_oa_select_metric_set_sklgt3;
|
|
|
+ } else if (IS_SKL_GT4(dev_priv)) {
|
|
|
+ dev_priv->perf.oa.n_builtin_sets =
|
|
|
+ i915_oa_n_builtin_metric_sets_sklgt4;
|
|
|
+ dev_priv->perf.oa.ops.select_metric_set =
|
|
|
+ i915_oa_select_metric_set_sklgt4;
|
|
|
+ } else if (IS_BROXTON(dev_priv)) {
|
|
|
+ dev_priv->perf.oa.n_builtin_sets =
|
|
|
+ i915_oa_n_builtin_metric_sets_bxt;
|
|
|
+ dev_priv->perf.oa.ops.select_metric_set =
|
|
|
+ i915_oa_select_metric_set_bxt;
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
- dev_priv->perf.oa.ops.init_oa_buffer = gen7_init_oa_buffer;
|
|
|
- dev_priv->perf.oa.ops.enable_metric_set = hsw_enable_metric_set;
|
|
|
- dev_priv->perf.oa.ops.disable_metric_set = hsw_disable_metric_set;
|
|
|
- dev_priv->perf.oa.ops.oa_enable = gen7_oa_enable;
|
|
|
- dev_priv->perf.oa.ops.oa_disable = gen7_oa_disable;
|
|
|
- dev_priv->perf.oa.ops.read = gen7_oa_read;
|
|
|
- dev_priv->perf.oa.ops.oa_buffer_check =
|
|
|
- gen7_oa_buffer_check_unlocked;
|
|
|
+ if (dev_priv->perf.oa.n_builtin_sets) {
|
|
|
+ dev_priv->perf.oa.ops.init_oa_buffer = gen8_init_oa_buffer;
|
|
|
+ dev_priv->perf.oa.ops.enable_metric_set =
|
|
|
+ gen8_enable_metric_set;
|
|
|
+ dev_priv->perf.oa.ops.disable_metric_set =
|
|
|
+ gen8_disable_metric_set;
|
|
|
+ dev_priv->perf.oa.ops.oa_enable = gen8_oa_enable;
|
|
|
+ dev_priv->perf.oa.ops.oa_disable = gen8_oa_disable;
|
|
|
+ dev_priv->perf.oa.ops.read = gen8_oa_read;
|
|
|
+ dev_priv->perf.oa.ops.oa_hw_tail_read =
|
|
|
+ gen8_oa_hw_tail_read;
|
|
|
+
|
|
|
+ dev_priv->perf.oa.oa_formats = gen8_plus_oa_formats;
|
|
|
+ }
|
|
|
+ }
|
|
|
|
|
|
- dev_priv->perf.oa.oa_formats = hsw_oa_formats;
|
|
|
+ if (dev_priv->perf.oa.n_builtin_sets) {
|
|
|
+ hrtimer_init(&dev_priv->perf.oa.poll_check_timer,
|
|
|
+ CLOCK_MONOTONIC, HRTIMER_MODE_REL);
|
|
|
+ dev_priv->perf.oa.poll_check_timer.function = oa_poll_check_timer_cb;
|
|
|
+ init_waitqueue_head(&dev_priv->perf.oa.poll_wq);
|
|
|
|
|
|
- dev_priv->perf.oa.n_builtin_sets =
|
|
|
- i915_oa_n_builtin_metric_sets_hsw;
|
|
|
+ INIT_LIST_HEAD(&dev_priv->perf.streams);
|
|
|
+ mutex_init(&dev_priv->perf.lock);
|
|
|
+ spin_lock_init(&dev_priv->perf.hook_lock);
|
|
|
+ spin_lock_init(&dev_priv->perf.oa.oa_buffer.ptr_lock);
|
|
|
|
|
|
- dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
|
|
|
+ dev_priv->perf.sysctl_header = register_sysctl_table(dev_root);
|
|
|
|
|
|
- dev_priv->perf.initialized = true;
|
|
|
+ dev_priv->perf.initialized = true;
|
|
|
+ }
|
|
|
}
|
|
|
|
|
|
/**
|
|
@@ -2217,5 +3104,6 @@ void i915_perf_fini(struct drm_i915_private *dev_priv)
|
|
|
unregister_sysctl_table(dev_priv->perf.sysctl_header);
|
|
|
|
|
|
memset(&dev_priv->perf.oa.ops, 0, sizeof(dev_priv->perf.oa.ops));
|
|
|
+
|
|
|
dev_priv->perf.initialized = false;
|
|
|
}
|