|
|
@@ -26,6 +26,7 @@
|
|
|
#include <linux/ptrace.h>
|
|
|
#include <linux/nvme_ioctl.h>
|
|
|
#include <linux/t10-pi.h>
|
|
|
+#include <linux/pm_qos.h>
|
|
|
#include <scsi/sg.h>
|
|
|
#include <asm/unaligned.h>
|
|
|
|
|
|
@@ -56,6 +57,11 @@ EXPORT_SYMBOL_GPL(nvme_max_retries);
|
|
|
static int nvme_char_major;
|
|
|
module_param(nvme_char_major, int, 0);
|
|
|
|
|
|
+static unsigned long default_ps_max_latency_us = 25000;
|
|
|
+module_param(default_ps_max_latency_us, ulong, 0644);
|
|
|
+MODULE_PARM_DESC(default_ps_max_latency_us,
|
|
|
+ "max power saving latency for new devices; use PM QOS to change per device");
|
|
|
+
|
|
|
static LIST_HEAD(nvme_ctrl_list);
|
|
|
static DEFINE_SPINLOCK(dev_list_lock);
|
|
|
|
|
|
@@ -1252,6 +1258,122 @@ static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
|
|
|
blk_queue_write_cache(q, vwc, vwc);
|
|
|
}
|
|
|
|
|
|
+static void nvme_configure_apst(struct nvme_ctrl *ctrl)
+{
+	/*
+	 * APST (Autonomous Power State Transition) lets us program a
+	 * table of power state transitions that the controller will
+	 * perform automatically.  We configure it with a simple
+	 * heuristic: we are willing to spend at most 2% of the time
+	 * transitioning between power states.  Therefore, when running
+	 * in any given state, we will enter the next lower-power
+	 * non-operational state after waiting 50 * (enlat + exlat)
+	 * microseconds, as long as that state's total latency is under
+	 * the requested maximum latency.
+	 *
+	 * We will not autonomously enter any non-operational state for
+	 * which the total latency exceeds ps_max_latency_us.  Users
+	 * can set ps_max_latency_us to zero to turn off APST.
+	 */
+
+	unsigned apste;
+	struct nvme_feat_auto_pst *table;
+	int ret;
+
+	/*
+	 * If APST isn't supported or if we haven't been initialized yet,
+	 * then don't do anything.
+	 */
+	if (!ctrl->apsta)
+		return;
+
+	if (ctrl->npss > 31) {
+		dev_warn(ctrl->device, "NPSS is invalid; not using APST\n");
+		return;
+	}
+
+	table = kzalloc(sizeof(*table), GFP_KERNEL);
+	if (!table)
+		return;
+
+	if (ctrl->ps_max_latency_us == 0) {
+		/* Turn off APST; the table is sent all-zero. */
+		apste = 0;
+	} else {
+		__le64 target = cpu_to_le64(0);
+		int state;
+
+		/*
+		 * Walk through all states from lowest- to highest-power.
+		 * According to the spec, lower-numbered states use more
+		 * power.  NPSS, despite the name, is the index of the
+		 * lowest-power state, not the number of states.
+		 */
+		for (state = (int)ctrl->npss; state >= 0; state--) {
+			u64 total_latency_us, transition_ms;
+
+			if (target)
+				table->entries[state] = target;
+
+			/*
+			 * Is this state a useful non-operational state for
+			 * higher-power states to autonomously transition to?
+			 */
+			if (!(ctrl->psd[state].flags &
+			      NVME_PS_FLAGS_NON_OP_STATE))
+				continue;
+
+			total_latency_us =
+				(u64)le32_to_cpu(ctrl->psd[state].entry_lat) +
+				le32_to_cpu(ctrl->psd[state].exit_lat);
+			if (total_latency_us > ctrl->ps_max_latency_us)
+				continue;
+
+			/*
+			 * This state is good: make it the idle target for
+			 * higher-power states after ceil(latency / 20) ms,
+			 * clamped to 24 bits.
+			 */
+			transition_ms = total_latency_us + 19;
+			do_div(transition_ms, 20);
+			if (transition_ms > (1 << 24) - 1)
+				transition_ms = (1 << 24) - 1;
+
+			target = cpu_to_le64((state << 3) |
+					     (transition_ms << 8));
+		}
+
+		apste = 1;
+	}
+
+	ret = nvme_set_features(ctrl, NVME_FEAT_AUTO_PST, apste,
+				table, sizeof(*table), NULL);
+	if (ret)
+		dev_err(ctrl->device, "failed to set APST feature (%d)\n", ret);
+
+	kfree(table);
+}
|
|
|
+
|
|
|
+/* PM QoS set_latency_tolerance hook: record the limit, redo APST. */
+static void nvme_set_latency_tolerance(struct device *dev, s32 val)
+{
+	struct nvme_ctrl *ctrl = dev_get_drvdata(dev);
+	u64 max_latency_us;
+
+	/* Both PM QoS sentinels map to U64_MAX; other values pass through. */
+	if (val == PM_QOS_LATENCY_TOLERANCE_NO_CONSTRAINT ||
+	    val == PM_QOS_LATENCY_ANY)
+		max_latency_us = U64_MAX;
+	else
+		max_latency_us = val;
+
+	/* Reprogram APST only when the stored limit actually changes. */
+	if (ctrl->ps_max_latency_us == max_latency_us)
+		return;
+
+	ctrl->ps_max_latency_us = max_latency_us;
+	nvme_configure_apst(ctrl);
+}
|
|
|
+
|
|
|
struct nvme_core_quirk_entry {
|
|
|
/*
|
|
|
* NVMe model and firmware strings are padded with spaces. For
|
|
|
@@ -1265,6 +1387,16 @@ struct nvme_core_quirk_entry {
|
|
|
};
|
|
|
|
|
|
static const struct nvme_core_quirk_entry core_quirks[] = {
|
|
|
+ /*
|
|
|
+ * Seen on a Samsung "SM951 NVMe SAMSUNG 256GB": using APST causes
|
|
|
+ * the controller to stop responding. It dies when the watchdog
|
|
|
+ * timer reads CSTS and gets 0xffffffff.
|
|
|
+ */
|
|
|
+ {
|
|
|
+ .vid = 0x144d,
|
|
|
+ .fr = "BXW75D0Q",
|
|
|
+ .quirks = NVME_QUIRK_NO_APST,
|
|
|
+ },
|
|
|
};
|
|
|
|
|
|
/* match is null-terminated but idstr is space-padded. */
|
|
|
@@ -1307,6 +1439,7 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
|
|
|
u64 cap;
|
|
|
int ret, page_shift;
|
|
|
u32 max_hw_sectors;
|
|
|
+ u8 prev_apsta;
|
|
|
|
|
|
ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs);
|
|
|
if (ret) {
|
|
|
@@ -1368,6 +1501,11 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
|
|
|
ctrl->sgls = le32_to_cpu(id->sgls);
|
|
|
ctrl->kas = le16_to_cpu(id->kas);
|
|
|
|
|
|
+ ctrl->npss = id->npss;
|
|
|
+ prev_apsta = ctrl->apsta;
|
|
|
+ ctrl->apsta = (ctrl->quirks & NVME_QUIRK_NO_APST) ? 0 : id->apsta;
|
|
|
+ memcpy(ctrl->psd, id->psd, sizeof(ctrl->psd));
|
|
|
+
|
|
|
if (ctrl->ops->is_fabrics) {
|
|
|
ctrl->icdoff = le16_to_cpu(id->icdoff);
|
|
|
ctrl->ioccsz = le32_to_cpu(id->ioccsz);
|
|
|
@@ -1392,7 +1530,15 @@ int nvme_init_identify(struct nvme_ctrl *ctrl)
|
|
|
|
|
|
kfree(id);
|
|
|
|
|
|
+ if (ctrl->apsta && !prev_apsta)
|
|
|
+ dev_pm_qos_expose_latency_tolerance(ctrl->device);
|
|
|
+ else if (!ctrl->apsta && prev_apsta)
|
|
|
+ dev_pm_qos_hide_latency_tolerance(ctrl->device);
|
|
|
+
|
|
|
+ nvme_configure_apst(ctrl);
|
|
|
+
|
|
|
ctrl->identified = true;
|
|
|
+
|
|
|
return ret;
|
|
|
}
|
|
|
EXPORT_SYMBOL_GPL(nvme_init_identify);
|
|
|
@@ -2154,6 +2300,14 @@ int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev,
|
|
|
list_add_tail(&ctrl->node, &nvme_ctrl_list);
|
|
|
spin_unlock(&dev_list_lock);
|
|
|
|
|
|
+ /*
|
|
|
+ * Initialize latency tolerance controls. The sysfs files won't
|
|
|
+ * be visible to userspace unless the device actually supports APST.
|
|
|
+ */
|
|
|
+ ctrl->device->power.set_latency_tolerance = nvme_set_latency_tolerance;
|
|
|
+ dev_pm_qos_update_user_latency_tolerance(ctrl->device,
|
|
|
+ min(default_ps_max_latency_us, (unsigned long)S32_MAX));
|
|
|
+
|
|
|
return 0;
|
|
|
out_release_instance:
|
|
|
nvme_release_instance(ctrl);
|