@@ -36,12 +36,56 @@

 #include <asm/reg.h>
 #include <asm/smp.h> /* Required for cpu_sibling_mask() in UP configs */
 #include <asm/opal.h>
+#include <linux/timer.h>

 #define POWERNV_MAX_PSTATES	256
 #define PMSR_PSAFE_ENABLE	(1UL << 30)
 #define PMSR_SPR_EM_DISABLE	(1UL << 31)
 #define PMSR_MAX(x)		((x >> 32) & 0xFF)

+#define MAX_RAMP_DOWN_TIME	5120
+/*
+ * On an idle system we want the global pstate to ramp down from its max
+ * value to min over a span of ~5 secs. Also we want it to initially ramp
+ * down slowly and then ramp down rapidly later on.
+ *
+ * This gives a percentage rampdown for time elapsed in milliseconds.
+ * ramp_down_percentage = ((ms * ms) >> 18)
+ *			~= 3.8 * (sec * sec)
+ *
+ * At 0 ms	ramp_down_percent = 0
+ * At 5120 ms	ramp_down_percent = 100
+ */
+#define ramp_down_percent(time)	(((time) * (time)) >> 18)
+
+/* Interval after which the timer is queued to bring down global pstate */
+#define GPSTATE_TIMER_INTERVAL	2000
+
+/**
+ * struct global_pstate_info -	Per policy data structure to maintain history
+ *				of global pstates
+ * @highest_lpstate:		The local pstate from which we are ramping down
+ * @elapsed_time:		Time in ms spent in ramping down from
+ *				highest_lpstate
+ * @last_sampled_time:		Time from boot in ms when global pstates were
+ *				last set
+ * @last_lpstate,last_gpstate:	Last set values of local and global pstates
+ * @timer:			Used for ramping down if the cpu goes idle for
+ *				a long time with the global pstate held high
+ * @gpstate_lock:		A spinlock to synchronize the routines called
+ *				by the timer handler and the governor's
+ *				target_index calls
+ */
+struct global_pstate_info {
+	int highest_lpstate;
+	unsigned int elapsed_time;
+	unsigned int last_sampled_time;
+	int last_lpstate;
+	int last_gpstate;
+	spinlock_t gpstate_lock;
+	struct timer_list timer;
+};
+
 static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1];
 static bool rebooting, throttled, occ_reset;
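
The quadratic constant is chosen so that the curve lands exactly on 100% at
the 5.12 s mark: 5120^2 = 26214400 = 100 << 18. As a sanity check, here is a
minimal standalone sketch (not part of the patch) that evaluates the macro at
roughly one-second steps:

    #include <stdio.h>

    #define MAX_RAMP_DOWN_TIME      5120
    #define ramp_down_percent(time) (((time) * (time)) >> 18)

    int main(void)
    {
            unsigned long ms;

            /* 0, 1024, ..., 5120 ms -> 0, 4, 16, 36, 64, 100 percent */
            for (ms = 0; ms <= MAX_RAMP_DOWN_TIME; ms += 1024)
                    printf("%4lu ms -> %3lu%%\n", ms, ramp_down_percent(ms));
            return 0;
    }

Note the slow start (only ~15% dropped when the first timer fires at the 2 s
mark) and the steep tail, which is exactly the "ramp down slowly at first,
rapidly later" behaviour the comment above describes.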

@@ -94,6 +138,17 @@ static struct powernv_pstate_info {
 	int nr_pstates;
 } powernv_pstate_info;

+static inline void reset_gpstates(struct cpufreq_policy *policy)
+{
+	struct global_pstate_info *gpstates = policy->driver_data;
+
+	gpstates->highest_lpstate = 0;
+	gpstates->elapsed_time = 0;
+	gpstates->last_sampled_time = 0;
+	gpstates->last_lpstate = 0;
+	gpstates->last_gpstate = 0;
+}
+
 /*
  * Initialize the freq table based on data obtained
  * from the firmware passed via device-tree

@@ -285,6 +340,7 @@ static inline void set_pmspr(unsigned long sprn, unsigned long val)
 struct powernv_smp_call_data {
 	unsigned int freq;
 	int pstate_id;
+	int gpstate_id;
 };

 /*
@@ -343,19 +399,21 @@ static unsigned int powernv_cpufreq_get(unsigned int cpu)
  * (struct powernv_smp_call_data *) and the pstate_id which needs to be set
  * on this CPU should be present in freq_data->pstate_id.
  */
-static void set_pstate(void *freq_data)
+static void set_pstate(void *data)
 {
 	unsigned long val;
-	unsigned long pstate_ul =
-		((struct powernv_smp_call_data *) freq_data)->pstate_id;
+	struct powernv_smp_call_data *freq_data = data;
+	unsigned long pstate_ul = freq_data->pstate_id;
+	unsigned long gpstate_ul = freq_data->gpstate_id;

 	val = get_pmspr(SPRN_PMCR);
 	val = val & 0x0000FFFFFFFFFFFFULL;

 	pstate_ul = pstate_ul & 0xFF;
+	gpstate_ul = gpstate_ul & 0xFF;

 	/* Set both global(bits 56..63) and local(bits 48..55) PStates */
-	val = val | (pstate_ul << 56) | (pstate_ul << 48);
+	val = val | (gpstate_ul << 56) | (pstate_ul << 48);

 	pr_debug("Setting cpu %d pmcr to %016lX\n",
 		 raw_smp_processor_id(), val);
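
set_pstate() packs two 8-bit pstate IDs into the top two bytes of PMCR: the
global pstate into bits 56..63 and the local pstate into bits 48..55, leaving
the low 48 bits untouched. A standalone sketch with made-up values (assuming
the usual signed pstate IDs on this platform, hence the & 0xFF truncation to
their two's-complement byte encodings):

    #include <stdio.h>

    int main(void)
    {
            unsigned long val = 0xDEAD00000000AAAAUL;       /* pretend PMCR */
            unsigned long gpstate_ul = (unsigned long)-3 & 0xFF;    /* 0xFD */
            unsigned long pstate_ul  = (unsigned long)-8 & 0xFF;    /* 0xF8 */

            val &= 0x0000FFFFFFFFFFFFULL;   /* clear old pstate bits 48..63 */
            val |= (gpstate_ul << 56) | (pstate_ul << 48);

            /* prints FDF800000000AAAA: low 48 bits are preserved */
            printf("PMCR = %016lX\n", val);
            return 0;
    }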

@@ -424,6 +482,110 @@ next:
 	}
 }

+/**
+ * calc_global_pstate - Calculate global pstate
+ * @elapsed_time:	Elapsed time in milliseconds
+ * @local_pstate:	New local pstate
+ * @highest_lpstate:	The pstate from which it is ramping down
+ *
+ * Finds the appropriate global pstate based on the pstate from which it is
+ * ramping down and the time elapsed in ramping down. It follows a quadratic
+ * equation which ensures that it reaches Pmin within 5 seconds of ramp down.
+ */
+static inline int calc_global_pstate(unsigned int elapsed_time,
+				     int highest_lpstate, int local_pstate)
+{
+	int pstate_diff;
+
+	/*
+	 * Using ramp_down_percent we get the percentage of rampdown
+	 * that we are expecting to be dropping. The difference between
+	 * highest_lpstate and powernv_pstate_info.min gives an absolute
+	 * number of pstates we will eventually drop by the end of
+	 * 5 seconds; just scale it to get the number of pstates to drop.
+	 */
+	pstate_diff = ((int)ramp_down_percent(elapsed_time) *
+		       (highest_lpstate - powernv_pstate_info.min)) / 100;
+
+	/* Ensure that global pstate is >= local pstate */
+	if (highest_lpstate - pstate_diff < local_pstate)
+		return local_pstate;
+	else
+		return highest_lpstate - pstate_diff;
+}
+
+static inline void queue_gpstate_timer(struct global_pstate_info *gpstates)
+{
+	unsigned int timer_interval;
+
+	/*
+	 * Set the timer to fire after GPSTATE_TIMER_INTERVAL ms. But if
+	 * that would push the total ramp-down time past MAX_RAMP_DOWN_TIME
+	 * ms, set the timer to fire exactly when MAX_RAMP_DOWN_TIME ms of
+	 * ramp-down time have elapsed.
+	 */
+	if ((gpstates->elapsed_time + GPSTATE_TIMER_INTERVAL)
+	    > MAX_RAMP_DOWN_TIME)
+		timer_interval = MAX_RAMP_DOWN_TIME - gpstates->elapsed_time;
+	else
+		timer_interval = GPSTATE_TIMER_INTERVAL;
+
+	mod_timer_pinned(&gpstates->timer, jiffies +
+			 msecs_to_jiffies(timer_interval));
+}
+
+/**
+ * gpstate_timer_handler
+ *
+ * @data: pointer to the cpufreq_policy on which the timer was queued
+ *
+ * This handler brings down the global pstate closer to the local pstate
+ * according to the quadratic equation. It queues a new timer if the global
+ * pstate is still not equal to the local pstate.
+ */
+void gpstate_timer_handler(unsigned long data)
+{
+	struct cpufreq_policy *policy = (struct cpufreq_policy *)data;
+	struct global_pstate_info *gpstates = policy->driver_data;
+	int gpstate_id;
+	unsigned int time_diff = jiffies_to_msecs(jiffies)
+				 - gpstates->last_sampled_time;
+	struct powernv_smp_call_data freq_data;
+
+	if (!spin_trylock(&gpstates->gpstate_lock))
+		return;
+
+	gpstates->last_sampled_time += time_diff;
+	gpstates->elapsed_time += time_diff;
+	freq_data.pstate_id = gpstates->last_lpstate;
+
+	if ((gpstates->last_gpstate == freq_data.pstate_id) ||
+	    (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME)) {
+		gpstate_id = freq_data.pstate_id;
+		reset_gpstates(policy);
+		gpstates->highest_lpstate = freq_data.pstate_id;
+	} else {
+		gpstate_id = calc_global_pstate(gpstates->elapsed_time,
+						gpstates->highest_lpstate,
+						freq_data.pstate_id);
+	}
+
+	/*
+	 * If the local pstate is equal to the global pstate, the ramp-down
+	 * is over, so the timer does not need to be requeued.
+	 */
+	if (gpstate_id != freq_data.pstate_id)
+		queue_gpstate_timer(gpstates);
+
+	freq_data.gpstate_id = gpstate_id;
+	gpstates->last_gpstate = freq_data.gpstate_id;
+	gpstates->last_lpstate = freq_data.pstate_id;
+
+	/* The timer may get migrated to a different cpu on cpu hot unplug */
+	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
+	spin_unlock(&gpstates->gpstate_lock);
+}
+
 /*
  * powernv_cpufreq_target_index: Sets the frequency corresponding to
  * the cpufreq table entry indexed by new_index on the cpus in the

@@ -433,6 +595,9 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
 					unsigned int new_index)
 {
 	struct powernv_smp_call_data freq_data;
+	unsigned int cur_msec, gpstate_id;
+	unsigned long flags;
+	struct global_pstate_info *gpstates = policy->driver_data;

 	if (unlikely(rebooting) && new_index != get_nominal_index())
 		return 0;
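
queue_gpstate_timer() above clamps the final interval so that the handler's
last invocation lines up with the end of the 5120 ms window. With the default
2000 ms interval the schedule works out to three firings; a small standalone
sketch (not driver code) reproducing the arithmetic:

    #include <stdio.h>

    #define MAX_RAMP_DOWN_TIME      5120
    #define GPSTATE_TIMER_INTERVAL  2000

    int main(void)
    {
            unsigned int elapsed = 0, interval;

            while (elapsed < MAX_RAMP_DOWN_TIME) {
                    if (elapsed + GPSTATE_TIMER_INTERVAL > MAX_RAMP_DOWN_TIME)
                            interval = MAX_RAMP_DOWN_TIME - elapsed;
                    else
                            interval = GPSTATE_TIMER_INTERVAL;
                    elapsed += interval;
                    printf("timer fires at %u ms\n", elapsed); /* 2000, 4000, 5120 */
            }
            return 0;
    }

Because the timer is deferrable, an idle cpu may actually run the handler
later than this ideal schedule, which is why gpstate_timer_handler() measures
the real elapsed time from jiffies instead of assuming fixed intervals.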

@@ -440,22 +605,70 @@ static int powernv_cpufreq_target_index(struct cpufreq_policy *policy,
 	if (!throttled)
 		powernv_cpufreq_throttle_check(NULL);

+	cur_msec = jiffies_to_msecs(get_jiffies_64());
+
+	spin_lock_irqsave(&gpstates->gpstate_lock, flags);
 	freq_data.pstate_id = powernv_freqs[new_index].driver_data;

+	if (!gpstates->last_sampled_time) {
+		gpstate_id = freq_data.pstate_id;
+		gpstates->highest_lpstate = freq_data.pstate_id;
+		goto gpstates_done;
+	}
+
+	if (gpstates->last_gpstate > freq_data.pstate_id) {
+		gpstates->elapsed_time += cur_msec -
+					  gpstates->last_sampled_time;
+
+		/*
+		 * If it has been ramping down for more than MAX_RAMP_DOWN_TIME,
+		 * we should reset all global pstate related data. Set it
+		 * equal to the local pstate to start fresh.
+		 */
+		if (gpstates->elapsed_time > MAX_RAMP_DOWN_TIME) {
+			reset_gpstates(policy);
+			gpstates->highest_lpstate = freq_data.pstate_id;
+			gpstate_id = freq_data.pstate_id;
+		} else {
+			/* Elapsed_time is less than 5 seconds, continue to rampdown */
+			gpstate_id = calc_global_pstate(gpstates->elapsed_time,
+							gpstates->highest_lpstate,
+							freq_data.pstate_id);
+		}
+	} else {
+		reset_gpstates(policy);
+		gpstates->highest_lpstate = freq_data.pstate_id;
+		gpstate_id = freq_data.pstate_id;
+	}
+
+	/*
+	 * If the local pstate is equal to the global pstate, the ramp-down
+	 * is over, so the timer does not need to be requeued.
+	 */
+	if (gpstate_id != freq_data.pstate_id)
+		queue_gpstate_timer(gpstates);
+
+gpstates_done:
+	freq_data.gpstate_id = gpstate_id;
+	gpstates->last_sampled_time = cur_msec;
+	gpstates->last_gpstate = freq_data.gpstate_id;
+	gpstates->last_lpstate = freq_data.pstate_id;
+
 	/*
 	 * Use smp_call_function to send IPI and execute the
 	 * mtspr on target CPU. We could do that without IPI
 	 * if current CPU is within policy->cpus (core)
 	 */
 	smp_call_function_any(policy->cpus, set_pstate, &freq_data, 1);
-
+	spin_unlock_irqrestore(&gpstates->gpstate_lock, flags);
 	return 0;
 }

 static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 {
-	int base, i;
+	int base, i, ret;
 	struct kernfs_node *kn;
+	struct global_pstate_info *gpstates;

 	base = cpu_first_thread_sibling(policy->cpu);
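
Putting the pieces together: the global pstate snaps to the local pstate
immediately whenever the request rises, but trails it on the way down. A
userspace re-creation of the bookkeeping above (Pmin of -7 is an assumed
value; in the driver it comes from powernv_pstate_info.min, and the
timer-queueing side is omitted here):

    #include <stdio.h>

    #define MAX_RAMP_DOWN_TIME      5120
    #define ramp_down_percent(t)    (((t) * (t)) >> 18)
    #define PMIN                    (-7)  /* stand-in for powernv_pstate_info.min */

    static int highest_lpstate, last_gpstate;
    static unsigned int elapsed;

    static int next_gpstate(int lpstate, unsigned int dt_ms)
    {
            int diff;

            /* Ramping down, and still inside the 5.12 s window? */
            if (last_gpstate > lpstate &&
                (elapsed += dt_ms) <= MAX_RAMP_DOWN_TIME) {
                    diff = ((int)ramp_down_percent(elapsed) *
                            (highest_lpstate - PMIN)) / 100;
                    if (highest_lpstate - diff > lpstate)
                            return last_gpstate = highest_lpstate - diff;
            }
            /* First call, ramping up, or window expired: start fresh */
            elapsed = 0;
            highest_lpstate = lpstate;
            return last_gpstate = lpstate;
    }

    int main(void)
    {
            int lpstates[] = { 0, -7, -7, -7 };     /* requests, 2 s apart */
            unsigned int i;

            /* prints gpstate = 0, -1, -4, -7 */
            for (i = 0; i < 4; i++)
                    printf("lpstate=%2d -> gpstate=%2d\n",
                           lpstates[i], next_gpstate(lpstates[i], 2000));
            return 0;
    }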

@@ -475,7 +688,34 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy)
 	} else {
 		kernfs_put(kn);
 	}
-	return cpufreq_table_validate_and_show(policy, powernv_freqs);
+
+	gpstates = kzalloc(sizeof(*gpstates), GFP_KERNEL);
+	if (!gpstates)
+		return -ENOMEM;
+
+	policy->driver_data = gpstates;
+
+	/* initialize timer */
+	init_timer_deferrable(&gpstates->timer);
+	gpstates->timer.data = (unsigned long)policy;
+	gpstates->timer.function = gpstate_timer_handler;
+	gpstates->timer.expires = jiffies +
+				  msecs_to_jiffies(GPSTATE_TIMER_INTERVAL);
+	spin_lock_init(&gpstates->gpstate_lock);
+	ret = cpufreq_table_validate_and_show(policy, powernv_freqs);
+
+	if (ret < 0)
+		kfree(policy->driver_data);
+
+	return ret;
+}
+
+static int powernv_cpufreq_cpu_exit(struct cpufreq_policy *policy)
+{
+	/* timer is deleted in cpufreq_cpu_stop() */
+	kfree(policy->driver_data);
+
+	return 0;
 }

 static int powernv_cpufreq_reboot_notifier(struct notifier_block *nb,
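
The ownership rules here are worth spelling out: cpu_init allocates the
per-policy state but arms nothing (the timer is only queued once a ramp-down
actually starts), the stop_cpu callback kills the timer with del_timer_sync(),
and only then does the exit callback free the memory. A hypothetical
miniature of that lifecycle, using the same timer API the patch itself uses:

    #include <linux/slab.h>
    #include <linux/timer.h>

    struct demo_ctx {                       /* stands in for global_pstate_info */
            struct timer_list timer;
    };

    static void demo_handler(unsigned long data)
    {
            struct demo_ctx *ctx = (struct demo_ctx *)data;

            /* Re-arm on the same cpu, as queue_gpstate_timer() does */
            mod_timer_pinned(&ctx->timer, jiffies + msecs_to_jiffies(2000));
    }

    static struct demo_ctx *demo_init(void)         /* cf. cpu_init */
    {
            struct demo_ctx *ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);

            if (!ctx)
                    return NULL;
            init_timer_deferrable(&ctx->timer);
            ctx->timer.function = demo_handler;
            ctx->timer.data = (unsigned long)ctx;
            return ctx;
    }

    static void demo_teardown(struct demo_ctx *ctx)
    {
            del_timer_sync(&ctx->timer);    /* cf. stop_cpu */
            kfree(ctx);                     /* cf. cpu_exit */
    }

The deferrable timer is a deliberate choice: the whole point is to lower the
global pstate on an idle system, so the timer must not itself force exact
wakeups; the handler instead measures how much time really passed when it
eventually runs.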

@@ -603,15 +843,19 @@ static struct notifier_block powernv_cpufreq_opal_nb = {
 static void powernv_cpufreq_stop_cpu(struct cpufreq_policy *policy)
 {
 	struct powernv_smp_call_data freq_data;
+	struct global_pstate_info *gpstates = policy->driver_data;

 	freq_data.pstate_id = powernv_pstate_info.min;
+	freq_data.gpstate_id = powernv_pstate_info.min;
 	smp_call_function_single(policy->cpu, set_pstate, &freq_data, 1);
+	del_timer_sync(&gpstates->timer);
 }

 static struct cpufreq_driver powernv_cpufreq_driver = {
 	.name		= "powernv-cpufreq",
 	.flags		= CPUFREQ_CONST_LOOPS,
 	.init		= powernv_cpufreq_cpu_init,
+	.exit		= powernv_cpufreq_cpu_exit,
 	.verify		= cpufreq_generic_frequency_table_verify,
 	.target_index	= powernv_cpufreq_target_index,
 	.get		= powernv_cpufreq_get,