|
@@ -329,6 +329,118 @@ void mon_event_count(void *info)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * Feedback loop for MBA software controller (mba_sc)
|
|
|
+ *
|
|
|
+ * mba_sc is a feedback loop where we periodically read MBM counters and
|
|
|
+ * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so
|
|
|
+ * that:
|
|
|
+ *
|
|
|
+ * current bandwidth(cur_bw) < user specified bandwidth(user_bw)
|
|
|
+ *
|
|
|
+ * This uses the MBM counters to measure the bandwidth and MBA throttle
|
|
|
+ * MSRs to control the bandwidth for a particular rdtgrp. It builds on the
|
|
|
+ * fact that resctrl rdtgroups have both monitoring and control.
|
|
|
+ *
|
|
|
+ * The frequency of the checks is 1s and we just tag along the MBM overflow
|
|
|
+ * timer. Having 1s interval makes the calculation of bandwidth simpler.
|
|
|
+ *
|
|
|
+ * Although MBA's goal is to restrict the bandwidth to a maximum, there may
|
|
|
+ * be a need to increase the bandwidth to avoid unnecessarily restricting
|
|
|
+ * the L2 <-> L3 traffic.
|
|
|
+ *
|
|
|
+ * Since MBA controls the L2 external bandwidth whereas MBM measures the
|
|
|
+ * L3 external bandwidth the following sequence could lead to such a
|
|
|
+ * situation.
|
|
|
+ *
|
|
|
+ * Consider an rdtgroup which had high L3 <-> memory traffic in initial
|
|
|
+ * phases -> mba_sc kicks in and reduces bandwidth percentage values -> but
|
|
|
+ * after some time rdtgroup has mostly L2 <-> L3 traffic.
|
|
|
+ *
|
|
|
+ * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its
|
|
|
+ * throttle MSRs already have low percentage values. To avoid
|
|
|
+ * unnecessarily restricting such rdtgroups, we also increase the bandwidth.
|
|
|
+ */
|
|
|
+static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_domain *dom_mbm)
|
|
|
+{
|
|
|
+ u32 closid, rmid, cur_msr, cur_msr_val, new_msr_val;
|
|
|
+ struct mbm_state *pmbm_data, *cmbm_data;
|
|
|
+ u32 cur_bw, delta_bw, user_bw;
|
|
|
+ struct rdt_resource *r_mba;
|
|
|
+ struct rdt_domain *dom_mba;
|
|
|
+ struct list_head *head;
|
|
|
+ struct rdtgroup *entry;
|
|
|
+
|
|
|
+ r_mba = &rdt_resources_all[RDT_RESOURCE_MBA];
|
|
|
+ closid = rgrp->closid;
|
|
|
+ rmid = rgrp->mon.rmid;
|
|
|
+ pmbm_data = &dom_mbm->mbm_local[rmid];
|
|
|
+
|
|
|
+ dom_mba = get_domain_from_cpu(smp_processor_id(), r_mba);
|
|
|
+ if (!dom_mba) {
|
|
|
+ pr_warn_once("Failure to get domain for MBA update\n");
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ cur_bw = pmbm_data->prev_bw;
|
|
|
+ user_bw = dom_mba->mbps_val[closid];
|
|
|
+ delta_bw = pmbm_data->delta_bw;
|
|
|
+ cur_msr_val = dom_mba->ctrl_val[closid];
|
|
|
+
|
|
|
+ /*
|
|
|
+ * For Ctrl groups read data from child monitor groups.
|
|
|
+ */
|
|
|
+ head = &rgrp->mon.crdtgrp_list;
|
|
|
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
|
|
|
+ cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
|
|
|
+ cur_bw += cmbm_data->prev_bw;
|
|
|
+ delta_bw += cmbm_data->delta_bw;
|
|
|
+ }
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Scale up/down the bandwidth linearly for the ctrl group. The
|
|
|
+ * bandwidth step is the bandwidth granularity specified by the
|
|
|
+ * hardware.
|
|
|
+ *
|
|
|
+ * The delta_bw is used when increasing the bandwidth so that we
|
|
|
+ * don't alternately increase and decrease the control values
|
|
|
+ * continuously.
|
|
|
+ *
|
|
|
+ * For ex: consider cur_bw = 90MBps, user_bw = 100MBps and if
|
|
|
+ * bandwidth step is 20MBps(> user_bw - cur_bw), we would keep
|
|
|
+ * switching between 90 and 110 continuously if we only check
|
|
|
+ * cur_bw < user_bw.
|
|
|
+ */
|
|
|
+ if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) {
|
|
|
+ new_msr_val = cur_msr_val - r_mba->membw.bw_gran;
|
|
|
+ } else if (cur_msr_val < MAX_MBA_BW &&
|
|
|
+ (user_bw > (cur_bw + delta_bw))) {
|
|
|
+ new_msr_val = cur_msr_val + r_mba->membw.bw_gran;
|
|
|
+ } else {
|
|
|
+ return;
|
|
|
+ }
|
|
|
+
|
|
|
+ cur_msr = r_mba->msr_base + closid;
|
|
|
+ wrmsrl(cur_msr, delay_bw_map(new_msr_val, r_mba));
|
|
|
+ dom_mba->ctrl_val[closid] = new_msr_val;
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Delta values are updated dynamically package wise for each
|
|
|
+ * rdtgrp every time the throttle MSR changes value.
|
|
|
+ *
|
|
|
+ * This is because (1)the increase in bandwidth is not perfectly
|
|
|
+ * linear and only "approximately" linear even when the hardware
|
|
|
+ * says it is linear.(2)Also since MBA is a core specific
|
|
|
+ * mechanism, the delta values vary based on number of cores used
|
|
|
+ * by the rdtgrp.
|
|
|
+ */
|
|
|
+ pmbm_data->delta_comp = true;
|
|
|
+ list_for_each_entry(entry, head, mon.crdtgrp_list) {
|
|
|
+ cmbm_data = &dom_mbm->mbm_local[entry->mon.rmid];
|
|
|
+ cmbm_data->delta_comp = true;
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
static void mbm_update(struct rdt_domain *d, int rmid)
|
|
|
{
|
|
|
struct rmid_read rr;
|
|
@@ -346,7 +458,16 @@ static void mbm_update(struct rdt_domain *d, int rmid)
|
|
|
}
|
|
|
if (is_mbm_local_enabled()) {
|
|
|
rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID;
|
|
|
- __mon_event_count(rmid, &rr);
|
|
|
+
|
|
|
+ /*
|
|
|
+ * Call the MBA software controller only for the
|
|
|
+ * control groups and when user has enabled
|
|
|
+ * the software controller explicitly.
|
|
|
+ */
|
|
|
+ if (!is_mba_sc(NULL))
|
|
|
+ __mon_event_count(rmid, &rr);
|
|
|
+ else
|
|
|
+ mbm_bw_count(rmid, &rr);
|
|
|
}
|
|
|
}
|
|
|
|
|
@@ -417,6 +538,9 @@ void mbm_handle_overflow(struct work_struct *work)
|
|
|
head = &prgrp->mon.crdtgrp_list;
|
|
|
list_for_each_entry(crgrp, head, mon.crdtgrp_list)
|
|
|
mbm_update(d, crgrp->mon.rmid);
|
|
|
+
|
|
|
+ if (is_mba_sc(NULL))
|
|
|
+ update_mba_bw(prgrp, d);
|
|
|
}
|
|
|
|
|
|
schedule_delayed_work_on(cpu, &d->mbm_over, delay);
|