|
@@ -4858,52 +4858,18 @@ static int build_zonelists_node(pg_data_t *pgdat, struct zonelist *zonelist,
|
|
|
return nr_zones;
|
|
|
}
|
|
|
|
|
|
-
|
|
|
-/*
|
|
|
- * zonelist_order:
|
|
|
- * 0 = automatic detection of better ordering.
|
|
|
- * 1 = order by ([node] distance, -zonetype)
|
|
|
- * 2 = order by (-zonetype, [node] distance)
|
|
|
- *
|
|
|
- * If not NUMA, ZONELIST_ORDER_ZONE and ZONELIST_ORDER_NODE will create
|
|
|
- * the same zonelist. So only NUMA can configure this param.
|
|
|
- */
|
|
|
-#define ZONELIST_ORDER_DEFAULT 0
|
|
|
-#define ZONELIST_ORDER_NODE 1
|
|
|
-#define ZONELIST_ORDER_ZONE 2
|
|
|
-
|
|
|
-/* zonelist order in the kernel.
|
|
|
- * set_zonelist_order() will set this to NODE or ZONE.
|
|
|
- */
|
|
|
-static int current_zonelist_order = ZONELIST_ORDER_DEFAULT;
|
|
|
-static char zonelist_order_name[3][8] = {"Default", "Node", "Zone"};
|
|
|
-
|
|
|
-
|
|
|
#ifdef CONFIG_NUMA
|
|
|
-/* The value user specified ....changed by config */
|
|
|
-static int user_zonelist_order = ZONELIST_ORDER_DEFAULT;
|
|
|
-/* string for sysctl */
|
|
|
-#define NUMA_ZONELIST_ORDER_LEN 16
|
|
|
-char numa_zonelist_order[16] = "default";
|
|
|
-
|
|
|
-/*
|
|
|
- * interface for configure zonelist ordering.
|
|
|
- * command line option "numa_zonelist_order"
|
|
|
- * = "[dD]efault - default, automatic configuration.
|
|
|
- * = "[nN]ode - order by node locality, then by zone within node
|
|
|
- * = "[zZ]one - order by zone, then by locality within zone
|
|
|
- */
|
|
|
|
|
|
static int __parse_numa_zonelist_order(char *s)
|
|
|
{
|
|
|
- if (*s == 'd' || *s == 'D') {
|
|
|
- user_zonelist_order = ZONELIST_ORDER_DEFAULT;
|
|
|
- } else if (*s == 'n' || *s == 'N') {
|
|
|
- user_zonelist_order = ZONELIST_ORDER_NODE;
|
|
|
- } else if (*s == 'z' || *s == 'Z') {
|
|
|
- user_zonelist_order = ZONELIST_ORDER_ZONE;
|
|
|
- } else {
|
|
|
- pr_warn("Ignoring invalid numa_zonelist_order value: %s\n", s);
|
|
|
+ /*
|
|
|
+ * We used to support different zonelist modes but they turned
|
|
|
+ * out to be just not useful. Let's keep the warning in place
|
|
|
+ * if somebody still uses the cmd line parameter so that we do
|
|
|
+ * not fail it silently
|
|
|
+ */
|
|
|
+ if (!(*s == 'd' || *s == 'D' || *s == 'n' || *s == 'N')) {
|
|
|
+ pr_warn("Ignoring unsupported numa_zonelist_order value: %s\n", s);
|
|
|
return -EINVAL;
|
|
|
}
|
|
|
return 0;
|
|
@@ -4911,19 +4877,15 @@ static int __parse_numa_zonelist_order(char *s)
|
|
|
|
|
|
static __init int setup_numa_zonelist_order(char *s)
|
|
|
{
|
|
|
- int ret;
|
|
|
-
|
|
|
if (!s)
|
|
|
return 0;
|
|
|
|
|
|
- ret = __parse_numa_zonelist_order(s);
|
|
|
- if (ret == 0)
|
|
|
- strlcpy(numa_zonelist_order, s, NUMA_ZONELIST_ORDER_LEN);
|
|
|
-
|
|
|
- return ret;
|
|
|
+ return __parse_numa_zonelist_order(s);
|
|
|
}
|
|
|
early_param("numa_zonelist_order", setup_numa_zonelist_order);
|
|
|
|
|
|
+char numa_zonelist_order[] = "Node";
|
|
|
+
|
|
|
/*
|
|
|
* sysctl handler for numa_zonelist_order
|
|
|
*/
|
|
@@ -4931,42 +4893,17 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
|
|
|
void __user *buffer, size_t *length,
|
|
|
loff_t *ppos)
|
|
|
{
|
|
|
- char saved_string[NUMA_ZONELIST_ORDER_LEN];
|
|
|
+ char *str;
|
|
|
int ret;
|
|
|
- static DEFINE_MUTEX(zl_order_mutex);
|
|
|
|
|
|
- mutex_lock(&zl_order_mutex);
|
|
|
- if (write) {
|
|
|
- if (strlen((char *)table->data) >= NUMA_ZONELIST_ORDER_LEN) {
|
|
|
- ret = -EINVAL;
|
|
|
- goto out;
|
|
|
- }
|
|
|
- strcpy(saved_string, (char *)table->data);
|
|
|
- }
|
|
|
- ret = proc_dostring(table, write, buffer, length, ppos);
|
|
|
- if (ret)
|
|
|
- goto out;
|
|
|
- if (write) {
|
|
|
- int oldval = user_zonelist_order;
|
|
|
+ if (!write)
|
|
|
+ return proc_dostring(table, write, buffer, length, ppos);
|
|
|
+ str = memdup_user_nul(buffer, 16);
|
|
|
+ if (IS_ERR(str))
|
|
|
+ return PTR_ERR(str);
|
|
|
|
|
|
- ret = __parse_numa_zonelist_order((char *)table->data);
|
|
|
- if (ret) {
|
|
|
- /*
|
|
|
- * bogus value. restore saved string
|
|
|
- */
|
|
|
- strncpy((char *)table->data, saved_string,
|
|
|
- NUMA_ZONELIST_ORDER_LEN);
|
|
|
- user_zonelist_order = oldval;
|
|
|
- } else if (oldval != user_zonelist_order) {
|
|
|
- mem_hotplug_begin();
|
|
|
- mutex_lock(&zonelists_mutex);
|
|
|
- build_all_zonelists(NULL, NULL);
|
|
|
- mutex_unlock(&zonelists_mutex);
|
|
|
- mem_hotplug_done();
|
|
|
- }
|
|
|
- }
|
|
|
-out:
|
|
|
- mutex_unlock(&zl_order_mutex);
|
|
|
+ ret = __parse_numa_zonelist_order(str);
|
|
|
+ kfree(str);
|
|
|
return ret;
|
|
|
}
|
|
|
|
|
@@ -5075,70 +5012,12 @@ static void build_thisnode_zonelists(pg_data_t *pgdat)
|
|
|
*/
|
|
|
static int node_order[MAX_NUMNODES];
|
|
|
|
|
|
-static void build_zonelists_in_zone_order(pg_data_t *pgdat, int nr_nodes)
|
|
|
-{
|
|
|
- int pos, j, node;
|
|
|
- int zone_type; /* needs to be signed */
|
|
|
- struct zone *z;
|
|
|
- struct zonelist *zonelist;
|
|
|
-
|
|
|
- zonelist = &pgdat->node_zonelists[ZONELIST_FALLBACK];
|
|
|
- pos = 0;
|
|
|
- for (zone_type = MAX_NR_ZONES - 1; zone_type >= 0; zone_type--) {
|
|
|
- for (j = 0; j < nr_nodes; j++) {
|
|
|
- node = node_order[j];
|
|
|
- z = &NODE_DATA(node)->node_zones[zone_type];
|
|
|
- if (managed_zone(z)) {
|
|
|
- zoneref_set_zone(z,
|
|
|
- &zonelist->_zonerefs[pos++]);
|
|
|
- check_highest_zone(zone_type);
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
- zonelist->_zonerefs[pos].zone = NULL;
|
|
|
- zonelist->_zonerefs[pos].zone_idx = 0;
|
|
|
-}
|
|
|
-
|
|
|
-#if defined(CONFIG_64BIT)
|
|
|
-/*
|
|
|
- * Devices that require DMA32/DMA are relatively rare and do not justify a
|
|
|
- * penalty to every machine in case the specialised case applies. Default
|
|
|
- * to Node-ordering on 64-bit NUMA machines
|
|
|
- */
|
|
|
-static int default_zonelist_order(void)
|
|
|
-{
|
|
|
- return ZONELIST_ORDER_NODE;
|
|
|
-}
|
|
|
-#else
|
|
|
-/*
|
|
|
- * On 32-bit, the Normal zone needs to be preserved for allocations accessible
|
|
|
- * by the kernel. If processes running on node 0 deplete the low memory zone
|
|
|
- * then reclaim will occur more frequency increasing stalls and potentially
|
|
|
- * be easier to OOM if a large percentage of the zone is under writeback or
|
|
|
- * dirty. The problem is significantly worse if CONFIG_HIGHPTE is not set.
|
|
|
- * Hence, default to zone ordering on 32-bit.
|
|
|
- */
|
|
|
-static int default_zonelist_order(void)
|
|
|
-{
|
|
|
- return ZONELIST_ORDER_ZONE;
|
|
|
-}
|
|
|
-#endif /* CONFIG_64BIT */
|
|
|
-
|
|
|
-static void set_zonelist_order(void)
|
|
|
-{
|
|
|
- if (user_zonelist_order == ZONELIST_ORDER_DEFAULT)
|
|
|
- current_zonelist_order = default_zonelist_order();
|
|
|
- else
|
|
|
- current_zonelist_order = user_zonelist_order;
|
|
|
-}
|
|
|
-
|
|
|
static void build_zonelists(pg_data_t *pgdat)
|
|
|
{
|
|
|
int i, node, load;
|
|
|
nodemask_t used_mask;
|
|
|
int local_node, prev_node;
|
|
|
struct zonelist *zonelist;
|
|
|
- unsigned int order = current_zonelist_order;
|
|
|
|
|
|
/* initialize zonelists */
|
|
|
for (i = 0; i < MAX_ZONELISTS; i++) {
|
|
@@ -5168,15 +5047,7 @@ static void build_zonelists(pg_data_t *pgdat)
|
|
|
|
|
|
prev_node = node;
|
|
|
load--;
|
|
|
- if (order == ZONELIST_ORDER_NODE)
|
|
|
- build_zonelists_in_node_order(pgdat, node);
|
|
|
- else
|
|
|
- node_order[i++] = node; /* remember order */
|
|
|
- }
|
|
|
-
|
|
|
- if (order == ZONELIST_ORDER_ZONE) {
|
|
|
- /* calculate node order -- i.e., DMA last! */
|
|
|
- build_zonelists_in_zone_order(pgdat, i);
|
|
|
+ build_zonelists_in_node_order(pgdat, node);
|
|
|
}
|
|
|
|
|
|
build_thisnode_zonelists(pgdat);
|
|
@@ -5204,11 +5075,6 @@ static void setup_min_unmapped_ratio(void);
|
|
|
static void setup_min_slab_ratio(void);
|
|
|
#else /* CONFIG_NUMA */
|
|
|
|
|
|
-static void set_zonelist_order(void)
|
|
|
-{
|
|
|
- current_zonelist_order = ZONELIST_ORDER_ZONE;
|
|
|
-}
|
|
|
-
|
|
|
static void build_zonelists(pg_data_t *pgdat)
|
|
|
{
|
|
|
int node, local_node;
|
|
@@ -5348,8 +5214,6 @@ build_all_zonelists_init(void)
|
|
|
*/
|
|
|
void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
|
|
|
{
|
|
|
- set_zonelist_order();
|
|
|
-
|
|
|
if (system_state == SYSTEM_BOOTING) {
|
|
|
build_all_zonelists_init();
|
|
|
} else {
|
|
@@ -5375,9 +5239,8 @@ void __ref build_all_zonelists(pg_data_t *pgdat, struct zone *zone)
|
|
|
else
|
|
|
page_group_by_mobility_disabled = 0;
|
|
|
|
|
|
- pr_info("Built %i zonelists in %s order, mobility grouping %s. Total pages: %ld\n",
|
|
|
+ pr_info("Built %i zonelists, mobility grouping %s. Total pages: %ld\n",
|
|
|
nr_online_nodes,
|
|
|
- zonelist_order_name[current_zonelist_order],
|
|
|
page_group_by_mobility_disabled ? "off" : "on",
|
|
|
vm_total_pages);
|
|
|
#ifdef CONFIG_NUMA
|