|
@@ -86,8 +86,10 @@ void vm_events_fold_cpu(int cpu)
|
|
|
*
|
|
|
* vm_stat contains the global counters
|
|
|
*/
|
|
|
-atomic_long_t vm_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
|
|
|
-EXPORT_SYMBOL(vm_stat);
|
|
|
+atomic_long_t vm_zone_stat[NR_VM_ZONE_STAT_ITEMS] __cacheline_aligned_in_smp;
|
|
|
+atomic_long_t vm_node_stat[NR_VM_NODE_STAT_ITEMS] __cacheline_aligned_in_smp;
|
|
|
+EXPORT_SYMBOL(vm_zone_stat);
|
|
|
+EXPORT_SYMBOL(vm_node_stat);
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
@@ -167,19 +169,36 @@ int calculate_normal_threshold(struct zone *zone)
|
|
|
*/
|
|
|
void refresh_zone_stat_thresholds(void)
|
|
|
{
|
|
|
+ struct pglist_data *pgdat;
|
|
|
struct zone *zone;
|
|
|
int cpu;
|
|
|
int threshold;
|
|
|
|
|
|
+ /* Zero current pgdat thresholds */
|
|
|
+ for_each_online_pgdat(pgdat) {
|
|
|
+ for_each_online_cpu(cpu) {
|
|
|
+ per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold = 0;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
for_each_populated_zone(zone) {
|
|
|
+ struct pglist_data *pgdat = zone->zone_pgdat;
|
|
|
unsigned long max_drift, tolerate_drift;
|
|
|
|
|
|
threshold = calculate_normal_threshold(zone);
|
|
|
|
|
|
- for_each_online_cpu(cpu)
|
|
|
+ for_each_online_cpu(cpu) {
|
|
|
+ int pgdat_threshold;
|
|
|
+
|
|
|
per_cpu_ptr(zone->pageset, cpu)->stat_threshold
|
|
|
= threshold;
|
|
|
|
|
|
+ /* Base nodestat threshold on the largest populated zone. */
|
|
|
+ pgdat_threshold = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold;
|
|
|
+ per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->stat_threshold
|
|
|
+ = max(threshold, pgdat_threshold);
|
|
|
+ }
|
|
|
+
|
|
|
/*
|
|
|
* Only set percpu_drift_mark if there is a danger that
|
|
|
* NR_FREE_PAGES reports the low watermark is ok when in fact
|
|
@@ -238,6 +257,26 @@ void __mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
|
|
|
}
|
|
|
EXPORT_SYMBOL(__mod_zone_page_state);
|
|
|
|
|
|
+void __mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
|
|
|
+ long delta)
|
|
|
+{
|
|
|
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
|
|
|
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
|
|
|
+ long x;
|
|
|
+ long t;
|
|
|
+
|
|
|
+ x = delta + __this_cpu_read(*p);
|
|
|
+
|
|
|
+ t = __this_cpu_read(pcp->stat_threshold);
|
|
|
+
|
|
|
+ if (unlikely(x > t || x < -t)) {
|
|
|
+ node_page_state_add(x, pgdat, item);
|
|
|
+ x = 0;
|
|
|
+ }
|
|
|
+ __this_cpu_write(*p, x);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(__mod_node_page_state);
|
|
|
+
|
|
|
/*
|
|
|
* Optimized increment and decrement functions.
|
|
|
*
|
|
@@ -277,12 +316,34 @@ void __inc_zone_state(struct zone *zone, enum zone_stat_item item)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+void __inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
|
|
|
+{
|
|
|
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
|
|
|
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
|
|
|
+ s8 v, t;
|
|
|
+
|
|
|
+ v = __this_cpu_inc_return(*p);
|
|
|
+ t = __this_cpu_read(pcp->stat_threshold);
|
|
|
+ if (unlikely(v > t)) {
|
|
|
+ s8 overstep = t >> 1;
|
|
|
+
|
|
|
+ node_page_state_add(v + overstep, pgdat, item);
|
|
|
+ __this_cpu_write(*p, -overstep);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
void __inc_zone_page_state(struct page *page, enum zone_stat_item item)
|
|
|
{
|
|
|
__inc_zone_state(page_zone(page), item);
|
|
|
}
|
|
|
EXPORT_SYMBOL(__inc_zone_page_state);
|
|
|
|
|
|
+void __inc_node_page_state(struct page *page, enum node_stat_item item)
|
|
|
+{
|
|
|
+ __inc_node_state(page_pgdat(page), item);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(__inc_node_page_state);
|
|
|
+
|
|
|
void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
|
|
|
{
|
|
|
struct per_cpu_pageset __percpu *pcp = zone->pageset;
|
|
@@ -299,12 +360,34 @@ void __dec_zone_state(struct zone *zone, enum zone_stat_item item)
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+void __dec_node_state(struct pglist_data *pgdat, enum node_stat_item item)
|
|
|
+{
|
|
|
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
|
|
|
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
|
|
|
+ s8 v, t;
|
|
|
+
|
|
|
+ v = __this_cpu_dec_return(*p);
|
|
|
+ t = __this_cpu_read(pcp->stat_threshold);
|
|
|
+ if (unlikely(v < - t)) {
|
|
|
+ s8 overstep = t >> 1;
|
|
|
+
|
|
|
+ node_page_state_add(v - overstep, pgdat, item);
|
|
|
+ __this_cpu_write(*p, overstep);
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
void __dec_zone_page_state(struct page *page, enum zone_stat_item item)
|
|
|
{
|
|
|
__dec_zone_state(page_zone(page), item);
|
|
|
}
|
|
|
EXPORT_SYMBOL(__dec_zone_page_state);
|
|
|
|
|
|
+void __dec_node_page_state(struct page *page, enum node_stat_item item)
|
|
|
+{
|
|
|
+ __dec_node_state(page_pgdat(page), item);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(__dec_node_page_state);
|
|
|
+
|
|
|
#ifdef CONFIG_HAVE_CMPXCHG_LOCAL
|
|
|
/*
|
|
|
* If we have cmpxchg_local support then we do not need to incur the overhead
|
|
@@ -318,8 +401,8 @@ EXPORT_SYMBOL(__dec_zone_page_state);
|
|
|
* 1 Overstepping half of threshold
|
|
|
* -1 Overstepping minus half of threshold
|
|
|
*/
|
|
|
-static inline void mod_state(struct zone *zone, enum zone_stat_item item,
|
|
|
- long delta, int overstep_mode)
|
|
|
+static inline void mod_zone_state(struct zone *zone,
|
|
|
+ enum zone_stat_item item, long delta, int overstep_mode)
|
|
|
{
|
|
|
struct per_cpu_pageset __percpu *pcp = zone->pageset;
|
|
|
s8 __percpu *p = pcp->vm_stat_diff + item;
|
|
@@ -359,26 +442,88 @@ static inline void mod_state(struct zone *zone, enum zone_stat_item item,
|
|
|
void mod_zone_page_state(struct zone *zone, enum zone_stat_item item,
|
|
|
long delta)
|
|
|
{
|
|
|
- mod_state(zone, item, delta, 0);
|
|
|
+ mod_zone_state(zone, item, delta, 0);
|
|
|
}
|
|
|
EXPORT_SYMBOL(mod_zone_page_state);
|
|
|
|
|
|
void inc_zone_state(struct zone *zone, enum zone_stat_item item)
|
|
|
{
|
|
|
- mod_state(zone, item, 1, 1);
|
|
|
+ mod_zone_state(zone, item, 1, 1);
|
|
|
}
|
|
|
|
|
|
void inc_zone_page_state(struct page *page, enum zone_stat_item item)
|
|
|
{
|
|
|
- mod_state(page_zone(page), item, 1, 1);
|
|
|
+ mod_zone_state(page_zone(page), item, 1, 1);
|
|
|
}
|
|
|
EXPORT_SYMBOL(inc_zone_page_state);
|
|
|
|
|
|
void dec_zone_page_state(struct page *page, enum zone_stat_item item)
|
|
|
{
|
|
|
- mod_state(page_zone(page), item, -1, -1);
|
|
|
+ mod_zone_state(page_zone(page), item, -1, -1);
|
|
|
}
|
|
|
EXPORT_SYMBOL(dec_zone_page_state);
|
|
|
+
|
|
|
+static inline void mod_node_state(struct pglist_data *pgdat,
|
|
|
+ enum node_stat_item item, int delta, int overstep_mode)
|
|
|
+{
|
|
|
+ struct per_cpu_nodestat __percpu *pcp = pgdat->per_cpu_nodestats;
|
|
|
+ s8 __percpu *p = pcp->vm_node_stat_diff + item;
|
|
|
+ long o, n, t, z;
|
|
|
+
|
|
|
+ do {
|
|
|
+ z = 0; /* overflow to node counters */
|
|
|
+
|
|
|
+ /*
|
|
|
+ * The fetching of the stat_threshold is racy. We may apply
|
|
|
+ * a counter threshold to the wrong cpu if we get
|
|
|
+ * rescheduled while executing here. However, the next
|
|
|
+ * counter update will apply the threshold again and
|
|
|
+ * therefore bring the counter under the threshold again.
|
|
|
+ *
|
|
|
+ * Most of the time the thresholds are the same anyway
|
|
|
+ * for all cpus in a node.
|
|
|
+ */
|
|
|
+ t = this_cpu_read(pcp->stat_threshold);
|
|
|
+
|
|
|
+ o = this_cpu_read(*p);
|
|
|
+ n = delta + o;
|
|
|
+
|
|
|
+ if (n > t || n < -t) {
|
|
|
+ int os = overstep_mode * (t >> 1) ;
|
|
|
+
|
|
|
+ /* Overflow must be added to node counters */
|
|
|
+ z = n + os;
|
|
|
+ n = -os;
|
|
|
+ }
|
|
|
+ } while (this_cpu_cmpxchg(*p, o, n) != o);
|
|
|
+
|
|
|
+ if (z)
|
|
|
+ node_page_state_add(z, pgdat, item);
|
|
|
+}
|
|
|
+
|
|
|
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
|
|
|
+ long delta)
|
|
|
+{
|
|
|
+ mod_node_state(pgdat, item, delta, 0);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(mod_node_page_state);
|
|
|
+
|
|
|
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
|
|
|
+{
|
|
|
+ mod_node_state(pgdat, item, 1, 1);
|
|
|
+}
|
|
|
+
|
|
|
+void inc_node_page_state(struct page *page, enum node_stat_item item)
|
|
|
+{
|
|
|
+ mod_node_state(page_pgdat(page), item, 1, 1);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(inc_node_page_state);
|
|
|
+
|
|
|
+void dec_node_page_state(struct page *page, enum node_stat_item item)
|
|
|
+{
|
|
|
+ mod_node_state(page_pgdat(page), item, -1, -1);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(dec_node_page_state);
|
|
|
#else
|
|
|
/*
|
|
|
* Use interrupt disable to serialize counter updates
|
|
@@ -424,21 +569,69 @@ void dec_zone_page_state(struct page *page, enum zone_stat_item item)
|
|
|
local_irq_restore(flags);
|
|
|
}
|
|
|
EXPORT_SYMBOL(dec_zone_page_state);
|
|
|
-#endif
|
|
|
|
|
|
+void inc_node_state(struct pglist_data *pgdat, enum node_stat_item item)
|
|
|
+{
|
|
|
+ unsigned long flags;
|
|
|
+
|
|
|
+ local_irq_save(flags);
|
|
|
+ __inc_node_state(pgdat, item);
|
|
|
+ local_irq_restore(flags);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(inc_node_state);
|
|
|
+
|
|
|
+void mod_node_page_state(struct pglist_data *pgdat, enum node_stat_item item,
|
|
|
+ long delta)
|
|
|
+{
|
|
|
+ unsigned long flags;
|
|
|
+
|
|
|
+ local_irq_save(flags);
|
|
|
+ __mod_node_page_state(pgdat, item, delta);
|
|
|
+ local_irq_restore(flags);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(mod_node_page_state);
|
|
|
+
|
|
|
+void inc_node_page_state(struct page *page, enum node_stat_item item)
|
|
|
+{
|
|
|
+ unsigned long flags;
|
|
|
+ struct pglist_data *pgdat;
|
|
|
+
|
|
|
+ pgdat = page_pgdat(page);
|
|
|
+ local_irq_save(flags);
|
|
|
+ __inc_node_state(pgdat, item);
|
|
|
+ local_irq_restore(flags);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(inc_node_page_state);
|
|
|
+
|
|
|
+void dec_node_page_state(struct page *page, enum node_stat_item item)
|
|
|
+{
|
|
|
+ unsigned long flags;
|
|
|
+
|
|
|
+ local_irq_save(flags);
|
|
|
+ __dec_node_page_state(page, item);
|
|
|
+ local_irq_restore(flags);
|
|
|
+}
|
|
|
+EXPORT_SYMBOL(dec_node_page_state);
|
|
|
+#endif
|
|
|
|
|
|
/*
|
|
|
* Fold a differential into the global counters.
|
|
|
* Returns the number of counters updated.
|
|
|
*/
|
|
|
-static int fold_diff(int *diff)
|
|
|
+static int fold_diff(int *zone_diff, int *node_diff)
|
|
|
{
|
|
|
int i;
|
|
|
int changes = 0;
|
|
|
|
|
|
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++)
|
|
|
- if (diff[i]) {
|
|
|
- atomic_long_add(diff[i], &vm_stat[i]);
|
|
|
+ if (zone_diff[i]) {
|
|
|
+ atomic_long_add(zone_diff[i], &vm_zone_stat[i]);
|
|
|
+ changes++;
|
|
|
+ }
|
|
|
+
|
|
|
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
|
|
|
+ if (node_diff[i]) {
|
|
|
+ atomic_long_add(node_diff[i], &vm_node_stat[i]);
|
|
|
changes++;
|
|
|
}
|
|
|
return changes;
|
|
@@ -462,9 +655,11 @@ static int fold_diff(int *diff)
|
|
|
*/
|
|
|
static int refresh_cpu_vm_stats(bool do_pagesets)
|
|
|
{
|
|
|
+ struct pglist_data *pgdat;
|
|
|
struct zone *zone;
|
|
|
int i;
|
|
|
- int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
|
|
|
+ int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
|
|
|
+ int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
|
|
|
int changes = 0;
|
|
|
|
|
|
for_each_populated_zone(zone) {
|
|
@@ -477,7 +672,7 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
|
|
|
if (v) {
|
|
|
|
|
|
atomic_long_add(v, &zone->vm_stat[i]);
|
|
|
- global_diff[i] += v;
|
|
|
+ global_zone_diff[i] += v;
|
|
|
#ifdef CONFIG_NUMA
|
|
|
/* 3 seconds idle till flush */
|
|
|
__this_cpu_write(p->expire, 3);
|
|
@@ -516,7 +711,22 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
|
|
|
}
|
|
|
#endif
|
|
|
}
|
|
|
- changes += fold_diff(global_diff);
|
|
|
+
|
|
|
+ for_each_online_pgdat(pgdat) {
|
|
|
+ struct per_cpu_nodestat __percpu *p = pgdat->per_cpu_nodestats;
|
|
|
+
|
|
|
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++) {
|
|
|
+ int v;
|
|
|
+
|
|
|
+ v = this_cpu_xchg(p->vm_node_stat_diff[i], 0);
|
|
|
+ if (v) {
|
|
|
+ atomic_long_add(v, &pgdat->vm_stat[i]);
|
|
|
+ global_node_diff[i] += v;
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ changes += fold_diff(global_zone_diff, global_node_diff);
|
|
|
return changes;
|
|
|
}
|
|
|
|
|
@@ -527,9 +737,11 @@ static int refresh_cpu_vm_stats(bool do_pagesets)
|
|
|
*/
|
|
|
void cpu_vm_stats_fold(int cpu)
|
|
|
{
|
|
|
+ struct pglist_data *pgdat;
|
|
|
struct zone *zone;
|
|
|
int i;
|
|
|
- int global_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
|
|
|
+ int global_zone_diff[NR_VM_ZONE_STAT_ITEMS] = { 0, };
|
|
|
+ int global_node_diff[NR_VM_NODE_STAT_ITEMS] = { 0, };
|
|
|
|
|
|
for_each_populated_zone(zone) {
|
|
|
struct per_cpu_pageset *p;
|
|
@@ -543,11 +755,27 @@ void cpu_vm_stats_fold(int cpu)
|
|
|
v = p->vm_stat_diff[i];
|
|
|
p->vm_stat_diff[i] = 0;
|
|
|
atomic_long_add(v, &zone->vm_stat[i]);
|
|
|
- global_diff[i] += v;
|
|
|
+ global_zone_diff[i] += v;
|
|
|
}
|
|
|
}
|
|
|
|
|
|
- fold_diff(global_diff);
|
|
|
+ for_each_online_pgdat(pgdat) {
|
|
|
+ struct per_cpu_nodestat *p;
|
|
|
+
|
|
|
+ p = per_cpu_ptr(pgdat->per_cpu_nodestats, cpu);
|
|
|
+
|
|
|
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
|
|
|
+ if (p->vm_node_stat_diff[i]) {
|
|
|
+ int v;
|
|
|
+
|
|
|
+ v = p->vm_node_stat_diff[i];
|
|
|
+ p->vm_node_stat_diff[i] = 0;
|
|
|
+ atomic_long_add(v, &pgdat->vm_stat[i]);
|
|
|
+ global_node_diff[i] += v;
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ fold_diff(global_zone_diff, global_node_diff);
|
|
|
}
|
|
|
|
|
|
/*
|
|
@@ -563,16 +791,19 @@ void drain_zonestat(struct zone *zone, struct per_cpu_pageset *pset)
|
|
|
int v = pset->vm_stat_diff[i];
|
|
|
pset->vm_stat_diff[i] = 0;
|
|
|
atomic_long_add(v, &zone->vm_stat[i]);
|
|
|
- atomic_long_add(v, &vm_stat[i]);
|
|
|
+ atomic_long_add(v, &vm_zone_stat[i]);
|
|
|
}
|
|
|
}
|
|
|
#endif
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
|
|
/*
|
|
|
- * Determine the per node value of a stat item.
|
|
|
+ * Determine the per node value of a stat item. This function
|
|
|
+ * is called frequently in a NUMA machine, so try to be as
|
|
|
+ * frugal as possible.
|
|
|
*/
|
|
|
-unsigned long node_page_state(int node, enum zone_stat_item item)
|
|
|
+unsigned long sum_zone_node_page_state(int node,
|
|
|
+ enum zone_stat_item item)
|
|
|
{
|
|
|
struct zone *zones = NODE_DATA(node)->node_zones;
|
|
|
int i;
|
|
@@ -584,6 +815,19 @@ unsigned long node_page_state(int node, enum zone_stat_item item)
|
|
|
return count;
|
|
|
}
|
|
|
|
|
|
+/*
|
|
|
+ * Determine the per node value of a stat item.
|
|
|
+ */
|
|
|
+unsigned long node_page_state(struct pglist_data *pgdat,
|
|
|
+ enum node_stat_item item)
|
|
|
+{
|
|
|
+ long x = atomic_long_read(&pgdat->vm_stat[item]);
|
|
|
+#ifdef CONFIG_SMP
|
|
|
+ if (x < 0)
|
|
|
+ x = 0;
|
|
|
+#endif
|
|
|
+ return x;
|
|
|
+}
|
|
|
#endif
|
|
|
|
|
|
#ifdef CONFIG_COMPACTION
|
|
@@ -1287,6 +1531,7 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
|
|
|
if (*pos >= ARRAY_SIZE(vmstat_text))
|
|
|
return NULL;
|
|
|
stat_items_size = NR_VM_ZONE_STAT_ITEMS * sizeof(unsigned long) +
|
|
|
+ NR_VM_NODE_STAT_ITEMS * sizeof(unsigned long) +
|
|
|
NR_VM_WRITEBACK_STAT_ITEMS * sizeof(unsigned long);
|
|
|
|
|
|
#ifdef CONFIG_VM_EVENT_COUNTERS
|
|
@@ -1301,6 +1546,10 @@ static void *vmstat_start(struct seq_file *m, loff_t *pos)
|
|
|
v[i] = global_page_state(i);
|
|
|
v += NR_VM_ZONE_STAT_ITEMS;
|
|
|
|
|
|
+ for (i = 0; i < NR_VM_NODE_STAT_ITEMS; i++)
|
|
|
+ v[i] = global_node_page_state(i);
|
|
|
+ v += NR_VM_NODE_STAT_ITEMS;
|
|
|
+
|
|
|
global_dirty_limits(v + NR_DIRTY_BG_THRESHOLD,
|
|
|
v + NR_DIRTY_THRESHOLD);
|
|
|
v += NR_VM_WRITEBACK_STAT_ITEMS;
|
|
@@ -1390,7 +1639,7 @@ int vmstat_refresh(struct ctl_table *table, int write,
|
|
|
if (err)
|
|
|
return err;
|
|
|
for (i = 0; i < NR_VM_ZONE_STAT_ITEMS; i++) {
|
|
|
- val = atomic_long_read(&vm_stat[i]);
|
|
|
+ val = atomic_long_read(&vm_zone_stat[i]);
|
|
|
if (val < 0) {
|
|
|
switch (i) {
|
|
|
case NR_ALLOC_BATCH:
|