@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages;
 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 long total_swap_pages;
 static int least_priority;
-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
 
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
 static const char Bad_offset[] = "Bad swap offset entry ";
 static const char Unused_offset[] = "Unused swap offset entry ";
 
-struct swap_list_t swap_list = {-1, -1};
+/*
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+LIST_HEAD(swap_list_head);
 
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
 
@@ -640,66 +643,54 @@ no_page:
 
 swp_entry_t get_swap_page(void)
 {
-	struct swap_info_struct *si;
+	struct swap_info_struct *si, *next;
 	pgoff_t offset;
-	int type, next;
-	int wrapped = 0;
-	int hp_index;
+	struct list_head *tmp;
 
 	spin_lock(&swap_lock);
 	if (atomic_long_read(&nr_swap_pages) <= 0)
 		goto noswap;
 	atomic_long_dec(&nr_swap_pages);
 
-	for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
-		hp_index = atomic_xchg(&highest_priority_index, -1);
-		/*
-		 * highest_priority_index records current highest priority swap
-		 * type which just frees swap entries. If its priority is
-		 * higher than that of swap_list.next swap type, we use it. It
-		 * isn't protected by swap_lock, so it can be an invalid value
-		 * if the corresponding swap type is swapoff. We double check
-		 * the flags here. It's even possible the swap type is swapoff
-		 * and swapon again and its priority is changed. In such rare
-		 * case, low prority swap type might be used, but eventually
-		 * high priority swap will be used after several rounds of
-		 * swap.
-		 */
-		if (hp_index != -1 && hp_index != type &&
-		    swap_info[type]->prio < swap_info[hp_index]->prio &&
-		    (swap_info[hp_index]->flags & SWP_WRITEOK)) {
-			type = hp_index;
-			swap_list.next = type;
-		}
-
-		si = swap_info[type];
-		next = si->next;
-		if (next < 0 ||
-		    (!wrapped && si->prio != swap_info[next]->prio)) {
-			next = swap_list.head;
-			wrapped++;
-		}
-
+	list_for_each(tmp, &swap_list_head) {
+		si = list_entry(tmp, typeof(*si), list);
 		spin_lock(&si->lock);
-		if (!si->highest_bit) {
-			spin_unlock(&si->lock);
-			continue;
-		}
-		if (!(si->flags & SWP_WRITEOK)) {
+		if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
 			spin_unlock(&si->lock);
 			continue;
 		}
 
-		swap_list.next = next;
+		/*
+		 * rotate the current swap_info that we're going to use
+		 * to after any other swap_info that have the same prio,
+		 * so that all equal-priority swap_info get used equally
+		 */
+		next = si;
+		list_for_each_entry_continue(next, &swap_list_head, list) {
+			if (si->prio != next->prio)
+				break;
+			list_rotate_left(&si->list);
+			next = si;
+		}
 
 		spin_unlock(&swap_lock);
 		/* This is called for allocating swap entry for cache */
 		offset = scan_swap_map(si, SWAP_HAS_CACHE);
 		spin_unlock(&si->lock);
 		if (offset)
-			return swp_entry(type, offset);
+			return swp_entry(si->type, offset);
 		spin_lock(&swap_lock);
-		next = swap_list.next;
+		/*
+		 * if we got here, it's likely that si was almost full before,
+		 * and since scan_swap_map() can drop the si->lock, multiple
+		 * callers probably all tried to get a page from the same si
+		 * and it filled up before we could get one. So we need to
+		 * try again. Since we dropped the swap_lock, there may now
+		 * be non-full higher priority swap_infos, and this si may have
+		 * even been removed from the list (although very unlikely).
+		 * Let's start over.
+		 */
+		tmp = &swap_list_head;
 	}
 
 	atomic_long_inc(&nr_swap_pages);
@@ -766,27 +757,6 @@ out:
 	return NULL;
 }
 
-/*
- * This swap type frees swap entry, check if it is the highest priority swap
- * type which just frees swap entry. get_swap_page() uses
- * highest_priority_index to search highest priority swap type. The
- * swap_info_struct.lock can't protect us if there are multiple swap types
- * active, so we use atomic_cmpxchg.
- */
-static void set_highest_priority_index(int type)
-{
-	int old_hp_index, new_hp_index;
-
-	do {
-		old_hp_index = atomic_read(&highest_priority_index);
-		if (old_hp_index != -1 &&
-		    swap_info[old_hp_index]->prio >= swap_info[type]->prio)
-			break;
-		new_hp_index = type;
-	} while (atomic_cmpxchg(&highest_priority_index,
-		old_hp_index, new_hp_index) != old_hp_index);
-}
-
 static unsigned char swap_entry_free(struct swap_info_struct *p,
 				     swp_entry_t entry, unsigned char usage)
 {
@@ -830,7 +800,6 @@ static unsigned char swap_entry_free(struct swap_info_struct *p,
 			p->lowest_bit = offset;
 		if (offset > p->highest_bit)
 			p->highest_bit = offset;
-		set_highest_priority_index(p->type);
 		atomic_long_inc(&nr_swap_pages);
 		p->inuse_pages--;
 		frontswap_invalidate_page(p->type, offset);
@@ -1765,7 +1734,7 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 				unsigned char *swap_map,
 				struct swap_cluster_info *cluster_info)
 {
-	int i, prev;
+	struct swap_info_struct *si;
 
 	if (prio >= 0)
 		p->prio = prio;
@@ -1777,18 +1746,28 @@ static void _enable_swap_info(struct swap_info_struct *p, int prio,
 	atomic_long_add(p->pages, &nr_swap_pages);
 	total_swap_pages += p->pages;
 
-	/* insert swap space into swap_list: */
-	prev = -1;
-	for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
-		if (p->prio >= swap_info[i]->prio)
-			break;
-		prev = i;
+	assert_spin_locked(&swap_lock);
+	BUG_ON(!list_empty(&p->list));
+	/*
+	 * insert into swap list; the list is in priority order,
+	 * so that get_swap_page() can get a page from the highest
+	 * priority swap_info_struct with available page(s), and
+	 * swapoff can adjust the auto-assigned (i.e. negative) prio
+	 * values for any lower-priority swap_info_structs when
+	 * removing a negative-prio swap_info_struct
+	 */
+	list_for_each_entry(si, &swap_list_head, list) {
+		if (p->prio >= si->prio) {
+			list_add_tail(&p->list, &si->list);
+			return;
+		}
 	}
-	p->next = i;
-	if (prev < 0)
-		swap_list.head = swap_list.next = p->type;
-	else
-		swap_info[prev]->next = p->type;
+	/*
+	 * this covers two cases:
+	 * 1) p->prio is less than all existing prio
+	 * 2) the swap list is empty
+	 */
+	list_add_tail(&p->list, &swap_list_head);
 }
 
 static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1823,8 +1802,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	struct address_space *mapping;
 	struct inode *inode;
 	struct filename *pathname;
-	int i, type, prev;
-	int err;
+	int err, found = 0;
 	unsigned int old_block_size;
 
 	if (!capable(CAP_SYS_ADMIN))
@@ -1842,17 +1820,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		goto out;
 
 	mapping = victim->f_mapping;
-	prev = -1;
 	spin_lock(&swap_lock);
-	for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
-		p = swap_info[type];
+	list_for_each_entry(p, &swap_list_head, list) {
 		if (p->flags & SWP_WRITEOK) {
-			if (p->swap_file->f_mapping == mapping)
+			if (p->swap_file->f_mapping == mapping) {
+				found = 1;
 				break;
+			}
 		}
-		prev = type;
 	}
-	if (type < 0) {
+	if (!found) {
 		err = -EINVAL;
 		spin_unlock(&swap_lock);
 		goto out_dput;
@@ -1864,20 +1841,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		spin_unlock(&swap_lock);
 		goto out_dput;
 	}
-	if (prev < 0)
-		swap_list.head = p->next;
-	else
-		swap_info[prev]->next = p->next;
-	if (type == swap_list.next) {
-		/* just pick something that's safe... */
-		swap_list.next = swap_list.head;
-	}
 	spin_lock(&p->lock);
 	if (p->prio < 0) {
-		for (i = p->next; i >= 0; i = swap_info[i]->next)
-			swap_info[i]->prio = p->prio--;
+		struct swap_info_struct *si = p;
+
+		list_for_each_entry_continue(si, &swap_list_head, list) {
+			si->prio++;
+		}
 		least_priority++;
 	}
+	list_del_init(&p->list);
 	atomic_long_sub(p->pages, &nr_swap_pages);
 	total_swap_pages -= p->pages;
 	p->flags &= ~SWP_WRITEOK;
@@ -1885,7 +1858,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	spin_unlock(&swap_lock);
 
 	set_current_oom_origin();
-	err = try_to_unuse(type, false, 0); /* force all pages to be unused */
+	err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
 	clear_current_oom_origin();
 
 	if (err) {
@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	frontswap_map = frontswap_map_get(p);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
-	frontswap_invalidate_area(type);
+	frontswap_invalidate_area(p->type);
 	frontswap_map_set(p, NULL);
 	mutex_unlock(&swapon_mutex);
 	free_percpu(p->percpu_cluster);
@@ -1935,7 +1908,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	vfree(cluster_info);
 	vfree(frontswap_map);
 	/* Destroy swap account information */
-	swap_cgroup_swapoff(type);
+	swap_cgroup_swapoff(p->type);
 
 	inode = mapping->host;
 	if (S_ISBLK(inode->i_mode)) {
@@ -2142,8 +2115,8 @@ static struct swap_info_struct *alloc_swap_info(void)
 		 */
 	}
 	INIT_LIST_HEAD(&p->first_swap_extent.list);
+	INIT_LIST_HEAD(&p->list);
 	p->flags = SWP_USED;
-	p->next = -1;
 	spin_unlock(&swap_lock);
 	spin_lock_init(&p->lock);