
Merge branch 'akpm' (patches from Andrew Morton)

Merge a bunch of fixes from Andrew Morton:
 "Commit 579f82901f6f ("swap: add a simple detector for inappropriate
  swapin readahead") is a feature.  No probs if you decide to defer it
  until the next merge window.

  It has been sitting in my tree for over a year because of my dislike
  of all the magic numbers, but recent discussion with Hugh has made me
  give up"

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
  mm: __set_page_dirty uses spin_lock_irqsave instead of spin_lock_irq
  arch/x86/mm/numa.c: fix array index overflow when synchronizing nid to memblock.reserved.
  arch/x86/mm/numa.c: initialize numa_kernel_nodes in numa_clear_kernel_node_hotplug()
  mm: __set_page_dirty_nobuffers() uses spin_lock_irqsave() instead of spin_lock_irq()
  mm/swap: fix race on swap_info reuse between swapoff and swapon
  swap: add a simple detector for inappropriate swapin readahead
  ocfs2: free allocated clusters if error occurs after ocfs2_claim_clusters
  Documentation/kernel-parameters.txt: fix memmap= language
Linus Torvalds, 11 years ago
Commit 9343224bfd
10 files changed, 178 insertions(+), 26 deletions(-)
  Documentation/kernel-parameters.txt | +4  -4
  arch/x86/mm/numa.c                  | +12 -9
  fs/buffer.c                         | +4  -2
  fs/ocfs2/alloc.c                    | +35 -3
  fs/ocfs2/localalloc.c               | +42 -0
  fs/ocfs2/localalloc.h               | +6  -0
  include/linux/page-flags.h          | +2  -2
  mm/page-writeback.c                 | +3  -2
  mm/swap_state.c                     | +60 -3
  mm/swapfile.c                       | +10 -1

+ 4 - 4
Documentation/kernel-parameters.txt

@@ -1726,16 +1726,16 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			option description.
 
 	memmap=nn[KMG]@ss[KMG]
-			[KNL] Force usage of a specific region of memory
-			Region of memory to be used, from ss to ss+nn.
+			[KNL] Force usage of a specific region of memory.
+			Region of memory to be used is from ss to ss+nn.
 
 	memmap=nn[KMG]#ss[KMG]
 			[KNL,ACPI] Mark specific memory as ACPI data.
-			Region of memory to be used, from ss to ss+nn.
+			Region of memory to be marked is from ss to ss+nn.
 
 	memmap=nn[KMG]$ss[KMG]
 			[KNL,ACPI] Mark specific memory as reserved.
-			Region of memory to be used, from ss to ss+nn.
+			Region of memory to be reserved is from ss to ss+nn.
 			Example: Exclude memory from 0x18690000-0x1869ffff
 			         memmap=64K$0x18690000
 			         or
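
As a quick illustration of the corrected wording (a hypothetical boot line, not part of this patch):

	memmap=512M@0x10000000

would force the kernel to use the 512M region running from physical address 0x10000000 to 0x10000000+512M, i.e. "from ss to ss+nn".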

+ 12 - 9
arch/x86/mm/numa.c

@@ -493,14 +493,6 @@ static int __init numa_register_memblks(struct numa_meminfo *mi)
 		struct numa_memblk *mb = &mi->blk[i];
 		memblock_set_node(mb->start, mb->end - mb->start,
 				  &memblock.memory, mb->nid);
-
-		/*
-		 * At this time, all memory regions reserved by memblock are
-		 * used by the kernel. Set the nid in memblock.reserved will
-		 * mark out all the nodes the kernel resides in.
-		 */
-		memblock_set_node(mb->start, mb->end - mb->start,
-				  &memblock.reserved, mb->nid);
 	}
 
 	/*
@@ -565,10 +557,21 @@ static void __init numa_init_array(void)
 static void __init numa_clear_kernel_node_hotplug(void)
 {
 	int i, nid;
-	nodemask_t numa_kernel_nodes;
+	nodemask_t numa_kernel_nodes = NODE_MASK_NONE;
 	unsigned long start, end;
 	struct memblock_type *type = &memblock.reserved;
 
+	/*
+	 * At this time, all memory regions reserved by memblock are
+	 * used by the kernel. Set the nid in memblock.reserved will
+	 * mark out all the nodes the kernel resides in.
+	 */
+	for (i = 0; i < numa_meminfo.nr_blks; i++) {
+		struct numa_memblk *mb = &numa_meminfo.blk[i];
+		memblock_set_node(mb->start, mb->end - mb->start,
+				  &memblock.reserved, mb->nid);
+	}
+
 	/* Mark all kernel nodes. */
 	for (i = 0; i < type->cnt; i++)
 		node_set(type->regions[i].nid, numa_kernel_nodes);
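
Besides relocating the nid synchronization, the hunk above initializes the on-stack nodemask. Without NODE_MASK_NONE the mask starts out as stack garbage, so bits that were never node_set() can read as set. A minimal sketch of the bug class (illustrative, not the kernel code):

	nodemask_t bad;				/* uninitialized: random stack bits */
	node_set(0, bad);
	/* node_isset(2, bad) may be true purely by accident */

	nodemask_t good = NODE_MASK_NONE;	/* every bit known-clear */
	node_set(0, good);
	/* node_isset(2, good) is reliably false */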

+ 4 - 2
fs/buffer.c

@@ -654,14 +654,16 @@ EXPORT_SYMBOL(mark_buffer_dirty_inode);
 static void __set_page_dirty(struct page *page,
 		struct address_space *mapping, int warn)
 {
-	spin_lock_irq(&mapping->tree_lock);
+	unsigned long flags;
+
+	spin_lock_irqsave(&mapping->tree_lock, flags);
 	if (page->mapping) {	/* Race with truncate? */
 		WARN_ON_ONCE(warn && !PageUptodate(page));
 		account_page_dirtied(page, mapping);
 		radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 	}
-	spin_unlock_irq(&mapping->tree_lock);
+	spin_unlock_irqrestore(&mapping->tree_lock, flags);
 	__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
 }
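
The irqsave variant matters because __set_page_dirty() can be reached from contexts that already run with interrupts disabled; spin_unlock_irq() would unconditionally re-enable them. A generic sketch of the hazard (not this function):

	spinlock_t lock;			/* assume initialized elsewhere */
	unsigned long outer, inner;

	local_irq_save(outer);			/* caller already has irqs off */
	spin_lock_irq(&lock);
	spin_unlock_irq(&lock);			/* BUG: re-enables irqs behind the caller's back */

	spin_lock_irqsave(&lock, inner);
	spin_unlock_irqrestore(&lock, inner);	/* restores exactly the caller's irq state */
	local_irq_restore(outer);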
 

+ 35 - 3
fs/ocfs2/alloc.c

@@ -4742,6 +4742,7 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 				enum ocfs2_alloc_restarted *reason_ret)
 {
 	int status = 0, err = 0;
+	int need_free = 0;
 	int free_extents;
 	enum ocfs2_alloc_restarted reason = RESTART_NONE;
 	u32 bit_off, num_bits;
@@ -4796,7 +4797,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 					      OCFS2_JOURNAL_ACCESS_WRITE);
 	if (status < 0) {
 		mlog_errno(status);
-		goto leave;
+		need_free = 1;
+		goto bail;
 	}
 
 	block = ocfs2_clusters_to_blocks(osb->sb, bit_off);
@@ -4807,7 +4809,8 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 				     num_bits, flags, meta_ac);
 	if (status < 0) {
 		mlog_errno(status);
-		goto leave;
+		need_free = 1;
+		goto bail;
 	}
 
 	ocfs2_journal_dirty(handle, et->et_root_bh);
@@ -4821,6 +4824,19 @@ int ocfs2_add_clusters_in_btree(handle_t *handle,
 		reason = RESTART_TRANS;
 	}
 
+bail:
+	if (need_free) {
+		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+					bit_off, num_bits);
+		else
+			ocfs2_free_clusters(handle,
+					data_ac->ac_inode,
+					data_ac->ac_bh,
+					ocfs2_clusters_to_blocks(osb->sb, bit_off),
+					num_bits);
+	}
+
 leave:
 	if (reason_ret)
 		*reason_ret = reason;
@@ -6805,6 +6821,8 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 					 struct buffer_head *di_bh)
 {
 	int ret, i, has_data, num_pages = 0;
+	int need_free = 0;
+	u32 bit_off, num;
 	handle_t *handle;
 	u64 uninitialized_var(block);
 	struct ocfs2_inode_info *oi = OCFS2_I(inode);
@@ -6850,7 +6868,6 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 	}
 
 	if (has_data) {
-		u32 bit_off, num;
 		unsigned int page_end;
 		u64 phys;
 
@@ -6886,6 +6903,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages);
 		if (ret) {
 			mlog_errno(ret);
+			need_free = 1;
 			goto out_commit;
 		}
 
@@ -6896,6 +6914,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		ret = ocfs2_read_inline_data(inode, pages[0], di_bh);
 		if (ret) {
 			mlog_errno(ret);
+			need_free = 1;
 			goto out_commit;
 		}
 
@@ -6927,6 +6946,7 @@ int ocfs2_convert_inline_data_to_extents(struct inode *inode,
 		ret = ocfs2_insert_extent(handle, &et, 0, block, 1, 0, NULL);
 		if (ret) {
 			mlog_errno(ret);
+			need_free = 1;
 			goto out_commit;
 		}
 
@@ -6938,6 +6958,18 @@ out_commit:
 		dquot_free_space_nodirty(inode,
 					  ocfs2_clusters_to_bytes(osb->sb, 1));
 
+	if (need_free) {
+		if (data_ac->ac_which == OCFS2_AC_USE_LOCAL)
+			ocfs2_free_local_alloc_bits(osb, handle, data_ac,
+					bit_off, num);
+		else
+			ocfs2_free_clusters(handle,
+					data_ac->ac_inode,
+					data_ac->ac_bh,
+					ocfs2_clusters_to_blocks(osb->sb, bit_off),
+					num);
+	}
+
 	ocfs2_commit_trans(osb, handle);
 
 out_unlock:
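
Both hunks above share one idea: once clusters have been claimed, any later failure must hand them back before unwinding, which the new need_free flag and bail: label arrange. A self-contained sketch of the pattern with hypothetical helpers (claim/record/unclaim are not ocfs2 functions):

	static int claim(void)    { return 0; }		/* grab the resource */
	static int record(void)   { return -1; }	/* pretend this step fails */
	static void unclaim(void) { }			/* hand the resource back */

	static int demo(void)
	{
		int ret, need_free = 0;

		ret = claim();
		if (ret < 0)
			goto leave;		/* nothing claimed yet */
		ret = record();
		if (ret < 0) {
			need_free = 1;		/* claimed but never recorded */
			goto bail;
		}
	bail:
		if (need_free)
			unclaim();
	leave:
		return ret;
	}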

+ 42 - 0
fs/ocfs2/localalloc.c

@@ -781,6 +781,48 @@ bail:
 	return status;
 }
 
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+				handle_t *handle,
+				struct ocfs2_alloc_context *ac,
+				u32 bit_off,
+				u32 num_bits)
+{
+	int status, start;
+	u32 clear_bits;
+	struct inode *local_alloc_inode;
+	void *bitmap;
+	struct ocfs2_dinode *alloc;
+	struct ocfs2_local_alloc *la;
+
+	BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL);
+
+	local_alloc_inode = ac->ac_inode;
+	alloc = (struct ocfs2_dinode *) osb->local_alloc_bh->b_data;
+	la = OCFS2_LOCAL_ALLOC(alloc);
+
+	bitmap = la->la_bitmap;
+	start = bit_off - le32_to_cpu(la->la_bm_off);
+	clear_bits = num_bits;
+
+	status = ocfs2_journal_access_di(handle,
+			INODE_CACHE(local_alloc_inode),
+			osb->local_alloc_bh,
+			OCFS2_JOURNAL_ACCESS_WRITE);
+	if (status < 0) {
+		mlog_errno(status);
+		goto bail;
+	}
+
+	while (clear_bits--)
+		ocfs2_clear_bit(start++, bitmap);
+
+	le32_add_cpu(&alloc->id1.bitmap1.i_used, -num_bits);
+	ocfs2_journal_dirty(handle, osb->local_alloc_bh);
+
+bail:
+	return status;
+}
+
 static u32 ocfs2_local_alloc_count_bits(struct ocfs2_dinode *alloc)
 {
 	u32 count;

+ 6 - 0
fs/ocfs2/localalloc.h

@@ -55,6 +55,12 @@ int ocfs2_claim_local_alloc_bits(struct ocfs2_super *osb,
 				 u32 *bit_off,
 				 u32 *num_bits);
 
+int ocfs2_free_local_alloc_bits(struct ocfs2_super *osb,
+				handle_t *handle,
+				struct ocfs2_alloc_context *ac,
+				u32 bit_off,
+				u32 num_bits);
+
 void ocfs2_local_alloc_seen_free_bits(struct ocfs2_super *osb,
 				      unsigned int num_clusters);
 void ocfs2_la_enable_worker(struct work_struct *work);

+ 2 - 2
include/linux/page-flags.h

@@ -228,9 +228,9 @@ PAGEFLAG(OwnerPriv1, owner_priv_1) TESTCLEARFLAG(OwnerPriv1, owner_priv_1)
 TESTPAGEFLAG(Writeback, writeback) TESTSCFLAG(Writeback, writeback)
 PAGEFLAG(MappedToDisk, mappedtodisk)
 
-/* PG_readahead is only used for file reads; PG_reclaim is only for writes */
+/* PG_readahead is only used for reads; PG_reclaim is only for writes */
 PAGEFLAG(Reclaim, reclaim) TESTCLEARFLAG(Reclaim, reclaim)
-PAGEFLAG(Readahead, reclaim)		/* Reminder to do async read-ahead */
+PAGEFLAG(Readahead, reclaim) TESTCLEARFLAG(Readahead, reclaim)
 
 #ifdef CONFIG_HIGHMEM
 /*
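
Adding TESTCLEARFLAG(Readahead, reclaim) generates the TestClearPageReadahead() helper that the mm/swap_state.c hunk below depends on. Roughly what the macros expand to (a simplified sketch, not the exact kernel definitions):

	static inline int PageReadahead(struct page *page)
		{ return test_bit(PG_reclaim, &page->flags); }
	static inline void SetPageReadahead(struct page *page)
		{ set_bit(PG_reclaim, &page->flags); }
	static inline int TestClearPageReadahead(struct page *page)
		{ return test_and_clear_bit(PG_reclaim, &page->flags); }

Note that PG_readahead reuses the PG_reclaim bit (the second macro argument); the revised comment spells out why that is safe: readahead applies only to reads and reclaim only to writes, so the two never apply to the same page at once.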

+ 3 - 2
mm/page-writeback.c

@@ -2173,11 +2173,12 @@ int __set_page_dirty_nobuffers(struct page *page)
 	if (!TestSetPageDirty(page)) {
 		struct address_space *mapping = page_mapping(page);
 		struct address_space *mapping2;
+		unsigned long flags;
 
 		if (!mapping)
 			return 1;
 
-		spin_lock_irq(&mapping->tree_lock);
+		spin_lock_irqsave(&mapping->tree_lock, flags);
 		mapping2 = page_mapping(page);
 		if (mapping2) { /* Race with truncate? */
 			BUG_ON(mapping2 != mapping);
@@ -2186,7 +2187,7 @@ int __set_page_dirty_nobuffers(struct page *page)
 			radix_tree_tag_set(&mapping->page_tree,
 				page_index(page), PAGECACHE_TAG_DIRTY);
 		}
-		spin_unlock_irq(&mapping->tree_lock);
+		spin_unlock_irqrestore(&mapping->tree_lock, flags);
 		if (mapping->host) {
 			/* !PageAnon && !swapper_space */
 			__mark_inode_dirty(mapping->host, I_DIRTY_PAGES);

+ 60 - 3
mm/swap_state.c

@@ -63,6 +63,8 @@ unsigned long total_swapcache_pages(void)
 	return ret;
 }
 
+static atomic_t swapin_readahead_hits = ATOMIC_INIT(4);
+
 void show_swap_cache_info(void)
 {
 	printk("%lu pages in swap cache\n", total_swapcache_pages());
@@ -286,8 +288,11 @@ struct page * lookup_swap_cache(swp_entry_t entry)
 
 	page = find_get_page(swap_address_space(entry), entry.val);
 
-	if (page)
+	if (page) {
 		INC_CACHE_INFO(find_success);
+		if (TestClearPageReadahead(page))
+			atomic_inc(&swapin_readahead_hits);
+	}
 
 	INC_CACHE_INFO(find_total);
 	return page;
@@ -389,6 +394,50 @@ struct page *read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 	return found_page;
 }
 
+static unsigned long swapin_nr_pages(unsigned long offset)
+{
+	static unsigned long prev_offset;
+	unsigned int pages, max_pages, last_ra;
+	static atomic_t last_readahead_pages;
+
+	max_pages = 1 << ACCESS_ONCE(page_cluster);
+	if (max_pages <= 1)
+		return 1;
+
+	/*
+	 * This heuristic has been found to work well on both sequential and
+	 * random loads, swapping to hard disk or to SSD: please don't ask
+	 * what the "+ 2" means, it just happens to work well, that's all.
+	 */
+	pages = atomic_xchg(&swapin_readahead_hits, 0) + 2;
+	if (pages == 2) {
+		/*
+		 * We can have no readahead hits to judge by: but must not get
+		 * stuck here forever, so check for an adjacent offset instead
+		 * (and don't even bother to check whether swap type is same).
+		 */
+		if (offset != prev_offset + 1 && offset != prev_offset - 1)
+			pages = 1;
+		prev_offset = offset;
+	} else {
+		unsigned int roundup = 4;
+		while (roundup < pages)
+			roundup <<= 1;
+		pages = roundup;
+	}
+
+	if (pages > max_pages)
+		pages = max_pages;
+
+	/* Don't shrink readahead too fast */
+	last_ra = atomic_read(&last_readahead_pages) / 2;
+	if (pages < last_ra)
+		pages = last_ra;
+	atomic_set(&last_readahead_pages, pages);
+
+	return pages;
+}
+
 /**
  * swapin_readahead - swap in pages in hope we need them soon
  * @entry: swap entry of this memory
@@ -412,11 +461,16 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 			struct vm_area_struct *vma, unsigned long addr)
 {
 	struct page *page;
-	unsigned long offset = swp_offset(entry);
+	unsigned long entry_offset = swp_offset(entry);
+	unsigned long offset = entry_offset;
 	unsigned long start_offset, end_offset;
-	unsigned long mask = (1UL << page_cluster) - 1;
+	unsigned long mask;
 	struct blk_plug plug;
 
+	mask = swapin_nr_pages(offset) - 1;
+	if (!mask)
+		goto skip;
+
 	/* Read a page_cluster sized and aligned cluster around offset. */
 	start_offset = offset & ~mask;
 	end_offset = offset | mask;
@@ -430,10 +484,13 @@ struct page *swapin_readahead(swp_entry_t entry, gfp_t gfp_mask,
 						gfp_mask, vma, addr);
 		if (!page)
 			continue;
+		if (offset != entry_offset)
+			SetPageReadahead(page);
 		page_cache_release(page);
 	}
 	blk_finish_plug(&plug);
 
 	lru_add_drain();	/* Push any new pages onto the LRU now */
+skip:
 	return read_swap_cache_async(entry, gfp_mask, vma, addr);
 }
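
To make the swapin_nr_pages() heuristic concrete, a worked run (assuming page_cluster = 3, so max_pages = 8):

	- 5 readahead hits since the last fault: pages = 5 + 2 = 7, rounded up
	  to the next power of two starting from 4, giving 8.
	- 0 hits at an offset not adjacent to the previous fault: pages drops
	  to 1, but the "don't shrink readahead too fast" clamp lifts it back
	  to half the previous window, so a streak of misses decays the window
	  8 -> 4 -> 2 -> 1 instead of collapsing immediately.
	- Once pages reaches 1, mask becomes 0 and swapin_readahead() takes the
	  new skip path, reading just the single faulting page.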

+ 10 - 1
mm/swapfile.c

@@ -1923,7 +1923,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	p->swap_map = NULL;
 	cluster_info = p->cluster_info;
 	p->cluster_info = NULL;
-	p->flags = 0;
 	frontswap_map = frontswap_map_get(p);
 	spin_unlock(&p->lock);
 	spin_unlock(&swap_lock);
@@ -1949,6 +1948,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 		mutex_unlock(&inode->i_mutex);
 	}
 	filp_close(swap_file, NULL);
+
+	/*
+	 * Clear the SWP_USED flag after all resources are freed so that swapon
+	 * can reuse this swap_info in alloc_swap_info() safely.  It is ok to
+	 * not hold p->lock after we cleared its SWP_WRITEOK.
+	 */
+	spin_lock(&swap_lock);
+	p->flags = 0;
+	spin_unlock(&swap_lock);
+
 	err = 0;
 	atomic_inc(&proc_poll_event);
 	wake_up_interruptible(&proc_poll_wait);
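
The ordering matters because alloc_swap_info() on the swapon side treats a clear SWP_USED as "this slot is free to reuse". Abbreviated from the existing scan in mm/swapfile.c (a sketch, details trimmed):

	spin_lock(&swap_lock);
	for (type = 0; type < nr_swapfiles; type++) {
		if (!(swap_info[type]->flags & SWP_USED))
			break;		/* reuse this swap_info slot */
	}

Before this patch, swapoff cleared p->flags while swap_map and cluster_info were still being freed, so a concurrent swapon could grab and reinitialize the same swap_info underneath it; clearing the flag only after filp_close() closes that window.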