Browse Source

xfs: use rhashtable to track buffer cache

On filesystems with a lot of metadata and in metadata intensive workloads
xfs_buf_find() is showing up at the top of the CPU cycles trace. Most of
the CPU time is spent on CPU cache misses while traversing the rbtree.

As the buffer cache does not need any kind of ordering, but fast lookups
a hashtable is the natural data structure to use. The rhashtable
infrastructure provides a self-scaling hashtable implementation and
allows lookups to proceed while the table is going through a resize
operation.

This reduces the CPU-time spent for the lookups to 1/3 even for small
filesystems with a relatively small number of cached buffers, with
possibly much larger gains on higher loaded filesystems.

[dchinner: reduce minimum hash size to an acceptable size for large
	   filesystems with many AGs with no active use.]
[dchinner: remove stale rbtree asserts.]
[dchinner: use xfs_buf_map for compare function argument.]
[dchinner: make functions static.]
[dchinner: remove redundant comments.]

Signed-off-by: Lucas Stach <dev@lynxeye.de>
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>
Lucas Stach 8 years ago
parent
commit
6031e73a5b
5 changed files with 85 additions and 52 deletions
  1. 73 47
      fs/xfs/xfs_buf.c
  2. 1 1
      fs/xfs/xfs_buf.h
  3. 1 0
      fs/xfs/xfs_linux.h
  4. 5 2
      fs/xfs/xfs_mount.c
  5. 5 2
      fs/xfs/xfs_mount.h

+ 73 - 47
fs/xfs/xfs_buf.c

@@ -219,7 +219,6 @@ _xfs_buf_alloc(
 	init_completion(&bp->b_iowait);
 	init_completion(&bp->b_iowait);
 	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_lru);
 	INIT_LIST_HEAD(&bp->b_list);
 	INIT_LIST_HEAD(&bp->b_list);
-	RB_CLEAR_NODE(&bp->b_rbnode);
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
 	sema_init(&bp->b_sema, 0); /* held, no waiters */
 	spin_lock_init(&bp->b_lock);
 	spin_lock_init(&bp->b_lock);
 	XB_SET_OWNER(bp);
 	XB_SET_OWNER(bp);
@@ -473,6 +472,62 @@ _xfs_buf_map_pages(
 /*
 /*
  *	Finding and Reading Buffers
  *	Finding and Reading Buffers
  */
  */
+static int
+_xfs_buf_obj_cmp(
+	struct rhashtable_compare_arg	*arg,
+	const void			*obj)
+{
+	const struct xfs_buf_map	*map = arg->key;
+	const struct xfs_buf		*bp = obj;
+
+	/*
+	 * The key hashing in the lookup path depends on the key being the
+	 * first element of the compare_arg, make sure to assert this.
+	 */
+	BUILD_BUG_ON(offsetof(struct xfs_buf_map, bm_bn) != 0);
+
+	if (bp->b_bn != map->bm_bn)
+		return 1;
+
+	if (unlikely(bp->b_length != map->bm_len)) {
+		/*
+		 * found a block number match. If the range doesn't
+		 * match, the only way this is allowed is if the buffer
+		 * in the cache is stale and the transaction that made
+		 * it stale has not yet committed. i.e. we are
+		 * reallocating a busy extent. Skip this buffer and
+		 * continue searching for an exact match.
+		 */
+		ASSERT(bp->b_flags & XBF_STALE);
+		return 1;
+	}
+	return 0;
+}
+
+static const struct rhashtable_params xfs_buf_hash_params = {
+	.min_size		= 32,	/* empty AGs have minimal footprint */
+	.nelem_hint		= 16,
+	.key_len		= sizeof(xfs_daddr_t),
+	.key_offset		= offsetof(struct xfs_buf, b_bn),
+	.head_offset		= offsetof(struct xfs_buf, b_rhash_head),
+	.automatic_shrinking	= true,
+	.obj_cmpfn		= _xfs_buf_obj_cmp,
+};
+
+int
+xfs_buf_hash_init(
+	struct xfs_perag	*pag)
+{
+	spin_lock_init(&pag->pag_buf_lock);
+	return rhashtable_init(&pag->pag_buf_hash, &xfs_buf_hash_params);
+}
+
+void
+xfs_buf_hash_destroy(
+	struct xfs_perag	*pag)
+{
+	rhashtable_destroy(&pag->pag_buf_hash);
+}
 
 
 /*
 /*
  *	Look up, and creates if absent, a lockable buffer for
  *	Look up, and creates if absent, a lockable buffer for
@@ -488,27 +543,24 @@ _xfs_buf_find(
 	xfs_buf_t		*new_bp)
 	xfs_buf_t		*new_bp)
 {
 {
 	struct xfs_perag	*pag;
 	struct xfs_perag	*pag;
-	struct rb_node		**rbp;
-	struct rb_node		*parent;
 	xfs_buf_t		*bp;
 	xfs_buf_t		*bp;
-	xfs_daddr_t		blkno = map[0].bm_bn;
+	struct xfs_buf_map	cmap = { .bm_bn = map[0].bm_bn };
 	xfs_daddr_t		eofs;
 	xfs_daddr_t		eofs;
-	int			numblks = 0;
 	int			i;
 	int			i;
 
 
 	for (i = 0; i < nmaps; i++)
 	for (i = 0; i < nmaps; i++)
-		numblks += map[i].bm_len;
+		cmap.bm_len += map[i].bm_len;
 
 
 	/* Check for IOs smaller than the sector size / not sector aligned */
 	/* Check for IOs smaller than the sector size / not sector aligned */
-	ASSERT(!(BBTOB(numblks) < btp->bt_meta_sectorsize));
-	ASSERT(!(BBTOB(blkno) & (xfs_off_t)btp->bt_meta_sectormask));
+	ASSERT(!(BBTOB(cmap.bm_len) < btp->bt_meta_sectorsize));
+	ASSERT(!(BBTOB(cmap.bm_bn) & (xfs_off_t)btp->bt_meta_sectormask));
 
 
 	/*
 	/*
 	 * Corrupted block numbers can get through to here, unfortunately, so we
 	 * Corrupted block numbers can get through to here, unfortunately, so we
 	 * have to check that the buffer falls within the filesystem bounds.
 	 * have to check that the buffer falls within the filesystem bounds.
 	 */
 	 */
 	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
 	eofs = XFS_FSB_TO_BB(btp->bt_mount, btp->bt_mount->m_sb.sb_dblocks);
-	if (blkno < 0 || blkno >= eofs) {
+	if (cmap.bm_bn < 0 || cmap.bm_bn >= eofs) {
 		/*
 		/*
 		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
 		 * XXX (dgc): we should really be returning -EFSCORRUPTED here,
 		 * but none of the higher level infrastructure supports
 		 * but none of the higher level infrastructure supports
@@ -516,53 +568,29 @@ _xfs_buf_find(
 		 */
 		 */
 		xfs_alert(btp->bt_mount,
 		xfs_alert(btp->bt_mount,
 			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
 			  "%s: Block out of range: block 0x%llx, EOFS 0x%llx ",
-			  __func__, blkno, eofs);
+			  __func__, cmap.bm_bn, eofs);
 		WARN_ON(1);
 		WARN_ON(1);
 		return NULL;
 		return NULL;
 	}
 	}
 
 
-	/* get tree root */
 	pag = xfs_perag_get(btp->bt_mount,
 	pag = xfs_perag_get(btp->bt_mount,
-				xfs_daddr_to_agno(btp->bt_mount, blkno));
+			    xfs_daddr_to_agno(btp->bt_mount, cmap.bm_bn));
 
 
-	/* walk tree */
 	spin_lock(&pag->pag_buf_lock);
 	spin_lock(&pag->pag_buf_lock);
-	rbp = &pag->pag_buf_tree.rb_node;
-	parent = NULL;
-	bp = NULL;
-	while (*rbp) {
-		parent = *rbp;
-		bp = rb_entry(parent, struct xfs_buf, b_rbnode);
-
-		if (blkno < bp->b_bn)
-			rbp = &(*rbp)->rb_left;
-		else if (blkno > bp->b_bn)
-			rbp = &(*rbp)->rb_right;
-		else {
-			/*
-			 * found a block number match. If the range doesn't
-			 * match, the only way this is allowed is if the buffer
-			 * in the cache is stale and the transaction that made
-			 * it stale has not yet committed. i.e. we are
-			 * reallocating a busy extent. Skip this buffer and
-			 * continue searching to the right for an exact match.
-			 */
-			if (bp->b_length != numblks) {
-				ASSERT(bp->b_flags & XBF_STALE);
-				rbp = &(*rbp)->rb_right;
-				continue;
-			}
-			atomic_inc(&bp->b_hold);
-			goto found;
-		}
+	bp = rhashtable_lookup_fast(&pag->pag_buf_hash, &cmap,
+				    xfs_buf_hash_params);
+	if (bp) {
+		atomic_inc(&bp->b_hold);
+		goto found;
 	}
 	}
 
 
 	/* No match found */
 	/* No match found */
 	if (new_bp) {
 	if (new_bp) {
-		rb_link_node(&new_bp->b_rbnode, parent, rbp);
-		rb_insert_color(&new_bp->b_rbnode, &pag->pag_buf_tree);
 		/* the buffer keeps the perag reference until it is freed */
 		/* the buffer keeps the perag reference until it is freed */
 		new_bp->b_pag = pag;
 		new_bp->b_pag = pag;
+		rhashtable_insert_fast(&pag->pag_buf_hash,
+				       &new_bp->b_rhash_head,
+				       xfs_buf_hash_params);
 		spin_unlock(&pag->pag_buf_lock);
 		spin_unlock(&pag->pag_buf_lock);
 	} else {
 	} else {
 		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
 		XFS_STATS_INC(btp->bt_mount, xb_miss_locked);
@@ -930,7 +958,6 @@ xfs_buf_rele(
 
 
 	if (!pag) {
 	if (!pag) {
 		ASSERT(list_empty(&bp->b_lru));
 		ASSERT(list_empty(&bp->b_lru));
-		ASSERT(RB_EMPTY_NODE(&bp->b_rbnode));
 		if (atomic_dec_and_test(&bp->b_hold)) {
 		if (atomic_dec_and_test(&bp->b_hold)) {
 			xfs_buf_ioacct_dec(bp);
 			xfs_buf_ioacct_dec(bp);
 			xfs_buf_free(bp);
 			xfs_buf_free(bp);
@@ -938,8 +965,6 @@ xfs_buf_rele(
 		return;
 		return;
 	}
 	}
 
 
-	ASSERT(!RB_EMPTY_NODE(&bp->b_rbnode));
-
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 	ASSERT(atomic_read(&bp->b_hold) > 0);
 
 
 	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
 	release = atomic_dec_and_lock(&bp->b_hold, &pag->pag_buf_lock);
@@ -983,7 +1008,8 @@ xfs_buf_rele(
 		}
 		}
 
 
 		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
 		ASSERT(!(bp->b_flags & _XBF_DELWRI_Q));
-		rb_erase(&bp->b_rbnode, &pag->pag_buf_tree);
+		rhashtable_remove_fast(&pag->pag_buf_hash, &bp->b_rhash_head,
+				       xfs_buf_hash_params);
 		spin_unlock(&pag->pag_buf_lock);
 		spin_unlock(&pag->pag_buf_lock);
 		xfs_perag_put(pag);
 		xfs_perag_put(pag);
 		freebuf = true;
 		freebuf = true;

+ 1 - 1
fs/xfs/xfs_buf.h

@@ -151,7 +151,7 @@ typedef struct xfs_buf {
 	 * which is the only bit that is touched if we hit the semaphore
 	 * which is the only bit that is touched if we hit the semaphore
 	 * fast-path on locking.
 	 * fast-path on locking.
 	 */
 	 */
-	struct rb_node		b_rbnode;	/* rbtree node */
+	struct rhash_head	b_rhash_head;	/* pag buffer hash node */
 	xfs_daddr_t		b_bn;		/* block number of buffer */
 	xfs_daddr_t		b_bn;		/* block number of buffer */
 	int			b_length;	/* size of buffer in BBs */
 	int			b_length;	/* size of buffer in BBs */
 	atomic_t		b_hold;		/* reference count */
 	atomic_t		b_hold;		/* reference count */

+ 1 - 0
fs/xfs/xfs_linux.h

@@ -78,6 +78,7 @@ typedef __u32			xfs_nlink_t;
 #include <linux/freezer.h>
 #include <linux/freezer.h>
 #include <linux/list_sort.h>
 #include <linux/list_sort.h>
 #include <linux/ratelimit.h>
 #include <linux/ratelimit.h>
+#include <linux/rhashtable.h>
 
 
 #include <asm/page.h>
 #include <asm/page.h>
 #include <asm/div64.h>
 #include <asm/div64.h>

+ 5 - 2
fs/xfs/xfs_mount.c

@@ -157,6 +157,7 @@ xfs_free_perag(
 		spin_unlock(&mp->m_perag_lock);
 		spin_unlock(&mp->m_perag_lock);
 		ASSERT(pag);
 		ASSERT(pag);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
 		ASSERT(atomic_read(&pag->pag_ref) == 0);
+		xfs_buf_hash_destroy(pag);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 		call_rcu(&pag->rcu_head, __xfs_free_perag);
 	}
 	}
 }
 }
@@ -212,8 +213,8 @@ xfs_initialize_perag(
 		spin_lock_init(&pag->pag_ici_lock);
 		spin_lock_init(&pag->pag_ici_lock);
 		mutex_init(&pag->pag_ici_reclaim_lock);
 		mutex_init(&pag->pag_ici_reclaim_lock);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
 		INIT_RADIX_TREE(&pag->pag_ici_root, GFP_ATOMIC);
-		spin_lock_init(&pag->pag_buf_lock);
-		pag->pag_buf_tree = RB_ROOT;
+		if (xfs_buf_hash_init(pag))
+			goto out_unwind;
 
 
 		if (radix_tree_preload(GFP_NOFS))
 		if (radix_tree_preload(GFP_NOFS))
 			goto out_unwind;
 			goto out_unwind;
@@ -239,9 +240,11 @@ xfs_initialize_perag(
 	return 0;
 	return 0;
 
 
 out_unwind:
 out_unwind:
+	xfs_buf_hash_destroy(pag);
 	kmem_free(pag);
 	kmem_free(pag);
 	for (; index > first_initialised; index--) {
 	for (; index > first_initialised; index--) {
 		pag = radix_tree_delete(&mp->m_perag_tree, index);
 		pag = radix_tree_delete(&mp->m_perag_tree, index);
+		xfs_buf_hash_destroy(pag);
 		kmem_free(pag);
 		kmem_free(pag);
 	}
 	}
 	return error;
 	return error;

+ 5 - 2
fs/xfs/xfs_mount.h

@@ -393,8 +393,8 @@ typedef struct xfs_perag {
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 	unsigned long	pag_ici_reclaim_cursor;	/* reclaim restart point */
 
 
 	/* buffer cache index */
 	/* buffer cache index */
-	spinlock_t	pag_buf_lock;	/* lock for pag_buf_tree */
-	struct rb_root	pag_buf_tree;	/* ordered tree of active buffers */
+	spinlock_t	pag_buf_lock;	/* lock for pag_buf_hash */
+	struct rhashtable pag_buf_hash;
 
 
 	/* for rcu-safe freeing */
 	/* for rcu-safe freeing */
 	struct rcu_head	rcu_head;
 	struct rcu_head	rcu_head;
@@ -424,6 +424,9 @@ xfs_perag_resv(
 	}
 	}
 }
 }
 
 
+int xfs_buf_hash_init(xfs_perag_t *pag);
+void xfs_buf_hash_destroy(xfs_perag_t *pag);
+
 extern void	xfs_uuid_table_free(void);
 extern void	xfs_uuid_table_free(void);
 extern int	xfs_log_sbcount(xfs_mount_t *);
 extern int	xfs_log_sbcount(xfs_mount_t *);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);
 extern __uint64_t xfs_default_resblks(xfs_mount_t *mp);